From 524ca8c0701c5d47c9b3356645ae828fbd01d46c Mon Sep 17 00:00:00 2001 From: gqt <3217233537@qq.com> Date: Sun, 26 Apr 2026 20:33:51 +0800 Subject: [PATCH] Avoid wall-hugging during unknown recharge routes --- agent_ppo/feature/preprocessor.py | 91 +++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 18 deletions(-) diff --git a/agent_ppo/feature/preprocessor.py b/agent_ppo/feature/preprocessor.py index 5c0d197..b04492e 100644 --- a/agent_ppo/feature/preprocessor.py +++ b/agent_ppo/feature/preprocessor.py @@ -1212,9 +1212,15 @@ class Preprocessor: score += 0.35 if visit_count == 0 else -0.05 * min(visit_count, 10) if self.recharge_mode: - score += 2.2 * float(self.charger_action_delta[action]) - if self._charger_move_distance(nx, nz) < self._charger_move_distance(hx, hz): - score += 0.8 + if self.charger_route_known: + score += 2.2 * float(self.charger_action_delta[action]) + if self._charger_move_distance(nx, nz) < self._charger_move_distance(hx, hz): + score += 0.8 + else: + score += 2.0 * float(self.frontier_action_delta[action]) + score += 0.7 * max(float(self.global_dirty_action_delta[action]), 0.0) + if self._min_charger_range_dist(nx, nz) < self._min_charger_range_dist(hx, hz): + score += 0.15 else: if self.global_dirty_path_dist < self.GRID_SIZE: score += 1.8 * float(self.global_dirty_action_delta[action]) @@ -1336,18 +1342,17 @@ class Preprocessor: if any(stay): return stay + if not self.charger_route_known: + return self._filter_recharge_discovery_actions(legal_action, scored, current_range_dist) + recharge = [0] * 8 best_next_dist = min(item[0] for item in scored) ranked = sorted(scored, key=lambda item: (item[0], -item[1])) - max_recharge_actions = 4 if self.charger_route_known else 5 - dist_slack = 2.5 if self.charger_route_known else 4.0 + max_recharge_actions = 4 + dist_slack = 2.5 for next_dist, alignment, next_range_dist, action in ranked: route_progress = next_dist <= current_move_dist + 0.1 - range_progress = next_range_dist <= current_range_dist - direction_progress = alignment > 0 - if next_dist <= best_next_dist + dist_slack and ( - route_progress or (not self.charger_route_known and (range_progress or direction_progress)) - ): + if next_dist <= best_next_dist + dist_slack and route_progress: recharge[action] = 1 if sum(recharge) >= max_recharge_actions: break @@ -1358,6 +1363,46 @@ class Preprocessor: return recharge if any(recharge) else list(legal_action) + def _filter_recharge_discovery_actions(self, legal_action, scored, current_range_dist): + """When charger route is unknown, search for a route instead of pushing into walls.""" + ranked = [] + hx, hz = self.cur_pos + for next_dist, alignment, next_range_dist, action in scored: + if legal_action[action] <= 0: + continue + dx, dz = self.ACTION_DIRS[action] + nx, nz = hx + dx, hz + dz + visit_count = int(self.visit_count_map[nx, nz]) if 0 <= nx < self.GRID_SIZE and 0 <= nz < self.GRID_SIZE else 0 + frontier_gain = float(self.frontier_action_delta[action]) + dirty_gain = float(self.global_dirty_action_delta[action]) + range_gain = float(np.clip(current_range_dist - next_range_dist, -2.0, 2.0)) / 2.0 + alignment_gain = 0.25 if alignment > 0 else 0.0 + repeat_penalty = 0.8 if action == self.last_action and self.recharge_no_progress_steps >= 2 else 0.0 + wall_hug_penalty = 0.35 * float(self.local_obstacle_ratio) + score = ( + 2.4 * frontier_gain + + 0.8 * max(dirty_gain, 0.0) + + 0.35 * range_gain + + alignment_gain + - 0.04 * min(visit_count, 12) + - repeat_penalty + - wall_hug_penalty + ) + ranked.append((score, action)) + + if not ranked: + return list(legal_action) + + ranked.sort(reverse=True) + best_score = ranked[0][0] + discovery = [0] * 8 + for score, action in ranked: + if score >= best_score - 0.35 or sum(discovery) < 3: + discovery[action] = 1 + if sum(discovery) >= 5: + break + return discovery if any(discovery) else list(legal_action) + def _filter_recharge_escape_actions(self, recharge_action, safe_action): """Escape repeated no-move states during low-battery recharge.""" if not self._need_recharge_escape(): @@ -1474,16 +1519,26 @@ class Preprocessor: charge_reward -= 0.25 * min(self.charge_delta, 3) if self.has_charger and (self.recharge_mode or self.low_battery): - dist_delta = float( - np.clip(self.last_nearest_charger_path_dist - self.nearest_charger_path_dist, -4.0, 4.0) - ) recharge_risk = self._recharge_risk_score() - approach_scale = 0.07 + 0.06 * recharge_risk - retreat_scale = 0.035 + 0.045 * recharge_risk if not self.charger_route_known: - approach_scale += 0.02 - retreat_scale += 0.01 - charge_reward += approach_scale * dist_delta if dist_delta > 0 else retreat_scale * dist_delta + frontier_progress = float( + np.clip(self.last_frontier_path_dist - self.frontier_path_dist, -3.0, 3.0) + ) + range_delta = float( + np.clip(self.last_nearest_charger_range_dist - self.nearest_charger_range_dist, -2.0, 2.0) + ) + discovery_scale = 0.035 + 0.035 * recharge_risk + range_scale = 0.015 + 0.015 * recharge_risk + charge_reward += discovery_scale * frontier_progress + if self.prev_pos is not None and self.cur_pos != self.prev_pos and self.stuck_steps == 0: + charge_reward += range_scale * range_delta + else: + dist_delta = float( + np.clip(self.last_nearest_charger_path_dist - self.nearest_charger_path_dist, -4.0, 4.0) + ) + approach_scale = 0.07 + 0.06 * recharge_risk + retreat_scale = 0.035 + 0.045 * recharge_risk + charge_reward += approach_scale * dist_delta if dist_delta > 0 else retreat_scale * dist_delta if self.charger_safety_margin < self.recharge_enter_margin: safety_shortage = self.recharge_enter_margin - self.charger_safety_margin charge_reward -= min(0.55, safety_shortage / max(self.battery_max, 1))