From 524ca8c0701c5d47c9b3356645ae828fbd01d46c Mon Sep 17 00:00:00 2001
From: gqt <3217233537@qq.com>
Date: Sun, 26 Apr 2026 20:33:51 +0800
Subject: [PATCH] Avoid wall-hugging during unknown recharge routes

---
 agent_ppo/feature/preprocessor.py | 91 +++++++++++++++++++++++++------
 1 file changed, 73 insertions(+), 18 deletions(-)

diff --git a/agent_ppo/feature/preprocessor.py b/agent_ppo/feature/preprocessor.py
index 5c0d197..b04492e 100644
--- a/agent_ppo/feature/preprocessor.py
+++ b/agent_ppo/feature/preprocessor.py
@@ -1212,9 +1212,15 @@ class Preprocessor:
         score += 0.35 if visit_count == 0 else -0.05 * min(visit_count, 10)
 
         if self.recharge_mode:
-            score += 2.2 * float(self.charger_action_delta[action])
-            if self._charger_move_distance(nx, nz) < self._charger_move_distance(hx, hz):
-                score += 0.8
+            if self.charger_route_known:
+                score += 2.2 * float(self.charger_action_delta[action])
+                if self._charger_move_distance(nx, nz) < self._charger_move_distance(hx, hz):
+                    score += 0.8
+            else:
+                score += 2.0 * float(self.frontier_action_delta[action])
+                score += 0.7 * max(float(self.global_dirty_action_delta[action]), 0.0)
+                if self._min_charger_range_dist(nx, nz) < self._min_charger_range_dist(hx, hz):
+                    score += 0.15
         else:
             if self.global_dirty_path_dist < self.GRID_SIZE:
                 score += 1.8 * float(self.global_dirty_action_delta[action])
@@ -1336,18 +1342,17 @@ class Preprocessor:
             if any(stay):
                 return stay
 
+        if not self.charger_route_known:
+            return self._filter_recharge_discovery_actions(legal_action, scored, current_range_dist)
+
         recharge = [0] * 8
         best_next_dist = min(item[0] for item in scored)
         ranked = sorted(scored, key=lambda item: (item[0], -item[1]))
-        max_recharge_actions = 4 if self.charger_route_known else 5
-        dist_slack = 2.5 if self.charger_route_known else 4.0
+        max_recharge_actions = 4
+        dist_slack = 2.5
         for next_dist, alignment, next_range_dist, action in ranked:
             route_progress = next_dist <= current_move_dist + 0.1
-            range_progress = next_range_dist <= current_range_dist
-            direction_progress = alignment > 0
-            if next_dist <= best_next_dist + dist_slack and (
-                route_progress or (not self.charger_route_known and (range_progress or direction_progress))
-            ):
+            if next_dist <= best_next_dist + dist_slack and route_progress:
                 recharge[action] = 1
             if sum(recharge) >= max_recharge_actions:
                 break
@@ -1358,6 +1363,46 @@ class Preprocessor:
 
         return recharge if any(recharge) else list(legal_action)
 
+    def _filter_recharge_discovery_actions(self, legal_action, scored, current_range_dist):
+        """Score legal moves for route discovery when no charger route is known and return a 0/1 mask of 3-5 of the best (when available)."""
+        ranked = []  # (score, action) pairs, one per legal action found in `scored`
+        hx, hz = self.cur_pos
+        for next_dist, alignment, next_range_dist, action in scored:  # next_dist is unpacked but not used in this method
+            if legal_action[action] <= 0:  # skip actions that legal_action masks out
+                continue
+            dx, dz = self.ACTION_DIRS[action]
+            nx, nz = hx + dx, hz + dz
+            visit_count = int(self.visit_count_map[nx, nz]) if 0 <= nx < self.GRID_SIZE and 0 <= nz < self.GRID_SIZE else 0  # off-grid targets count as unvisited
+            frontier_gain = float(self.frontier_action_delta[action])
+            dirty_gain = float(self.global_dirty_action_delta[action])
+            range_gain = float(np.clip(current_range_dist - next_range_dist, -2.0, 2.0)) / 2.0  # decrease in range distance, clipped to [-2, 2] and scaled to [-1, 1]
+            alignment_gain = 0.25 if alignment > 0 else 0.0  # flat bonus for positive alignment
+            repeat_penalty = 0.8 if action == self.last_action and self.recharge_no_progress_steps >= 2 else 0.0  # discourage repeating last_action once recharge_no_progress_steps >= 2
+            wall_hug_penalty = 0.35 * float(self.local_obstacle_ratio)  # NOTE(review): does not depend on `action`, so as written it shifts every score equally and cannot change the ranking - confirm intent
+            score = (
+                2.4 * frontier_gain
+                + 0.8 * max(dirty_gain, 0.0)  # only a positive dirty gain contributes
+                + 0.35 * range_gain
+                + alignment_gain
+                - 0.04 * min(visit_count, 12)  # revisit penalty saturates at 12 visits
+                - repeat_penalty
+                - wall_hug_penalty
+            )
+            ranked.append((score, action))
+
+        if not ranked:  # nothing in `scored` was legal: fall back to a copy of the unfiltered mask
+            return list(legal_action)
+
+        ranked.sort(reverse=True)  # highest score first; equal scores are ordered by descending action value
+        best_score = ranked[0][0]
+        discovery = [0] * 8
+        for score, action in ranked:
+            if score >= best_score - 0.35 or sum(discovery) < 3:  # keep actions within 0.35 of the best score, and the top 3 regardless of score
+                discovery[action] = 1
+            if sum(discovery) >= 5:  # never enable more than 5 actions
+                break
+        return discovery if any(discovery) else list(legal_action)  # NOTE(review): fallback looks unreachable, since `ranked` is non-empty here and its first entry is always enabled
+
     def _filter_recharge_escape_actions(self, recharge_action, safe_action):
         """Escape repeated no-move states during low-battery recharge."""
         if not self._need_recharge_escape():
@@ -1474,16 +1519,26 @@ class Preprocessor:
             charge_reward -= 0.25 * min(self.charge_delta, 3)
 
         if self.has_charger and (self.recharge_mode or self.low_battery):
-            dist_delta = float(
-                np.clip(self.last_nearest_charger_path_dist - self.nearest_charger_path_dist, -4.0, 4.0)
-            )
             recharge_risk = self._recharge_risk_score()
-            approach_scale = 0.07 + 0.06 * recharge_risk
-            retreat_scale = 0.035 + 0.045 * recharge_risk
             if not self.charger_route_known:
-                approach_scale += 0.02
-                retreat_scale += 0.01
-            charge_reward += approach_scale * dist_delta if dist_delta > 0 else retreat_scale * dist_delta
+                frontier_progress = float(
+                    np.clip(self.last_frontier_path_dist - self.frontier_path_dist, -3.0, 3.0)
+                )
+                range_delta = float(
+                    np.clip(self.last_nearest_charger_range_dist - self.nearest_charger_range_dist, -2.0, 2.0)
+                )
+                discovery_scale = 0.035 + 0.035 * recharge_risk
+                range_scale = 0.015 + 0.015 * recharge_risk
+                charge_reward += discovery_scale * frontier_progress
+                if self.prev_pos is not None and self.cur_pos != self.prev_pos and self.stuck_steps == 0:
+                    charge_reward += range_scale * range_delta
+            else:
+                dist_delta = float(
+                    np.clip(self.last_nearest_charger_path_dist - self.nearest_charger_path_dist, -4.0, 4.0)
+                )
+                approach_scale = 0.07 + 0.06 * recharge_risk
+                retreat_scale = 0.035 + 0.045 * recharge_risk
+                charge_reward += approach_scale * dist_delta if dist_delta > 0 else retreat_scale * dist_delta
             if self.charger_safety_margin < self.recharge_enter_margin:
                 safety_shortage = self.recharge_enter_margin - self.charger_safety_margin
                 charge_reward -= min(0.55, safety_shortage / max(self.battery_max, 1))