Avoid wall-hugging during unknown recharge routes

2026-04-26 20:33:51 +08:00
parent 69b8a692db
commit 524ca8c070
1 changed file with 73 additions and 18 deletions
--- a/agent_ppo/feature/preprocessor.py
+++ b/agent_ppo/feature/preprocessor.py
@@ -1212,9 +1212,15 @@ class Preprocessor:
        score += 0.35 if visit_count == 0 else -0.05 * min(visit_count, 10)

        if self.recharge_mode:
+            if self.charger_route_known:
                score += 2.2 * float(self.charger_action_delta[action])
                if self._charger_move_distance(nx, nz) < self._charger_move_distance(hx, hz):
                    score += 0.8
+            else:
+                score += 2.0 * float(self.frontier_action_delta[action])
+                score += 0.7 * max(float(self.global_dirty_action_delta[action]), 0.0)
+                if self._min_charger_range_dist(nx, nz) < self._min_charger_range_dist(hx, hz):
+                    score += 0.15
        else:
            if self.global_dirty_path_dist < self.GRID_SIZE:
                score += 1.8 * float(self.global_dirty_action_delta[action])
@@ -1336,18 +1342,17 @@ class Preprocessor:
            if any(stay):
                return stay

+        if not self.charger_route_known:
+            return self._filter_recharge_discovery_actions(legal_action, scored, current_range_dist)
+
        recharge = [0] * 8
        best_next_dist = min(item[0] for item in scored)
        ranked = sorted(scored, key=lambda item: (item[0], -item[1]))
-        max_recharge_actions = 4 if self.charger_route_known else 5
-        dist_slack = 2.5 if self.charger_route_known else 4.0
+        max_recharge_actions = 4
+        dist_slack = 2.5
        for next_dist, alignment, next_range_dist, action in ranked:
            route_progress = next_dist <= current_move_dist + 0.1
-            range_progress = next_range_dist <= current_range_dist
-            direction_progress = alignment > 0
-            if next_dist <= best_next_dist + dist_slack and (
-                route_progress or (not self.charger_route_known and (range_progress or direction_progress))
-            ):
+            if next_dist <= best_next_dist + dist_slack and route_progress:
                recharge[action] = 1
            if sum(recharge) >= max_recharge_actions:
                break
@@ -1358,6 +1363,46 @@ class Preprocessor:

        return recharge if any(recharge) else list(legal_action)

+    def _filter_recharge_discovery_actions(self, legal_action, scored, current_range_dist):  # scored items unpack as (next_dist, alignment, next_range_dist, action)
+        """When charger route is unknown, search for a route instead of pushing into walls."""  # returns a 0/1 list of length 8, or a copy of legal_action if no scored action is legal
+        ranked = []  # (score, action) pairs for the legal candidates
+        hx, hz = self.cur_pos
+        for next_dist, alignment, next_range_dist, action in scored:  # next_dist is unpacked but not used in this method
+            if legal_action[action] <= 0:  # skip actions masked out by legal_action
+                continue
+            dx, dz = self.ACTION_DIRS[action]
+            nx, nz = hx + dx, hz + dz  # current position plus the action's offset; off-grid targets get visit_count 0 below
+            visit_count = int(self.visit_count_map[nx, nz]) if 0 <= nx < self.GRID_SIZE and 0 <= nz < self.GRID_SIZE else 0
+            frontier_gain = float(self.frontier_action_delta[action])
+            dirty_gain = float(self.global_dirty_action_delta[action])
+            range_gain = float(np.clip(current_range_dist - next_range_dist, -2.0, 2.0)) / 2.0  # range-distance drop, clipped to [-2, 2] then scaled to [-1, 1]
+            alignment_gain = 0.25 if alignment > 0 else 0.0  # flat bonus for any positive alignment
+            repeat_penalty = 0.8 if action == self.last_action and self.recharge_no_progress_steps >= 2 else 0.0  # applies only to a repeat of last_action after >= 2 no-progress steps
+            wall_hug_penalty = 0.35 * float(self.local_obstacle_ratio)  # NOTE(review): not indexed by action, so every candidate gets the same offset and the ordering / 0.35 window are unaffected (up to float rounding) - confirm whether a per-action obstacle term was intended
+            score = (
+                2.4 * frontier_gain  # largest weight in the sum
+                + 0.8 * max(dirty_gain, 0.0)  # only positive dirty gain is rewarded
+                + 0.35 * range_gain
+                + alignment_gain
+                - 0.04 * min(visit_count, 12)  # revisit penalty saturates at 12 visits
+                - repeat_penalty
+                - wall_hug_penalty
+            )
+            ranked.append((score, action))
+
+        if not ranked:  # nothing legal was scored: fall back to the unfiltered mask
+            return list(legal_action)
+
+        ranked.sort(reverse=True)  # highest score first; equal scores order by larger action index
+        best_score = ranked[0][0]
+        discovery = [0] * 8
+        for score, action in ranked:
+            if score >= best_score - 0.35 or sum(discovery) < 3:  # keep everything within 0.35 of the best, and at least the top 3 when available
+                discovery[action] = 1
+            if sum(discovery) >= 5:  # cap the mask at 5 enabled actions
+                break
+        return discovery if any(discovery) else list(legal_action)  # ranked is non-empty here, so the first entry is always enabled; the fallback looks unreachable
+
    def _filter_recharge_escape_actions(self, recharge_action, safe_action):
        """Escape repeated no-move states during low-battery recharge."""
        if not self._need_recharge_escape():
@@ -1474,15 +1519,25 @@ class Preprocessor:
            charge_reward -= 0.25 * min(self.charge_delta, 3)

        if self.has_charger and (self.recharge_mode or self.low_battery):
+            recharge_risk = self._recharge_risk_score()
+            if not self.charger_route_known:
+                frontier_progress = float(
+                    np.clip(self.last_frontier_path_dist - self.frontier_path_dist, -3.0, 3.0)
+                )
+                range_delta = float(
+                    np.clip(self.last_nearest_charger_range_dist - self.nearest_charger_range_dist, -2.0, 2.0)
+                )
+                discovery_scale = 0.035 + 0.035 * recharge_risk
+                range_scale = 0.015 + 0.015 * recharge_risk
+                charge_reward += discovery_scale * frontier_progress
+                if self.prev_pos is not None and self.cur_pos != self.prev_pos and self.stuck_steps == 0:
+                    charge_reward += range_scale * range_delta
+            else:
                dist_delta = float(
                    np.clip(self.last_nearest_charger_path_dist - self.nearest_charger_path_dist, -4.0, 4.0)
                )
-            recharge_risk = self._recharge_risk_score()
                approach_scale = 0.07 + 0.06 * recharge_risk
                retreat_scale = 0.035 + 0.045 * recharge_risk
-            if not self.charger_route_known:
-                approach_scale += 0.02
-                retreat_scale += 0.01
                charge_reward += approach_scale * dist_delta if dist_delta > 0 else retreat_scale * dist_delta
            if self.charger_safety_margin < self.recharge_enter_margin:
                safety_shortage = self.recharge_enter_margin - self.charger_safety_margin