增加PPO回充安全动作约束

This commit is contained in:
2026-04-26 17:06:54 +08:00
parent e0756b4846
commit f04feb0cd9

View File

@@ -603,36 +603,46 @@ class Preprocessor:
return False return False
def _filter_recharge_actions(self, legal_action): def _filter_recharge_actions(self, legal_action):
"""Restrict low-battery actions to moves that approach the charger.""" """Restrict recharge-mode actions to safe moves toward the charger range."""
if not self.has_charger: if not self.has_charger:
return list(legal_action) return list(legal_action)
hx, hz = self.cur_pos hx, hz = self.cur_pos
current_dist = max(abs(self.nearest_charger_dx), abs(self.nearest_charger_dz)) current_dist = self._min_charger_range_dist(hx, hz)
scored = [] scored = []
for action, (dx, dz) in enumerate(self.ACTION_DIRS): for action, (dx, dz) in enumerate(self.ACTION_DIRS):
if legal_action[action] <= 0: if legal_action[action] <= 0:
continue continue
next_dx = self.nearest_charger_dx - dx nx, nz = hx + dx, hz + dz
next_dz = self.nearest_charger_dz - dz next_dist = self._min_charger_range_dist(nx, nz)
next_dist = max(abs(next_dx), abs(next_dz))
improvement = current_dist - next_dist
alignment = dx * self.nearest_charger_dx + dz * self.nearest_charger_dz alignment = dx * self.nearest_charger_dx + dz * self.nearest_charger_dz
scored.append((improvement, alignment, action)) scored.append((next_dist, alignment, action))
if not scored: if not scored:
return list(legal_action) return list(legal_action)
best_improvement = max(item[0] for item in scored) # When already inside the charger range, stay inside until recharge mode exits.
# 已经在充电区域内时,回充模式退出前不要离开充电区域。
if current_dist <= 0.0:
stay = [0] * 8
for next_dist, _, action in scored:
if next_dist <= 0.0:
stay[action] = 1
if any(stay):
return stay
best_next_dist = min(item[0] for item in scored)
best_alignment = max(alignment for next_dist, alignment, _ in scored if next_dist <= best_next_dist + 0.1)
recharge = [0] * 8 recharge = [0] * 8
if best_improvement > 0: for next_dist, alignment, action in scored:
for improvement, _, action in scored: if next_dist <= best_next_dist + 0.1 and alignment >= best_alignment - 0.1:
if improvement >= best_improvement - 0.1: recharge[action] = 1
recharge[action] = 1
else: if not any(recharge):
best_alignment = max(item[1] for item in scored) best_alignment = max(item[1] for item in scored)
for _, alignment, action in scored: for _, alignment, action in scored:
if alignment >= best_alignment: if alignment >= best_alignment - 0.1:
recharge[action] = 1 recharge[action] = 1
return recharge if any(recharge) else list(legal_action) return recharge if any(recharge) else list(legal_action)