调整 PPO 奖励以突出有效充电
This commit is contained in:
@@ -527,8 +527,8 @@ class Preprocessor:
|
|||||||
prev_low_risk = max(0.0, self.recharge_low_battery_ratio - prev_battery_ratio)
|
prev_low_risk = max(0.0, self.recharge_low_battery_ratio - prev_battery_ratio)
|
||||||
prev_low_risk /= max(self.recharge_low_battery_ratio, 1e-6)
|
prev_low_risk /= max(self.recharge_low_battery_ratio, 1e-6)
|
||||||
risk = max(self._recharge_risk_score(), prev_low_risk)
|
risk = max(self._recharge_risk_score(), prev_low_risk)
|
||||||
mode_bonus = 0.25 if self.was_recharge_mode or self.prev_low_battery else 0.0
|
mode_bonus = 0.4 if self.was_recharge_mode or self.prev_low_battery else 0.0
|
||||||
return float(np.clip(1.2 + 1.1 * risk + mode_bonus, 1.2, 2.6))
|
return float(np.clip(2.0 + 1.8 * risk + mode_bonus, 2.0, 4.2))
|
||||||
|
|
||||||
def battery_fail_penalty(self):
|
def battery_fail_penalty(self):
|
||||||
"""Adaptive terminal penalty for running out of battery before max steps."""
|
"""Adaptive terminal penalty for running out of battery before max steps."""
|
||||||
@@ -951,13 +951,6 @@ class Preprocessor:
|
|||||||
# Step penalty / 时间惩罚
|
# Step penalty / 时间惩罚
|
||||||
step_penalty = -0.002
|
step_penalty = -0.002
|
||||||
|
|
||||||
# Dense guidance: prefer moving toward visible dirt.
|
|
||||||
# 稠密引导:鼓励向视野内污渍靠近。
|
|
||||||
approach_reward = 0.0
|
|
||||||
if not self.recharge_mode and (self.last_nearest_dirt_dist < 200.0 or self.nearest_dirt_dist < 200.0):
|
|
||||||
dist_delta = float(np.clip(self.last_nearest_dirt_dist - self.nearest_dirt_dist, -5.0, 5.0))
|
|
||||||
approach_reward = 0.01 * dist_delta if dist_delta > 0 else 0.006 * dist_delta
|
|
||||||
|
|
||||||
# Recharge guidance only activates when battery safety is the bottleneck.
|
# Recharge guidance only activates when battery safety is the bottleneck.
|
||||||
# 仅在低电量/回充模式下引导靠近充电桩,避免高电量蹲充电桩。
|
# 仅在低电量/回充模式下引导靠近充电桩,避免高电量蹲充电桩。
|
||||||
charge_reward = 0.0
|
charge_reward = 0.0
|
||||||
@@ -1015,7 +1008,6 @@ class Preprocessor:
|
|||||||
|
|
||||||
return (
|
return (
|
||||||
cleaning_reward
|
cleaning_reward
|
||||||
+ approach_reward
|
|
||||||
+ charge_reward
|
+ charge_reward
|
||||||
+ exploration_reward
|
+ exploration_reward
|
||||||
+ stuck_penalty
|
+ stuck_penalty
|
||||||
|
|||||||
Reference in New Issue
Block a user