优化PPO充电与避障策略

扩展观测特征到157维，加入充电桩、NPC、电量安全余量、地图统计和本步清扫信息。增加低电量回充动作过滤、NPC危险区过滤，并调整奖励和终局日志以突出充电、避障和真实清扫得分。同时为适配更高维的观测输入，将模型共享骨干网络的隐藏层宽度由128/64加宽到256/128，策略头与价值头的输入维度相应由64改为128。
2026-04-26 14:14:18 +08:00
parent eb3efa4df7
commit efbc612945
6 changed files with 441 additions and 52 deletions
--- a/agent_ppo/model/model.py
+++ b/agent_ppo/model/model.py
@@ -38,22 +38,22 @@ class Model(nn.Module):
        self.model_name = "robot_vacuum"
        self.device = device

-        obs_dim = Config.DIM_OF_OBSERVATION  # 69
+        obs_dim = Config.DIM_OF_OBSERVATION  # 157
        act_num = Config.ACTION_NUM  # 8

        # Shared backbone / 共享骨干网络
        self.backbone = nn.Sequential(
-            _make_fc(obs_dim, 128),
+            _make_fc(obs_dim, 256),
            nn.ReLU(),
-            _make_fc(128, 64),
+            _make_fc(256, 128),
            nn.ReLU(),
        )

        # Actor head: outputs action logits / 策略头：输出动作 logits
-        self.actor_head = _make_fc(64, act_num, gain=0.01)
+        self.actor_head = _make_fc(128, act_num, gain=0.01)

        # Critic head: outputs single state value / 价值头：输出单个状态价值
-        self.critic_head = _make_fc(64, 1, gain=0.01)
+        self.critic_head = _make_fc(128, 1, gain=0.01)

    def forward(self, s, inference=False):
        """Forward pass.