Initial robot vacuum code

2026-04-26 12:38:39 +08:00
commit ca6234c941
38 changed files with 1673 additions and 0 deletions
--- a/agent_ppo/conf/init.py
+++ b/agent_ppo/conf/init.py
--- a/agent_ppo/conf/conf.py
+++ b/agent_ppo/conf/conf.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+###########################################################################
+# Copyright © 1998 - 2026 Tencent. All Rights Reserved.
+###########################################################################
+"""
+Author: Tencent AI Arena Authors
+
+Configuration for Robot Vacuum PPO agent.
+清扫大作战 PPO 配置。
+"""
+
+
+class Config:
+    """Static configuration constants for the Robot Vacuum PPO agent."""
+    # Feature dimensions: three groups of 7 * 7 = 49, 12 and 8 values,
+    # i.e. 69 values in total (see FEATURE_LEN / DIM_OF_OBSERVATION below).
+    FEATURES = [
+        7 * 7,
+        12,
+        8,
+    ]
+    FEATURE_SPLIT_SHAPE = FEATURES  # alias of the same list object, not a copy
+    FEATURE_LEN = sum(FEATURES)
+    DIM_OF_OBSERVATION = FEATURE_LEN
+
+    # Action space: 8 directional moves.
+    # LABEL_SIZE_LIST and LEGAL_ACTION_SIZE_LIST below are derived from this value.
+    ACTION_NUM = 8
+
+    # Single-head value.
+    # NOTE(review): presumably the width of the value output - confirm against the model.
+    VALUE_NUM = 1
+
+    # PPO hyperparameters.
+    # NOTE(review): GAMMA / LAMDA look like the discount factor and GAE lambda - confirm in the learner.
+    GAMMA = 0.99
+    LAMDA = 0.95
+
+    INIT_LEARNING_RATE_START = 0.0003
+    BETA_START = 0.001
+    CLIP_PARAM = 0.2
+    VF_COEF = 0.5
+
+    LABEL_SIZE_LIST = [ACTION_NUM]
+    LEGAL_ACTION_SIZE_LIST = LABEL_SIZE_LIST.copy()  # independent copy, unlike FEATURE_SPLIT_SHAPE
+
+    USE_GRAD_CLIP = True
+    GRAD_CLIP_RANGE = 0.5
--- a/agent_ppo/conf/monitor_builder.py
+++ b/agent_ppo/conf/monitor_builder.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+###########################################################################
+# Copyright © 1998 - 2026 Tencent. All Rights Reserved.
+###########################################################################
+"""
+Author: Tencent AI Arena Authors
+
+Monitor panel configuration builder for Robot Vacuum.
+清扫大作战监控面板配置构建器。
+"""
+
+
+from kaiwudrl.common.monitor.monitor_config_builder import MonitorConfigBuilder
+
+
+def build_monitor():
+    """
+    Build the monitoring panel configuration for the custom training metrics.
+    Returns the result of MonitorConfigBuilder.build() for the single group and five panels chained below.
+    """
+    monitor = MonitorConfigBuilder()
+
+    config_dict = (
+        monitor.title("清扫大作战")  # dashboard title (Chinese name used for the Robot Vacuum task)
+        .add_group(
+            group_name="算法指标",  # Chinese display name; English counterpart is group_name_en
+            group_name_en="algorithm",
+        )
+        .add_panel(
+            name="累积回报",  # Chinese display name for the "reward" panel
+            name_en="reward",
+            type="line",
+        )
+        .add_metric(
+            metrics_name="reward",
+            expr="avg(reward{})",  # every panel below follows the same avg(<metric>{}) pattern
+        )
+        .end_panel()
+        .add_panel(
+            name="总损失",  # Chinese display name for the "total_loss" panel
+            name_en="total_loss",
+            type="line",
+        )
+        .add_metric(
+            metrics_name="total_loss",
+            expr="avg(total_loss{})",
+        )
+        .end_panel()
+        .add_panel(
+            name="价值损失",  # Chinese display name for the "value_loss" panel
+            name_en="value_loss",
+            type="line",
+        )
+        .add_metric(
+            metrics_name="value_loss",
+            expr="avg(value_loss{})",
+        )
+        .end_panel()
+        .add_panel(
+            name="策略损失",  # Chinese display name for the "policy_loss" panel
+            name_en="policy_loss",
+            type="line",
+        )
+        .add_metric(
+            metrics_name="policy_loss",
+            expr="avg(policy_loss{})",
+        )
+        .end_panel()
+        .add_panel(
+            name="熵损失",  # Chinese display name for the "entropy_loss" panel
+            name_en="entropy_loss",
+            type="line",
+        )
+        .add_metric(
+            metrics_name="entropy_loss",
+            expr="avg(entropy_loss{})",
+        )
+        .end_panel()
+        .end_group()
+        .build()
+    )
+    return config_dict
--- a/agent_ppo/conf/train_env_conf.toml
+++ b/agent_ppo/conf/train_env_conf.toml
@@ -0,0 +1,26 @@
+[env_conf]
+# Maps used for training. Customize by keeping only desired map IDs, e.g. [1, 2] for maps 1 and 2.
+# 训练使用的地图。可自定义选择期望用来训练的地图，如只期望使用1、2号地图训练数组内仅保留[1,2]即可。
+map = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+# Whether to select maps randomly. Boolean.
+# true = randomly pick one of the configured maps per episode, false = use the configured maps in order.
+# 是否随机抽取地图。布尔值。true表示每局从配置的地图中随机抽取一张，false表示按顺序抽取地图训练。
+map_random = false
+
+# Number of official robots. Range: 1~4 (integer).
+# Each round, the configured number of official robots is spawned at random positions on the road.
+# 官方机器人数量。可配置范围为1～4（整数）。每局将按照配置数量在道路上随机生成官方机器人。
+robot_count = 4
+
+# Number of chargers. Range: 1~4 (integer). When fewer than 4, that many spawn points are chosen at random from each map's available charger locations.
+# 充电桩数量。可配置范围为1～4（整数）。当配置小于4时，将从每张地图可生成充电桩的点位随机选择对应数量的点位生成。
+charger_count = 4
+
+# Maximum steps. The task ends once the number of prediction steps in a single round reaches this maximum. Range: 1~2000.
+# 最大步数。单局任务预测步数达到最大步数时，任务结束。可配置范围为1～2000。
+max_step = 1000
+
+# Maximum battery. The battery level when fully charged. Range: 100~999.
+# 最大电量。满电状态下的电量。可配置范围100～999。
+battery_max = 200