Initial robot vacuum code

2026-04-26 12:38:39 +08:00
commit ca6234c941
38 changed files with 1673 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,18 @@
 .DS_Store
 __pycache__/
 *.py[cod]
 *.pyo
 # Local training outputs and checkpoints
 ckpt/
 *.ckpt
 *.pkl
 # Runtime logs and temporary files
 logs/
 *.log
 tmp/
 temp/
 # IDE/editor local state
 .idea/
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -0,0 +1,17 @@
 {
  // Use IntelliSense to learn about possible attributes.
  // Hover to view descriptions of existing attributes.
  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
  "version": "0.2.0",
  "configurations": [
    {
      "name": "TestTrain",
      "type": "python",
      "request": "launch",
      "program": "${workspaceFolder}/train_test.py",
      "console": "integratedTerminal",
      "subProcess": true,
      "justMyCode": true
    }
  ]
 }
--- a/agent_diy/init.py
+++ b/agent_diy/init.py
--- a/agent_diy/agent.py
+++ b/agent_diy/agent.py
@@ -0,0 +1,96 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Robot Vacuum DIY Agent class based on kaiwudrl BaseAgent interface.
 清扫大作战 DIY Agent 主类，基于 kaiwudrl BaseAgent 接口。
 """
 import torch
 from kaiwudrl.interface.agent import BaseAgent
 from agent_diy.model.model import Model
 from agent_diy.conf.conf import Config
 class Agent(BaseAgent):
    """Skeleton DIY agent for Robot Vacuum.

    Only the constructor does any work (it forwards to ``BaseAgent``); every
    other hook is an unimplemented placeholder that returns ``None``.
    """

    def __init__(self, agent_type="player", device=None, logger=None, monitor=None):
        """Initialize the agent by forwarding all arguments to ``BaseAgent``."""
        super().__init__(agent_type, device, logger, monitor)

    def predict(self, list_obs_data):
        """Predict actions from observation data (not implemented yet)."""

    def exploit(self, list_obs_data):
        """Evaluation-mode (greedy) inference (not implemented yet)."""

    def learn(self, list_sample_data):
        """Train the model on a list of samples (not implemented yet)."""

    def save_model(self, path=None, id="1"):
        """Save a model checkpoint (not implemented yet)."""

    def load_model(self, path=None, id="1"):
        """Load a model checkpoint (not implemented yet)."""

    def observation_process(self, obs, preprocessor, extra_info=None):
        """Feature-processing hook (not implemented yet).

        Intended responsibilities:
            - Parse the information contained in the raw data
            - Parse the preprocessed feature data
            - Process the features and return the processed feature vector
            - Concatenate features
            - Annotate legal actions

        Inputs:
            - obs: Local observation information returned by the environment
            - preprocessor: Preprocessor
            - extra_info: Global information returned by the environment

        Intended outputs:
            - ObsData: Observation data for model inference
            - remain_info: Other data for reward calculation
        """

    def action_process(self, act_data):
        """Convert model output into an environment action (not implemented yet)."""
--- a/agent_diy/algorithm/init.py
+++ b/agent_diy/algorithm/init.py
--- a/agent_diy/algorithm/algorithm.py
+++ b/agent_diy/algorithm/algorithm.py
@@ -0,0 +1,32 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Robot Vacuum DIY algorithm implementation.
 清扫大作战 DIY 算法实现。
 """
 class Algorithm:
    """Skeleton DIY algorithm class; both methods are unimplemented placeholders."""

    def __init__(self, model, optimizer, scheduler, device=None, logger=None, monitor=None):
        """Initialize the algorithm (placeholder: no argument is stored)."""

    def learn(self, list_sample_data):
        """Training entry point (placeholder: returns None)."""
--- a/agent_diy/conf/init.py
+++ b/agent_diy/conf/init.py
--- a/agent_diy/conf/conf.py
+++ b/agent_diy/conf/conf.py
@@ -0,0 +1,43 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 """
 import numpy as np
 # Configuration, including dimension settings and algorithm parameter settings.
 # 配置，包含维度设置，算法参数设置
 class Config:
    """Dimension settings and algorithm hyper-parameters for the DIY agent."""

    # Toggle for CNN networks; the image view collapses to size 0 when disabled.
    USE_CNN = False
    VIEW_SIZE = 50 if USE_CNN else 0

    # Shapes of the vector feature, the 4-channel square image feature,
    # the action output and the value output.
    FEATURE_VECTOR_SHAPE = (153,)
    FEATURE_IMAGE_SHAPE = (4,) + (VIEW_SIZE + 1,) * 2
    ACTION_SHAPE = (8,)
    VALUE_SHAPE = (1,)

    # Return discount factor used in RL.
    GAMMA = 0.95

    # Initial learning rate.
    START_LR = 0.0005

    # Weight of the value-function loss.
    VALUE_LOSS_COEFF = 0.5

    # Weight of the entropy regularization term.
    ENTROPY_LOSS_COEFF = 0.025
--- a/agent_diy/conf/monitor_builder.py
+++ b/agent_diy/conf/monitor_builder.py
@@ -0,0 +1,83 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Monitor panel configuration builder for Robot Vacuum.
 清扫大作战监控面板配置构建器。
 """
 from kaiwudrl.common.monitor.monitor_config_builder import MonitorConfigBuilder
 def build_monitor():
    """Create the monitoring panel configuration for the custom indicators.

    One "algorithm" group is built containing five line panels, each backed by
    a single metric whose expression averages the metric of the same name.
    Returns whatever the builder's ``build()`` call produces.
    """
    # (display name, English name); the English name doubles as the metric name.
    panel_specs = (
        ("累积回报", "reward"),
        ("总损失", "total_loss"),
        ("价值损失", "value_loss"),
        ("策略损失", "policy_loss"),
        ("熵损失", "entropy_loss"),
    )
    chain = MonitorConfigBuilder().title("扫地机器人").add_group(
        group_name="算法指标",
        group_name_en="algorithm",
    )
    for display_name, metric in panel_specs:
        chain = (
            chain.add_panel(name=display_name, name_en=metric, type="line")
            .add_metric(metrics_name=metric, expr=f"avg({metric}{{}})")
            .end_panel()
        )
    return chain.end_group().build()
--- a/agent_diy/conf/train_env_conf.toml
+++ b/agent_diy/conf/train_env_conf.toml
@@ -0,0 +1,26 @@
 [env_conf]
 # Maps used for training. Customize by keeping only desired map IDs, e.g. [1, 2] for maps 1 and 2.
 # 训练使用的地图。可自定义选择期望用来训练的地图，如只期望使用1、2号地图训练数组内仅保留[1,2]即可。
 map = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 # Whether to randomly select maps. Boolean. 
 # true = randomly pick one from configured maps per episode, false = used sequentially.
 # 是否随机抽取地图。布尔值。true表示每局从配置的地图中随机抽取一张，false表示按顺序抽取地图训练。
 map_random = false
 # Number of official robots. Range: 1~4 (integer). 
 # In each round, the configured number of official robots is randomly generated on the roads.
 # 官方机器人数量。可配置范围为1～4（整数）。每局将按照配置数量在道路上随机生成官方机器人。
 robot_count = 4
 # Number of chargers. Range: 1~4 (integer). When less than 4, spawn points are randomly chosen.
 # 充电桩数量。可配置范围为1～4（整数）。当配置小于4时，将从每张地图可生成充电桩的点位随机选择对应数量的点位生成。
 charger_count = 4
 # Maximum steps. The task ends when the predicted steps in a single round reach the maximum. Range: 1~2000.
 # 最大步数。单局任务预测步数达到最大步数时，任务结束。可配置范围为1～2000。
 max_step = 1000
 # Maximum battery. The battery level when fully charged. Range: 100~999.
 # 最大电量。满电状态下的电量。可配置范围100～999。
 battery_max = 200
--- a/agent_diy/feature/init.py
+++ b/agent_diy/feature/init.py
--- a/agent_diy/feature/definition.py
+++ b/agent_diy/feature/definition.py
@@ -0,0 +1,59 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 """
 from common_python.utils.common_func import create_cls
 import numpy as np
 from agent_diy.conf.conf import Config
 # The create_cls function is used to dynamically create a class. The first parameter of the function is the type name,
 # and the remaining parameters are the attributes of the class, which should have a default value of None.
 # Observation container: a feature payload plus a legal-action payload.
 ObsData = create_cls(
    "ObsData",
    feature=None,
    legal_act=None,
 )
 # Action container holding the chosen action.
 ActData = create_cls(
    "ActData",
    act=None,
 )
 # SampleData is used to transfer training samples between aisrv and learner.
 # NOTE(review): the integer defaults below look like per-field dimensions rather than
 # values — confirm against create_cls / the framework documentation.
 SampleData = create_cls(
    "SampleData",
    obs=153,  # Observation dimension
    legal_actions=8,  # Legal action dimension
    actions=1,  # Action dimension
    probs=8,  # Action probability distribution dimension
    rewards=1,  # Reward
    advantages=1,  # Advantage function
    values=1,  # Value function
    dones=1,  # Whether terminated
 )
 def reward_shaping(frame_no, score, terminated, truncated, remain_info, _remain_info, obs, _obs):
    """Reward shaping hook (placeholder: ignores every argument and returns None)."""
 def sample_process(list_game_data):
    """Sample processing hook (placeholder: ignores its input and returns None)."""
--- a/agent_diy/model/init.py
+++ b/agent_diy/model/init.py
--- a/agent_diy/model/model.py
+++ b/agent_diy/model/model.py
@@ -0,0 +1,34 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Robot Vacuum DIY model implementation.
 清扫大作战 DIY 模型实现。
 """
 import torch
 import numpy as np
 from torch import nn
 import torch.nn.functional as F
 class Model(nn.Module):
    """Skeleton DIY model; no layers are defined yet."""

    def __init__(self, state_shape, action_shape=0, softmax=False):
        """Initialize the underlying ``nn.Module``; the arguments are currently unused."""
        super().__init__()
        # Define the user-specific network layers here.
--- a/agent_diy/workflow/init.py
+++ b/agent_diy/workflow/init.py
--- a/agent_diy/workflow/train_workflow.py
+++ b/agent_diy/workflow/train_workflow.py
@@ -0,0 +1,43 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 """
 import time
 from common_python.utils.common_func import Frame
 from agent_diy.feature.definition import (
    sample_process,
    reward_shaping,
 )
 from tools.train_env_conf_validate import read_usr_conf
 from tools.metrics_utils import get_training_metrics
 from common_python.utils.workflow_disaster_recovery import handle_disaster_recovery
 def workflow(envs, agents, logger=None, monitor=None, *args, **kwargs):
    """DIY training workflow skeleton.

    Takes the first environment and agent, reads and validates the user
    configuration, and — when the configuration is usable — loads the model
    with id "latest" and then saves the model. Always returns None.
    """
    env = envs[0]
    agent = agents[0]

    # Read and validate the configuration file.
    usr_conf = read_usr_conf("agent_diy/conf/train_env_conf.toml", logger)
    if usr_conf is not None:
        # Write your DIY training process here.
        # At the start of each game the latest model file can be loaded
        # (per the original note, this pulls it from the remote training node).
        agent.load_model(id="latest")
        # Save the model.
        agent.save_model()
        return None

    logger.error(f"usr_conf is None, please check agent_diy/conf/train_env_conf.toml")
    return None
--- a/agent_ppo/init.py
+++ b/agent_ppo/init.py
--- a/agent_ppo/agent.py
+++ b/agent_ppo/agent.py
@@ -0,0 +1,175 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Robot Vacuum Agent.
 清扫大作战 Agent 主类。
 """
 import torch
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 import numpy as np
 from agent_ppo.algorithm.algorithm import Algorithm
 from agent_ppo.conf.conf import Config
 from agent_ppo.feature.definition import ActData, ObsData
 from agent_ppo.feature.preprocessor import Preprocessor
 from agent_ppo.model.model import Model
 from kaiwudrl.interface.agent import BaseAgent
 class Agent(BaseAgent):
    """PPO agent for Robot Vacuum.

    Owns the model, the Adam optimizer, the PPO ``Algorithm`` and a
    ``Preprocessor``; converts raw observations into ``ObsData``, runs
    inference, and delegates training to the algorithm.
    """

    def __init__(self, agent_type="player", device=None, logger=None, monitor=None):
        """Build model, optimizer, algorithm and preprocessor, then init ``BaseAgent``.

        The torch RNG is seeded with 0 before the model is created.
        """
        torch.manual_seed(0)
        self.device = device
        self.model = Model(device).to(self.device)
        self.optimizer = torch.optim.Adam(
            params=self.model.parameters(),
            lr=Config.INIT_LEARNING_RATE_START,
            betas=(0.9, 0.999),
            eps=1e-8,
        )
        self.logger = logger
        self.monitor = monitor
        self.algorithm = Algorithm(self.model, self.optimizer, self.device, self.logger, self.monitor)
        self.preprocessor = Preprocessor()
        self.last_action = -1
        self.last_reward = 0.0
        super().__init__(agent_type, device, logger, monitor)

    def reset(self, env_obs):
        """Reset per-episode state: fresh preprocessor, no last action, zero last reward.

        ``env_obs`` is accepted but not used.
        """
        self.preprocessor = Preprocessor()
        self.last_action = -1
        self.last_reward = 0.0

    def observation_process(self, env_obs):
        """Convert raw ``env_obs`` into ``ObsData`` (feature vector + legal action mask).

        The reward returned by the preprocessor is cached in ``self.last_reward``.

        Returns:
            (obs_data, remain_info) where ``remain_info`` is an empty dict.
        """
        feature, legal_action, reward = self.preprocessor.feature_process(env_obs, self.last_action)
        self.last_reward = reward
        obs_data = ObsData(
            feature=list(feature),
            legal_action=legal_action,
        )
        remain_info = {}
        return obs_data, remain_info

    def action_process(self, act_data, is_stochastic=True):
        """Extract the int action from ``ActData`` and remember it as ``last_action``.

        Uses the sampled action when ``is_stochastic`` is true, otherwise the
        greedy ``d_action``.
        """
        action = act_data.action if is_stochastic else act_data.d_action
        self.last_action = int(action[0])
        return self.last_action

    def predict(self, list_obs_data):
        """Run inference on the first observation and return a one-element ``ActData`` list.

        The result carries both a sampled action and the greedy (argmax) action,
        together with the masked action probabilities and the value estimate.
        Only ``list_obs_data[0]`` is used.
        """
        obs_data = list_obs_data[0]
        feature = obs_data.feature
        legal_action = obs_data.legal_action
        logits, value = self._run_model(feature)
        legal_arr = np.array(legal_action, dtype=np.float32)
        prob = self._legal_soft_max(logits, legal_arr)
        action = self._legal_sample(prob, use_max=False)
        d_action = self._legal_sample(prob, use_max=True)
        return [
            ActData(
                action=[action],
                d_action=[d_action],
                prob=list(prob),
                value=value,
            )
        ]

    def exploit(self, env_obs):
        """Greedy inference for evaluation: process the observation, predict, return ``d_action``."""
        obs_data, _ = self.observation_process(env_obs)
        act_data = self.predict([obs_data])[0]
        return self.action_process(act_data, is_stochastic=False)

    def learn(self, list_sample_data):
        """Delegate the training step to ``Algorithm.learn`` and return its result."""
        return self.algorithm.learn(list_sample_data)

    def save_model(self, path=None, id="1"):
        """Save the model state dict (copied to CPU) to ``{path}/model.ckpt-{id}.pkl``."""
        model_file_path = f"{path}/model.ckpt-{id}.pkl"
        state_dict_cpu = {k: v.clone().cpu() for k, v in self.model.state_dict().items()}
        torch.save(state_dict_cpu, model_file_path)
        # logger defaults to None in the constructor, so only log when one was supplied.
        if self.logger is not None:
            self.logger.info(f"save model {model_file_path} successfully")

    def load_model(self, path=None, id="1"):
        """Load the model state dict from ``{path}/model.ckpt-{id}.pkl`` onto ``self.device``."""
        model_file_path = f"{path}/model.ckpt-{id}.pkl"
        # SECURITY NOTE: torch.load unpickles the file; only load checkpoints from a
        # trusted source (consider weights_only=True where the torch version supports it).
        self.model.load_state_dict(torch.load(model_file_path, map_location=self.device))
        # logger defaults to None in the constructor, so only log when one was supplied.
        if self.logger is not None:
            self.logger.info(f"load model {model_file_path} successfully")

    def _run_model(self, feature):
        """Gradient-free forward pass; returns ``(logits_np, value_np)`` for one observation."""
        self.model.set_eval_mode()
        obs_tensor = (
            torch.tensor(np.array([feature], dtype=np.float32)).view(1, Config.DIM_OF_OBSERVATION).to(self.device)
        )
        with torch.no_grad():
            rst = self.model(obs_tensor, inference=True)
        logits = rst[0].cpu().numpy()[0]
        value = rst[1].cpu().numpy()[0]
        return logits, value

    def _legal_soft_max(self, logits, legal_action):
        """Softmax over ``logits`` restricted to actions whose mask entry is 1.

        Illegal actions receive probability 0. The result is divided by
        ``sum * 1.00001`` so it sums to slightly less than 1.

        If the mask marks no action as legal, every action is treated as legal;
        without this fallback the normalisation divides by zero and yields NaN
        probabilities.
        """
        _w, _e = 1e20, 1e-5
        legal_action = np.asarray(legal_action)
        if not legal_action.any():
            # Degenerate mask: fall back to an unmasked softmax instead of NaNs.
            legal_action = np.ones_like(legal_action)
        # Push illegal logits far down so they cannot become the maximum.
        tmp = logits - _w * (1.0 - legal_action)
        tmp_max = np.max(tmp, keepdims=True)
        tmp = np.clip(tmp - tmp_max, -_w, 1)
        # The epsilon keeps every legal action at a strictly positive probability.
        tmp = (np.exp(tmp) + _e) * legal_action
        return tmp / (np.sum(tmp, keepdims=True) * 1.00001)

    def _legal_sample(self, probs, use_max=False):
        """Return the argmax of ``probs`` if ``use_max``, else one multinomial draw from it."""
        if use_max:
            return int(np.argmax(probs))
        return int(np.argmax(np.random.multinomial(1, probs, size=1)))
--- a/agent_ppo/algorithm/init.py
+++ b/agent_ppo/algorithm/init.py
--- a/agent_ppo/algorithm/algorithm.py
+++ b/agent_ppo/algorithm/algorithm.py
@@ -0,0 +1,161 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Standard PPO algorithm for Robot Vacuum.
 清扫大作战 PPO 算法。
 Loss composition / 损失组成：
  total_loss = vf_coef * value_loss + policy_loss - beta * entropy_loss
 """
 import os
 import time
 import torch
 from agent_ppo.conf.conf import Config
 class Algorithm:
    """PPO update step: clipped policy loss, clipped value loss and an entropy bonus."""

    def __init__(self, model, optimizer, device=None, logger=None, monitor=None):
        """Store collaborators and read the PPO hyper-parameters from ``Config``."""
        self.model = model
        self.optimizer = optimizer
        # Flat list of every parameter the optimizer manages (used for grad clipping).
        self.parameters = [p for pg in optimizer.param_groups for p in pg["params"]]
        self.device = device
        self.logger = logger
        self.monitor = monitor
        self.clip_param = Config.CLIP_PARAM
        self.vf_coef = Config.VF_COEF
        self.var_beta = Config.BETA_START
        self.label_size = Config.ACTION_NUM
        self.train_step = 0
        self.last_report_time = 0

    def learn(self, list_sample_data):
        """Perform one PPO gradient step on a batch of SampleData.

        Returns:
            dict with ``total_loss``; at most once every 60 seconds it also
            contains ``value_loss``, ``policy_loss``, ``entropy_loss`` and the
            batch-mean ``reward``, and is logged / pushed to the monitor when
            those were supplied.
        """
        obs = torch.stack([s.obs for s in list_sample_data]).to(self.device)
        legal_action = torch.stack([s.legal_action for s in list_sample_data]).to(self.device)
        act = torch.stack([s.act for s in list_sample_data]).to(self.device).view(-1, 1)
        old_prob = torch.stack([s.prob for s in list_sample_data]).to(self.device)
        old_value = torch.stack([s.value for s in list_sample_data]).to(self.device)
        reward_sum = torch.stack([s.reward_sum for s in list_sample_data]).to(self.device)
        advantage = torch.stack([s.advantage for s in list_sample_data]).to(self.device)
        reward = torch.stack([s.reward for s in list_sample_data]).to(self.device)
        self.model.set_train_mode()
        self.optimizer.zero_grad()
        rst_list = self.model(obs)
        logits, value_pred = rst_list[0], rst_list[1]
        total_loss, info = self._compute_loss(
            logits=logits,
            value_pred=value_pred,
            legal_action=legal_action,
            old_action=act,
            old_prob=old_prob,
            old_value=old_value,
            reward_sum=reward_sum,
            advantage=advantage,
        )
        total_loss.backward()
        if Config.USE_GRAD_CLIP:
            torch.nn.utils.clip_grad_norm_(self.parameters, Config.GRAD_CLIP_RANGE)
        self.optimizer.step()
        self.train_step += 1
        results = {"total_loss": total_loss.item()}
        # Periodic monitoring report (at most once per 60 seconds).
        now = time.time()
        if now - self.last_report_time >= 60:
            results["value_loss"] = round(info["value_loss"], 4)
            results["policy_loss"] = round(info["policy_loss"], 4)
            results["entropy_loss"] = round(info["entropy_loss"], 4)
            results["reward"] = round(reward.mean().item(), 4)
            # logger defaults to None in the constructor; guard it like the monitor below.
            if self.logger is not None:
                self.logger.info(
                    f"policy_loss: {results['policy_loss']}, "
                    f"value_loss: {results['value_loss']}, "
                    f"entropy_loss: {results['entropy_loss']}"
                )
            if self.monitor:
                self.monitor.put_data({os.getpid(): results})
            self.last_report_time = now
        return results

    def _compute_loss(self, logits, value_pred, legal_action, old_action, old_prob, old_value, reward_sum, advantage):
        """Compute the PPO loss.

        total_loss = vf_coef * value_loss + policy_loss - beta * entropy_loss

        Returns:
            (total_loss tensor, dict of the three component losses as floats).
        """
        # Value loss (clipped): the new prediction may move at most clip_param
        # away from the old value; the larger of the two squared errors is used.
        tdret = reward_sum.squeeze(-1) if reward_sum.dim() > 1 else reward_sum
        vp = value_pred.squeeze(-1) if value_pred.dim() > 1 else value_pred
        ov = old_value.squeeze(-1) if old_value.dim() > 1 else old_value
        vp_clip = ov + (vp - ov).clamp(-self.clip_param, self.clip_param)
        value_loss = (
            0.5
            * torch.maximum(
                (tdret - vp) ** 2,
                (tdret - vp_clip) ** 2,
            ).mean()
        )
        # Policy loss (PPO clip).
        prob_dist = self._masked_softmax(logits, legal_action)
        # Despite its name this is the mean policy entropy (subtracted in the total loss).
        entropy_loss = (-(prob_dist * torch.log(prob_dist.clamp(1e-9, 1))).sum(1)).mean()
        one_hot = torch.nn.functional.one_hot(old_action[:, 0].long(), self.label_size).float()
        new_prob = (one_hot * prob_dist).sum(1, keepdim=True)
        old_action_prob = (one_hot * old_prob).sum(1, keepdim=True)
        # Clamp the denominator so a zero old probability cannot divide by zero.
        ratio = new_prob / old_action_prob.clamp(1e-9)
        adv = advantage.squeeze(-1) if advantage.dim() > 1 else advantage
        adv = adv.unsqueeze(-1)
        policy_loss = torch.maximum(
            -ratio * adv,
            -ratio.clamp(1 - self.clip_param, 1 + self.clip_param) * adv,
        ).mean()
        # Total loss.
        total_loss = self.vf_coef * value_loss + policy_loss - self.var_beta * entropy_loss
        return total_loss, {
            "value_loss": value_loss.item(),
            "policy_loss": policy_loss.item(),
            "entropy_loss": entropy_loss.item(),
        }

    def _masked_softmax(self, logits, legal_action):
        """Softmax over dim 1 after applying the legal-action mask to ``logits``.

        Masked-out entries are set to -1e5 before the softmax, so they end up
        with (numerically) zero probability.
        """
        label_max, _ = torch.max(logits * legal_action, dim=1, keepdim=True)
        logits = logits - label_max
        logits = logits * legal_action
        logits = logits + 1e5 * (legal_action - 1)
        return torch.nn.functional.softmax(logits, dim=1)
--- a/agent_ppo/conf/init.py
+++ b/agent_ppo/conf/init.py
--- a/agent_ppo/conf/conf.py
+++ b/agent_ppo/conf/conf.py
@@ -0,0 +1,49 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Configuration for Robot Vacuum PPO agent.
 清扫大作战 PPO 配置。
 """
 class Config:
    """Feature dimensions and PPO hyper-parameters for the Robot Vacuum agent."""

    # Sizes of the feature groups; they add up to the 69-dimensional observation.
    FEATURES = [
        7 * 7,
        12,
        8,
    ]
    FEATURE_SPLIT_SHAPE = FEATURES
    FEATURE_LEN = sum(FEATURES)
    DIM_OF_OBSERVATION = FEATURE_LEN

    # Action space: 8 directional moves.
    ACTION_NUM = 8

    # Single-head value output.
    VALUE_NUM = 1

    # PPO hyper-parameters.
    GAMMA = 0.99
    LAMDA = 0.95
    INIT_LEARNING_RATE_START = 3e-4
    BETA_START = 1e-3
    CLIP_PARAM = 0.2
    VF_COEF = 0.5

    LABEL_SIZE_LIST = [ACTION_NUM]
    LEGAL_ACTION_SIZE_LIST = list(LABEL_SIZE_LIST)

    USE_GRAD_CLIP = True
    GRAD_CLIP_RANGE = 0.5
--- a/agent_ppo/conf/monitor_builder.py
+++ b/agent_ppo/conf/monitor_builder.py
@@ -0,0 +1,83 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Monitor panel configuration builder for Robot Vacuum.
 清扫大作战监控面板配置构建器。
 """
 from kaiwudrl.common.monitor.monitor_config_builder import MonitorConfigBuilder
 def build_monitor():
    """Create the monitoring panel configuration for the custom indicators.

    One "algorithm" group is built containing five line panels, each backed by
    a single metric whose expression averages the metric of the same name.
    Returns whatever the builder's ``build()`` call produces.
    """
    # (display name, English name); the English name doubles as the metric name.
    panel_specs = (
        ("累积回报", "reward"),
        ("总损失", "total_loss"),
        ("价值损失", "value_loss"),
        ("策略损失", "policy_loss"),
        ("熵损失", "entropy_loss"),
    )
    chain = MonitorConfigBuilder().title("清扫大作战").add_group(
        group_name="算法指标",
        group_name_en="algorithm",
    )
    for display_name, metric in panel_specs:
        chain = (
            chain.add_panel(name=display_name, name_en=metric, type="line")
            .add_metric(metrics_name=metric, expr=f"avg({metric}{{}})")
            .end_panel()
        )
    return chain.end_group().build()
--- a/agent_ppo/conf/train_env_conf.toml
+++ b/agent_ppo/conf/train_env_conf.toml
@@ -0,0 +1,26 @@
 [env_conf]
 # Maps used for training. Customize by keeping only desired map IDs, e.g. [1, 2] for maps 1 and 2.
 # 训练使用的地图。可自定义选择期望用来训练的地图，如只期望使用1、2号地图训练数组内仅保留[1,2]即可。
 map = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 # Whether to randomly select maps. Boolean. 
 # true = randomly pick one from configured maps per episode, false = used sequentially.
 # 是否随机抽取地图。布尔值。true表示每局从配置的地图中随机抽取一张，false表示按顺序抽取地图训练。
 map_random = false
 # Number of official robots. Range: 1~4 (integer). 
 # In each round, the configured number of official robots is randomly generated on the roads.
 # 官方机器人数量。可配置范围为1～4（整数）。每局将按照配置数量在道路上随机生成官方机器人。
 robot_count = 4
 # Number of chargers. Range: 1~4 (integer). When less than 4, spawn points are randomly chosen.
 # 充电桩数量。可配置范围为1～4（整数）。当配置小于4时，将从每张地图可生成充电桩的点位随机选择对应数量的点位生成。
 charger_count = 4
 # Maximum steps. The task ends when the predicted steps in a single round reach the maximum. Range: 1~2000.
 # 最大步数。单局任务预测步数达到最大步数时，任务结束。可配置范围为1～2000。
 max_step = 1000
 # Maximum battery. The battery level when fully charged. Range: 100~999.
 # 最大电量。满电状态下的电量。可配置范围100～999。
 battery_max = 200
--- a/agent_ppo/feature/init.py
+++ b/agent_ppo/feature/init.py
--- a/agent_ppo/feature/definition.py
+++ b/agent_ppo/feature/definition.py
@@ -0,0 +1,73 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Data definition and GAE computation for Robot Vacuum.
 清扫大作战数据类定义与 GAE 计算。
 """
 import numpy as np
 from common_python.utils.common_func import create_cls
 from agent_ppo.conf.conf import Config
 # ObsData: feature vector + legal action mask
 ObsData = create_cls("ObsData", feature=None, legal_action=None)
 # ActData: sampled action, greedy action, action probabilities, state value
 ActData = create_cls(
    "ActData",
    action=None,
    d_action=None,
    prob=None,
    value=None,
 )
 # SampleData: int values are treated as dimensions by the framework
 SampleData = create_cls(
    "SampleData",
    obs=Config.DIM_OF_OBSERVATION,  # 69D feature vector
    legal_action=Config.ACTION_NUM,  # 8D legal action mask
    act=1,  # index of the executed action
    reward=Config.VALUE_NUM,  # 1D reward
    reward_sum=Config.VALUE_NUM,  # GAE td-lambda return
    done=1,  # presumably the episode-termination flag — TODO confirm
    value=Config.VALUE_NUM,  # 1D value estimate
    next_value=Config.VALUE_NUM,  # value of the following sample (filled by sample_process)
    advantage=Config.VALUE_NUM,  # 1D GAE advantage
    prob=Config.ACTION_NUM,  # 8D action probabilities
 )
 def sample_process(list_sample_data):
    """Fill ``next_value`` on each sample and compute GAE in place.

    Every sample except the last receives the ``value`` of its successor as
    ``next_value``; the last sample's ``next_value`` is left as it was.
    Returns the same list object.
    """
    for current, following in zip(list_sample_data, list_sample_data[1:]):
        current.next_value = following.value
    _calc_gae(list_sample_data)
    return list_sample_data
 def _calc_gae(list_sample_data):
    """Compute advantage and return for every sample using GAE(lambda), in place.

    Walks the samples from last to first, accumulating the discounted TD
    errors; writes ``advantage`` and ``reward_sum`` (advantage + value) on
    each sample.
    """
    running_adv = 0.0
    for step in reversed(list_sample_data):
        td_error = step.reward - step.value + Config.GAMMA * step.next_value
        running_adv = running_adv * Config.GAMMA * Config.LAMDA + td_error
        step.advantage = running_adv
        step.reward_sum = running_adv + step.value
--- a/agent_ppo/feature/preprocessor.py
+++ b/agent_ppo/feature/preprocessor.py
@@ -0,0 +1,257 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Feature preprocessor for Robot Vacuum.
 清扫大作战特征预处理器。
 """
 import numpy as np
 def _norm(v, v_max, v_min=0.0):
    """Linearly rescale ``v`` from ``[v_min, v_max]`` onto ``[0, 1]``.
    ``v`` is clamped into the range before scaling, so the result never
    leaves ``[0, 1]``. A zero-width range (``v_max == v_min``) yields 0.0.
    """
    clamped = float(np.clip(v, v_min, v_max))
    if v_max != v_min:
        return (clamped - v_min) / (v_max - v_min)
    return 0.0
 class Preprocessor:
    """Turn raw Robot Vacuum observations into features, a legal-action mask and a reward.
    The instance is stateful: ``pb2struct`` caches the fields of the latest
    observation, and the feature/reward methods read that cache. Call
    ``reset`` at the start of every episode.
    Local view cells are read as 0 = obstacle, 1 = passable, 2 = dirt.
    """
    GRID_SIZE = 128
    VIEW_HALF = 10  # Radius of the full local view (21x21)
    LOCAL_HALF = 3  # Radius of the cropped view used as a feature (7x7)
    VIEW_SIZE = 2 * VIEW_HALF + 1  # Side length of the full local view
    NO_DIRT_DIST = 200.0  # Distance reported when no dirt is visible
    def __init__(self):
        self.reset()
    def reset(self):
        """Restore every per-episode field to its starting value."""
        self.step_no = 0
        self.battery = 600
        self.battery_max = 600
        self.cur_pos = (0, 0)
        self.dirt_cleaned = 0
        self.last_dirt_cleaned = 0
        self.total_dirt = 1
        # Global passability map (0 = obstacle, 1 = passable). It is refreshed
        # from every local view; nothing in this class reads it back.
        self.passable_map = np.ones((self.GRID_SIZE, self.GRID_SIZE), dtype=np.int8)
        # Distance to the nearest visible dirt, for this step and the previous one.
        self.nearest_dirt_dist = self.NO_DIRT_DIST
        self.last_nearest_dirt_dist = self.NO_DIRT_DIST
        self._view_map = np.zeros((self.VIEW_SIZE, self.VIEW_SIZE), dtype=np.float32)
        self._legal_act = [1] * 8
    def pb2struct(self, env_obs, last_action):
        """Parse the observation dict and cache the fields used later.
        Args:
            env_obs: Environment observation; must contain ``observation``
                with ``frame_state``, ``env_info`` and ``step_no``.
            last_action: Accepted for interface compatibility; not used here.
        """
        observation = env_obs["observation"]
        frame_state = observation["frame_state"]
        env_info = observation["env_info"]
        # NOTE(review): "heroes" is indexed as a single mapping, not a list - confirm.
        hero = frame_state["heroes"]
        self.step_no = int(observation["step_no"])
        self.cur_pos = (int(hero["pos"]["x"]), int(hero["pos"]["z"]))
        # Battery; the maximum is floored at 1 so it is safe as a divisor.
        self.battery = int(hero["battery"])
        self.battery_max = max(int(hero["battery_max"]), 1)
        # Cleaning progress; keep the previous count for the per-step reward.
        self.last_dirt_cleaned = self.dirt_cleaned
        self.dirt_cleaned = int(hero["dirt_cleaned"])
        self.total_dirt = max(int(env_info["total_dirt"]), 1)
        # Legal actions; a missing or empty mask means every action is legal.
        self._legal_act = [int(x) for x in (observation.get("legal_action") or [1] * 8)]
        # Local view map; the previous map is kept when none is supplied.
        map_info = observation.get("map_info")
        if map_info is not None:
            self._view_map = np.array(map_info, dtype=np.float32)
            hx, hz = self.cur_pos
            self._update_passable(hx, hz)
    def _update_passable(self, hx, hz):
        """Write the local view, centred on (hx, hz), into the global passable map.
        View index ``[ri, ci]`` maps to global cell ``(hx - half + ri,
        hz - half + ci)``. Cells outside the grid are skipped. A view value of
        0 is stored as 0 (obstacle); any other value is stored as 1.
        """
        view = self._view_map
        vsize = view.shape[0]
        half = vsize // 2
        # Global coordinates of view cell [0, 0].
        x0 = hx - half
        z0 = hz - half
        # Clip the view window to the part that lies inside the grid.
        r_lo = max(0, -x0)
        r_hi = min(vsize, self.GRID_SIZE - x0)
        c_lo = max(0, -z0)
        c_hi = min(vsize, self.GRID_SIZE - z0)
        if r_lo >= r_hi or c_lo >= c_hi:
            return
        # One slice assignment replaces the per-cell Python loop.
        self.passable_map[x0 + r_lo : x0 + r_hi, z0 + c_lo : z0 + c_hi] = view[r_lo:r_hi, c_lo:c_hi] != 0
    def _get_local_view_feature(self):
        """Return the centre 7x7 crop of the local view, scaled by 1/2 and flattened (49D)."""
        center = self.VIEW_HALF
        h = self.LOCAL_HALF
        crop = self._view_map[center - h : center + h + 1, center - h : center + h + 1]
        return (crop / 2.0).flatten()
    def _cast_dirt_ray(self, dx, dz, max_ray):
        """Return the number of steps to the first dirt cell along (dx, dz).
        The ray starts next to the robot and reads the local view only. It
        returns ``max_ray`` when it leaves the grid, leaves the local view, or
        travels ``max_ray`` steps without meeting dirt. Obstacles do not stop it.
        """
        hx, hz = self.cur_pos
        x, z = hx, hz
        for step in range(1, max_ray + 1):
            x += dx
            z += dz
            if not (0 <= x < self.GRID_SIZE and 0 <= z < self.GRID_SIZE):
                break
            vx = x - hx + self.VIEW_HALF
            vz = z - hz + self.VIEW_HALF
            if not (0 <= vx < self.VIEW_SIZE and 0 <= vz < self.VIEW_SIZE):
                # A straight ray never re-enters the view, so nothing further can be seen.
                break
            if int(self._view_map[vx, vz]) == 2:
                return step
        return max_ray
    def _get_global_state_feature(self):
        """Return the 12D global state feature as a float32 array.
        Calling this also updates ``nearest_dirt_dist`` and
        ``last_nearest_dirt_dist``.
        Dimensions:
          [0]  step_norm         step_no / 2000, clamped to [0, 1]
          [1]  battery_ratio     battery / battery_max
          [2]  cleaning_progress dirt_cleaned / total_dirt
          [3]  remaining_dirt    1 - cleaning_progress
          [4]  pos_x_norm        x / GRID_SIZE
          [5]  pos_z_norm        z / GRID_SIZE
          [6]  ray_N_dirt        steps to dirt towards z-, / 30
          [7]  ray_E_dirt        steps to dirt towards x+, / 30
          [8]  ray_S_dirt        steps to dirt towards z+, / 30
          [9]  ray_W_dirt        steps to dirt towards x-, / 30
          [10] nearest_dirt_norm nearest dirt Euclidean distance / 180
          [11] dirt_delta        1.0 if nearest dirt got closer than last call, else 0.0
        """
        step_norm = _norm(self.step_no, 2000)
        battery_ratio = _norm(self.battery, self.battery_max)
        cleaning_progress = _norm(self.dirt_cleaned, self.total_dirt)
        remaining_dirt = 1.0 - cleaning_progress
        hx, hz = self.cur_pos
        pos_x_norm = _norm(hx, self.GRID_SIZE)
        pos_z_norm = _norm(hz, self.GRID_SIZE)
        # Four axis-aligned rays (N, E, S, W). The view radius is VIEW_HALF, so
        # a ray either hits dirt within that radius or reports max_ray.
        ray_dirs = [(0, -1), (1, 0), (0, 1), (-1, 0)]
        max_ray = 30
        ray_dirt = [_norm(self._cast_dirt_ray(dx, dz, max_ray), max_ray) for dx, dz in ray_dirs]
        # Nearest dirt distance, measured over the full local view.
        self.last_nearest_dirt_dist = self.nearest_dirt_dist
        self.nearest_dirt_dist = self._calc_nearest_dirt_dist()
        nearest_dirt_norm = _norm(self.nearest_dirt_dist, 180)
        dirt_delta = 1.0 if self.nearest_dirt_dist < self.last_nearest_dirt_dist else 0.0
        return np.array(
            [
                step_norm,
                battery_ratio,
                cleaning_progress,
                remaining_dirt,
                pos_x_norm,
                pos_z_norm,
                ray_dirt[0],
                ray_dirt[1],
                ray_dirt[2],
                ray_dirt[3],
                nearest_dirt_norm,
                dirt_delta,
            ],
            dtype=np.float32,
        )
    def _calc_nearest_dirt_dist(self):
        """Return the Euclidean distance (in cells) from the view centre to the nearest dirt.
        The whole local view is searched. ``NO_DIRT_DIST`` is returned when the
        view contains no dirt cell.
        """
        dirt_coords = np.argwhere(self._view_map == 2)
        if len(dirt_coords) == 0:
            return self.NO_DIRT_DIST
        center = self.VIEW_HALF
        dists = np.sqrt((dirt_coords[:, 0] - center) ** 2 + (dirt_coords[:, 1] - center) ** 2)
        return float(np.min(dists))
    def get_legal_action(self):
        """Return a copy of the cached legal action mask as a list."""
        return list(self._legal_act)
    def feature_process(self, env_obs, last_action):
        """Build the feature vector, legal action mask and scalar reward for one observation.
        Returns:
            ``(feature, legal_action, reward)``. ``feature`` concatenates the
            49D local view, the 12D global state and the legal action mask.
        """
        self.pb2struct(env_obs, last_action)
        local_view = self._get_local_view_feature()  # 49D
        global_state = self._get_global_state_feature()  # 12D
        legal_action = self.get_legal_action()  # 8D
        legal_arr = np.array(legal_action, dtype=np.float32)
        feature = np.concatenate([local_view, global_state, legal_arr])  # 69D
        reward = self.reward_process()
        return feature, legal_action, reward
    def reward_process(self):
        """Return the per-step reward: 0.1 per newly cleaned dirt, minus a 0.001 step penalty."""
        # A decrease in the cleaned count is treated as no progress.
        cleaned_this_step = max(0, self.dirt_cleaned - self.last_dirt_cleaned)
        cleaning_reward = 0.1 * cleaned_this_step
        step_penalty = -0.001
        return cleaning_reward + step_penalty
--- a/agent_ppo/model/init.py
+++ b/agent_ppo/model/init.py
--- a/agent_ppo/model/model.py
+++ b/agent_ppo/model/model.py
@@ -0,0 +1,73 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Simple MLP policy network for Robot Vacuum.
 清扫大作战策略网络。
 """
 import torch
 import torch.nn as nn
 from agent_ppo.conf.conf import Config
 def _make_fc(in_dim, out_dim, gain=1.41421):
    """Build an ``nn.Linear`` with an orthogonally initialised weight and a zero bias.
    Args:
        in_dim: Number of input features.
        out_dim: Number of output features.
        gain: Scaling factor forwarded to ``nn.init.orthogonal_``.
    Returns:
        The initialised ``nn.Linear`` layer.
    """
    fc = nn.Linear(in_features=in_dim, out_features=out_dim)
    nn.init.zeros_(fc.bias)
    nn.init.orthogonal_(fc.weight, gain=gain)
    return fc
 class Model(nn.Module):
    """Shared-trunk MLP with an actor head and a critic head for Robot Vacuum."""
    def __init__(self, device=None):
        """Build the trunk (obs -> 128 -> 64, ReLU after each layer) and both heads.
        Args:
            device: Stored on the instance as ``self.device``; the constructor
                does not move any parameters.
        """
        super().__init__()
        self.model_name = "robot_vacuum"
        self.device = device
        # Shared trunk: the layers sit at Sequential indices 0..3.
        width = Config.DIM_OF_OBSERVATION
        trunk = []
        for hidden in (128, 64):
            trunk.append(_make_fc(width, hidden))
            trunk.append(nn.ReLU())
            width = hidden
        self.backbone = nn.Sequential(*trunk)
        # Actor head: one logit per action.
        self.actor_head = _make_fc(width, Config.ACTION_NUM, gain=0.01)
        # Critic head: a single state value.
        self.critic_head = _make_fc(width, 1, gain=0.01)
    def forward(self, s, inference=False):
        """Run the network on ``s`` and return ``[logits, value]``.
        ``s`` is cast to float32 first. ``inference`` is accepted but unused.
        """
        hidden = self.backbone(s.to(torch.float32))
        return [self.actor_head(hidden), self.critic_head(hidden)]
    def set_train_mode(self):
        """Put the module into training mode."""
        self.train()
    def set_eval_mode(self):
        """Put the module into evaluation mode."""
        self.eval()
--- a/agent_ppo/workflow/init.py
+++ b/agent_ppo/workflow/init.py
--- a/agent_ppo/workflow/train_workflow.py
+++ b/agent_ppo/workflow/train_workflow.py
@@ -0,0 +1,201 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 Training workflow for Robot Vacuum.
 清扫大作战训练工作流。
 """
 import os
 import time
 import numpy as np
 from agent_ppo.conf.conf import Config
 from agent_ppo.feature.definition import SampleData, sample_process
 from tools.metrics_utils import get_training_metrics
 from tools.train_env_conf_validate import read_usr_conf
 from common_python.utils.workflow_disaster_recovery import handle_disaster_recovery
 def workflow(envs, agents, logger=None, monitor=None, *args, **kwargs):
    """Training entry point: collect episodes forever and ship them to the learner.
    Only the first environment and the first agent are used. The function
    returns early if the user configuration cannot be read; otherwise it never
    returns. The model is saved at most once every 1800 seconds.
    """
    env, agent = envs[0], agents[0]
    last_save_model_time = time.time()
    # Read and validate the user configuration before touching the environment.
    usr_conf = read_usr_conf("agent_ppo/conf/train_env_conf.toml", logger)
    if usr_conf is None:
        logger.error("usr_conf is None, please check agent_ppo/conf/train_env_conf.toml")
        return
    episode_runner = EpisodeRunner(env=env, agent=agent, usr_conf=usr_conf, logger=logger, monitor=monitor)
    save_interval_sec = 1800
    while True:
        for g_data in episode_runner.run_episodes():
            agent.send_sample_data(g_data)
            g_data.clear()
            now = time.time()
            if now - last_save_model_time < save_interval_sec:
                continue
            agent.save_model()
            last_save_model_time = now
 class EpisodeRunner:
    """Drive the environment/agent loop and yield one list of samples per finished episode."""
    def __init__(self, env, agent, usr_conf, logger, monitor):
        self.env = env
        self.agent = agent
        self.usr_conf = usr_conf
        self.logger = logger
        self.monitor = monitor
        self.episode_cnt = 0
        # Timestamps (seconds) used to throttle monitor reports and metric logging.
        self.last_report_monitor_time = 0
        self.last_get_training_metrics_time = 0
    def run_episodes(self):
        """Generator: play episodes back to back, yielding each episode's samples.
        The outer loop never ends. After each finished episode the collected
        frames are passed through ``sample_process`` and yielded as one list.
        An episode interrupted by ``handle_disaster_recovery`` yields nothing.
        """
        while True:
            # Log training metrics at most once every 60 seconds.
            now = time.time()
            if now - self.last_get_training_metrics_time >= 60:
                training_metrics = get_training_metrics()
                self.last_get_training_metrics_time = now
                if training_metrics is not None:
                    self.logger.info(f"training_metrics: {training_metrics}")
            # Reset the environment; start over if recovery handling says so.
            env_obs = self.env.reset(self.usr_conf)
            if handle_disaster_recovery(env_obs, self.logger):
                continue
            # Reset the agent and load the latest model.
            self.agent.reset(env_obs)
            self.agent.load_model(id="latest")
            # Process the initial observation.
            obs_data, remain_info = self.agent.observation_process(env_obs)
            collector = []
            self.episode_cnt += 1
            done = False
            step = 0
            total_reward = 0.0
            self.logger.info(f"Episode {self.episode_cnt} start")
            while not done:
                # Agent inference.
                act_data_list = self.agent.predict([obs_data])
                act_data = act_data_list[0]
                act = self.agent.action_process(act_data)
                # Environment step; abandon the episode if recovery handling says so.
                env_reward, env_obs = self.env.step(act)
                if handle_disaster_recovery(env_obs, self.logger):
                    break
                terminated = env_obs["terminated"]
                truncated = env_obs["truncated"]
                frame_no = env_obs["frame_no"]
                step += 1
                done = terminated or truncated
                # Process the next observation.
                _obs_data, _ = self.agent.observation_process(env_obs)
                _obs_data.frame_no = frame_no
                # NOTE(review): last_reward is presumably refreshed by the
                # observation_process call above - confirm in the agent.
                reward_scalar = float(self.agent.last_reward)
                total_reward += reward_scalar
                # Terminal reward.
                final_reward = 0.0
                if done:
                    fm = self.agent.preprocessor
                    # Read but not used below.
                    total_score = env_obs["observation"]["env_info"]["total_score"]
                    if truncated:
                        # Truncated episode: base bonus plus a share scaled by the cleaning ratio.
                        cleaning_ratio = fm.dirt_cleaned / max(fm.total_dirt, 1)
                        final_reward = 5.0 + 5.0 * cleaning_ratio
                        result_str = "WIN"
                    else:
                        # Terminated without truncation: small penalty.
                        # NOTE(review): presumably battery depletion or collision - confirm with the env.
                        final_reward = -2.0
                        result_str = "FAIL"
                    self.logger.info(
                        f"[GAMEOVER] ep:{self.episode_cnt} steps:{step} "
                        f"result:{result_str} final_bonus:{final_reward:.2f} "
                        f"total_reward:{total_reward:.3f} "
                        f"dirt_cleaned:{fm.dirt_cleaned}/{fm.total_dirt}"
                    )
                # Build the sample frame from the observation the action was chosen on.
                reward_arr = np.array([reward_scalar], dtype=np.float32)
                value_arr = act_data.value.flatten()[: Config.VALUE_NUM]
                frame = SampleData(
                    obs=np.array(obs_data.feature, dtype=np.float32),
                    legal_action=np.array(obs_data.legal_action, dtype=np.float32),
                    act=np.array(act_data.action),
                    reward=reward_arr,
                    done=np.array([float(done)]),
                    reward_sum=np.zeros(Config.VALUE_NUM, dtype=np.float32),
                    value=value_arr,
                    next_value=np.zeros(Config.VALUE_NUM, dtype=np.float32),
                    advantage=np.zeros(Config.VALUE_NUM, dtype=np.float32),
                    prob=np.array(act_data.prob, dtype=np.float32),
                )
                collector.append(frame)
                if done:
                    # Add the terminal reward to the last frame.
                    collector[-1].reward = collector[-1].reward + np.array([final_reward], dtype=np.float32)
                    # Report to the monitor at most once every 60 seconds.
                    now = time.time()
                    if now - self.last_report_monitor_time >= 60 and self.monitor:
                        self.monitor.put_data(
                            {
                                os.getpid(): {
                                    "reward": total_reward + final_reward,
                                    "episode_cnt": self.episode_cnt,
                                }
                            }
                        )
                        self.last_report_monitor_time = now
                    # Compute GAE and yield the episode's samples.
                    if collector:
                        collector = sample_process(collector)
                        yield collector
                    break
                # Advance to the next state.
                obs_data = _obs_data
--- a/conf/.gitignore
+++ b/conf/.gitignore
@@ -0,0 +1 @@
 kaiwudrl/
--- a/conf/init.py
+++ b/conf/init.py
--- a/conf/algo_conf_robot_vacuum.toml
+++ b/conf/algo_conf_robot_vacuum.toml
@@ -0,0 +1,15 @@
 [ppo]
 actor_agent = "agent_ppo.agent.Agent"
 learner_agent = "agent_ppo.agent.Agent"
 aisrv_agent = "agent_ppo.agent.Agent"
 train_workflow = "agent_ppo.workflow.train_workflow.workflow"
 eval_workflow = "tools.eval.workflow.eval_workflow.workflow"
 exam_workflow = "tools.eval.workflow.exam_workflow.workflow"
 [diy]
 actor_agent = "agent_diy.agent.Agent"
 learner_agent = "agent_diy.agent.Agent"
 aisrv_agent = "agent_diy.agent.Agent"
 train_workflow = "agent_diy.workflow.train_workflow.workflow"
 eval_workflow = "tools.eval.workflow.eval_workflow.workflow"
 exam_workflow = "tools.eval.workflow.exam_workflow.workflow"
--- a/conf/app_conf_robot_vacuum.toml
+++ b/conf/app_conf_robot_vacuum.toml
@@ -0,0 +1,6 @@
 [robot_vacuum]
 rl_helper = "kaiwudrl.server.aisrv.kaiwu_rl_helper_standard.KaiWuRLStandardHelper"
 [robot_vacuum.policies.train_one]
 policy_builder = "kaiwudrl.server.aisrv.async_policy.AsyncBuilder"
 algo = "ppo"
--- a/conf/configure_app.toml
+++ b/conf/configure_app.toml
@@ -0,0 +1,69 @@
 [app]
 # Replay buffer configurations
 # 样本池容量
 replay_buffer_capacity = 10000
 # The ratio of the sample pool capacity that triggers training
 # 当样本池中的样本占总容量的比例达到该值时，启动训练
 preload_ratio = 1.0
 # Policy for removing old samples when new samples are added to the sample pool. Options: reverb.selectors.Lifo, reverb.selectors.Fifo
 # 当新样本加入样本池时，旧样本的移除逻辑，可选项：reverb.selectors.Lifo, reverb.selectors.Fifo
 # reverb.selectors.Lifo：先进后出(Last In, First Out)
 # reverb.selectors.Fifo：先进先出(First In, First Out)
 reverb_remover = "reverb.selectors.Fifo"
 # The sampling logic of the Learner from the sample pool: reverb.selectors.Fifo, reverb.selectors.Uniform
 # Learner从样本池中采样的逻辑，可选项：reverb.selectors.Fifo, reverb.selectors.Uniform
 # reverb.selectors.Uniform：Samples are selected uniformly at random from the replay buffer, with each sample having an equal probability of being chosen.
 # reverb.selectors.Uniform：从回放缓冲区中随机均匀地选择样本，每个样本被选中的概率相同。
 # reverb.selectors.Fifo：Samples are selected in the order they were added to the replay buffer.
 # reverb.selectors.Fifo：按照先进先出从回放缓冲区中选择样本。
 reverb_sampler = "reverb.selectors.Uniform"
 # Control strategy for balancing data insertion and sampling in experience replay. Options: SampleToInsertRatio, MinSize
 # 控制经验回放库中数据插入与采样的动态平衡策略，可选项：SampleToInsertRatio, MinSize
 # How to choose
 # 如何选择:
 #   - SampleToInsertRatio: Use when training is faster than sample generation (e.g. GPU training with few envs)
 #     适用于训练速度快于样本产出速度的场景（如GPU训练、少量环境数目），严格控制每条样本被复用的次数，防止过拟合
 #   - MinSize: Use when sample generation is faster than training (e.g. local CPU training, or many envs)
 #     适用于样本产出速度快于训练速度的场景（如本地CPU训练、大量环境数目），buffer达到阈值后即可全速训练，不限制复用次数
 # reverb_samples_per_insert: Max sampling times per inserted sample (only for SampleToInsertRatio)
 # 参数reverb_samples_per_insert: 每插入1条样本允许采样的最大次数（仅SampleToInsertRatio模式生效）
 # reverb_error_buffer: Tolerance buffer for ratio constraint, similar to TCP sliding window (only for SampleToInsertRatio)
 # 参数reverb_error_buffer: 比例限制的弹性缓冲区间，类似TCP滑动窗口（仅SampleToInsertRatio模式生效）
 reverb_rate_limiter = "MinSize"
 reverb_samples_per_insert = 5
 reverb_error_buffer = 5
 # Training batch size limit for Learner
 # Learner训练时样本批处理大小
 train_batch_size = 2048
 # Model dump frequency (steps)
 # 训练间隔多少步输出模型参数文件
 dump_model_freq = 100
 # How often (in minutes) the Learner pushes model files to the model pool and Actors fetch them from it.
 # Learner推送模型参数文件至模型池，以及Actor从模型池获取模型参数文件的频次（单位：分钟）
 model_file_sync_per_minutes = 1
 # The number of model updates pushed per learner iteration, and the maximum number of updates each actor can fetch at once (cap: 50).
 # Learner每次推送模型参数文件，以及Actor每次获取模型参数文件的数量（上限：50）
 modelpool_max_save_model_count = 1
 # Whether to enable the preload model function. If enabled (true), the model specified by preload_model_id will be loaded as the initial model in the preload_model_dir directory; if disabled (false), no preloading will be performed.
 # 是否启用预加载模型功能，若开启(true)，将在preload_model_dir目录下加载由preload_model_id指定的模型作为初始模型；若关闭(false)，则不进行预加载。
 preload_model = false
 # The relative path of the preloaded model folder (the variable name {agent_name} refers to the agent_algorithm name directory in the code package). It is only effective when preload_model=true. When the preload model function is enabled, you need to create a new ckpt folder under the agent_algorithm name directory in the code package and place the model file (.pkl) there.
 # 预加载模型文件夹相对路径(变量名{agent_name}指代码包中agent_算法名目录)，仅在preload_model=true时生效；当开启预加载模型功能时，需要在代码包中agent_算法名目录下新建ckpt文件夹，将模型文件（.pkl）放置此即可。
 preload_model_dir = "{agent_name}/ckpt"
 # The identification ID of the preloaded model (here refers to the number of model training steps). This ID corresponds to the number of training steps recorded in the model file name. It only takes effect when preload_model=true.
 # Note that it is forbidden to modify the original model file name, otherwise the model preloading process will fail.
 # 预加载模型的标识ID（这里指模型训练步数），该ID对应模型文件名中的训练步数记录。仅在preload_model=true时生效。
 # 注意，禁止修改原始模型文件名，否则将导致模型预加载流程失败。
 preload_model_id = 1000
--- a/kaiwu.json
+++ b/kaiwu.json
@@ -0,0 +1,4 @@
 {
  "version": "13.0.1-comp-normal-lite.26comp",
  "project_code": "robot_vacuum"
 }
--- a/train_test.py
+++ b/train_test.py
@@ -0,0 +1,29 @@
 #!/usr/bin/env python3
 # -*- coding: UTF-8 -*-
 ###########################################################################
 # Copyright © 1998 - 2026 Tencent. All Rights Reserved.
 ###########################################################################
 """
 Author: Tencent AI Arena Authors
 """
 from kaiwudrl.common.utils.train_test_utils import run_train_test
 # Select the algorithm that train_test runs by editing ``algorithm_name``.
 # Its value must be one of the entries in ``algorithm_name_list``.
 algorithm_name_list = ["ppo", "diy"]
 algorithm_name = "ppo"
 if __name__ == "__main__":
    # These keys match settings in conf/configure_app.toml.
    # NOTE(review): presumably they override those settings for the test run - confirm in run_train_test.
    train_test_env_vars = {
        "replay_buffer_capacity": "10",
        "preload_ratio": "0.2",
        "train_batch_size": "2",
        "dump_model_freq": "1",
    }
    run_train_test(
        algorithm_name=algorithm_name,
        algorithm_name_list=algorithm_name_list,
        env_vars=train_test_env_vars,
    )