#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
###########################################################################
# Copyright © 1998 - 2026 Tencent. All Rights Reserved.
###########################################################################
"""
Author: Tencent AI Arena Authors

Configuration for Robot Vacuum PPO agent.
清扫大作战 PPO 配置。
"""


class Config:
    """Static configuration constants for the Robot Vacuum PPO agent.

    All values are plain class attributes; nothing here is computed at
    runtime beyond the derived feature sizes below.
    """

    # Feature dimensions: 21x21x6 multi-channel local map + scalar planning
    # features + last action.
    VIEW_SIZE = 21
    MAP_CHANNELS = 6
    FEATURES = [
        VIEW_SIZE * VIEW_SIZE * MAP_CHANNELS,
        66,  # global memory, charger, NPC, and action-improvement features
        8,  # last action one-hot
    ]
    # NOTE: this is the same list object as FEATURES, not a copy.
    FEATURE_SPLIT_SHAPE = FEATURES
    FEATURE_LEN = sum(FEATURES)
    DIM_OF_OBSERVATION = FEATURE_LEN

    # Action space: 8 directional moves.
    ACTION_NUM = 8

    # Single-head value.
    VALUE_NUM = 1

    # PPO hyperparameters.
    # NOTE(review): GAMMA / LAMDA are presumably the discount factor and the
    # GAE lambda -- confirm against the learner that consumes them.
    GAMMA = 0.99
    LAMDA = 0.95

    INIT_LEARNING_RATE_START = 0.00025
    # NOTE(review): BETA_* look like a coefficient decayed from BETA_START to
    # BETA_END over BETA_DECAY_STEPS (presumably the entropy bonus) -- the
    # schedule itself is not defined in this file; verify in the algorithm.
    BETA_START = 0.008
    BETA_END = 0.002
    BETA_DECAY_STEPS = 4000
    CLIP_PARAM = 0.2
    VF_COEF = 0.5
    PPO_EPOCHS = 3
    MINI_BATCH_SIZE = 256
    NORMALIZE_ADVANTAGE = True
    TARGET_KL = 0.04

    # Evaluation tie-break: when policy probabilities are close, prefer safer
    # coverage/recharge actions with a lightweight heuristic.
    EVAL_TIE_BREAK_PROB_GAP = 0.015
    EVAL_TIE_BREAK_SCORE_SCALE = 0.01

    LABEL_SIZE_LIST = [ACTION_NUM]
    # Independent copy, so mutating one list does not affect the other.
    LEGAL_ACTION_SIZE_LIST = LABEL_SIZE_LIST.copy()

    USE_GRAD_CLIP = True
    GRAD_CLIP_RANGE = 0.5