Optimize PPO short-run training

This commit is contained in:
2026-04-26 12:46:00 +08:00
parent ca6234c941
commit eb3efa4df7
5 changed files with 153 additions and 41 deletions

View File

@@ -37,10 +37,16 @@ class Config:
 GAMMA = 0.99
 LAMDA = 0.95
-INIT_LEARNING_RATE_START = 0.0003
-BETA_START = 0.001
+INIT_LEARNING_RATE_START = 0.00025
+BETA_START = 0.008
+BETA_END = 0.002
+BETA_DECAY_STEPS = 4000
 CLIP_PARAM = 0.2
 VF_COEF = 0.5
+PPO_EPOCHS = 3
+MINI_BATCH_SIZE = 256
+NORMALIZE_ADVANTAGE = True
+TARGET_KL = 0.04
 LABEL_SIZE_LIST = [ACTION_NUM]
 LEGAL_ACTION_SIZE_LIST = LABEL_SIZE_LIST.copy()

View File

@@ -77,6 +77,26 @@ def build_monitor():
 expr="avg(entropy_loss{})",
 )
 .end_panel()
+.add_panel(
+name="近似KL",
+name_en="approx_kl",
+type="line",
+)
+.add_metric(
+metrics_name="approx_kl",
+expr="avg(approx_kl{})",
+)
+.end_panel()
+.add_panel(
+name="裁剪比例",
+name_en="clip_fraction",
+type="line",
+)
+.add_metric(
+metrics_name="clip_fraction",
+expr="avg(clip_fraction{})",
+)
+.end_panel()
 .end_group()
 .build()
 )