Improve PPO diagnostics and recharge behavior

This commit is contained in:
2026-04-26 20:24:26 +08:00
parent 5b6133db13
commit 69b8a692db
6 changed files with 463 additions and 31 deletions

View File

@@ -50,6 +50,11 @@ class Config:
# Boolean switch, currently on. Presumably enables advantage normalization in
# the PPO update -- TODO confirm against the training code that reads it.
NORMALIZE_ADVANTAGE = True
# NOTE(review): looks like a PPO target-KL threshold; how it is applied
# (early stop, penalty, logging only) is not visible here -- confirm.
TARGET_KL = 0.04
# Evaluation tie-break: when policy probabilities are close, prefer safer
# coverage/recharge actions with a lightweight heuristic.
# Presumably the maximum probability gap at which two actions still count as
# "close" -- verify against the evaluation code.
EVAL_TIE_BREAK_PROB_GAP = 0.015
# Presumably the weight applied to the heuristic score when breaking a tie --
# verify against the evaluation code.
EVAL_TIE_BREAK_SCORE_SCALE = 0.01
# Single-element list holding ACTION_NUM (defined outside the visible lines).
LABEL_SIZE_LIST = [ACTION_NUM]
# Shallow copy, so this is a distinct list object: mutating one list in place
# does not change the other.
LEGAL_ACTION_SIZE_LIST = LABEL_SIZE_LIST.copy()