Improve PPO diagnostics and recharge behavior
This commit is contained in:
@@ -50,6 +50,11 @@ class Config:
|
||||
NORMALIZE_ADVANTAGE = True
|
||||
TARGET_KL = 0.04
|
||||
|
||||
# Evaluation tie-break: when policy probabilities are close, prefer safer
|
||||
# coverage/recharge actions with a lightweight heuristic.
|
||||
EVAL_TIE_BREAK_PROB_GAP = 0.015
|
||||
EVAL_TIE_BREAK_SCORE_SCALE = 0.01
|
||||
|
||||
LABEL_SIZE_LIST = [ACTION_NUM]
|
||||
LEGAL_ACTION_SIZE_LIST = LABEL_SIZE_LIST.copy()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user