Optimize PPO reward and eval planning

This commit is contained in:
2026-04-26 21:58:56 +08:00
parent 524ca8c070
commit dc86a3f338
2 changed files with 187 additions and 47 deletions

View File

@@ -115,7 +115,11 @@ class Agent(BaseAgent):
logits, value = self._run_model(obs_data.feature)
legal_arr = np.array(obs_data.legal_action, dtype=np.float32)
prob = self._legal_soft_max(logits, legal_arr)
action = self._tie_break_eval_action(prob, legal_arr)
action = None
if hasattr(self.preprocessor, "planned_eval_action"):
action = self.preprocessor.planned_eval_action(prob, legal_arr)
if action is None:
action = self._tie_break_eval_action(prob, legal_arr)
act_data = ActData(
action=[action],
d_action=[action],