Optimize PPO reward and eval planning
This commit is contained in:
@@ -115,7 +115,11 @@ class Agent(BaseAgent):
|
||||
logits, value = self._run_model(obs_data.feature)
|
||||
legal_arr = np.array(obs_data.legal_action, dtype=np.float32)
|
||||
prob = self._legal_soft_max(logits, legal_arr)
|
||||
action = self._tie_break_eval_action(prob, legal_arr)
|
||||
action = None
|
||||
if hasattr(self.preprocessor, "planned_eval_action"):
|
||||
action = self.preprocessor.planned_eval_action(prob, legal_arr)
|
||||
if action is None:
|
||||
action = self._tie_break_eval_action(prob, legal_arr)
|
||||
act_data = ActData(
|
||||
action=[action],
|
||||
d_action=[action],
|
||||
|
||||
Reference in New Issue
Block a user