refactor(gry): refactor reward model #636
Conversation
Codecov Report
@@            Coverage Diff             @@
##             main     #636      +/-   ##
==========================================
+ Coverage   82.06%   83.57%   +1.51%
==========================================
  Files         586      580       -6
  Lines       47515    47428      -87
==========================================
+ Hits        38991    39640     +649
+ Misses       8524     7788     -736
Flags with carried forward coverage won't be shown.
return {**obs, "image": image}

class ObsPlusPrevActRewWrapper(gym.Wrapper):
Why add this wrapper here, rather than use the wrapper in ding/envs?
Because the wrappers in ding/envs are built on gym, while minigrid needs gymnasium instead of gym. To avoid a harmful influence on other envs, I added this wrapper to the minigrid wrappers.
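For reference, a minimal sketch of what such a gymnasium-based wrapper could look like; the observation keys and the exact behaviour are assumptions following the ding/envs convention, not the PR's final code:

```python
# Sketch only: a gymnasium-based wrapper that attaches the previous action and reward
# to each observation. Key names ('obs', 'prev_action', 'prev_reward_extrinsic') are
# assumptions, not necessarily the exact code added in this PR.
import gymnasium as gym


class ObsPlusPrevActRewWrapper(gym.Wrapper):

    def reset(self, **kwargs):
        obs, info = self.env.reset(**kwargs)
        # No previous transition at the start of an episode, so use a sampled
        # placeholder action and zero reward.
        null_action = self.env.action_space.sample()
        return {'obs': obs, 'prev_action': null_action, 'prev_reward_extrinsic': 0.0}, info

    def step(self, action):
        obs, reward, terminated, truncated, info = self.env.step(action)
        wrapped_obs = {'obs': obs, 'prev_action': action, 'prev_reward_extrinsic': reward}
        return wrapped_obs, reward, terminated, truncated, info
```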
),
reward_model=dict(
    type='trex',
    exp_name='cartpole_trex_onppo_seed0',
In our original implementation, exp_name was used to build the tb logger, so the reward model needed the whole config file. The new implementation only uses the reward model config, so I added exp_name to the reward model config.
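A rough sketch of the consequence described above; the helper name and log-path layout are illustrative assumptions, not the PR's code:

```python
# Illustrative only: the reward model now receives just its own sub-config, so the
# tb logger directory has to be derived from reward_model.exp_name rather than from
# the top-level experiment config. Path layout here is an assumption.
import os
from tensorboardX import SummaryWriter


def build_reward_model_tb_logger(reward_model_cfg) -> SummaryWriter:
    # e.g. reward_model_cfg.exp_name == 'cartpole_trex_onppo_seed0'
    log_dir = os.path.join('.', reward_model_cfg.exp_name, 'log', 'reward_model')
    return SummaryWriter(log_dir)
```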
with open(self.cfg.data_path + '/expert_data.pkl', 'rb') as f:
    self.expert_data_loader: list = pickle.load(f)
self.expert_data = self.concat_state_action_pairs(self.expert_data_loader)
self.expert_data = torch.unbind(self.expert_data, dim=0)
Because we reuse the concat_state_action_pairs function, and its return value differs from that of the original function in GAIL, I used torch.unbind here.
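A small standalone illustration of that unbind step (shapes are made up for the example):

```python
# concat_state_action_pairs is assumed to return one stacked tensor of shape
# (N, obs_dim + action_dim); torch.unbind splits it back into N per-sample tensors,
# which is the iterable format the downstream training code expects.
import torch

stacked = torch.randn(4, 6)             # 4 concatenated state-action pairs
samples = torch.unbind(stacked, dim=0)  # tuple of 4 tensors, each of shape (6,)
assert len(samples) == 4 and samples[0].shape == (6,)
```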
max_train_iter: Optional[int] = int(1e10),
max_env_step: Optional[int] = int(1e10),
cooptrain_reward: Optional[bool] = True,
pretrain_reward: Optional[bool] = False,
Please add comments for the new arguments.
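One possible wording for those comments, following the argument-docstring style used elsewhere in DI-engine; the phrasing is only a suggestion, and the signature below is trimmed to just the two new arguments:

```python
# Suggested docstring lines for the two new pipeline arguments (proposal, not final text).
from typing import Optional


def serial_pipeline_reward_model_sketch(
        cooptrain_reward: Optional[bool] = True,
        pretrain_reward: Optional[bool] = False,
) -> None:
    """
    Arguments:
        - cooptrain_reward (:obj:`Optional[bool]`): Whether to train the reward model \
            jointly with the policy inside the main training loop.
        - pretrain_reward (:obj:`Optional[bool]`): Whether to train the reward model once \
            on pre-collected data before the policy training loop starts.
    """
```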
# update reward_model, when you want to train reward_model inloop
if cooptrain_reward:
    reward_model.train()
# clear buffer per fix iters to make sure replay buffer's data count isn't too few.
Suggestion: clear the buffer every fixed number of iterations to make sure the data for reward model (RM) training is not too off-policy.
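The intended pattern, written out as a sketch; the helper name is made up, while `clear_data` and `clear_buffer_per_iters` are the names that appear elsewhere in this PR:

```python
# Sketch of the periodic clearing described above: every `clear_buffer_per_iters`
# training iterations, drop the reward model's stored transitions so its training data
# stays close to the current policy (i.e. not too off-policy).
def maybe_clear_reward_model_buffer(reward_model, train_iter: int) -> None:
    if train_iter % reward_model.cfg.clear_buffer_per_iters == 0:
        reward_model.clear_data(iter=train_iter)
```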
@@ -108,11 +111,11 @@ def serial_pipeline_reward_model_offpolicy(
    # collect data for reward_model training
    reward_model.collect_data(new_data)

try:
    serial_pipeline_reward_model_offpolicy(config, seed=0, max_train_iter=2)
except Exception:
    assert False, "pipeline fail"
@@ -0,0 +1,106 @@
from typing import Optional, List, Any

@@ -22,44 +22,49 @@
    stop_value=int(1e5),
Remove the pitfall and montezuma configs.
)

# train reward model
serial_pipeline_reward_model_offpolicy(main_config, create_config)
@@ -22,7 +22,6 @@
action_bins_per_branch=2,  # mean the action shape is 6, 2 discrete actions for each action dimension
It may have been modified by format.sh; do I need to change it back?
update_per_collect=5,
batch_size=64,
learning_rate=0.001,
learner=dict(hook=dict(save_ckpt_after_iter=100)),
Because the DREX unit test needs to modify learner.hook.save_ckpt_after_iter; without this field, the unit test fails, so I added it.
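A hypothetical illustration of why the field has to exist in the base config; the config path used here is an assumption:

```python
# Hypothetical test helper: the DREX unit test shortens the checkpoint interval so that
# expert checkpoints appear within a couple of iterations. If `learner.hook` is missing
# from the base config, this attribute access (and hence the test) fails.
import copy


def make_fast_ckpt_cfg(base_cfg):
    cfg = copy.deepcopy(base_cfg)
    cfg.policy.learn.learner.hook.save_ckpt_after_iter = 1  # assumed config path
    return cfg
```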
max_train_iter: Optional[int] = int(1e10),
max_env_step: Optional[int] = int(1e10),
cooptrain_reward: Optional[bool] = True,
pretrain_reward: Optional[bool] = False,
pretrain_reward -> pretrain_reward_model?
model: Optional[torch.nn.Module] = None,
max_train_iter: Optional[int] = int(1e10),
max_env_step: Optional[int] = int(1e10),
cooptrain_reward: Optional[bool] = True,
cooptrain_reward -> joint_train_reward_model?
loss = self.reverse_scale * inverse_loss + forward_loss
self.tb_logger.add_scalar('icm_reward/total_loss', loss, self.train_cnt_icm)
inverse_loss, forward_loss, accuracy = self.reward_model.learn(data_states, data_next_states, data_actions)
loss = self.reverse_scale * inverse_loss + forward_loss
self.reverse_scale -> self.reverse_loss_weight
self.tb_logger.add_scalar('icm_reward/action_accuracy', accuracy, self.train_cnt_icm)
loss = self.reverse_scale * inverse_loss + forward_loss
self.tb_logger.add_scalar('icm_reward/total_loss', loss, self.train_cnt_icm)
inverse_loss, forward_loss, accuracy = self.reward_model.learn(data_states, data_next_states, data_actions)
What does `accuracy` mean here? Add a comment, and rename the variable.
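For context, in ICM the inverse model predicts which action was taken between consecutive states, and the logged `accuracy` is presumably the fraction of transitions where that prediction matches the real (discrete) action. A standalone sketch under that assumption (the function name is made up):

```python
# Sketch of the inverse-model action-prediction accuracy logged as
# 'icm_reward/action_accuracy'; assumes discrete actions.
import torch


def inverse_model_accuracy(action_logits: torch.Tensor, true_actions: torch.Tensor) -> torch.Tensor:
    # action_logits: (batch_size, action_dim); true_actions: (batch_size,) integer labels
    predicted_actions = torch.argmax(action_logits, dim=-1)
    return (predicted_actions == true_actions).float().mean()
```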
    item['reward'] = item['reward'] / self.cfg.extrinsic_reward_norm_max
elif self.intrinsic_reward_type == 'assign':
    item['reward'] = icm_rew
train_data_augmented = combine_intrinsic_exterinsic_reward(train_data_augmented, icm_reward, self.cfg)
icm_reward -> normalized_icm_reward?
# sample episode's timestep index
train_index = np.random.randint(low=0, high=self.train_obs.shape[0], size=self.cfg.batch_size)

train_obs: torch.Tensor = self.train_obs[train_index].to(self.device)  # shape (self.cfg.batch_size, obs_dim)
The `: torch.Tensor` annotation here could probably be removed, with an overview-format comment written above instead.
What exactly does this refer to? Why is it no longer necessary to constrain the type of the returned value once the comment is written?
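For reference, the "overview-format comment" presumably refers to DI-engine's docstring convention that documents tensor shapes in an Overview/Arguments/Returns block, which makes the inline `: torch.Tensor` annotation redundant rather than unchecked. A sketch under that assumption (the function name and wording are illustrative):

```python
# Illustrative only: the sampling code rewritten with an overview-style docstring
# instead of an inline type annotation.
import numpy as np
import torch


def sample_train_obs(train_obs: torch.Tensor, batch_size: int, device: str = 'cpu') -> torch.Tensor:
    """
    Overview:
        Randomly sample a batch of observations for reward model training.
    Arguments:
        - train_obs (:obj:`torch.Tensor`): All collected observations, shape (N, obs_dim).
        - batch_size (:obj:`int`): Number of samples to draw.
    Returns:
        - batch_obs (:obj:`torch.Tensor`): Sampled observations, shape (batch_size, obs_dim).
    """
    train_index = np.random.randint(low=0, high=train_obs.shape[0], size=batch_size)
    return train_obs[train_index].to(device)
```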
| """ | ||
| states_data = [] | ||
| actions_data = [] | ||
| #check data(dict) has key obs and action |
Space missing here; use bash format.sh ding to format the code.
def clear_data(self, iter: int) -> None:
    assert hasattr(
        self.cfg, 'clear_buffer_per_iters'
    ), "Reward Model does not have clear_buffer_per_iters, Clear failed"
When raising the error, you could give a suggestion for how to fix it, e.g. "you need to refer to xxx and implement the xxx method".
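One possible wording for a more actionable assertion message; the class skeleton and the message text are only a suggestion:

```python
# Suggested shape of the error message; the class here is a minimal skeleton just to
# make the snippet self-contained.
class ExampleRewardModel:

    def __init__(self, cfg):
        self.cfg = cfg

    def clear_data(self, iter: int) -> None:
        assert hasattr(self.cfg, 'clear_buffer_per_iters'), (
            "This reward model's config has no `clear_buffer_per_iters` field, so periodic "
            "buffer clearing cannot run. Please add `clear_buffer_per_iters` to the "
            "reward_model config, or override `clear_data` in your reward model subclass."
        )
```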
type='rnd-ngu',
),
episodic_reward_model=dict(
    # means if using rescale trick to the last non-zero reward
Description
This is a draft PR for refactoring the reward model.
Things finished
Refactoring
New system Design
Pipeline
Check List