diff --git a/README.md b/README.md
old mode 100644
new mode 100755
index 3d086b3..d04f34f
--- a/README.md
+++ b/README.md
@@ -108,6 +108,7 @@ NOTE: we follow the exact code structure as [pytorch-dnc](https://github.com/jin
 - [Visdom](https://github.com/facebookresearch/visdom)
 - [OpenAI Gym >=v0.9.0 (for lower versoins, just need to change into the available games, e.g. change PongDeterministic-v4 to PongDeterministic-v3)](https://github.com/openai/gym)
 - [mujoco-py (Optional: for training continuous version of a3c)](https://github.com/openai/mujoco-py)
+- [opensim-rl (Optional: for training in the OpenSim-RL environment)](https://github.com/stanfordnmbl/osim-rl)
 *******
diff --git a/core/agents/dqn.py b/core/agents/dqn.py
old mode 100644
new mode 100755
index 777c2d3..fd9d0c4
--- a/core/agents/dqn.py
+++ b/core/agents/dqn.py
@@ -150,7 +150,7 @@ def _epsilon_greedy(self, q_values_ts):
             self.eps = self.eps_eval
         # choose action
         if np.random.uniform() < self.eps:  # then we choose a random action
-            action = random.randrange(self.action_dim)
+            action = np.random.rand(self.action_dim).tolist()  # NOTE: a random continuous action vector in [0, 1); assumes a continuous action space such as opensim
         else:                               # then we choose the greedy action
             if self.use_cuda:
                 action = np.argmax(q_values_ts.cpu().numpy())
@@ -164,7 +164,7 @@ def _forward(self, observation):
         state_ts = torch.from_numpy(np.array(state)).unsqueeze(0).type(self.dtype)
         q_values_ts = self.model(Variable(state_ts, volatile=True)).data  # NOTE: only doing inference here, so volatile=True
         if self.training and self.step < self.learn_start:  # then we don't do any learning, just accumulate experiences into replay memory
-            action = random.randrange(self.action_dim)         # thus we only randomly sample actions here, since the model hasn't been updated at all till now
+            action = np.random.rand(self.action_dim).tolist()  # thus we only randomly sample (continuous) actions here, since the model hasn't been updated at all till now
         else:
             action = self._epsilon_greedy(q_values_ts)
diff --git a/core/envs/opensim.py b/core/envs/opensim.py
new file mode 100755
index 0000000..f679f54
--- /dev/null
+++ b/core/envs/opensim.py
@@ -0,0 +1,75 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from copy import deepcopy
+from gym.spaces.box import Box
+import inspect
+
+from utils.helpers import Experience   # NOTE: here state0 is always "None"
+from utils.helpers import preprocessAtari, rgb2gray, rgb2y, scale
+from core.env import Env
+
+class OpenSim(Env):  # low dimensional observations
+    """Class to set up the OpenSim-RL environment (https://github.com/stanfordnmbl/osim-rl),
+    where the agent has to learn to run! Continuous (18 dim) action space."""
+    def __init__(self, args, env_ind=0):
+        super(OpenSim, self).__init__(args, env_ind)
+
+        assert self.env_type == "opensim"
+        try: from osim.env import RunEnv
+        except ImportError as e: self.logger.warning("WARNING: opensim not found")
+
+        self.env = RunEnv(visualize=True)
+        #self.env.seed(self.seed)    # NOTE: so each env would be different
+
+        # action space setup
+        self.actions = range(self.action_dim)
+        self.logger.warning("Action Space: %s", self.env.action_space)
+
+        # state space setup
+        self.logger.warning("State Space: %s", self.state_shape)
+
+        # continuous space
+        #if args.agent_type == "a3c":
+        self.enable_continuous = True #args.enable_continuous
+
+    def _preprocessState(self, state):  # NOTE: here no preprocessing is needed
+        return state
+
+    @property
+    def action_dim(self):
+        return self.env.action_space.shape[0]
+
+    @property
+    def state_shape(self):
+        return self.env.observation_space.shape[0]
+
+    def render(self):
+        #if self.mode == 2:
+        #    frame = self.env.render(mode='rgb_array')
+        #    frame_name = self.img_dir + "frame_%04d.jpg" % self.frame_ind
+        #    self.imsave(frame_name, frame)
+        #    self.logger.warning("Saved Frame @ Step: " + str(self.frame_ind) + " To: " + frame_name)
+        #    self.frame_ind += 1
+        #    return frame
+        #else:
+        #    return self.env.render()
+        return
+
+
+    def visual(self):
+        pass
+
+    def sample_random_action(self):
+        return self.env.action_space.sample()
+
+    def reset(self):
+        self._reset_experience()
+        self.exp_state1 = self.env.reset()
+        return self._get_experience()
+
+    def step(self, action):
+        self.exp_action = action
+        if self.enable_continuous:  # RunEnv's action space is continuous, so only this branch is exercised
+            self.exp_state1, self.exp_reward, self.exp_terminal1, _ = self.env.step(self.exp_action)
+        return self._get_experience()
diff --git a/utils/factory.py b/utils/factory.py
old mode 100644
new mode 100755
index 078b03d..c7c99ec
--- a/utils/factory.py
+++ b/utils/factory.py
@@ -6,10 +6,12 @@
 from core.envs.atari_ram import AtariRamEnv
 from core.envs.atari import AtariEnv
 from core.envs.lab import LabEnv
+from core.envs.opensim import OpenSim
 EnvDict = {"gym": GymEnv,             # classic control games from openai w/ low-level input
            "atari-ram": AtariRamEnv,  # atari integrations from openai, with low-level input
            "atari": AtariEnv,         # atari integrations from openai, with pixel-level input
-           "lab": LabEnv}
+           "lab": LabEnv,
+           "opensim": OpenSim}

 from core.models.empty import EmptyModel
 from core.models.dqn_mlp import DQNMlpModel
@@ -20,6 +22,7 @@ from core.models.acer_cnn_dis import ACERCnnDisModel
 ModelDict = {"empty": EmptyModel,            # contains nothing, only should be used w/ EmptyAgent
              "dqn-mlp": DQNMlpModel,         # for dqn low-level input
+             "dqn-mlp-con": DQNMlpModel,     # for dqn low-level input w/ continuous actions (NOTE: continuous must end in "-con")
              "dqn-cnn": DQNCnnModel,         # for dqn pixel-level input
              "a3c-mlp-con": A3CMlpConModel,  # for a3c low-level input (NOTE: continuous must end in "-con")
              "a3c-cnn-dis": A3CCnnDisModel,  # for a3c pixel-level input
diff --git a/utils/options.py b/utils/options.py
old mode 100644
new mode 100755
index e6aa133..69c575a
--- a/utils/options.py
+++ b/utils/options.py
@@ -22,7 +22,8 @@
 [ "dqn",  "atari",   "BreakoutDeterministic-v4", "dqn-cnn",     "sequential"],   # 4
 [ "a3c",  "atari",   "PongDeterministic-v4",     "a3c-cnn-dis", "none"      ],   # 5
 [ "a3c",  "gym",     "InvertedPendulum-v1",      "a3c-mlp-con", "none"      ],   # 6
-[ "acer", "gym",     "MountainCar-v0",           "acer-mlp-dis","episodic"  ]    # 7 # NOTE: acer under testing
+[ "acer", "gym",     "MountainCar-v0",           "acer-mlp-dis","episodic"  ],   # 7 # NOTE: acer under testing
+[ "dqn",  "opensim", "opensim",                  "dqn-mlp-con", "sequential"]    # 8
 ]

 class Params(object):   # NOTE: shared across all modules
@@ -30,11 +31,11 @@ def __init__(self):
         self.verbose = 0            # 0(warning) | 1(info) | 2(debug)

         # training signature
-        self.machine = "aisdaim"    # "machine_id"
-        self.timestamp = "17082400" # "yymmdd##"
+        self.machine = "hpc011"     # "machine_id"
+        self.timestamp = "1"        # "yymmdd##"
         # training configuration
         self.mode = 1               # 1(train) | 2(test model_file)
-        self.config = 7
+        self.config = 8

         self.seed = 123
         self.render = False         # whether render the window from the original envs or not
@@ -53,7 +54,7 @@ def __init__(self):
             self.hidden_dim = 16
         else:
             self.hist_len = 4
-            self.hidden_dim = 256
+            self.hidden_dim = 512   # 256

         self.use_cuda = torch.cuda.is_available()
         self.dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
@@ -139,8 +140,12 @@ def __init__(self):
             self.wid_state = 80
             self.preprocess_mode = 3    # 0(nothing) | 1(rgb2gray) | 2(rgb2y) | 3(crop&resize depth)
             self.img_encoding_type = "passthrough"
+
+        elif self.env_type == "opensim":
+            pass
+
         else:
-            assert False, "env_type must be: gym | atari-ram | atari | lab"
+            assert False, "env_type must be: gym | atari-ram | atari | lab | opensim"

 class ModelParams(Params):  # settings for network architecture
     def __init__(self):
@@ -228,6 +233,31 @@ def __init__(self):
             self.action_repetition = 4
             self.memory_interval = 1
             self.train_interval = 4
+        elif self.agent_type == "dqn" and self.env_type == "opensim":
+            self.steps = 50000000           # max #iterations
+            self.early_stop = None          # max #steps per episode
+            self.gamma = 0.99
+            self.clip_grad = 40.            # np.inf
+            self.lr = 0.00025
+            self.lr_decay = False
+            self.weight_decay = 0.
+            self.eval_freq = 250000         # 12500  # NOTE: here means every this many steps
+            self.eval_steps = 125000        # 2500
+            self.prog_freq = 10000          # self.eval_freq
+            self.test_nepisodes = 1
+
+            self.learn_start = 50000        # start update params after this many steps
+            self.batch_size = 32
+            self.valid_size = 500
+            self.eps_start = 1
+            self.eps_end = 0.1
+            self.eps_eval = 0.              # 0.05
+            self.eps_decay = 1000000
+            self.target_model_update = 10000
+            self.action_repetition = 4
+            self.memory_interval = 1
+            self.train_interval = 4
+
         elif self.agent_type == "a3c":
             self.steps = 20000000           # max #iterations
             self.early_stop = None          # max #steps per episode