# main.py
import logging
from itertools import count

import gym
import gym.spaces
import numpy as np
from tqdm import tqdm

from utils.gym import get_env, get_wrapper_by_name
from utils.plot import plot_line
from manipulation_main.common import io_utils

logging.getLogger().setLevel(logging.DEBUG)
# Hyperparameters
config = io_utils.load_yaml("config/gripper_grasp.yaml")
env = get_env("gripper-env-v0", config=config, seed=0, idx_to_save_video=(0,))
gamma = 0.99
total_timestep = 100_000
assert isinstance(env.observation_space, gym.spaces.Box)
assert isinstance(env.action_space, gym.spaces.Box)
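# Both spaces are continuous Box spaces, so actions can be drawn uniformly
# within the bounds defined below. The env returned by get_env must include a
# "Monitor" wrapper; it is queried later for per-episode rewards.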
###############
# BUILD MODEL #
###############
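# Image observation dimensions; not used by the random-action loop below,
# presumably kept as a reference for a learned model's input size.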
img_h, img_w, img_c = 64, 64, 3
action_shape = env.action_space.shape
action_min = env.action_space.low
action_max = env.action_space.high
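# action_min/action_max are used below to rescale uniform [0, 1) samples
# into the environment's action range.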
###############
# RUN ENV #
###############
num_param_updates = 0
mean_episode_reward = -float('nan')
best_mean_episode_reward = -float('inf')
last_obs = env.reset()
log_every_n_steps = 1000
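# Running statistics: timesteps and recent mean rewards for plotting;
# 'Qs' and 'best_avg_reward' are placeholders that this loop never updates.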
metrics = {'steps': [], 'rewards': [], 'Qs': [], 'best_avg_reward': -float('inf')}
with tqdm(total=total_timestep) as pbar:
    for t in count():
        # Update progress bar
        pbar.n = t
        pbar.refresh()

        ### Check stopping criterion
        if t >= total_timestep:
            break

        # Sample a uniform random action and rescale it into the env's bounds
        action = np.random.rand(*action_shape)
        action = action * (action_max - action_min) + action_min
        obs, reward, done, _ = env.step(action)
        # Scale rewards into [-1, 1]; raw rewards lie roughly in (-200, 10000)
        reward = reward / 10000.0
        # Reset the environment when reaching an episode boundary
        if done:
            obs = env.reset()
        last_obs = obs

        ### Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        if t % log_every_n_steps == 0:
            tqdm.write("=========================================")
            tqdm.write("Timestep %d" % (t,))
            tqdm.write("mean reward (100 episodes) %f" % mean_episode_reward)
            tqdm.write("best mean reward %f" % best_mean_episode_reward)
            tqdm.write("episodes %d" % len(episode_rewards))
            tqdm.write("=========================================")
            # Track the mean reward of the last 5 episodes for plotting
            metrics['rewards'].append([np.mean(episode_rewards[-5:])])
            metrics['steps'].append(t)
            plot_line(metrics['steps'], metrics['rewards'], 'Reward')
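
# Assumption: the Monitor wrapper follows the standard gym API, so closing the
# environment flushes any episode videos requested via idx_to_save_video.
env.close()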