Diffstat (limited to 'examples/pybullet/gym/pybullet_envs/ARS/ars.py')
-rw-r--r--  examples/pybullet/gym/pybullet_envs/ARS/ars.py  460
1 file changed, 238 insertions(+), 222 deletions(-)
diff --git a/examples/pybullet/gym/pybullet_envs/ARS/ars.py b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
index 760ebd732..9e9c2b559 100644
--- a/examples/pybullet/gym/pybullet_envs/ARS/ars.py
+++ b/examples/pybullet/gym/pybullet_envs/ARS/ars.py
@@ -4,7 +4,7 @@ import os
import inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
-os.sys.path.insert(0,parentdir)
+os.sys.path.insert(0, parentdir)
# Importing the libraries
import os
@@ -20,17 +20,17 @@ import argparse
# Setting the Hyper Parameters
class Hp():
-
- def __init__(self):
- self.nb_steps = 10000
- self.episode_length = 1000
- self.learning_rate = 0.02
- self.nb_directions = 16
- self.nb_best_directions = 16
- assert self.nb_best_directions <= self.nb_directions
- self.noise = 0.03
- self.seed = 1
- self.env_name = 'HalfCheetahBulletEnv-v0'
+
+ def __init__(self):
+ self.nb_steps = 10000
+ self.episode_length = 1000
+ self.learning_rate = 0.02
+ self.nb_directions = 16
+ self.nb_best_directions = 16
+ assert self.nb_best_directions <= self.nb_directions
+ self.noise = 0.03
+ self.seed = 1
+ self.env_name = 'HalfCheetahBulletEnv-v0'
# Multiprocess Exploring the policy on one specific direction and over one episode
@@ -39,239 +39,255 @@ _RESET = 1
_CLOSE = 2
_EXPLORE = 3
-def ExploreWorker(rank,childPipe, envname, args):
- env = gym.make(envname)
- nb_inputs = env.observation_space.shape[0]
- normalizer = Normalizer(nb_inputs)
- observation_n = env.reset()
- n=0
- while True:
- n+=1
- try:
- # Only block for short times to have keyboard exceptions be raised.
- if not childPipe.poll(0.001):
- continue
- message, payload = childPipe.recv()
- except (EOFError, KeyboardInterrupt):
- break
- if message == _RESET:
- observation_n = env.reset()
- childPipe.send(["reset ok"])
- continue
- if message == _EXPLORE:
- #normalizer = payload[0] #use our local normalizer
- policy = payload[1]
- hp = payload[2]
- direction = payload[3]
- delta = payload[4]
- state = env.reset()
- done = False
- num_plays = 0.
- sum_rewards = 0
- while not done and num_plays < hp.episode_length:
- normalizer.observe(state)
- state = normalizer.normalize(state)
- action = policy.evaluate(state, delta, direction,hp)
- state, reward, done, _ = env.step(action)
- reward = max(min(reward, 1), -1)
- sum_rewards += reward
- num_plays += 1
- childPipe.send([sum_rewards])
+
+def ExploreWorker(rank, childPipe, envname, args):
+ env = gym.make(envname)
+ nb_inputs = env.observation_space.shape[0]
+ normalizer = Normalizer(nb_inputs)
+ observation_n = env.reset()
+ n = 0
+ while True:
+ n += 1
+ try:
+ # Only block for short times to have keyboard exceptions be raised.
+ if not childPipe.poll(0.001):
continue
- if message == _CLOSE:
- childPipe.send(["close ok"])
- break
- childPipe.close()
-
+ message, payload = childPipe.recv()
+ except (EOFError, KeyboardInterrupt):
+ break
+ if message == _RESET:
+ observation_n = env.reset()
+ childPipe.send(["reset ok"])
+ continue
+ if message == _EXPLORE:
+ #normalizer = payload[0] #use our local normalizer
+ policy = payload[1]
+ hp = payload[2]
+ direction = payload[3]
+ delta = payload[4]
+ state = env.reset()
+ done = False
+ num_plays = 0.
+ sum_rewards = 0
+ while not done and num_plays < hp.episode_length:
+ normalizer.observe(state)
+ state = normalizer.normalize(state)
+ action = policy.evaluate(state, delta, direction, hp)
+ state, reward, done, _ = env.step(action)
+ reward = max(min(reward, 1), -1)
+ sum_rewards += reward
+ num_plays += 1
+ childPipe.send([sum_rewards])
+ continue
+ if message == _CLOSE:
+ childPipe.send(["close ok"])
+ break
+ childPipe.close()
+
# Normalizing the states
+
class Normalizer():
-
- def __init__(self, nb_inputs):
- self.n = np.zeros(nb_inputs)
- self.mean = np.zeros(nb_inputs)
- self.mean_diff = np.zeros(nb_inputs)
- self.var = np.zeros(nb_inputs)
-
- def observe(self, x):
- self.n += 1.
- last_mean = self.mean.copy()
- self.mean += (x - self.mean) / self.n
- self.mean_diff += (x - last_mean) * (x - self.mean)
- self.var = (self.mean_diff / self.n).clip(min = 1e-2)
-
- def normalize(self, inputs):
- obs_mean = self.mean
- obs_std = np.sqrt(self.var)
- return (inputs - obs_mean) / obs_std
+
+ def __init__(self, nb_inputs):
+ self.n = np.zeros(nb_inputs)
+ self.mean = np.zeros(nb_inputs)
+ self.mean_diff = np.zeros(nb_inputs)
+ self.var = np.zeros(nb_inputs)
+
+ def observe(self, x):
+ self.n += 1.
+ last_mean = self.mean.copy()
+ self.mean += (x - self.mean) / self.n
+ self.mean_diff += (x - last_mean) * (x - self.mean)
+ self.var = (self.mean_diff / self.n).clip(min=1e-2)
+
+ def normalize(self, inputs):
+ obs_mean = self.mean
+ obs_std = np.sqrt(self.var)
+ return (inputs - obs_mean) / obs_std
+
# Building the AI
+
class Policy():
- def __init__(self, input_size, output_size, env_name, args):
- try:
- self.theta = np.load(args.policy)
- except:
- self.theta = np.zeros((output_size, input_size))
- self.env_name = env_name
- print("Starting policy theta=",self.theta)
- def evaluate(self, input, delta, direction, hp):
- if direction is None:
- return np.clip(self.theta.dot(input), -1.0, 1.0)
- elif direction == "positive":
- return np.clip((self.theta + hp.noise*delta).dot(input), -1.0, 1.0)
- else:
- return np.clip((self.theta - hp.noise*delta).dot(input), -1.0, 1.0)
-
- def sample_deltas(self):
- return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]
-
- def update(self, rollouts, sigma_r, args):
- step = np.zeros(self.theta.shape)
- for r_pos, r_neg, d in rollouts:
- step += (r_pos - r_neg) * d
- self.theta += hp.learning_rate / (hp.nb_best_directions * sigma_r) * step
- timestr = time.strftime("%Y%m%d-%H%M%S")
- np.save(args.logdir+"/policy_"+self.env_name+"_"+timestr+".npy", self.theta)
+
+ def __init__(self, input_size, output_size, env_name, args):
+ try:
+ self.theta = np.load(args.policy)
+ except:
+ self.theta = np.zeros((output_size, input_size))
+ self.env_name = env_name
+ print("Starting policy theta=", self.theta)
+
+ def evaluate(self, input, delta, direction, hp):
+ if direction is None:
+ return np.clip(self.theta.dot(input), -1.0, 1.0)
+ elif direction == "positive":
+ return np.clip((self.theta + hp.noise * delta).dot(input), -1.0, 1.0)
+ else:
+ return np.clip((self.theta - hp.noise * delta).dot(input), -1.0, 1.0)
+
+ def sample_deltas(self):
+ return [np.random.randn(*self.theta.shape) for _ in range(hp.nb_directions)]
+
+ def update(self, rollouts, sigma_r, args):
+ step = np.zeros(self.theta.shape)
+ for r_pos, r_neg, d in rollouts:
+ step += (r_pos - r_neg) * d
+ self.theta += hp.learning_rate / (hp.nb_best_directions * sigma_r) * step
+ timestr = time.strftime("%Y%m%d-%H%M%S")
+ np.save(args.logdir + "/policy_" + self.env_name + "_" + timestr + ".npy", self.theta)
# Exploring the policy on one specific direction and over one episode
+
def explore(env, normalizer, policy, direction, delta, hp):
- state = env.reset()
- done = False
- num_plays = 0.
- sum_rewards = 0
- while not done and num_plays < hp.episode_length:
- normalizer.observe(state)
- state = normalizer.normalize(state)
- action = policy.evaluate(state, delta, direction, hp)
- state, reward, done, _ = env.step(action)
- reward = max(min(reward, 1), -1)
- sum_rewards += reward
- num_plays += 1
- return sum_rewards
+ state = env.reset()
+ done = False
+ num_plays = 0.
+ sum_rewards = 0
+ while not done and num_plays < hp.episode_length:
+ normalizer.observe(state)
+ state = normalizer.normalize(state)
+ action = policy.evaluate(state, delta, direction, hp)
+ state, reward, done, _ = env.step(action)
+ reward = max(min(reward, 1), -1)
+ sum_rewards += reward
+ num_plays += 1
+ return sum_rewards
+
# Training the AI
+
def train(env, policy, normalizer, hp, parentPipes, args):
-
- for step in range(hp.nb_steps):
-
- # Initializing the perturbations deltas and the positive/negative rewards
- deltas = policy.sample_deltas()
- positive_rewards = [0] * hp.nb_directions
- negative_rewards = [0] * hp.nb_directions
-
- if parentPipes:
- for k in range(hp.nb_directions):
- parentPipe = parentPipes[k]
- parentPipe.send([_EXPLORE,[normalizer, policy, hp, "positive", deltas[k]]])
- for k in range(hp.nb_directions):
- positive_rewards[k] = parentPipes[k].recv()[0]
-
- for k in range(hp.nb_directions):
- parentPipe = parentPipes[k]
- parentPipe.send([_EXPLORE,[normalizer, policy, hp, "negative", deltas[k]]])
- for k in range(hp.nb_directions):
- negative_rewards[k] = parentPipes[k].recv()[0]
-
- else:
- # Getting the positive rewards in the positive directions
- for k in range(hp.nb_directions):
- positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)
-
-
- # Getting the negative rewards in the negative/opposite directions
- for k in range(hp.nb_directions):
- negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)
-
-
- # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
- all_rewards = np.array(positive_rewards + negative_rewards)
- sigma_r = all_rewards.std()
-
- # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
- scores = {k:max(r_pos, r_neg) for k,(r_pos,r_neg) in enumerate(zip(positive_rewards, negative_rewards))}
- order = sorted(scores.keys(), key = lambda x:scores[x])[:hp.nb_best_directions]
- rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
-
- # Updating our policy
- policy.update(rollouts, sigma_r, args)
-
- # Printing the final reward of the policy after the update
- reward_evaluation = explore(env, normalizer, policy, None, None, hp)
- print('Step:', step, 'Reward:', reward_evaluation)
+
+ for step in range(hp.nb_steps):
+
+ # Initializing the perturbations deltas and the positive/negative rewards
+ deltas = policy.sample_deltas()
+ positive_rewards = [0] * hp.nb_directions
+ negative_rewards = [0] * hp.nb_directions
+
+ if parentPipes:
+ for k in range(hp.nb_directions):
+ parentPipe = parentPipes[k]
+ parentPipe.send([_EXPLORE, [normalizer, policy, hp, "positive", deltas[k]]])
+ for k in range(hp.nb_directions):
+ positive_rewards[k] = parentPipes[k].recv()[0]
+
+ for k in range(hp.nb_directions):
+ parentPipe = parentPipes[k]
+ parentPipe.send([_EXPLORE, [normalizer, policy, hp, "negative", deltas[k]]])
+ for k in range(hp.nb_directions):
+ negative_rewards[k] = parentPipes[k].recv()[0]
+
+ else:
+ # Getting the positive rewards in the positive directions
+ for k in range(hp.nb_directions):
+ positive_rewards[k] = explore(env, normalizer, policy, "positive", deltas[k], hp)
+
+ # Getting the negative rewards in the negative/opposite directions
+ for k in range(hp.nb_directions):
+ negative_rewards[k] = explore(env, normalizer, policy, "negative", deltas[k], hp)
+
+ # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
+ all_rewards = np.array(positive_rewards + negative_rewards)
+ sigma_r = all_rewards.std()
+
+ # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
+ scores = {
+ k: max(r_pos, r_neg)
+ for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))
+ }
+ order = sorted(scores.keys(), key=lambda x: scores[x])[:hp.nb_best_directions]
+ rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k]) for k in order]
+
+ # Updating our policy
+ policy.update(rollouts, sigma_r, args)
+
+ # Printing the final reward of the policy after the update
+ reward_evaluation = explore(env, normalizer, policy, None, None, hp)
+ print('Step:', step, 'Reward:', reward_evaluation)
+
# Running the main code
+
def mkdir(base, name):
- path = os.path.join(base, name)
- if not os.path.exists(path):
- os.makedirs(path)
- return path
+ path = os.path.join(base, name)
+ if not os.path.exists(path):
+ os.makedirs(path)
+ return path
+if __name__ == "__main__":
+ mp.freeze_support()
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--env',
+ help='Gym environment name',
+ type=str,
+ default='HalfCheetahBulletEnv-v0')
+ parser.add_argument('--seed', help='RNG seed', type=int, default=1)
+ parser.add_argument('--render', help='OpenGL Visualizer', type=int, default=0)
+ parser.add_argument('--movie', help='rgb_array gym movie', type=int, default=0)
+ parser.add_argument('--steps', help='Number of steps', type=int, default=10000)
+ parser.add_argument('--policy', help='Starting policy file (npy)', type=str, default='')
+ parser.add_argument('--logdir',
+ help='Directory root to log policy files (npy)',
+ type=str,
+ default='.')
+ parser.add_argument('--mp', help='Enable multiprocessing', type=int, default=1)
-if __name__ == "__main__":
- mp.freeze_support()
-
- parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--env', help='Gym environment name', type=str, default='HalfCheetahBulletEnv-v0')
- parser.add_argument('--seed', help='RNG seed', type=int, default=1)
- parser.add_argument('--render', help='OpenGL Visualizer', type=int, default=0)
- parser.add_argument('--movie',help='rgb_array gym movie',type=int, default=0)
- parser.add_argument('--steps', help='Number of steps', type=int, default=10000)
- parser.add_argument('--policy', help='Starting policy file (npy)', type=str, default='')
- parser.add_argument('--logdir', help='Directory root to log policy files (npy)', type=str, default='.')
- parser.add_argument('--mp', help='Enable multiprocessing', type=int, default=1)
-
- args = parser.parse_args()
-
- hp = Hp()
- hp.env_name = args.env
- hp.seed = args.seed
- hp.nb_steps = args.steps
- print("seed = ", hp.seed)
- np.random.seed(hp.seed)
-
- parentPipes = None
- if args.mp:
- num_processes = hp.nb_directions
- processes = []
- childPipes = []
- parentPipes = []
-
- for pr in range (num_processes):
- parentPipe, childPipe = Pipe()
- parentPipes.append(parentPipe)
- childPipes.append(childPipe)
-
- for rank in range(num_processes):
- p = mp.Process(target=ExploreWorker, args=(rank,childPipes[rank], hp.env_name, args))
- p.start()
- processes.append(p)
-
- work_dir = mkdir('exp', 'brs')
- monitor_dir = mkdir(work_dir, 'monitor')
- env = gym.make(hp.env_name)
- if args.render:
- env.render(mode = "human")
- if args.movie:
- env = wrappers.Monitor(env, monitor_dir, force = True)
- nb_inputs = env.observation_space.shape[0]
- nb_outputs = env.action_space.shape[0]
- policy = Policy(nb_inputs, nb_outputs,hp.env_name, args)
- normalizer = Normalizer(nb_inputs)
-
- print("start training")
- train(env, policy, normalizer, hp, parentPipes, args)
-
- if args.mp:
- for parentPipe in parentPipes:
- parentPipe.send([_CLOSE,"pay2"])
-
- for p in processes:
- p.join()
+ args = parser.parse_args()
+
+ hp = Hp()
+ hp.env_name = args.env
+ hp.seed = args.seed
+ hp.nb_steps = args.steps
+ print("seed = ", hp.seed)
+ np.random.seed(hp.seed)
+
+ parentPipes = None
+ if args.mp:
+ num_processes = hp.nb_directions
+ processes = []
+ childPipes = []
+ parentPipes = []
+
+ for pr in range(num_processes):
+ parentPipe, childPipe = Pipe()
+ parentPipes.append(parentPipe)
+ childPipes.append(childPipe)
+
+ for rank in range(num_processes):
+ p = mp.Process(target=ExploreWorker, args=(rank, childPipes[rank], hp.env_name, args))
+ p.start()
+ processes.append(p)
+
+ work_dir = mkdir('exp', 'brs')
+ monitor_dir = mkdir(work_dir, 'monitor')
+ env = gym.make(hp.env_name)
+ if args.render:
+ env.render(mode="human")
+ if args.movie:
+ env = wrappers.Monitor(env, monitor_dir, force=True)
+ nb_inputs = env.observation_space.shape[0]
+ nb_outputs = env.action_space.shape[0]
+ policy = Policy(nb_inputs, nb_outputs, hp.env_name, args)
+ normalizer = Normalizer(nb_inputs)
+
+ print("start training")
+ train(env, policy, normalizer, hp, parentPipes, args)
+
+ if args.mp:
+ for parentPipe in parentPipes:
+ parentPipe.send([_CLOSE, "pay2"])
+
+ for p in processes:
+ p.join()
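
This commit only reformats ars.py (two-space indentation and consistent spacing); the behavior is unchanged. For reference, below is a minimal, self-contained sketch of the ARS update step that Policy.update() and train() implement. The toy rewards and the array shapes are illustrative assumptions, not part of the commit, and the stand-in returns replace the rollout rewards that explore() would normally gather.

```python
# Minimal sketch of the ARS update performed each step in ars.py (illustrative only).
import numpy as np

np.random.seed(1)
nb_directions, nb_best_directions = 16, 16      # defaults from Hp()
learning_rate, noise = 0.02, 0.03
theta = np.zeros((6, 26))                       # (action_dim, obs_dim); shapes are assumptions

# One perturbation matrix per direction, as in Policy.sample_deltas()
deltas = [np.random.randn(*theta.shape) for _ in range(nb_directions)]

# Stand-ins for the episode returns of the +delta and -delta rollouts
positive_rewards = np.random.randn(nb_directions)
negative_rewards = np.random.randn(nb_directions)

# Reward standard deviation over all rollouts, as in train()
sigma_r = np.concatenate([positive_rewards, negative_rewards]).std()

# Rank directions by max(r_pos, r_neg); note ars.py sorts ascending, which only
# differs when nb_best_directions < nb_directions (the defaults make them equal).
scores = {k: max(positive_rewards[k], negative_rewards[k]) for k in range(nb_directions)}
order = sorted(scores.keys(), key=lambda k: scores[k], reverse=True)[:nb_best_directions]

# theta += alpha / (b * sigma_R) * sum_k (r_pos_k - r_neg_k) * delta_k, as in Policy.update()
step = np.zeros(theta.shape)
for k in order:
    step += (positive_rewards[k] - negative_rewards[k]) * deltas[k]
theta += learning_rate / (nb_best_directions * sigma_r) * step
```

To run the reformatted script itself, the argparse block above accepts invocations such as `python ars.py --env HalfCheetahBulletEnv-v0 --steps 10000 --mp 1`.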