Commit 2267b85e by lsy

Initial commit

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
*.local
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
fig/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
*.jpg
*.jpeg
.idea/
*.npy
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
runs/
prev_runs/
saved_models/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# DS Store
.DS_Store
# saved models
*.pth
*.pt
*.avi
# Learning Subgoal Representations with Slow Dynamics
We propose a slowness objective to effectively learn the subgoal representation
for goal-conditioned hierarchical reinforcement learning. [Our paper](https://openreview.net/pdf?id=wxRwhSdORKG) was accepted at ICLR 2021.
The Python dependencies are as follows:
* Python 3.6 or above
* [PyTorch](https://pytorch.org/)
* [Gym](https://gym.openai.com/)
* [MuJoCo](https://www.roboti.us)
Run the code with ``python train_hier_sac.py``. TensorBoard logs are written to the ``runs`` folder, and the
trained models are saved in the ``saved_models`` folder.
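The slowness objective shapes the subgoal representation so that consecutive states stay close in the latent space, while states that are ``c`` environment steps apart are pushed at least a unit margin apart. A minimal sketch of the loss, mirroring ``slow_update_phi`` in ``train_hier_sac.py`` (batch construction and regularization terms omitted):
```
import torch

def slowness_loss(phi, obs, obs_next, hi_obs, hi_obs_next):
    # pull phi(s_t) and phi(s_{t+1}) together (slowness term)
    min_dist = torch.clamp((phi(obs) - phi(obs_next)).pow(2).mean(dim=1), min=0.)
    # push phi(s_t) and phi(s_{t+c}) at least a unit margin apart (contrastive term)
    max_dist = torch.clamp(1 - (phi(hi_obs) - phi(hi_obs_next)).pow(2).mean(dim=1), min=0.)
    return (min_dist + max_dist).mean()
```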
import numpy as np
class her_sampler:
def __init__(self, replay_strategy, replay_k, threshold, future_step, dense_reward, direction_reward, low_reward_coeff):
self.replay_strategy = replay_strategy
self.replay_k = replay_k
if self.replay_strategy == 'future':
self.future_p = 1 - (1. / (1 + replay_k))
else:
self.future_p = 0
self.threshold = threshold
self.future_step = future_step
self.border_index = None
self.direction_reward = direction_reward
# reward type is not used when direction_reward is set
if not dense_reward:
self.reward_type = 'sparse'
else:
self.reward_type = 'dense'
self.reward_coeff = low_reward_coeff
def reward_func(self, state, goal, info=None):
assert state.shape == goal.shape
dist = np.linalg.norm(state - goal, axis=-1)
if self.reward_type == 'sparse':
return -(dist > self.threshold).astype(np.float32)
else:
return -dist * self.reward_coeff
def direction_reward_func(self, ag_next, goal, ag):
# l2 distance reward: the goal is a desired displacement, so the reward is maximal when ag_next = ag + goal
assert ag.shape == goal.shape
dist = np.linalg.norm(ag + goal - ag_next, axis=-1)
return -dist
# # cosine distance reward
# a_direction = ag_next - ag # achieved direction
# cos_dist = np.sum(np.multiply(a_direction, goal), axis=1) / (
# (np.linalg.norm(a_direction, axis=1) * np.linalg.norm(goal, axis=1)) + 1e-6)
# return cos_dist
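# Worked example (hypothetical numbers): with ag = [0, 0], goal = [1, 0] (a desired step
# along +x) and ag_next = [0.8, 0.1], the l2 direction reward above is
# -||ag + goal - ag_next|| = -||[0.2, -0.1]|| ~= -0.22, so the low-level policy is
# rewarded for moving the achieved goal along the commanded direction.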
def sample_her_transitions(self, episode_batch, batch_size_in_transitions):
T = episode_batch['actions'].shape[1]
rollout_batch_size = episode_batch['actions'].shape[0]
batch_size = batch_size_in_transitions
# select which rollouts and which timesteps to be used
episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
t_samples = np.random.randint(T, size=batch_size)
transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() for key in episode_batch.keys()}
# her idx
her_indexes = np.where(np.random.uniform(size=batch_size) < self.future_p)
# cap the HER relabeling horizon at future_step (useful for large step lengths)
target_index = np.minimum(T, t_samples + self.future_step)
future_offset = np.random.uniform(size=batch_size) * (target_index - t_samples)
future_offset = future_offset.astype(int)
future_t = (t_samples + 1 + future_offset)[her_indexes]
# replace goal with achieved goal
future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t]
transitions['g'][her_indexes] = future_ag
# to get the params to re-compute reward
if not self.direction_reward:
transitions['r'] = np.expand_dims(
self.reward_func(transitions['ag_next'], transitions['g'],
None), 1)
else:
transitions['r'] = np.expand_dims(
self.direction_reward_func(transitions['ag_next'].copy(), transitions['g'].copy(),
transitions['ag'].copy()), 1)
transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()}
return transitions
def sample_her_energy(self, episode_batch, batch_size_in_transitions, temperature=1.0):
T = episode_batch['actions'].shape[1]
rollout_batch_size = episode_batch['actions'].shape[0]
batch_size = batch_size_in_transitions
# select which rollouts and which timesteps to be used
energy_trajectory = episode_batch['e']
p_trajectory = np.power(energy_trajectory, 1 / (temperature + 1e-2))
p_trajectory = p_trajectory / p_trajectory.sum()
episode_idxs = np.random.choice(rollout_batch_size, size=batch_size, replace=True, p=p_trajectory.flatten())
t_samples = np.random.randint(T, size=batch_size)
transitions = {}
for key in episode_batch.keys():
if not key == 'e':
transitions[key] = episode_batch[key][episode_idxs, t_samples].copy()
# her idx
her_indexes = np.where(np.random.uniform(size=batch_size) < self.future_p)
# cap the HER relabeling horizon at future_step (useful for large step lengths)
target_index = np.minimum(T, t_samples + self.future_step)
future_offset = np.random.uniform(size=batch_size) * (target_index - t_samples)
future_offset = future_offset.astype(int)
future_t = (t_samples + 1 + future_offset)[her_indexes]
# replace goal with achieved goal
future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t]
transitions['g'][her_indexes] = future_ag
# to get the params to re-compute reward
if not self.direction_reward:
transitions['r'] = np.expand_dims(
self.reward_func(transitions['ag_next'], transitions['g'],
None), 1)
else:
transitions['r'] = np.expand_dims(
self.direction_reward_func(transitions['ag_next'], transitions['g'],
transitions['ag']), 1)
transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()}
return transitions
def adjust_replay_k(self):
if self.replay_k > 1:
self.replay_k -= 1
if self.replay_strategy == 'future':
self.future_p = 1 - (1. / (1 + self.replay_k))
else:
self.future_p = 0
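A self-contained usage sketch of the sampler above, with toy shapes (all dimensions here are illustrative; in training the episode batch is built by ``replay_buffer.sample`` below, and the module path is taken from the agent's import):
```
import numpy as np
from algos.her import her_sampler

num_episodes, T, obs_dim, goal_dim, act_dim = 4, 50, 30, 2, 8
episode_batch = {
    'obs': np.random.randn(num_episodes, T + 1, obs_dim),
    'ag': np.random.randn(num_episodes, T + 1, goal_dim),
    'g': np.random.randn(num_episodes, T, goal_dim),
    'actions': np.random.randn(num_episodes, T, act_dim),
}
episode_batch['obs_next'] = episode_batch['obs'][:, 1:, :]
episode_batch['ag_next'] = episode_batch['ag'][:, 1:, :]

sampler = her_sampler('future', replay_k=4, threshold=0.5, future_step=200,
                      dense_reward=True, direction_reward=False, low_reward_coeff=0.1)
transitions = sampler.sample_her_transitions(episode_batch, batch_size_in_transitions=16)
# transitions is a dict of (16, dim) arrays with relabeled goals 'g' and recomputed rewards 'r'
```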
import os
import sys
sys.path.append('../')
from datetime import datetime
from tensorboardX import SummaryWriter
from models.networks import *
from algos.replay_buffer import replay_buffer, replay_buffer_energy
from algos.her import her_sampler
# from planner.goal_plan import *
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import time
from algos.sac.sac import SAC
from algos.sac.replay_memory import ReplayMemory, Array_ReplayMemory
import gym
import pickle
# from planner.simhash import HashingBonusEvaluator
from PIL import Image
import imageio
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set_color_codes()
SUBGOAL_RANGE = 200.0
class hier_sac_agent:
def __init__(self, args, env, env_params, test_env, test_env1=None, test_env2=None):
self.args = args
self.env = env
self.test_env = test_env
self.env_params = env_params
self.device = args.device
self.resume = args.resume
self.resume_epoch = args.resume_epoch
self.not_train_low = False
self.test_env1 = test_env1
self.test_env2 = test_env2
self.old_sample = args.old_sample
self.low_dim = env_params['obs']
self.env_params['low_dim'] = self.low_dim
self.hi_dim = env_params['obs']
print("hi_dim", self.hi_dim)
self.learn_goal_space = True
self.whole_obs = False # use whole observation space as subgoal space
self.abs_range = abs_range = args.abs_range # absolute goal range
self.feature_reg = 0.0 # feature l2 regularization
print("abs_range", abs_range)
if args.env_name[:5] == "Fetch":
maze_low = self.env.env.initial_gripper_xpos[:2] - self.env.env.target_range
maze_high = self.env.env.initial_gripper_xpos[:2] + self.env.env.target_range
self.hi_act_space = gym.spaces.Box(low=maze_low, high=maze_high)
else:
if args.env_name != "NChain-v1":
self.hi_act_space = self.env.env.maze_space
else:
self.hi_act_space = gym.spaces.Box(low=np.array([-1]), high=np.array([1]))
if self.learn_goal_space:
if args.env_name == "NChain-v1":
self.hi_act_space = gym.spaces.Box(low=np.array([-abs_range]), high=np.array([abs_range]))
else:
self.hi_act_space = gym.spaces.Box(low=np.array([-abs_range, -abs_range]), high=np.array([abs_range, abs_range]))
if self.whole_obs:
vel_low = [-10.] * 4
vel_high = [10.] * 4
maze_low = np.concatenate((self.env.env.maze_low, np.array(vel_low)))
maze_high = np.concatenate((self.env.env.maze_high, np.array(vel_high)))
self.hi_act_space = gym.spaces.Box(low=maze_low, high=maze_high)
dense_low = True
self.low_use_clip = not dense_low # only sparse reward use clip
if args.replay_strategy == "future":
self.low_forward = True
assert self.low_use_clip is True
else:
self.low_forward = False
assert self.low_use_clip is False
self.hi_sparse = (self.env.env.reward_type == "sparse")
# params of learning phi
resume_phi = args.resume
self.not_update_phi = False
phi_path = args.resume_path
# resume_phi = True
# phi_path = 'saved_models/AntMaze1-v1_Jun01_19-26-19'
# self.not_update_phi = True
self.save_fig = False
self.save_model = False
self.start_update_phi = args.start_update_phi
self.early_stop = args.early_stop # after the success rate converges, stop updating the low-level policy and the feature
if args.env_name in ['AntPush-v1', 'AntFall-v1']:
if self.not_update_phi:
self.early_stop_thres = 900
else:
self.early_stop_thres = 3500
elif args.env_name in ["PointMaze1-v1"]:
self.early_stop_thres = 2000
elif args.env_name == "AntMaze1-v1":
self.early_stop_thres = 3000
else:
self.early_stop_thres = args.n_epochs
print("early_stop_threshold", self.early_stop_thres)
self.success_log = []
# scaling = self.env.env.env.MAZE_SIZE_SCALING
# print("scaling", scaling)
self.count_latent = False
if self.count_latent:
self.hash = HashingBonusEvaluator(512, 2)
self.count_obs = False
if self.count_obs:
self.hash = HashingBonusEvaluator(512, env_params['obs'])
self.high_correct = False
self.k = args.c
self.delta_k = 0
self.prediction_coeff = 0.0
tanh_output = False
self.use_prob = False
print("prediction_coeff", self.prediction_coeff)
if args.save:
current_time = datetime.now().strftime('%b%d_%H-%M-%S')
self.log_dir = 'runs/hier/' + str(args.env_name) + '/RB_Decay_' + current_time + \
"_C_" + str(args.c) + "_Image_" + str(args.image) + \
"_Seed_" + str(args.seed) + "_Reward_" + str(args.low_reward_coeff) + \
"_NoPhi_" + str(self.not_update_phi) + "_LearnG_" + str(self.learn_goal_space) + "_Early_" + str(self.early_stop_thres) + str(args.early_stop)
self.writer = SummaryWriter(log_dir=self.log_dir)
if not os.path.exists(self.args.save_dir):
os.mkdir(self.args.save_dir)
# path to save the model
self.model_path = os.path.join(self.args.save_dir, self.args.env_name + "_" + current_time)
if not os.path.exists(self.model_path):
os.mkdir(self.model_path)
# init low-level network
self.real_goal_dim = self.hi_act_space.shape[0] # low-level goal space and high-level action space
self.init_network()
# init high-level agent
self.hi_agent = SAC(self.hi_dim + env_params['goal'], self.hi_act_space, args, False, env_params['goal'],
args.gradient_flow_value, args.abs_range, tanh_output)
self.env_params['real_goal_dim'] = self.real_goal_dim
self.hi_buffer = ReplayMemory(args.buffer_size)
# her sampler
self.c = self.args.c # interval of high level action
self.low_her_module = her_sampler(args.replay_strategy, args.replay_k, args.distance, args.future_step,
dense_reward=dense_low, direction_reward=False, low_reward_coeff=args.low_reward_coeff)
if args.env_name[:5] == "Fetch":
self.low_buffer = replay_buffer_energy(self.env_params, self.args.buffer_size,
self.low_her_module.sample_her_energy, args.env_name)
else:
self.low_buffer = replay_buffer(self.env_params, self.args.buffer_size, self.low_her_module.sample_her_transitions)
not_load_buffer, not_load_high = True, False
if self.resume is True:
self.start_epoch = self.resume_epoch
if not not_load_high:
self.hi_agent.policy.load_state_dict(torch.load(self.args.resume_path + \
'/hi_actor_model.pt', map_location='cuda:4')[0])
# self.hi_agent.critic.load_state_dict(torch.load(self.args.resume_path + \
# '/hi_critic_model.pt', map_location='cuda:4')[0])
# print("not load low !!!")
print("load low !!!")
self.low_actor_network.load_state_dict(torch.load(self.args.resume_path + \
'/low_actor_model.pt', map_location='cuda:4')[0])
self.low_critic_network.load_state_dict(torch.load(self.args.resume_path + \
'/low_critic_model.pt', map_location='cuda:4')[0])
if not not_load_buffer:
# self.hi_buffer = torch.load(self.args.resume_path + '/hi_buffer.pt', map_location='cuda:1')
self.low_buffer = torch.load(self.args.resume_path + '/low_buffer.pt', map_location='cuda:1')
# sync target network of low-level
self.sync_target()
if hasattr(self.env.env, 'env'):
self.animate = self.env.env.env.visualize_goal
else:
self.animate = self.args.animate
self.distance_threshold = self.args.distance
if not (args.gradient_flow or args.use_prediction or args.gradient_flow_value):
self.representation = RepresentationNetwork(env_params, 3, self.abs_range, self.real_goal_dim).to(args.device)
if args.use_target:
self.target_phi = RepresentationNetwork(env_params, 3, self.abs_range, 2).to(args.device)
# load the weights into the target networks
self.target_phi.load_state_dict(self.representation.state_dict())
self.representation_optim = torch.optim.Adam(self.representation.parameters(), lr=0.0001)
if resume_phi is True:
print("load phi from: ", phi_path)
self.representation.load_state_dict(torch.load(phi_path + \
'/phi_model_4000.pt', map_location='cuda:4')[0])
elif args.use_prediction:
self.representation = DynamicsNetwork(env_params, self.abs_range, 2, tanh_output=tanh_output, use_prob=self.use_prob, device=args.device).to(args.device)
self.representation_optim = torch.optim.Adam(self.representation.parameters(), lr=0.0001)
if resume_phi is True:
print("load phi from: ", phi_path)
self.representation.load_state_dict(torch.load(phi_path + \
'/phi_model_4000.pt', map_location='cuda:1')[0])
print("learn goal space", self.learn_goal_space, " update phi", not self.not_update_phi)
self.train_success = 0
self.furthest_task = 0.
def adjust_lr_actor(self, epoch):
lr_actor = self.args.lr_actor * (0.5 ** (epoch // self.args.lr_decay_actor))
for param_group in self.low_actor_optim.param_groups:
param_group['lr'] = lr_actor
def adjust_lr_critic(self, epoch):
lr_critic = self.args.lr_critic * (0.5 ** (epoch // self.args.lr_decay_critic))
for param_group in self.low_critic_optim.param_groups:
param_group['lr'] = lr_critic
def learn(self):
for epoch in range(self.start_epoch, self.args.n_epochs):
if epoch > 0 and epoch % self.args.lr_decay_actor == 0:
self.adjust_lr_actor(epoch)
if epoch > 0 and epoch % self.args.lr_decay_critic == 0:
self.adjust_lr_critic(epoch)
ep_obs, ep_ag, ep_g, ep_actions = [], [], [], []
last_hi_obs = None
success = 0
observation = self.env.reset()
obs = observation['observation']
ag = observation['achieved_goal'][:self.real_goal_dim]
g = observation['desired_goal']
# identify furthest task
if g[1] >= 8:
self.furthest_task += 1
is_furthest_task = True
else:
is_furthest_task = False
if self.learn_goal_space:
if self.args.gradient_flow:
if self.args.use_target:
ag = self.hi_agent.policy_target.phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()
else:
ag = self.hi_agent.policy.phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()
elif self.args.gradient_flow_value:
ag = self.hi_agent.critic.phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
elif self.args.use_prediction:
ag = self.representation.phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
else:
if self.args.use_target:
ag = self.target_phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
else:
ag = self.representation(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
if self.whole_obs:
ag = obs.copy()
for t in range(self.env_params['max_timesteps']):
act_obs, act_g = self._preproc_inputs(obs, g)
if t % self.c == 0:
hi_act_obs = np.concatenate((obs[:self.hi_dim], g))
# append high-level rollouts
if last_hi_obs is not None:
mask = float(not done)
if self.high_correct:
last_hi_a = ag
self.hi_buffer.push(last_hi_obs, last_hi_a, last_hi_r, hi_act_obs, mask, epoch)
if epoch < self.args.start_epoch:
hi_action = self.hi_act_space.sample()
# print("sample", hi_action)
else:
hi_action = self.hi_agent.select_action(hi_act_obs)
last_hi_obs = hi_act_obs.copy()
last_hi_a = hi_action.copy()
last_hi_r = 0.
done = False
if self.old_sample:
hi_action_for_low = hi_action
else:
# make hi_action a delta phi(s)
hi_action_for_low = ag.copy() + hi_action.copy()
hi_action_for_low = np.clip(hi_action_for_low, -SUBGOAL_RANGE, SUBGOAL_RANGE)
hi_action_tensor = torch.tensor(hi_action_for_low, dtype=torch.float32).unsqueeze(0).to(self.device)
# update high-level policy
if len(self.hi_buffer) > self.args.batch_size:
self.update_hi(epoch)
with torch.no_grad():
if self.not_train_low:
action = self.test_policy(act_obs[:, :self.low_dim], hi_action_tensor)
else:
action = self.explore_policy(act_obs[:, :self.low_dim], hi_action_tensor)
# feed the actions into the environment
observation_new, r, _, info = self.env.step(action)
if info['is_success']:
done = True
# only record the first success
if success == 0 and is_furthest_task:
success = t
self.train_success += 1
if self.animate:
self.env.render()
obs_new = observation_new['observation']
ag_new = observation_new['achieved_goal'][:self.real_goal_dim]
if self.learn_goal_space:
if self.args.gradient_flow:
if self.args.use_target:
ag_new = self.hi_agent.policy_target.phi(
torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()
else:
ag_new = self.hi_agent.policy.phi(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()
elif self.args.gradient_flow_value:
ag_new = self.hi_agent.critic.phi(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()[0]
elif self.args.use_prediction:
ag_new = self.representation.phi(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()[0]
else:
if self.args.use_target:
ag_new = self.target_phi(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()[0]
else:
ag_new = self.representation(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()[0]
if self.whole_obs:
ag_new = obs_new.copy()
if done is False:
if self.count_latent:
self.hash.inc_hash(ag[None])
r += self.hash.predict(ag_new[None])[0] * 0.1
if self.count_obs:
self.hash.inc_hash(obs[None])
r += self.hash.predict(obs_new[None])[0] * 0.1
last_hi_r += r
# append rollouts
ep_obs.append(obs[:self.low_dim].copy())
ep_ag.append(ag.copy())
ep_g.append(hi_action_for_low.copy())
ep_actions.append(action.copy())
# re-assign the observation
obs = obs_new
ag = ag_new
# slowly update phi
if epoch > self.start_update_phi and not self.not_update_phi and not self.args.gradient_flow and not self.args.gradient_flow_value:
self.slow_update_phi(epoch)
if t % self.args.period == 0 and self.args.use_target:
self._soft_update_target_network(self.target_phi, self.representation)
ep_obs.append(obs[:self.low_dim].copy())
ep_ag.append(ag.copy())
mask = float(not done)
hi_act_obs = np.concatenate((obs[:self.hi_dim], g))
self.hi_buffer.push(last_hi_obs, last_hi_a, last_hi_r, hi_act_obs, mask, epoch)
mb_obs = np.array([ep_obs])
mb_ag = np.array([ep_ag])
mb_g = np.array([ep_g])
mb_actions = np.array([ep_actions])
self.low_buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions, success, False])
if self.args.save and self.args.env_name == "NChain-v1":
self.writer.add_scalar('Explore/coverage_' + self.args.env_name, self.env.env.coverage, epoch)
# print("coverage", self.env.env.coverage)
# update low-level
if not self.not_train_low:
for n_batch in range(self.args.n_batches):
self._update_network(epoch, self.low_buffer, self.low_actor_target_network,
self.low_critic_target_network,
self.low_actor_network, self.low_critic_network, 'max_timesteps',
self.low_actor_optim, self.low_critic_optim, use_forward_loss=self.low_forward, clip=self.low_use_clip)
if n_batch % self.args.period == 0:
self._soft_update_target_network(self.low_actor_target_network, self.low_actor_network)
self._soft_update_target_network(self.low_critic_target_network, self.low_critic_network)
# start to do the evaluation
if epoch % self.args.eval_interval == 0 and epoch != 0:
if self.test_env1 is not None:
eval_success1, _ = self._eval_hier_agent(env=self.test_env1)
eval_success2, _ = self._eval_hier_agent(env=self.test_env2)
farthest_success_rate, _ = self._eval_hier_agent(env=self.test_env)
random_success_rate, _ = self._eval_hier_agent(env=self.env)
self.success_log.append(farthest_success_rate)
mean_success = np.mean(self.success_log[-5:])
# stop updating phi and low
if self.early_stop and (mean_success >= 0.9 or epoch > self.early_stop_thres):
print("early stop !!!")
self.not_update_phi = True
self.not_train_low = True
print('[{}] epoch is: {}, eval hier success rate is: {:.3f}'.format(datetime.now(), epoch, random_success_rate))
if self.save_fig:
self.vis_hier_policy(epoch=epoch)
self.visualize_representation(epoch=epoch)
if self.args.save:
print("log_dir: ", self.log_dir)
torch.save([self.hi_agent.critic.state_dict()], self.model_path + '/hi_critic_model.pt')
torch.save([self.low_critic_network.state_dict()], self.model_path + '/low_critic_model.pt')
torch.save(self.hi_buffer, self.model_path + '/hi_buffer.pt')
torch.save(self.low_buffer, self.model_path + '/low_buffer.pt')
if not self.args.gradient_flow and not self.args.gradient_flow_value:
if self.save_model:
# self.cal_MIV(epoch)
torch.save([self.representation.state_dict()], self.model_path + '/phi_model_{}.pt'.format(epoch))
torch.save([self.hi_agent.policy.state_dict()], self.model_path + '/hi_actor_{}.pt'.format(epoch))
torch.save([self.low_actor_network.state_dict()], self.model_path + '/low_actor_{}.pt'.format(epoch))
else:
torch.save([self.representation.state_dict()], self.model_path + '/phi_model.pt')
torch.save([self.hi_agent.policy.state_dict()], self.model_path + '/hi_actor_model.pt')
torch.save([self.low_actor_network.state_dict()], self.model_path + '/low_actor_model.pt')
self.writer.add_scalar('Success_rate/hier_farthest_' + self.args.env_name, farthest_success_rate, epoch)
self.writer.add_scalar('Success_rate/hier_random_' + self.args.env_name, random_success_rate, epoch)
self.writer.add_scalar('Explore/furthest_task_' + self.args.env_name, self.furthest_task, epoch)
if self.test_env1 is not None:
self.writer.add_scalar('Success_rate/eval1_' + self.args.env_name,
eval_success1, epoch)
self.writer.add_scalar('Success_rate/eval2_' + self.args.env_name, eval_success2,
epoch)
# pre_process the inputs
def _preproc_inputs(self, obs, g):
obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(self.device)
g = torch.tensor(g, dtype=torch.float32).unsqueeze(0).to(self.device)
return obs, g
# this function will choose action for the agent and do the exploration
def _select_actions(self, pi):
action = pi.cpu().numpy().squeeze()
if action.shape == ():
action = np.array([action])
# add the gaussian
action += self.args.noise_eps * self.env_params['action_max'] * np.random.randn(*action.shape)
action = np.clip(action, -self.env_params['action_max'], self.env_params['action_max'])
# random actions...
if np.random.rand() < self.args.random_eps:
action = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'], \
size=self.env_params['action'])
return action
def explore_policy(self, obs, goal):
pi = self.low_actor_network(obs, goal)
action = self._select_actions(pi)
return action
def update_hi(self, epoch):
if self.args.gradient_flow or self.args.gradient_flow_value:
sample_data, _ = self.slow_collect()
sample_data = torch.tensor(sample_data, dtype=torch.float32).to(self.device)
else:
sample_data = None
critic_1_loss, critic_2_loss, policy_loss, _, _ = self.hi_agent.update_parameters(self.hi_buffer,
self.args.batch_size,
self.env_params,
self.hi_sparse,
sample_data)
if self.args.save:
self.writer.add_scalar('Loss/hi_critic_1', critic_1_loss, epoch)
self.writer.add_scalar('Loss/hi_critic_2', critic_2_loss, epoch)
self.writer.add_scalar('Loss/hi_policy', policy_loss, epoch)
def random_policy(self, obs, goal):
random_actions = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'], \
size=self.env_params['action'])
return random_actions
def test_policy(self, obs, goal):
pi = self.low_actor_network(obs, goal)
# convert the actions
actions = pi.detach().cpu().numpy().squeeze()
if actions.shape == ():
actions = np.array([actions])
return actions
# soft update: target <- (1 - polyak) * source + polyak * target (args.polyak is the fraction of the old target kept)
def _soft_update_target_network(self, target, source):
for target_param, param in zip(target.parameters(), source.parameters()):
target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)
# update the network
def _update_network(self, epoch, buffer, actor_target, critic_target, actor, critic, T, actor_optim, critic_optim, use_forward_loss=True, clip=True):
# sample the episodes
transitions = buffer.sample(self.args.batch_size)
# pre-process the observation and goal
o, o_next, g, ag = transitions['obs'], transitions['obs_next'], transitions['g'], transitions['ag']
transitions['obs'], transitions['g'] = o, g
transitions['obs_next'], transitions['g_next'] = o_next, g
ag_next = transitions['ag_next']
# start to do the update
obs_cur = transitions['obs']
g_cur = transitions['g']
obs_next = transitions['obs_next']
g_next = transitions['g_next']
# done
dist = np.linalg.norm(ag_next - g_next, axis=1)
not_done = (dist > self.distance_threshold).astype(np.int32).reshape(-1, 1)
# transfer them into the tensor
obs_cur = torch.tensor(obs_cur, dtype=torch.float32).to(self.device)
g_cur = torch.tensor(g_cur, dtype=torch.float32).to(self.device)
obs_next = torch.tensor(obs_next, dtype=torch.float32).to(self.device)
g_next = torch.tensor(g_next, dtype=torch.float32).to(self.device)
ag_next = torch.tensor(ag_next, dtype=torch.float32).to(self.device)
not_done = torch.tensor(not_done, dtype=torch.int32).to(self.device)
actions_tensor = torch.tensor(transitions['actions'], dtype=torch.float32).to(self.device)
r_tensor = torch.tensor(transitions['r'], dtype=torch.float32).to(self.device)
# calculate the target Q value function
with torch.no_grad():
actions_next = actor_target(obs_next, g_next)
q_next_value = critic_target(obs_next, g_next, actions_next)
q_next_value = q_next_value.detach()
target_q_value = r_tensor + critic_target.gamma * q_next_value * not_done
target_q_value = target_q_value.detach()
if clip:
clip_return = self.env_params[T]
target_q_value = torch.clamp(target_q_value, -clip_return, 0.)
# the q loss
real_q_value = critic(obs_cur, g_cur, actions_tensor)
critic_loss = (target_q_value - real_q_value).pow(2).mean()
if use_forward_loss:
forward_loss = critic(obs_cur, ag_next, actions_tensor).pow(2).mean()
critic_loss += forward_loss
# the actor loss
actions_real = actor(obs_cur, g_cur)
actor_loss = -critic(obs_cur, g_cur, actions_real).mean()
actor_loss += self.args.action_l2 * (actions_real / self.env_params['action_max']).pow(2).mean()
# start to update the network
actor_optim.zero_grad()
actor_loss.backward()
torch.nn.utils.clip_grad_norm_(self.low_actor_network.parameters(), 1.0)
actor_optim.step()
# update the critic_network
critic_optim.zero_grad()
critic_loss.backward()
torch.nn.utils.clip_grad_norm_(self.low_critic_network.parameters(), 1.0)
critic_optim.step()
if self.args.save:
if T == 'max_timesteps':
name = 'low'
else:
name = 'high'
self.writer.add_scalar('Loss/' + name + '_actor_loss' + self.args.metric, actor_loss, epoch)
self.writer.add_scalar('Loss/' + name + '_critic_loss' + self.args.metric, critic_loss, epoch)
def _eval_hier_agent(self, env, n_test_rollouts=10):
total_success_rate = []
if not self.args.eval:
n_test_rollouts = self.args.n_test_rollouts
discount_reward = np.zeros(n_test_rollouts)
for roll in range(n_test_rollouts):
per_success_rate = []
observation = env.reset()
obs = observation['observation']
g = observation['desired_goal']
for num in range(self.env_params['max_test_timesteps']):
with torch.no_grad():
act_obs, act_g = self._preproc_inputs(obs, g)
if num % self.c == 0:
hi_act_obs = np.concatenate((obs[:self.hi_dim], g))
hi_action = self.hi_agent.select_action(hi_act_obs, evaluate=True)
if self.old_sample:
new_hi_action = hi_action
else:
ag = self.representation(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
new_hi_action = ag + hi_action
new_hi_action = np.clip(new_hi_action, -SUBGOAL_RANGE, SUBGOAL_RANGE)
hi_action_tensor = torch.tensor(new_hi_action, dtype=torch.float32).unsqueeze(0).to(self.device)
action = self.test_policy(act_obs[:, :self.low_dim], hi_action_tensor)
observation_new, rew, done, info = env.step(action)
if self.animate:
env.render()
obs = observation_new['observation']
g = observation_new['desired_goal']
if done:
per_success_rate.append(info['is_success'])
if bool(info['is_success']):
# print("t:", num)
discount_reward[roll] = 1 - 1. / self.env_params['max_test_timesteps'] * num
break
total_success_rate.append(per_success_rate)
total_success_rate = np.array(total_success_rate)
global_success_rate = np.mean(total_success_rate[:, -1])
global_reward = np.mean(discount_reward)
if self.args.eval:
print("hier success rate", global_success_rate, global_reward)
return global_success_rate, global_reward
def init_network(self):
self.low_actor_network = actor(self.env_params, self.real_goal_dim).to(self.device)
self.low_actor_target_network = actor(self.env_params, self.real_goal_dim).to(self.device)
self.low_critic_network = criticWrapper(self.env_params, self.args, self.real_goal_dim).to(self.device)
self.low_critic_target_network = criticWrapper(self.env_params, self.args, self.real_goal_dim).to(self.device)
self.start_epoch = 0
# create the optimizer
self.low_actor_optim = torch.optim.Adam(self.low_actor_network.parameters(), lr=self.args.lr_actor)
self.low_critic_optim = torch.optim.Adam(self.low_critic_network.parameters(), lr=self.args.lr_critic, weight_decay=1e-5)
def sync_target(self):
# load the weights into the target networks
self.low_actor_target_network.load_state_dict(self.low_actor_network.state_dict())
self.low_critic_target_network.load_state_dict(self.low_critic_network.state_dict())
def slow_update_phi(self, epoch):
sample_data, hi_action = self.slow_collect()
sample_data = torch.tensor(sample_data, dtype=torch.float32).to(self.device)
if not self.args.use_prediction:
obs, obs_next = self.representation(sample_data[0]), self.representation(sample_data[1])
min_dist = torch.clamp((obs - obs_next).pow(2).mean(dim=1), min=0.)
hi_obs, hi_obs_next = self.representation(sample_data[2]), self.representation(sample_data[3])
max_dist = torch.clamp(1 - (hi_obs - hi_obs_next).pow(2).mean(dim=1), min=0.)
representation_loss = (min_dist + max_dist).mean()
# add l2 regularization
representation_loss += self.feature_reg * (obs / self.abs_range).pow(2).mean()
else:
hi_action = torch.tensor(hi_action, dtype=torch.float32).to(self.device)
with torch.no_grad():
target_next_obs = self.representation.phi(sample_data[3])
obs, obs_next = self.representation.phi(sample_data[0]), self.representation.phi(sample_data[1])
min_dist = torch.clamp((obs - obs_next).pow(2).mean(dim=1), min=0.)
hi_obs, hi_obs_next = self.representation.phi(sample_data[2]), self.representation.phi(sample_data[3])
max_dist = torch.clamp(1 - (hi_obs - hi_obs_next).pow(2).mean(dim=1), min=0.)
representation_loss = (min_dist + max_dist).mean()
# prediction loss
if self.use_prob:
predict_distribution = self.representation(sample_data[2], hi_action)
prediction_loss = - predict_distribution.log_prob(target_next_obs).mean()
else:
predict_state = self.representation(sample_data[2], hi_action)
prediction_loss = (predict_state - target_next_obs).pow(2).mean()
representation_loss += self.prediction_coeff * prediction_loss
self.representation_optim.zero_grad()
representation_loss.backward()
self.representation_optim.step()
if self.args.save:
self.writer.add_scalar('Loss/phi_loss' + self.args.metric, representation_loss, epoch)
def slow_collect(self, batch_size=100):
if self.args.use_prediction:
transitions = self.low_buffer.sample(batch_size)
obs, obs_next = transitions['obs'], transitions['obs_next']
hi_obs, hi_action, _, hi_obs_next, _ = self.hi_buffer.sample(batch_size)
hi_obs, hi_obs_next = hi_obs[:, :self.env_params['obs']], hi_obs_next[:, :self.env_params['obs']]
train_data = np.array([obs, obs_next, hi_obs, hi_obs_next])
return train_data, hi_action
else:
# new negative samples
episode_num = self.low_buffer.current_size
obs_array = self.low_buffer.buffers['obs'][:episode_num]
episode_idxs = np.random.randint(0, episode_num, batch_size)
t_samples = np.random.randint(self.env_params['max_timesteps'] - self.k - self.delta_k, size=batch_size)
if self.delta_k > 0:
delta = np.random.randint(self.delta_k, size=batch_size)
else:
delta = 0
hi_obs = obs_array[episode_idxs, t_samples]
hi_obs_next = obs_array[episode_idxs, t_samples + self.k + delta]
obs = hi_obs
obs_next = obs_array[episode_idxs, t_samples + 1 + delta]
train_data = np.array([obs, obs_next, hi_obs, hi_obs_next])
return train_data, None
def visualize_representation(self, epoch):
transitions = self.low_buffer.sample(800)
obs = transitions['obs']
# with open('fig/final/' + "sampled_states.pkl", 'wb') as output:
# pickle.dump(obs, output)
index1 = np.where((obs[:, 0] < 4) & (obs[:, 1] < 4))
index2 = np.where((obs[:, 0] < 4) & (obs[:, 1] > 4))
index3 = np.where((obs[:, 0] > 4) & (obs[:, 1] < 4))
index4 = np.where((obs[:, 0] > 4) & (obs[:, 1] > 4))
index_lst = [index1, index2, index3, index4]
obs_tensor = torch.Tensor(obs).to(self.device)
features = self.representation(obs_tensor).detach().cpu().numpy()
plt.scatter(features[:, 0], features[:, 1], color='green')
plt.show()
# rep = []
# for index in index_lst:
# rep.append(features[index])
#
# self.plot_fig(rep, 'slow_feature', epoch)
#
#
# obs_list = []
# for index in index_lst:
# obs_list.append(obs[index])
# self.plot_fig(obs_list, 'obs', epoch)
'''
tsne_list = []
res_tsne = TSNE(n_components=2).fit_transform(obs)
for index in index_lst:
tsne_list.append(res_tsne[index])
self.plot_fig(tsne_list, 'tsne_feature', epoch)
'''
def plot_fig(self, rep, name, epoch):
fig = plt.figure()
axes = fig.add_subplot(111)
rep1, rep2, rep3, rep4 = rep
def scatter_rep(rep1, c, marker):
if rep1.shape[0] > 0:
l1 = axes.scatter(rep1[:, 0], rep1[:, 1], c=c, marker=marker)
else:
l1 = axes.scatter([], [], c=c, marker=marker)
return l1
l1 = scatter_rep(rep1, c='y', marker='s')
l2 = scatter_rep(rep2, c='r', marker='o')
l3 = scatter_rep(rep3, c='b', marker='1')
l4 = scatter_rep(rep4, c='g', marker='2')
plt.xlabel('x')
plt.ylabel('y')
axes.legend((l1, l2, l3, l4), ('space1', 'space2', 'space3', 'space4'))
plt.savefig('fig/final/' + name + str(epoch) + '.png')
plt.close()
def vis_hier_policy(self, epoch=0, load_obs=None, color_map='RdYlBu'):
obs_vec = []
hi_action_vec = []
env = self.test_env
observation = env.reset()
obs = observation['observation']
obs_vec.append(obs)
g = observation['desired_goal']
if load_obs is None:
for num in range(self.env_params['max_test_timesteps']):
with torch.no_grad():
act_obs, act_g = self._preproc_inputs(obs, g)
if num % self.c == 0:
hi_act_obs = np.concatenate((obs[:self.hi_dim], g))
hi_action = self.hi_agent.select_action(hi_act_obs, evaluate=True)
hi_action_tensor = torch.tensor(hi_action, dtype=torch.float32).unsqueeze(0).to(self.device)
ag = self.representation(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
distance = np.linalg.norm(hi_action - ag)
print("distance", distance)
hi_action_vec.append(hi_action)
action = self.test_policy(act_obs[:, :self.low_dim], hi_action_tensor)
observation_new, rew, done, info = env.step(action)
if self.animate:
env.render()
obs = observation_new['observation']
obs_vec.append(obs)
if done:
if info['is_success']:
print("success !!!")
break
else:
obs_vec = load_obs[0]
plt.figure(figsize=(12, 6))
obs_vec = np.array(obs_vec)
with open('fig/final/' + "img_push_hard.pkl", 'wb') as output:
pickle.dump(obs_vec, output)
self.plot_rollout(obs_vec, "XY_{}".format(epoch * self.env_params['max_timesteps']), 121, goal=g)
if not self.learn_goal_space:
features = obs_vec[:, :2]
feature_goal = g[:2]
else:
obs_tensor = torch.Tensor(obs_vec[:, :self.hi_dim]).to(self.device)
features = self.representation(obs_tensor).detach().cpu().numpy()
# rest = (self.env_params['obs'] - self.env_params['goal']) * [0.]
# g = np.concatenate((g, np.array(rest)))
# g = torch.tensor(g, dtype=torch.float32).unsqueeze(0).to(self.device)
# feature_goal = self.representation(g).detach().cpu().numpy()[0]
feature_goal = None
hi_action_vec = np.array(hi_action_vec)
self.plot_rollout(features, "Feature_{}".format(epoch * self.env_params['max_timesteps']), 122, feature_goal, color_map="Blues",
hi_action_vec = hi_action_vec)
if load_obs is not None and len(load_obs) > 1:
obs_vec = load_obs[1]
obs_tensor = torch.Tensor(obs_vec[:, :self.hi_dim]).to(self.device)
features = self.representation(obs_tensor).detach().cpu().numpy()
self.plot_rollout(features, "Feature_{}".format(epoch * self.env_params['max_timesteps']), 122, feature_goal,
color_map="Wistia")
file_name = 'fig/rebuttal/rollout' + str(epoch) + '.png'
plt.savefig(file_name, bbox_inches='tight', transparent=True)
# plt.show()
plt.close()
def plot_rollout(self, obs_vec, name, num, goal=None, hi_action_vec=None, no_axis=True, color_map='RdYlBu'):
plt.subplot(num)
cm = plt.cm.get_cmap(color_map)
num = np.arange(obs_vec.shape[0])
plt.scatter(obs_vec[:, 0], obs_vec[:, 1], c=num, cmap=cm)
if goal is not None:
plt.scatter([goal[0]], [goal[1]], marker='*',
color='green', s=200, label='goal')
if hi_action_vec is not None:
plt.scatter(hi_action_vec[:, 0], hi_action_vec[:, 1], c="k")
plt.title(name, fontsize=24)
if no_axis:
plt.axis('off')
if not no_axis:
plt.scatter([obs_vec[0, 0]], [obs_vec[0, 1]], marker='+',
color='green', s=200, label='start')
plt.scatter([obs_vec[-1, 0]], [obs_vec[-1, 1]], marker='+',
color='red', s=200, label='end')
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=14, borderaxespad=0.)
# plt.show()
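The agent above is driven through an ``env_params`` dictionary; the entry-point script is not shown here, but the keys it must provide can be read off the usages above. A hypothetical sketch (the dimension values are placeholders):
```
env_params = {
    'obs': 30,                  # observation dimension (also used as hi_dim / low_dim)
    'goal': 2,                  # desired-goal dimension
    'action': 8,                # low-level action dimension
    'action_max': 1.0,          # symmetric bound on low-level actions
    'max_timesteps': 500,       # training episode length (low-level buffer horizon)
    'max_test_timesteps': 500,  # evaluation episode length
}
# 'low_dim' and 'real_goal_dim' are filled in by hier_sac_agent.__init__ itself.
```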
import threading
import numpy as np
import torch
"""
The replay buffer here is adapted from the OpenAI Baselines code.
"""
class replay_buffer:
def __init__(self, env_params, buffer_size, sample_func, name='max_timesteps'):
self.env_params = env_params
self.T = env_params[name]
if name == 'max_timesteps':
# low level
goal_dim = env_params['real_goal_dim']
action_dim = self.env_params['action']
obs_dim = self.env_params['low_dim']
else:
# high level
goal_dim = env_params['goal']
action_dim = env_params['real_goal_dim']
obs_dim = self.env_params['hi_dim']
self.size = buffer_size // self.T
# memory management
self.current_size = 0
self.n_transitions_stored = 0
self.sample_func = sample_func
# create the buffer to store info
self.buffers = {'obs': np.empty([self.size, self.T + 1, obs_dim]),
'ag': np.empty([self.size, self.T + 1, goal_dim]),
'g': np.empty([self.size, self.T, goal_dim]),
'actions': np.empty([self.size, self.T, action_dim]),
'success': np.empty([self.size]),
'done': np.empty([self.size, self.T, 1])
}
self.position = 0 # record the index to update
# store the episode
def store_episode(self, episode_batch):
mb_obs, mb_ag, mb_g, mb_actions, success, done = episode_batch
batch_size = mb_obs.shape[0]
idxs = self._get_storage_idx(inc=batch_size)
# store the information
self.buffers['obs'][idxs] = mb_obs
self.buffers['ag'][idxs] = mb_ag
self.buffers['g'][idxs] = mb_g
self.buffers['actions'][idxs] = mb_actions
self.buffers['success'][idxs] = success
self.buffers['done'][idxs] = done
self.n_transitions_stored += self.T * batch_size
# sample the data from the replay buffer
def sample(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
if key != 'success':
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
# sample transitions
transitions = self.sample_func(temp_buffers, batch_size)
return transitions
def random_sample(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
# sample transitions
# print('start random sample', self.current_size)
T = temp_buffers['actions'].shape[1] # episode length
rollout_batch_size = temp_buffers['actions'].shape[0] # number of stored episodes
batch_size = batch_size # target batches we want to sample
episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
# which traj to sample
t_samples = np.random.randint(T, size=batch_size)
# which step to sample
transitions = {key: temp_buffers[key][episode_idxs, t_samples].copy() for key in temp_buffers.keys()}
transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()}
return transitions
def sample_traj(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
T = temp_buffers['actions'].shape[1] # episode length
num_traj = temp_buffers['actions'].shape[0] # number of stored episodes
episode_idxs = np.random.randint(0, num_traj, batch_size)
traj = {key: temp_buffers[key][episode_idxs, :].copy() for key in temp_buffers.keys()}
# note: obs and ag contain one extra timestep (length T + 1)
return traj
def get_all_data(self):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
return temp_buffers
def _get_storage_idx(self, inc=None):
inc = inc or 1
assert inc == 1
if self.current_size + inc <= self.size:
idx = np.arange(self.current_size, self.current_size + inc)
elif self.current_size < self.size:
overflow = inc - (self.size - self.current_size)
idx_a = np.arange(self.current_size, self.size)
idx_b = np.random.randint(0, self.current_size, overflow)
idx = np.concatenate([idx_a, idx_b])
else:
idx = np.array([self.position])
# idx = np.random.randint(0, self.size, inc)
self.current_size = min(self.size, self.current_size + inc)
self.position = (self.position + 1) % self.size
if inc == 1:
idx = idx[0]
return idx
# update achieved_goal in the buffer
def update_ag(self, phi, device):
all_obs = self.buffers['obs'][:self.current_size].copy()
obs = all_obs.reshape(-1, all_obs.shape[2])
obs_tensor = torch.Tensor(obs).to(device)
ag = phi(obs_tensor).detach().cpu().numpy()
goal_dim = self.buffers['ag'].shape[-1]
ag_new = ag.reshape(self.current_size, -1, goal_dim)
self.buffers["ag"][:self.current_size] = ag_new
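A brief usage sketch matching how ``hier_sac_agent`` feeds this buffer above (the per-episode arrays carry a leading batch axis of 1; the names are illustrative):
```
# mb_obs, mb_ag: shape (1, T + 1, dim); mb_g, mb_actions: shape (1, T, dim)
buffer = replay_buffer(env_params, buffer_size=int(1e6),
                       sample_func=her_module.sample_her_transitions)
buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions, success, False])
transitions = buffer.sample(256)  # HER-relabeled dict: 'obs', 'g', 'actions', 'r', 'obs_next', ...
```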
class replay_buffer_energy:
def __init__(self, env_params, buffer_size, sample_func, env_name, name='max_timesteps'):
self.env_params = env_params
self.T = env_params[name]
if name == 'max_timesteps':
goal_dim = env_params['real_goal_dim']
action_dim = self.env_params['action']
else:
goal_dim = env_params['goal']
action_dim = env_params['real_goal_dim']
self.size = buffer_size // self.T
# memory management
self.current_size = 0
self.n_transitions_stored = 0
self.sample_func = sample_func
# create the buffer to store info
self.buffers = {'obs': np.empty([self.size, self.T + 1, self.env_params['obs']]),
'ag': np.empty([self.size, self.T + 1, goal_dim]),
'g': np.empty([self.size, self.T, goal_dim]),
'actions': np.empty([self.size, self.T, action_dim]),
'e': np.empty([self.size, 1]), # energy
}
self.env_name = env_name
# store the episode
def store_episode(self, episode_batch, w_potential=1.0, w_linear=1.0, clip_energy=0.5):
mb_obs, mb_ag, mb_g, mb_actions = episode_batch
batch_size = mb_obs.shape[0]
idxs = self._get_storage_idx(inc=batch_size)
# store the information
self.buffers['obs'][idxs] = mb_obs
self.buffers['ag'][idxs] = mb_ag
self.buffers['g'][idxs] = mb_g
self.buffers['actions'][idxs] = mb_actions
self.n_transitions_stored += self.T * batch_size
buffers = {}
for key in self.buffers.keys():
buffers[key] = self.buffers[key][idxs][None].copy()
# calculate energy
if self.env_name[:5] == 'Fetch':
g, m, delta_t = 9.81, 1, 0.04
if self.env_name[:9] == 'FetchPush':
potential_energy = 0.
else:
height = buffers['ag'][:, :, 2]
height_0 = np.repeat(height[:, 0].reshape(-1, 1), height[:, 1::].shape[1], axis=1)
height = height[:, 1::] - height_0
potential_energy = g * m * height
diff = np.diff(buffers['ag'], axis=1)
velocity = diff / delta_t
kinetic_energy = 0.5 * m * np.power(velocity, 2)
kinetic_energy = np.sum(kinetic_energy, axis=2)
energy_total = w_potential * potential_energy + w_linear * kinetic_energy
energy_diff = np.diff(energy_total, axis=1)
energy_transition = energy_total.copy()
energy_transition[:, 1::] = energy_diff.copy()
energy_transition = np.clip(energy_transition, 0, clip_energy)
energy_transition_total = np.sum(energy_transition, axis=1)
energy_final = np.sum(energy_transition_total.reshape(-1, 1))
self.buffers['e'][idxs, 0] = energy_final
else:
print('Trajectory Energy Function Not Implemented')
# sample the data from the replay buffer
def sample(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
# sample transitions
transitions = self.sample_func(temp_buffers, batch_size)
return transitions
def random_sample(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
# sample transitions
# print('start random sample', self.current_size)
T = temp_buffers['actions'].shape[1] # episode length
rollout_batch_size = temp_buffers['actions'].shape[0] # number of stored episodes
batch_size = batch_size # target batches we want to sample
episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
# which traj to sample
t_samples = np.random.randint(T, size=batch_size)
# which step to sample
transitions = {key: temp_buffers[key][episode_idxs, t_samples].copy() for key in temp_buffers.keys()}
transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()}
return transitions
def sample_traj(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
T = temp_buffers['actions'].shape[1] # episode length
num_traj = temp_buffers['actions'].shape[0] # number of stored episodes
episode_idxs = np.random.randint(0, num_traj, batch_size)
traj = {key: temp_buffers[key][episode_idxs, :].copy() for key in temp_buffers.keys()}
# note: obs and ag contain one extra timestep (length T + 1)
return traj
def get_all_data(self):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
return temp_buffers
def _get_storage_idx(self, inc=None):
inc = inc or 1
if self.current_size + inc <= self.size:
idx = np.arange(self.current_size, self.current_size + inc)
elif self.current_size < self.size:
overflow = inc - (self.size - self.current_size)
idx_a = np.arange(self.current_size, self.size)
idx_b = np.random.randint(0, self.current_size, overflow)
idx = np.concatenate([idx_a, idx_b])
else:
idx = np.random.randint(0, self.size, inc)
self.current_size = min(self.size, self.current_size + inc)
if inc == 1:
idx = idx[0]
return idx
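The per-trajectory energy stored under ``'e'`` is consumed by ``her_sampler.sample_her_energy`` above, which samples whole trajectories with probability proportional to a temperature-scaled power of their energy. A minimal sketch of that weighting (``energy`` is assumed to be the ``(current_size, 1)`` slice of the ``'e'`` buffer):
```
import numpy as np

def trajectory_probs(energy, temperature=1.0):
    # higher-energy trajectories are sampled more often;
    # a larger temperature flattens the distribution toward uniform
    p = np.power(energy, 1.0 / (temperature + 1e-2))
    return (p / p.sum()).flatten()
```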
### Description
------------
Reimplementation of [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) and a deterministic variant of SAC from [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement
Learning with a Stochastic Actor](https://arxiv.org/pdf/1801.01290.pdf).
Added another branch for [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement
Learning with a Stochastic Actor](https://arxiv.org/pdf/1801.01290.pdf) -> [SAC_V](https://github.com/pranz24/pytorch-soft-actor-critic/tree/SAC_V).
### Requirements
------------
* [mujoco-py](https://github.com/openai/mujoco-py)
* [TensorboardX](https://github.com/lanpa/tensorboardX)
* [PyTorch](http://pytorch.org/)
### Default Arguments and Usage
------------
### Usage
```
usage: main.py [-h] [--env-name ENV_NAME] [--policy POLICY] [--eval EVAL]
[--gamma G] [--tau G] [--lr G] [--alpha G]
[--automatic_entropy_tuning G] [--seed N] [--batch_size N]
[--num_steps N] [--hidden_size N] [--updates_per_step N]
[--start_steps N] [--target_update_interval N]
[--replay_size N] [--cuda]
```
(Note: There is no need for setting Temperature(`--alpha`) if `--automatic_entropy_tuning` is True.)
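For example (assuming the flag takes an explicit boolean value, as listed in the arguments below):
```
python main.py --env-name Humanoid-v2 --automatic_entropy_tuning True
```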
#### For SAC
```
python main.py --env-name Humanoid-v2 --alpha 0.05
```
#### For SAC (Hard Update)
```
python main.py --env-name Humanoid-v2 --alpha 0.05 --tau 1 --target_update_interval 1000
```
#### For SAC (Deterministic, Hard Update)
```
python main.py --env-name Humanoid-v2 --policy Deterministic --tau 1 --target_update_interval 1000
```
### Arguments
------------
```
PyTorch Soft Actor-Critic Args
optional arguments:
-h, --help show this help message and exit
--env-name ENV_NAME Mujoco Gym environment (default: HalfCheetah-v2)
--policy POLICY Policy Type: Gaussian | Deterministic (default:
Gaussian)
--eval EVAL Evaluates the policy every 10 episodes (default:
True)
--gamma G discount factor for reward (default: 0.99)
--tau G target smoothing coefficient (τ) (default: 5e-3)
--lr G learning rate (default: 3e-4)
--alpha G Temperature parameter α determines the relative
importance of the entropy term against the reward
(default: 0.2)
--automatic_entropy_tuning G
Automatically adjust α (default: False)
--seed N random seed (default: 123456)
--batch_size N batch size (default: 256)
--num_steps N maximum number of steps (default: 1e6)
--hidden_size N hidden size (default: 256)
--updates_per_step N model updates per simulator step (default: 1)
--start_steps N Steps sampling random actions (default: 1e4)
--target_update_interval N
Number of updates between target value network updates
(default: 1)
--replay_size N size of replay buffer (default: 1e6)
--cuda run on CUDA (default: False)
```
| Environment **(`--env-name`)**| Temperature **(`--alpha`)**|
| ---------------| -------------|
| HalfCheetah-v2| 0.2|
| Hopper-v2| 0.2|
| Walker2d-v2| 0.2|
| Ant-v2| 0.2|
| Humanoid-v2| 0.05|
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
epsilon = 1e-6
# Initialize Policy weights
def weights_init_(m):
if isinstance(m, nn.Linear):
torch.nn.init.xavier_uniform_(m.weight, gain=1)
torch.nn.init.constant_(m.bias, 0)
class ValueNetwork(nn.Module):
def __init__(self, num_inputs, hidden_dim):
super(ValueNetwork, self).__init__()
self.linear1 = nn.Linear(num_inputs, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
self.apply(weights_init_)
def forward(self, state):
x = F.relu(self.linear1(state))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class QNetwork(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim):
super(QNetwork, self).__init__()
# Q1 architecture
self.linear1 = nn.Linear(num_inputs + num_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
# Q2 architecture
self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim)
self.linear5 = nn.Linear(hidden_dim, hidden_dim)
self.linear6 = nn.Linear(hidden_dim, 1)
self.apply(weights_init_)
def forward(self, state, action):
xu = torch.cat([state, action], 1)
x1 = F.relu(self.linear1(xu))
x1 = F.relu(self.linear2(x1))
x1 = self.linear3(x1)
x2 = F.relu(self.linear4(xu))
x2 = F.relu(self.linear5(x2))
x2 = self.linear6(x2)
return x1, x2
class QNetwork_out(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim):
super(QNetwork_out, self).__init__()
# Q1 architecture
self.linear1 = nn.Linear(num_inputs, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, num_actions)
self.apply(weights_init_)
def forward(self, state):
x1 = F.relu(self.linear1(state))
x1 = F.relu(self.linear2(x1))
x1 = self.linear3(x1)
return x1
class QNetwork_phi(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim, abs_range, tanh_output):
super(QNetwork_phi, self).__init__()
# Q1 network
# obs encoder
obs_models = [nn.Linear(num_inputs-2, hidden_dim)]
obs_models += [nn.ReLU(), nn.Linear(hidden_dim, hidden_dim)]
obs_models += [nn.ReLU(), nn.Linear(hidden_dim, 2)]
self.obs_encoder = nn.Sequential(*obs_models)
# goal input
self.action_input = nn.Linear(num_actions+2, int(hidden_dim / 2))
self.dynamics_layer = nn.Linear(int(hidden_dim / 2) + 2, hidden_dim)
self.output_layer = nn.Linear(hidden_dim, 1)
# Q2 architecture
self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim)
self.linear5 = nn.Linear(hidden_dim, hidden_dim)
self.linear6 = nn.Linear(hidden_dim, 1)
self.tanh_output = tanh_output
self.abs_range = abs_range
self.apply(weights_init_)
def forward(self, state, action):
xu = torch.cat([state, action], 1)
x2 = F.relu(self.linear4(xu))
x2 = F.relu(self.linear5(x2))
x2 = self.linear6(x2)
state = state[:, :-2]
action = torch.cat([state[:, -2:], action], 1)
latent_s = self.obs_encoder(state)
if self.tanh_output:
latent_s = self.abs_range * torch.tanh(latent_s)
action_out = self.action_input(action)
action_out = F.relu(action_out)
x = torch.cat([latent_s, action_out], 1)
x = self.dynamics_layer(x)
x = F.relu(x)
x1 = self.output_layer(x)
return x1, x2
def phi(self, obs):
if len(obs.shape) == 1:
obs = obs.unsqueeze(0)
s = self.obs_encoder(obs)
if self.tanh_output:
s = self.abs_range * torch.tanh(s)
return s
class GaussianPolicy(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim, action_space, goal_dim):
super(GaussianPolicy, self).__init__()
# self.linear1 = nn.Linear(num_inputs - goal_dim, hidden_dim)
# self.goal_input = nn.Linear(goal_dim, hidden_dim)
self.linear1 = nn.Linear(num_inputs, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.goal_dim = goal_dim
self.mean_linear = nn.Linear(hidden_dim, num_actions)
self.log_std_linear = nn.Linear(hidden_dim, num_actions)
# add phi layer
# self.phi_layer1 = nn.Linear(hidden_dim, hidden_dim)
# self.phi_layer2 = nn.Linear(hidden_dim, 2)
# self.phi_layer3 = nn.Linear(hidden_dim, 2)
self.apply(weights_init_)
# action rescaling
if action_space is None:
self.action_scale = torch.tensor(1.)
self.action_bias = torch.tensor(0.)
else:
self.action_scale = torch.FloatTensor(
(action_space.high - action_space.low) / 2.)
self.action_bias = torch.FloatTensor(
(action_space.high + action_space.low) / 2.)
def forward(self, state):
# x = self.linear1(state[..., :-self.goal_dim]) + self.goal_input(state[..., -self.goal_dim:])
x = self.linear1(state)
x = F.relu(x)
x = F.relu(self.linear2(x))
mean = self.mean_linear(x)
log_std = self.log_std_linear(x)
log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX)
return mean, log_std
def phi(self, state):
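# NOTE: this helper relies on the phi_layer1/phi_layer2 modules that are commented out in
# __init__ above; it is only usable if those layers are restored (it is exercised only on
# the args.gradient_flow path of the hierarchical agent).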
x = F.relu(self.linear1(state))
x = F.relu(self.phi_layer1(x))
phi = self.phi_layer2(x)
return phi
def sample(self, state):
mean, log_std = self.forward(state)
std = log_std.exp()
normal = Normal(mean, std)
x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1))
y_t = torch.tanh(x_t)
action = y_t * self.action_scale + self.action_bias
log_prob = normal.log_prob(x_t)
# Enforcing Action Bound
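# tanh change of variables: log pi(a) = log N(x) - sum log(action_scale * (1 - tanh(x)^2) + epsilon)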
log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon)
log_prob = log_prob.sum(1, keepdim=True)
mean = torch.tanh(mean) * self.action_scale + self.action_bias
return action, log_prob, mean
def to(self, device):
self.action_scale = self.action_scale.to(device)
self.action_bias = self.action_bias.to(device)
return super(GaussianPolicy, self).to(device)
class DeterministicPolicy(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None):
super(DeterministicPolicy, self).__init__()
self.linear1 = nn.Linear(num_inputs, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.mean = nn.Linear(hidden_dim, num_actions)
self.noise = torch.Tensor(num_actions)
self.apply(weights_init_)
# action rescaling
if action_space is None:
self.action_scale = 1.
self.action_bias = 0.
else:
self.action_scale = torch.FloatTensor(
(action_space.high - action_space.low) / 2.)
self.action_bias = torch.FloatTensor(
(action_space.high + action_space.low) / 2.)
def forward(self, state):
x = F.relu(self.linear1(state))
x = F.relu(self.linear2(x))
mean = torch.tanh(self.mean(x)) * self.action_scale + self.action_bias
return mean
def sample(self, state):
mean = self.forward(state)
noise = self.noise.normal_(0., std=0.1)
noise = noise.clamp(-0.25, 0.25)
action = mean + noise
return action, torch.tensor(0.), mean
def to(self, device):
self.action_scale = self.action_scale.to(device)
self.action_bias = self.action_bias.to(device)
self.noise = self.noise.to(device)
return super(DeterministicPolicy, self).to(device)
import random
import numpy as np
class ReplayMemory:
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done, epoch):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done, epoch+1)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size)
state, action, reward, next_state, done, _ = map(np.stack, zip(*batch))
return state, action, reward, next_state, done
def __len__(self):
return len(self.buffer)
def get_obs(self):
obs = [x[0] for x in self.buffer]
obs = np.array(obs)
obs_next = [x[3] for x in self.buffer]
obs_next = np.array(obs_next)
return obs.copy(), obs_next.copy()
def pri_sample(self, batch_size, temperature=1.):
tmp_buffer = np.array(self.buffer)
epoch = tmp_buffer[:, -1]
p_trajectory = np.power(epoch, 1 / (temperature + 1e-2))
p_trajectory = p_trajectory / p_trajectory.sum()
p_trajectory = p_trajectory.astype(np.float64)
idxs = np.random.choice(len(self.buffer), size=batch_size, replace=False, p=p_trajectory)
batch = [self.buffer[i] for i in idxs]
state, action, reward, next_state, done, _ = map(np.stack, zip(*batch))
return state, action, reward, next_state, done
def random_sample(self, batch_size):
idxs = np.random.randint(0, len(self.buffer), batch_size)
obs = [self.buffer[i][0] for i in idxs]
obs = np.array(obs)
obs_next = [self.buffer[i][3] for i in idxs]
obs_next = np.array(obs_next)
return obs, obs_next
class Array_ReplayMemory:
def __init__(self, capacity, env_params):
self.capacity = capacity
action_dim = env_params['real_goal_dim']
obs_dim = env_params['obs'] + env_params['goal']
# create the buffer to store info
self.buffers = {'obs': np.empty([capacity, obs_dim]),
'actions': np.empty([capacity, action_dim]),
'reward': np.empty([capacity]),
'next_obs': np.empty([capacity, obs_dim]),
'done': np.empty([capacity])
}
self.position = 0
self.current_size = 0
def push(self, state, action, reward, next_state, done, epoch):
self.buffers['obs'][self.position] = state
self.buffers['actions'][self.position] = action
self.buffers['reward'][self.position] = reward
self.buffers['next_obs'][self.position] = next_state
self.buffers['done'][self.position] = done
self.position = (self.position + 1) % self.capacity
if self.current_size < self.capacity:
    self.current_size += 1
def sample(self, batch_size):
idx = np.random.randint(0, self.current_size, batch_size)
state = self.buffers['obs'][idx]
action = self.buffers['actions'][idx]
reward = self.buffers['reward'][idx]
next_state = self.buffers['next_obs'][idx]
done = self.buffers['done'][idx]
return state, action, reward, next_state, done
def __len__(self):
return self.current_size
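# Illustrative sketch (hypothetical transition contents): push() appends
# (state, action, reward, next_state, done, epoch + 1) tuples and sample()
# returns them as stacked numpy arrays, dropping the epoch field.
if __name__ == '__main__':
    memory = ReplayMemory(capacity=1000)
    for t in range(256):
        s = np.random.randn(4)
        a = np.random.randn(2)
        memory.push(s, a, reward=-1.0, next_state=s + 0.1, done=False, epoch=t // 50)
    states, actions, rewards, next_states, dones = memory.sample(batch_size=32)
    print(states.shape, actions.shape, rewards.shape)   # (32, 4) (32, 2) (32,)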
import os
import torch
import torch.nn.functional as F
from torch.optim import Adam
from algos.sac.utils import soft_update, hard_update
from algos.sac.model import GaussianPolicy, QNetwork, DeterministicPolicy, QNetwork_phi
class SAC(object):
def __init__(self, num_inputs, action_space, args, pri_replay, goal_dim, gradient_flow_value, abs_range, tanh_output):
self.gamma = args.gamma
self.tau = args.tau
self.alpha = args.alpha
self.pri_replay = pri_replay
self.policy_type = args.policy
self.target_update_interval = args.target_update_interval
self.automatic_entropy_tuning = args.automatic_entropy_tuning
self.device = args.device
self.gradient_flow_value = gradient_flow_value
if not gradient_flow_value:
self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
hard_update(self.critic_target, self.critic)
else:
self.critic = QNetwork_phi(num_inputs, action_space.shape[0], args.hidden_size, abs_range, tanh_output).to(device=self.device)
self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
self.critic_target = QNetwork_phi(num_inputs, action_space.shape[0], args.hidden_size, abs_range, tanh_output).to(self.device)
hard_update(self.critic_target, self.critic)
if self.policy_type == "Gaussian":
# Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2), as given in the paper
if self.automatic_entropy_tuning is True:
self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space, goal_dim).to(self.device)
self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
self.policy_target = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space,
goal_dim).to(self.device)
hard_update(self.policy_target, self.policy)
else:
self.alpha = 0
self.automatic_entropy_tuning = False
self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
def select_action(self, state, evaluate=False):
state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
if evaluate is False:
action, _, _ = self.policy.sample(state)
else:
_, _, action = self.policy.sample(state)
return action.detach().cpu().numpy()[0]
def update_parameters(self, memory, batch_size, env_params, hi_sparse, feature_data):
# Sample a batch from memory
if self.pri_replay:
state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.pri_sample(batch_size=batch_size)
else:
state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)
state_batch = torch.FloatTensor(state_batch).to(self.device)
next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
action_batch = torch.FloatTensor(action_batch).to(self.device)
reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)
with torch.no_grad():
next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
# print("min_qf_target", min_qf_next_target.shape)
next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)
if hi_sparse:
# clip target value
next_q_value = torch.clamp(next_q_value, -env_params['max_timesteps'], 0.)
qf1, qf2 = self.critic(state_batch, action_batch) # Two Q-functions to mitigate positive bias in the policy improvement step
# print("qf1", qf1.shape)
# print("next_q", next_q_value.shape)
qf1_loss = F.mse_loss(qf1, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
qf2_loss = F.mse_loss(qf2, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
pi, log_pi, _ = self.policy.sample(state_batch)
qf1_pi, qf2_pi = self.critic(state_batch, pi)
min_qf_pi = torch.min(qf1_pi, qf2_pi)
policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
if feature_data is not None:
if self.gradient_flow_value:
obs, obs_next = self.critic.phi(feature_data[0]), self.critic.phi(feature_data[1])
min_dist = torch.clamp((obs - obs_next).pow(2).mean(dim=1), min=0.)
hi_obs, hi_obs_next = self.critic.phi(feature_data[2]), self.critic.phi(feature_data[3])
max_dist = torch.clamp(1 - (hi_obs - hi_obs_next).pow(2).mean(dim=1), min=0.)
representation_loss = (min_dist + max_dist).mean()
qf1_loss = qf1_loss * 0.1 + representation_loss
else:
obs, obs_next = self.policy.phi(feature_data[0]), self.policy.phi(feature_data[1])
min_dist = torch.clamp((obs - obs_next).pow(2).mean(dim=1), min=0.)
hi_obs, hi_obs_next = self.policy.phi(feature_data[2]), self.policy.phi(feature_data[3])
max_dist = torch.clamp(1 - (hi_obs - hi_obs_next).pow(2).mean(dim=1), min=0.)
representation_loss = (min_dist + max_dist).mean()
policy_loss += representation_loss
self.critic_optim.zero_grad()
qf1_loss.backward()
self.critic_optim.step()
self.critic_optim.zero_grad()
qf2_loss.backward()
self.critic_optim.step()
self.policy_optim.zero_grad()
policy_loss.backward()
self.policy_optim.step()
if self.automatic_entropy_tuning:
alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
self.alpha_optim.zero_grad()
alpha_loss.backward()
self.alpha_optim.step()
self.alpha = self.log_alpha.exp()
alpha_tlogs = self.alpha.clone() # For TensorboardX logs
else:
alpha_loss = torch.tensor(0.).to(self.device)
alpha_tlogs = torch.tensor(self.alpha) # For TensorboardX logs
soft_update(self.critic_target, self.critic, self.tau)
soft_update(self.policy_target, self.policy, self.tau)
return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()
# Save model parameters
def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
if not os.path.exists('models/'):
os.makedirs('models/')
if actor_path is None:
actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
if critic_path is None:
critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
print('Saving models to {} and {}'.format(actor_path, critic_path))
torch.save(self.policy.state_dict(), actor_path)
torch.save(self.critic.state_dict(), critic_path)
# Load model parameters
def load_model(self, actor_path, critic_path):
print('Loading models from {} and {}'.format(actor_path, critic_path))
if actor_path is not None:
self.policy.load_state_dict(torch.load(actor_path))
if critic_path is not None:
self.critic.load_state_dict(torch.load(critic_path))
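# Illustrative sketch (hypothetical sizes and hyperparameters; the full set normally
# comes from the repo's get_args_* parsers): construct the agent with the plain
# QNetwork path (gradient_flow_value=False) and query an action for a random state.
if __name__ == '__main__':
    import argparse
    import numpy as np
    from gym import spaces
    args = argparse.Namespace(gamma=0.99, tau=0.005, alpha=0.2, policy="Gaussian",
                              target_update_interval=1, automatic_entropy_tuning=False,
                              device="cpu", hidden_size=256, lr=3e-4)
    action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)
    agent = SAC(num_inputs=32, action_space=action_space, args=args, pri_replay=False,
                goal_dim=2, gradient_flow_value=False, abs_range=20.0, tanh_output=False)
    action = agent.select_action(np.random.randn(32).astype(np.float32))
    print(action.shape)   # (2,)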
import math
import torch
def create_log_gaussian(mean, log_std, t):
quadratic = -((0.5 * (t - mean) / (log_std.exp())).pow(2))
l = mean.shape
log_z = log_std
z = l[-1] * math.log(2 * math.pi)
log_p = quadratic.sum(dim=-1) - log_z.sum(dim=-1) - 0.5 * z
return log_p
def logsumexp(inputs, dim=None, keepdim=False):
if dim is None:
inputs = inputs.view(-1)
dim = 0
s, _ = torch.max(inputs, dim=dim, keepdim=True)
outputs = s + (inputs - s).exp().sum(dim=dim, keepdim=True).log()
if not keepdim:
outputs = outputs.squeeze(dim)
return outputs
def soft_update(target, source, tau):
for target_param, param in zip(target.parameters(), source.parameters()):
target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
def hard_update(target, source):
for target_param, param in zip(target.parameters(), source.parameters()):
target_param.data.copy_(param.data)
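# Illustrative sketch: soft_update performs Polyak averaging,
# target <- (1 - tau) * target + tau * source, while hard_update copies outright.
if __name__ == '__main__':
    src = torch.nn.Linear(4, 4)
    tgt = torch.nn.Linear(4, 4)
    hard_update(tgt, src)              # tgt now equals src
    soft_update(tgt, src, tau=0.005)   # tgt keeps tracking src
    diff = max((p1 - p2).abs().max().item()
               for p1, p2 in zip(tgt.parameters(), src.parameters()))
    print(diff)   # ~0 (identical up to floating-point rounding)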
import numpy as np
class normalizer:
def __init__(self, size, eps=1e-2, default_clip_range=np.inf):
self.size = size
self.eps = eps
self.default_clip_range = default_clip_range
# some local information
self.local_sum = np.zeros(self.size, np.float32)
self.local_sumsq = np.zeros(self.size, np.float32)
self.local_count = np.zeros(1, np.float32)
# get the total sum sumsq and sum count
self.total_sum = np.zeros(self.size, np.float32)
self.total_sumsq = np.zeros(self.size, np.float32)
self.total_count = np.ones(1, np.float32)
# get the mean and std
self.mean = np.zeros(self.size, np.float32)
self.std = np.ones(self.size, np.float32)
# thread locker
# update the parameters of the normalizer
def update(self, v):
v = v.reshape(-1, self.size)
self.local_sum += v.sum(axis=0)
self.local_sumsq += (np.square(v)).sum(axis=0)
self.local_count[0] += v.shape[0]
def recompute_stats(self):
local_count = self.local_count.copy()
local_sum = self.local_sum.copy()
local_sumsq = self.local_sumsq.copy()
# reset
self.local_count[...] = 0
self.local_sum[...] = 0
self.local_sumsq[...] = 0
# update the total stuff
self.total_sum += local_sum
self.total_sumsq += local_sumsq
self.total_count += local_count
# calculate the new mean and std
self.mean = self.total_sum / self.total_count
self.std = np.sqrt(np.maximum(np.square(self.eps), (self.total_sumsq / self.total_count) - np.square(
self.total_sum / self.total_count)))
# normalize the observation
def normalize(self, v, clip_range=None):
# print('now normalize', v)
if clip_range is None:
clip_range = self.default_clip_range
# print((v - self.mean) / (self.std))
return np.clip((v - self.mean) / (self.std), -clip_range, clip_range)
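# Illustrative sketch (hypothetical data): update() accumulates batch statistics,
# recompute_stats() folds them into the running mean/std, and normalize() whitens
# and clips new inputs to the configured range.
if __name__ == '__main__':
    norm = normalizer(size=3, default_clip_range=5.0)
    batch = np.random.randn(100, 3) * 2.0 + 1.0
    norm.update(batch)
    norm.recompute_stats()
    print(norm.mean, norm.std)          # roughly [1, 1, 1] and [2, 2, 2]
    print(norm.normalize(batch[:2]))    # whitened and clipped to [-5, 5]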
import numpy as np
import torch
from torch import nn
class RandomPolicy(nn.Module):
def __init__(self, action_space, is_binary=False):
nn.Module.__init__(self)
self.action_space = action_space
self.is_binary = is_binary
self.discrete = ('n' in vars(self.action_space))
def random(self):
if self.discrete:
return np.random.randint(self.action_space.n)
else:
low = np.array(self.action_space.low)
high = np.array(self.action_space.high)
if self.is_binary:
return np.random.randint(3, size=self.action_space.shape) - 1
return np.random.random(size=self.action_space.shape) * (high - low) + low
def forward(self, obs, *args):
if isinstance(obs, dict): # goal conditioned environment
obs = obs['observation']
act = torch.Tensor(np.stack([self.random() for i in range(len(obs))], axis=0))
if self.discrete:
act = act.long()
return act
def reset(self, i):
pass
import numpy as np
# [reference] https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py
class RandomProcess(object):
def reset_states(self):
pass
class AnnealedGaussianProcess(RandomProcess):
def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
self.mu = mu
self.sigma = sigma
self.n_steps = 0
if sigma_min is not None:
self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
self.c = sigma
self.sigma_min = sigma_min
else:
self.m = 0.
self.c = sigma
self.sigma_min = sigma
@property
def current_sigma(self):
sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c)
return sigma
# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min,
n_steps_annealing=n_steps_annealing)
self.theta = theta
self.mu = mu
self.dt = dt
self.x0 = x0
self.size = size
self.reset_states()
def sample(self):
x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(
self.dt) * np.random.normal(size=self.size)
self.x_prev = x
self.n_steps += 1
return x
def reset_states(self):
self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size)
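# Illustrative sketch (hypothetical parameters): temporally correlated exploration
# noise; consecutive samples are coupled through x_prev, and reset_states() starts
# a fresh noise trajectory.
if __name__ == '__main__':
    ou = OrnsteinUhlenbeckProcess(theta=0.15, mu=0., sigma=0.2, dt=1e-2, size=2)
    noise = np.stack([ou.sample() for _ in range(5)])
    print(noise.shape)   # (5, 2)
    ou.reset_states()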
import argparse
"""
Here are the param for the training
"""
def get_args_ant():
parser = argparse.ArgumentParser()
# the environment setting
parser.add_argument('--env-name', type=str, default='AntMaze1-v1', help='the environment name')
parser.add_argument('--test', type=str, default='AntMaze1Test-v1')
parser.add_argument('--n-epochs', type=int, default=20000, help='the number of epochs to train the agent')
parser.add_argument('--n-batches', type=int, default=200, help='the times to update the network')
parser.add_argument('--seed', type=int, default=125, help='random seed')
parser.add_argument('--replay-strategy', type=str, default='none', help='the HER strategy')
parser.add_argument('--save-dir', type=str, default='saved_models/', help='the path to save the models')
parser.add_argument('--noise-eps', type=float, default=0.2, help='noise factor for Gaussian')
parser.add_argument('--random-eps', type=float, default=0.2, help="prob for acting randomly")
parser.add_argument('--buffer-size', type=int, default=int(1e6), help='the size of the buffer')
parser.add_argument('--replay-k', type=int, default=5, help='ratio to be replaced')
parser.add_argument('--future-step', type=int, default=200, help='future step to be sampled')
parser.add_argument('--batch-size', type=int, default=128, help='the sample batch size')
parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor')
parser.add_argument('--action-l2', type=float, default=0.0, help='l2 reg')
parser.add_argument('--lr-actor', type=float, default=0.0002, help='the learning rate of the actor')
parser.add_argument('--lr-critic', type=float, default=0.0002, help='the learning rate of the critic')
parser.add_argument('--polyak', type=float, default=0.99, help='the average coefficient')
parser.add_argument('--n-test-rollouts', type=int, default=10, help='the number of tests')
parser.add_argument('--metric', type=str, default='MLP', help='the metric for the distance embedding')
parser.add_argument('--device', type=str, default="cuda:3", help='cuda device')
parser.add_argument('--lr-decay-actor', type=int, default=3000, help='actor learning rate decay')
parser.add_argument('--lr-decay-critic', type=int, default=3000, help='critic learning rate decay')
parser.add_argument('--layer', type=int, default=6, help='number of layers for critic')
parser.add_argument('--period', type=int, default=3, help='target update period')
parser.add_argument('--distance', type=float, default=0.1, help='distance threshold for HER')
parser.add_argument('--resume', type=bool, default=False, help='resume or not')
# Will be considered only if resume is True
parser.add_argument('--resume-epoch', type=int, default=0, help='resume epoch')
parser.add_argument('--resume-path', type=str, default='saved_models/AntPush-v1_Nov16_08-30-42', help='resume path')
# add for hier policy
parser.add_argument('--save', type=bool, default=True, help='save model and tensorboard data')
parser.add_argument('--animate', type=bool, default=False)
parser.add_argument("--eval", type=bool, default=False)
parser.add_argument('--eval_interval', type=int, default=50, help="evaluate once every n episodes")
parser.add_argument('--c', type=int, default=50, help="interval of high-level action")
parser.add_argument('--gradient_flow', type=bool, default=False, help='end-to-end learn feature and policy')
parser.add_argument('--gradient_flow_value', type=bool, default=False, help='use the slow feature as an embedding of the value function')
parser.add_argument('--abs_range', type=float, default=20.0, help='range of high-level action space')
parser.add_argument('--use_target', type=bool, default=False, help='use target network for learning feature')
parser.add_argument('--early_stop', type=bool, default=False, help='early stop the learning of low-level')
parser.add_argument('--low_reward_coeff', type=float, default=0.1, help='low-level reward coeff')
parser.add_argument("--use_prediction", type=bool, default=False, help='use prediction error to learn feature')
parser.add_argument("--start_update_phi", type=int, default=10, help='use prediction error to learn feature')
parser.add_argument("--image", type=bool, default=False, help='use image input')
parser.add_argument("--old_sample", type=bool, default=False, help='sample the absolute goal in the abs_range')
# args of sac
parser.add_argument('--policy', default="Gaussian",
help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
help='Temperature parameter α determines the relative importance of the entropy\
term against the reward (default: 0.2)')
parser.add_argument('--automatic_entropy_tuning', type=bool, default=False, metavar='G',
                    help='Automatically adjust α (default: False)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
help='hidden size (default: 256)')
parser.add_argument('--start_epoch', type=int, default=300, metavar='N',
                    help='Epochs sampling random actions (default: 300)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
help='Value target update per no. of updates per step (default: 1)')
args = parser.parse_args()
return args
def get_args_chain():
parser = argparse.ArgumentParser()
# the environment setting
parser.add_argument('--env-name', type=str, default='NChain-v1', help='the environment name')
parser.add_argument('--test', type=str, default='NChain-v1')
parser.add_argument('--n-epochs', type=int, default=100, help='the number of epochs to train the agent')
parser.add_argument('--n-batches', type=int, default=200, help='the times to update the network')
parser.add_argument('--seed', type=int, default=160, help='random seed')
parser.add_argument('--replay-strategy', type=str, default='none', help='the HER strategy')
parser.add_argument('--save-dir', type=str, default='saved_models/', help='the path to save the models')
parser.add_argument('--noise-eps', type=float, default=0.2, help='noise factor for Gaussian')
parser.add_argument('--random-eps', type=float, default=0.2, help="prob for acting randomly")
parser.add_argument('--buffer-size', type=int, default=int(1e6), help='the size of the buffer')
parser.add_argument('--replay-k', type=int, default=5, help='ratio to be replaced')
parser.add_argument('--future-step', type=int, default=200, help='future step to be sampled')
parser.add_argument('--batch-size', type=int, default=128, help='the sample batch size')
parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor')
parser.add_argument('--action-l2', type=float, default=0.0, help='l2 reg')
parser.add_argument('--lr-actor', type=float, default=0.0002, help='the learning rate of the actor')
parser.add_argument('--lr-critic', type=float, default=0.0002, help='the learning rate of the critic')
parser.add_argument('--polyak', type=float, default=0.99, help='the average coefficient')
parser.add_argument('--n-test-rollouts', type=int, default=10, help='the number of tests')
parser.add_argument('--metric', type=str, default='MLP', help='the metric for the distance embedding')
parser.add_argument('--device', type=str, default="cuda:8", help='cuda device')
parser.add_argument('--lr-decay-actor', type=int, default=3000, help='actor learning rate decay')
parser.add_argument('--lr-decay-critic', type=int, default=3000, help='critic learning rate decay')
parser.add_argument('--layer', type=int, default=6, help='number of layers for critic')
parser.add_argument('--period', type=int, default=3, help='target update period')
parser.add_argument('--distance', type=float, default=0.1, help='distance threshold for HER')
parser.add_argument('--resume', type=bool, default=False, help='resume or not')
# Will be considered only if resume is True
parser.add_argument('--resume-epoch', type=int, default=0, help='resume epoch')
parser.add_argument('--resume-path', type=str, default='saved_models/NChain-v1_Jul29_11-02-57', help='resume path')
# add for hier policy
parser.add_argument('--save', type=bool, default=True, help='save model and tensorboard data')
parser.add_argument('--animate', type=bool, default=False)
parser.add_argument("--eval", type=bool, default=False)
parser.add_argument('--eval_interval', type=int, default=50, help="evaluate once every n episodes")
parser.add_argument('--c', type=int, default=30, help="interval of high-level action")
parser.add_argument('--gradient_flow', type=bool, default=False, help='end-to-end learn feature and policy')
parser.add_argument('--gradient_flow_value', type=bool, default=False, help='use the slow feature as an embedding of the value function')
parser.add_argument('--abs_range', type=float, default=100.0, help='range of high-level action space')
parser.add_argument('--use_target', type=bool, default=False, help='use target network for learning feature')
parser.add_argument('--early_stop', type=bool, default=True, help='early stop the learning of low-level')
parser.add_argument('--low_reward_coeff', type=float, default=0.01, help='low-level reward coeff')
parser.add_argument("--use_prediction", type=bool, default=False, help='use prediction error to learn feature')
parser.add_argument("--start_update_phi", type=int, default=2, help='use prediction error to learn feature')
parser.add_argument("--image", type=bool, default=False, help='use image input')
# args of sac (high-level learning)
parser.add_argument('--policy', default="Gaussian",
help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
help='Temperature parameter α determines the relative importance of the entropy\
term against the reward (default: 0.2)')
parser.add_argument('--automatic_entropy_tuning', type=bool, default=False, metavar='G',
                    help='Automatically adjust α (default: False)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
help='hidden size (default: 256)')
parser.add_argument('--start_epoch', type=int, default=20000, metavar='N',
                    help='Epochs sampling random actions (default: 20000)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
help='Value target update per no. of updates per step (default: 1)')
args = parser.parse_args()
return args
import sys
sys.path.append('../')
from gym.envs.registration import register
import gym
from goal_env.bitflip import BitFlipEnv
from goal_env.fourroom import FourRoom, FourRoom2, FourRoom3, FourRoom4
from goal_env.mountaincar import MountainCarEnv
from goal_env.plane import NaivePlane, NaivePlane2, NaivePlane3, NaivePlane4, NaivePlane5
from goal_env.goal_plane_env import GoalPlane
from goal_env.nchain import NChainEnv
register(
id='Bitflip-v0',
entry_point='goal_env.bitflip:BitFlipEnv',
kwargs={'num_bits': 11},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
N = 64
register(
id='NChain-v1',
entry_point='goal_env.nchain:NChainEnv',
kwargs={'n': N,
'slip': 0.1,
},
max_episode_steps=N+10,
)
register(
id='FourRoom-v0',
entry_point='goal_env.fourroom:FourRoom',
kwargs={'goal_type': 'fix_goal'},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='FourRoom-v1',
entry_point='goal_env.fourroom:FourRoom2',
kwargs={'goal_type': 'fix_goal'},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='FourRoom-v2',
entry_point='goal_env.fourroom:FourRoom3',
kwargs={'goal_type': 'fix_goal'},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='FourRoom-v4',
entry_point='goal_env.fourroom:FourRoom4',
kwargs={'goal_type': 'fix_goal'},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='mcar-v0',
entry_point='goal_env.mountaincar:MountainCarEnv',
kwargs={'goal_dim': 1},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='Plane-v0',
entry_point='goal_env.plane:NaivePlane5',
)
register(
id='GoalPlane-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
}
)
register(
id='GoalPlaneMid-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"type": "mid",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
}
)
register(
id='GoalPlaneHard-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"type": "hard",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
}
)
register(
id='GoalPlaneEasy-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"type": "easy",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
}
)
register(
id='GoalPlaneTest-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
"goals": (2.5, 12.5)
}
)
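# Illustrative sketch (assumes this package and its Plane-v0 dependencies are
# importable): after the registrations above, the goal-conditioned plane env can be
# created through gym.make and returns dict observations.
if __name__ == '__main__':
    env = gym.make('GoalPlane-v0')
    obs = env.reset()
    print(sorted(obs.keys()))   # ['achieved_goal', 'desired_goal', 'observation']
    obs, reward, done, info = env.step(env.action_space.sample())
    print(reward, info['is_success'])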
## copied from RL-Adventure2
import gym
import numpy as np
from gym import spaces
class BitFlipEnv(gym.Env):
def __init__(self, num_bits):
self.num_bits = num_bits
self.observation_space = {
'observation': spaces.Box(low=0, high=1, shape=(self.num_bits,)),
'desired_goal': spaces.Box(low=0, high=1, shape=(self.num_bits,)),
'achieved_goal': spaces.Box(low=0, high=1, shape=(self.num_bits,))
}
self.action_space = spaces.Discrete(self.num_bits)
def get_obs(self):
return {
"observation": np.copy(self.state),
"achieved_goal": np.copy(self.state),
"desired_goal": np.copy(self.target),
}
def reset(self):
self.done = False
self.num_steps = 0
self.state = np.random.randint(2, size=self.num_bits)
self.target = np.random.randint(2, size=self.num_bits)
return self.get_obs()
# return self.state, self.target
def step(self, action):
self.state[action] = 1 - self.state[action]
info = {'is_success': False}
# print(self.state, self.target)
if self.num_steps > self.num_bits + 1:
self.done = True
self.num_steps += 1
if np.sum(self.state == self.target) == self.num_bits:
self.done = True
info = {'is_success': True}
return self.get_obs(), 0, self.done, info
else:
return self.get_obs(), -1, self.done, info
def compute_reward(self, state, goal, info):
calcu = np.sum(state == goal, axis=1)
reward = np.where(calcu == self.num_bits, 0, -1)
return reward
def get_pairwise(self, state, target):
dist = self.num_bits - np.sum(state == target)
return dist
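# Illustrative sketch: a random rollout in the bit-flipping environment; each action
# toggles one bit, and the episode ends on success or after a fixed step budget.
if __name__ == '__main__':
    env = BitFlipEnv(num_bits=11)
    obs = env.reset()
    done = False
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
    print(info['is_success'], reward)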
## importance resampling
import gym
import numpy as np
from gym import spaces
class FourRoom(gym.Env):
def __init__(self, seed=None, goal_type='fix_goal'):
self.n = 11
self.map = np.array([
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]).reshape((self.n, self.n))
self.goal_type = goal_type
self.goal = None
self.init()
def init(self):
self.observation_space = {
'observation': spaces.Box(low=0, high=1, shape=(self.n * self.n,), dtype=np.float32),
'desired_goal': spaces.Box(low=0, high=1, shape=(self.n * self.n,), dtype=np.float32),
'achieved_goal': spaces.Box(low=0, high=1, shape=(self.n * self.n,), dtype=np.float32)
}
self.observation_space['observation'].n = self.n
self.dx = [0, 1, 0, -1]
self.dy = [1, 0, -1, 0]
self.action_space = spaces.Discrete(len(self.dx))
self.reset()
def label2obs(self, x, y):
a = np.zeros((self.n * self.n,))
assert x < self.n and y < self.n
a[x * self.n + y] = 1
return a
def get_obs(self):
assert self.goal is not None
return {
'observation': self.label2obs(self.x, self.y),
'desired_goal': self.label2obs(*self.goal),
'achieved_goal': self.label2obs(self.x, self.y),
}
def reset(self):
condition = True
while condition:
self.x = np.random.randint(1, self.n)
self.y = np.random.randint(1, self.n)
condition = (self.map[self.x, self.y] == 0)
loc = np.where(self.map > 0.5)
assert len(loc) == 2
if self.goal_type == 'random':
goal_idx = np.random.randint(len(loc[0]))
elif self.goal_type == 'fix_goal':
goal_idx = 0
else:
raise NotImplementedError
self.goal = loc[0][goal_idx], loc[1][goal_idx]
self.done = False
return self.get_obs()
def step(self, action):
# assert not self.done
nx, ny = self.x + self.dx[action], self.y + self.dy[action]
info = {'is_success': False}
# before = self.get_obs().argmax()
if self.map[nx, ny]:
self.x, self.y = nx, ny
reward = -1
done = False
else:
reward = -1
done = False
if nx == self.goal[0] and ny == self.goal[1]:
reward = 0
info = {'is_success': True}
done = self.done = True
return self.get_obs(), reward, done, info
def compute_reward(self, state, goal, info):
state_obs = state.argmax(axis=1)
goal_obs = goal.argmax(axis=1)
reward = np.where(state_obs == goal_obs, 0, -1)
return reward
def restore(self, obs):
obs = obs.argmax()
self.x = obs // self.n
self.y = obs % self.n
def bfs_dist(self, state, goal):
# using bfs to search for shortest path
visited = {key: False for key in range(self.n * self.n)}
state_key = state.argmax()
goal_key = goal.argmax()
queue = []
visited[state_key] = True
queue.append(state_key)
dist = [-np.inf] * (self.n * self.n)
dist[state_key] = 0
while (queue):
par = queue.pop(0)
if par == goal_key:
break
x_par, y_par = par // self.n, par % self.n
for action in range(4):
x_child, y_child = x_par + self.dx[action], y_par + self.dy[action]
child = x_child * self.n + y_child
if self.map[x_child, y_child] == 0:
continue
if visited[child] == False:
visited[child] = True
queue.append(child)
dist[child] = dist[par] + 1
return dist[goal_key]
def get_pairwise(self, state, target):
dist = self.bfs_dist(state, target)
return dist
def all_states(self):
states = []
mask = []
for i in range(self.n):
for j in range(self.n):
self.x = i
self.y = j
states.append(self.get_obs())
if isinstance(states[-1], dict):
states[-1] = states[-1]['observation']
mask.append(self.map[self.x, self.y] > 0.5)
return np.array(states)[mask]
def all_edges(self):
A = np.zeros((self.n * self.n, self.n * self.n))
mask = []
for i in range(self.n):
for j in range(self.n):
mask.append(self.map[i, j] > 0.5)
if self.map[i][j]:
for a in range(4):
self.x = i
self.y = j
t = self.step(a)[0]
if isinstance(t, dict):
t = t['observation']
self.restore(t)
A[i * self.n + j, self.x * self.n + self.y] = 1
return A[mask][:, mask]
class FourRoom2(FourRoom):
def __init__(self, *args, **kwargs):
FourRoom.__init__(self, *args, **kwargs)
self.map = np.array([
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]).reshape((self.n, self.n))
class FourRoom3(FourRoom):
def __init__(self, *args, **kwargs):
FourRoom.__init__(self, *args, **kwargs)
self.n = 5
self.map = np.array([
0, 0, 0, 0, 0,
0, 1, 1, 1, 0,
0, 1, 1, 1, 0,
0, 1, 1, 1, 0,
0, 0, 0, 0, 0,
]).reshape((self.n, self.n))
self.init()
class FourRoom4(FourRoom):
def __init__(self, seed=None, *args, **kwargs):
FourRoom.__init__(self, *args, **kwargs)
self.n = 16
self.map = np.array([
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]).reshape((self.n, self.n))
self.init()
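# Illustrative sketch: bfs_dist() gives the shortest-path length between two one-hot
# states, which get_pairwise() exposes for evaluating learned distances.
def _demo_fourroom_distance():
    env = FourRoom(goal_type='fix_goal')
    obs = env.reset()
    dist = env.get_pairwise(obs['observation'], obs['desired_goal'])
    return dist   # number of moves from the sampled start to the fixed goal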
if __name__ == '__main__':
a = FourRoom()
import gym
import copy
import numpy as np
import cv2
from collections import OrderedDict
class GoalPlane(gym.Env):
def __init__(self, env_name, type='random', maze_size=16., action_size=1., distance=0.1, start=None, goals=None):
super(GoalPlane, self).__init__()
self.env = gym.make(env_name)
self.maze_size = maze_size
self.action_size = action_size
self.action_space = gym.spaces.Box(
low=-action_size, high=action_size, shape=(2,), dtype='float32')
self.ob_space = gym.spaces.Box(
low=0., high=maze_size, shape=(2,), dtype='float32')
self.easy_goal_space = gym.spaces.Box(low=np.array([0., 0.]),
high=np.array([self.maze_size, self.maze_size / 2]) \
, dtype=np.float32)
self.mid_goal_space = gym.spaces.Box(low=np.array([self.maze_size / 2, self.maze_size / 2]), \
high=np.array([self.maze_size, self.maze_size]), dtype=np.float32)
self.hard_goal_space = gym.spaces.Box(low=np.array([0., self.maze_size * 0.65]), \
high=np.array([self.maze_size / 2, self.maze_size]), dtype=np.float32)
self.type = type
if self.type == 'random':
self.goal_space = self.ob_space
elif self.type == 'easy':
self.goal_space = self.easy_goal_space
elif self.type == 'mid':
self.goal_space = self.mid_goal_space
elif self.type == 'hard':
self.goal_space = self.hard_goal_space
self.distance = distance
self.goals = goals
self.start = start
self.observation_space = gym.spaces.Dict(OrderedDict({
'observation': self.ob_space,
'desired_goal': self.goal_space,
'achieved_goal': self.ob_space,
}))
self.goal = None
def compute_reward(self, achieved_goal, desired_goal, info):
reward = -np.linalg.norm(achieved_goal - desired_goal, axis=-1)
return reward
def change_mode(self, mode='mid'):
if mode == 'random':
self.goal_space = self.ob_space
elif mode == 'easy':
self.goal_space = self.easy_goal_space
elif mode == 'mid':
self.goal_space = self.mid_goal_space
elif mode == 'hard':
self.goal_space = self.hard_goal_space
def step(self, action):
assert self.goal is not None
observation, reward, done, info = self.env.step(np.array(action) / self.maze_size) # normalize action
observation = np.array(observation) * self.maze_size
out = {'observation': observation,
'desired_goal': self.goal,
'achieved_goal': observation}
reward = -np.linalg.norm(observation - self.goal, axis=-1)
info['is_success'] = (reward > -self.distance)
return out, reward, done, info
def reset(self):
if self.start is not None:
self.env.reset()
observation = np.array(self.start)
self.env.restore(observation / self.maze_size)
else:
observation = self.env.reset()
if self.goals is None:
condition = True
while condition: # note: goal should not be in the block
self.goal = self.goal_space.sample()
condition = self.env.check_inside(self.goal / self.maze_size)
else:
self.goal = np.array(self.goals)
out = {'observation': observation, 'desired_goal': self.goal}
out['achieved_goal'] = observation
return out
def render(self, mode='rgb_array'):
image = self.env.render(mode='rgb_array')
goal_loc = copy.copy(self.goal)
goal_loc[0] = goal_loc[0] / self.maze_size * image.shape[1]
goal_loc[1] = goal_loc[1] / self.maze_size * image.shape[0]
cv2.circle(image, (int(goal_loc[0]), int(goal_loc[1])), 10, (0, 255, 0), -1)
if mode == 'human':
cv2.imshow('image', image)
cv2.waitKey(2)
else:
return image
import math
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding
class MountainCarEnv(gym.Env):
metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 30
}
def __init__(self, goal_dim=1):
self.min_position = -1.2
self.max_position = 0.6
self.max_speed = 0.07
self.goal_position = 0.5
self.force = 0.001
self.gravity = 0.0025
self.low = np.array([self.min_position, -self.max_speed])
self.high = np.array([self.max_position, self.max_speed])
self.viewer = None
self.goal_dim = goal_dim
self.action_space = spaces.Discrete(3)
self.observation_space = {
"achieved_goal": spaces.Box(self.low[:self.goal_dim], self.high[:self.goal_dim], dtype=np.float32),
"desired_goal": spaces.Box(self.low[:self.goal_dim], self.high[:self.goal_dim], dtype=np.float32),
"observation": spaces.Box(self.low, self.high, dtype=np.float32),
}
self.seed()
def get_obs(self):
return {
"achieved_goal": np.array(self.state)[:self.goal_dim],
"desired_goal": np.array([self.goal_position, 0][:self.goal_dim]),
"observation": np.array(self.state),
}
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def step(self, action):
assert self.action_space.contains(
action), "%r (%s) invalid" % (action, type(action))
info = {'is_success': False}
position, velocity = self.state
velocity += (action - 1) * self.force + \
math.cos(3 * position) * (-self.gravity)
velocity = np.clip(velocity, -self.max_speed, self.max_speed)
position += velocity
position = np.clip(position, self.min_position, self.max_position)
if (position == self.min_position and velocity < 0):
velocity = 0
done = bool(position >= self.goal_position)
reward = -1.0
if done:
reward = 0.0
info['is_success'] = True
self.state = (position, velocity)
return self.get_obs(), reward, done, info
# return np.array(self.state), reward, done, {}
def reset(self):
self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0])
return self.get_obs()
# return np.array(self.state)
def _height(self, xs):
return np.sin(3 * xs) * .45 + .55
def render(self, mode='human'):
screen_width = 600
screen_height = 400
world_width = self.max_position - self.min_position
scale = screen_width / world_width
carwidth = 40
carheight = 20
if self.viewer is None:
from gym.envs.classic_control import rendering
self.viewer = rendering.Viewer(screen_width, screen_height)
xs = np.linspace(self.min_position, self.max_position, 100)
ys = self._height(xs)
xys = list(zip((xs - self.min_position) * scale, ys * scale))
self.track = rendering.make_polyline(xys)
self.track.set_linewidth(4)
self.viewer.add_geom(self.track)
clearance = 10
l, r, t, b = -carwidth / 2, carwidth / 2, carheight, 0
car = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
car.add_attr(rendering.Transform(translation=(0, clearance)))
self.cartrans = rendering.Transform()
car.add_attr(self.cartrans)
self.viewer.add_geom(car)
frontwheel = rendering.make_circle(carheight / 2.5)
frontwheel.set_color(.5, .5, .5)
frontwheel.add_attr(rendering.Transform(
translation=(carwidth / 4, clearance)))
frontwheel.add_attr(self.cartrans)
self.viewer.add_geom(frontwheel)
backwheel = rendering.make_circle(carheight / 2.5)
backwheel.add_attr(rendering.Transform(
translation=(-carwidth / 4, clearance)))
backwheel.add_attr(self.cartrans)
backwheel.set_color(.5, .5, .5)
self.viewer.add_geom(backwheel)
flagx = (self.goal_position - self.min_position) * scale
flagy1 = self._height(self.goal_position) * scale
flagy2 = flagy1 + 50
flagpole = rendering.Line((flagx, flagy1), (flagx, flagy2))
self.viewer.add_geom(flagpole)
flag = rendering.FilledPolygon(
[(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)])
flag.set_color(.8, .8, 0)
self.viewer.add_geom(flag)
pos = self.state[0]
self.cartrans.set_translation(
(pos - self.min_position) * scale, self._height(pos) * scale)
self.cartrans.set_rotation(math.cos(3 * pos))
return self.viewer.render(return_rgb_array=mode == 'rgb_array')
def get_keys_to_action(self):
# control with left and right arrow keys
return {(): 1, (276,): 0, (275,): 2, (275, 276): 1}
def close(self):
if self.viewer:
self.viewer.close()
self.viewer = None
def compute_reward(self, state, goal):
'''
to be finished
:param state:
:param goal:
:return:
'''
def get_pairwise(self, state, target):
'''
to be finished
:param state:
:param target:
:return:
'''
from gym.envs.registration import register
import sys
print("path", sys.argv[0].split('/')[-1], "!!!")
if sys.argv[0].split('/')[-1] in ["train_ddpg.py", "visitation_plot.py", "vis_fetch.py"]:
from train_ddpg import args
elif sys.argv[0].split('/')[-1] == "train_hier_ddpg.py":
from train_hier_ddpg import args
elif sys.argv[0].split('/')[-1] == "train_hier_sac.py":
from train_hier_sac import args
elif sys.argv[0].split('/')[-1] == "train_hier_ppo.py":
from train_hier_ppo import args
elif sys.argv[0].split('/')[-1] == "train_covering.py":
from train_covering import args
else:
raise Exception("Unknown main file !!!")
robots = ['Point', 'Ant', 'Swimmer']
task_types = ['Maze', 'Maze1', 'Push', 'Fall', 'Block', 'BlockMaze']
all_name = [x + y for x in robots for y in task_types]
random_start = False
if args.image:
top_down = True
else:
top_down = False
for name_t in all_name:
# episode length
if name_t == "AntMaze":
max_timestep = 1000
else:
max_timestep = 500
for Test in ['', 'Test', 'Test1', 'Test2']:
if Test in ['Test', 'Test1', 'Test2']:
fix_goal = True
else:
if name_t == "AntBlock":
fix_goal = True
else:
fix_goal = False
goal_args = [[-5, -5], [5, 5]]
register(
id=name_t + Test + '-v0',
entry_point='goal_env.mujoco.create_maze_env:create_maze_env',
kwargs={'env_name': name_t, 'goal_args': goal_args, 'maze_size_scaling': 8, 'random_start': random_start},
max_episode_steps=max_timestep,
)
# v1 is the one we use in the main paper
register(
id=name_t + Test + '-v1',
entry_point='goal_env.mujoco.create_maze_env:create_maze_env',
kwargs={'env_name': name_t, 'goal_args': goal_args, 'maze_size_scaling': 4, 'random_start': random_start,
"fix_goal": fix_goal, "top_down_view": top_down, 'test':Test},
max_episode_steps=max_timestep,
)
register(
id=name_t + Test + '-v2',
entry_point='goal_env.mujoco.create_maze_env:create_maze_env',
kwargs={'env_name': name_t, 'goal_args': goal_args, 'maze_size_scaling': 2, 'random_start': random_start},
max_episode_steps=max_timestep,
)
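# Illustrative sketch: the loop above registers ids of the form
# <robot><task>[Test|Test1|Test2]-v{0,1,2}; the -v1 variants (maze_size_scaling=4)
# are the ones used in the main paper, e.g. 'AntMaze1-v1' for training and
# 'AntMaze1Test-v1' for evaluation with a fixed goal.
all_ids = [name + test + version
           for name in all_name
           for test in ['', 'Test', 'Test1', 'Test2']
           for version in ['-v0', '-v1', '-v2']]
# 3 robots x 6 task types x 4 goal settings x 3 scalings = 216 registered ids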
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper for creating the ant environment in gym_mujoco."""
import math
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
def q_inv(a):
return [a[0], -a[1], -a[2], -a[3]]
def q_mult(a, b): # multiply two quaternion
w = a[0] * b[0] - a[1] * b[1] - a[2] * b[2] - a[3] * b[3]
i = a[0] * b[1] + a[1] * b[0] + a[2] * b[3] - a[3] * b[2]
j = a[0] * b[2] - a[1] * b[3] + a[2] * b[0] + a[3] * b[1]
k = a[0] * b[3] + a[1] * b[2] - a[2] * b[1] + a[3] * b[0]
return [w, i, j, k]
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
FILE = "ant.xml"
ORI_IND = 3
def __init__(self, file_path=None, expose_all_qpos=True,
expose_body_coms=None, expose_body_comvels=None, noisy_init=True):
self._expose_all_qpos = expose_all_qpos
self._expose_body_coms = expose_body_coms
self._expose_body_comvels = expose_body_comvels
self._body_com_indices = {}
self._body_comvel_indices = {}
self.noisy_init = noisy_init
self.full_obs = False
self.add_noise = False
mujoco_env.MujocoEnv.__init__(self, file_path, 10)
utils.EzPickle.__init__(self)
@property
def physics(self):
return self.model
def _step(self, a):
return self.step(a)
def step(self, a):
xposbefore = self.get_body_com("torso")[0]
self.do_simulation(a, self.frame_skip)
xposafter = self.get_body_com("torso")[0]
forward_reward = (xposafter - xposbefore) / self.dt
ctrl_cost = .5 * np.square(a).sum()
survive_reward = 1.0
reward = forward_reward - ctrl_cost + survive_reward
state = self.state_vector()
done = False
ob = self._get_obs()
return ob, reward, done, dict(
reward_forward=forward_reward,
reward_ctrl=-ctrl_cost,
reward_survive=survive_reward)
def _get_obs(self):
# No cfrc observation
if self._expose_all_qpos:
obs = np.concatenate([
self.data.qpos.flat[:15], # Ensures only ant obs.
self.data.qvel.flat[:14],
])
else:
obs = np.concatenate([
self.data.qpos.flat[2:15],
self.data.qvel.flat[:14],
])
if self._expose_body_coms is not None:
for name in self._expose_body_coms:
com = self.get_body_com(name)
if name not in self._body_com_indices:
indices = range(len(obs), len(obs) + len(com))
self._body_com_indices[name] = indices
obs = np.concatenate([obs, com])
if self._expose_body_comvels is not None:
for name in self._expose_body_comvels:
comvel = self.get_body_comvel(name)
if name not in self._body_comvel_indices:
indices = range(len(obs), len(obs) + len(comvel))
self._body_comvel_indices[name] = indices
obs = np.concatenate([obs, comvel])
if self.full_obs:
obs = np.concatenate([
self.data.qpos.flat,
self.data.qvel.flat,
np.clip(self.data.cfrc_ext, -1, 1).flat,
])
if self.add_noise:
obs = np.concatenate((obs, np.random.uniform(low=-1, high=1, size=20)))
return obs
def reset_model(self):
if self.noisy_init:
qpos = self.init_qpos + self.np_random.uniform(
size=self.model.nq, low=-.1, high=.1)
qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
else:
qpos = self.init_qpos
qvel = self.init_qvel
# Set everything other than ant to original position and 0 velocity.
qpos[15:] = self.init_qpos[15:]
qvel[14:] = 0.
self.set_state(qpos, qvel)
return self._get_obs()
def viewer_setup(self):
# self.viewer.cam.distance = self.model.stat.extent
# self.viewer.cam.trackbodyid = 1
# self.viewer.cam.distance = self.model.stat.extent * 0.7
# self.viewer.cam.lookat[2] = 0.8925
# self.viewer.cam.elevation = 0
self.viewer.cam.trackbodyid = -1
self.viewer.cam.distance = 30
self.viewer.cam.elevation = -90
def get_ori(self):
ori = [0, 1, 0, 0]
rot = self.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND + 4] # take the quaternion
ori = q_mult(q_mult(rot, ori), q_inv(rot))[1:3] # project onto x-y plane
ori = math.atan2(ori[1], ori[0])
return ori
def set_xy(self, xy):
qpos = np.copy(self.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.data.qvel
self.set_state(qpos, qvel)
def get_xy(self):
return self.data.qpos[:2]
from .maze_env import MazeEnv
from .ant import AntEnv
class AntMazeEnv(MazeEnv):
MODEL_CLASS = AntEnv
<mujoco model="ant">
<compiler inertiafromgeom="true" angle="degree" coordinate="local"/>
<option timestep="0.02" integrator="RK4"/>
<custom>
<numeric name="init_qpos" data="0.0 0.0 0.55 1.0 0.0 0.0 0.0 0.0 1.0 0.0 -1.0 0.0 -1.0 0.0 1.0"/>
</custom>
<default>
<joint limited="true" armature="1" damping="1"/>
<geom condim="3" conaffinity="0" margin="0.01" friction="1 0.5 0.5" solref=".02 1" solimp=".8 .8 .01"
rgba="0.8 0.6 0.4 1" density="5.0"/>
</default>
<asset>
<texture type="skybox" builtin="gradient" width="100" height="100" rgb1="1 1 1" rgb2="0 0 0"/>
<texture name="texgeom" type="cube" builtin="flat" mark="cross" width="127" height="1278" rgb1="0.8 0.6 0.4"
rgb2="0.8 0.6 0.4" markrgb="1 1 1" random="0.01"/>
<texture name="texplane" type="2d" builtin="checker" rgb1="0 0 0" rgb2="0.8 0.8 0.8" width="100" height="100"/>
<material name='MatPlane' texture="texplane" shininess="1" texrepeat="60 60" specular="1" reflectance="0.5"/>
<material name='geom' texture="texgeom" texuniform="true"/>
</asset>
<worldbody>
<light directional="true" cutoff="100" exponent="1" diffuse="1 1 1" specular=".1 .1 .1" pos="0 0 1.3"
dir="-0 0 -1.3"/>
<geom name='floor' pos='0 0 0' size='40 40 40' type='plane' conaffinity='1' rgba='0.8 0.9 0.8 1' condim='3'/>
<body name="torso" pos="0 0 0.75">
<geom name="torso_geom" type="sphere" size="0.25" pos="0 0 0"/>
<joint name="root" type="free" limited="false" pos="0 0 0" axis="0 0 1" margin="0.01" armature="0"
damping="0"/>
<body name="front_left_leg" pos="0 0 0">
<geom name="aux_1_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.2 0.2 0.0"/>
<body name="aux_1" pos="0.2 0.2 0">
<joint name="hip_1" type="hinge" pos="0.0 0.0 0.0" axis="0 0 1" range="-30 30"/>
<geom name="left_leg_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.2 0.2 0.0"/>
<body pos="0.2 0.2 0">
<joint name="ankle_1" type="hinge" pos="0.0 0.0 0.0" axis="-1 1 0" range="30 70"/>
<geom name="left_ankle_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.4 0.4 0.0"/>
</body>
</body>
</body>
<body name="front_right_leg" pos="0 0 0">
<geom name="aux_2_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.2 0.2 0.0"/>
<body name="aux_2" pos="-0.2 0.2 0">
<joint name="hip_2" type="hinge" pos="0.0 0.0 0.0" axis="0 0 1" range="-30 30"/>
<geom name="right_leg_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.2 0.2 0.0"/>
<body pos="-0.2 0.2 0">
<joint name="ankle_2" type="hinge" pos="0.0 0.0 0.0" axis="1 1 0" range="-70 -30"/>
<geom name="right_ankle_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.4 0.4 0.0"/>
</body>
</body>
</body>
<body name="back_leg" pos="0 0 0">
<geom name="aux_3_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.2 -0.2 0.0"/>
<body name="aux_3" pos="-0.2 -0.2 0">
<joint name="hip_3" type="hinge" pos="0.0 0.0 0.0" axis="0 0 1" range="-30 30"/>
<geom name="back_leg_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.2 -0.2 0.0"/>
<body pos="-0.2 -0.2 0">
<joint name="ankle_3" type="hinge" pos="0.0 0.0 0.0" axis="-1 1 0" range="-70 -30"/>
<geom name="third_ankle_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.4 -0.4 0.0"/>
</body>
</body>
</body>
<body name="right_back_leg" pos="0 0 0">
<geom name="aux_4_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.2 -0.2 0.0"/>
<body name="aux_4" pos="0.2 -0.2 0">
<joint name="hip_4" type="hinge" pos="0.0 0.0 0.0" axis="0 0 1" range="-30 30"/>
<geom name="rightback_leg_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.2 -0.2 0.0"/>
<body pos="0.2 -0.2 0">
<joint name="ankle_4" type="hinge" pos="0.0 0.0 0.0" axis="1 1 0" range="30 70"/>
<geom name="fourth_ankle_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.4 -0.4 0.0"/>
</body>
</body>
</body>
</body>
</worldbody>
<actuator>
<motor joint="hip_4" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="ankle_4" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="hip_1" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="ankle_1" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="hip_2" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="ankle_2" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="hip_3" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="ankle_3" ctrlrange="-16.0 16.0" ctrllimited="true"/>
</actuator>
<!--<actuator>-->
<!--<motor joint="hip_4" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="ankle_4" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="hip_1" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="ankle_1" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="hip_2" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="ankle_2" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="hip_3" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="ankle_3" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--</actuator>-->
</mujoco>
<mujoco>
<compiler angle="degree" coordinate="local" inertiafromgeom="true"/>
<option integrator="RK4" timestep="0.02"/>
<default>
<joint armature="0" damping="0" limited="false"/>
<geom conaffinity="0" condim="3" density="100" friction="1 0.5 0.5" margin="0" rgba="0.8 0.6 0.4 1"/>
</default>
<asset>
<texture builtin="gradient" height="100" rgb1="1 1 1" rgb2="0 0 0" type="skybox" width="100"/>
<texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01"
rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
<texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
<material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="30 30" texture="texplane"/>
<material name="geom" texture="texgeom" texuniform="true"/>
</asset>
<worldbody>
<light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3"
specular=".1 .1 .1"/>
<geom conaffinity="1" condim="3" material="MatPlane" name="floor" pos="0 0 0" rgba="0.8 0.9 0.8 1"
size="40 40 40" type="plane"/>
<body name="torso" pos="0 0 0">
<geom name="pointbody" pos="0 0 0.5" size="0.5" type="sphere"/>
<geom name="pointarrow" pos="0.6 0 0.5" size="0.5 0.1 0.1" type="box"/>
<joint axis="1 0 0" name="ballx" pos="0 0 0" type="slide"/>
<joint axis="0 1 0" name="bally" pos="0 0 0" type="slide"/>
<joint axis="0 0 1" limited="false" name="rot" pos="0 0 0" type="hinge"/>
</body>
</worldbody>
<actuator>
<!-- These are dummy actuators, included only to provide action ranges -->
<motor ctrllimited="true" ctrlrange="-1 1" joint="ballx"/>
<motor ctrllimited="true" ctrlrange="-0.25 0.25" joint="rot"/>
</actuator>
</mujoco>
<mujoco model="swimmer">
<compiler inertiafromgeom="true" angle="degree" coordinate="local" />
<custom>
<numeric name="frame_skip" data="50" />
</custom>
<option timestep="0.001" density="4000" viscosity="0.1" collision="predefined" integrator="Euler" iterations="1000">
<flag warmstart="disable" />
</option>
<default>
<geom contype='1' conaffinity='1' condim='1' rgba='0.8 0.6 .4 1' material="geom" />
<!--<joint armature='1' />-->
</default>
<asset>
<texture type="skybox" builtin="gradient" width="100" height="100" rgb1="1 1 1" rgb2="0 0 0" />
<texture name="texgeom" type="cube" builtin="flat" mark="cross" width="127" height="1278" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" markrgb="1 1 1" random="0.01" />
<texture name="texplane" type="2d" builtin="checker" rgb1="0 0 0" rgb2="0.8 0.8 0.8" width="100" height="100" />
<material name='MatPlane' texture="texplane" shininess="1" texrepeat="30 30" specular="1" reflectance="0.5" />
<material name='geom' texture="texgeom" texuniform="true" />
</asset>
<worldbody>
<light directional="true" cutoff="100" exponent="1" diffuse="1 1 1" specular=".1 .1 .1" pos="0 0 1.3" dir="-0 0 -1.3" />
<geom name='floor' material="MatPlane" pos='0 0 -0.1' size='40 40 0.1' type='plane' conaffinity='1' rgba='0.8 0.9 0.8 1' condim='3' />
<!-- ================= SWIMMER ================= /-->
<body name="torso" pos="0 0 0">
<geom name="torso" type="capsule" fromto="1.5 0 0 0.5 0 0" size="0.1" density="1000" />
<joint pos="0 0 0" type="slide" name="slider1" axis="1 0 0" />
<joint pos="0 0 0" type="slide" name="slider2" axis="0 1 0" />
<joint name="rot" type="hinge" pos="0 0 0" axis="0 0 1" />
<body name="mid" pos="0.5 0 0">
<geom name="mid" type="capsule" fromto="0 0 0 -1 0 0" size="0.1" density="1000" />
<joint name="rot2" type="hinge" pos="0 0 0" axis="0 0 1" range="-100 100" limited="true" />
<body name="back" pos="-1 0 0">
<geom name="back" type="capsule" fromto="0 0 0 -1 0 0" size="0.1" density="1000" />
<joint name="rot3" type="hinge" pos="0 0 0" axis="0 0 1" range="-100 100" limited="true" />
</body>
</body>
</body>
</worldbody>
<actuator>
<motor joint="rot2" ctrllimited="true" ctrlrange="-50 50" />
<motor joint="rot3" ctrllimited="true" ctrlrange="-50 50" />
</actuator>
</mujoco>
from .ant_maze_env import AntMazeEnv
from .point_maze_env import PointMazeEnv
from .swimmer_maze_env import SwimmerMazeEnv
from collections import OrderedDict
import gym
import numpy as np
import copy
from gym import Wrapper
from gym.envs.registration import EnvSpec
class GoalWrapper(Wrapper):
def __init__(self, env, maze_size_scaling, random_start, low, high, fix_goal=True, top_down=False, test=None):
super(GoalWrapper, self).__init__(env)
ob_space = env.observation_space
self.maze_size_scaling = maze_size_scaling
row_num, col_num = len(self.env.MAZE_STRUCTURE), len(self.env.MAZE_STRUCTURE[0])
contain_r = [1 if "r" in row else 0 for row in self.env.MAZE_STRUCTURE]
row_r = contain_r.index(1)
col_r = self.env.MAZE_STRUCTURE[row_r].index("r")
y_low = (0.5 - row_r) * self.maze_size_scaling
x_low = (0.5 - col_r) * self.maze_size_scaling
y_high = (row_num - 1.5 - row_r) * self.maze_size_scaling
x_high = (col_num - 1.5 - col_r) * self.maze_size_scaling
self.maze_low = maze_low = np.array([x_low, y_low],
dtype=ob_space.dtype)
self.maze_high = maze_high = np.array([x_high, y_high],
dtype=ob_space.dtype)
print("maze_low, maze_high", self.maze_low, self.maze_high)
goal_low, goal_high = maze_low, maze_high
self.goal_space = gym.spaces.Box(low=goal_low, high=goal_high)
self.maze_space = gym.spaces.Box(low=maze_low, high=maze_high)
if self.env._maze_id == "Fall":
self.goal_dim = 3
else:
self.goal_dim = goal_low.size
print("goal_dim in create_maze", self.goal_dim)
self.distance_threshold = 1.5
print("distance threshold in create_maze", self.distance_threshold)
self.observation_space = gym.spaces.Dict(OrderedDict({
'observation': ob_space,
'desired_goal': self.goal_space,
'achieved_goal': self.goal_space,
}))
self.random_start = random_start
# fix goal
self.fix_goal = fix_goal
print("fix goal", self.fix_goal)
contain_g = [1 if "g" in row else 0 for row in self.env.MAZE_STRUCTURE]
if 1 in contain_g and self.fix_goal and test == "Test":
row = contain_g.index(1)
col = self.env.MAZE_STRUCTURE[row].index("g")
y = (row - row_r) * self.maze_size_scaling
x = (col - col_r) * self.maze_size_scaling
self.fix_goal_xy = np.array([x, y])
if env._maze_id == "Fall":
self.fix_goal_xy = np.concatenate((self.fix_goal_xy, [self.maze_size_scaling * 0.5 + 0.5]))
print("fix goal xy", self.fix_goal_xy)
elif test == "Test1":
if env._maze_id == "Push":
self.fix_goal_xy = np.array([-4, 0])
elif env._maze_id == "Maze1":
self.fix_goal_xy = np.array([8, 0])
else:
print("Unknown env", env._maze_id)
assert False
print("fix goal xy", self.fix_goal_xy)
elif test == "Test2":
if env._maze_id == "Push":
self.fix_goal_xy = np.array([-4, 4])
elif env._maze_id == "Maze1":
self.fix_goal_xy = np.array([8, 8])
else:
print("Unknown env", env._maze_id)
assert False
print("fix goal xy", self.fix_goal_xy)
else:
# get vacant rowcol
structure = self.env.MAZE_STRUCTURE
self.vacant_rowcol = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] not in [1, -1, 'r']:
self.vacant_rowcol.append((i, j))
self.reward_type = "dense"
self.top_down = top_down
def step(self, action):
observation, reward, _, info = self.env.step(action)
out = {'observation': observation,
'desired_goal': self.goal,
# 'achieved_goal': observation[..., 3:5]}
'achieved_goal': observation[..., :self.goal_dim]}
distance = np.linalg.norm(observation[..., :self.goal_dim] - self.goal[..., :self.goal_dim], axis=-1)
info['is_success'] = done = (distance < self.distance_threshold)
if self.reward_type == "sparse":
reward = -(distance > self.distance_threshold).astype(np.float32)
else:
# normalization
reward = -distance * 0.1
if self.top_down:
mask = np.array([0.0] * 2 + [1.0] * (out['observation'].shape[0] - 2))
out['observation'] = out['observation'] * mask
return out, reward, done, info
def reset(self):
if self.fix_goal:
self.goal = self.fix_goal_xy
else:
self.goal = self.goal_space.sample()
if self.env._maze_id == "Push":
while (self.env.old_invalid_goal(self.goal[:2])):
self.goal = self.goal_space.sample()
else:
while (self.env.invalid_goal(self.goal[:2])):
self.goal = self.goal_space.sample()
if self.env._maze_id == "Fall":
self.goal = np.concatenate((self.goal, [self.maze_size_scaling * 0.5 + 0.5]))
observation = self.env.reset(self.goal)
# random start a position without collision
if self.random_start:
xy = self.maze_space.sample()
while (self.env._is_in_collision(xy)):
xy = self.maze_space.sample()
self.env.wrapped_env.set_xy(xy)
observation = self.env._get_obs()
out = {'observation': observation, 'desired_goal': self.goal}
out['achieved_goal'] = observation[..., :self.goal_dim]
# out['achieved_goal'] = observation[..., 3:5]
if self.top_down:
# print("obs", out['observation'].shape)
mask = np.array([0.0] * 2 + [1.0] * (out['observation'].shape[0] - 2))
out['observation'] = out['observation'] * mask
return out
def create_maze_env(env_name=None, top_down_view=False, maze_size_scaling=4, random_start=True, goal_args=None,
fix_goal=True, test=None):
n_bins = 0
if env_name.startswith('Ego'):
n_bins = 8
env_name = env_name[3:]
if env_name.startswith('Ant'):
manual_collision = True
cls = AntMazeEnv
env_name = env_name[3:]
maze_size_scaling = maze_size_scaling
elif env_name.startswith('Point'):
cls = PointMazeEnv
manual_collision = True
env_name = env_name[5:]
maze_size_scaling = maze_size_scaling
elif env_name.startswith('Swimmer'):
cls = SwimmerMazeEnv
manual_collision = True
env_name = env_name[7:]
maze_size_scaling = maze_size_scaling
else:
assert False, 'unknown env %s' % env_name
observe_blocks = False
put_spin_near_agent = False
if env_name == 'Maze':
maze_id = 'Maze'
elif env_name == 'Maze1':
maze_id = 'Maze1'
maze_size_scaling = 4
elif env_name == 'Push':
maze_id = 'Push'
manual_collision = True
maze_size_scaling = 4
elif env_name == 'Fall':
maze_id = 'Fall'
elif env_name == 'Block':
maze_id = 'Block'
put_spin_near_agent = True
observe_blocks = True
elif env_name == 'BlockMaze':
maze_id = 'BlockMaze'
put_spin_near_agent = True
observe_blocks = True
else:
raise ValueError('Unknown maze environment %s' % env_name)
gym_mujoco_kwargs = {
'maze_id': maze_id,
'n_bins': n_bins,
'observe_blocks': observe_blocks,
'put_spin_near_agent': put_spin_near_agent,
'top_down_view': top_down_view,
'manual_collision': manual_collision,
'maze_size_scaling': maze_size_scaling,
}
gym_env = cls(**gym_mujoco_kwargs)
# gym_env.reset()
# goal_args = np.array(goal_args) / 8 * maze_size_scaling
return GoalWrapper(gym_env, maze_size_scaling, random_start, *goal_args, fix_goal=fix_goal, top_down=top_down_view, test=test)
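# Hedged usage sketch (not part of the original file): create_maze_env expects names such
# as "AntMaze", "AntPush", "PointMaze1" or "SwimmerMaze", optionally prefixed with "Ego"
# for egocentric range sensors. goal_args is unpacked into the (low, high) arguments of
# GoalWrapper; the values below are placeholders.
if __name__ == "__main__":
    env = create_maze_env("PointMaze1", maze_size_scaling=4, random_start=False,
                          goal_args=[[-4, -4], [20, 20]], fix_goal=True, test="Test")
    obs = env.reset()
    # The wrapper returns a goal-conditioned dict observation.
    print(obs["observation"].shape, obs["desired_goal"], obs["achieved_goal"])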
"""Adapted from rllab maze_env.py."""
import os
import tempfile
import xml.etree.ElementTree as ET
import math
import numpy as np
import gym
from . import maze_env_utils
from gym.utils import seeding
from gym import wrappers
# Directory that contains mujoco xml files.
# MODEL_DIR = '/home/hza/ToolBox/tools/fancy/data/mujoco/assets'
MODEL_DIR = os.path.join(os.path.dirname(__file__), 'assets')
class MazeEnv(gym.Env):
MODEL_CLASS = None
MAZE_HEIGHT = None
MAZE_SIZE_SCALING = None
def __init__(
self,
maze_id=None,
maze_height=0.5,
maze_size_scaling=8,
n_bins=0,
sensor_range=3.,
sensor_span=2 * math.pi,
observe_blocks=False,
put_spin_near_agent=False,
top_down_view=False,
manual_collision=False,
goal=None,
*args,
**kwargs):
self._maze_id = maze_id
model_cls = self.__class__.MODEL_CLASS
if model_cls is None:
raise "MODEL_CLASS unspecified!"
xml_path = os.path.join(MODEL_DIR, model_cls.FILE)
self.tree = tree = ET.parse(xml_path)
self.worldbody = worldbody = tree.find(".//worldbody")
self.t = 0
self.MAZE_HEIGHT = height = maze_height
self.MAZE_SIZE_SCALING = size_scaling = maze_size_scaling
self._n_bins = n_bins
self._sensor_range = sensor_range * size_scaling
self._sensor_span = sensor_span
self._observe_blocks = observe_blocks
self._put_spin_near_agent = put_spin_near_agent
self._top_down_view = top_down_view
self._manual_collision = manual_collision
self.MAZE_STRUCTURE = structure = maze_env_utils.construct_maze(
maze_id=self._maze_id)
# Elevate the maze to allow for falling.
self.elevated = any(-1 in row for row in structure)
self.blocks = any(
any(maze_env_utils.can_move(r) for r in row)
for row in structure) # Are there any movable blocks?
torso_x, torso_y = self._find_robot() # x, y coordinates
self._init_torso_x = torso_x
self._init_torso_y = torso_y
self._init_positions = [
(x - torso_x, y - torso_y)
for x, y in self._find_all_robots()]
self._xy_to_rowcol = lambda x, y: (2 + (y + size_scaling / 2) / size_scaling,
2 + (x + size_scaling / 2) / size_scaling)
# walls (immovable), chasms (fall), movable blocks
self._view = np.zeros([5, 5, 3])
height_offset = 0.
if self.elevated:
# Increase initial z-pos of ant.
height_offset = height * size_scaling
torso = tree.find(".//body[@name='torso']")
torso.set('pos', '0 0 %.2f' % (0.75 + height_offset))
if self.blocks:
# If there are movable blocks, change simulation settings to perform
# better contact detection.
default = tree.find(".//default")
default.find('.//geom').set('solimp', '.995 .995 .01')
self.movable_blocks = []
self.not_thin = True
for i in range(len(structure)):
for j in range(len(structure[0])):
struct = structure[i][j]
if struct == 'r' and self._put_spin_near_agent:
struct = maze_env_utils.Move.SpinXY
if self.elevated and struct not in [-1]:
# Create elevated platform.
ET.SubElement(
worldbody, "geom",
name="elevated_%d_%d" % (i, j),
pos="%f %f %f" % (j * size_scaling - torso_x,
i * size_scaling - torso_y,
height / 2 * size_scaling),
size="%f %f %f" % (0.5 * size_scaling,
0.5 * size_scaling,
height / 2 * size_scaling),
type="box",
material="",
contype="1",
conaffinity="1",
rgba="0.9 0.9 0.9 1",
)
if struct == 1: # Unmovable block.
# Offset all coordinates so that robot starts at the origin.
if self.not_thin or (i == 0 or i == len(structure) - 1) or (j == 0 or j == len(structure[0])-1) or maze_id != "Maze1":
y_size = 0.5 * size_scaling
else:
y_size = 0.25
ET.SubElement(
worldbody, "geom",
name="block_%d_%d" % (i, j),
pos="%f %f %f" % (j * size_scaling - torso_x,
i * size_scaling - torso_y,
height_offset +
height / 2 * size_scaling),
size="%f %f %f" % (0.5 * size_scaling,
y_size,
height / 2 * size_scaling),
type="box",
material="",
contype="1",
conaffinity="1",
rgba="0.4 0.4 0.4 1",
)
elif maze_env_utils.can_move(struct): # Movable block.
# The "falling" blocks are shrunk slightly and increased in mass to
# ensure that it can fall easily through a gap in the platform blocks.
name = "movable_%d_%d" % (i, j)
self.movable_blocks.append((name, struct))
falling = maze_env_utils.can_move_z(struct)
spinning = maze_env_utils.can_spin(struct)
x_offset = 0.25 * size_scaling if spinning else 0.0
y_offset = 0.0
shrink = 0.2 if spinning else 0.99 if falling else 1.0
height_shrink = 0.2 if spinning else 1.0
movable_body = ET.SubElement(
worldbody, "body",
name=name,
pos="%f %f %f" % (j * size_scaling - torso_x + x_offset,
i * size_scaling - torso_y + y_offset,
height_offset +
height / 2 * size_scaling * height_shrink),
)
ET.SubElement(
movable_body, "geom",
name="block_%d_%d" % (i, j),
pos="0 0 0",
size="%f %f %f" % (0.5 * size_scaling * shrink,
0.5 * size_scaling * shrink,
height / 2 * size_scaling * height_shrink),
type="box",
material="",
mass="0.001" if falling else "0.0002",
contype="1",
conaffinity="1",
rgba="0.9 0.1 0.1 1"
)
if maze_env_utils.can_move_x(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="1 0 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="movable_x_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_y(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="0 1 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="movable_y_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_z(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="true",
range="%f 0" % (-height_offset),
margin="0.01",
name="movable_z_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_spin(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="false",
name="spinable_%d_%d" % (i, j),
pos="0 0 0",
type="ball"
)
torso = tree.find(".//body[@name='torso']")
geoms = torso.findall(".//geom")
for geom in geoms:
if 'name' not in geom.attrib:
raise Exception("Every geom of the torso must have a name "
"defined")
_, file_path = tempfile.mkstemp(text=True, suffix='.xml')
tree.write(file_path)
self.wrapped_env = model_cls(*args, file_path=file_path, **kwargs)
self.args = args
self.kwargs = kwargs
self.visualize_goal = True
self.GOAL = goal
if self.GOAL is not None:
self.GOAL = self.unwrapped._rowcol_to_xy(*self.GOAL)
self.EPS = self.unwrapped.MAZE_SIZE_SCALING ** 2
contain_r = [1 if "r" in row else 0 for row in self.MAZE_STRUCTURE]
self.init_row_r = contain_r.index(1)
self.init_col_r = self.MAZE_STRUCTURE[self.init_row_r].index("r")
def get_ori(self):
return self.wrapped_env.get_ori()
def get_top_down_view(self):
self._view = np.zeros_like(self._view)
def valid(row, col):
return self._view.shape[0] > row >= 0 and self._view.shape[1] > col >= 0
def update_view(x, y, d, row=None, col=None):
if row is None or col is None:
x = x - self._robot_x
y = y - self._robot_y
th = self._robot_ori
row, col = self._xy_to_rowcol(x, y)
update_view(x, y, d, row=row, col=col)
return
row, row_frac, col, col_frac = int(row), row % 1, int(col), col % 1
if row_frac < 0:
row_frac += 1
if col_frac < 0:
col_frac += 1
if valid(row, col):
self._view[row, col, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row - 1, col):
self._view[row - 1, col, d] += (
(max(0., 0.5 - row_frac)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row + 1, col):
self._view[row + 1, col, d] += (
(max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row, col - 1):
self._view[row, col - 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., 0.5 - col_frac)))
if valid(row, col + 1):
self._view[row, col + 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., col_frac - 0.5)))
if valid(row - 1, col - 1):
self._view[row - 1, col - 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., 0.5 - col_frac))
if valid(row - 1, col + 1):
self._view[row - 1, col + 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., col_frac - 0.5))
if valid(row + 1, col + 1):
self._view[row + 1, col + 1, d] += (
(max(0., row_frac - 0.5)) * max(0., col_frac - 0.5))
if valid(row + 1, col - 1):
self._view[row + 1, col - 1, d] += (
(max(0., row_frac - 0.5)) * max(0., 0.5 - col_frac))
# Draw ant.
robot_x, robot_y = self.wrapped_env.get_body_com("torso")[:2]
self._robot_x = robot_x
self._robot_y = robot_y
self._robot_ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
# Draw immovable blocks and chasms.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1: # Wall.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
0)
if structure[i][j] == -1: # Chasm.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
1)
# Draw movable blocks.
for block_name, block_type in self.movable_blocks:
block_x, block_y = self.wrapped_env.get_body_com(block_name)[:2]
update_view(block_x, block_y, 2)
return self._view
def get_range_sensor_obs(self):
"""Returns egocentric range sensor observations of maze."""
robot_x, robot_y, robot_z = self.wrapped_env.get_body_com("torso")[:3]
ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
segments = []
# Get line segments (corresponding to outer boundary) of each immovable
# block or drop-off.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] in [1, -1]: # There's a wall or drop-off.
cx = j * size_scaling - self._init_torso_x
cy = i * size_scaling - self._init_torso_y
x1 = cx - 0.5 * size_scaling
x2 = cx + 0.5 * size_scaling
y1 = cy - 0.5 * size_scaling
y2 = cy + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=structure[i][j],
))
# Get line segments (corresponding to outer boundary) of each movable
# block within the agent's z-view.
for block_name, block_type in self.movable_blocks:
block_x, block_y, block_z = self.wrapped_env.get_body_com(block_name)[
:3]
if (block_z + height * size_scaling / 2 >= robot_z and
robot_z >= block_z - height * size_scaling / 2): # Block in view.
x1 = block_x - 0.5 * size_scaling
x2 = block_x + 0.5 * size_scaling
y1 = block_y - 0.5 * size_scaling
y2 = block_y + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=block_type,
))
# 3 for wall, drop-off, block
sensor_readings = np.zeros((self._n_bins, 3))
for ray_idx in range(self._n_bins):
ray_ori = (ori - self._sensor_span * 0.5 +
(2 * ray_idx + 1.0) / (2 * self._n_bins) * self._sensor_span)
ray_segments = []
# Get all segments that intersect with ray.
for seg in segments:
p = maze_env_utils.ray_segment_intersect(
ray=((robot_x, robot_y), ray_ori),
segment=seg["segment"])
if p is not None:
ray_segments.append(dict(
segment=seg["segment"],
type=seg["type"],
ray_ori=ray_ori,
distance=maze_env_utils.point_distance(
p, (robot_x, robot_y)),
))
if len(ray_segments) > 0:
# Find out which segment is intersected first.
first_seg = sorted(
ray_segments, key=lambda x: x["distance"])[0]
seg_type = first_seg["type"]
idx = (0 if seg_type == 1 else # Wall.
1 if seg_type == -1 else # Drop-off.
2 if maze_env_utils.can_move(seg_type) else # Block.
None)
if first_seg["distance"] <= self._sensor_range:
sensor_readings[ray_idx][idx] = (self._sensor_range - first_seg["distance"]) / self._sensor_range
return sensor_readings
def _get_obs(self):
wrapped_obs = self.wrapped_env._get_obs()
if self._observe_blocks:
additional_obs = []
for block_name, block_type in self.movable_blocks:
additional_obs.append(self.wrapped_env.get_body_com(block_name))
wrapped_obs = np.concatenate((additional_obs[0], wrapped_obs))
if self._top_down_view:
view = self.get_top_down_view().flatten()
wrapped_obs = np.concatenate((wrapped_obs, view))
return wrapped_obs
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def reset(self, goal):
self.goal = goal
if self.visualize_goal: # remove the prev goal and add a new goal
goal_x, goal_y = goal[0], goal[1]
size_scaling = self.MAZE_SIZE_SCALING
# remove the original goal
try:
self.worldbody.remove(self.goal_element)
except AttributeError:
pass
# offset all coordinates so that robot starts at the origin
self.goal_element = \
ET.SubElement(
self.worldbody, "geom",
name="goal_%d_%d" % (goal_x, goal_y),
pos="%f %f %f" % (goal_x,
goal_y,
self.MAZE_HEIGHT / 2 * size_scaling),
size="%f %f %f" % (0.1 * size_scaling, # smaller than the block to prevent collision
0.1 * size_scaling,
self.MAZE_HEIGHT / 2 * size_scaling),
type="box",
material="",
contype="1",
conaffinity="1",
rgba="0.0 1.0 0.0 0.5"
)
# Note: running the lines below will make the robot position wrong! (because the graph is rebuilt)
torso = self.tree.find(".//body[@name='torso']")
geoms = torso.findall(".//geom")
for geom in geoms:
if 'name' not in geom.attrib:
raise Exception("Every geom of the torso must have a name "
"defined")
_, file_path = tempfile.mkstemp(text=True, suffix='.xml')
self.tree.write(
file_path)  # write a temporary file with the robot specification instead of overwriting the original XML
model_cls = self.__class__.MODEL_CLASS
self.wrapped_env = model_cls(*self.args, file_path=file_path,
**self.kwargs) # file to the robot specifications; model_cls is AntEnv
self.t = 0
self.trajectory = []
self.wrapped_env.reset()
if len(self._init_positions) > 1:
xy = self._init_positions[self.np_random.randint(len(self._init_positions))]
self.wrapped_env.set_xy(xy)
return self._get_obs()
@property
def viewer(self):
return self.wrapped_env.viewer
def render(self, *args, **kwargs):
return self.wrapped_env.render(*args, **kwargs)
@property
def observation_space(self):
shape = self._get_obs().shape
high = np.inf * np.ones(shape)
low = -high
return gym.spaces.Box(low, high)
@property
def action_space(self):
return self.wrapped_env.action_space
def _find_robot(self):
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 'r':
return j * size_scaling, i * size_scaling
assert False, 'No robot in maze specification.'
def _find_all_robots(self):
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
coords = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 'r':
coords.append((j * size_scaling, i * size_scaling))
return coords
def _is_in_collision(self, pos):
i, j = self.new_xy_to_rowcol(pos)
if self.MAZE_STRUCTURE[i][j] == 1:
return True
else:
return False
def invalid_goal(self, pos):
i, j = self.new_xy_to_rowcol(pos)
if self.MAZE_STRUCTURE[i][j] in [1, -1]:
return True
else:
return False
# recover the best setting for push
def old_is_in_collision(self, pos):
x, y = pos
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1:
minx = j * size_scaling - size_scaling * 0.5 - self._init_torso_x
maxx = j * size_scaling + size_scaling * 0.5 - self._init_torso_x
miny = i * size_scaling - size_scaling * 0.5 - self._init_torso_y
maxy = i * size_scaling + size_scaling * 0.5 - self._init_torso_y
if minx <= x <= maxx and miny <= y <= maxy:
# print(i, j, minx, maxx, miny, maxy, x, y)
return True
return False
def old_invalid_goal(self, pos):
x, y = pos
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] in [1, -1]:
minx = j * size_scaling - size_scaling * 0.5 - self._init_torso_x
maxx = j * size_scaling + size_scaling * 0.5 - self._init_torso_x
miny = i * size_scaling - size_scaling * 0.5 - self._init_torso_y
maxy = i * size_scaling + size_scaling * 0.5 - self._init_torso_y
if minx <= x <= maxx and miny <= y <= maxy:
# print(i, j, minx, maxx, miny, maxy, x, y)
return True
return False
def new_xy_to_rowcol(self, pos):
x, y = pos
relative_col = math.ceil(x / self.MAZE_SIZE_SCALING - 0.5)
relative_row = math.ceil(y / self.MAZE_SIZE_SCALING - 0.5)
return self.init_row_r + relative_row, self.init_col_r + relative_col
def _rowcol_to_xy(self, j, i):
size_scaling = self.MAZE_SIZE_SCALING
minx = j * size_scaling - size_scaling * 0.5 - self._init_torso_x
maxx = j * size_scaling + size_scaling * 0.5 - self._init_torso_x
miny = i * size_scaling - size_scaling * 0.5 - self._init_torso_y
maxy = i * size_scaling + size_scaling * 0.5 - self._init_torso_y
return (minx + maxx) / 2, (miny + maxy) / 2
def step(self, action):
self.t += 1
if self._manual_collision:
old_pos = self.wrapped_env.get_xy()
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(
action)
new_pos = self.wrapped_env.get_xy()
if self._maze_id == "Push":
if self.old_is_in_collision(new_pos):
self.wrapped_env.set_xy(old_pos)
else:
if self._is_in_collision(new_pos):
self.wrapped_env.set_xy(old_pos)
else:
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(
action)
next_obs = self._get_obs()
done = False
if self.GOAL is not None:
# print(self.EPS, next_obs[:2], self.GOAL[:2])
done = bool(((next_obs[:2] - self.GOAL[:2]) ** 2).sum() < self.EPS)
inner_reward = int(done)
return next_obs, inner_reward, done, info
"""Adapted from rllab maze_env_utils.py."""
import numpy as np
import math
class Move(object):
X = 11
Y = 12
Z = 13
XY = 14
XZ = 15
YZ = 16
XYZ = 17
SpinXY = 18
def can_move_x(movable):
return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ,
Move.SpinXY]
def can_move_y(movable):
return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ,
Move.SpinXY]
def can_move_z(movable):
return movable in [Move.Z, Move.XZ, Move.YZ, Move.XYZ]
def can_spin(movable):
return movable in [Move.SpinXY]
def can_move(movable):
return can_move_x(movable) or can_move_y(movable) or can_move_z(movable)
def construct_maze(maze_id='Maze'):
if maze_id == 'Maze':
structure = [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
[1, 0, 'r', 0, 0, 1, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
[1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 'g', 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
]
elif maze_id == 'Maze1':
structure = [
[1, 1, 1, 1, 1],
[1, 'r', 0, 0, 1],
[1, 1, 1, 0, 1],
[1, 'g', 0, 0, 1],
[1, 1, 1, 1, 1],
]
elif maze_id == 'Push':
structure = [
[1, 1, 1, 1, 1],
[1, 0, 'r', 1, 1],
[1, 0, Move.XY, 0, 1],
[1, 1, 'g', 1, 1],
[1, 1, 1, 1, 1],
]
elif maze_id == 'Fall':
structure = [
[1, 1, 1, 1],
[1, 'r', 0, 1],
[1, 0, Move.YZ, 1],
[1, -1, -1, 1],
[1, 'g', 0, 1],
[1, 1, 1, 1],
]
elif maze_id == 'Block':
O = 'r'
structure = [
[1, 1, 1, 1, 1],
[1, O, 0, 0, 1],
[1, 0, 0, 0, 1],
[1, 0, 0, 'g', 1],
[1, 1, 1, 1, 1],
]
elif maze_id == 'BlockMaze':
O = 'r'
structure = [
[1, 1, 1, 1],
[1, O, 0, 1],
[1, 1, 0, 1],
[1, 'g', 0, 1],
[1, 1, 1, 1],
]
else:
raise NotImplementedError('The provided MazeId %s is not recognized' % maze_id)
return structure
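# Hedged reading of the structure encoding above (my annotation, not original code):
# 1 = wall, 0 = free cell, -1 = chasm to fall into, 'r' = robot start, 'g' = goal cell,
# and Move.* entries mark movable blocks (see maze_env.py).
def _example_structure():
    structure = construct_maze('Push')
    n_walls = sum(row.count(1) for row in structure)
    robot_col = structure[1].index('r')  # the robot starts in row 1 of the Push maze
    return n_walls, robot_col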
def line_intersect(pt1, pt2, ptA, ptB):
"""
Taken from https://www.cs.hmc.edu/ACM/lectures/intersections.html
this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB)
"""
DET_TOLERANCE = 0.00000001
# the first line is pt1 + r*(pt2-pt1)
# in component form:
x1, y1 = pt1
x2, y2 = pt2
dx1 = x2 - x1
dy1 = y2 - y1
# the second line is ptA + s*(ptB-ptA)
x, y = ptA
xB, yB = ptB
dx = xB - x
dy = yB - y
DET = (-dx1 * dy + dy1 * dx)
if math.fabs(DET) < DET_TOLERANCE: return (0, 0, 0, 0, 0)
# now, the determinant should be OK
DETinv = 1.0 / DET
# find the scalar amount along the "self" segment
r = DETinv * (-dy * (x - x1) + dx * (y - y1))
# find the scalar amount along the input line
s = DETinv * (-dy1 * (x - x1) + dx1 * (y - y1))
# return the average of the two descriptions
xi = (x1 + r * dx1 + x + s * dx) / 2.0
yi = (y1 + r * dy1 + y + s * dy) / 2.0
return (xi, yi, 1, r, s)
def ray_segment_intersect(ray, segment):
"""
Check if the ray originated from (x, y) with direction theta intersects the line segment (x1, y1) -- (x2, y2),
and return the intersection point if there is one
"""
(x, y), theta = ray
# (x1, y1), (x2, y2) = segment
pt1 = (x, y)
seg_len = 1
pt2 = (x + seg_len * math.cos(theta), y + seg_len * math.sin(theta))
xo, yo, valid, r, s = line_intersect(pt1, pt2, *segment)
if valid and r >= 0 and 0 <= s <= 1:
return (xo, yo)
return None
def point_distance(p1, p2):
x1, y1 = p1
x2, y2 = p2
return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
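# Hedged example (not part of the original file): cast a ray from the origin along +x
# against the vertical segment x = 1, y in [-1, 1]; the expected intersection is (1, 0).
if __name__ == "__main__":
    hit = ray_segment_intersect(ray=((0.0, 0.0), 0.0),
                                segment=((1.0, -1.0), (1.0, 1.0)))
    print(hit, point_distance((0.0, 0.0), hit))  # (1.0, 0.0) at distance 1.0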
"""Wrapper for creating the ant environment in gym_mujoco."""
import math
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class PointEnv(mujoco_env.MujocoEnv, utils.EzPickle):
FILE = "point.xml"
ORI_IND = 2
def __init__(self, file_path=None, expose_all_qpos=True):
self._expose_all_qpos = expose_all_qpos
self.add_noise = False
mujoco_env.MujocoEnv.__init__(self, file_path, 1)
utils.EzPickle.__init__(self)
@property
def physics(self):
return self.model
def _step(self, a):
return self.step(a)
def step(self, action):
action[0] = 0.2 * action[0]
qpos = np.copy(self.data.qpos)
qpos[2] += action[1]
ori = qpos[2]
# compute increment in each direction
dx = math.cos(ori) * action[0]
dy = math.sin(ori) * action[0]
# ensure that the robot is within reasonable range
qpos[0] = np.clip(qpos[0] + dx, -100, 100)
qpos[1] = np.clip(qpos[1] + dy, -100, 100)
qvel = self.data.qvel
self.set_state(qpos, qvel)
for _ in range(0, self.frame_skip):
self.sim.step()
next_obs = self._get_obs()
reward = 0
done = False
info = {}
return next_obs, reward, done, info
def _get_obs(self):
if self._expose_all_qpos:
obs = np.concatenate([
self.data.qpos.flat[:3], # Only point-relevant coords.
self.data.qvel.flat[:3]])
if self.add_noise:
obs = np.concatenate((obs, np.random.uniform(low=-2, high=2, size=20)))
return obs
return np.concatenate([
self.data.qpos.flat[2:3],
self.data.qvel.flat[:3]])
def reset_model(self):
qpos = self.init_qpos + self.np_random.uniform(
size=self.model.nq, low=-.1, high=.1)
qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
# Set everything other than point to original position and 0 velocity.
qpos[3:] = self.init_qpos[3:]
qvel[3:] = 0.
self.set_state(qpos, qvel)
return self._get_obs()
def get_ori(self):
return self.data.qpos[self.__class__.ORI_IND]
def set_xy(self, xy):
qpos = np.copy(self.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.data.qvel
self.set_state(qpos, qvel)
def get_xy(self):
qpos = np.copy(self.data.qpos)
return qpos[:2]
def viewer_setup(self):
# self.viewer.cam.trackbodyid = 1
# self.viewer.cam.distance = self.model.stat.extent * 0.7
# self.viewer.cam.lookat[2] = 0.8925
# self.viewer.cam.elevation = 0
self.viewer.cam.trackbodyid = -1
self.viewer.cam.distance = 60
self.viewer.cam.elevation = -90
from .maze_env import MazeEnv
from .point import PointEnv
class PointMazeEnv(MazeEnv):
MODEL_CLASS = PointEnv
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
ORI_IND = 2
FILE = "swimmer.xml"
def __init__(self, file_path=None, expose_all_qpos=True):
self._expose_all_qpos = expose_all_qpos
self.add_noise = False
mujoco_env.MujocoEnv.__init__(self, file_path, 4)
utils.EzPickle.__init__(self)
def _step(self, a):
return self.step(a)
def step(self, a):
ctrl_cost_coeff = 0.0001
xposbefore = self.sim.data.qpos[0]
self.do_simulation(a, self.frame_skip)
xposafter = self.sim.data.qpos[0]
reward_fwd = (xposafter - xposbefore) / self.dt
reward_ctrl = - ctrl_cost_coeff * np.square(a).sum()
reward = reward_fwd + reward_ctrl
ob = self._get_obs()
return ob, reward, False, dict(reward_fwd=reward_fwd, reward_ctrl=reward_ctrl)
def _get_obs(self):
qpos = self.sim.data.qpos
qvel = self.sim.data.qvel
# print("qpos", qpos)
# print("qvel", qvel)
return np.concatenate([qpos.flat, qvel.flat])
def reset_model(self):
self.set_state(
self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq),
self.init_qvel + self.np_random.uniform(low=-.1, high=.1, size=self.model.nv)
)
return self._get_obs()
def get_ori(self):
return self.data.qpos[self.__class__.ORI_IND]
def set_xy(self, xy):
qpos = np.copy(self.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.data.qvel
self.set_state(qpos, qvel)
def get_xy(self):
qpos = np.copy(self.data.qpos)
return qpos[:2]
def viewer_setup(self):
# self.viewer.cam.trackbodyid = 1
# self.viewer.cam.distance = self.model.stat.extent * 0.7
# self.viewer.cam.lookat[2] = 0.8925
# self.viewer.cam.elevation = 0
self.viewer.cam.trackbodyid = -1
self.viewer.cam.distance = 60
self.viewer.cam.elevation = -90
from .maze_env import MazeEnv
from .swimmer import SwimmerEnv
class SwimmerMazeEnv(MazeEnv):
MODEL_CLASS = SwimmerEnv
# adapted from the openai gym NChain environment
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
class NChainEnv(gym.Env):
"""n-Chain environment
This variant presents moves along a linear chain of states with a continuous scalar action:
a negative action moves one step backwards and yields a small reward, while
a positive action moves one step forwards along the chain and yields no reward.
Trying to move forwards at the end of the chain yields a large reward, which can be
collected repeatedly by staying there.
At each step there is a small probability that the agent 'slips' and the
opposite transition is taken instead.
The observation is a binary encoding of the current state in the chain (0 to n-1),
returned in a goal-conditioned dict together with a desired goal marking the final state.
This environment is described in section 6.1 of:
A Bayesian Framework for Reinforcement Learning by Malcolm Strens (2000)
http://ceit.aut.ac.ir/~shiry/lecture/machine-learning/papers/BRL-2000.pdf
"""
def __init__(self, n=5, slip=0.2, small=0.001, large=1.0):
self.n = n
self.n2 = bin(n-1)
print("n2", self.n2, len(self.n2)-2)
self.slip = slip # probability of 'slipping' an action
self.small = small # payout for 'backwards' action
self.large = large # payout at end of chain for 'forwards' action
self.state = 0 # Start at beginning of the chain
self.action_space = spaces.Box(low=-1., high=1., shape=(1,))
# self.observation_space = spaces.Discrete(self.n)
self.observation_space = spaces.Discrete(len(self.n2) - 2)
self.shuffle_order = np.arange(len(self.n2) - 2)
np.random.shuffle(self.shuffle_order)
self.seed()
target = np.zeros(n)
target[n-1] = 1
self.target = target
self.reward_type = "sparse"
self.visited_count = np.zeros(n)
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def step(self, action):
# print("action", action)
success = False
info = {}
assert self.action_space.contains(action)
if self.np_random.rand() < self.slip:
action = 0 - action # agent slipped, reverse action taken
if action < 0 and self.state > 0: # 'backwards': go back to the beginning, get small reward
reward = self.small
self.state -= 1
elif action > 0 and self.state < self.n - 1: # 'forwards': go up along the chain
reward = 0
self.state += 1
elif self.state == self.n - 1: # 'forwards': stay at the end of the chain, collect large reward
reward = self.large
success = True
else:
reward = 0
done = False
info["is_success"] = success
# print("state", self.state)
if self.visited_count[self.state] == 0:
self.visited_count[self.state] = 1
return self.get_obs(), reward, done, info
def reset(self):
self.state = 0
if self.visited_count[self.state] == 0:
self.visited_count[self.state] = 1.
return self.get_obs()
def get_obs(self):
new = np.zeros(len(self.n2) - 2)
# new[self.state] = 1
new2 = bin(self.state)
new2 = list(new2[2:])
new2.reverse()
for i, ele in enumerate(new2):
new[-(i+1)] = int(ele)
new = new[::-1]
# new = new[self.shuffle_order]
return {
"observation": np.copy(new),
"achieved_goal": np.copy(new),
"desired_goal": np.copy(self.target),
}
@property
def coverage(self):
return np.sum(self.visited_count) / self.n
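# Hedged usage sketch (not part of the original file): the action is a scalar in [-1, 1]
# and the observation is a binary encoding of the current chain state.
if __name__ == "__main__":
    env = NChainEnv(n=8)
    obs = env.reset()
    for _ in range(5):
        obs, reward, done, info = env.step(np.array([1.0]))  # keep trying to move forward
    print(obs["observation"], reward, info["is_success"], env.coverage)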
import gym
import numpy as np
import cv2
from gym import spaces
def line_intersection(line1, line2):
# calculate the intersection point
xdiff = (line1[0][0] - line1[1][0], line2[0][0] - line2[1][0])
ydiff = (line1[0][1] - line1[1][1], line2[0][1] - line2[1][1])
def det(a, b):
return a[0] * b[1] - a[1] * b[0]
div = det(xdiff, ydiff)
if div == 0:
raise Exception('lines do not intersect')
d = (det(*line1), det(*line2))
x = det(d, xdiff) / div
y = det(d, ydiff) / div
return x, y
def check_cross(x0, y0, x1, y1):
x0 = np.array(x0)
y0 = np.array(y0)
x1 = np.array(x1)
y1 = np.array(y1)
return np.cross(x1 - x0, y0 - x0), np.cross(y0 - x0, y1 - x0)
def check_intersection(x0, y0, x1, y1):
EPS = 1e-10
def sign(x):
if x > EPS:
return 1
if x < -EPS:
return -1
return 0
f1, f2 = check_cross(x0, y0, x1, y1)
f3, f4 = check_cross(x1, y1, x0, y0)
if sign(f1) == sign(f2) and sign(f3) == sign(f4) and sign(f1) != 0 and sign(f3) != 0:
return True
return False
class PlaneBase(gym.Env):
def __init__(self, rects, R, is_render=False, size=512):
self.rects = rects
self.n = len(self.rects)
self.size = size
self.map = np.ones((size, size, 3), dtype=np.uint8) * 255
self.R = R
self.R2 = R ** 2
self.board = np.array(
[[0, 0],
[1, 1]],
dtype='float32')
self.action_space = gym.spaces.Box(
low=-R, high=R, shape=(2,), dtype='float32')
self.observation_space = gym.spaces.Box(
low=0., high=1., shape=(2,), dtype='float32')
if is_render:
cv2.namedWindow('image', cv2.WINDOW_NORMAL)
self.image_name = 'image'
for i in range(self.n):
for j in range(i + 1, self.n):
if check_intersection(self.rects[i][0], self.rects[i][1], self.rects[j][0], self.rects[j][1]):
raise Exception("Rectangles intersect with each other")
for ((x0, y0), (x1, y1)) in rects:
x0, y0 = int(x0 * size), int(y0 * size)
x1, y1 = int(x1 * size), int(y1 * size)
cv2.rectangle(self.map, (x0, y0), (x1, y1), (0, 255, 0), 1)
ps = np.array([
[x0, y0],
[x1, y0],
[x1, y1],
[x0, y1],
], dtype=np.int32)
cv2.fillConvexPoly(self.map, ps, (127, 127, 127))
self.state = (0, 0)
self.reset()
def restore(self, obs):
self.state = (float(obs[0]), float(obs[1]))
def rect_lines(self, rect):
(x0, y0), (x1, y1) = rect
yield (x0, y0), (x1, y0)
yield (x1, y0), (x1, y1)
yield (x1, y1), (x0, y1)
yield (x0, y1), (x0, y0)
def l2dist(self, x, y):
return ((y[0] - x[0]) ** 2) + ((y[1] - x[1]) ** 2)
def check_inside(self, p):
EPS = 1e-10
for i in self.rects:
if p[0] > i[0][0] + EPS and p[0] < i[1][0] - EPS and p[1] > i[0][1] + EPS and p[1] < i[1][1] - EPS:
return True
return False
def step(self, action):
dx, dy = action
l = 0.0001
p = (self.state[0] + dx * l, self.state[1] + dy * l)
if self.check_inside(p) or p[0] > 1 or p[1] > 1 or p[0] < 0 or p[1] < 0:
return np.array(self.state), 0, False, {}
dest = (self.state[0] + dx, self.state[1] + dy)
md = self.l2dist(self.state, dest)
_dest = dest
line = (self.state, dest)
for i in list(self.rects) + [self.board]:
for l in self.rect_lines(i):
if check_intersection(self.state, dest, l[0], l[1]):
inter_point = line_intersection(line, l)
d = self.l2dist(self.state, inter_point)
if d < md:
md = d
_dest = inter_point
self.restore(_dest)
return np.array(self.state), -md, False, {}
def render(self, mode='human'):
image = self.map.copy()
x, y = self.state
x = int(x * self.size)
y = int(y * self.size)
cv2.circle(image, (x, y), 5, (255, 0, 255), -1)
if mode == 'human':
cv2.imshow('image', image)
cv2.waitKey(2)
else:
return image
def reset(self):
inside_rect = True
while inside_rect:
a, b = np.random.random(), np.random.random()
inside_rect = self.check_inside((a, b))
self.state = (a, b)
return np.array(self.state)
class NaivePlane(PlaneBase):
def __init__(self, is_render=True, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[128, 128], [300, 386]]) / 512,
np.array([[400, 400], [500, 500]]) / 512,
],
R, is_render=is_render, size=size)
class NaivePlane2(PlaneBase):
# two rectangles
def __init__(self, is_render=True, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[64, 64], [256, 256]]) / 512,
np.array([[300, 128], [400, 500]]) / 512,
],
R, is_render=is_render, size=size)
class NaivePlane3(PlaneBase):
# four rectangles
def __init__(self, is_render=True, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[64, 64], [192, 192]]) / 512,
np.array([[320, 64], [448, 192]]) / 512,
np.array([[320, 320], [448, 448]]) / 512,
np.array([[64, 320], [192, 448]]) / 512,
],
R, is_render=is_render, size=size)
class NaivePlane4(PlaneBase):
# two rectangles
def __init__(self, is_render=True, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[64, 64], [192, 512]]) / 512,
np.array([[320, 64], [448, 512]]) / 512,
],
R, is_render=is_render, size=size)
class NaivePlane5(PlaneBase):
# one rectangle
def __init__(self, is_render=False, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[0, 1. / 3], [2. / 3, 2. / 3]]),
],
R, is_render=is_render, size=size)
if __name__ == '__main__':
env = NaivePlane5()
obs = env.reset()
while True:
print(obs)
env.render()
while True:
try:
print('entering the dir (x, y)')
act = input().strip().split(' ')
act = float(act[0]) / 512, float(act[1]) / 512
break
except KeyboardInterrupt as e:
raise e
except:
continue
obs, reward, _, _ = env.step(act)
import cv2
import torch
import numpy as np
# Record and store a rollout video for remote visualization.
def play(env, policy, video_path="tmp.avi", time_limit=500, device='cpu'):
out = None
obs = env.reset()
num = 0
rew = None
action = None
info = None
flag = False
while True:
img = env.unwrapped.render(mode='rgb_array')[:, :, ::-1].copy()
'''
if True and isinstance(obs, dict):
np.set_printoptions(precision=3)
achieved = (float(obs['achieved_goal'][0]), float(obs['achieved_goal'][1]))
desired = (float(obs['desired_goal'][0]), float(obs['desired_goal'][1]))
cv2.putText(img, " obs: {:.3f} {:.3f}".format(achieved[0], achieved[1]), (400,25), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
cv2.putText(img, "goal: {:.3f} {:.3f}".format(desired[0], desired[1]), (400,50), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
if rew is not None:
cv2.putText(img, "rew: {:.3f}".format(rew), (400,75), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
if action is not None:
action = [float(i) for i in action][:2]
cv2.putText(img, "rew: {:.3f} {:.3f}".format(action[0], action[1]), (400,100), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
if info is not None:
if 'is_success' in info:
cv2.putText(img, "success? {}".format(info['is_success']), (400,125), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
cv2.putText(img, "step {}".format(num), (400,150), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
flag = True
'''
if out is None:
out = cv2.VideoWriter(
video_path, cv2.VideoWriter_fourcc(*'XVID'), 20.0, (img.shape[1], img.shape[0]))
out.write(img)
if isinstance(obs, dict):
goal = torch.tensor(obs['desired_goal'], dtype=torch.float32).to(device)
obs = torch.tensor(obs['observation'], dtype=torch.float32).to(device)
action = policy(obs.unsqueeze(0), goal.unsqueeze(0))
if isinstance(action, torch.Tensor):
action = action.detach().cpu().numpy()
else:
action = policy(np.array(obs)[None]).action[0].detach().cpu().numpy()
obs, rew, done, info = env.step(action)
if done:
obs = env.reset()
num += 1
# assert not info['is_success']
flag = True
if not flag:
print(num, info, rew, done, env.goal, action)
if num == time_limit - 1:
break
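# Hedged usage sketch (not part of the original file): `play` expects a policy that maps
# (observation, goal) batches to actions when the environment returns dict observations.
# The random policy below is a stand-in for a trained low-level actor, and the env id
# assumes the registrations from goal_env.mujoco have been imported.
if __name__ == "__main__":
    import gym
    from goal_env.mujoco import *  # registers AntMaze1-v1 and friends
    env = gym.make("AntMaze1-v1")

    def random_policy(obs, goal):
        return torch.from_numpy(env.action_space.sample())

    play(env, random_policy, video_path="random_rollout.avi", time_limit=100)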
import torch
from torch import nn
import numpy as np
class L1(nn.Module):
def __init__(self):
nn.Module.__init__(self)
def forward(self, s, t):
if isinstance(s, np.ndarray):
s = torch.from_numpy(s).float()
if isinstance(t, np.ndarray):
t = torch.from_numpy(t).float()
out = torch.abs(s - t)
return out.view(out.size(0), -1).sum(dim=1)
class L2(nn.Module):
def __init__(self):
nn.Module.__init__(self)
def forward(self, s, t):
out = (s - t) ** 2
return (out.view(out.size(0), -1).sum(dim=1) + 1e-14) ** 0.5
class DotProd(nn.Module):
def __init__(self):
nn.Module.__init__(self)
def forward(self, s, t):
if isinstance(s, np.ndarray):
s = torch.from_numpy(s).float()
if isinstance(t, np.ndarray):
t = torch.from_numpy(t).float()
out = (s * t[:, None, :]).sum(dim=2)[:, 0]
return out
class MLPDist(nn.Module):
def __init__(self, inp_dim):
nn.Module.__init__(self)
self.dim = inp_dim
self.mlp = nn.Sequential(
nn.Linear(self.dim * 2, self.dim),
nn.ReLU(),
nn.Linear(self.dim, self.dim),
nn.ReLU(),
nn.Linear(self.dim, 1),
)
def forward(self, s, t):
if isinstance(s, np.ndarray):
s = torch.from_numpy(s).float()
if isinstance(t, np.ndarray):
t = torch.from_numpy(t).float()
out = self.mlp(torch.cat([s, t], dim=1))
return out.squeeze(-1)
class Distance(nn.Module):
def __init__(self, encoder, distance):
nn.Module.__init__(self)
self.encoder = encoder
self.metrics = distance
def forward(self, s, t):
s = self.encoder(s)
t = self.encoder(t)
return self.metrics(s, t)
class MultiEncoderDistance(nn.Module):
def __init__(self, encoder_s, encoder_t, distance):
nn.Module.__init__(self)
self.encoder_s = encoder_s
self.encoder_t = encoder_t
self.metrics = distance
def forward(self, s, t):
s = self.encoder_s(s)
t = self.encoder_t(t)
return self.metrics(s, t)
import torch.nn.functional as F
import sys
sys.path.append('../')
from models.distance import *
import numpy as np
from torch.distributions.multivariate_normal import MultivariateNormal
import torch.distributions as D
"""
the input x in both networks should be [o, g], where o is the observation and g is the goal.
"""
def initialize_metrics(metric, dim):
if metric == 'L1':
return L1()
elif metric == 'L2':
return L2()
elif metric == 'dot':
return DotProd()
elif metric == 'MLP':
return MLPDist(dim)
else:
raise NotImplementedError
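# Hedged example (not part of the original file): initialize_metrics returns one of the
# distance modules from models.distance; the tensors below are dummy data.
def _example_metric_usage():
    metric = initialize_metrics('L2', dim=2)
    s, t = torch.randn(4, 2), torch.randn(4, 2)
    return metric(s, t)  # shape (4,): Euclidean distance between each pair of rows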
# define the actor network
class actor(nn.Module):
def __init__(self, env_params, goal_dim):
super(actor, self).__init__()
self.max_action = env_params['action_max']
self.fc1 = nn.Linear(env_params['low_dim'] + goal_dim, 400)
self.fc2 = nn.Linear(400, 400)
self.fc3 = nn.Linear(400, 400)
self.fc4 = nn.Linear(400, 400)
self.action_out = nn.Linear(400, env_params['action'])
def forward(self, obs, goal):
x = torch.cat([obs, goal], dim=1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
actions = self.max_action * torch.tanh(self.action_out(x))
return actions
# define the actor network
class Inverse_goal(nn.Module):
def __init__(self, env_params, goal_dim, hi_max_action):
super(Inverse_goal, self).__init__()
self.max_action = hi_max_action
self.fc1 = nn.Linear(env_params['obs'] * 2, 400)
self.fc2 = nn.Linear(400, 400)
self.fc3 = nn.Linear(400, 400)
self.fc4 = nn.Linear(400, 400)
self.action_out = nn.Linear(400, goal_dim)
def forward(self, obs, goal):
x = torch.cat([obs, goal], dim=1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
actions = self.max_action * torch.tanh(self.action_out(x))
return actions
# define the high-level actor network
class Hi_actor(nn.Module):
def __init__(self, env_params, real_goal_dim, maze_high, shallow, sigmoid=False):
super(Hi_actor, self).__init__()
self.max_action = maze_high
self.fc1 = nn.Linear(env_params['hi_dim'] + env_params['goal'], 400)
self.fc2 = nn.Linear(400, 400)
self.fc3 = nn.Linear(400, 400)
self.fc4 = nn.Linear(400, 400)
self.action_out = nn.Linear(400, real_goal_dim)
self.sigmoid = sigmoid
self.shallow = shallow
def forward(self, obs, goal):
x = torch.cat([obs, goal], dim=1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
if not self.shallow:
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
if self.sigmoid:
actions = self.max_action * torch.sigmoid(self.action_out(x))
else:
actions = self.max_action * torch.tanh(self.action_out(x))
return actions
class Hi_critic(nn.Module):
def __init__(self, env_params, args, real_goal_dim, maze_high):
super(Hi_critic, self).__init__()
self.max_action = maze_high
self.inp_dim = env_params['hi_dim'] + real_goal_dim + env_params['goal']
self.out_dim = 1
self.mid_dim = 400
self.gamma = args.gamma
if args.hi_layer == 1:
models = [nn.Linear(self.inp_dim, self.out_dim)]
else:
models = [nn.Linear(self.inp_dim, self.mid_dim)]
if args.hi_layer > 2:
for i in range(args.hi_layer - 2):
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.hi_layer > 1:
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.base = nn.Sequential(*models)
def forward(self, obs, goal, actions):
x = torch.cat([obs, actions / self.max_action], dim=1)
x = torch.cat([x, goal], dim=1)
dist = self.base(x)
return dist
class critic(nn.Module):
def __init__(self, env_params, args, goal_dim):
super(critic, self).__init__()
self.max_action = env_params['action_max']
self.inp_dim = env_params['low_dim'] + env_params['action'] + goal_dim
self.out_dim = 1
self.mid_dim = 400
if args.layer == 1:
models = [nn.Linear(self.inp_dim, self.out_dim)]
else:
models = [nn.Linear(self.inp_dim, self.mid_dim)]
if args.layer > 2:
for i in range(args.layer - 2):
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.layer > 1:
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.base = nn.Sequential(*models)
def forward(self, obs, goal, actions):
x = torch.cat([obs, actions / self.max_action], dim=1)
x = torch.cat([x, goal], dim=1)
dist = self.base(x)
return dist
class Critic_double(nn.Module):
def __init__(self, env_params, args):
super(Critic_double, self).__init__()
self.max_action = env_params['action_max']
self.inp_dim = env_params['obs'] + env_params['action'] + env_params['goal']
self.out_dim = 1
self.mid_dim = 400
if args.layer == 1:
models = [nn.Linear(self.inp_dim, self.out_dim)]
else:
models = [nn.Linear(self.inp_dim, self.mid_dim)]
if args.layer > 2:
for i in range(args.layer - 2):
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.layer > 1:
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.base = nn.Sequential(*models)
if args.layer == 1:
models1 = [nn.Linear(self.inp_dim, self.out_dim)]
else:
models1 = [nn.Linear(self.inp_dim, self.mid_dim)]
if args.layer > 2:
for i in range(args.layer - 2):
models1 += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.layer > 1:
models1 += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.base1 = nn.Sequential(*models1)
def forward(self, obs, goal, actions):
x = torch.cat([obs, actions / self.max_action], dim=1)
x = torch.cat([x, goal], dim=1)
dist = self.base(x)
dist1 = self.base1(x)
return dist, dist1
class criticWrapper(nn.Module):
def __init__(self, env_params, args, goal_dim):
super(criticWrapper, self).__init__()
self.base = critic(env_params, args, goal_dim)
self.args = args
self.gamma = args.gamma
def forward(self, obs, goal, actions):
dist = self.base(obs, goal, actions)
self.alpha = np.log(self.gamma)
return -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma)
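# Sketch of the identity behind the wrapper above (my reading, not original code): with
# alpha = log(gamma), -(1 - exp(d * alpha)) / (1 - gamma) = -(1 - gamma**d) / (1 - gamma),
# i.e. the discounted return of receiving reward -1 on each of the d steps to the goal.
def _check_distance_to_value(gamma=0.99, d=5):
    lhs = -(1 - gamma ** d) / (1 - gamma)
    rhs = -sum(gamma ** t for t in range(d))
    assert abs(lhs - rhs) < 1e-9
    return lhs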
class doubleWrapper(nn.Module):
def __init__(self, env_params, args):
super(doubleWrapper, self).__init__()
self.base = Critic_double(env_params, args)
self.args = args
self.gamma = args.gamma
def forward(self, obs, goal, actions):
dist, dist1 = self.base(obs, goal, actions)
self.alpha = np.log(self.gamma)
return -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma), -(1 - torch.exp(dist1 * self.alpha)) / (1 - self.gamma)
def Q1(self, obs, goal, actions):
dist, _ = self.base(obs, goal, actions)
self.alpha = np.log(self.gamma)
return -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma)
class EmbedNet(nn.Module):
def __init__(self, env_params, args):
super(EmbedNet, self).__init__()
self.max_action = env_params['action_max']
self.obs_dim = env_params['obs'] + env_params['action']
self.goal_dim = env_params['goal']
self.out_dim = 128
self.mid_dim = 400
if args.layer == 1:
obs_models = [nn.Linear(self.obs_dim, self.out_dim)]
goal_models = [nn.Linear(self.goal_dim, self.out_dim)]
else:
obs_models = [nn.Linear(self.obs_dim, self.mid_dim)]
goal_models = [nn.Linear(self.goal_dim, self.mid_dim)]
if args.layer > 2:
for i in range(args.layer - 2):
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
goal_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.layer > 1:
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
goal_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.obs_encoder = nn.Sequential(*obs_models)
self.goal_encoder = nn.Sequential(*goal_models)
self.metric = initialize_metrics(args.metric, self.out_dim)
def forward(self, obs, goal, actions):
s = torch.cat([obs, actions / self.max_action], dim=1)
s = self.obs_encoder(s)
g = self.goal_encoder(goal)
dist = self.metric(s, g)
return dist
class Qnet(nn.Module):
def __init__(self, env_params, args):
super(Qnet, self).__init__()
self.mid_dim = 100
self.metric = args.metric
self.action_n = env_params['action_dim']
self.obs_fc1 = nn.Linear(env_params['obs'], 256)
self.obs_fc2 = nn.Linear(256, self.mid_dim * self.action_n)
self.goal_fc1 = nn.Linear(env_params['goal'], 256)
self.goal_fc2 = nn.Linear(256, self.mid_dim)
if self.metric == 'MLP':
self.mlp = nn.Sequential(
nn.Linear(self.mid_dim * (self.action_n + 1), 128),
nn.ReLU(),
nn.Linear(128, self.action_n),
)
def forward(self, obs, goal):
s = F.relu(self.obs_fc1(obs))
s = F.relu(self.obs_fc2(s))
s = s.view(s.size(0), self.action_n, self.mid_dim)
g = F.relu(self.goal_fc1(goal))
g = F.relu(self.goal_fc2(g))
if self.metric == 'L1':
dist = torch.abs(s - g[:, None, :]).sum(dim=2)
elif self.metric == 'dot':
dist = -(s * g[:, None, :]).sum(dim=2)
elif self.metric == 'L2':
dist = ((torch.abs(s - g[:, None, :]) ** 2).sum(dim=2) + 1e-14) ** 0.5
elif self.metric == 'MLP':
s = s.view(s.size(0), -1)
x = torch.cat([s, g], dim=1)
dist = self.mlp(x)
else:
raise NotImplementedError
return dist
class QNetWrapper(nn.Module):
def __init__(self, env_params, args):
super(QNetWrapper, self).__init__()
self.base = Qnet(env_params, args)
self.args = args
self.gamma = args.gamma
def forward(self, obs, goal):
dist = self.base(obs, goal)
self.alpha = np.log(self.gamma)
qval = -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma)
return qval
class EmbedNetWrapper(nn.Module):
def __init__(self, env_params, args):
super(EmbedNetWrapper, self).__init__()
self.base = EmbedNet(env_params, args)
self.args = args
self.gamma = args.gamma
def forward(self, obs, goal, actions):
dist = self.base(obs, goal, actions)
self.alpha = np.log(self.gamma)
return -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma)
class RepresentationNetwork(nn.Module):
def __init__(self, env_params, layer, abs_range, out_dim):
super(RepresentationNetwork, self).__init__()
self.obs_dim = env_params['obs']
self.out_dim = out_dim
self.mid_dim = 100
if layer == 1:
obs_models = [nn.Linear(self.obs_dim, self.out_dim)]
else:
obs_models = [nn.Linear(self.obs_dim, self.mid_dim)]
if layer > 2:
for i in range(layer - 2):
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if layer > 1:
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.obs_encoder = nn.Sequential(*obs_models)
self.abs_range = abs_range
def forward(self, obs):
if len(obs.shape) == 1:
obs = obs.unsqueeze(0)
s = self.obs_encoder(obs)
return s
class DynamicsNetwork(nn.Module):
def __init__(self, env_params, abs_range, out_dim, tanh_output, use_prob, device):
super(DynamicsNetwork, self).__init__()
self.obs_dim = env_params['obs']
self.out_dim = out_dim
self.mid_dim = 100
# obs encoder
obs_models = [nn.Linear(self.obs_dim, self.mid_dim)]
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.obs_encoder = nn.Sequential(*obs_models)
self.abs_range = abs_range
# goal input
self.goal_input = nn.Linear(out_dim, int(self.mid_dim / 2))
self.dynamics_layer = nn.Linear(int(self.mid_dim / 2) + self.out_dim, self.mid_dim)
self.output_layer = nn.Linear(self.mid_dim, self.out_dim)
self.tanh_output = tanh_output
self.probabilistic_output = use_prob
self.device = device
def phi(self, obs):
if len(obs.shape) == 1:
obs = obs.unsqueeze(0)
s = self.obs_encoder(obs)
return s
def forward(self, obs, hi_action):
latent_s = self.obs_encoder(obs)
action_out = self.goal_input(hi_action)
action_out = F.relu(action_out)
x = torch.cat([latent_s, action_out], 1)
x = self.dynamics_layer(x)
x = F.relu(x)
x = self.output_layer(x)
if self.tanh_output:
x = self.abs_range * torch.tanh(x)
return x
elif self.probabilistic_output:
std_dev = torch.ones(x.shape[0], self.out_dim).to(self.device)
return D.Independent(D.Normal(x, std_dev), 1)
else:
return x
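# Hedged shape-check sketch (not part of the original file): run dummy observations
# through the representation and dynamics networks defined above. The env_params,
# abs_range and out_dim values below are placeholders.
def _example_latent_forward():
    env_params = {'obs': 29}
    phi = RepresentationNetwork(env_params, layer=3, abs_range=20, out_dim=2)
    dyn = DynamicsNetwork(env_params, abs_range=20, out_dim=2,
                          tanh_output=True, use_prob=False, device='cpu')
    obs = torch.randn(8, 29)
    subgoal = torch.randn(8, 2)        # a latent-space high-level action
    z = phi(obs)                       # (8, 2) latent state
    z_next_pred = dyn(obs, subgoal)    # (8, 2) predicted next latent state
    return z.shape, z_next_pred.shape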
import numpy as np
import gym
from arguments.arguments_hier_sac import get_args_ant, get_args_chain
from algos.hier_sac import hier_sac_agent
from goal_env.mujoco import *
import random
import torch
def get_env_params(env):
obs = env.reset()
# close the environment
params = {'obs': obs['observation'].shape[0], 'goal': obs['desired_goal'].shape[0],
'action': env.action_space.shape[0], 'action_max': env.action_space.high[0],
'max_timesteps': env._max_episode_steps}
return params
def launch(args):
# create the ddpg_agent
env = gym.make(args.env_name)
test_env = gym.make(args.test)
# if args.env_name == "AntPush-v1":
# test_env1 = gym.make("AntPushTest1-v1")
# test_env2 = gym.make("AntPushTest2-v1")
# elif args.env_name == "AntMaze1-v1":
# test_env1 = gym.make("AntMaze1Test1-v1")
# test_env2 = gym.make("AntMaze1Test2-v1")
# else:
test_env1 = test_env2 = None
print("test_env", test_env1, test_env2)
# set random seeds for reproduce
env.seed(args.seed)
if args.env_name != "NChain-v1":
env.env.env.wrapped_env.seed(args.seed)
test_env.env.env.wrapped_env.seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.device != 'cpu':
torch.cuda.manual_seed(args.seed)
gym.spaces.prng.seed(args.seed)
# get the environment parameters
if args.env_name[:3] in ["Ant", "Poi", "Swi"]:
env.env.env.visualize_goal = args.animate
test_env.env.env.visualize_goal = args.animate
env_params = get_env_params(env)
env_params['max_test_timesteps'] = test_env._max_episode_steps
# create the ddpg agent to interact with the environment
sac_trainer = hier_sac_agent(args, env, env_params, test_env, test_env1, test_env2)
if args.eval:
if not args.resume:
print("random policy !!!")
# sac_trainer._eval_hier_agent(test_env)
# sac_trainer.vis_hier_policy()
# sac_trainer.cal_slow()
# sac_trainer.visualize_representation(100)
# sac_trainer.vis_learning_process()
# sac_trainer.picvideo('fig/final/', (1920, 1080))
else:
sac_trainer.learn()
# get the params
args = get_args_ant()
# args = get_args_chain()
# args = get_args_fetch()
# args = get_args_point()
if __name__ == '__main__':
launch(args)