Commit 2267b85e by lsy

Initial commit

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
*.local
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
fig/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
*.jpg
*.jpeg
.idea/
*.npy
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
runs/
prev_runs/
saved_models/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# DS Store
.DS_Store
# saved models
*.pth
*.pt
*.avi
# Learning Subgoal Representations with Slow Dynamics
We propose a slowness objective to effectively learn the subgoal representation
for goal-conditioned hierarchical reinforcement learning. [Our paper](https://openreview.net/pdf?id=wxRwhSdORKG) was accepted at ICLR 2021.
The Python dependencies are as follows:
* Python 3.6 or above
* [PyTorch](https://pytorch.org/)
* [Gym](https://gym.openai.com/)
* [MuJoCo](https://www.roboti.us)
Run the code with ``python train_hier_sac.py``. TensorBoard logs are written to the ``runs`` folder, and the
trained models are saved in the ``saved_models`` folder.
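The slowness objective shapes the subgoal representation so that consecutive states stay close in the latent space, while states that are ``c`` environment steps apart are pushed at least a unit margin apart. A minimal sketch of the loss, mirroring ``slow_update_phi`` in ``train_hier_sac.py`` (batch construction and regularization terms omitted):
```
import torch

def slowness_loss(phi, obs, obs_next, hi_obs, hi_obs_next):
    # pull phi(s_t) and phi(s_{t+1}) together (slowness term)
    min_dist = torch.clamp((phi(obs) - phi(obs_next)).pow(2).mean(dim=1), min=0.)
    # push phi(s_t) and phi(s_{t+c}) at least a unit margin apart (contrastive term)
    max_dist = torch.clamp(1 - (phi(hi_obs) - phi(hi_obs_next)).pow(2).mean(dim=1), min=0.)
    return (min_dist + max_dist).mean()
```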
import numpy as np
class her_sampler:
def __init__(self, replay_strategy, replay_k, threshold, future_step, dense_reward, direction_reward, low_reward_coeff):
self.replay_strategy = replay_strategy
self.replay_k = replay_k
if self.replay_strategy == 'future':
self.future_p = 1 - (1. / (1 + replay_k))
else:
self.future_p = 0
self.threshold = threshold
self.future_step = future_step
self.border_index = None
self.direction_reward = direction_reward
# reward type is not used when direction_reward is set
if not dense_reward:
self.reward_type = 'sparse'
else:
self.reward_type = 'dense'
self.reward_coeff = low_reward_coeff
def reward_func(self, state, goal, info=None):
assert state.shape == goal.shape
dist = np.linalg.norm(state - goal, axis=-1)
if self.reward_type == 'sparse':
return -(dist > self.threshold).astype(np.float32)
else:
return -dist * self.reward_coeff
def direction_reward_func(self, ag_next, goal, ag):
# l2 distance reward: the goal is a desired displacement, so the reward is maximal when ag_next = ag + goal
assert ag.shape == goal.shape
dist = np.linalg.norm(ag + goal - ag_next, axis=-1)
return -dist
# # cosine distance reward
# a_direction = ag_next - ag # achieved direction
# cos_dist = np.sum(np.multiply(a_direction, goal), axis=1) / (
# (np.linalg.norm(a_direction, axis=1) * np.linalg.norm(goal, axis=1)) + 1e-6)
# return cos_dist
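# Worked example (hypothetical numbers): with ag = [0, 0], goal = [1, 0] (a desired step
# along +x) and ag_next = [0.8, 0.1], the l2 direction reward above is
# -||ag + goal - ag_next|| = -||[0.2, -0.1]|| ~= -0.22, so the low-level policy is
# rewarded for moving the achieved goal along the commanded direction.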
def sample_her_transitions(self, episode_batch, batch_size_in_transitions):
T = episode_batch['actions'].shape[1]
rollout_batch_size = episode_batch['actions'].shape[0]
batch_size = batch_size_in_transitions
# select which rollouts and which timesteps to be used
episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
t_samples = np.random.randint(T, size=batch_size)
transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() for key in episode_batch.keys()}
# her idx
her_indexes = np.where(np.random.uniform(size=batch_size) < self.future_p)
# cap the HER relabeling horizon at future_step (useful for large step lengths)
target_index = np.minimum(T, t_samples + self.future_step)
future_offset = np.random.uniform(size=batch_size) * (target_index - t_samples)
future_offset = future_offset.astype(int)
future_t = (t_samples + 1 + future_offset)[her_indexes]
# replace goal with achieved goal
future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t]
transitions['g'][her_indexes] = future_ag
# to get the params to re-compute reward
if not self.direction_reward:
transitions['r'] = np.expand_dims(
self.reward_func(transitions['ag_next'], transitions['g'],
None), 1)
else:
transitions['r'] = np.expand_dims(
self.direction_reward_func(transitions['ag_next'].copy(), transitions['g'].copy(),
transitions['ag'].copy()), 1)
transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()}
return transitions
def sample_her_energy(self, episode_batch, batch_size_in_transitions, temperature=1.0):
T = episode_batch['actions'].shape[1]
rollout_batch_size = episode_batch['actions'].shape[0]
batch_size = batch_size_in_transitions
# select which rollouts and which timesteps to be used
energy_trajectory = episode_batch['e']
p_trajectory = np.power(energy_trajectory, 1 / (temperature + 1e-2))
p_trajectory = p_trajectory / p_trajectory.sum()
episode_idxs = np.random.choice(rollout_batch_size, size=batch_size, replace=True, p=p_trajectory.flatten())
t_samples = np.random.randint(T, size=batch_size)
transitions = {}
for key in episode_batch.keys():
if not key == 'e':
transitions[key] = episode_batch[key][episode_idxs, t_samples].copy()
# her idx
her_indexes = np.where(np.random.uniform(size=batch_size) < self.future_p)
# cap the HER relabeling horizon at future_step (useful for large step lengths)
target_index = np.minimum(T, t_samples + self.future_step)
future_offset = np.random.uniform(size=batch_size) * (target_index - t_samples)
future_offset = future_offset.astype(int)
future_t = (t_samples + 1 + future_offset)[her_indexes]
# replace goal with achieved goal
future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t]
transitions['g'][her_indexes] = future_ag
# to get the params to re-compute reward
if not self.direction_reward:
transitions['r'] = np.expand_dims(
self.reward_func(transitions['ag_next'], transitions['g'],
None), 1)
else:
transitions['r'] = np.expand_dims(
self.direction_reward_func(transitions['ag_next'], transitions['g'],
transitions['ag']), 1)
transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()}
return transitions
def adjust_replay_k(self):
if self.replay_k > 1:
self.replay_k -= 1
if self.replay_strategy == 'future':
self.future_p = 1 - (1. / (1 + self.replay_k))
else:
self.future_p = 0
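A self-contained usage sketch of the sampler above, with toy shapes (all dimensions here are illustrative; in training the episode batch is built by ``replay_buffer.sample`` below, and the module path is taken from the agent's import):
```
import numpy as np
from algos.her import her_sampler

num_episodes, T, obs_dim, goal_dim, act_dim = 4, 50, 30, 2, 8
episode_batch = {
    'obs': np.random.randn(num_episodes, T + 1, obs_dim),
    'ag': np.random.randn(num_episodes, T + 1, goal_dim),
    'g': np.random.randn(num_episodes, T, goal_dim),
    'actions': np.random.randn(num_episodes, T, act_dim),
}
episode_batch['obs_next'] = episode_batch['obs'][:, 1:, :]
episode_batch['ag_next'] = episode_batch['ag'][:, 1:, :]

sampler = her_sampler('future', replay_k=4, threshold=0.5, future_step=200,
                      dense_reward=True, direction_reward=False, low_reward_coeff=0.1)
transitions = sampler.sample_her_transitions(episode_batch, batch_size_in_transitions=16)
# transitions is a dict of (16, dim) arrays with relabeled goals 'g' and recomputed rewards 'r'
```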
import os
import sys
sys.path.append('../')
from datetime import datetime
from tensorboardX import SummaryWriter
from models.networks import *
from algos.replay_buffer import replay_buffer, replay_buffer_energy
from algos.her import her_sampler
# from planner.goal_plan import *
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import time
from algos.sac.sac import SAC
from algos.sac.replay_memory import ReplayMemory, Array_ReplayMemory
import gym
import pickle
# from planner.simhash import HashingBonusEvaluator
from PIL import Image
import imageio
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set_color_codes()
SUBGOAL_RANGE = 200.0
class hier_sac_agent:
def __init__(self, args, env, env_params, test_env, test_env1=None, test_env2=None):
self.args = args
self.env = env
self.test_env = test_env
self.env_params = env_params
self.device = args.device
self.resume = args.resume
self.resume_epoch = args.resume_epoch
self.not_train_low = False
self.test_env1 = test_env1
self.test_env2 = test_env2
self.old_sample = args.old_sample
self.low_dim = env_params['obs']
self.env_params['low_dim'] = self.low_dim
self.hi_dim = env_params['obs']
print("hi_dim", self.hi_dim)
self.learn_goal_space = True
self.whole_obs = False # use whole observation space as subgoal space
self.abs_range = abs_range = args.abs_range # absolute goal range
self.feature_reg = 0.0 # feature l2 regularization
print("abs_range", abs_range)
if args.env_name[:5] == "Fetch":
maze_low = self.env.env.initial_gripper_xpos[:2] - self.env.env.target_range
maze_high = self.env.env.initial_gripper_xpos[:2] + self.env.env.target_range
self.hi_act_space = gym.spaces.Box(low=maze_low, high=maze_high)
else:
if args.env_name != "NChain-v1":
self.hi_act_space = self.env.env.maze_space
else:
self.hi_act_space = gym.spaces.Box(low=np.array([-1]), high=np.array([1]))
if self.learn_goal_space:
if args.env_name == "NChain-v1":
self.hi_act_space = gym.spaces.Box(low=np.array([-abs_range]), high=np.array([abs_range]))
else:
self.hi_act_space = gym.spaces.Box(low=np.array([-abs_range, -abs_range]), high=np.array([abs_range, abs_range]))
if self.whole_obs:
vel_low = [-10.] * 4
vel_high = [10.] * 4
maze_low = np.concatenate((self.env.env.maze_low, np.array(vel_low)))
maze_high = np.concatenate((self.env.env.maze_high, np.array(vel_high)))
self.hi_act_space = gym.spaces.Box(low=maze_low, high=maze_high)
dense_low = True
self.low_use_clip = not dense_low # only sparse reward use clip
if args.replay_strategy == "future":
self.low_forward = True
assert self.low_use_clip is True
else:
self.low_forward = False
assert self.low_use_clip is False
self.hi_sparse = (self.env.env.reward_type == "sparse")
# params of learning phi
resume_phi = args.resume
self.not_update_phi = False
phi_path = args.resume_path
# resume_phi = True
# phi_path = 'saved_models/AntMaze1-v1_Jun01_19-26-19'
# self.not_update_phi = True
self.save_fig = False
self.save_model = False
self.start_update_phi = args.start_update_phi
self.early_stop = args.early_stop # after the success rate converges, stop updating the low-level policy and the feature
if args.env_name in ['AntPush-v1', 'AntFall-v1']:
if self.not_update_phi:
self.early_stop_thres = 900
else:
self.early_stop_thres = 3500
elif args.env_name in ["PointMaze1-v1"]:
self.early_stop_thres = 2000
elif args.env_name == "AntMaze1-v1":
self.early_stop_thres = 3000
else:
self.early_stop_thres = args.n_epochs
print("early_stop_threshold", self.early_stop_thres)
self.success_log = []
# scaling = self.env.env.env.MAZE_SIZE_SCALING
# print("scaling", scaling)
self.count_latent = False
if self.count_latent:
self.hash = HashingBonusEvaluator(512, 2)
self.count_obs = False
if self.count_obs:
self.hash = HashingBonusEvaluator(512, env_params['obs'])
self.high_correct = False
self.k = args.c
self.delta_k = 0
self.prediction_coeff = 0.0
tanh_output = False
self.use_prob = False
print("prediction_coeff", self.prediction_coeff)
if args.save:
current_time = datetime.now().strftime('%b%d_%H-%M-%S')
self.log_dir = 'runs/hier/' + str(args.env_name) + '/RB_Decay_' + current_time + \
"_C_" + str(args.c) + "_Image_" + str(args.image) + \
"_Seed_" + str(args.seed) + "_Reward_" + str(args.low_reward_coeff) + \
"_NoPhi_" + str(self.not_update_phi) + "_LearnG_" + str(self.learn_goal_space) + "_Early_" + str(self.early_stop_thres) + str(args.early_stop)
self.writer = SummaryWriter(log_dir=self.log_dir)
if not os.path.exists(self.args.save_dir):
os.mkdir(self.args.save_dir)
# path to save the model
self.model_path = os.path.join(self.args.save_dir, self.args.env_name + "_" + current_time)
if not os.path.exists(self.model_path):
os.mkdir(self.model_path)
# init low-level network
self.real_goal_dim = self.hi_act_space.shape[0] # low-level goal space and high-level action space
self.init_network()
# init high-level agent
self.hi_agent = SAC(self.hi_dim + env_params['goal'], self.hi_act_space, args, False, env_params['goal'],
args.gradient_flow_value, args.abs_range, tanh_output)
self.env_params['real_goal_dim'] = self.real_goal_dim
self.hi_buffer = ReplayMemory(args.buffer_size)
# her sampler
self.c = self.args.c # interval of high level action
self.low_her_module = her_sampler(args.replay_strategy, args.replay_k, args.distance, args.future_step,
dense_reward=dense_low, direction_reward=False, low_reward_coeff=args.low_reward_coeff)
if args.env_name[:5] == "Fetch":
self.low_buffer = replay_buffer_energy(self.env_params, self.args.buffer_size,
self.low_her_module.sample_her_energy, args.env_name)
else:
self.low_buffer = replay_buffer(self.env_params, self.args.buffer_size, self.low_her_module.sample_her_transitions)
not_load_buffer, not_load_high = True, False
if self.resume is True:
self.start_epoch = self.resume_epoch
if not not_load_high:
self.hi_agent.policy.load_state_dict(torch.load(self.args.resume_path + \
'/hi_actor_model.pt', map_location='cuda:4')[0])
# self.hi_agent.critic.load_state_dict(torch.load(self.args.resume_path + \
# '/hi_critic_model.pt', map_location='cuda:4')[0])
# print("not load low !!!")
print("load low !!!")
self.low_actor_network.load_state_dict(torch.load(self.args.resume_path + \
'/low_actor_model.pt', map_location='cuda:4')[0])
self.low_critic_network.load_state_dict(torch.load(self.args.resume_path + \
'/low_critic_model.pt', map_location='cuda:4')[0])
if not not_load_buffer:
# self.hi_buffer = torch.load(self.args.resume_path + '/hi_buffer.pt', map_location='cuda:1')
self.low_buffer = torch.load(self.args.resume_path + '/low_buffer.pt', map_location='cuda:1')
# sync target network of low-level
self.sync_target()
if hasattr(self.env.env, 'env'):
self.animate = self.env.env.env.visualize_goal
else:
self.animate = self.args.animate
self.distance_threshold = self.args.distance
if not (args.gradient_flow or args.use_prediction or args.gradient_flow_value):
self.representation = RepresentationNetwork(env_params, 3, self.abs_range, self.real_goal_dim).to(args.device)
if args.use_target:
self.target_phi = RepresentationNetwork(env_params, 3, self.abs_range, 2).to(args.device)
# load the weights into the target networks
self.target_phi.load_state_dict(self.representation.state_dict())
self.representation_optim = torch.optim.Adam(self.representation.parameters(), lr=0.0001)
if resume_phi is True:
print("load phi from: ", phi_path)
self.representation.load_state_dict(torch.load(phi_path + \
'/phi_model_4000.pt', map_location='cuda:4')[0])
elif args.use_prediction:
self.representation = DynamicsNetwork(env_params, self.abs_range, 2, tanh_output=tanh_output, use_prob=self.use_prob, device=args.device).to(args.device)
self.representation_optim = torch.optim.Adam(self.representation.parameters(), lr=0.0001)
if resume_phi is True:
print("load phi from: ", phi_path)
self.representation.load_state_dict(torch.load(phi_path + \
'/phi_model_4000.pt', map_location='cuda:1')[0])
print("learn goal space", self.learn_goal_space, " update phi", not self.not_update_phi)
self.train_success = 0
self.furthest_task = 0.
def adjust_lr_actor(self, epoch):
lr_actor = self.args.lr_actor * (0.5 ** (epoch // self.args.lr_decay_actor))
for param_group in self.low_actor_optim.param_groups:
param_group['lr'] = lr_actor
def adjust_lr_critic(self, epoch):
lr_critic = self.args.lr_critic * (0.5 ** (epoch // self.args.lr_decay_critic))
for param_group in self.low_critic_optim.param_groups:
param_group['lr'] = lr_critic
def learn(self):
for epoch in range(self.start_epoch, self.args.n_epochs):
if epoch > 0 and epoch % self.args.lr_decay_actor == 0:
self.adjust_lr_actor(epoch)
if epoch > 0 and epoch % self.args.lr_decay_critic == 0:
self.adjust_lr_critic(epoch)
ep_obs, ep_ag, ep_g, ep_actions = [], [], [], []
last_hi_obs = None
success = 0
observation = self.env.reset()
obs = observation['observation']
ag = observation['achieved_goal'][:self.real_goal_dim]
g = observation['desired_goal']
# identify furthest task
if g[1] >= 8:
self.furthest_task += 1
is_furthest_task = True
else:
is_furthest_task = False
if self.learn_goal_space:
if self.args.gradient_flow:
if self.args.use_target:
ag = self.hi_agent.policy_target.phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()
else:
ag = self.hi_agent.policy.phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()
elif self.args.gradient_flow_value:
ag = self.hi_agent.critic.phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
elif self.args.use_prediction:
ag = self.representation.phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
else:
if self.args.use_target:
ag = self.target_phi(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
else:
ag = self.representation(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
if self.whole_obs:
ag = obs.copy()
for t in range(self.env_params['max_timesteps']):
act_obs, act_g = self._preproc_inputs(obs, g)
if t % self.c == 0:
hi_act_obs = np.concatenate((obs[:self.hi_dim], g))
# append high-level rollouts
if last_hi_obs is not None:
mask = float(not done)
if self.high_correct:
last_hi_a = ag
self.hi_buffer.push(last_hi_obs, last_hi_a, last_hi_r, hi_act_obs, mask, epoch)
if epoch < self.args.start_epoch:
hi_action = self.hi_act_space.sample()
# print("sample", hi_action)
else:
hi_action = self.hi_agent.select_action(hi_act_obs)
last_hi_obs = hi_act_obs.copy()
last_hi_a = hi_action.copy()
last_hi_r = 0.
done = False
if self.old_sample:
hi_action_for_low = hi_action
else:
# make hi_action a delta phi(s)
hi_action_for_low = ag.copy() + hi_action.copy()
hi_action_for_low = np.clip(hi_action_for_low, -SUBGOAL_RANGE, SUBGOAL_RANGE)
hi_action_tensor = torch.tensor(hi_action_for_low, dtype=torch.float32).unsqueeze(0).to(self.device)
# update high-level policy
if len(self.hi_buffer) > self.args.batch_size:
self.update_hi(epoch)
with torch.no_grad():
if self.not_train_low:
action = self.test_policy(act_obs[:, :self.low_dim], hi_action_tensor)
else:
action = self.explore_policy(act_obs[:, :self.low_dim], hi_action_tensor)
# feed the actions into the environment
observation_new, r, _, info = self.env.step(action)
if info['is_success']:
done = True
# only record the first success
if success == 0 and is_furthest_task:
success = t
self.train_success += 1
if self.animate:
self.env.render()
obs_new = observation_new['observation']
ag_new = observation_new['achieved_goal'][:self.real_goal_dim]
if self.learn_goal_space:
if self.args.gradient_flow:
if self.args.use_target:
ag_new = self.hi_agent.policy_target.phi(
torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()
else:
ag_new = self.hi_agent.policy.phi(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()
elif self.args.gradient_flow_value:
ag_new = self.hi_agent.critic.phi(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()[0]
elif self.args.use_prediction:
ag_new = self.representation.phi(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()[0]
else:
if self.args.use_target:
ag_new = self.target_phi(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()[0]
else:
ag_new = self.representation(torch.Tensor(obs_new).to(self.device)).detach().cpu().numpy()[0]
if self.whole_obs:
ag_new = obs_new.copy()
if done is False:
if self.count_latent:
self.hash.inc_hash(ag[None])
r += self.hash.predict(ag_new[None])[0] * 0.1
if self.count_obs:
self.hash.inc_hash(obs[None])
r += self.hash.predict(obs_new[None])[0] * 0.1
last_hi_r += r
# append rollouts
ep_obs.append(obs[:self.low_dim].copy())
ep_ag.append(ag.copy())
ep_g.append(hi_action_for_low.copy())
ep_actions.append(action.copy())
# re-assign the observation
obs = obs_new
ag = ag_new
# slowly update phi
if epoch > self.start_update_phi and not self.not_update_phi and not self.args.gradient_flow and not self.args.gradient_flow_value:
self.slow_update_phi(epoch)
if t % self.args.period == 0 and self.args.use_target:
self._soft_update_target_network(self.target_phi, self.representation)
ep_obs.append(obs[:self.low_dim].copy())
ep_ag.append(ag.copy())
mask = float(not done)
hi_act_obs = np.concatenate((obs[:self.hi_dim], g))
self.hi_buffer.push(last_hi_obs, last_hi_a, last_hi_r, hi_act_obs, mask, epoch)
mb_obs = np.array([ep_obs])
mb_ag = np.array([ep_ag])
mb_g = np.array([ep_g])
mb_actions = np.array([ep_actions])
self.low_buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions, success, False])
if self.args.save and self.args.env_name == "NChain-v1":
self.writer.add_scalar('Explore/coverage_' + self.args.env_name, self.env.env.coverage, epoch)
# print("coverage", self.env.env.coverage)
# update low-level
if not self.not_train_low:
for n_batch in range(self.args.n_batches):
self._update_network(epoch, self.low_buffer, self.low_actor_target_network,
self.low_critic_target_network,
self.low_actor_network, self.low_critic_network, 'max_timesteps',
self.low_actor_optim, self.low_critic_optim, use_forward_loss=self.low_forward, clip=self.low_use_clip)
if n_batch % self.args.period == 0:
self._soft_update_target_network(self.low_actor_target_network, self.low_actor_network)
self._soft_update_target_network(self.low_critic_target_network, self.low_critic_network)
# start to do the evaluation
if epoch % self.args.eval_interval == 0 and epoch != 0:
if self.test_env1 is not None:
eval_success1, _ = self._eval_hier_agent(env=self.test_env1)
eval_success2, _ = self._eval_hier_agent(env=self.test_env2)
farthest_success_rate, _ = self._eval_hier_agent(env=self.test_env)
random_success_rate, _ = self._eval_hier_agent(env=self.env)
self.success_log.append(farthest_success_rate)
mean_success = np.mean(self.success_log[-5:])
# stop updating phi and low
if self.early_stop and (mean_success >= 0.9 or epoch > self.early_stop_thres):
print("early stop !!!")
self.not_update_phi = True
self.not_train_low = True
print('[{}] epoch is: {}, eval hier success rate is: {:.3f}'.format(datetime.now(), epoch, random_success_rate))
if self.save_fig:
self.vis_hier_policy(epoch=epoch)
self.visualize_representation(epoch=epoch)
if self.args.save:
print("log_dir: ", self.log_dir)
torch.save([self.hi_agent.critic.state_dict()], self.model_path + '/hi_critic_model.pt')
torch.save([self.low_critic_network.state_dict()], self.model_path + '/low_critic_model.pt')
torch.save(self.hi_buffer, self.model_path + '/hi_buffer.pt')
torch.save(self.low_buffer, self.model_path + '/low_buffer.pt')
if not self.args.gradient_flow and not self.args.gradient_flow_value:
if self.save_model:
# self.cal_MIV(epoch)
torch.save([self.representation.state_dict()], self.model_path + '/phi_model_{}.pt'.format(epoch))
torch.save([self.hi_agent.policy.state_dict()], self.model_path + '/hi_actor_{}.pt'.format(epoch))
torch.save([self.low_actor_network.state_dict()], self.model_path + '/low_actor_{}.pt'.format(epoch))
else:
torch.save([self.representation.state_dict()], self.model_path + '/phi_model.pt')
torch.save([self.hi_agent.policy.state_dict()], self.model_path + '/hi_actor_model.pt')
torch.save([self.low_actor_network.state_dict()], self.model_path + '/low_actor_model.pt')
self.writer.add_scalar('Success_rate/hier_farthest_' + self.args.env_name, farthest_success_rate, epoch)
self.writer.add_scalar('Success_rate/hier_random_' + self.args.env_name, random_success_rate, epoch)
self.writer.add_scalar('Explore/furthest_task_' + self.args.env_name, self.furthest_task, epoch)
if self.test_env1 is not None:
self.writer.add_scalar('Success_rate/eval1_' + self.args.env_name,
eval_success1, epoch)
self.writer.add_scalar('Success_rate/eval2_' + self.args.env_name, eval_success2,
epoch)
# pre_process the inputs
def _preproc_inputs(self, obs, g):
obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(self.device)
g = torch.tensor(g, dtype=torch.float32).unsqueeze(0).to(self.device)
return obs, g
# this function will choose action for the agent and do the exploration
def _select_actions(self, pi):
action = pi.cpu().numpy().squeeze()
if action.shape == ():
action = np.array([action])
# add the gaussian
action += self.args.noise_eps * self.env_params['action_max'] * np.random.randn(*action.shape)
action = np.clip(action, -self.env_params['action_max'], self.env_params['action_max'])
# random actions...
if np.random.rand() < self.args.random_eps:
action = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'], \
size=self.env_params['action'])
return action
def explore_policy(self, obs, goal):
pi = self.low_actor_network(obs, goal)
action = self._select_actions(pi)
return action
def update_hi(self, epoch):
if self.args.gradient_flow or self.args.gradient_flow_value:
sample_data, _ = self.slow_collect()
sample_data = torch.tensor(sample_data, dtype=torch.float32).to(self.device)
else:
sample_data = None
critic_1_loss, critic_2_loss, policy_loss, _, _ = self.hi_agent.update_parameters(self.hi_buffer,
self.args.batch_size,
self.env_params,
self.hi_sparse,
sample_data)
if self.args.save:
self.writer.add_scalar('Loss/hi_critic_1', critic_1_loss, epoch)
self.writer.add_scalar('Loss/hi_critic_2', critic_2_loss, epoch)
self.writer.add_scalar('Loss/hi_policy', policy_loss, epoch)
def random_policy(self, obs, goal):
random_actions = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'], \
size=self.env_params['action'])
return random_actions
def test_policy(self, obs, goal):
pi = self.low_actor_network(obs, goal)
# convert the actions
actions = pi.detach().cpu().numpy().squeeze()
if actions.shape == ():
actions = np.array([actions])
return actions
# soft update: target <- (1 - polyak) * source + polyak * target (args.polyak is the fraction of the old target kept)
def _soft_update_target_network(self, target, source):
for target_param, param in zip(target.parameters(), source.parameters()):
target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)
# update the network
def _update_network(self, epoch, buffer, actor_target, critic_target, actor, critic, T, actor_optim, critic_optim, use_forward_loss=True, clip=True):
# sample the episodes
transitions = buffer.sample(self.args.batch_size)
# pre-process the observation and goal
o, o_next, g, ag = transitions['obs'], transitions['obs_next'], transitions['g'], transitions['ag']
transitions['obs'], transitions['g'] = o, g
transitions['obs_next'], transitions['g_next'] = o_next, g
ag_next = transitions['ag_next']
# start to do the update
obs_cur = transitions['obs']
g_cur = transitions['g']
obs_next = transitions['obs_next']
g_next = transitions['g_next']
# done
dist = np.linalg.norm(ag_next - g_next, axis=1)
not_done = (dist > self.distance_threshold).astype(np.int32).reshape(-1, 1)
# transfer them into the tensor
obs_cur = torch.tensor(obs_cur, dtype=torch.float32).to(self.device)
g_cur = torch.tensor(g_cur, dtype=torch.float32).to(self.device)
obs_next = torch.tensor(obs_next, dtype=torch.float32).to(self.device)
g_next = torch.tensor(g_next, dtype=torch.float32).to(self.device)
ag_next = torch.tensor(ag_next, dtype=torch.float32).to(self.device)
not_done = torch.tensor(not_done, dtype=torch.int32).to(self.device)
actions_tensor = torch.tensor(transitions['actions'], dtype=torch.float32).to(self.device)
r_tensor = torch.tensor(transitions['r'], dtype=torch.float32).to(self.device)
# calculate the target Q value function
with torch.no_grad():
actions_next = actor_target(obs_next, g_next)
q_next_value = critic_target(obs_next, g_next, actions_next)
q_next_value = q_next_value.detach()
target_q_value = r_tensor + critic_target.gamma * q_next_value * not_done
target_q_value = target_q_value.detach()
if clip:
clip_return = self.env_params[T]
target_q_value = torch.clamp(target_q_value, -clip_return, 0.)
# the q loss
real_q_value = critic(obs_cur, g_cur, actions_tensor)
critic_loss = (target_q_value - real_q_value).pow(2).mean()
if use_forward_loss:
forward_loss = critic(obs_cur, ag_next, actions_tensor).pow(2).mean()
critic_loss += forward_loss
# the actor loss
actions_real = actor(obs_cur, g_cur)
actor_loss = -critic(obs_cur, g_cur, actions_real).mean()
actor_loss += self.args.action_l2 * (actions_real / self.env_params['action_max']).pow(2).mean()
# start to update the network
actor_optim.zero_grad()
actor_loss.backward()
torch.nn.utils.clip_grad_norm_(self.low_actor_network.parameters(), 1.0)
actor_optim.step()
# update the critic_network
critic_optim.zero_grad()
critic_loss.backward()
torch.nn.utils.clip_grad_norm_(self.low_critic_network.parameters(), 1.0)
critic_optim.step()
if self.args.save:
if T == 'max_timesteps':
name = 'low'
else:
name = 'high'
self.writer.add_scalar('Loss/' + name + '_actor_loss' + self.args.metric, actor_loss, epoch)
self.writer.add_scalar('Loss/' + name + '_critic_loss' + self.args.metric, critic_loss, epoch)
def _eval_hier_agent(self, env, n_test_rollouts=10):
total_success_rate = []
if not self.args.eval:
n_test_rollouts = self.args.n_test_rollouts
discount_reward = np.zeros(n_test_rollouts)
for roll in range(n_test_rollouts):
per_success_rate = []
observation = env.reset()
obs = observation['observation']
g = observation['desired_goal']
for num in range(self.env_params['max_test_timesteps']):
with torch.no_grad():
act_obs, act_g = self._preproc_inputs(obs, g)
if num % self.c == 0:
hi_act_obs = np.concatenate((obs[:self.hi_dim], g))
hi_action = self.hi_agent.select_action(hi_act_obs, evaluate=True)
if self.old_sample:
new_hi_action = hi_action
else:
ag = self.representation(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
new_hi_action = ag + hi_action
new_hi_action = np.clip(new_hi_action, -SUBGOAL_RANGE, SUBGOAL_RANGE)
hi_action_tensor = torch.tensor(new_hi_action, dtype=torch.float32).unsqueeze(0).to(self.device)
action = self.test_policy(act_obs[:, :self.low_dim], hi_action_tensor)
observation_new, rew, done, info = env.step(action)
if self.animate:
env.render()
obs = observation_new['observation']
g = observation_new['desired_goal']
if done:
per_success_rate.append(info['is_success'])
if bool(info['is_success']):
# print("t:", num)
discount_reward[roll] = 1 - 1. / self.env_params['max_test_timesteps'] * num
break
total_success_rate.append(per_success_rate)
total_success_rate = np.array(total_success_rate)
global_success_rate = np.mean(total_success_rate[:, -1])
global_reward = np.mean(discount_reward)
if self.args.eval:
print("hier success rate", global_success_rate, global_reward)
return global_success_rate, global_reward
def init_network(self):
self.low_actor_network = actor(self.env_params, self.real_goal_dim).to(self.device)
self.low_actor_target_network = actor(self.env_params, self.real_goal_dim).to(self.device)
self.low_critic_network = criticWrapper(self.env_params, self.args, self.real_goal_dim).to(self.device)
self.low_critic_target_network = criticWrapper(self.env_params, self.args, self.real_goal_dim).to(self.device)
self.start_epoch = 0
# create the optimizer
self.low_actor_optim = torch.optim.Adam(self.low_actor_network.parameters(), lr=self.args.lr_actor)
self.low_critic_optim = torch.optim.Adam(self.low_critic_network.parameters(), lr=self.args.lr_critic, weight_decay=1e-5)
def sync_target(self):
# load the weights into the target networks
self.low_actor_target_network.load_state_dict(self.low_actor_network.state_dict())
self.low_critic_target_network.load_state_dict(self.low_critic_network.state_dict())
def slow_update_phi(self, epoch):
sample_data, hi_action = self.slow_collect()
sample_data = torch.tensor(sample_data, dtype=torch.float32).to(self.device)
if not self.args.use_prediction:
obs, obs_next = self.representation(sample_data[0]), self.representation(sample_data[1])
min_dist = torch.clamp((obs - obs_next).pow(2).mean(dim=1), min=0.)
hi_obs, hi_obs_next = self.representation(sample_data[2]), self.representation(sample_data[3])
max_dist = torch.clamp(1 - (hi_obs - hi_obs_next).pow(2).mean(dim=1), min=0.)
representation_loss = (min_dist + max_dist).mean()
# add l2 regularization
representation_loss += self.feature_reg * (obs / self.abs_range).pow(2).mean()
else:
hi_action = torch.tensor(hi_action, dtype=torch.float32).to(self.device)
with torch.no_grad():
target_next_obs = self.representation.phi(sample_data[3])
obs, obs_next = self.representation.phi(sample_data[0]), self.representation.phi(sample_data[1])
min_dist = torch.clamp((obs - obs_next).pow(2).mean(dim=1), min=0.)
hi_obs, hi_obs_next = self.representation.phi(sample_data[2]), self.representation.phi(sample_data[3])
max_dist = torch.clamp(1 - (hi_obs - hi_obs_next).pow(2).mean(dim=1), min=0.)
representation_loss = (min_dist + max_dist).mean()
# prediction loss
if self.use_prob:
predict_distribution = self.representation(sample_data[2], hi_action)
prediction_loss = - predict_distribution.log_prob(target_next_obs).mean()
else:
predict_state = self.representation(sample_data[2], hi_action)
prediction_loss = (predict_state - target_next_obs).pow(2).mean()
representation_loss += self.prediction_coeff * prediction_loss
self.representation_optim.zero_grad()
representation_loss.backward()
self.representation_optim.step()
if self.args.save:
self.writer.add_scalar('Loss/phi_loss' + self.args.metric, representation_loss, epoch)
def slow_collect(self, batch_size=100):
if self.args.use_prediction:
transitions = self.low_buffer.sample(batch_size)
obs, obs_next = transitions['obs'], transitions['obs_next']
hi_obs, hi_action, _, hi_obs_next, _ = self.hi_buffer.sample(batch_size)
hi_obs, hi_obs_next = hi_obs[:, :self.env_params['obs']], hi_obs_next[:, :self.env_params['obs']]
train_data = np.array([obs, obs_next, hi_obs, hi_obs_next])
return train_data, hi_action
else:
# new negative samples
episode_num = self.low_buffer.current_size
obs_array = self.low_buffer.buffers['obs'][:episode_num]
episode_idxs = np.random.randint(0, episode_num, batch_size)
t_samples = np.random.randint(self.env_params['max_timesteps'] - self.k - self.delta_k, size=batch_size)
if self.delta_k > 0:
delta = np.random.randint(self.delta_k, size=batch_size)
else:
delta = 0
hi_obs = obs_array[episode_idxs, t_samples]
hi_obs_next = obs_array[episode_idxs, t_samples + self.k + delta]
obs = hi_obs
obs_next = obs_array[episode_idxs, t_samples + 1 + delta]
train_data = np.array([obs, obs_next, hi_obs, hi_obs_next])
return train_data, None
def visualize_representation(self, epoch):
transitions = self.low_buffer.sample(800)
obs = transitions['obs']
# with open('fig/final/' + "sampled_states.pkl", 'wb') as output:
# pickle.dump(obs, output)
index1 = np.where((obs[:, 0] < 4) & (obs[:, 1] < 4))
index2 = np.where((obs[:, 0] < 4) & (obs[:, 1] > 4))
index3 = np.where((obs[:, 0] > 4) & (obs[:, 1] < 4))
index4 = np.where((obs[:, 0] > 4) & (obs[:, 1] > 4))
index_lst = [index1, index2, index3, index4]
obs_tensor = torch.Tensor(obs).to(self.device)
features = self.representation(obs_tensor).detach().cpu().numpy()
plt.scatter(features[:, 0], features[:, 1], color='green')
plt.show()
# rep = []
# for index in index_lst:
# rep.append(features[index])
#
# self.plot_fig(rep, 'slow_feature', epoch)
#
#
# obs_list = []
# for index in index_lst:
# obs_list.append(obs[index])
# self.plot_fig(obs_list, 'obs', epoch)
'''
tsne_list = []
res_tsne = TSNE(n_components=2).fit_transform(obs)
for index in index_lst:
tsne_list.append(res_tsne[index])
self.plot_fig(tsne_list, 'tsne_feature', epoch)
'''
def plot_fig(self, rep, name, epoch):
fig = plt.figure()
axes = fig.add_subplot(111)
rep1, rep2, rep3, rep4 = rep
def scatter_rep(rep1, c, marker):
if rep1.shape[0] > 0:
l1 = axes.scatter(rep1[:, 0], rep1[:, 1], c=c, marker=marker)
else:
l1 = axes.scatter([], [], c=c, marker=marker)
return l1
l1 = scatter_rep(rep1, c='y', marker='s')
l2 = scatter_rep(rep2, c='r', marker='o')
l3 = scatter_rep(rep3, c='b', marker='1')
l4 = scatter_rep(rep4, c='g', marker='2')
plt.xlabel('x')
plt.ylabel('y')
axes.legend((l1, l2, l3, l4), ('space1', 'space2', 'space3', 'space4'))
plt.savefig('fig/final/' + name + str(epoch) + '.png')
plt.close()
def vis_hier_policy(self, epoch=0, load_obs=None, color_map='RdYlBu'):
obs_vec = []
hi_action_vec = []
env = self.test_env
observation = env.reset()
obs = observation['observation']
obs_vec.append(obs)
g = observation['desired_goal']
if load_obs is None:
for num in range(self.env_params['max_test_timesteps']):
with torch.no_grad():
act_obs, act_g = self._preproc_inputs(obs, g)
if num % self.c == 0:
hi_act_obs = np.concatenate((obs[:self.hi_dim], g))
hi_action = self.hi_agent.select_action(hi_act_obs, evaluate=True)
hi_action_tensor = torch.tensor(hi_action, dtype=torch.float32).unsqueeze(0).to(self.device)
ag = self.representation(torch.Tensor(obs).to(self.device)).detach().cpu().numpy()[0]
distance = np.linalg.norm(hi_action - ag)
print("distance", distance)
hi_action_vec.append(hi_action)
action = self.test_policy(act_obs[:, :self.low_dim], hi_action_tensor)
observation_new, rew, done, info = env.step(action)
if self.animate:
env.render()
obs = observation_new['observation']
obs_vec.append(obs)
if done:
if info['is_success']:
print("success !!!")
break
else:
obs_vec = load_obs[0]
plt.figure(figsize=(12, 6))
obs_vec = np.array(obs_vec)
with open('fig/final/' + "img_push_hard.pkl", 'wb') as output:
pickle.dump(obs_vec, output)
self.plot_rollout(obs_vec, "XY_{}".format(epoch * self.env_params['max_timesteps']), 121, goal=g)
if not self.learn_goal_space:
features = obs_vec[:, :2]
feature_goal = g[:2]
else:
obs_tensor = torch.Tensor(obs_vec[:, :self.hi_dim]).to(self.device)
features = self.representation(obs_tensor).detach().cpu().numpy()
# rest = (self.env_params['obs'] - self.env_params['goal']) * [0.]
# g = np.concatenate((g, np.array(rest)))
# g = torch.tensor(g, dtype=torch.float32).unsqueeze(0).to(self.device)
# feature_goal = self.representation(g).detach().cpu().numpy()[0]
feature_goal = None
hi_action_vec = np.array(hi_action_vec)
self.plot_rollout(features, "Feature_{}".format(epoch * self.env_params['max_timesteps']), 122, feature_goal, color_map="Blues",
hi_action_vec = hi_action_vec)
if load_obs is not None and len(load_obs) > 1:
obs_vec = load_obs[1]
obs_tensor = torch.Tensor(obs_vec[:, :self.hi_dim]).to(self.device)
features = self.representation(obs_tensor).detach().cpu().numpy()
self.plot_rollout(features, "Feature_{}".format(epoch * self.env_params['max_timesteps']), 122, feature_goal,
color_map="Wistia")
file_name = 'fig/rebuttal/rollout' + str(epoch) + '.png'
plt.savefig(file_name, bbox_inches='tight', transparent=True)
# plt.show()
plt.close()
def plot_rollout(self, obs_vec, name, num, goal=None, hi_action_vec=None, no_axis=True, color_map='RdYlBu'):
plt.subplot(num)
cm = plt.cm.get_cmap(color_map)
num = np.arange(obs_vec.shape[0])
plt.scatter(obs_vec[:, 0], obs_vec[:, 1], c=num, cmap=cm)
if goal is not None:
plt.scatter([goal[0]], [goal[1]], marker='*',
color='green', s=200, label='goal')
if hi_action_vec is not None:
plt.scatter(hi_action_vec[:, 0], hi_action_vec[:, 1], c="k")
plt.title(name, fontsize=24)
if no_axis:
plt.axis('off')
if not no_axis:
plt.scatter([obs_vec[0, 0]], [obs_vec[0, 1]], marker='+',
color='green', s=200, label='start')
plt.scatter([obs_vec[-1, 0]], [obs_vec[-1, 1]], marker='+',
color='red', s=200, label='end')
plt.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=14, borderaxespad=0.)
# plt.show()
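The agent above is driven through an ``env_params`` dictionary; the entry-point script is not shown here, but the keys it must provide can be read off the usages above. A hypothetical sketch (the dimension values are placeholders):
```
env_params = {
    'obs': 30,                  # observation dimension (also used as hi_dim / low_dim)
    'goal': 2,                  # desired-goal dimension
    'action': 8,                # low-level action dimension
    'action_max': 1.0,          # symmetric bound on low-level actions
    'max_timesteps': 500,       # training episode length (low-level buffer horizon)
    'max_test_timesteps': 500,  # evaluation episode length
}
# 'low_dim' and 'real_goal_dim' are filled in by hier_sac_agent.__init__ itself.
```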
import threading
import numpy as np
import torch
"""
The replay buffer here is adapted from the OpenAI Baselines code.
"""
class replay_buffer:
def __init__(self, env_params, buffer_size, sample_func, name='max_timesteps'):
self.env_params = env_params
self.T = env_params[name]
if name == 'max_timesteps':
# low level
goal_dim = env_params['real_goal_dim']
action_dim = self.env_params['action']
obs_dim = self.env_params['low_dim']
else:
# high level
goal_dim = env_params['goal']
action_dim = env_params['real_goal_dim']
obs_dim = self.env_params['hi_dim']
self.size = buffer_size // self.T
# memory management
self.current_size = 0
self.n_transitions_stored = 0
self.sample_func = sample_func
# create the buffer to store info
self.buffers = {'obs': np.empty([self.size, self.T + 1, obs_dim]),
'ag': np.empty([self.size, self.T + 1, goal_dim]),
'g': np.empty([self.size, self.T, goal_dim]),
'actions': np.empty([self.size, self.T, action_dim]),
'success': np.empty([self.size]),
'done': np.empty([self.size, self.T, 1])
}
self.position = 0 # record the index to update
# store the episode
def store_episode(self, episode_batch):
mb_obs, mb_ag, mb_g, mb_actions, success, done = episode_batch
batch_size = mb_obs.shape[0]
idxs = self._get_storage_idx(inc=batch_size)
# store the information
self.buffers['obs'][idxs] = mb_obs
self.buffers['ag'][idxs] = mb_ag
self.buffers['g'][idxs] = mb_g
self.buffers['actions'][idxs] = mb_actions
self.buffers['success'][idxs] = success
self.buffers['done'][idxs] = done
self.n_transitions_stored += self.T * batch_size
# sample the data from the replay buffer
def sample(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
if key != 'success':
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
# sample transitions
transitions = self.sample_func(temp_buffers, batch_size)
return transitions
def random_sample(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
# sample transitions
# print('start random sample', self.current_size)
T = temp_buffers['actions'].shape[1] # episode length
rollout_batch_size = temp_buffers['actions'].shape[0] # number of stored episodes
batch_size = batch_size # target batches we want to sample
episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
# which traj to sample
t_samples = np.random.randint(T, size=batch_size)
# which step to sample
transitions = {key: temp_buffers[key][episode_idxs, t_samples].copy() for key in temp_buffers.keys()}
transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()}
return transitions
def sample_traj(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
T = temp_buffers['actions'].shape[1] # episode length
num_traj = temp_buffers['actions'].shape[0] # number of stored episodes
episode_idxs = np.random.randint(0, num_traj, batch_size)
traj = {key: temp_buffers[key][episode_idxs, :].copy() for key in temp_buffers.keys()}
# note: obs and ag contain one extra timestep (length T + 1)
return traj
def get_all_data(self):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
return temp_buffers
def _get_storage_idx(self, inc=None):
inc = inc or 1
assert inc == 1
if self.current_size + inc <= self.size:
idx = np.arange(self.current_size, self.current_size + inc)
elif self.current_size < self.size:
overflow = inc - (self.size - self.current_size)
idx_a = np.arange(self.current_size, self.size)
idx_b = np.random.randint(0, self.current_size, overflow)
idx = np.concatenate([idx_a, idx_b])
else:
idx = np.array([self.position])
# idx = np.random.randint(0, self.size, inc)
self.current_size = min(self.size, self.current_size + inc)
self.position = (self.position + 1) % self.size
if inc == 1:
idx = idx[0]
return idx
# update achieved_goal in the buffer
def update_ag(self, phi, device):
all_obs = self.buffers['obs'][:self.current_size].copy()
obs = all_obs.reshape(-1, all_obs.shape[2])
obs_tensor = torch.Tensor(obs).to(device)
ag = phi(obs_tensor).detach().cpu().numpy()
goal_dim = self.buffers['ag'].shape[-1]
ag_new = ag.reshape(self.current_size, -1, goal_dim)
self.buffers["ag"][:self.current_size] = ag_new
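A brief usage sketch matching how ``hier_sac_agent`` feeds this buffer above (the per-episode arrays carry a leading batch axis of 1; the names are illustrative):
```
# mb_obs, mb_ag: shape (1, T + 1, dim); mb_g, mb_actions: shape (1, T, dim)
buffer = replay_buffer(env_params, buffer_size=int(1e6),
                       sample_func=her_module.sample_her_transitions)
buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions, success, False])
transitions = buffer.sample(256)  # HER-relabeled dict: 'obs', 'g', 'actions', 'r', 'obs_next', ...
```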
class replay_buffer_energy:
def __init__(self, env_params, buffer_size, sample_func, env_name, name='max_timesteps'):
self.env_params = env_params
self.T = env_params[name]
if name == 'max_timesteps':
goal_dim = env_params['real_goal_dim']
action_dim = self.env_params['action']
else:
goal_dim = env_params['goal']
action_dim = env_params['real_goal_dim']
self.size = buffer_size // self.T
# memory management
self.current_size = 0
self.n_transitions_stored = 0
self.sample_func = sample_func
# create the buffer to store info
self.buffers = {'obs': np.empty([self.size, self.T + 1, self.env_params['obs']]),
'ag': np.empty([self.size, self.T + 1, goal_dim]),
'g': np.empty([self.size, self.T, goal_dim]),
'actions': np.empty([self.size, self.T, action_dim]),
'e': np.empty([self.size, 1]), # energy
}
self.env_name = env_name
# store the episode
def store_episode(self, episode_batch, w_potential=1.0, w_linear=1.0, clip_energy=0.5):
mb_obs, mb_ag, mb_g, mb_actions = episode_batch
batch_size = mb_obs.shape[0]
idxs = self._get_storage_idx(inc=batch_size)
# store the information
self.buffers['obs'][idxs] = mb_obs
self.buffers['ag'][idxs] = mb_ag
self.buffers['g'][idxs] = mb_g
self.buffers['actions'][idxs] = mb_actions
self.n_transitions_stored += self.T * batch_size
buffers = {}
for key in self.buffers.keys():
buffers[key] = self.buffers[key][idxs][None].copy()
# calculate energy
if self.env_name[:5] == 'Fetch':
g, m, delta_t = 9.81, 1, 0.04
if self.env_name[:9] == 'FetchPush':
potential_energy = 0.
else:
height = buffers['ag'][:, :, 2]
height_0 = np.repeat(height[:, 0].reshape(-1, 1), height[:, 1::].shape[1], axis=1)
height = height[:, 1::] - height_0
potential_energy = g * m * height
diff = np.diff(buffers['ag'], axis=1)
velocity = diff / delta_t
kinetic_energy = 0.5 * m * np.power(velocity, 2)
kinetic_energy = np.sum(kinetic_energy, axis=2)
energy_total = w_potential * potential_energy + w_linear * kinetic_energy
energy_diff = np.diff(energy_total, axis=1)
energy_transition = energy_total.copy()
energy_transition[:, 1::] = energy_diff.copy()
energy_transition = np.clip(energy_transition, 0, clip_energy)
energy_transition_total = np.sum(energy_transition, axis=1)
energy_final = np.sum(energy_transition_total.reshape(-1, 1))
self.buffers['e'][idxs, 0] = energy_final
else:
print('Trajectory Energy Function Not Implemented')
# sample the data from the replay buffer
def sample(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
# sample transitions
transitions = self.sample_func(temp_buffers, batch_size)
return transitions
def random_sample(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
# sample transitions
# print('start random sample', self.current_size)
T = temp_buffers['actions'].shape[1] # episode length
rollout_batch_size = temp_buffers['actions'].shape[0] # number of stored episodes
batch_size = batch_size # target batches we want to sample
episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
# which traj to sample
t_samples = np.random.randint(T, size=batch_size)
# which step to sample
transitions = {key: temp_buffers[key][episode_idxs, t_samples].copy() for key in temp_buffers.keys()}
transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()}
return transitions
def sample_traj(self, batch_size):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
T = temp_buffers['actions'].shape[1] # episode length
num_traj = temp_buffers['actions'].shape[0] # number of stored episodes
episode_idxs = np.random.randint(0, num_traj, batch_size)
traj = {key: temp_buffers[key][episode_idxs, :].copy() for key in temp_buffers.keys()}
# note: obs and ag contain one extra timestep (length T + 1)
return traj
def get_all_data(self):
temp_buffers = {}
for key in self.buffers.keys():
temp_buffers[key] = self.buffers[key][:self.current_size]
temp_buffers['obs_next'] = temp_buffers['obs'][:, 1:, :]
temp_buffers['ag_next'] = temp_buffers['ag'][:, 1:, :]
return temp_buffers
def _get_storage_idx(self, inc=None):
inc = inc or 1
if self.current_size + inc <= self.size:
idx = np.arange(self.current_size, self.current_size + inc)
elif self.current_size < self.size:
overflow = inc - (self.size - self.current_size)
idx_a = np.arange(self.current_size, self.size)
idx_b = np.random.randint(0, self.current_size, overflow)
idx = np.concatenate([idx_a, idx_b])
else:
idx = np.random.randint(0, self.size, inc)
self.current_size = min(self.size, self.current_size + inc)
if inc == 1:
idx = idx[0]
return idx
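The per-trajectory energy stored under ``'e'`` is consumed by ``her_sampler.sample_her_energy`` above, which samples whole trajectories with probability proportional to a temperature-scaled power of their energy. A minimal sketch of that weighting (``energy`` is assumed to be the ``(current_size, 1)`` slice of the ``'e'`` buffer):
```
import numpy as np

def trajectory_probs(energy, temperature=1.0):
    # higher-energy trajectories are sampled more often;
    # a larger temperature flattens the distribution toward uniform
    p = np.power(energy, 1.0 / (temperature + 1e-2))
    return (p / p.sum()).flatten()
```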
### Description
------------
Reimplementation of [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) and a deterministic variant of SAC from [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement
Learning with a Stochastic Actor](https://arxiv.org/pdf/1801.01290.pdf).
Added another branch for [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement
Learning with a Stochastic Actor](https://arxiv.org/pdf/1801.01290.pdf) -> [SAC_V](https://github.com/pranz24/pytorch-soft-actor-critic/tree/SAC_V).
### Requirements
------------
* [mujoco-py](https://github.com/openai/mujoco-py)
* [TensorboardX](https://github.com/lanpa/tensorboardX)
* [PyTorch](http://pytorch.org/)
### Default Arguments and Usage
------------
### Usage
```
usage: main.py [-h] [--env-name ENV_NAME] [--policy POLICY] [--eval EVAL]
[--gamma G] [--tau G] [--lr G] [--alpha G]
[--automatic_entropy_tuning G] [--seed N] [--batch_size N]
[--num_steps N] [--hidden_size N] [--updates_per_step N]
[--start_steps N] [--target_update_interval N]
[--replay_size N] [--cuda]
```
(Note: There is no need for setting Temperature(`--alpha`) if `--automatic_entropy_tuning` is True.)
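For example (assuming the flag takes an explicit boolean value, as listed in the arguments below):
```
python main.py --env-name Humanoid-v2 --automatic_entropy_tuning True
```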
#### For SAC
```
python main.py --env-name Humanoid-v2 --alpha 0.05
```
#### For SAC (Hard Update)
```
python main.py --env-name Humanoid-v2 --alpha 0.05 --tau 1 --target_update_interval 1000
```
#### For SAC (Deterministic, Hard Update)
```
python main.py --env-name Humanoid-v2 --policy Deterministic --tau 1 --target_update_interval 1000
```
### Arguments
------------
```
PyTorch Soft Actor-Critic Args
optional arguments:
-h, --help show this help message and exit
--env-name ENV_NAME Mujoco Gym environment (default: HalfCheetah-v2)
--policy POLICY Policy Type: Gaussian | Deterministic (default:
Gaussian)
--eval EVAL Evaluates the policy every 10 episodes (default:
True)
--gamma G discount factor for reward (default: 0.99)
--tau G target smoothing coefficient (τ) (default: 5e-3)
--lr G learning rate (default: 3e-4)
--alpha G Temperature parameter α determines the relative
importance of the entropy term against the reward
(default: 0.2)
--automatic_entropy_tuning G
Automatically adjust α (default: False)
--seed N random seed (default: 123456)
--batch_size N batch size (default: 256)
--num_steps N maximum number of steps (default: 1e6)
--hidden_size N hidden size (default: 256)
--updates_per_step N model updates per simulator step (default: 1)
--start_steps N Steps sampling random actions (default: 1e4)
--target_update_interval N
Number of updates between target value network updates
(default: 1)
--replay_size N size of replay buffer (default: 1e6)
--cuda run on CUDA (default: False)
```
| Environment **(`--env-name`)**| Temperature **(`--alpha`)**|
| ---------------| -------------|
| HalfCheetah-v2| 0.2|
| Hopper-v2| 0.2|
| Walker2d-v2| 0.2|
| Ant-v2| 0.2|
| Humanoid-v2| 0.05|
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
epsilon = 1e-6
# Initialize Policy weights
def weights_init_(m):
if isinstance(m, nn.Linear):
torch.nn.init.xavier_uniform_(m.weight, gain=1)
torch.nn.init.constant_(m.bias, 0)
class ValueNetwork(nn.Module):
def __init__(self, num_inputs, hidden_dim):
super(ValueNetwork, self).__init__()
self.linear1 = nn.Linear(num_inputs, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
self.apply(weights_init_)
def forward(self, state):
x = F.relu(self.linear1(state))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class QNetwork(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim):
super(QNetwork, self).__init__()
# Q1 architecture
self.linear1 = nn.Linear(num_inputs + num_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
# Q2 architecture
self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim)
self.linear5 = nn.Linear(hidden_dim, hidden_dim)
self.linear6 = nn.Linear(hidden_dim, 1)
self.apply(weights_init_)
def forward(self, state, action):
xu = torch.cat([state, action], 1)
x1 = F.relu(self.linear1(xu))
x1 = F.relu(self.linear2(x1))
x1 = self.linear3(x1)
x2 = F.relu(self.linear4(xu))
x2 = F.relu(self.linear5(x2))
x2 = self.linear6(x2)
return x1, x2
class QNetwork_out(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim):
super(QNetwork_out, self).__init__()
# Q1 architecture
self.linear1 = nn.Linear(num_inputs, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, num_actions)
self.apply(weights_init_)
def forward(self, state):
x1 = F.relu(self.linear1(state))
x1 = F.relu(self.linear2(x1))
x1 = self.linear3(x1)
return x1
class QNetwork_phi(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim, abs_range, tanh_output):
super(QNetwork_phi, self).__init__()
# Q1 network
# obs encoder
obs_models = [nn.Linear(num_inputs-2, hidden_dim)]
obs_models += [nn.ReLU(), nn.Linear(hidden_dim, hidden_dim)]
obs_models += [nn.ReLU(), nn.Linear(hidden_dim, 2)]
self.obs_encoder = nn.Sequential(*obs_models)
# goal input
self.action_input = nn.Linear(num_actions+2, int(hidden_dim / 2))
self.dynamics_layer = nn.Linear(int(hidden_dim / 2) + 2, hidden_dim)
self.output_layer = nn.Linear(hidden_dim, 1)
# Q2 architecture
self.linear4 = nn.Linear(num_inputs + num_actions, hidden_dim)
self.linear5 = nn.Linear(hidden_dim, hidden_dim)
self.linear6 = nn.Linear(hidden_dim, 1)
self.tanh_output = tanh_output
self.abs_range = abs_range
self.apply(weights_init_)
def forward(self, state, action):
xu = torch.cat([state, action], 1)
x2 = F.relu(self.linear4(xu))
x2 = F.relu(self.linear5(x2))
x2 = self.linear6(x2)
state = state[:, :-2]
action = torch.cat([state[:, -2:], action], 1)
latent_s = self.obs_encoder(state)
if self.tanh_output:
latent_s = self.abs_range * torch.tanh(latent_s)
action_out = self.action_input(action)
action_out = F.relu(action_out)
x = torch.cat([latent_s, action_out], 1)
x = self.dynamics_layer(x)
x = F.relu(x)
x1 = self.output_layer(x)
return x1, x2
def phi(self, obs):
if len(obs.shape) == 1:
obs = obs.unsqueeze(0)
s = self.obs_encoder(obs)
if self.tanh_output:
s = self.abs_range * torch.tanh(s)
return s
class GaussianPolicy(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim, action_space, goal_dim):
super(GaussianPolicy, self).__init__()
# self.linear1 = nn.Linear(num_inputs - goal_dim, hidden_dim)
# self.goal_input = nn.Linear(goal_dim, hidden_dim)
self.linear1 = nn.Linear(num_inputs, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.goal_dim = goal_dim
self.mean_linear = nn.Linear(hidden_dim, num_actions)
self.log_std_linear = nn.Linear(hidden_dim, num_actions)
# add phi layer
# self.phi_layer1 = nn.Linear(hidden_dim, hidden_dim)
# self.phi_layer2 = nn.Linear(hidden_dim, 2)
# self.phi_layer3 = nn.Linear(hidden_dim, 2)
self.apply(weights_init_)
# action rescaling
if action_space is None:
self.action_scale = torch.tensor(1.)
self.action_bias = torch.tensor(0.)
else:
self.action_scale = torch.FloatTensor(
(action_space.high - action_space.low) / 2.)
self.action_bias = torch.FloatTensor(
(action_space.high + action_space.low) / 2.)
def forward(self, state):
# x = self.linear1(state[..., :-self.goal_dim]) + self.goal_input(state[..., -self.goal_dim:])
x = self.linear1(state)
x = F.relu(x)
x = F.relu(self.linear2(x))
mean = self.mean_linear(x)
log_std = self.log_std_linear(x)
log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX)
return mean, log_std
def phi(self, state):
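# NOTE: this helper relies on the phi_layer1/phi_layer2 modules that are commented out in
# __init__ above; it is only usable if those layers are restored (it is exercised only on
# the args.gradient_flow path of the hierarchical agent).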
x = F.relu(self.linear1(state))
x = F.relu(self.phi_layer1(x))
phi = self.phi_layer2(x)
return phi
def sample(self, state):
mean, log_std = self.forward(state)
std = log_std.exp()
normal = Normal(mean, std)
x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1))
y_t = torch.tanh(x_t)
action = y_t * self.action_scale + self.action_bias
log_prob = normal.log_prob(x_t)
# Enforcing Action Bound
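# tanh change of variables: log pi(a) = log N(x) - sum log(action_scale * (1 - tanh(x)^2) + epsilon)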
log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon)
log_prob = log_prob.sum(1, keepdim=True)
mean = torch.tanh(mean) * self.action_scale + self.action_bias
return action, log_prob, mean
def to(self, device):
self.action_scale = self.action_scale.to(device)
self.action_bias = self.action_bias.to(device)
return super(GaussianPolicy, self).to(device)
class DeterministicPolicy(nn.Module):
def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None):
super(DeterministicPolicy, self).__init__()
self.linear1 = nn.Linear(num_inputs, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.mean = nn.Linear(hidden_dim, num_actions)
self.noise = torch.Tensor(num_actions)
self.apply(weights_init_)
# action rescaling
if action_space is None:
self.action_scale = 1.
self.action_bias = 0.
else:
self.action_scale = torch.FloatTensor(
(action_space.high - action_space.low) / 2.)
self.action_bias = torch.FloatTensor(
(action_space.high + action_space.low) / 2.)
def forward(self, state):
x = F.relu(self.linear1(state))
x = F.relu(self.linear2(x))
mean = torch.tanh(self.mean(x)) * self.action_scale + self.action_bias
return mean
def sample(self, state):
mean = self.forward(state)
noise = self.noise.normal_(0., std=0.1)
noise = noise.clamp(-0.25, 0.25)
action = mean + noise
return action, torch.tensor(0.), mean
def to(self, device):
self.action_scale = self.action_scale.to(device)
self.action_bias = self.action_bias.to(device)
self.noise = self.noise.to(device)
return super(DeterministicPolicy, self).to(device)
import random
import numpy as np
class ReplayMemory:
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done, epoch):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done, epoch+1)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size)
state, action, reward, next_state, done, _ = map(np.stack, zip(*batch))
return state, action, reward, next_state, done
def __len__(self):
return len(self.buffer)
def get_obs(self):
obs = [x[0] for x in self.buffer]
obs = np.array(obs)
obs_next = [x[3] for x in self.buffer]
obs_next = np.array(obs_next)
return obs.copy(), obs_next.copy()
def pri_sample(self, batch_size, temperature=1.):
tmp_buffer = np.array(self.buffer)
epoch = tmp_buffer[:, -1]
p_trajectory = np.power(epoch, 1 / (temperature + 1e-2))
p_trajectory = p_trajectory / p_trajectory.sum()
p_trajectory = p_trajectory.astype(np.float64)
idxs = np.random.choice(len(self.buffer), size=batch_size, replace=False, p=p_trajectory)
batch = [self.buffer[i] for i in idxs]
state, action, reward, next_state, done, _ = map(np.stack, zip(*batch))
return state, action, reward, next_state, done
def random_sample(self, batch_size):
idxs = np.random.randint(0, len(self.buffer), batch_size)
obs = [self.buffer[i][0] for i in idxs]
obs = np.array(obs)
obs_next = [self.buffer[i][3] for i in idxs]
obs_next = np.array(obs_next)
return obs, obs_next
class Array_ReplayMemory:
def __init__(self, capacity, env_params):
self.capacity = capacity
action_dim = env_params['real_goal_dim']
obs_dim = env_params['obs'] + env_params['goal']
# create the buffer to store info
self.buffers = {'obs': np.empty([capacity, obs_dim]),
'actions': np.empty([capacity, action_dim]),
'reward': np.empty([capacity]),
'next_obs': np.empty([capacity, obs_dim]),
'done': np.empty([capacity])
}
self.position = 0
self.current_size = 0
def push(self, state, action, reward, next_state, done, epoch):
self.buffers['obs'][self.position] = state
self.buffers['actions'][self.position] = action
self.buffers['reward'][self.position] = reward
self.buffers['next_obs'][self.position] = next_state
self.buffers['done'][self.position] = done
self.position = (self.position + 1) % self.capacity
if self.current_size < self.capacity:
    self.current_size += 1
def sample(self, batch_size):
idx = np.random.randint(0, self.current_size, batch_size)
state = self.buffers['obs'][idx]
action = self.buffers['actions'][idx]
reward = self.buffers['reward'][idx]
next_state = self.buffers['next_obs'][idx]
done = self.buffers['done'][idx]
return state, action, reward, next_state, done
def __len__(self):
return self.current_size
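# Illustrative sketch (hypothetical transition contents): push() appends
# (state, action, reward, next_state, done, epoch + 1) tuples and sample()
# returns them as stacked numpy arrays, dropping the epoch field.
if __name__ == '__main__':
    memory = ReplayMemory(capacity=1000)
    for t in range(256):
        s = np.random.randn(4)
        a = np.random.randn(2)
        memory.push(s, a, reward=-1.0, next_state=s + 0.1, done=False, epoch=t // 50)
    states, actions, rewards, next_states, dones = memory.sample(batch_size=32)
    print(states.shape, actions.shape, rewards.shape)   # (32, 4) (32, 2) (32,)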
import os
import torch
import torch.nn.functional as F
from torch.optim import Adam
from algos.sac.utils import soft_update, hard_update
from algos.sac.model import GaussianPolicy, QNetwork, DeterministicPolicy, QNetwork_phi
class SAC(object):
def __init__(self, num_inputs, action_space, args, pri_replay, goal_dim, gradient_flow_value, abs_range, tanh_output):
self.gamma = args.gamma
self.tau = args.tau
self.alpha = args.alpha
self.pri_replay = pri_replay
self.policy_type = args.policy
self.target_update_interval = args.target_update_interval
self.automatic_entropy_tuning = args.automatic_entropy_tuning
self.device = args.device
self.gradient_flow_value = gradient_flow_value
if not gradient_flow_value:
self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device=self.device)
self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(self.device)
hard_update(self.critic_target, self.critic)
else:
self.critic = QNetwork_phi(num_inputs, action_space.shape[0], args.hidden_size, abs_range, tanh_output).to(device=self.device)
self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)
self.critic_target = QNetwork_phi(num_inputs, action_space.shape[0], args.hidden_size, abs_range, tanh_output).to(self.device)
hard_update(self.critic_target, self.critic)
if self.policy_type == "Gaussian":
# Target Entropy = −dim(A) (e.g., -6 for HalfCheetah-v2), as given in the paper
if self.automatic_entropy_tuning is True:
self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space, goal_dim).to(self.device)
self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
self.policy_target = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space,
goal_dim).to(self.device)
hard_update(self.policy_target, self.policy)
else:
self.alpha = 0
self.automatic_entropy_tuning = False
self.policy = DeterministicPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(self.device)
self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
def select_action(self, state, evaluate=False):
state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
if evaluate is False:
action, _, _ = self.policy.sample(state)
else:
_, _, action = self.policy.sample(state)
return action.detach().cpu().numpy()[0]
def update_parameters(self, memory, batch_size, env_params, hi_sparse, feature_data):
# Sample a batch from memory
if self.pri_replay:
state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.pri_sample(batch_size=batch_size)
else:
state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(batch_size=batch_size)
state_batch = torch.FloatTensor(state_batch).to(self.device)
next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
action_batch = torch.FloatTensor(action_batch).to(self.device)
reward_batch = torch.FloatTensor(reward_batch).to(self.device).unsqueeze(1)
mask_batch = torch.FloatTensor(mask_batch).to(self.device).unsqueeze(1)
with torch.no_grad():
next_state_action, next_state_log_pi, _ = self.policy.sample(next_state_batch)
qf1_next_target, qf2_next_target = self.critic_target(next_state_batch, next_state_action)
min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
# print("min_qf_target", min_qf_next_target.shape)
next_q_value = reward_batch + mask_batch * self.gamma * (min_qf_next_target)
if hi_sparse:
# clip target value
next_q_value = torch.clamp(next_q_value, -env_params['max_timesteps'], 0.)
qf1, qf2 = self.critic(state_batch, action_batch) # Two Q-functions to mitigate positive bias in the policy improvement step
# print("qf1", qf1.shape)
# print("next_q", next_q_value.shape)
qf1_loss = F.mse_loss(qf1, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
qf2_loss = F.mse_loss(qf2, next_q_value) # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
pi, log_pi, _ = self.policy.sample(state_batch)
qf1_pi, qf2_pi = self.critic(state_batch, pi)
min_qf_pi = torch.min(qf1_pi, qf2_pi)
policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean() # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
if feature_data is not None:
if self.gradient_flow_value:
obs, obs_next = self.critic.phi(feature_data[0]), self.critic.phi(feature_data[1])
min_dist = torch.clamp((obs - obs_next).pow(2).mean(dim=1), min=0.)
hi_obs, hi_obs_next = self.critic.phi(feature_data[2]), self.critic.phi(feature_data[3])
max_dist = torch.clamp(1 - (hi_obs - hi_obs_next).pow(2).mean(dim=1), min=0.)
representation_loss = (min_dist + max_dist).mean()
qf1_loss = qf1_loss * 0.1 + representation_loss
else:
obs, obs_next = self.policy.phi(feature_data[0]), self.policy.phi(feature_data[1])
min_dist = torch.clamp((obs - obs_next).pow(2).mean(dim=1), min=0.)
hi_obs, hi_obs_next = self.policy.phi(feature_data[2]), self.policy.phi(feature_data[3])
max_dist = torch.clamp(1 - (hi_obs - hi_obs_next).pow(2).mean(dim=1), min=0.)
representation_loss = (min_dist + max_dist).mean()
policy_loss += representation_loss
self.critic_optim.zero_grad()
qf1_loss.backward()
self.critic_optim.step()
self.critic_optim.zero_grad()
qf2_loss.backward()
self.critic_optim.step()
self.policy_optim.zero_grad()
policy_loss.backward()
self.policy_optim.step()
if self.automatic_entropy_tuning:
alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
self.alpha_optim.zero_grad()
alpha_loss.backward()
self.alpha_optim.step()
self.alpha = self.log_alpha.exp()
alpha_tlogs = self.alpha.clone() # For TensorboardX logs
else:
alpha_loss = torch.tensor(0.).to(self.device)
alpha_tlogs = torch.tensor(self.alpha) # For TensorboardX logs
soft_update(self.critic_target, self.critic, self.tau)
soft_update(self.policy_target, self.policy, self.tau)
return qf1_loss.item(), qf2_loss.item(), policy_loss.item(), alpha_loss.item(), alpha_tlogs.item()
# Save model parameters
def save_model(self, env_name, suffix="", actor_path=None, critic_path=None):
if not os.path.exists('models/'):
os.makedirs('models/')
if actor_path is None:
actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
if critic_path is None:
critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
print('Saving models to {} and {}'.format(actor_path, critic_path))
torch.save(self.policy.state_dict(), actor_path)
torch.save(self.critic.state_dict(), critic_path)
# Load model parameters
def load_model(self, actor_path, critic_path):
print('Loading models from {} and {}'.format(actor_path, critic_path))
if actor_path is not None:
self.policy.load_state_dict(torch.load(actor_path))
if critic_path is not None:
self.critic.load_state_dict(torch.load(critic_path))
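# Illustrative sketch (hypothetical sizes and hyperparameters; the full set normally
# comes from the repo's get_args_* parsers): construct the agent with the plain
# QNetwork path (gradient_flow_value=False) and query an action for a random state.
if __name__ == '__main__':
    import argparse
    import numpy as np
    from gym import spaces
    args = argparse.Namespace(gamma=0.99, tau=0.005, alpha=0.2, policy="Gaussian",
                              target_update_interval=1, automatic_entropy_tuning=False,
                              device="cpu", hidden_size=256, lr=3e-4)
    action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)
    agent = SAC(num_inputs=32, action_space=action_space, args=args, pri_replay=False,
                goal_dim=2, gradient_flow_value=False, abs_range=20.0, tanh_output=False)
    action = agent.select_action(np.random.randn(32).astype(np.float32))
    print(action.shape)   # (2,)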
import math
import torch
def create_log_gaussian(mean, log_std, t):
quadratic = -((0.5 * (t - mean) / (log_std.exp())).pow(2))
l = mean.shape
log_z = log_std
z = l[-1] * math.log(2 * math.pi)
log_p = quadratic.sum(dim=-1) - log_z.sum(dim=-1) - 0.5 * z
return log_p
def logsumexp(inputs, dim=None, keepdim=False):
if dim is None:
inputs = inputs.view(-1)
dim = 0
s, _ = torch.max(inputs, dim=dim, keepdim=True)
outputs = s + (inputs - s).exp().sum(dim=dim, keepdim=True).log()
if not keepdim:
outputs = outputs.squeeze(dim)
return outputs
def soft_update(target, source, tau):
for target_param, param in zip(target.parameters(), source.parameters()):
target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
def hard_update(target, source):
for target_param, param in zip(target.parameters(), source.parameters()):
target_param.data.copy_(param.data)
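# Illustrative sketch: soft_update performs Polyak averaging,
# target <- (1 - tau) * target + tau * source, while hard_update copies outright.
if __name__ == '__main__':
    src = torch.nn.Linear(4, 4)
    tgt = torch.nn.Linear(4, 4)
    hard_update(tgt, src)              # tgt now equals src
    soft_update(tgt, src, tau=0.005)   # tgt keeps tracking src
    diff = max((p1 - p2).abs().max().item()
               for p1, p2 in zip(tgt.parameters(), src.parameters()))
    print(diff)   # ~0 (identical up to floating-point rounding)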
import numpy as np
class normalizer:
def __init__(self, size, eps=1e-2, default_clip_range=np.inf):
self.size = size
self.eps = eps
self.default_clip_range = default_clip_range
# some local information
self.local_sum = np.zeros(self.size, np.float32)
self.local_sumsq = np.zeros(self.size, np.float32)
self.local_count = np.zeros(1, np.float32)
# get the total sum sumsq and sum count
self.total_sum = np.zeros(self.size, np.float32)
self.total_sumsq = np.zeros(self.size, np.float32)
self.total_count = np.ones(1, np.float32)
# get the mean and std
self.mean = np.zeros(self.size, np.float32)
self.std = np.ones(self.size, np.float32)
# thread locker
# update the parameters of the normalizer
def update(self, v):
v = v.reshape(-1, self.size)
self.local_sum += v.sum(axis=0)
self.local_sumsq += (np.square(v)).sum(axis=0)
self.local_count[0] += v.shape[0]
def recompute_stats(self):
local_count = self.local_count.copy()
local_sum = self.local_sum.copy()
local_sumsq = self.local_sumsq.copy()
# reset
self.local_count[...] = 0
self.local_sum[...] = 0
self.local_sumsq[...] = 0
# update the total stuff
self.total_sum += local_sum
self.total_sumsq += local_sumsq
self.total_count += local_count
# calculate the new mean and std
self.mean = self.total_sum / self.total_count
self.std = np.sqrt(np.maximum(np.square(self.eps), (self.total_sumsq / self.total_count) - np.square(
self.total_sum / self.total_count)))
# normalize the observation
def normalize(self, v, clip_range=None):
# print('now normalize', v)
if clip_range is None:
clip_range = self.default_clip_range
# print((v - self.mean) / (self.std))
return np.clip((v - self.mean) / (self.std), -clip_range, clip_range)
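# Illustrative sketch (hypothetical data): update() accumulates batch statistics,
# recompute_stats() folds them into the running mean/std, and normalize() whitens
# and clips new inputs to the configured range.
if __name__ == '__main__':
    norm = normalizer(size=3, default_clip_range=5.0)
    batch = np.random.randn(100, 3) * 2.0 + 1.0
    norm.update(batch)
    norm.recompute_stats()
    print(norm.mean, norm.std)          # roughly [1, 1, 1] and [2, 2, 2]
    print(norm.normalize(batch[:2]))    # whitened and clipped to [-5, 5]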
import numpy as np
import torch
from torch import nn
class RandomPolicy(nn.Module):
def __init__(self, action_space, is_binary=False):
nn.Module.__init__(self)
self.action_space = action_space
self.is_binary = is_binary
self.discrete = ('n' in vars(self.action_space))
def random(self):
if self.discrete:
return np.random.randint(self.action_space.n)
else:
low = np.array(self.action_space.low)
high = np.array(self.action_space.high)
if self.is_binary:
return np.random.randint(3, size=self.action_space.shape) - 1
return np.random.random(size=self.action_space.shape) * (high - low) + low
def forward(self, obs, *args):
if isinstance(obs, dict): # goal conditioned environment
obs = obs['observation']
act = torch.Tensor(np.stack([self.random() for i in range(len(obs))], axis=0))
if self.discrete:
act = act.long()
return act
def reset(self, i):
pass
import numpy as np
# [reference] https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py
class RandomProcess(object):
def reset_states(self):
pass
class AnnealedGaussianProcess(RandomProcess):
def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
self.mu = mu
self.sigma = sigma
self.n_steps = 0
if sigma_min is not None:
self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
self.c = sigma
self.sigma_min = sigma_min
else:
self.m = 0.
self.c = sigma
self.sigma_min = sigma
@property
def current_sigma(self):
sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c)
return sigma
# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min,
n_steps_annealing=n_steps_annealing)
self.theta = theta
self.mu = mu
self.dt = dt
self.x0 = x0
self.size = size
self.reset_states()
def sample(self):
x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(
self.dt) * np.random.normal(size=self.size)
self.x_prev = x
self.n_steps += 1
return x
def reset_states(self):
self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size)
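# Illustrative sketch (hypothetical parameters): temporally correlated exploration
# noise; consecutive samples are coupled through x_prev, and reset_states() starts
# a fresh noise trajectory.
if __name__ == '__main__':
    ou = OrnsteinUhlenbeckProcess(theta=0.15, mu=0., sigma=0.2, dt=1e-2, size=2)
    noise = np.stack([ou.sample() for _ in range(5)])
    print(noise.shape)   # (5, 2)
    ou.reset_states()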
import argparse
"""
Here are the param for the training
"""
def get_args_ant():
parser = argparse.ArgumentParser()
# the environment setting
parser.add_argument('--env-name', type=str, default='AntMaze1-v1', help='the environment name')
parser.add_argument('--test', type=str, default='AntMaze1Test-v1')
parser.add_argument('--n-epochs', type=int, default=20000, help='the number of epochs to train the agent')
parser.add_argument('--n-batches', type=int, default=200, help='the times to update the network')
parser.add_argument('--seed', type=int, default=125, help='random seed')
parser.add_argument('--replay-strategy', type=str, default='none', help='the HER strategy')
parser.add_argument('--save-dir', type=str, default='saved_models/', help='the path to save the models')
parser.add_argument('--noise-eps', type=float, default=0.2, help='noise factor for Gaussian')
parser.add_argument('--random-eps', type=float, default=0.2, help="prob for acting randomly")
parser.add_argument('--buffer-size', type=int, default=int(1e6), help='the size of the buffer')
parser.add_argument('--replay-k', type=int, default=5, help='ratio to be replaced')
parser.add_argument('--future-step', type=int, default=200, help='future step to be sampled')
parser.add_argument('--batch-size', type=int, default=128, help='the sample batch size')
parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor')
parser.add_argument('--action-l2', type=float, default=0.0, help='l2 reg')
parser.add_argument('--lr-actor', type=float, default=0.0002, help='the learning rate of the actor')
parser.add_argument('--lr-critic', type=float, default=0.0002, help='the learning rate of the critic')
parser.add_argument('--polyak', type=float, default=0.99, help='the average coefficient')
parser.add_argument('--n-test-rollouts', type=int, default=10, help='the number of tests')
parser.add_argument('--metric', type=str, default='MLP', help='the metric for the distance embedding')
parser.add_argument('--device', type=str, default="cuda:3", help='cuda device')
parser.add_argument('--lr-decay-actor', type=int, default=3000, help='actor learning rate decay')
parser.add_argument('--lr-decay-critic', type=int, default=3000, help='critic learning rate decay')
parser.add_argument('--layer', type=int, default=6, help='number of layers for critic')
parser.add_argument('--period', type=int, default=3, help='target update period')
parser.add_argument('--distance', type=float, default=0.1, help='distance threshold for HER')
parser.add_argument('--resume', type=bool, default=False, help='resume or not')
# Will be considered only if resume is True
parser.add_argument('--resume-epoch', type=int, default=0, help='resume epoch')
parser.add_argument('--resume-path', type=str, default='saved_models/AntPush-v1_Nov16_08-30-42', help='resume path')
# add for hier policy
parser.add_argument('--save', type=bool, default=True, help='save model and tensorboard data')
parser.add_argument('--animate', type=bool, default=False)
parser.add_argument("--eval", type=bool, default=False)
parser.add_argument('--eval_interval', type=int, default=50, help="evaluate once every n episodes")
parser.add_argument('--c', type=int, default=50, help="interval of high-level action")
parser.add_argument('--gradient_flow', type=bool, default=False, help='end-to-end learn feature and policy')
parser.add_argument('--gradient_flow_value', type=bool, default=False, help='use the slow feature as an embedding of the value function')
parser.add_argument('--abs_range', type=float, default=20.0, help='range of high-level action space')
parser.add_argument('--use_target', type=bool, default=False, help='use target network for learning feature')
parser.add_argument('--early_stop', type=bool, default=False, help='early stop the learning of low-level')
parser.add_argument('--low_reward_coeff', type=float, default=0.1, help='low-level reward coeff')
parser.add_argument("--use_prediction", type=bool, default=False, help='use prediction error to learn feature')
parser.add_argument("--start_update_phi", type=int, default=10, help='use prediction error to learn feature')
parser.add_argument("--image", type=bool, default=False, help='use image input')
parser.add_argument("--old_sample", type=bool, default=False, help='sample the absolute goal in the abs_range')
# args of sac
parser.add_argument('--policy', default="Gaussian",
help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
help='Temperature parameter α determines the relative importance of the entropy\
term against the reward (default: 0.2)')
parser.add_argument('--automatic_entropy_tuning', type=bool, default=False, metavar='G',
                    help='Automatically adjust α (default: False)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
help='hidden size (default: 256)')
parser.add_argument('--start_epoch', type=int, default=300, metavar='N',
                    help='Epochs sampling random actions (default: 300)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
help='Value target update per no. of updates per step (default: 1)')
args = parser.parse_args()
return args
def get_args_chain():
parser = argparse.ArgumentParser()
# the environment setting
parser.add_argument('--env-name', type=str, default='NChain-v1', help='the environment name')
parser.add_argument('--test', type=str, default='NChain-v1')
parser.add_argument('--n-epochs', type=int, default=100, help='the number of epochs to train the agent')
parser.add_argument('--n-batches', type=int, default=200, help='the times to update the network')
parser.add_argument('--seed', type=int, default=160, help='random seed')
parser.add_argument('--replay-strategy', type=str, default='none', help='the HER strategy')
parser.add_argument('--save-dir', type=str, default='saved_models/', help='the path to save the models')
parser.add_argument('--noise-eps', type=float, default=0.2, help='noise factor for Gaussian')
parser.add_argument('--random-eps', type=float, default=0.2, help="prob for acting randomly")
parser.add_argument('--buffer-size', type=int, default=int(1e6), help='the size of the buffer')
parser.add_argument('--replay-k', type=int, default=5, help='ratio to be replaced')
parser.add_argument('--future-step', type=int, default=200, help='future step to be sampled')
parser.add_argument('--batch-size', type=int, default=128, help='the sample batch size')
parser.add_argument('--gamma', type=float, default=0.99, help='the discount factor')
parser.add_argument('--action-l2', type=float, default=0.0, help='l2 reg')
parser.add_argument('--lr-actor', type=float, default=0.0002, help='the learning rate of the actor')
parser.add_argument('--lr-critic', type=float, default=0.0002, help='the learning rate of the critic')
parser.add_argument('--polyak', type=float, default=0.99, help='the average coefficient')
parser.add_argument('--n-test-rollouts', type=int, default=10, help='the number of tests')
parser.add_argument('--metric', type=str, default='MLP', help='the metric for the distance embedding')
parser.add_argument('--device', type=str, default="cuda:8", help='cuda device')
parser.add_argument('--lr-decay-actor', type=int, default=3000, help='actor learning rate decay')
parser.add_argument('--lr-decay-critic', type=int, default=3000, help='critic learning rate decay')
parser.add_argument('--layer', type=int, default=6, help='number of layers for critic')
parser.add_argument('--period', type=int, default=3, help='target update period')
parser.add_argument('--distance', type=float, default=0.1, help='distance threshold for HER')
parser.add_argument('--resume', type=bool, default=False, help='resume or not')
# Will be considered only if resume is True
parser.add_argument('--resume-epoch', type=int, default=0, help='resume epoch')
parser.add_argument('--resume-path', type=str, default='saved_models/NChain-v1_Jul29_11-02-57', help='resume path')
# add for hier policy
parser.add_argument('--save', type=bool, default=True, help='save model and tensorboard data')
parser.add_argument('--animate', type=bool, default=False)
parser.add_argument("--eval", type=bool, default=False)
parser.add_argument('--eval_interval', type=int, default=50, help="evaluate once every n episodes")
parser.add_argument('--c', type=int, default=30, help="interval of high-level action")
parser.add_argument('--gradient_flow', type=bool, default=False, help='end-to-end learn feature and policy')
parser.add_argument('--gradient_flow_value', type=bool, default=False, help='use the slow feature as an embedding of the value function')
parser.add_argument('--abs_range', type=float, default=100.0, help='range of high-level action space')
parser.add_argument('--use_target', type=bool, default=False, help='use target network for learning feature')
parser.add_argument('--early_stop', type=bool, default=True, help='early stop the learning of low-level')
parser.add_argument('--low_reward_coeff', type=float, default=0.01, help='low-level reward coeff')
parser.add_argument("--use_prediction", type=bool, default=False, help='use prediction error to learn feature')
parser.add_argument("--start_update_phi", type=int, default=2, help='use prediction error to learn feature')
parser.add_argument("--image", type=bool, default=False, help='use image input')
# args of sac (high-level learning)
parser.add_argument('--policy', default="Gaussian",
help='Policy Type: Gaussian | Deterministic (default: Gaussian)')
parser.add_argument('--tau', type=float, default=0.005, metavar='G',
help='target smoothing coefficient(τ) (default: 0.005)')
parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
help='learning rate (default: 0.0003)')
parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
help='Temperature parameter α determines the relative importance of the entropy\
term against the reward (default: 0.2)')
parser.add_argument('--automatic_entropy_tuning', type=bool, default=False, metavar='G',
                    help='Automatically adjust α (default: False)')
parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
help='hidden size (default: 256)')
parser.add_argument('--start_epoch', type=int, default=20000, metavar='N',
                    help='Epochs sampling random actions (default: 20000)')
parser.add_argument('--target_update_interval', type=int, default=1, metavar='N',
help='Value target update per no. of updates per step (default: 1)')
args = parser.parse_args()
return args
import sys
sys.path.append('../')
from gym.envs.registration import register
import gym
from goal_env.bitflip import BitFlipEnv
from goal_env.fourroom import FourRoom, FourRoom2, FourRoom3, FourRoom4
from goal_env.mountaincar import MountainCarEnv
from goal_env.plane import NaivePlane, NaivePlane2, NaivePlane3, NaivePlane4, NaivePlane5
from goal_env.goal_plane_env import GoalPlane
from goal_env.nchain import NChainEnv
register(
id='Bitflip-v0',
entry_point='goal_env.bitflip:BitFlipEnv',
kwargs={'num_bits': 11},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
N = 64
register(
id='NChain-v1',
entry_point='goal_env.nchain:NChainEnv',
kwargs={'n': N,
'slip': 0.1,
},
max_episode_steps=N+10,
)
register(
id='FourRoom-v0',
entry_point='goal_env.fourroom:FourRoom',
kwargs={'goal_type': 'fix_goal'},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='FourRoom-v1',
entry_point='goal_env.fourroom:FourRoom2',
kwargs={'goal_type': 'fix_goal'},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='FourRoom-v2',
entry_point='goal_env.fourroom:FourRoom3',
kwargs={'goal_type': 'fix_goal'},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='FourRoom-v4',
entry_point='goal_env.fourroom:FourRoom4',
kwargs={'goal_type': 'fix_goal'},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='mcar-v0',
entry_point='goal_env.mountaincar:MountainCarEnv',
kwargs={'goal_dim': 1},
max_episode_steps=200,
reward_threshold=100.0,
nondeterministic=False,
)
register(
id='Plane-v0',
entry_point='goal_env.plane:NaivePlane5',
)
register(
id='GoalPlane-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
}
)
register(
id='GoalPlaneMid-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"type": "mid",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
}
)
register(
id='GoalPlaneHard-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"type": "hard",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
}
)
register(
id='GoalPlaneEasy-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"type": "easy",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
}
)
register(
id='GoalPlaneTest-v0',
entry_point='goal_env.goal_plane_env:GoalPlane',
max_episode_steps=50,
reward_threshold=195.0,
kwargs={
"env_name": "Plane-v0",
"maze_size": 15,
"action_size": 1,
"distance": 1.,
"start": (2.5, 2.5),
"goals": (2.5, 12.5)
}
)
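# Illustrative sketch (assumes this package and its Plane-v0 dependencies are
# importable): after the registrations above, the goal-conditioned plane env can be
# created through gym.make and returns dict observations.
if __name__ == '__main__':
    env = gym.make('GoalPlane-v0')
    obs = env.reset()
    print(sorted(obs.keys()))   # ['achieved_goal', 'desired_goal', 'observation']
    obs, reward, done, info = env.step(env.action_space.sample())
    print(reward, info['is_success'])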
## copied from RL-Adventure2
import gym
import numpy as np
from gym import spaces
class BitFlipEnv(gym.Env):
def __init__(self, num_bits):
self.num_bits = num_bits
self.observation_space = {
'observation': spaces.Box(low=0, high=1, shape=(self.num_bits,)),
'desired_goal': spaces.Box(low=0, high=1, shape=(self.num_bits,)),
'achieved_goal': spaces.Box(low=0, high=1, shape=(self.num_bits,))
}
self.action_space = spaces.Discrete(self.num_bits)
def get_obs(self):
return {
"observation": np.copy(self.state),
"achieved_goal": np.copy(self.state),
"desired_goal": np.copy(self.target),
}
def reset(self):
self.done = False
self.num_steps = 0
self.state = np.random.randint(2, size=self.num_bits)
self.target = np.random.randint(2, size=self.num_bits)
return self.get_obs()
# return self.state, self.target
def step(self, action):
self.state[action] = 1 - self.state[action]
info = {'is_success': False}
# print(self.state, self.target)
if self.num_steps > self.num_bits + 1:
self.done = True
self.num_steps += 1
if np.sum(self.state == self.target) == self.num_bits:
self.done = True
info = {'is_success': True}
return self.get_obs(), 0, self.done, info
else:
return self.get_obs(), -1, self.done, info
def compute_reward(self, state, goal, info):
calcu = np.sum(state == goal, axis=1)
reward = np.where(calcu == self.num_bits, 0, -1)
return reward
def get_pairwise(self, state, target):
dist = self.num_bits - np.sum(state == target)
return dist
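# Illustrative sketch: a random rollout in the bit-flipping environment; each action
# toggles one bit, and the episode ends on success or after a fixed step budget.
if __name__ == '__main__':
    env = BitFlipEnv(num_bits=11)
    obs = env.reset()
    done = False
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
    print(info['is_success'], reward)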
## importance resampling
import gym
import numpy as np
from gym import spaces
class FourRoom(gym.Env):
def __init__(self, seed=None, goal_type='fix_goal'):
self.n = 11
self.map = np.array([
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]).reshape((self.n, self.n))
self.goal_type = goal_type
self.goal = None
self.init()
def init(self):
self.observation_space = {
'observation': spaces.Box(low=0, high=1, shape=(self.n * self.n,), dtype=np.float32),
'desired_goal': spaces.Box(low=0, high=1, shape=(self.n * self.n,), dtype=np.float32),
'achieved_goal': spaces.Box(low=0, high=1, shape=(self.n * self.n,), dtype=np.float32)
}
self.observation_space['observation'].n = self.n
self.dx = [0, 1, 0, -1]
self.dy = [1, 0, -1, 0]
self.action_space = spaces.Discrete(len(self.dx))
self.reset()
def label2obs(self, x, y):
a = np.zeros((self.n * self.n,))
assert x < self.n and y < self.n
a[x * self.n + y] = 1
return a
def get_obs(self):
assert self.goal is not None
return {
'observation': self.label2obs(self.x, self.y),
'desired_goal': self.label2obs(*self.goal),
'achieved_goal': self.label2obs(self.x, self.y),
}
def reset(self):
condition = True
while condition:
self.x = np.random.randint(1, self.n)
self.y = np.random.randint(1, self.n)
condition = (self.map[self.x, self.y] == 0)
loc = np.where(self.map > 0.5)
assert len(loc) == 2
if self.goal_type == 'random':
goal_idx = np.random.randint(len(loc[0]))
elif self.goal_type == 'fix_goal':
goal_idx = 0
else:
raise NotImplementedError
self.goal = loc[0][goal_idx], loc[1][goal_idx]
self.done = False
return self.get_obs()
def step(self, action):
# assert not self.done
nx, ny = self.x + self.dx[action], self.y + self.dy[action]
info = {'is_success': False}
# before = self.get_obs().argmax()
if self.map[nx, ny]:
self.x, self.y = nx, ny
reward = -1
done = False
else:
reward = -1
done = False
if nx == self.goal[0] and ny == self.goal[1]:
reward = 0
info = {'is_success': True}
done = self.done = True
return self.get_obs(), reward, done, info
def compute_reward(self, state, goal, info):
state_obs = state.argmax(axis=1)
goal_obs = goal.argmax(axis=1)
reward = np.where(state_obs == goal_obs, 0, -1)
return reward
def restore(self, obs):
obs = obs.argmax()
self.x = obs // self.n
self.y = obs % self.n
def bfs_dist(self, state, goal):
# using bfs to search for shortest path
visited = {key: False for key in range(self.n * self.n)}
state_key = state.argmax()
goal_key = goal.argmax()
queue = []
visited[state_key] = True
queue.append(state_key)
dist = [-np.inf] * (self.n * self.n)
dist[state_key] = 0
while (queue):
par = queue.pop(0)
if par == goal_key:
break
x_par, y_par = par // self.n, par % self.n
for action in range(4):
x_child, y_child = x_par + self.dx[action], y_par + self.dy[action]
child = x_child * self.n + y_child
if self.map[x_child, y_child] == 0:
continue
if visited[child] == False:
visited[child] = True
queue.append(child)
dist[child] = dist[par] + 1
return dist[goal_key]
def get_pairwise(self, state, target):
dist = self.bfs_dist(state, target)
return dist
def all_states(self):
states = []
mask = []
for i in range(self.n):
for j in range(self.n):
self.x = i
self.y = j
states.append(self.get_obs())
if isinstance(states[-1], dict):
states[-1] = states[-1]['observation']
mask.append(self.map[self.x, self.y] > 0.5)
return np.array(states)[mask]
def all_edges(self):
A = np.zeros((self.n * self.n, self.n * self.n))
mask = []
for i in range(self.n):
for j in range(self.n):
mask.append(self.map[i, j] > 0.5)
if self.map[i][j]:
for a in range(4):
self.x = i
self.y = j
t = self.step(a)[0]
if isinstance(t, dict):
t = t['observation']
self.restore(t)
A[i * self.n + j, self.x * self.n + self.y] = 1
return A[mask][:, mask]
class FourRoom2(FourRoom):
def __init__(self, *args, **kwargs):
FourRoom.__init__(self, *args, **kwargs)
self.map = np.array([
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]).reshape((self.n, self.n))
class FourRoom3(FourRoom):
def __init__(self, *args, **kwargs):
FourRoom.__init__(self, *args, **kwargs)
self.n = 5
self.map = np.array([
0, 0, 0, 0, 0,
0, 1, 1, 1, 0,
0, 1, 1, 1, 0,
0, 1, 1, 1, 0,
0, 0, 0, 0, 0,
]).reshape((self.n, self.n))
self.init()
class FourRoom4(FourRoom):
def __init__(self, seed=None, *args, **kwargs):
FourRoom.__init__(self, *args, **kwargs)
self.n = 16
self.map = np.array([
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]).reshape((self.n, self.n))
self.init()
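# Illustrative sketch: bfs_dist() gives the shortest-path length between two one-hot
# states, which get_pairwise() exposes for evaluating learned distances.
def _demo_fourroom_distance():
    env = FourRoom(goal_type='fix_goal')
    obs = env.reset()
    dist = env.get_pairwise(obs['observation'], obs['desired_goal'])
    return dist   # number of moves from the sampled start to the fixed goal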
if __name__ == '__main__':
a = FourRoom()
import gym
import copy
import numpy as np
import cv2
from collections import OrderedDict
class GoalPlane(gym.Env):
def __init__(self, env_name, type='random', maze_size=16., action_size=1., distance=0.1, start=None, goals=None):
super(GoalPlane, self).__init__()
self.env = gym.make(env_name)
self.maze_size = maze_size
self.action_size = action_size
self.action_space = gym.spaces.Box(
low=-action_size, high=action_size, shape=(2,), dtype='float32')
self.ob_space = gym.spaces.Box(
low=0., high=maze_size, shape=(2,), dtype='float32')
self.easy_goal_space = gym.spaces.Box(low=np.array([0., 0.]),
high=np.array([self.maze_size, self.maze_size / 2]) \
, dtype=np.float32)
self.mid_goal_space = gym.spaces.Box(low=np.array([self.maze_size / 2, self.maze_size / 2]), \
high=np.array([self.maze_size, self.maze_size]), dtype=np.float32)
self.hard_goal_space = gym.spaces.Box(low=np.array([0., self.maze_size * 0.65]), \
high=np.array([self.maze_size / 2, self.maze_size]), dtype=np.float32)
self.type = type
if self.type == 'random':
self.goal_space = self.ob_space
elif self.type == 'easy':
self.goal_space = self.easy_goal_space
elif self.type == 'mid':
self.goal_space = self.mid_goal_space
elif self.type == 'hard':
self.goal_space = self.hard_goal_space
self.distance = distance
self.goals = goals
self.start = start
self.observation_space = gym.spaces.Dict(OrderedDict({
'observation': self.ob_space,
'desired_goal': self.goal_space,
'achieved_goal': self.ob_space,
}))
self.goal = None
def compute_reward(self, achieved_goal, desired_goal, info):
reward = -np.linalg.norm(achieved_goal - desired_goal, axis=-1)
return reward
def change_mode(self, mode='mid'):
if mode == 'random':
self.goal_space = self.ob_space
elif mode == 'easy':
self.goal_space = self.easy_goal_space
elif mode == 'mid':
self.goal_space = self.mid_goal_space
elif mode == 'hard':
self.goal_space = self.hard_goal_space
def step(self, action):
assert self.goal is not None
observation, reward, done, info = self.env.step(np.array(action) / self.maze_size) # normalize action
observation = np.array(observation) * self.maze_size
out = {'observation': observation,
'desired_goal': self.goal,
'achieved_goal': observation}
reward = -np.linalg.norm(observation - self.goal, axis=-1)
info['is_success'] = (reward > -self.distance)
return out, reward, done, info
def reset(self):
if self.start is not None:
self.env.reset()
observation = np.array(self.start)
self.env.restore(observation / self.maze_size)
else:
observation = self.env.reset()
if self.goals is None:
condition = True
while condition: # note: goal should not be in the block
self.goal = self.goal_space.sample()
condition = self.env.check_inside(self.goal / self.maze_size)
else:
self.goal = np.array(self.goals)
out = {'observation': observation, 'desired_goal': self.goal}
out['achieved_goal'] = observation
return out
def render(self, mode='rgb_array'):
image = self.env.render(mode='rgb_array')
goal_loc = copy.copy(self.goal)
goal_loc[0] = goal_loc[0] / self.maze_size * image.shape[1]
goal_loc[1] = goal_loc[1] / self.maze_size * image.shape[0]
cv2.circle(image, (int(goal_loc[0]), int(goal_loc[1])), 10, (0, 255, 0), -1)
if mode == 'human':
cv2.imshow('image', image)
cv2.waitKey(2)
else:
return image
import math
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding
class MountainCarEnv(gym.Env):
metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 30
}
def __init__(self, goal_dim=1):
self.min_position = -1.2
self.max_position = 0.6
self.max_speed = 0.07
self.goal_position = 0.5
self.force = 0.001
self.gravity = 0.0025
self.low = np.array([self.min_position, -self.max_speed])
self.high = np.array([self.max_position, self.max_speed])
self.viewer = None
self.goal_dim = goal_dim
self.action_space = spaces.Discrete(3)
self.observation_space = {
"achieved_goal": spaces.Box(self.low[:self.goal_dim], self.high[:self.goal_dim], dtype=np.float32),
"desired_goal": spaces.Box(self.low[:self.goal_dim], self.high[:self.goal_dim], dtype=np.float32),
"observation": spaces.Box(self.low, self.high, dtype=np.float32),
}
self.seed()
def get_obs(self):
return {
"achieved_goal": np.array(self.state)[:self.goal_dim],
"desired_goal": np.array([self.goal_position, 0][:self.goal_dim]),
"observation": np.array(self.state),
}
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def step(self, action):
assert self.action_space.contains(
action), "%r (%s) invalid" % (action, type(action))
info = {'is_success': False}
position, velocity = self.state
velocity += (action - 1) * self.force + \
math.cos(3 * position) * (-self.gravity)
velocity = np.clip(velocity, -self.max_speed, self.max_speed)
position += velocity
position = np.clip(position, self.min_position, self.max_position)
if (position == self.min_position and velocity < 0):
velocity = 0
done = bool(position >= self.goal_position)
reward = -1.0
if done:
reward = 0.0
info['is_success'] = True
self.state = (position, velocity)
return self.get_obs(), reward, done, info
# return np.array(self.state), reward, done, {}
def reset(self):
self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0])
return self.get_obs()
# return np.array(self.state)
def _height(self, xs):
return np.sin(3 * xs) * .45 + .55
def render(self, mode='human'):
screen_width = 600
screen_height = 400
world_width = self.max_position - self.min_position
scale = screen_width / world_width
carwidth = 40
carheight = 20
if self.viewer is None:
from gym.envs.classic_control import rendering
self.viewer = rendering.Viewer(screen_width, screen_height)
xs = np.linspace(self.min_position, self.max_position, 100)
ys = self._height(xs)
xys = list(zip((xs - self.min_position) * scale, ys * scale))
self.track = rendering.make_polyline(xys)
self.track.set_linewidth(4)
self.viewer.add_geom(self.track)
clearance = 10
l, r, t, b = -carwidth / 2, carwidth / 2, carheight, 0
car = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
car.add_attr(rendering.Transform(translation=(0, clearance)))
self.cartrans = rendering.Transform()
car.add_attr(self.cartrans)
self.viewer.add_geom(car)
frontwheel = rendering.make_circle(carheight / 2.5)
frontwheel.set_color(.5, .5, .5)
frontwheel.add_attr(rendering.Transform(
translation=(carwidth / 4, clearance)))
frontwheel.add_attr(self.cartrans)
self.viewer.add_geom(frontwheel)
backwheel = rendering.make_circle(carheight / 2.5)
backwheel.add_attr(rendering.Transform(
translation=(-carwidth / 4, clearance)))
backwheel.add_attr(self.cartrans)
backwheel.set_color(.5, .5, .5)
self.viewer.add_geom(backwheel)
flagx = (self.goal_position - self.min_position) * scale
flagy1 = self._height(self.goal_position) * scale
flagy2 = flagy1 + 50
flagpole = rendering.Line((flagx, flagy1), (flagx, flagy2))
self.viewer.add_geom(flagpole)
flag = rendering.FilledPolygon(
[(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)])
flag.set_color(.8, .8, 0)
self.viewer.add_geom(flag)
pos = self.state[0]
self.cartrans.set_translation(
(pos - self.min_position) * scale, self._height(pos) * scale)
self.cartrans.set_rotation(math.cos(3 * pos))
return self.viewer.render(return_rgb_array=mode == 'rgb_array')
def get_keys_to_action(self):
# control with left and right arrow keys
return {(): 1, (276,): 0, (275,): 2, (275, 276): 1}
def close(self):
if self.viewer:
self.viewer.close()
self.viewer = None
def compute_reward(self, state, goal):
'''
to be finished
:param state:
:param goal:
:return:
'''
def get_pairwise(self, state, target):
'''
to be finished
:param state:
:param target:
:return:
'''
from gym.envs.registration import register
import sys
print("path", sys.argv[0].split('/')[-1], "!!!")
if sys.argv[0].split('/')[-1] in ["train_ddpg.py", "visitation_plot.py", "vis_fetch.py"]:
from train_ddpg import args
elif sys.argv[0].split('/')[-1] == "train_hier_ddpg.py":
from train_hier_ddpg import args
elif sys.argv[0].split('/')[-1] == "train_hier_sac.py":
from train_hier_sac import args
elif sys.argv[0].split('/')[-1] == "train_hier_ppo.py":
from train_hier_ppo import args
elif sys.argv[0].split('/')[-1] == "train_covering.py":
from train_covering import args
else:
raise Exception("Unknown main file !!!")
robots = ['Point', 'Ant', 'Swimmer']
task_types = ['Maze', 'Maze1', 'Push', 'Fall', 'Block', 'BlockMaze']
all_name = [x + y for x in robots for y in task_types]
random_start = False
if args.image:
top_down = True
else:
top_down = False
for name_t in all_name:
# episode length
if name_t == "AntMaze":
max_timestep = 1000
else:
max_timestep = 500
for Test in ['', 'Test', 'Test1', 'Test2']:
if Test in ['Test', 'Test1', 'Test2']:
fix_goal = True
else:
if name_t == "AntBlock":
fix_goal = True
else:
fix_goal = False
goal_args = [[-5, -5], [5, 5]]
register(
id=name_t + Test + '-v0',
entry_point='goal_env.mujoco.create_maze_env:create_maze_env',
kwargs={'env_name': name_t, 'goal_args': goal_args, 'maze_size_scaling': 8, 'random_start': random_start},
max_episode_steps=max_timestep,
)
# v1 is the one we use in the main paper
register(
id=name_t + Test + '-v1',
entry_point='goal_env.mujoco.create_maze_env:create_maze_env',
kwargs={'env_name': name_t, 'goal_args': goal_args, 'maze_size_scaling': 4, 'random_start': random_start,
"fix_goal": fix_goal, "top_down_view": top_down, 'test':Test},
max_episode_steps=max_timestep,
)
register(
id=name_t + Test + '-v2',
entry_point='goal_env.mujoco.create_maze_env:create_maze_env',
kwargs={'env_name': name_t, 'goal_args': goal_args, 'maze_size_scaling': 2, 'random_start': random_start},
max_episode_steps=max_timestep,
)
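# Illustrative sketch: the loop above registers ids of the form
# <robot><task>[Test|Test1|Test2]-v{0,1,2}; the -v1 variants (maze_size_scaling=4)
# are the ones used in the main paper, e.g. 'AntMaze1-v1' for training and
# 'AntMaze1Test-v1' for evaluation with a fixed goal.
all_ids = [name + test + version
           for name in all_name
           for test in ['', 'Test', 'Test1', 'Test2']
           for version in ['-v0', '-v1', '-v2']]
# 3 robots x 6 task types x 4 goal settings x 3 scalings = 216 registered ids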
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper for creating the ant environment in gym_mujoco."""
import math
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
def q_inv(a):
return [a[0], -a[1], -a[2], -a[3]]
def q_mult(a, b): # multiply two quaternion
w = a[0] * b[0] - a[1] * b[1] - a[2] * b[2] - a[3] * b[3]
i = a[0] * b[1] + a[1] * b[0] + a[2] * b[3] - a[3] * b[2]
j = a[0] * b[2] - a[1] * b[3] + a[2] * b[0] + a[3] * b[1]
k = a[0] * b[3] + a[1] * b[2] - a[2] * b[1] + a[3] * b[0]
return [w, i, j, k]
class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
FILE = "ant.xml"
ORI_IND = 3
def __init__(self, file_path=None, expose_all_qpos=True,
expose_body_coms=None, expose_body_comvels=None, noisy_init=True):
self._expose_all_qpos = expose_all_qpos
self._expose_body_coms = expose_body_coms
self._expose_body_comvels = expose_body_comvels
self._body_com_indices = {}
self._body_comvel_indices = {}
self.noisy_init = noisy_init
self.full_obs = False
self.add_noise = False
mujoco_env.MujocoEnv.__init__(self, file_path, 10)
utils.EzPickle.__init__(self)
@property
def physics(self):
return self.model
def _step(self, a):
return self.step(a)
def step(self, a):
xposbefore = self.get_body_com("torso")[0]
self.do_simulation(a, self.frame_skip)
xposafter = self.get_body_com("torso")[0]
forward_reward = (xposafter - xposbefore) / self.dt
ctrl_cost = .5 * np.square(a).sum()
survive_reward = 1.0
reward = forward_reward - ctrl_cost + survive_reward
state = self.state_vector()
done = False
ob = self._get_obs()
return ob, reward, done, dict(
reward_forward=forward_reward,
reward_ctrl=-ctrl_cost,
reward_survive=survive_reward)
def _get_obs(self):
# No cfrc observation
if self._expose_all_qpos:
obs = np.concatenate([
self.data.qpos.flat[:15], # Ensures only ant obs.
self.data.qvel.flat[:14],
])
else:
obs = np.concatenate([
self.data.qpos.flat[2:15],
self.data.qvel.flat[:14],
])
if self._expose_body_coms is not None:
for name in self._expose_body_coms:
com = self.get_body_com(name)
if name not in self._body_com_indices:
indices = range(len(obs), len(obs) + len(com))
self._body_com_indices[name] = indices
obs = np.concatenate([obs, com])
if self._expose_body_comvels is not None:
for name in self._expose_body_comvels:
comvel = self.get_body_comvel(name)
if name not in self._body_comvel_indices:
indices = range(len(obs), len(obs) + len(comvel))
self._body_comvel_indices[name] = indices
obs = np.concatenate([obs, comvel])
if self.full_obs:
obs = np.concatenate([
self.data.qpos.flat,
self.data.qvel.flat,
np.clip(self.data.cfrc_ext, -1, 1).flat,
])
if self.add_noise:
obs = np.concatenate((obs, np.random.uniform(low=-1, high=1, size=20)))
return obs
def reset_model(self):
if self.noisy_init:
qpos = self.init_qpos + self.np_random.uniform(
size=self.model.nq, low=-.1, high=.1)
qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
else:
qpos = self.init_qpos
qvel = self.init_qvel
# Set everything other than ant to original position and 0 velocity.
qpos[15:] = self.init_qpos[15:]
qvel[14:] = 0.
self.set_state(qpos, qvel)
return self._get_obs()
def viewer_setup(self):
# self.viewer.cam.distance = self.model.stat.extent
# self.viewer.cam.trackbodyid = 1
# self.viewer.cam.distance = self.model.stat.extent * 0.7
# self.viewer.cam.lookat[2] = 0.8925
# self.viewer.cam.elevation = 0
self.viewer.cam.trackbodyid = -1
self.viewer.cam.distance = 30
self.viewer.cam.elevation = -90
def get_ori(self):
ori = [0, 1, 0, 0]
rot = self.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND + 4] # take the quaternion
ori = q_mult(q_mult(rot, ori), q_inv(rot))[1:3] # project onto x-y plane
ori = math.atan2(ori[1], ori[0])
return ori
def set_xy(self, xy):
qpos = np.copy(self.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.data.qvel
self.set_state(qpos, qvel)
def get_xy(self):
return self.data.qpos[:2]
from .maze_env import MazeEnv
from .ant import AntEnv
class AntMazeEnv(MazeEnv):
MODEL_CLASS = AntEnv
<mujoco model="ant">
<compiler inertiafromgeom="true" angle="degree" coordinate="local"/>
<option timestep="0.02" integrator="RK4"/>
<custom>
<numeric name="init_qpos" data="0.0 0.0 0.55 1.0 0.0 0.0 0.0 0.0 1.0 0.0 -1.0 0.0 -1.0 0.0 1.0"/>
</custom>
<default>
<joint limited="true" armature="1" damping="1"/>
<geom condim="3" conaffinity="0" margin="0.01" friction="1 0.5 0.5" solref=".02 1" solimp=".8 .8 .01"
rgba="0.8 0.6 0.4 1" density="5.0"/>
</default>
<asset>
<texture type="skybox" builtin="gradient" width="100" height="100" rgb1="1 1 1" rgb2="0 0 0"/>
<texture name="texgeom" type="cube" builtin="flat" mark="cross" width="127" height="1278" rgb1="0.8 0.6 0.4"
rgb2="0.8 0.6 0.4" markrgb="1 1 1" random="0.01"/>
<texture name="texplane" type="2d" builtin="checker" rgb1="0 0 0" rgb2="0.8 0.8 0.8" width="100" height="100"/>
<material name='MatPlane' texture="texplane" shininess="1" texrepeat="60 60" specular="1" reflectance="0.5"/>
<material name='geom' texture="texgeom" texuniform="true"/>
</asset>
<worldbody>
<light directional="true" cutoff="100" exponent="1" diffuse="1 1 1" specular=".1 .1 .1" pos="0 0 1.3"
dir="-0 0 -1.3"/>
<geom name='floor' pos='0 0 0' size='40 40 40' type='plane' conaffinity='1' rgba='0.8 0.9 0.8 1' condim='3'/>
<body name="torso" pos="0 0 0.75">
<geom name="torso_geom" type="sphere" size="0.25" pos="0 0 0"/>
<joint name="root" type="free" limited="false" pos="0 0 0" axis="0 0 1" margin="0.01" armature="0"
damping="0"/>
<body name="front_left_leg" pos="0 0 0">
<geom name="aux_1_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.2 0.2 0.0"/>
<body name="aux_1" pos="0.2 0.2 0">
<joint name="hip_1" type="hinge" pos="0.0 0.0 0.0" axis="0 0 1" range="-30 30"/>
<geom name="left_leg_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.2 0.2 0.0"/>
<body pos="0.2 0.2 0">
<joint name="ankle_1" type="hinge" pos="0.0 0.0 0.0" axis="-1 1 0" range="30 70"/>
<geom name="left_ankle_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.4 0.4 0.0"/>
</body>
</body>
</body>
<body name="front_right_leg" pos="0 0 0">
<geom name="aux_2_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.2 0.2 0.0"/>
<body name="aux_2" pos="-0.2 0.2 0">
<joint name="hip_2" type="hinge" pos="0.0 0.0 0.0" axis="0 0 1" range="-30 30"/>
<geom name="right_leg_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.2 0.2 0.0"/>
<body pos="-0.2 0.2 0">
<joint name="ankle_2" type="hinge" pos="0.0 0.0 0.0" axis="1 1 0" range="-70 -30"/>
<geom name="right_ankle_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.4 0.4 0.0"/>
</body>
</body>
</body>
<body name="back_leg" pos="0 0 0">
<geom name="aux_3_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.2 -0.2 0.0"/>
<body name="aux_3" pos="-0.2 -0.2 0">
<joint name="hip_3" type="hinge" pos="0.0 0.0 0.0" axis="0 0 1" range="-30 30"/>
<geom name="back_leg_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.2 -0.2 0.0"/>
<body pos="-0.2 -0.2 0">
<joint name="ankle_3" type="hinge" pos="0.0 0.0 0.0" axis="-1 1 0" range="-70 -30"/>
<geom name="third_ankle_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 -0.4 -0.4 0.0"/>
</body>
</body>
</body>
<body name="right_back_leg" pos="0 0 0">
<geom name="aux_4_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.2 -0.2 0.0"/>
<body name="aux_4" pos="0.2 -0.2 0">
<joint name="hip_4" type="hinge" pos="0.0 0.0 0.0" axis="0 0 1" range="-30 30"/>
<geom name="rightback_leg_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.2 -0.2 0.0"/>
<body pos="0.2 -0.2 0">
<joint name="ankle_4" type="hinge" pos="0.0 0.0 0.0" axis="1 1 0" range="30 70"/>
<geom name="fourth_ankle_geom" type="capsule" size="0.08" fromto="0.0 0.0 0.0 0.4 -0.4 0.0"/>
</body>
</body>
</body>
</body>
</worldbody>
<actuator>
<motor joint="hip_4" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="ankle_4" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="hip_1" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="ankle_1" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="hip_2" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="ankle_2" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="hip_3" ctrlrange="-16.0 16.0" ctrllimited="true"/>
<motor joint="ankle_3" ctrlrange="-16.0 16.0" ctrllimited="true"/>
</actuator>
<!--<actuator>-->
<!--<motor joint="hip_4" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="ankle_4" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="hip_1" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="ankle_1" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="hip_2" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="ankle_2" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="hip_3" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--<motor joint="ankle_3" ctrlrange="-30.0 30.0" ctrllimited="true"/>-->
<!--</actuator>-->
</mujoco>
<mujoco>
<compiler angle="degree" coordinate="local" inertiafromgeom="true"/>
<option integrator="RK4" timestep="0.02"/>
<default>
<joint armature="0" damping="0" limited="false"/>
<geom conaffinity="0" condim="3" density="100" friction="1 0.5 0.5" margin="0" rgba="0.8 0.6 0.4 1"/>
</default>
<asset>
<texture builtin="gradient" height="100" rgb1="1 1 1" rgb2="0 0 0" type="skybox" width="100"/>
<texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01"
rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
<texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
<material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="30 30" texture="texplane"/>
<material name="geom" texture="texgeom" texuniform="true"/>
</asset>
<worldbody>
<light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3"
specular=".1 .1 .1"/>
<geom conaffinity="1" condim="3" material="MatPlane" name="floor" pos="0 0 0" rgba="0.8 0.9 0.8 1"
size="40 40 40" type="plane"/>
<body name="torso" pos="0 0 0">
<geom name="pointbody" pos="0 0 0.5" size="0.5" type="sphere"/>
<geom name="pointarrow" pos="0.6 0 0.5" size="0.5 0.1 0.1" type="box"/>
<joint axis="1 0 0" name="ballx" pos="0 0 0" type="slide"/>
<joint axis="0 1 0" name="bally" pos="0 0 0" type="slide"/>
<joint axis="0 0 1" limited="false" name="rot" pos="0 0 0" type="hinge"/>
</body>
</worldbody>
<actuator>
<!-- These are dummy actuators, included only to provide action ranges -->
<motor ctrllimited="true" ctrlrange="-1 1" joint="ballx"/>
<motor ctrllimited="true" ctrlrange="-0.25 0.25" joint="rot"/>
</actuator>
</mujoco>
<mujoco model="swimmer">
<compiler inertiafromgeom="true" angle="degree" coordinate="local" />
<custom>
<numeric name="frame_skip" data="50" />
</custom>
<option timestep="0.001" density="4000" viscosity="0.1" collision="predefined" integrator="Euler" iterations="1000">
<flag warmstart="disable" />
</option>
<default>
<geom contype='1' conaffinity='1' condim='1' rgba='0.8 0.6 .4 1' material="geom" />
<!--<joint armature='1' />-->
</default>
<asset>
<texture type="skybox" builtin="gradient" width="100" height="100" rgb1="1 1 1" rgb2="0 0 0" />
<texture name="texgeom" type="cube" builtin="flat" mark="cross" width="127" height="1278" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" markrgb="1 1 1" random="0.01" />
<texture name="texplane" type="2d" builtin="checker" rgb1="0 0 0" rgb2="0.8 0.8 0.8" width="100" height="100" />
<material name='MatPlane' texture="texplane" shininess="1" texrepeat="30 30" specular="1" reflectance="0.5" />
<material name='geom' texture="texgeom" texuniform="true" />
</asset>
<worldbody>
<light directional="true" cutoff="100" exponent="1" diffuse="1 1 1" specular=".1 .1 .1" pos="0 0 1.3" dir="-0 0 -1.3" />
<geom name='floor' material="MatPlane" pos='0 0 -0.1' size='40 40 0.1' type='plane' conaffinity='1' rgba='0.8 0.9 0.8 1' condim='3' />
<!-- ================= SWIMMER ================= /-->
<body name="torso" pos="0 0 0">
<geom name="torso" type="capsule" fromto="1.5 0 0 0.5 0 0" size="0.1" density="1000" />
<joint pos="0 0 0" type="slide" name="slider1" axis="1 0 0" />
<joint pos="0 0 0" type="slide" name="slider2" axis="0 1 0" />
<joint name="rot" type="hinge" pos="0 0 0" axis="0 0 1" />
<body name="mid" pos="0.5 0 0">
<geom name="mid" type="capsule" fromto="0 0 0 -1 0 0" size="0.1" density="1000" />
<joint name="rot2" type="hinge" pos="0 0 0" axis="0 0 1" range="-100 100" limited="true" />
<body name="back" pos="-1 0 0">
<geom name="back" type="capsule" fromto="0 0 0 -1 0 0" size="0.1" density="1000" />
<joint name="rot3" type="hinge" pos="0 0 0" axis="0 0 1" range="-100 100" limited="true" />
</body>
</body>
</body>
</worldbody>
<actuator>
<motor joint="rot2" ctrllimited="true" ctrlrange="-50 50" />
<motor joint="rot3" ctrllimited="true" ctrlrange="-50 50" />
</actuator>
</mujoco>
from .ant_maze_env import AntMazeEnv
from .point_maze_env import PointMazeEnv
from .swimmer_maze_env import SwimmerMazeEnv
from collections import OrderedDict
import gym
import numpy as np
import copy
from gym import Wrapper
from gym.envs.registration import EnvSpec
class GoalWrapper(Wrapper):
def __init__(self, env, maze_size_scaling, random_start, low, high, fix_goal=True, top_down=False, test=None):
super(GoalWrapper, self).__init__(env)
ob_space = env.observation_space
self.maze_size_scaling = maze_size_scaling
row_num, col_num = len(self.env.MAZE_STRUCTURE), len(self.env.MAZE_STRUCTURE[0])
contain_r = [1 if "r" in row else 0 for row in self.env.MAZE_STRUCTURE]
row_r = contain_r.index(1)
col_r = self.env.MAZE_STRUCTURE[row_r].index("r")
y_low = (0.5 - row_r) * self.maze_size_scaling
x_low = (0.5 - col_r) * self.maze_size_scaling
y_high = (row_num - 1.5 - row_r) * self.maze_size_scaling
x_high = (col_num - 1.5 - col_r) * self.maze_size_scaling
self.maze_low = maze_low = np.array([x_low, y_low],
dtype=ob_space.dtype)
self.maze_high = maze_high = np.array([x_high, y_high],
dtype=ob_space.dtype)
print("maze_low, maze_high", self.maze_low, self.maze_high)
goal_low, goal_high = maze_low, maze_high
self.goal_space = gym.spaces.Box(low=goal_low, high=goal_high)
self.maze_space = gym.spaces.Box(low=maze_low, high=maze_high)
if self.env._maze_id == "Fall":
self.goal_dim = 3
else:
self.goal_dim = goal_low.size
print("goal_dim in create_maze", self.goal_dim)
self.distance_threshold = 1.5
print("distance threshold in create_maze", self.distance_threshold)
self.observation_space = gym.spaces.Dict(OrderedDict({
'observation': ob_space,
'desired_goal': self.goal_space,
'achieved_goal': self.goal_space,
}))
self.random_start = random_start
# fix goal
self.fix_goal = fix_goal
print("fix goal", self.fix_goal)
contain_g = [1 if "g" in row else 0 for row in self.env.MAZE_STRUCTURE]
if 1 in contain_g and self.fix_goal and test == "Test":
row = contain_g.index(1)
col = self.env.MAZE_STRUCTURE[row].index("g")
y = (row - row_r) * self.maze_size_scaling
x = (col - col_r) * self.maze_size_scaling
self.fix_goal_xy = np.array([x, y])
if env._maze_id == "Fall":
self.fix_goal_xy = np.concatenate((self.fix_goal_xy, [self.maze_size_scaling * 0.5 + 0.5]))
print("fix goal xy", self.fix_goal_xy)
elif test == "Test1":
if env._maze_id == "Push":
self.fix_goal_xy = np.array([-4, 0])
elif env._maze_id == "Maze1":
self.fix_goal_xy = np.array([8, 0])
else:
print("Unknown env", env._maze_id)
assert False
print("fix goal xy", self.fix_goal_xy)
elif test == "Test2":
if env._maze_id == "Push":
self.fix_goal_xy = np.array([-4, 4])
elif env._maze_id == "Maze1":
self.fix_goal_xy = np.array([8, 8])
else:
print("Unknown env", env._maze_id)
assert False
print("fix goal xy", self.fix_goal_xy)
else:
# get vacant rowcol
structure = self.env.MAZE_STRUCTURE
self.vacant_rowcol = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] not in [1, -1, 'r']:
self.vacant_rowcol.append((i, j))
self.reward_type = "dense"
self.top_down = top_down
def step(self, action):
observation, reward, _, info = self.env.step(action)
out = {'observation': observation,
'desired_goal': self.goal,
# 'achieved_goal': observation[..., 3:5]}
'achieved_goal': observation[..., :self.goal_dim]}
distance = np.linalg.norm(observation[..., :self.goal_dim] - self.goal[..., :self.goal_dim], axis=-1)
info['is_success'] = done = (distance < self.distance_threshold)
if self.reward_type == "sparse":
reward = -(distance > self.distance_threshold).astype(np.float32)
else:
# normalization
reward = -distance * 0.1
if self.top_down:
mask = np.array([0.0] * 2 + [1.0] * (out['observation'].shape[0] - 2))
out['observation'] = out['observation'] * mask
return out, reward, done, info
def reset(self):
if self.fix_goal:
self.goal = self.fix_goal_xy
else:
self.goal = self.goal_space.sample()
if self.env._maze_id == "Push":
while (self.env.old_invalid_goal(self.goal[:2])):
self.goal = self.goal_space.sample()
else:
while (self.env.invalid_goal(self.goal[:2])):
self.goal = self.goal_space.sample()
if self.env._maze_id == "Fall":
self.goal = np.concatenate((self.goal, [self.maze_size_scaling * 0.5 + 0.5]))
observation = self.env.reset(self.goal)
# random start a position without collision
if self.random_start:
xy = self.maze_space.sample()
while (self.env._is_in_collision(xy)):
xy = self.maze_space.sample()
self.env.wrapped_env.set_xy(xy)
observation = self.env._get_obs()
out = {'observation': observation, 'desired_goal': self.goal}
out['achieved_goal'] = observation[..., :self.goal_dim]
# out['achieved_goal'] = observation[..., 3:5]
if self.top_down:
# print("obs", out['observation'].shape)
mask = np.array([0.0] * 2 + [1.0] * (out['observation'].shape[0] - 2))
out['observation'] = out['observation'] * mask
return out
def create_maze_env(env_name=None, top_down_view=False, maze_size_scaling=4, random_start=True, goal_args=None,
fix_goal=True, test=None):
n_bins = 0
if env_name.startswith('Ego'):
n_bins = 8
env_name = env_name[3:]
if env_name.startswith('Ant'):
manual_collision = True
cls = AntMazeEnv
env_name = env_name[3:]
maze_size_scaling = maze_size_scaling
elif env_name.startswith('Point'):
cls = PointMazeEnv
manual_collision = True
env_name = env_name[5:]
maze_size_scaling = maze_size_scaling
elif env_name.startswith('Swimmer'):
cls = SwimmerMazeEnv
manual_collision = True
env_name = env_name[7:]
maze_size_scaling = maze_size_scaling
else:
assert False, 'unknown env %s' % env_name
observe_blocks = False
put_spin_near_agent = False
if env_name == 'Maze':
maze_id = 'Maze'
elif env_name == 'Maze1':
maze_id = 'Maze1'
maze_size_scaling = 4
elif env_name == 'Push':
maze_id = 'Push'
manual_collision = True
maze_size_scaling = 4
elif env_name == 'Fall':
maze_id = 'Fall'
elif env_name == 'Block':
maze_id = 'Block'
put_spin_near_agent = True
observe_blocks = True
elif env_name == 'BlockMaze':
maze_id = 'BlockMaze'
put_spin_near_agent = True
observe_blocks = True
else:
raise ValueError('Unknown maze environment %s' % env_name)
gym_mujoco_kwargs = {
'maze_id': maze_id,
'n_bins': n_bins,
'observe_blocks': observe_blocks,
'put_spin_near_agent': put_spin_near_agent,
'top_down_view': top_down_view,
'manual_collision': manual_collision,
'maze_size_scaling': maze_size_scaling,
}
gym_env = cls(**gym_mujoco_kwargs)
# gym_env.reset()
# goal_args = np.array(goal_args) / 8 * maze_size_scaling
return GoalWrapper(gym_env, maze_size_scaling, random_start, *goal_args, fix_goal=fix_goal, top_down=top_down_view, test=test)
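# Hedged usage sketch (not part of the original file): create_maze_env expects names such
# as "AntMaze", "AntPush", "PointMaze1" or "SwimmerMaze", optionally prefixed with "Ego"
# for egocentric range sensors. goal_args is unpacked into the (low, high) arguments of
# GoalWrapper; the values below are placeholders.
if __name__ == "__main__":
    env = create_maze_env("PointMaze1", maze_size_scaling=4, random_start=False,
                          goal_args=[[-4, -4], [20, 20]], fix_goal=True, test="Test")
    obs = env.reset()
    # The wrapper returns a goal-conditioned dict observation.
    print(obs["observation"].shape, obs["desired_goal"], obs["achieved_goal"])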
"""Adapted from rllab maze_env.py."""
import os
import tempfile
import xml.etree.ElementTree as ET
import math
import numpy as np
import gym
from . import maze_env_utils
from gym.utils import seeding
from gym import wrappers
# Directory that contains mujoco xml files.
# MODEL_DIR = '/home/hza/ToolBox/tools/fancy/data/mujoco/assets'
MODEL_DIR = os.path.join(os.path.dirname(__file__), 'assets')
class MazeEnv(gym.Env):
MODEL_CLASS = None
MAZE_HEIGHT = None
MAZE_SIZE_SCALING = None
def __init__(
self,
maze_id=None,
maze_height=0.5,
maze_size_scaling=8,
n_bins=0,
sensor_range=3.,
sensor_span=2 * math.pi,
observe_blocks=False,
put_spin_near_agent=False,
top_down_view=False,
manual_collision=False,
goal=None,
*args,
**kwargs):
self._maze_id = maze_id
model_cls = self.__class__.MODEL_CLASS
if model_cls is None:
raise "MODEL_CLASS unspecified!"
xml_path = os.path.join(MODEL_DIR, model_cls.FILE)
self.tree = tree = ET.parse(xml_path)
self.worldbody = worldbody = tree.find(".//worldbody")
self.t = 0
self.MAZE_HEIGHT = height = maze_height
self.MAZE_SIZE_SCALING = size_scaling = maze_size_scaling
self._n_bins = n_bins
self._sensor_range = sensor_range * size_scaling
self._sensor_span = sensor_span
self._observe_blocks = observe_blocks
self._put_spin_near_agent = put_spin_near_agent
self._top_down_view = top_down_view
self._manual_collision = manual_collision
self.MAZE_STRUCTURE = structure = maze_env_utils.construct_maze(
maze_id=self._maze_id)
# Elevate the maze to allow for falling.
self.elevated = any(-1 in row for row in structure)
self.blocks = any(
any(maze_env_utils.can_move(r) for r in row)
for row in structure) # Are there any movable blocks?
torso_x, torso_y = self._find_robot() # x, y coordinates
self._init_torso_x = torso_x
self._init_torso_y = torso_y
self._init_positions = [
(x - torso_x, y - torso_y)
for x, y in self._find_all_robots()]
self._xy_to_rowcol = lambda x, y: (2 + (y + size_scaling / 2) / size_scaling,
2 + (x + size_scaling / 2) / size_scaling)
# walls (immovable), chasms (fall), movable blocks
self._view = np.zeros([5, 5, 3])
height_offset = 0.
if self.elevated:
# Increase initial z-pos of ant.
height_offset = height * size_scaling
torso = tree.find(".//body[@name='torso']")
torso.set('pos', '0 0 %.2f' % (0.75 + height_offset))
if self.blocks:
# If there are movable blocks, change simulation settings to perform
# better contact detection.
default = tree.find(".//default")
default.find('.//geom').set('solimp', '.995 .995 .01')
self.movable_blocks = []
self.not_thin = True
for i in range(len(structure)):
for j in range(len(structure[0])):
struct = structure[i][j]
if struct == 'r' and self._put_spin_near_agent:
struct = maze_env_utils.Move.SpinXY
if self.elevated and struct not in [-1]:
# Create elevated platform.
ET.SubElement(
worldbody, "geom",
name="elevated_%d_%d" % (i, j),
pos="%f %f %f" % (j * size_scaling - torso_x,
i * size_scaling - torso_y,
height / 2 * size_scaling),
size="%f %f %f" % (0.5 * size_scaling,
0.5 * size_scaling,
height / 2 * size_scaling),
type="box",
material="",
contype="1",
conaffinity="1",
rgba="0.9 0.9 0.9 1",
)
if struct == 1: # Unmovable block.
# Offset all coordinates so that robot starts at the origin.
if self.not_thin or (i == 0 or i == len(structure) - 1) or (j == 0 or j == len(structure[0])-1) or maze_id != "Maze1":
y_size = 0.5 * size_scaling
else:
y_size = 0.25
ET.SubElement(
worldbody, "geom",
name="block_%d_%d" % (i, j),
pos="%f %f %f" % (j * size_scaling - torso_x,
i * size_scaling - torso_y,
height_offset +
height / 2 * size_scaling),
size="%f %f %f" % (0.5 * size_scaling,
y_size,
height / 2 * size_scaling),
type="box",
material="",
contype="1",
conaffinity="1",
rgba="0.4 0.4 0.4 1",
)
elif maze_env_utils.can_move(struct): # Movable block.
# The "falling" blocks are shrunk slightly and increased in mass to
# ensure that it can fall easily through a gap in the platform blocks.
name = "movable_%d_%d" % (i, j)
self.movable_blocks.append((name, struct))
falling = maze_env_utils.can_move_z(struct)
spinning = maze_env_utils.can_spin(struct)
x_offset = 0.25 * size_scaling if spinning else 0.0
y_offset = 0.0
shrink = 0.2 if spinning else 0.99 if falling else 1.0
height_shrink = 0.2 if spinning else 1.0
movable_body = ET.SubElement(
worldbody, "body",
name=name,
pos="%f %f %f" % (j * size_scaling - torso_x + x_offset,
i * size_scaling - torso_y + y_offset,
height_offset +
height / 2 * size_scaling * height_shrink),
)
ET.SubElement(
movable_body, "geom",
name="block_%d_%d" % (i, j),
pos="0 0 0",
size="%f %f %f" % (0.5 * size_scaling * shrink,
0.5 * size_scaling * shrink,
height / 2 * size_scaling * height_shrink),
type="box",
material="",
mass="0.001" if falling else "0.0002",
contype="1",
conaffinity="1",
rgba="0.9 0.1 0.1 1"
)
if maze_env_utils.can_move_x(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="1 0 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="movable_x_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_y(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="0 1 0",
damping="0.0",
limited="true" if falling else "false",
range="%f %f" % (-size_scaling, size_scaling),
margin="0.01",
name="movable_y_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_move_z(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="true",
range="%f 0" % (-height_offset),
margin="0.01",
name="movable_z_%d_%d" % (i, j),
pos="0 0 0",
type="slide"
)
if maze_env_utils.can_spin(struct):
ET.SubElement(
movable_body, "joint",
armature="0",
axis="0 0 1",
damping="0.0",
limited="false",
name="spinable_%d_%d" % (i, j),
pos="0 0 0",
type="ball"
)
torso = tree.find(".//body[@name='torso']")
geoms = torso.findall(".//geom")
for geom in geoms:
if 'name' not in geom.attrib:
raise Exception("Every geom of the torso must have a name "
"defined")
_, file_path = tempfile.mkstemp(text=True, suffix='.xml')
tree.write(file_path)
self.wrapped_env = model_cls(*args, file_path=file_path, **kwargs)
self.args = args
self.kwargs = kwargs
self.visualize_goal = True
self.GOAL = goal
if self.GOAL is not None:
self.GOAL = self.unwrapped._rowcol_to_xy(*self.GOAL)
self.EPS = self.unwrapped.MAZE_SIZE_SCALING ** 2
contain_r = [1 if "r" in row else 0 for row in self.MAZE_STRUCTURE]
self.init_row_r = contain_r.index(1)
self.init_col_r = self.MAZE_STRUCTURE[self.init_row_r].index("r")
def get_ori(self):
return self.wrapped_env.get_ori()
def get_top_down_view(self):
self._view = np.zeros_like(self._view)
def valid(row, col):
return self._view.shape[0] > row >= 0 and self._view.shape[1] > col >= 0
def update_view(x, y, d, row=None, col=None):
if row is None or col is None:
x = x - self._robot_x
y = y - self._robot_y
th = self._robot_ori
row, col = self._xy_to_rowcol(x, y)
update_view(x, y, d, row=row, col=col)
return
row, row_frac, col, col_frac = int(row), row % 1, int(col), col % 1
if row_frac < 0:
row_frac += 1
if col_frac < 0:
col_frac += 1
if valid(row, col):
self._view[row, col, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row - 1, col):
self._view[row - 1, col, d] += (
(max(0., 0.5 - row_frac)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row + 1, col):
self._view[row + 1, col, d] += (
(max(0., row_frac - 0.5)) *
(min(1., col_frac + 0.5) - max(0., col_frac - 0.5)))
if valid(row, col - 1):
self._view[row, col - 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., 0.5 - col_frac)))
if valid(row, col + 1):
self._view[row, col + 1, d] += (
(min(1., row_frac + 0.5) - max(0., row_frac - 0.5)) *
(max(0., col_frac - 0.5)))
if valid(row - 1, col - 1):
self._view[row - 1, col - 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., 0.5 - col_frac))
if valid(row - 1, col + 1):
self._view[row - 1, col + 1, d] += (
(max(0., 0.5 - row_frac)) * max(0., col_frac - 0.5))
if valid(row + 1, col + 1):
self._view[row + 1, col + 1, d] += (
(max(0., row_frac - 0.5)) * max(0., col_frac - 0.5))
if valid(row + 1, col - 1):
self._view[row + 1, col - 1, d] += (
(max(0., row_frac - 0.5)) * max(0., 0.5 - col_frac))
# Draw ant.
robot_x, robot_y = self.wrapped_env.get_body_com("torso")[:2]
self._robot_x = robot_x
self._robot_y = robot_y
self._robot_ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
# Draw immovable blocks and chasms.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1: # Wall.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
0)
if structure[i][j] == -1: # Chasm.
update_view(j * size_scaling - self._init_torso_x,
i * size_scaling - self._init_torso_y,
1)
# Draw movable blocks.
for block_name, block_type in self.movable_blocks:
block_x, block_y = self.wrapped_env.get_body_com(block_name)[:2]
update_view(block_x, block_y, 2)
return self._view
def get_range_sensor_obs(self):
"""Returns egocentric range sensor observations of maze."""
robot_x, robot_y, robot_z = self.wrapped_env.get_body_com("torso")[:3]
ori = self.get_ori()
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
height = self.MAZE_HEIGHT
segments = []
# Get line segments (corresponding to outer boundary) of each immovable
# block or drop-off.
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] in [1, -1]: # There's a wall or drop-off.
cx = j * size_scaling - self._init_torso_x
cy = i * size_scaling - self._init_torso_y
x1 = cx - 0.5 * size_scaling
x2 = cx + 0.5 * size_scaling
y1 = cy - 0.5 * size_scaling
y2 = cy + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=structure[i][j],
))
# Get line segments (corresponding to outer boundary) of each movable
# block within the agent's z-view.
for block_name, block_type in self.movable_blocks:
block_x, block_y, block_z = self.wrapped_env.get_body_com(block_name)[
:3]
if (block_z + height * size_scaling / 2 >= robot_z and
robot_z >= block_z - height * size_scaling / 2): # Block in view.
x1 = block_x - 0.5 * size_scaling
x2 = block_x + 0.5 * size_scaling
y1 = block_y - 0.5 * size_scaling
y2 = block_y + 0.5 * size_scaling
struct_segments = [
((x1, y1), (x2, y1)),
((x2, y1), (x2, y2)),
((x2, y2), (x1, y2)),
((x1, y2), (x1, y1)),
]
for seg in struct_segments:
segments.append(dict(
segment=seg,
type=block_type,
))
# 3 for wall, drop-off, block
sensor_readings = np.zeros((self._n_bins, 3))
for ray_idx in range(self._n_bins):
ray_ori = (ori - self._sensor_span * 0.5 +
(2 * ray_idx + 1.0) / (2 * self._n_bins) * self._sensor_span)
ray_segments = []
# Get all segments that intersect with ray.
for seg in segments:
p = maze_env_utils.ray_segment_intersect(
ray=((robot_x, robot_y), ray_ori),
segment=seg["segment"])
if p is not None:
ray_segments.append(dict(
segment=seg["segment"],
type=seg["type"],
ray_ori=ray_ori,
distance=maze_env_utils.point_distance(
p, (robot_x, robot_y)),
))
if len(ray_segments) > 0:
# Find out which segment is intersected first.
first_seg = sorted(
ray_segments, key=lambda x: x["distance"])[0]
seg_type = first_seg["type"]
idx = (0 if seg_type == 1 else # Wall.
1 if seg_type == -1 else # Drop-off.
2 if maze_env_utils.can_move(seg_type) else # Block.
None)
if first_seg["distance"] <= self._sensor_range:
sensor_readings[ray_idx][idx] = (self._sensor_range - first_seg["distance"]) / self._sensor_range
return sensor_readings
def _get_obs(self):
wrapped_obs = self.wrapped_env._get_obs()
if self._observe_blocks:
additional_obs = []
for block_name, block_type in self.movable_blocks:
additional_obs.append(self.wrapped_env.get_body_com(block_name))
wrapped_obs = np.concatenate((additional_obs[0], wrapped_obs))
if self._top_down_view:
view = self.get_top_down_view().flatten()
wrapped_obs = np.concatenate((wrapped_obs, view))
return wrapped_obs
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def reset(self, goal):
self.goal = goal
if self.visualize_goal: # remove the prev goal and add a new goal
goal_x, goal_y = goal[0], goal[1]
size_scaling = self.MAZE_SIZE_SCALING
# remove the original goal
try:
self.worldbody.remove(self.goal_element)
except AttributeError:
pass
# offset all coordinates so that robot starts at the origin
self.goal_element = \
ET.SubElement(
self.worldbody, "geom",
name="goal_%d_%d" % (goal_x, goal_y),
pos="%f %f %f" % (goal_x,
goal_y,
self.MAZE_HEIGHT / 2 * size_scaling),
size="%f %f %f" % (0.1 * size_scaling, # smaller than the block to prevent collision
0.1 * size_scaling,
self.MAZE_HEIGHT / 2 * size_scaling),
type="box",
material="",
contype="1",
conaffinity="1",
rgba="0.0 1.0 0.0 0.5"
)
# Note: running the lines below will make the robot position wrong! (because the graph is rebuilt)
torso = self.tree.find(".//body[@name='torso']")
geoms = torso.findall(".//geom")
for geom in geoms:
if 'name' not in geom.attrib:
raise Exception("Every geom of the torso must have a name "
"defined")
_, file_path = tempfile.mkstemp(text=True, suffix='.xml')
self.tree.write(
file_path)  # write a temporary file with the robot specification instead of overwriting the original XML
model_cls = self.__class__.MODEL_CLASS
self.wrapped_env = model_cls(*self.args, file_path=file_path,
**self.kwargs) # file to the robot specifications; model_cls is AntEnv
self.t = 0
self.trajectory = []
self.wrapped_env.reset()
if len(self._init_positions) > 1:
xy = self._init_positions[self.np_random.randint(len(self._init_positions))]
self.wrapped_env.set_xy(xy)
return self._get_obs()
@property
def viewer(self):
return self.wrapped_env.viewer
def render(self, *args, **kwargs):
return self.wrapped_env.render(*args, **kwargs)
@property
def observation_space(self):
shape = self._get_obs().shape
high = np.inf * np.ones(shape)
low = -high
return gym.spaces.Box(low, high)
@property
def action_space(self):
return self.wrapped_env.action_space
def _find_robot(self):
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 'r':
return j * size_scaling, i * size_scaling
assert False, 'No robot in maze specification.'
def _find_all_robots(self):
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
coords = []
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 'r':
coords.append((j * size_scaling, i * size_scaling))
return coords
def _is_in_collision(self, pos):
i, j = self.new_xy_to_rowcol(pos)
if self.MAZE_STRUCTURE[i][j] == 1:
return True
else:
return False
def invalid_goal(self, pos):
i, j = self.new_xy_to_rowcol(pos)
if self.MAZE_STRUCTURE[i][j] in [1, -1]:
return True
else:
return False
# recover the best setting for push
def old_is_in_collision(self, pos):
x, y = pos
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] == 1:
minx = j * size_scaling - size_scaling * 0.5 - self._init_torso_x
maxx = j * size_scaling + size_scaling * 0.5 - self._init_torso_x
miny = i * size_scaling - size_scaling * 0.5 - self._init_torso_y
maxy = i * size_scaling + size_scaling * 0.5 - self._init_torso_y
if minx <= x <= maxx and miny <= y <= maxy:
# print(i, j, minx, maxx, miny, maxy, x, y)
return True
return False
def old_invalid_goal(self, pos):
x, y = pos
structure = self.MAZE_STRUCTURE
size_scaling = self.MAZE_SIZE_SCALING
for i in range(len(structure)):
for j in range(len(structure[0])):
if structure[i][j] in [1, -1]:
minx = j * size_scaling - size_scaling * 0.5 - self._init_torso_x
maxx = j * size_scaling + size_scaling * 0.5 - self._init_torso_x
miny = i * size_scaling - size_scaling * 0.5 - self._init_torso_y
maxy = i * size_scaling + size_scaling * 0.5 - self._init_torso_y
if minx <= x <= maxx and miny <= y <= maxy:
# print(i, j, minx, maxx, miny, maxy, x, y)
return True
return False
def new_xy_to_rowcol(self, pos):
x, y = pos
relative_col = math.ceil(x / self.MAZE_SIZE_SCALING - 0.5)
relative_row = math.ceil(y / self.MAZE_SIZE_SCALING - 0.5)
return self.init_row_r + relative_row, self.init_col_r + relative_col
def _rowcol_to_xy(self, j, i):
size_scaling = self.MAZE_SIZE_SCALING
minx = j * size_scaling - size_scaling * 0.5 - self._init_torso_x
maxx = j * size_scaling + size_scaling * 0.5 - self._init_torso_x
miny = i * size_scaling - size_scaling * 0.5 - self._init_torso_y
maxy = i * size_scaling + size_scaling * 0.5 - self._init_torso_y
return (minx + maxx) / 2, (miny + maxy) / 2
def step(self, action):
self.t += 1
if self._manual_collision:
old_pos = self.wrapped_env.get_xy()
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(
action)
new_pos = self.wrapped_env.get_xy()
if self._maze_id == "Push":
if self.old_is_in_collision(new_pos):
self.wrapped_env.set_xy(old_pos)
else:
if self._is_in_collision(new_pos):
self.wrapped_env.set_xy(old_pos)
else:
inner_next_obs, inner_reward, done, info = self.wrapped_env.step(
action)
next_obs = self._get_obs()
done = False
if self.GOAL is not None:
# print(self.EPS, next_obs[:2], self.GOAL[:2])
done = bool(((next_obs[:2] - self.GOAL[:2]) ** 2).sum() < self.EPS)
inner_reward = int(done)
return next_obs, inner_reward, done, info
"""Adapted from rllab maze_env_utils.py."""
import numpy as np
import math
class Move(object):
X = 11
Y = 12
Z = 13
XY = 14
XZ = 15
YZ = 16
XYZ = 17
SpinXY = 18
def can_move_x(movable):
return movable in [Move.X, Move.XY, Move.XZ, Move.XYZ,
Move.SpinXY]
def can_move_y(movable):
return movable in [Move.Y, Move.XY, Move.YZ, Move.XYZ,
Move.SpinXY]
def can_move_z(movable):
return movable in [Move.Z, Move.XZ, Move.YZ, Move.XYZ]
def can_spin(movable):
return movable in [Move.SpinXY]
def can_move(movable):
return can_move_x(movable) or can_move_y(movable) or can_move_z(movable)
def construct_maze(maze_id='Maze'):
if maze_id == 'Maze':
structure = [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
[1, 0, 'r', 0, 0, 1, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
[1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 'g', 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
]
elif maze_id == 'Maze1':
structure = [
[1, 1, 1, 1, 1],
[1, 'r', 0, 0, 1],
[1, 1, 1, 0, 1],
[1, 'g', 0, 0, 1],
[1, 1, 1, 1, 1],
]
elif maze_id == 'Push':
structure = [
[1, 1, 1, 1, 1],
[1, 0, 'r', 1, 1],
[1, 0, Move.XY, 0, 1],
[1, 1, 'g', 1, 1],
[1, 1, 1, 1, 1],
]
elif maze_id == 'Fall':
structure = [
[1, 1, 1, 1],
[1, 'r', 0, 1],
[1, 0, Move.YZ, 1],
[1, -1, -1, 1],
[1, 'g', 0, 1],
[1, 1, 1, 1],
]
elif maze_id == 'Block':
O = 'r'
structure = [
[1, 1, 1, 1, 1],
[1, O, 0, 0, 1],
[1, 0, 0, 0, 1],
[1, 0, 0, 'g', 1],
[1, 1, 1, 1, 1],
]
elif maze_id == 'BlockMaze':
O = 'r'
structure = [
[1, 1, 1, 1],
[1, O, 0, 1],
[1, 1, 0, 1],
[1, 'g', 0, 1],
[1, 1, 1, 1],
]
else:
raise NotImplementedError('The provided MazeId %s is not recognized' % maze_id)
return structure
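# Hedged reading of the structure encoding above (my annotation, not original code):
# 1 = wall, 0 = free cell, -1 = chasm to fall into, 'r' = robot start, 'g' = goal cell,
# and Move.* entries mark movable blocks (see maze_env.py).
def _example_structure():
    structure = construct_maze('Push')
    n_walls = sum(row.count(1) for row in structure)
    robot_col = structure[1].index('r')  # the robot starts in row 1 of the Push maze
    return n_walls, robot_col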
def line_intersect(pt1, pt2, ptA, ptB):
"""
Taken from https://www.cs.hmc.edu/ACM/lectures/intersections.html
this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB)
"""
DET_TOLERANCE = 0.00000001
# the first line is pt1 + r*(pt2-pt1)
# in component form:
x1, y1 = pt1
x2, y2 = pt2
dx1 = x2 - x1
dy1 = y2 - y1
# the second line is ptA + s*(ptB-ptA)
x, y = ptA
xB, yB = ptB
dx = xB - x
dy = yB - y
DET = (-dx1 * dy + dy1 * dx)
if math.fabs(DET) < DET_TOLERANCE: return (0, 0, 0, 0, 0)
# now, the determinant should be OK
DETinv = 1.0 / DET
# find the scalar amount along the "self" segment
r = DETinv * (-dy * (x - x1) + dx * (y - y1))
# find the scalar amount along the input line
s = DETinv * (-dy1 * (x - x1) + dx1 * (y - y1))
# return the average of the two descriptions
xi = (x1 + r * dx1 + x + s * dx) / 2.0
yi = (y1 + r * dy1 + y + s * dy) / 2.0
return (xi, yi, 1, r, s)
def ray_segment_intersect(ray, segment):
"""
Check if the ray originated from (x, y) with direction theta intersects the line segment (x1, y1) -- (x2, y2),
and return the intersection point if there is one
"""
(x, y), theta = ray
# (x1, y1), (x2, y2) = segment
pt1 = (x, y)
seg_len = 1
pt2 = (x + seg_len * math.cos(theta), y + seg_len * math.sin(theta))
xo, yo, valid, r, s = line_intersect(pt1, pt2, *segment)
if valid and r >= 0 and 0 <= s <= 1:
return (xo, yo)
return None
def point_distance(p1, p2):
x1, y1 = p1
x2, y2 = p2
return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
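# Hedged example (not part of the original file): cast a ray from the origin along +x
# against the vertical segment x = 1, y in [-1, 1]; the expected intersection is (1, 0).
if __name__ == "__main__":
    hit = ray_segment_intersect(ray=((0.0, 0.0), 0.0),
                                segment=((1.0, -1.0), (1.0, 1.0)))
    print(hit, point_distance((0.0, 0.0), hit))  # (1.0, 0.0) at distance 1.0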
"""Wrapper for creating the ant environment in gym_mujoco."""
import math
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class PointEnv(mujoco_env.MujocoEnv, utils.EzPickle):
FILE = "point.xml"
ORI_IND = 2
def __init__(self, file_path=None, expose_all_qpos=True):
self._expose_all_qpos = expose_all_qpos
self.add_noise = False
mujoco_env.MujocoEnv.__init__(self, file_path, 1)
utils.EzPickle.__init__(self)
@property
def physics(self):
return self.model
def _step(self, a):
return self.step(a)
def step(self, action):
action[0] = 0.2 * action[0]
qpos = np.copy(self.data.qpos)
qpos[2] += action[1]
ori = qpos[2]
# compute increment in each direction
dx = math.cos(ori) * action[0]
dy = math.sin(ori) * action[0]
# ensure that the robot is within reasonable range
qpos[0] = np.clip(qpos[0] + dx, -100, 100)
qpos[1] = np.clip(qpos[1] + dy, -100, 100)
qvel = self.data.qvel
self.set_state(qpos, qvel)
for _ in range(0, self.frame_skip):
self.sim.step()
next_obs = self._get_obs()
reward = 0
done = False
info = {}
return next_obs, reward, done, info
def _get_obs(self):
if self._expose_all_qpos:
obs = np.concatenate([
self.data.qpos.flat[:3], # Only point-relevant coords.
self.data.qvel.flat[:3]])
if self.add_noise:
obs = np.concatenate((obs, np.random.uniform(low=-2, high=2, size=20)))
return obs
return np.concatenate([
self.data.qpos.flat[2:3],
self.data.qvel.flat[:3]])
def reset_model(self):
qpos = self.init_qpos + self.np_random.uniform(
size=self.model.nq, low=-.1, high=.1)
qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
# Set everything other than point to original position and 0 velocity.
qpos[3:] = self.init_qpos[3:]
qvel[3:] = 0.
self.set_state(qpos, qvel)
return self._get_obs()
def get_ori(self):
return self.data.qpos[self.__class__.ORI_IND]
def set_xy(self, xy):
qpos = np.copy(self.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.data.qvel
self.set_state(qpos, qvel)
def get_xy(self):
qpos = np.copy(self.data.qpos)
return qpos[:2]
def viewer_setup(self):
# self.viewer.cam.trackbodyid = 1
# self.viewer.cam.distance = self.model.stat.extent * 0.7
# self.viewer.cam.lookat[2] = 0.8925
# self.viewer.cam.elevation = 0
self.viewer.cam.trackbodyid = -1
self.viewer.cam.distance = 60
self.viewer.cam.elevation = -90
from .maze_env import MazeEnv
from .point import PointEnv
class PointMazeEnv(MazeEnv):
MODEL_CLASS = PointEnv
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
ORI_IND = 2
FILE = "swimmer.xml"
def __init__(self, file_path=None, expose_all_qpos=True):
self._expose_all_qpos = expose_all_qpos
self.add_noise = False
mujoco_env.MujocoEnv.__init__(self, file_path, 4)
utils.EzPickle.__init__(self)
def _step(self, a):
return self.step(a)
def step(self, a):
ctrl_cost_coeff = 0.0001
xposbefore = self.sim.data.qpos[0]
self.do_simulation(a, self.frame_skip)
xposafter = self.sim.data.qpos[0]
reward_fwd = (xposafter - xposbefore) / self.dt
reward_ctrl = - ctrl_cost_coeff * np.square(a).sum()
reward = reward_fwd + reward_ctrl
ob = self._get_obs()
return ob, reward, False, dict(reward_fwd=reward_fwd, reward_ctrl=reward_ctrl)
def _get_obs(self):
qpos = self.sim.data.qpos
qvel = self.sim.data.qvel
# print("qpos", qpos)
# print("qvel", qvel)
return np.concatenate([qpos.flat, qvel.flat])
def reset_model(self):
self.set_state(
self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq),
self.init_qvel + self.np_random.uniform(low=-.1, high=.1, size=self.model.nv)
)
return self._get_obs()
def get_ori(self):
return self.data.qpos[self.__class__.ORI_IND]
def set_xy(self, xy):
qpos = np.copy(self.data.qpos)
qpos[0] = xy[0]
qpos[1] = xy[1]
qvel = self.data.qvel
self.set_state(qpos, qvel)
def get_xy(self):
qpos = np.copy(self.data.qpos)
return qpos[:2]
def viewer_setup(self):
# self.viewer.cam.trackbodyid = 1
# self.viewer.cam.distance = self.model.stat.extent * 0.7
# self.viewer.cam.lookat[2] = 0.8925
# self.viewer.cam.elevation = 0
self.viewer.cam.trackbodyid = -1
self.viewer.cam.distance = 60
self.viewer.cam.elevation = -90
from .maze_env import MazeEnv
from .swimmer import SwimmerEnv
class SwimmerMazeEnv(MazeEnv):
MODEL_CLASS = SwimmerEnv
# adapted from the openai gym NChain environment
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
class NChainEnv(gym.Env):
"""n-Chain environment
This variant presents moves along a linear chain of states with a continuous scalar action:
a negative action moves one step backwards and yields a small reward, while
a positive action moves one step forwards along the chain and yields no reward.
Trying to move forwards at the end of the chain yields a large reward, which can be
collected repeatedly by staying there.
At each step there is a small probability that the agent 'slips' and the
opposite transition is taken instead.
The observation is a binary encoding of the current state in the chain (0 to n-1),
returned in a goal-conditioned dict together with a desired goal marking the final state.
This environment is described in section 6.1 of:
A Bayesian Framework for Reinforcement Learning by Malcolm Strens (2000)
http://ceit.aut.ac.ir/~shiry/lecture/machine-learning/papers/BRL-2000.pdf
"""
def __init__(self, n=5, slip=0.2, small=0.001, large=1.0):
self.n = n
self.n2 = bin(n-1)
print("n2", self.n2, len(self.n2)-2)
self.slip = slip # probability of 'slipping' an action
self.small = small # payout for 'backwards' action
self.large = large # payout at end of chain for 'forwards' action
self.state = 0 # Start at beginning of the chain
self.action_space = spaces.Box(low=-1., high=1., shape=(1,))
# self.observation_space = spaces.Discrete(self.n)
self.observation_space = spaces.Discrete(len(self.n2) - 2)
self.shuffle_order = np.arange(len(self.n2) - 2)
np.random.shuffle(self.shuffle_order)
self.seed()
target = np.zeros(n)
target[n-1] = 1
self.target = target
self.reward_type = "sparse"
self.visited_count = np.zeros(n)
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def step(self, action):
# print("action", action)
success = False
info = {}
assert self.action_space.contains(action)
if self.np_random.rand() < self.slip:
action = 0 - action # agent slipped, reverse action taken
if action < 0 and self.state > 0: # 'backwards': go back to the beginning, get small reward
reward = self.small
self.state -= 1
elif action > 0 and self.state < self.n - 1: # 'forwards': go up along the chain
reward = 0
self.state += 1
elif self.state == self.n - 1: # 'forwards': stay at the end of the chain, collect large reward
reward = self.large
success = True
else:
reward = 0
done = False
info["is_success"] = success
# print("state", self.state)
if self.visited_count[self.state] == 0:
self.visited_count[self.state] = 1
return self.get_obs(), reward, done, info
def reset(self):
self.state = 0
if self.visited_count[self.state] == 0:
self.visited_count[self.state] = 1.
return self.get_obs()
def get_obs(self):
new = np.zeros(len(self.n2) - 2)
# new[self.state] = 1
new2 = bin(self.state)
new2 = list(new2[2:])
new2.reverse()
for i, ele in enumerate(new2):
new[-(i+1)] = int(ele)
new = new[::-1]
# new = new[self.shuffle_order]
return {
"observation": np.copy(new),
"achieved_goal": np.copy(new),
"desired_goal": np.copy(self.target),
}
@property
def coverage(self):
return np.sum(self.visited_count) / self.n
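# Hedged usage sketch (not part of the original file): the action is a scalar in [-1, 1]
# and the observation is a binary encoding of the current chain state.
if __name__ == "__main__":
    env = NChainEnv(n=8)
    obs = env.reset()
    for _ in range(5):
        obs, reward, done, info = env.step(np.array([1.0]))  # keep trying to move forward
    print(obs["observation"], reward, info["is_success"], env.coverage)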
import gym
import numpy as np
import cv2
from gym import spaces
def line_intersection(line1, line2):
# calculate the intersection point
xdiff = (line1[0][0] - line1[1][0], line2[0][0] - line2[1][0])
ydiff = (line1[0][1] - line1[1][1], line2[0][1] - line2[1][1])
def det(a, b):
return a[0] * b[1] - a[1] * b[0]
div = det(xdiff, ydiff)
if div == 0:
raise Exception('lines do not intersect')
d = (det(*line1), det(*line2))
x = det(d, xdiff) / div
y = det(d, ydiff) / div
return x, y
def check_cross(x0, y0, x1, y1):
x0 = np.array(x0)
y0 = np.array(y0)
x1 = np.array(x1)
y1 = np.array(y1)
return np.cross(x1 - x0, y0 - x0), np.cross(y0 - x0, y1 - x0)
def check_intersection(x0, y0, x1, y1):
EPS = 1e-10
def sign(x):
if x > EPS:
return 1
if x < -EPS:
return -1
return 0
f1, f2 = check_cross(x0, y0, x1, y1)
f3, f4 = check_cross(x1, y1, x0, y0)
if sign(f1) == sign(f2) and sign(f3) == sign(f4) and sign(f1) != 0 and sign(f3) != 0:
return True
return False
class PlaneBase(gym.Env):
def __init__(self, rects, R, is_render=False, size=512):
self.rects = rects
self.n = len(self.rects)
self.size = size
self.map = np.ones((size, size, 3), dtype=np.uint8) * 255
self.R = R
self.R2 = R ** 2
self.board = np.array(
[[0, 0],
[1, 1]],
dtype='float32')
self.action_space = gym.spaces.Box(
low=-R, high=R, shape=(2,), dtype='float32')
self.observation_space = gym.spaces.Box(
low=0., high=1., shape=(2,), dtype='float32')
if is_render:
cv2.namedWindow('image', cv2.WINDOW_NORMAL)
self.image_name = 'image'
for i in range(self.n):
for j in range(i + 1, self.n):
if check_intersection(self.rects[i][0], self.rects[i][1], self.rects[j][0], self.rects[j][1]):
raise Exception("Rectangles intersect with each other")
for ((x0, y0), (x1, y1)) in rects:
x0, y0 = int(x0 * size), int(y0 * size)
x1, y1 = int(x1 * size), int(y1 * size)
cv2.rectangle(self.map, (x0, y0), (x1, y1), (0, 255, 0), 1)
ps = np.array([
[x0, y0],
[x1, y0],
[x1, y1],
[x0, y1],
], dtype=np.int32)
cv2.fillConvexPoly(self.map, ps, (127, 127, 127))
self.state = (0, 0)
self.reset()
def restore(self, obs):
self.state = (float(obs[0]), float(obs[1]))
def rect_lines(self, rect):
(x0, y0), (x1, y1) = rect
yield (x0, y0), (x1, y0)
yield (x1, y0), (x1, y1)
yield (x1, y1), (x0, y1)
yield (x0, y1), (x0, y0)
def l2dist(self, x, y):
return ((y[0] - x[0]) ** 2) + ((y[1] - x[1]) ** 2)
def check_inside(self, p):
EPS = 1e-10
for i in self.rects:
if p[0] > i[0][0] + EPS and p[0] < i[1][0] - EPS and p[1] > i[0][1] + EPS and p[1] < i[1][1] - EPS:
return True
return False
def step(self, action):
dx, dy = action
l = 0.0001
p = (self.state[0] + dx * l, self.state[1] + dy * l)
if self.check_inside(p) or p[0] > 1 or p[1] > 1 or p[0] < 0 or p[1] < 0:
return np.array(self.state), 0, False, {}
dest = (self.state[0] + dx, self.state[1] + dy)
md = self.l2dist(self.state, dest)
_dest = dest
line = (self.state, dest)
for i in list(self.rects) + [self.board]:
for l in self.rect_lines(i):
if check_intersection(self.state, dest, l[0], l[1]):
inter_point = line_intersection(line, l)
d = self.l2dist(self.state, inter_point)
if d < md:
md = d
_dest = inter_point
self.restore(_dest)
return np.array(self.state), -md, False, {}
def render(self, mode='human'):
image = self.map.copy()
x, y = self.state
x = int(x * self.size)
y = int(y * self.size)
cv2.circle(image, (x, y), 5, (255, 0, 255), -1)
if mode == 'human':
cv2.imshow('image', image)
cv2.waitKey(2)
else:
return image
def reset(self):
inside_rect = True
while inside_rect:
a, b = np.random.random(), np.random.random()
inside_rect = self.check_inside((a, b))
self.state = (a, b)
return np.array(self.state)
class NaivePlane(PlaneBase):
def __init__(self, is_render=True, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[128, 128], [300, 386]]) / 512,
np.array([[400, 400], [500, 500]]) / 512,
],
R, is_render=is_render, size=size)
class NaivePlane2(PlaneBase):
# two rectangles
def __init__(self, is_render=True, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[64, 64], [256, 256]]) / 512,
np.array([[300, 128], [400, 500]]) / 512,
],
R, is_render=is_render, size=size)
class NaivePlane3(PlaneBase):
# four rectangles
def __init__(self, is_render=True, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[64, 64], [192, 192]]) / 512,
np.array([[320, 64], [448, 192]]) / 512,
np.array([[320, 320], [448, 448]]) / 512,
np.array([[64, 320], [192, 448]]) / 512,
],
R, is_render=is_render, size=size)
class NaivePlane4(PlaneBase):
# two rectangles
def __init__(self, is_render=True, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[64, 64], [192, 512]]) / 512,
np.array([[320, 64], [448, 512]]) / 512,
],
R, is_render=is_render, size=size)
class NaivePlane5(PlaneBase):
# one rectangle
def __init__(self, is_render=False, R=300, size=512):
PlaneBase.__init__(self,
[
np.array([[0, 1. / 3], [2. / 3, 2. / 3]]),
],
R, is_render=is_render, size=size)
if __name__ == '__main__':
env = NaivePlane5()
obs = env.reset()
while True:
print(obs)
env.render()
while True:
try:
print('entering the dir (x, y)')
act = input().strip().split(' ')
act = float(act[0]) / 512, float(act[1]) / 512
break
except KeyboardInterrupt as e:
raise e
except:
continue
obs, reward, _, _ = env.step(act)
import cv2
import torch
import numpy as np
# Record and store a rollout video for remote visualization.
def play(env, policy, video_path="tmp.avi", time_limit=500, device='cpu'):
out = None
obs = env.reset()
num = 0
rew = None
action = None
info = None
flag = False
while True:
img = env.unwrapped.render(mode='rgb_array')[:, :, ::-1].copy()
'''
if True and isinstance(obs, dict):
np.set_printoptions(precision=3)
achieved = (float(obs['achieved_goal'][0]), float(obs['achieved_goal'][1]))
desired = (float(obs['desired_goal'][0]), float(obs['desired_goal'][1]))
cv2.putText(img, " obs: {:.3f} {:.3f}".format(achieved[0], achieved[1]), (400,25), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
cv2.putText(img, "goal: {:.3f} {:.3f}".format(desired[0], desired[1]), (400,50), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
if rew is not None:
cv2.putText(img, "rew: {:.3f}".format(rew), (400,75), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
if action is not None:
action = [float(i) for i in action][:2]
cv2.putText(img, "rew: {:.3f} {:.3f}".format(action[0], action[1]), (400,100), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
if info is not None:
if 'is_success' in info:
cv2.putText(img, "success? {}".format(info['is_success']), (400,125), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
cv2.putText(img, "step {}".format(num), (400,150), cv2.FONT_HERSHEY_SIMPLEX, 0.3, 255)
flag = True
'''
if out is None:
out = cv2.VideoWriter(
video_path, cv2.VideoWriter_fourcc(*'XVID'), 20.0, (img.shape[1], img.shape[0]))
out.write(img)
if isinstance(obs, dict):
goal = torch.tensor(obs['desired_goal'], dtype=torch.float32).to(device)
obs = torch.tensor(obs['observation'], dtype=torch.float32).to(device)
action = policy(obs.unsqueeze(0), goal.unsqueeze(0))
if isinstance(action, torch.Tensor):
action = action.detach().cpu().numpy()
else:
action = policy(np.array(obs)[None]).action[0].detach().cpu().numpy()
obs, rew, done, info = env.step(action)
if done:
obs = env.reset()
num += 1
# assert not info['is_success']
flag = True
if not flag:
print(num, info, rew, done, env.goal, action)
if num == time_limit - 1:
break
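# Hedged usage sketch (not part of the original file): `play` expects a policy that maps
# (observation, goal) batches to actions when the environment returns dict observations.
# The random policy below is a stand-in for a trained low-level actor, and the env id
# assumes the registrations from goal_env.mujoco have been imported.
if __name__ == "__main__":
    import gym
    from goal_env.mujoco import *  # registers AntMaze1-v1 and friends
    env = gym.make("AntMaze1-v1")

    def random_policy(obs, goal):
        return torch.from_numpy(env.action_space.sample())

    play(env, random_policy, video_path="random_rollout.avi", time_limit=100)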
import torch
from torch import nn
import numpy as np
class L1(nn.Module):
def __init__(self):
nn.Module.__init__(self)
def forward(self, s, t):
if isinstance(s, np.ndarray):
s = torch.from_numpy(s).float()
if isinstance(t, np.ndarray):
t = torch.from_numpy(t).float()
out = torch.abs(s - t)
return out.view(out.size(0), -1).sum(dim=1)
class L2(nn.Module):
def __init__(self):
nn.Module.__init__(self)
def forward(self, s, t):
out = (s - t) ** 2
return (out.view(out.size(0), -1).sum(dim=1) + 1e-14) ** 0.5
class DotProd(nn.Module):
def __init__(self):
nn.Module.__init__(self)
def forward(self, s, t):
if isinstance(s, np.ndarray):
s = torch.from_numpy(s).float()
if isinstance(t, np.ndarray):
t = torch.from_numpy(t).float()
out = (s * t[:, None, :]).sum(dim=2)[:, 0]
return out
class MLPDist(nn.Module):
def __init__(self, inp_dim):
nn.Module.__init__(self)
self.dim = inp_dim
self.mlp = nn.Sequential(
nn.Linear(self.dim * 2, self.dim),
nn.ReLU(),
nn.Linear(self.dim, self.dim),
nn.ReLU(),
nn.Linear(self.dim, 1),
)
def forward(self, s, t):
if isinstance(s, np.ndarray):
s = torch.from_numpy(s).float()
if isinstance(t, np.ndarray):
t = torch.from_numpy(t).float()
out = self.mlp(torch.cat([s, t], dim=1))
return out.squeeze(-1)
class Distance(nn.Module):
def __init__(self, encoder, distance):
nn.Module.__init__(self)
self.encoder = encoder
self.metrics = distance
def forward(self, s, t):
s = self.encoder(s)
t = self.encoder(t)
return self.metrics(s, t)
class MultiEncoderDistance(nn.Module):
def __init__(self, encoder_s, encoder_t, distance):
nn.Module.__init__(self)
self.encoder_s = encoder_s
self.encoder_t = encoder_t
self.metrics = distance
def forward(self, s, t):
s = self.encoder_s(s)
t = self.encoder_t(t)
return self.metrics(s, t)
import torch.nn.functional as F
import sys
sys.path.append('../')
from models.distance import *
import numpy as np
from torch.distributions.multivariate_normal import MultivariateNormal
import torch.distributions as D
"""
the input x in both networks should be [o, g], where o is the observation and g is the goal.
"""
def initialize_metrics(metric, dim):
if metric == 'L1':
return L1()
elif metric == 'L2':
return L2()
elif metric == 'dot':
return DotProd()
elif metric == 'MLP':
return MLPDist(dim)
else:
raise NotImplementedError
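# Hedged example (not part of the original file): initialize_metrics returns one of the
# distance modules from models.distance; the tensors below are dummy data.
def _example_metric_usage():
    metric = initialize_metrics('L2', dim=2)
    s, t = torch.randn(4, 2), torch.randn(4, 2)
    return metric(s, t)  # shape (4,): Euclidean distance between each pair of rows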
# define the actor network
class actor(nn.Module):
def __init__(self, env_params, goal_dim):
super(actor, self).__init__()
self.max_action = env_params['action_max']
self.fc1 = nn.Linear(env_params['low_dim'] + goal_dim, 400)
self.fc2 = nn.Linear(400, 400)
self.fc3 = nn.Linear(400, 400)
self.fc4 = nn.Linear(400, 400)
self.action_out = nn.Linear(400, env_params['action'])
def forward(self, obs, goal):
x = torch.cat([obs, goal], dim=1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
actions = self.max_action * torch.tanh(self.action_out(x))
return actions
# define the actor network
class Inverse_goal(nn.Module):
def __init__(self, env_params, goal_dim, hi_max_action):
super(Inverse_goal, self).__init__()
self.max_action = hi_max_action
self.fc1 = nn.Linear(env_params['obs'] * 2, 400)
self.fc2 = nn.Linear(400, 400)
self.fc3 = nn.Linear(400, 400)
self.fc4 = nn.Linear(400, 400)
self.action_out = nn.Linear(400, goal_dim)
def forward(self, obs, goal):
x = torch.cat([obs, goal], dim=1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
actions = self.max_action * torch.tanh(self.action_out(x))
return actions
# define the high-level actor network
class Hi_actor(nn.Module):
def __init__(self, env_params, real_goal_dim, maze_high, shallow, sigmoid=False):
super(Hi_actor, self).__init__()
self.max_action = maze_high
self.fc1 = nn.Linear(env_params['hi_dim'] + env_params['goal'], 400)
self.fc2 = nn.Linear(400, 400)
self.fc3 = nn.Linear(400, 400)
self.fc4 = nn.Linear(400, 400)
self.action_out = nn.Linear(400, real_goal_dim)
self.sigmoid = sigmoid
self.shallow = shallow
def forward(self, obs, goal):
x = torch.cat([obs, goal], dim=1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
if not self.shallow:
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
if self.sigmoid:
actions = self.max_action * torch.sigmoid(self.action_out(x))
else:
actions = self.max_action * torch.tanh(self.action_out(x))
return actions
class Hi_critic(nn.Module):
def __init__(self, env_params, args, real_goal_dim, maze_high):
super(Hi_critic, self).__init__()
self.max_action = maze_high
self.inp_dim = env_params['hi_dim'] + real_goal_dim + env_params['goal']
self.out_dim = 1
self.mid_dim = 400
self.gamma = args.gamma
if args.hi_layer == 1:
models = [nn.Linear(self.inp_dim, self.out_dim)]
else:
models = [nn.Linear(self.inp_dim, self.mid_dim)]
if args.hi_layer > 2:
for i in range(args.hi_layer - 2):
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.hi_layer > 1:
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.base = nn.Sequential(*models)
def forward(self, obs, goal, actions):
x = torch.cat([obs, actions / self.max_action], dim=1)
x = torch.cat([x, goal], dim=1)
dist = self.base(x)
return dist
class critic(nn.Module):
def __init__(self, env_params, args, goal_dim):
super(critic, self).__init__()
self.max_action = env_params['action_max']
self.inp_dim = env_params['low_dim'] + env_params['action'] + goal_dim
self.out_dim = 1
self.mid_dim = 400
if args.layer == 1:
models = [nn.Linear(self.inp_dim, self.out_dim)]
else:
models = [nn.Linear(self.inp_dim, self.mid_dim)]
if args.layer > 2:
for i in range(args.layer - 2):
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.layer > 1:
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.base = nn.Sequential(*models)
def forward(self, obs, goal, actions):
x = torch.cat([obs, actions / self.max_action], dim=1)
x = torch.cat([x, goal], dim=1)
dist = self.base(x)
return dist
class Critic_double(nn.Module):
def __init__(self, env_params, args):
super(Critic_double, self).__init__()
self.max_action = env_params['action_max']
self.inp_dim = env_params['obs'] + env_params['action'] + env_params['goal']
self.out_dim = 1
self.mid_dim = 400
if args.layer == 1:
models = [nn.Linear(self.inp_dim, self.out_dim)]
else:
models = [nn.Linear(self.inp_dim, self.mid_dim)]
if args.layer > 2:
for i in range(args.layer - 2):
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.layer > 1:
models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.base = nn.Sequential(*models)
if args.layer == 1:
models1 = [nn.Linear(self.inp_dim, self.out_dim)]
else:
models1 = [nn.Linear(self.inp_dim, self.mid_dim)]
if args.layer > 2:
for i in range(args.layer - 2):
models1 += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.layer > 1:
models1 += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.base1 = nn.Sequential(*models1)
def forward(self, obs, goal, actions):
x = torch.cat([obs, actions / self.max_action], dim=1)
x = torch.cat([x, goal], dim=1)
dist = self.base(x)
dist1 = self.base1(x)
return dist, dist1
class criticWrapper(nn.Module):
def __init__(self, env_params, args, goal_dim):
super(criticWrapper, self).__init__()
self.base = critic(env_params, args, goal_dim)
self.args = args
self.gamma = args.gamma
def forward(self, obs, goal, actions):
dist = self.base(obs, goal, actions)
self.alpha = np.log(self.gamma)
return -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma)
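# Sketch of the identity behind the wrapper above (my reading, not original code): with
# alpha = log(gamma), -(1 - exp(d * alpha)) / (1 - gamma) = -(1 - gamma**d) / (1 - gamma),
# i.e. the discounted return of receiving reward -1 on each of the d steps to the goal.
def _check_distance_to_value(gamma=0.99, d=5):
    lhs = -(1 - gamma ** d) / (1 - gamma)
    rhs = -sum(gamma ** t for t in range(d))
    assert abs(lhs - rhs) < 1e-9
    return lhs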
class doubleWrapper(nn.Module):
def __init__(self, env_params, args):
super(doubleWrapper, self).__init__()
self.base = Critic_double(env_params, args)
self.args = args
self.gamma = args.gamma
def forward(self, obs, goal, actions):
dist, dist1 = self.base(obs, goal, actions)
self.alpha = np.log(self.gamma)
return -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma), -(1 - torch.exp(dist1 * self.alpha)) / (1 - self.gamma)
def Q1(self, obs, goal, actions):
dist, _ = self.base(obs, goal, actions)
self.alpha = np.log(self.gamma)
return -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma)
class EmbedNet(nn.Module):
def __init__(self, env_params, args):
super(EmbedNet, self).__init__()
self.max_action = env_params['action_max']
self.obs_dim = env_params['obs'] + env_params['action']
self.goal_dim = env_params['goal']
self.out_dim = 128
self.mid_dim = 400
if args.layer == 1:
obs_models = [nn.Linear(self.obs_dim, self.out_dim)]
goal_models = [nn.Linear(self.goal_dim, self.out_dim)]
else:
obs_models = [nn.Linear(self.obs_dim, self.mid_dim)]
goal_models = [nn.Linear(self.goal_dim, self.mid_dim)]
if args.layer > 2:
for i in range(args.layer - 2):
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
goal_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if args.layer > 1:
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
goal_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.obs_encoder = nn.Sequential(*obs_models)
self.goal_encoder = nn.Sequential(*goal_models)
self.metric = initialize_metrics(args.metric, self.out_dim)
def forward(self, obs, goal, actions):
s = torch.cat([obs, actions / self.max_action], dim=1)
s = self.obs_encoder(s)
g = self.goal_encoder(goal)
dist = self.metric(s, g)
return dist
class Qnet(nn.Module):
def __init__(self, env_params, args):
super(Qnet, self).__init__()
self.mid_dim = 100
self.metric = args.metric
self.action_n = env_params['action_dim']
self.obs_fc1 = nn.Linear(env_params['obs'], 256)
self.obs_fc2 = nn.Linear(256, self.mid_dim * self.action_n)
self.goal_fc1 = nn.Linear(env_params['goal'], 256)
self.goal_fc2 = nn.Linear(256, self.mid_dim)
if self.metric == 'MLP':
self.mlp = nn.Sequential(
nn.Linear(self.mid_dim * (self.action_n + 1), 128),
nn.ReLU(),
nn.Linear(128, self.action_n),
)
def forward(self, obs, goal):
s = F.relu(self.obs_fc1(obs))
s = F.relu(self.obs_fc2(s))
s = s.view(s.size(0), self.action_n, self.mid_dim)
g = F.relu(self.goal_fc1(goal))
g = F.relu(self.goal_fc2(g))
if self.metric == 'L1':
dist = torch.abs(s - g[:, None, :]).sum(dim=2)
elif self.metric == 'dot':
dist = -(s * g[:, None, :]).sum(dim=2)
elif self.metric == 'L2':
dist = ((torch.abs(s - g[:, None, :]) ** 2).sum(dim=2) + 1e-14) ** 0.5
elif self.metric == 'MLP':
s = s.view(s.size(0), -1)
x = torch.cat([s, g], dim=1)
dist = self.mlp(x)
else:
raise NotImplementedError
return dist
class QNetWrapper(nn.Module):
def __init__(self, env_params, args):
super(QNetWrapper, self).__init__()
self.base = Qnet(env_params, args)
self.args = args
self.gamma = args.gamma
def forward(self, obs, goal):
dist = self.base(obs, goal)
self.alpha = np.log(self.gamma)
qval = -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma)
return qval
class EmbedNetWrapper(nn.Module):
def __init__(self, env_params, args):
super(EmbedNetWrapper, self).__init__()
self.base = EmbedNet(env_params, args)
self.args = args
self.gamma = args.gamma
def forward(self, obs, goal, actions):
dist = self.base(obs, goal, actions)
self.alpha = np.log(self.gamma)
return -(1 - torch.exp(dist * self.alpha)) / (1 - self.gamma)
class RepresentationNetwork(nn.Module):
def __init__(self, env_params, layer, abs_range, out_dim):
super(RepresentationNetwork, self).__init__()
self.obs_dim = env_params['obs']
self.out_dim = out_dim
self.mid_dim = 100
if layer == 1:
obs_models = [nn.Linear(self.obs_dim, self.out_dim)]
else:
obs_models = [nn.Linear(self.obs_dim, self.mid_dim)]
if layer > 2:
for i in range(layer - 2):
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
if layer > 1:
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.obs_encoder = nn.Sequential(*obs_models)
self.abs_range = abs_range
def forward(self, obs):
if len(obs.shape) == 1:
obs = obs.unsqueeze(0)
s = self.obs_encoder(obs)
return s
class DynamicsNetwork(nn.Module):
def __init__(self, env_params, abs_range, out_dim, tanh_output, use_prob, device):
super(DynamicsNetwork, self).__init__()
self.obs_dim = env_params['obs']
self.out_dim = out_dim
self.mid_dim = 100
# obs encoder
obs_models = [nn.Linear(self.obs_dim, self.mid_dim)]
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.mid_dim)]
obs_models += [nn.ReLU(), nn.Linear(self.mid_dim, self.out_dim)]
self.obs_encoder = nn.Sequential(*obs_models)
self.abs_range = abs_range
# goal input
self.goal_input = nn.Linear(out_dim, int(self.mid_dim / 2))
self.dynamics_layer = nn.Linear(int(self.mid_dim / 2) + self.out_dim, self.mid_dim)
self.output_layer = nn.Linear(self.mid_dim, self.out_dim)
self.tanh_output = tanh_output
self.probabilistic_output = use_prob
self.device = device
def phi(self, obs):
if len(obs.shape) == 1:
obs = obs.unsqueeze(0)
s = self.obs_encoder(obs)
return s
def forward(self, obs, hi_action):
latent_s = self.obs_encoder(obs)
action_out = self.goal_input(hi_action)
action_out = F.relu(action_out)
x = torch.cat([latent_s, action_out], 1)
x = self.dynamics_layer(x)
x = F.relu(x)
x = self.output_layer(x)
if self.tanh_output:
x = self.abs_range * torch.tanh(x)
return x
elif self.probabilistic_output:
std_dev = torch.ones(x.shape[0], self.out_dim).to(self.device)
return D.Independent(D.Normal(x, std_dev), 1)
else:
return x
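# Hedged shape-check sketch (not part of the original file): run dummy observations
# through the representation and dynamics networks defined above. The env_params,
# abs_range and out_dim values below are placeholders.
def _example_latent_forward():
    env_params = {'obs': 29}
    phi = RepresentationNetwork(env_params, layer=3, abs_range=20, out_dim=2)
    dyn = DynamicsNetwork(env_params, abs_range=20, out_dim=2,
                          tanh_output=True, use_prob=False, device='cpu')
    obs = torch.randn(8, 29)
    subgoal = torch.randn(8, 2)        # a latent-space high-level action
    z = phi(obs)                       # (8, 2) latent state
    z_next_pred = dyn(obs, subgoal)    # (8, 2) predicted next latent state
    return z.shape, z_next_pred.shape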
import numpy as np
import gym
from arguments.arguments_hier_sac import get_args_ant, get_args_chain
from algos.hier_sac import hier_sac_agent
from goal_env.mujoco import *
import random
import torch
def get_env_params(env):
obs = env.reset()
# close the environment
params = {'obs': obs['observation'].shape[0], 'goal': obs['desired_goal'].shape[0],
'action': env.action_space.shape[0], 'action_max': env.action_space.high[0],
'max_timesteps': env._max_episode_steps}
return params
def launch(args):
# create the ddpg_agent
env = gym.make(args.env_name)
test_env = gym.make(args.test)
# if args.env_name == "AntPush-v1":
# test_env1 = gym.make("AntPushTest1-v1")
# test_env2 = gym.make("AntPushTest2-v1")
# elif args.env_name == "AntMaze1-v1":
# test_env1 = gym.make("AntMaze1Test1-v1")
# test_env2 = gym.make("AntMaze1Test2-v1")
# else:
test_env1 = test_env2 = None
print("test_env", test_env1, test_env2)
# set random seeds for reproduce
env.seed(args.seed)
if args.env_name != "NChain-v1":
env.env.env.wrapped_env.seed(args.seed)
test_env.env.env.wrapped_env.seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.device != 'cpu':
torch.cuda.manual_seed(args.seed)
gym.spaces.prng.seed(args.seed)
# get the environment parameters
if args.env_name[:3] in ["Ant", "Poi", "Swi"]:
env.env.env.visualize_goal = args.animate
test_env.env.env.visualize_goal = args.animate
env_params = get_env_params(env)
env_params['max_test_timesteps'] = test_env._max_episode_steps
# create the ddpg agent to interact with the environment
sac_trainer = hier_sac_agent(args, env, env_params, test_env, test_env1, test_env2)
if args.eval:
if not args.resume:
print("random policy !!!")
# sac_trainer._eval_hier_agent(test_env)
# sac_trainer.vis_hier_policy()
# sac_trainer.cal_slow()
# sac_trainer.visualize_representation(100)
# sac_trainer.vis_learning_process()
# sac_trainer.picvideo('fig/final/', (1920, 1080))
else:
sac_trainer.learn()
# get the params
args = get_args_ant()
# args = get_args_chain()
# args = get_args_fetch()
# args = get_args_point()
if __name__ == '__main__':
launch(args)