Commit c9059c39 by Werner Duvaud

Fix cpu mode

parent a5724559
@@ -11,12 +11,7 @@ It is designed to be easily adaptable for every games or reinforcement learning
MuZero is a model-based reinforcement learning algorithm, successor of AlphaZero. It learns to master games without knowing the rules: it only knows the available actions, and from those it learns to play and master the game. It is more efficient than similar algorithms such as [AlphaZero](https://arxiv.org/abs/1712.01815), [SimPLe](https://arxiv.org/abs/1903.00374) and [World Models](https://arxiv.org/abs/1803.10122).
- It uses [PyTorch](https://github.com/pytorch/pytorch) and [Ray](https://github.com/ray-project/ray) for running the different components simultaneously. There is a complete GPU support.
+ It uses [PyTorch](https://github.com/pytorch/pytorch) and [Ray](https://github.com/ray-project/ray) for running the different components simultaneously. GPU training is supported. See [How it works](https://github.com/werner-duvaud/muzero-general/wiki/How-MuZero-works)
- There are four components, implemented as classes that each run simultaneously in a dedicated thread.
- The `shared storage` holds the latest neural network weights, the `self-play` workers use those weights to generate self-play games and store them in the `replay buffer`. Finally, those played games are used to `train` the network, and the new weights are stored back in the shared storage. The circle is complete. See [How it works](https://github.com/werner-duvaud/muzero-general/wiki/How-MuZero-works)
- Those components are launched and managed from the MuZero class in `muzero.py`, and the structure of the neural network is defined in `models.py`.
All performances are tracked and displayed in real time in TensorBoard.
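The component loop removed from the README above (shared storage → self-play → replay buffer → trainer → back to shared storage) can be pictured with a minimal, self-contained Ray sketch. The actor names and methods below are illustrative simplifications, not the repository's actual classes:

```python
import ray

ray.init()


@ray.remote
class SharedStorage:
    """Holds the latest network weights (hypothetical, simplified)."""

    def __init__(self, weights):
        self.weights = weights

    def get_weights(self):
        return self.weights

    def set_weights(self, weights):
        self.weights = weights


@ray.remote
class ReplayBuffer:
    """Stores finished self-play games (hypothetical, simplified)."""

    def __init__(self):
        self.games = []

    def save_game(self, game):
        self.games.append(game)

    def num_games(self):
        return len(self.games)


# Self-play workers would pull weights from the storage, play games and push
# them into the buffer; the trainer would sample from the buffer, update the
# network and write new weights back to the storage, closing the loop.
storage = SharedStorage.remote({"step": 0})
buffer = ReplayBuffer.remote()
buffer.save_game.remote(["dummy", "trajectory"])
print(ray.get(buffer.num_games.remote()))      # -> 1
print(ray.get(storage.get_weights.remote()))   # -> {'step': 0}
```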
...
import gym
import numpy
+ import torch

class MuZeroConfig:
@@ -42,6 +43,7 @@ class MuZeroConfig:
self.window_size = 1000  # Number of self-play games to keep in the replay buffer
self.td_steps = 10  # Number of steps in the future to take into account for calculating the target value
self.training_delay = 0  # Number of seconds to wait after each training step to adjust the self-play / training ratio and avoid over/underfitting
+ self.training_device = "cuda" if torch.cuda.is_available() else "cpu"  # Train on GPU if available
self.weight_decay = 1e-4  # L2 weights regularization
self.momentum = 0.9
@@ -55,6 +57,7 @@ class MuZeroConfig:
### Test
self.test_episodes = 2  # Number of games played to evaluate the network

def visit_softmax_temperature_fn(self, trained_steps):
"""
Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses.
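The body of `visit_softmax_temperature_fn` is collapsed in this diff. A typical schedule for this kind of function (an illustrative guess with made-up thresholds, not necessarily what the file contains) steps the temperature down as training progresses:

```python
def visit_softmax_temperature_fn(self, trained_steps):
    """
    Return the softmax temperature applied to MCTS visit counts: high early
    on for exploration, lower later so action selection becomes greedier.
    (The thresholds below are illustrative, not the repository's values.)
    """
    if trained_steps < 0.5 * self.training_steps:
        return 1.0
    elif trained_steps < 0.75 * self.training_steps:
        return 0.5
    else:
        return 0.25
```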
...
@@ -36,20 +36,21 @@ class MuZeroConfig:
### Training
self.results_path = "./pretrained"  # Path to store the model weights
- self.training_steps = 10000  # Total number of training steps (i.e. weight updates according to a batch)
+ self.training_steps = 20000  # Total number of training steps (i.e. weight updates according to a batch)
self.batch_size = 128  # Number of parts of games to train on at each training step
self.num_unroll_steps = 5  # Number of game moves to keep for every batch element
self.checkpoint_interval = 10  # Number of training steps before using the model for self-playing
self.window_size = 1000  # Number of self-play games to keep in the replay buffer
self.td_steps = 10  # Number of steps in the future to take into account for calculating the target value
self.training_delay = 0  # Number of seconds to wait after each training step to adjust the self-play / training ratio and avoid over/underfitting
+ self.training_device = "cuda" if torch.cuda.is_available() else "cpu"  # Train on GPU if available
self.weight_decay = 1e-4  # L2 weights regularization
self.momentum = 0.9

# Exponential learning rate schedule
self.lr_init = 0.01  # Initial learning rate
- self.lr_decay_rate = 0.005
+ self.lr_decay_rate = 0.001
self.lr_decay_steps = 10000
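For reference, an exponential schedule of the form used in the MuZero pseudocode, `lr = lr_init * lr_decay_rate ** (training_step / lr_decay_steps)`, gives the following values with the new settings. The trainer's actual formula is not shown in this hunk, so treat the exact expression as an assumption:

```python
def exponential_lr(training_step, lr_init=0.01, lr_decay_rate=0.001, lr_decay_steps=10000):
    # Assumed schedule: decay by a factor of lr_decay_rate every lr_decay_steps steps.
    return lr_init * lr_decay_rate ** (training_step / lr_decay_steps)

print(exponential_lr(0))        # 0.01
print(exponential_lr(10000))    # ≈ 1e-05
print(exponential_lr(20000))    # ≈ 1e-08 (end of the 20000 training steps)
```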
...
@@ -44,6 +44,11 @@ class MuZero:
)
)
raise err
+ try:
+     os.mkdir(os.path.join(self.config.results_path))
+ except FileExistsError:
+     pass
# Fix random generator seed for reproducibility
numpy.random.seed(self.config.seed)
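The `try`/`except` added above creates the results directory if it is missing; note that `os.path.join` with a single argument is a no-op. A shorter equivalent (just a suggestion, not what the commit does) is:

```python
import os

# Equivalent to the try/except above: create the directory if needed and
# silently accept an existing one (also creates missing parent directories).
os.makedirs(self.config.results_path, exist_ok=True)
```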
@@ -65,10 +70,7 @@ class MuZero:
# Initialize workers
training_worker = trainer.Trainer.remote(
- copy.deepcopy(self.muzero_weights),
- self.config,
- # Train on GPU if available
- "cuda" if torch.cuda.is_available() else "cpu",
+ copy.deepcopy(self.muzero_weights), self.config
)
shared_storage_worker = shared_storage.SharedStorage.remote(
copy.deepcopy(self.muzero_weights), self.game_name, self.config,
@@ -79,12 +81,11 @@ class MuZero:
copy.deepcopy(self.muzero_weights),
self.Game(self.config.seed + seed),
self.config,
- "cpu",
)
for seed in range(self.config.num_actors)
]
test_worker = self_play.SelfPlay.remote(
- copy.deepcopy(self.muzero_weights), self.Game(), self.config, "cpu",
+ copy.deepcopy(self.muzero_weights), self.Game(), self.config
)
# Launch workers
@@ -145,7 +146,7 @@ class MuZero:
print("Testing...") print("Testing...")
ray.init() ray.init()
self_play_workers = self_play.SelfPlay.remote( self_play_workers = self_play.SelfPlay.remote(
copy.deepcopy(self.muzero_weights), self.Game(), self.config, "cpu", copy.deepcopy(self.muzero_weights), self.Game(), self.config
) )
test_rewards = [] test_rewards = []
with torch.no_grad(): with torch.no_grad():
@@ -169,5 +170,5 @@ if __name__ == "__main__":
muzero = MuZero("cartpole")
muzero.train()
- # muzero.load_model()
+ #muzero.load_model()
muzero.test()
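The commented-out line hints at the evaluation-only workflow. Assuming `load_model()` restores the weights previously saved under `config.results_path`, a test-only run would look like this:

```python
import muzero as mz

muzero = mz.MuZero("cartpole")
muzero.load_model()  # assumed to restore weights from config.results_path
muzero.test()
```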
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "muzero.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Google colab stuffs\n",
"!pip install -r requirements.txt\n",
"!pip uninstall -y pyarrow\n",
"%load_ext tensorboard\n",
"# If you have an import issue with ray in google colab, restart the environment (execution menu)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# You must have the repository imported along with your notebook. \n",
"# For google colab, click on \">\" buttton (left) and import files (muzero.py, self_play.py, ...).\n",
"\n",
"import muzero as mz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Train on cartpole game\n",
"muzero = mz.MuZero(\"cartpole\")\n",
"muzero.train()\n",
"muzero.test()"
]
}
]
}
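The first notebook cell loads the TensorBoard extension but never starts it. To actually display the dashboards, one would add a cell like the following; the log directory is a placeholder, so point it at whatever path the repository's SummaryWriter writes to:

```python
# Hypothetical extra cell: start TensorBoard inside the notebook.
# Replace ./logs with the directory the training run actually writes to.
%tensorboard --logdir ./logs
```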
@@ -14,7 +14,7 @@ class SelfPlay:
Class which runs in a dedicated thread to play games and save them to the replay buffer.
"""

- def __init__(self, initial_weights, game, config, device):
+ def __init__(self, initial_weights, game, config):
self.config = config
self.game = game
@@ -26,7 +26,7 @@ class SelfPlay:
self.config.hidden_size,
)
self.model.set_weights(initial_weights)
- self.model.to(torch.device(device))
+ self.model.to(torch.device('cpu'))
self.model.eval()

def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False):
@@ -272,6 +272,7 @@ class GameHistory:
def store_search_statistics(self, root, action_space):
sum_visits = sum(child.visit_count for child in root.children.values())
+ # TODO: action could be of any type, not only integers
self.child_visits.append(
[
root.children[a].visit_count / sum_visits if a in root.children else 0
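The new TODO notes that `action_space` is currently assumed to be a list of integer indices. One type-agnostic alternative (purely illustrative, not part of this commit) would store the normalized visit counts keyed by the actions themselves:

```python
def store_search_statistics(self, root):
    # Hypothetical variant: keep the policy as {action: probability}, so
    # actions may be any hashable object rather than integer indices.
    sum_visits = sum(child.visit_count for child in root.children.values())
    self.child_visits.append(
        {
            action: child.visit_count / sum_visits
            for action, child in root.children.items()
        }
    )
```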
...
@@ -7,14 +7,14 @@ import torch
import models

- @ray.remote(num_gpus=1)
+ @ray.remote(num_gpus=1 if torch.cuda.is_available() else 0)
class Trainer:
"""
Class which runs in a dedicated thread to train a neural network and save it
in the shared storage.
"""

- def __init__(self, initial_weights, config, device):
+ def __init__(self, initial_weights, config):
self.config = config
self.training_step = 0
@@ -26,7 +26,7 @@ class Trainer:
self.config.hidden_size,
)
self.model.set_weights(initial_weights)
- self.model.to(torch.device(device))
+ self.model.to(torch.device(config.training_device))
self.model.train()
self.optimizer = torch.optim.SGD(
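These two changes are the heart of the fix: the Trainer actor only reserves a GPU from Ray when one actually exists, and the training device now comes from `config.training_device` instead of a constructor argument. The pattern in isolation looks like this (a minimal sketch with a placeholder actor, not the full Trainer class):

```python
import ray
import torch

ray.init()


# Reserve a GPU for the actor only when the machine actually has one;
# otherwise Ray could not schedule the actor on a CPU-only host.
@ray.remote(num_gpus=1 if torch.cuda.is_available() else 0)
class DeviceWorker:
    def __init__(self, device):
        # device is typically config.training_device, i.e.
        # "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

    def describe(self):
        return str(self.device)


worker = DeviceWorker.remote("cuda" if torch.cuda.is_available() else "cpu")
print(ray.get(worker.describe.remote()))  # "cuda" on a GPU machine, "cpu" otherwise
```

On a machine without CUDA the actor no longer requests GPU resources, which is presumably what blocked CPU-only runs before this commit.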
...