Fix cpu mode

c9059c39 · Werner Duvaud · a5724559 · c9059c39 · c9059c39 · c9059c39
Commit c9059c39 authored Jan 15, 2020 by Werner Duvaud
Hide whitespace changes
Inline Side-by-side

Showing with 22 additions and 74 deletions

README.md
+1 -6

games/cartpole.py
+3 -0

games/lunarlander.py
+3 -2

muzero.py
+9 -8

notebook.ipynb
+0 -53

self_play.py
+3 -2

trainer.py
+3 -3

No files found.
--- a/README.md
+++ b/README.md
@@ -11,12 +11,7 @@ It is designed to be easily adaptable for every games or reinforcement learning 
 MuZero is a model-based reinforcement learning algorithm, successor of AlphaZero. It learns to master games without knowing the rules. It only knows the available actions and then learns to play and master the game. It is at least as efficient as, and often more efficient than, similar algorithms like [AlphaZero](https://arxiv.org/abs/1712.01815), [SimPLe](https://arxiv.org/abs/1903.00374) and [World Models](https://arxiv.org/abs/1803.10122).
-It uses [PyTorch](https://github.com/pytorch/pytorch) and [Ray](https://github.com/ray-project/ray) for running the different components simultaneously. There is a complete GPU support.
+It uses [PyTorch](https://github.com/pytorch/pytorch) and [Ray](https://github.com/ray-project/ray) for running the different components simultaneously. GPU training is supported. See [How it works](https://github.com/werner-duvaud/muzero-general/wiki/How-MuZero-works)
-There are four components which are classes that run simultaneously in a dedicated thread.
-The `shared storage` holds the latest neural network weights, the `self-play` uses those weights to generate self-play games and store them in the `replay buffer`. Finally, those played games are used to `train` a network and store the weights in the shared storage. The circle is complete. See [How it works](https://github.com/werner-duvaud/muzero-general/wiki/How-MuZero-works)
-Those components are launched and managed from the MuZero class in `muzero.py` and the structure of the neural network is defined in `models.py`.
 All performance metrics are tracked and displayed in real time in TensorBoard.

--- a/games/cartpole.py
+++ b/games/cartpole.py
 import gym
 import numpy
+import torch
 class MuZeroConfig:
@@ -42,6 +43,7 @@ class MuZeroConfig:
        self.window_size = 1000  # Number of self-play games to keep in the replay buffer
        self.td_steps = 10  # Number of steps in the future to take into account for calculating the target value
        self.training_delay = 0 # Number of seconds to wait after each training to adjust the self play / training ratio to avoid over/underfitting
+        self.training_device = "cuda" if torch.cuda.is_available() else "cpu"  # Train on GPU if available
        self.weight_decay = 1e-4  # L2 weights regularization
        self.momentum = 0.9
@@ -55,6 +57,7 @@ class MuZeroConfig:
        ### Test
        self.test_episodes = 2  # Number of games played to evaluate the network
    def visit_softmax_temperature_fn(self, trained_steps):
        """
        Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses.

--- a/games/lunarlander.py
+++ b/games/lunarlander.py
@@ -36,20 +36,21 @@ class MuZeroConfig:
        ### Training
        self.results_path = "./pretrained"  # Path to store the model weights
-        self.training_steps = 10000  # Total number of training steps (ie weights update according to a batch)
+        self.training_steps = 20000  # Total number of training steps (ie weights update according to a batch)
        self.batch_size = 128  # Number of parts of games to train on at each training step
        self.num_unroll_steps = 5  # Number of game moves to keep for every batch element
        self.checkpoint_interval = 10  # Number of training steps before using the model for self-play
        self.window_size = 1000  # Number of self-play games to keep in the replay buffer
        self.td_steps = 10  # Number of steps in the future to take into account for calculating the target value
        self.training_delay = 0 # Number of seconds to wait after each training to adjust the self play / training ratio to avoid over/underfitting
+        self.training_device = "cuda" if torch.cuda.is_available() else "cpu"  # Train on GPU if available
        self.weight_decay = 1e-4  # L2 weights regularization
        self.momentum = 0.9
        # Exponential learning rate schedule
        self.lr_init = 0.01  # Initial learning rate
-        self.lr_decay_rate = 0.005
+        self.lr_decay_rate = 0.001
        self.lr_decay_steps = 10000

--- a/muzero.py
+++ b/muzero.py
@@ -44,6 +44,11 @@ class MuZero:
                )
            )
            raise err
+        try:
+            os.mkdir(os.path.join(self.config.results_path))
+        except FileExistsError:
+            pass
        # Fix random generator seed for reproducibility
        numpy.random.seed(self.config.seed)
@@ -65,10 +70,7 @@ class MuZero:
        # Initialize workers
        training_worker = trainer.Trainer.remote(
-            copy.deepcopy(self.muzero_weights),
+            copy.deepcopy(self.muzero_weights), self.config
-            self.config,
-            # Train on GPU if available
-            "cuda" if torch.cuda.is_available() else "cpu",
        )
        shared_storage_worker = shared_storage.SharedStorage.remote(
            copy.deepcopy(self.muzero_weights), self.game_name, self.config,
@@ -79,12 +81,11 @@ class MuZero:
                copy.deepcopy(self.muzero_weights),
                self.Game(self.config.seed + seed),
                self.config,
-                "cpu",
            )
            for seed in range(self.config.num_actors)
        ]
        test_worker = self_play.SelfPlay.remote(
-            copy.deepcopy(self.muzero_weights), self.Game(), self.config, "cpu",
+            copy.deepcopy(self.muzero_weights), self.Game(), self.config
        )
        # Launch workers
@@ -145,7 +146,7 @@ class MuZero:
        print("Testing...")
        ray.init()
        self_play_workers = self_play.SelfPlay.remote(
-            copy.deepcopy(self.muzero_weights), self.Game(), self.config, "cpu",
+            copy.deepcopy(self.muzero_weights), self.Game(), self.config
        )
        test_rewards = []
        with torch.no_grad():
@@ -169,5 +170,5 @@ if __name__ == "__main__":
    muzero = MuZero("cartpole")
    muzero.train()
-    # muzero.load_model()
+    #muzero.load_model()
    muzero.test()
--- a/notebook.ipynb
+++ b/notebook.ipynb
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "muzero.ipynb",
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    }
-  },
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# Google colab stuffs\n",
-        "!pip install -r requirements.txt\n",
-        "!pip uninstall -y pyarrow\n",
-        "%load_ext tensorboard\n",
-        "# If you have an import issue with ray in google colab, restart the environment (execution menu)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# You must have the repository imported along with your notebook. \n",
-        "# For google colab, click on \">\" buttton (left) and import files (muzero.py, self_play.py, ...).\n",
-        "\n",
-        "import muzero as mz"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "#Train on cartpole game\n",
-        "muzero = mz.MuZero(\"cartpole\")\n",
-        "muzero.train()\n",
-        "muzero.test()"
-      ]
-    }
-  ]
-}
--- a/self_play.py
+++ b/self_play.py
@@ -14,7 +14,7 @@ class SelfPlay:
    Class which runs in a dedicated thread to play games and save them to the replay buffer.
    """
-    def __init__(self, initial_weights, game, config, device):
+    def __init__(self, initial_weights, game, config):
        self.config = config
        self.game = game
@@ -26,7 +26,7 @@ class SelfPlay:
            self.config.hidden_size,
        )
        self.model.set_weights(initial_weights)
-        self.model.to(torch.device(device))
+        self.model.to(torch.device('cpu'))
        self.model.eval()
    def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False):
@@ -272,6 +272,7 @@ class GameHistory:
    def store_search_statistics(self, root, action_space):
        sum_visits = sum(child.visit_count for child in root.children.values())
+        # TODO: action could be of any type, not only integers
        self.child_visits.append(
            [
                root.children[a].visit_count / sum_visits if a in root.children else 0

--- a/trainer.py
+++ b/trainer.py
@@ -7,14 +7,14 @@ import torch
 import models
-@ray.remote(num_gpus=1)
+@ray.remote(num_gpus=1 if torch.cuda.is_available() else 0)
 class Trainer:
    """
    Class which runs in a dedicated thread to train a neural network and save it
    in the shared storage.
    """
-    def __init__(self, initial_weights, config, device):
+    def __init__(self, initial_weights, config):
        self.config = config
        self.training_step = 0
@@ -26,7 +26,7 @@ class Trainer:
            self.config.hidden_size,
        )
        self.model.set_weights(initial_weights)
-        self.model.to(torch.device(device))
+        self.model.to(torch.device(config.training_device))
        self.model.train()
        self.optimizer = torch.optim.SGD(