Improve performance

fd660a25 · Werner Duvaud · 3967fb57 · fd660a25 · fd660a25 · fd660a25
Commit fd660a25 authored Feb 03, 2020 by Werner Duvaud
15 changed files
--- a/README.md
+++ b/README.md
@@ -25,12 +25,7 @@ MuZero is a model based reinforcement learning algorithm, successor of AlphaZero
 * [ ] Play against MuZero mode with policy and value tracking 
 * [ ] Residual Network
 * [ ] Atari games
-
-## Games already implemented with pretrained network available
-
-* Cartpole
-* Lunar Lander
-* Connect4
+* [ ] Windows support ([workaround by ihexx](https://github.com/ihexx/muzero-general))

 ## Demo

@@ -42,6 +37,16 @@ Testing Lunar Lander :

 ![lunarlander training preview](https://github.com/werner-duvaud/muzero-general/blob/master/docs/lunarlander_training_preview.png)

+## Code structure
+
+![code structure](https://github.com/werner-duvaud/muzero-general/blob/master/docs/how-it-works-werner-duvaud.png)
+
+## Games already implemented with pretrained network available
+
+* Cartpole
+* Lunar Lander
+* Connect4
+
 ## Getting started
 ### Installation


--- a/games/cartpole.py
+++ b/games/cartpole.py
@@ -37,12 +37,12 @@ class MuZeroConfig:

        ### Training
        self.results_path = "./pretrained"  # Path to store the model weights
-        self.training_steps = 10000  # Total number of training steps (ie weights update according to a batch)
+        self.training_steps = 5000  # Total number of training steps (ie weights update according to a batch)
        self.batch_size = 128  # Number of parts of games to train on at each training step
        self.num_unroll_steps = 5  # Number of game moves to keep for every batch element
        self.checkpoint_interval = 10  # Number of training steps before using the model for sef-playing
        self.window_size = 1000  # Number of self-play games to keep in the replay buffer
-        self.td_steps = 10  # Number of steps in the futur to take into account for calculating the target value
+        self.td_steps = 30  # Number of steps in the future to take into account for calculating the target value
        self.training_delay = 0 # Number of seconds to wait after each training to adjust the self play / training ratio to avoid over/underfitting
        self.training_device = "cuda" if torch.cuda.is_available() else "cpu"  # Train on GPU if available

@@ -51,7 +51,7 @@ class MuZeroConfig:

        # Exponential learning rate schedule
        self.lr_init = 0.008  # Initial learning rate
-        self.lr_decay_rate = 0.01
+        self.lr_decay_rate = 1
        self.lr_decay_steps = 10000



--- a/games/connect4.py
+++ b/games/connect4.py
@@ -51,7 +51,7 @@ class MuZeroConfig:

        # Exponential learning rate schedule
        self.lr_init = 0.05  # Initial learning rate
-        self.lr_decay_rate = 0.01
+        self.lr_decay_rate = 1
        self.lr_decay_steps = 10000



--- a/games/lunarlander.py
+++ b/games/lunarlander.py
@@ -16,10 +16,10 @@ class MuZeroConfig:

        ### Self-Play
        self.num_actors = 10  # Number of simultaneous threads self-playing to feed the replay buffer
-        self.max_moves = 200  # Maximum number of moves if game is not finished before
+        self.max_moves = 1000  # Maximum number of moves if game is not finished before
        self.num_simulations = 50  # Number of futur moves self-simulated
        self.discount = 0.997  # Chronological discount of the reward
-        self.self_play_delay = 0 # Number of seconds to wait after each played game to adjust the self play / training ratio to avoid over/underfitting
+        self.self_play_delay = 0  # Number of seconds to wait after each played game to adjust the self play / training ratio to avoid over/underfitting

        # Root prior exploration noise
        self.root_dirichlet_alpha = 0.25
@@ -31,28 +31,28 @@ class MuZeroConfig:


        ### Network
-        self.encoding_size = 32
-        self.hidden_size = 64
+        self.encoding_size = 64
+        self.hidden_size = 128


        ### Training
        self.results_path = "./pretrained"  # Path to store the model weights
-        self.training_steps = 20000  # Total number of training steps (ie weights update according to a batch)
+        self.training_steps = 3000  # Total number of training steps (ie weights update according to a batch)
        self.batch_size = 128  # Number of parts of games to train on at each training step
-        self.num_unroll_steps = 5  # Number of game moves to keep for every batch element
-        self.checkpoint_interval = 10  # Number of training steps before using the model for sef-playing
+        self.num_unroll_steps = 10  # Number of game moves to keep for every batch element
+        self.checkpoint_interval = 3  # Number of training steps before using the model for self-playing
        self.window_size = 1000  # Number of self-play games to keep in the replay buffer
        self.td_steps = 10  # Number of steps in the futur to take into account for calculating the target value
-        self.training_delay = 0 # Number of seconds to wait after each training to adjust the self play / training ratio to avoid over/underfitting
+        self.training_delay = 0  # Number of seconds to wait after each training to adjust the self play / training ratio to avoid over/underfitting
        self.training_device = "cuda" if torch.cuda.is_available() else "cpu"  # Train on GPU if available

        self.weight_decay = 1e-4  # L2 weights regularization
        self.momentum = 0.9

        # Exponential learning rate schedule
-        self.lr_init = 0.01  # Initial learning rate
-        self.lr_decay_rate = 0.001
-        self.lr_decay_steps = 10000
+        self.lr_init = 0.00005  # Initial learning rate
+        self.lr_decay_rate = 1
+        self.lr_decay_steps = 100000


        ### Test
@@ -67,12 +67,16 @@ class MuZeroConfig:
        Returns:
            Positive float.
        """
-        if trained_steps < 0.5 * self.training_steps:
-            return 1.0
-        elif trained_steps < 0.75 * self.training_steps:
-            return 0.5
-        else:
-            return 0.25
+        # if trained_steps < 0.2 * self.training_steps:
+        #     return float('inf')
+        # if trained_steps < 0.5 * self.training_steps:
+        #     return 0.8
+        # elif trained_steps < 0.75 * self.training_steps:
+        #     return 0.5
+        # else:
+        #     return 0.25
+
+        return 1


 class Game:

--- a/models.py
+++ b/models.py
@@ -45,7 +45,7 @@ class MuZeroNetwork(torch.nn.Module):
            lambda module, grad_i, grad_o: (grad_i[0] * 0.5,)
        )
        self.dynamics_reward_network = FullyConnectedNetwork(
-            encoding_size + self.action_space_size, [hidden_size], 1
+            encoding_size + self.action_space_size, [hidden_size], 1, activation=None
        )

        self.prediction_policy_network = FullyConnectedNetwork(

--- a/muzero.py
+++ b/muzero.py
@@ -134,9 +134,9 @@ class MuZero:
                )
                counter += 1
                time.sleep(3)
-        except KeyboardInterrupt:
+        except KeyboardInterrupt as err:
            # Comment the line below to be able to stop the training but keep running
-            raise KeyboardInterrupt
+            raise err
            pass
        self.muzero_weights = ray.get(shared_storage_worker.get_weights.remote())
        ray.shutdown()

--- a/pretrained/cartpole
+++ b/pretrained/cartpole
--- a/pretrained/cartpole_summary/events.out.tfevents.1578748427.Ordinateur.11070.0
+++ b/pretrained/cartpole_summary/events.out.tfevents.1578748427.Ordinateur.11070.0
--- a/pretrained/cartpole_summary/events.out.tfevents.1580685785.Ordinateur.1446.0
+++ b/pretrained/cartpole_summary/events.out.tfevents.1580685785.Ordinateur.1446.0
--- a/pretrained/connect4
+++ b/pretrained/connect4
--- a/pretrained/connect4_summary/events.out.tfevents.1580170903.Ordinateur.29836.0
+++ b/pretrained/connect4_summary/events.out.tfevents.1580170903.Ordinateur.29836.0
--- a/pretrained/lunarlander
+++ b/pretrained/lunarlander
--- a/pretrained/lunarlander_summary/events.out.tfevents.1578754627.Ordinateur.16211.0
+++ b/pretrained/lunarlander_summary/events.out.tfevents.1578754627.Ordinateur.16211.0
--- a/pretrained/lunarlander_summary/events.out.tfevents.1580660125.Ordinateur.5556.0
+++ b/pretrained/lunarlander_summary/events.out.tfevents.1580660125.Ordinateur.5556.0
--- a/self_play.py
+++ b/self_play.py
@@ -73,7 +73,7 @@ class SelfPlay:
                    self.model,
                    observation,
                    self.game.to_play(),
-                    True if temperature else False,
+                    False if temperature == 0 else True,
                )

                action = select_action(root, temperature)
@@ -103,6 +103,8 @@ def select_action(node, temperature):
    ).T
    if temperature == 0:
        action_pos = numpy.argmax(visit_counts[0])
+    elif temperature == float("inf"):
+        action_pos = numpy.random.choice(len(visit_counts[1]))
    else:
        # See paper appendix Data Generation
        visit_count_distribution = visit_counts[0] ** (1 / temperature)
@@ -113,9 +115,6 @@ def select_action(node, temperature):
            len(visit_counts[1]), p=visit_count_distribution
        )

-    if temperature == float("inf"):
-        action_pos = numpy.random.choice(len(visit_counts[1]))
-
    return visit_counts[1][action_pos]