
Commit e4fbe6e
Merge pull request #97 from cpnota/release/0.3.0
Release/0.3.0
2 parents c9c85ef + 28f1e81


74 files changed (+924, -576 lines)

Makefile

Lines changed: 3 additions & 2 deletions

@@ -1,6 +1,7 @@
 install:
-	pip install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp37-cp37m-linux_x86_64.whl
-	pip install torchvision tensorflow
+	pip install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl
+	pip install https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp37-cp37m-linux_x86_64.whl
+	pip install tensorflow
 	pip install -e .

 lint:

README.md

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ We provide out-of-the-box modules for:
 - [x] Generalized Advantage Estimation (GAE)
 - [x] Target networks
 - [x] Polyak averaging
+- [x] Easy parameter and learning rate scheduling
 - [x] An enhanced `nn` module (includes dueling layers, noisy layers, action bounds, and the coveted `nn.Flatten`)
 - [x] `gym` to `pytorch` wrappers
 - [x] Atari wrappers

all/agents/_agent.py

Lines changed: 2 additions & 1 deletion

@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
+from all.optim import Schedulable

-class Agent(ABC):
+class Agent(ABC, Schedulable):
     """
     A reinforcement learning agent.
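The new Schedulable mixin is what backs the "easy parameter and learning rate scheduling" item added to the README above. A minimal sketch of how such a mixin can work, assuming a scheduler object that advances each time its attribute is read; the names and mechanism here are illustrative guesses, not the actual all.optim API:

```python
# Illustrative sketch only: LinearScheduler and the attribute hook below are
# assumptions, not the real all.optim implementation.

class LinearScheduler:
    """Anneals a scalar hyperparameter linearly over a fixed number of reads."""
    def __init__(self, start, end, steps):
        self._start, self._end, self._steps = start, end, steps
        self._i = 0

    def value(self):
        t = min(self._i / self._steps, 1.0)
        self._i += 1
        return self._start + t * (self._end - self._start)

class Schedulable:
    """Mixin: reading an attribute that holds a scheduler yields its current value."""
    def __getattribute__(self, name):
        value = object.__getattribute__(self, name)
        if isinstance(value, LinearScheduler):
            return value.value()
        return value

class Agent(Schedulable):
    def __init__(self):
        self.epsilon = LinearScheduler(1.0, 0.1, steps=3)

agent = Agent()
print([agent.epsilon for _ in range(5)])  # ~[1.0, 0.7, 0.4, 0.1, 0.1]
```

With this in place, any hyperparameter an agent reads as a plain attribute (epsilon, learning rates, and so on) can be annealed without changing the agent's code.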

all/agents/a2c.py

Lines changed: 18 additions & 12 deletions

@@ -1,5 +1,3 @@
-import torch
-from all.environments import State
 from all.memory import NStepAdvantageBuffer
 from ._agent import Agent

@@ -22,26 +20,34 @@ def __init__(
         self.n_envs = n_envs
         self.n_steps = n_steps
         self.discount_factor = discount_factor
+        self._states = None
+        self._actions = None
         self._batch_size = n_envs * n_steps
         self._buffer = self._make_buffer()
         self._features = []

     def act(self, states, rewards):
-        self._buffer.store(states, torch.zeros(self.n_envs), rewards)
-        self._train()
-        features = self.features(states)
-        self._features.append(features)
-        return self.policy(features)
+        self._store_transitions(rewards)
+        self._train(states)
+        self._states = states
+        self._actions = self.policy.eval(self.features.eval(states))
+        return self._actions

-    def _train(self):
+    def _store_transitions(self, rewards):
+        if self._states:
+            self._buffer.store(self._states, self._actions, rewards)
+
+    def _train(self, states):
         if len(self._buffer) >= self._batch_size:
-            states = State.from_list(self._features)
-            _, _, advantages = self._buffer.sample(self._batch_size)
-            self.v(states)
+            states, actions, advantages = self._buffer.advantages(states)
+            # forward pass
+            features = self.features(states)
+            self.v(features)
+            self.policy(features, actions)
+            # backward pass
             self.v.reinforce(advantages)
             self.policy.reinforce(advantages)
             self.features.reinforce()
-            self._features = []

     def _make_buffer(self):
         return NStepAdvantageBuffer(
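The rewritten act defers storing each (state, action) pair until the next reward arrives, so actions and rewards line up correctly in the buffer, and _train now asks the buffer directly for advantages. For intuition, here is a rough single-environment sketch of the n-step advantage computation a buffer like NStepAdvantageBuffer performs; the shapes and bootstrap handling are simplified assumptions, not the library's code:

```python
import torch

def n_step_advantages(rewards, values, next_value, discount_factor=0.99):
    """A_t = sum_k gamma^k * r_{t+k} + gamma^(n-t) * V(s_n) - V(s_t)."""
    n_steps = rewards.shape[0]
    returns = torch.zeros(n_steps)
    bootstrap = next_value                     # V(s_n) from the critic
    for t in reversed(range(n_steps)):
        bootstrap = rewards[t] + discount_factor * bootstrap
        returns[t] = bootstrap
    return returns - values

advantages = n_step_advantages(
    rewards=torch.tensor([1., 0., 1.]),
    values=torch.tensor([0.5, 0.5, 0.5]),
    next_value=0.4,
)
print(advantages)  # tensor([1.8682, 0.8820, 0.8960])
```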

all/agents/ddpg.py

Lines changed: 1 addition & 1 deletion

@@ -46,7 +46,7 @@ def _train(self):
             # train q function
             td_errors = (
                 rewards +
-                self.discount_factor * self.q.eval(next_states, self.policy.eval(next_states)) -
+                self.discount_factor * self.q.target(next_states, self.policy.target(next_states)) -
                 self.q(states, torch.cat(actions))
             )
             self.q.reinforce(weights * td_errors)
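Switching from eval to target routes the bootstrapped term through the Polyak-averaged target networks instead of the live ones, which keeps the regression target from chasing itself. A minimal sketch of the Polyak update itself, written against plain torch.nn modules rather than the library's Approximation wrappers:

```python
import torch

def polyak_update(target_net, online_net, rho=0.995):
    """Soft update: theta_target <- rho * theta_target + (1 - rho) * theta_online."""
    with torch.no_grad():
        for p_target, p_online in zip(target_net.parameters(), online_net.parameters()):
            p_target.mul_(rho).add_(p_online, alpha=1 - rho)
```

Called after every optimization step, this keeps the target network a slowly trailing copy of the online one.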

all/agents/dqn.py

Lines changed: 1 addition & 1 deletion

@@ -45,7 +45,7 @@ def _train(self):
                 self.minibatch_size)
             td_errors = (
                 rewards +
-                self.discount_factor * torch.max(self.q.eval(next_states), dim=1)[0] -
+                self.discount_factor * torch.max(self.q.target(next_states), dim=1)[0] -
                 self.q(states, actions)
             )
             self.q.reinforce(weights * td_errors)
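The same eval-to-target switch as in DDPG. As a worked example of the TD error being computed here, r + gamma * max_a Q_target(s', a) - Q(s, a), with hand-picked numbers (shapes assumed; not the repo's code):

```python
import torch

rewards = torch.tensor([1.0, 0.0])          # batch of two transitions
q_next = torch.tensor([[0.2, 0.8],          # Q_target(s', .) per action
                       [0.5, 0.1]])
q_taken = torch.tensor([0.6, 0.3])          # Q(s, a) for the actions taken
discount_factor = 0.99

td_errors = rewards + discount_factor * torch.max(q_next, dim=1)[0] - q_taken
print(td_errors)  # tensor([1.1920, 0.1950])
```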

all/agents/evaluation/greedy_agent.py

Whitespace-only changes.

all/agents/ppo.py

Lines changed: 15 additions & 14 deletions

@@ -26,6 +26,8 @@ def __init__(
         self.n_steps = n_steps
         self.discount_factor = discount_factor
         self.lam = lam
+        self._states = None
+        self._actions = None
         self._epsilon = epsilon
         self._epochs = epochs
         self._batch_size = n_envs * n_steps
@@ -34,14 +36,19 @@ def __init__(
         self._features = []

     def act(self, states, rewards):
-        self._train()
-        actions = self.policy.eval(self.features.eval(states))
-        self._buffer.store(states, actions, rewards)
-        return actions
+        self._store_transitions(rewards)
+        self._train(states)
+        self._states = states
+        self._actions = self.policy.eval(self.features.eval(states))
+        return self._actions

-    def _train(self):
+    def _store_transitions(self, rewards):
+        if self._states:
+            self._buffer.store(self._states, self._actions, rewards)
+
+    def _train(self, _states):
         if len(self._buffer) >= self._batch_size:
-            states, actions, advantages = self._buffer.sample(self._batch_size)
+            states, actions, advantages = self._buffer.advantages(_states)
             with torch.no_grad():
                 features = self.features.eval(states)
                 pi_0 = self.policy.eval(features, actions)
@@ -65,18 +72,12 @@ def _train_minibatch(self, states, actions, pi_0, advantages, targets):
         self.v.reinforce(targets - self.v(features))
         self.features.reinforce()

-    def _compute_targets(self, returns, next_states, lengths):
-        return (
-            returns +
-            (self.discount_factor ** lengths)
-            * self.v.eval(self.features.eval(next_states))
-        )
-
     def _compute_policy_loss(self, pi_0, advantages):
         def _policy_loss(pi_i):
             ratios = torch.exp(pi_i - pi_0)
             surr1 = ratios * advantages
-            surr2 = torch.clamp(ratios, 1.0 - self._epsilon, 1.0 + self._epsilon) * advantages
+            epsilon = self._epsilon
+            surr2 = torch.clamp(ratios, 1.0 - epsilon, 1.0 + epsilon) * advantages
             return -torch.min(surr1, surr2).mean()
         return _policy_loss
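The clipping in _compute_policy_loss is the heart of PPO: the probability ratio is clamped to [1 - epsilon, 1 + epsilon], so no single update can move the policy too far from the one that collected the data. A self-contained version of that standard clipped surrogate loss, for reference:

```python
import torch

def ppo_clip_loss(log_pi, log_pi_0, advantages, epsilon=0.2):
    """Standard PPO clipped surrogate objective (returned as a loss to minimize)."""
    ratios = torch.exp(log_pi - log_pi_0)
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1.0 - epsilon, 1.0 + epsilon) * advantages
    return -torch.min(surr1, surr2).mean()

# A ratio of ~2.0 is clipped to 1.2 when the advantage is positive:
loss = ppo_clip_loss(
    log_pi=torch.tensor([0.0, 0.69]),    # new log-probabilities
    log_pi_0=torch.tensor([0.0, 0.0]),   # log-probabilities at collection time
    advantages=torch.tensor([1.0, 1.0]),
)
print(loss)  # ~ -1.1 = -(1.0 + 1.2) / 2
```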

all/agents/sac.py

Lines changed: 20 additions & 11 deletions

@@ -1,5 +1,5 @@
 import torch
-from all.experiments import DummyWriter
+from all.logging import DummyWriter
 from ._agent import Agent

 class SAC(Agent):
@@ -9,7 +9,9 @@ def __init__(self,
                  q_2,
                  v,
                  replay_buffer,
-                 entropy_regularizer=0.01,
+                 entropy_target=-2.,  # usually -action_space.size[0]
+                 temperature_initial=0.1,
+                 lr_temperature=1e-4,
                  discount_factor=0.99,
                  minibatch_size=32,
                  replay_start_size=5000,
@@ -28,7 +30,10 @@ def __init__(self,
         self.update_frequency = update_frequency
         self.minibatch_size = minibatch_size
         self.discount_factor = discount_factor
-        self.entropy_regularizer = entropy_regularizer
+        # vars for learning the temperature
+        self.entropy_target = entropy_target
+        self.temperature = temperature_initial
+        self.lr_temperature = lr_temperature
         # data
         self.env = None
         self.state = None
@@ -39,8 +44,7 @@ def act(self, state, reward):
         self._store_transition(state, reward)
         self._train()
         self.state = state
-        with torch.no_grad():
-            self.action = self.policy(state)
+        self.action = self.policy.eval(state)
         return self.action

     def _store_transition(self, state, reward):
@@ -58,14 +62,17 @@ def _train(self):
         # compute targets for Q and V
         with torch.no_grad():
             _actions, _log_probs = self.policy(states, log_prob=True)
-            q_targets = rewards + self.discount_factor * self.v.eval(next_states)
+            q_targets = rewards + self.discount_factor * self.v.target(next_states)
             v_targets = torch.min(
-                self.q_1.eval(states, _actions),
-                self.q_2.eval(states, _actions),
-            ) - self.entropy_regularizer * _log_probs
+                self.q_1.target(states, _actions),
+                self.q_2.target(states, _actions),
+            ) - self.temperature * _log_probs
+            temperature_loss = ((_log_probs + self.entropy_target).detach().mean())
             self.writer.add_loss('entropy', -_log_probs.mean())
             self.writer.add_loss('v_mean', v_targets.mean())
             self.writer.add_loss('r_mean', rewards.mean())
+            self.writer.add_loss('temperature_loss', temperature_loss)
+            self.writer.add_loss('temperature', self.temperature)

         # update Q-functions
         q_1_errors = q_targets - self.q_1(states, actions)
@@ -79,15 +86,17 @@ def _train(self):

         # train policy
         _actions, _log_probs = self.policy(states, log_prob=True)
-
         loss = -(
             self.q_1(states, _actions, detach=False)
-            - self.entropy_regularizer * _log_probs
+            - self.temperature * _log_probs
         ).mean()
         loss.backward()
         self.policy.step()
         self.q_1.zero_grad()

+        # adjust temperature
+        self.temperature += self.lr_temperature * temperature_loss
+
     def _should_train(self):
         return (self.frames_seen > self.replay_start_size and
                 self.frames_seen % self.update_frequency == 0)
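The fixed entropy_regularizer is replaced by a learned temperature, resembling SAC's automatic temperature adjustment: when the policy's entropy falls below entropy_target, temperature_loss turns positive and the temperature rises, encouraging exploration; when entropy is above target, it falls. A scalar sketch of that update with made-up numbers:

```python
import torch

entropy_target = -2.0      # usually -action_space.shape[0]
temperature = 0.1
lr_temperature = 1e-4

log_probs = torch.tensor([-1.2, -0.8, -1.5])   # log pi(a|s) over a batch

# Positive when entropy (-mean log_prob) is below the target, negative otherwise.
temperature_loss = (log_probs + entropy_target).mean().item()
temperature += lr_temperature * temperature_loss
print(temperature)  # ~0.09968: entropy is above target here, so temperature drops
```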

all/agents/vac.py

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ def act(self, state, reward):
         if self._previous_features:
             td_error = (
                 reward
-                + self.gamma * self.v.eval(self.features.eval(state))
+                + self.gamma * self.v.target(self.features.eval(state))
                 - self.v(self._previous_features)
             )
             self.v.reinforce(td_error)
