journey/agents.py at main · Fittiboy/journey · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
import numpy as np
import pickle

from math import inf
from dataclasses import dataclass, field, asdict
from abc import ABC, abstractmethod
from collections.abc import Callable
from itertools import count, product
from pathlib import Path
from datetime import datetime

from tqdm.auto import tqdm

################################
# Abstract Agent components
################################

class ActionSelector(ABC):
    """Agent component that selects an action for the current state s."""

    @abstractmethod
    def __call__(self, agent: 'Agent', s: int, t: int) -> int:
        """
        Return an action to take, conditional on the current
        observation.
        """
        pass

    @abstractmethod
    def action_probs(self, agent: 'Agent', s:int, t: int) -> np.ndarray:
        """
        Return an array corresponding to the probabilities of each
        action being chosen.
        """
        pass


class UpdateMethod(ABC):
    """
    Component to update the agent's internal state.

    Examples include learning value estimates, updating the
    value estimator's parameters, or planning with a model.
    """

    @abstractmethod
    def __call__(
        self,
        agent: 'Agent',
        s: int, a: int, r: float, s_: int, a_: int,
        t: int, T: int,
        ep: int, num_eps: int,
    ):
        """Update some of the agent's internal state."""
        pass

################################
# Action selectors
################################

@dataclass
class Greedy(ActionSelector):
    """The greedy action selector."""

    def __call__(self, agent: 'Agent', s: int, t: int) -> int:
        return np.argmax(agent.Q[s])

    def action_probs(self, agent: 'Agent', s:int, t: int) -> np.ndarray:
        probs = np.zeros(len(agent.Q[s]))

        best_action = np.argmax(agent.Q[s])
        probs[best_action] = 1.0

        return probs


@dataclass
class BaseEpsilonGreedy(ActionSelector):
    """Base class for epsilon-greedy action selectors."""
    epsilon: float
    rng: np.random.Generator = field(default_factory = lambda: np.random.default_rng())

    def __post_init__(self):
        if not 0 <= self.epsilon <= 1:
            raise ValueError(f"Epsilon must be in [0, 1], got {self.epsilon}")

    def __call__(self, agent: 'Agent', s: int, t: int) -> int:
        if self.rng.random() < self.epsilon:
            return self.rng.integers(0, agent.num_actions)
        return self._get_greedy_action(agent, s, t)

    def action_probs(self, agent: 'Agent', s:int, t: int) -> np.ndarray:
        num_actions = len(agent.Q[s])
        probs = np.ones(num_actions) * self.epsilon / num_actions
        best_action = self._get_greedy_action(agent, s, t)
        probs[best_action] += 1 - self.epsilon
        return probs

    @abstractmethod
    def _get_greedy_action(self, agent: 'Agent', s: int, t: int) -> int:
        """Return the greedy action for the current state."""
        pass


@dataclass
class EpsilonGreedy(BaseEpsilonGreedy):
    """The epsilon-greedy action selector."""
    def _get_greedy_action(self, agent: 'Agent', s: int, t: int) -> int:
        return np.argmax(agent.Q[s])


@dataclass
class EpsilonGreedyExpBonus(BaseEpsilonGreedy):
    """Epsilon-greedy with exploration bonus based on time since last use."""
    kappa: float = field(default=0.01)
    last_used_a: np.ndarray = field(init=False, default=None)

    def __post_init__(self):
        super().__post_init__()
        if self.kappa < 0:
            raise ValueError(f"Exploration bonus coefficient (kappa) must be non-negative, got {self.kappa}")

    def _init_last_used(self, agent: 'Agent', t: int):
        """Initialize or reset the last_used array if needed"""
        if self.last_used_a is None or t == 0:
            self.last_used_a = np.zeros_like(agent.Q)

    def _get_Q_with_bonus(self, agent: 'Agent', s: int, t: int) -> np.ndarray:
        """Get Q-values with exploration bonus for the given state"""
        self._init_last_used(agent, t)
        Q_s = agent.Q[s].copy()
        tau_s = t - self.last_used_a[s]
        bonus = self.kappa * np.sqrt(tau_s)
        return Q_s + bonus

    def _get_greedy_action(self, agent: 'Agent', s: int, t: int) -> int:
        Q_s = self._get_Q_with_bonus(agent, s, t)
        a = np.argmax(Q_s)
        self.last_used_a[s, a] = t
        return a

    def action_probs(self, agent: 'Agent', s:int, t: int) -> np.ndarray:
        Q_s = self._get_Q_with_bonus(agent, s, t)
        num_actions = len(Q_s)
        probs = np.ones(num_actions) * self.epsilon / num_actions
        best_action = np.argmax(Q_s)
        probs[best_action] += 1 - self.epsilon
        return probs


@dataclass
class BaseOneStep(UpdateMethod):
    """Base class for one-step update methods"""
    alpha: float
    gamma: float = field(default=1.0)

    def __post_init__(self):
        if not 0 < self.alpha <= 1:
            raise ValueError(f"Learning rate (alpha) must be in (0, 1], got {self.alpha}")
        if not 0 <= self.gamma <= 1:
            raise ValueError(f"Discount factor (gamma) must be in [0, 1], got {self.gamma}")

    def __call__(
        self,
        agent: 'Agent',
        s: int, a: int, r: float, s_: int, a_: int,
        t: int, T: int,
        ep: int, num_eps: int,
    ):
        Q = agent.Q
        if T > t + 1:
            next_value = self._get_next_value(agent, s_, a_, t)
            target = r + self.gamma * next_value - Q[s, a]
        else:
            target = r - Q[s, a]
        agent.Q[s, a] += self.alpha * target

    @abstractmethod
    def _get_next_value(self, agent: 'Agent', s_: int, a_: int, t: int) -> float:
        """Calculate the value estimate for the next state."""
        pass


@dataclass
class QLearning(BaseOneStep):
    """Default Q-learning update method"""
    def _get_next_value(self, agent: 'Agent', s_: int, a_: int, t: int) -> float:
        return np.max(agent.Q[s_])


@dataclass
class Sarsa(BaseOneStep):
    """Default Sarsa update method"""
    def _get_next_value(self, agent: 'Agent', s_: int, a_: int, t: int) -> float:
        return agent.Q[s_, a_]


@dataclass
class ExpectedSarsa(BaseOneStep):
    """Default Expected Sarsa update method"""
    def _get_next_value(self, agent: 'Agent', s_: int, a_: int, t: int) -> float:
        action_probs = agent.selector.action_probs(agent, s_, t)
        return np.dot(action_probs, agent.Q[s_])

################################
# Value update methods
################################

@dataclass
class BaseNStep(UpdateMethod):
    """Base class for n-step methods"""
    n: int
    alpha: float
    gamma: float = field(default=1.0)
    states: list[int] = field(init=False)
    actions: list[int] = field(init=False)
    rewards: list[int] = field(init=False)

    def __post_init__(self):
        """Set up the n-step buffers"""
        if not isinstance(self.n, int) or self.n < 1:
            raise ValueError(f"n must be a positive integer, got {self.n}")
        if not 0 < self.alpha <= 1:
            raise ValueError(f"Learning rate (alpha) must be in (0, 1], got {self.alpha}")
        if not 0 <= self.gamma <= 1:
            raise ValueError(f"Discount factor (gamma) must be in [0, 1], got {self.gamma}")

        self.states = [None] * (self.n + 1)
        self.actions = [None] * (self.n + 1)
        self.rewards = [None] * (self.n + 1)

    def _update_buffers(self, s: int, a: int, r: float, s_: int, a_: int, t: int):
        """Store newly encountered step in buffers"""
        self.states[t % (self.n+1)] = s
        self.actions[t % (self.n+1)] = a
        self.rewards[(t+1) % (self.n+1)] = r
        self.states[(t+1) % (self.n+1)] = s_
        self.actions[(t+1) % (self.n+1)] = a_

    @abstractmethod
    def _calculate_target(
        self,
        agent: 'Agent',
        tau: int, t: int, T: int,
        s_: int, a_: int,
    ) -> float:
        """Calculate the update target for the specific n-step method."""
        pass

    def __call__(
        self,
        agent: 'Agent',
        s: int, a: int, r: float, s_: int, a_: int,
        t: int, T: int,
        ep: int, num_eps: int,
    ):
        Q = agent.Q
        n = self.n

        # Store newly encountered step in buffers
        self._update_buffers(s, a, r, s_, a_, t)

        # The first timestep for which to update Q
        first_tau = t + 1 - n
        if first_tau >= 0:
            # If the episode has terminated, update for all remaining
            # time steps.
            tau_range_limit = T if T == t + 1 else first_tau + 1
            for tau in range(first_tau, tau_range_limit):
                index = tau % (n+1)
                update_state = self.states[index]
                update_action = self.actions[index]

                target = self._calculate_target(agent, tau, t, T, s_, a_)
                agent.Q[update_state, update_action] += self.alpha * target


@dataclass
class NStepSarsa(BaseNStep):
    """n-step Sarsa update method"""
    def _calculate_target(
        self,
        agent: 'Agent',
        tau: int, t: int, T: int,
        s_: int, a_: int,
    ) -> float:
        Q = agent.Q
        n = self.n

        # A running dicount factor
        discount = 1
        # Update target, into which the return is accumulated
        target = 0

        # The return is truncated if the episode has terminated
        k_range_limit = min(T, tau + n) + 1
        for k in range(tau + 1, k_range_limit):
            target += discount * self.rewards[k % (n+1)]
            # Update running discount factor
            discount *= self.gamma

        # If the episode has not yet terminated, add the final term
        # to the udpate target
        index = tau % (n+1)
        update_state = self.states[index]
        update_action = self.actions[index]
        if T > t + 1:
            target += discount * Q[s_, a_] - Q[update_state, update_action]
        else:
            target += -Q[update_state, update_action]

        return target


@dataclass
class NStepExpectedSarsa(BaseNStep):
    """n-step Expected Sarsa update method"""
    def _calculate_target(
        self,
        agent: 'Agent',
        tau: int, t: int, T: int,
        s_: int, a_: int,
    ) -> float:
        Q = agent.Q
        n = self.n

        # A running dicount factor
        discount = 1
        # Update target, into which the return is accumulated
        target = 0

        # The return is truncated if the episode has terminated
        k_range_limit = min(T, tau + n) + 1
        for k in range(tau + 1, k_range_limit):
            target += discount * self.rewards[k % (n+1)]
            # Update running discount factor
            discount *= self.gamma

        # If the episode has not yet terminated, add the final term
        # to the udpate target
        index = tau % (n+1)
        update_state = self.states[index]
        update_action = self.actions[index]
        if T > t + 1:
            action_probs = agent.selector.action_probs(agent, s_, t)
            expected_value = np.dot(action_probs, Q[s_])
            target += discount * expected_value - Q[update_state, update_action]
        else:
            target += -Q[update_state, update_action]

        return target


@dataclass
class NStepTreeBackup(BaseNStep):
    """n-step Tree Backup update method"""
    def _calculate_target(
        self,
        agent: 'Agent',
        tau: int, t: int, T: int,
        s_: int, a_: int,
    ) -> float:
        Q = agent.Q
        selector = agent.selector
        n = self.n

        # A running dicount factor
        discount = 1
        # Update target, into which the return is accumulated
        target = 0

        # The return is truncated if the episode has terminated
        k_range_limit = min(T, tau + n) + 1
        for k in range(tau + 1, k_range_limit):
            index = k % (n+1)
            reward = self.rewards[index]
            next_state = self.states[index]
            next_action = self.actions[index]

            target += discount * reward

            # The final term includes only the terminal reward
            if k == T:
                break

            # Get the probabilities for each action, and extract
            # the probability for the action that was actually
            # taken
            probs = selector.action_probs(agent, next_state, t)
            pi = probs[next_action]

            # Calculate expectation, excluding the action that was
            # actually taken (except for the final state, where the
            # full expectation is used)
            if k < k_range_limit - 1:
                probs[next_action] = 0
            expected_value = np.dot(probs, Q[next_state])
            target += discount * self.gamma * expected_value

            # Update running discount factor
            discount *= self.gamma * pi

        # If the episode has not yet terminated, add the final term
        # to the udpate target
        index = tau % (n+1)
        update_state = self.states[index]
        update_action = self.actions[index]
        target += -Q[update_state, update_action]

        return target

################################
# Parameter schedules
################################

@dataclass
class Schedule(UpdateMethod):
    """Schedule for updating an agent's parameter"""
    param_path: list[str]
    initial: float = field(default=None)
    final: float = field(default=None)

    def __post_init__(self):
        assert len(self.param_path) > 0
        x = np.linspace(0, 1)
        assert (
            np.all(0 <= self.weight(x))
            and np.all(self.weight(x) <= 1.0)
        )

    def weight(self, progress):
        raise NotImplementedError

    def __call__(
        self,
        agent: 'Agent',
        s: int, a: int, r: float, s_: int, a_: int,
        t: int, T: int,
        ep: int, num_eps: int,
    ):
        """Update the agent's parameter, according to the weight function"""
        progress = min(ep / num_eps, 1)
        weight = self.weight(progress)
        value = (1 - weight) * self.initial + weight * self.final

        # Get the parameter from the agent.
        # The parameter may be nested, for example:
        # ["selector", "epsilon"], which corresponds to
        # agent.selector.epsilon
        try:
            current = agent
            for part in self.param_path[:-1]:
                current = getattr(current, part)
            setattr(current, self.param_path[-1], value)
        except AttributeError:
            raise ValueError(f"Invalid parameter path: {self.param_path}")


@dataclass
class LinearSchedule(Schedule):
    """A linear parameter schedule"""
    def weight(self, progress):
        return progress


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


@dataclass
class SigmoidSchedule(Schedule):
    """A sigmoid parameter schedule"""
    scale: int = 6

    def __post_init__(self):
        self.zero_shift = sigmoid(0)
        self.scale_factor = 1 / (sigmoid(SigmoidSchedule.scale) + self.zero_shift)

    def weight(self, progress):
        return (sigmoid(SigmoidSchedule.scale * progress) + self.zero_shift) * self.scale_factor


@dataclass
class UpDownSchedule(Schedule):
    """
    A parameter schedule that reaches final at 0.5, then going back
    down to initial towards the end.
    """
    def weight(self, progress: float) -> float:
        return (- (progress - 0.5) ** 2) * 4 + 1

################################
# Planners
################################

@dataclass
class NoPlanner(UpdateMethod):
    """Does nothing"""
    def __call__(
        self,
        agent: 'Agent',
        s: int, a: int, r: float, s_: int, a_: int,
        t: int, T: int,
        ep: int, num_eps: int,
    ):
        """Update some of the agent's internal state."""
        pass


@dataclass
class Dyna(UpdateMethod):
    """Default Dyna planner"""
    plan_steps: int
    selector: ActionSelector
    learner: UpdateMethod
    plus: bool = field(default=False)
    kappa: float = field(default=0.01)
    model: dict = field(default_factory=dict)
    last_used_a: np.ndarray = field(init=False, default=None)
    rng: np.random.Generator = field(default_factory=np.random.default_rng)

    def __call__(
        self,
        agent: 'Agent',
        s: int, a: int, r: float, s_: int, a_: int,
        t: int, T: int,
        ep: int, num_eps: int,
    ):
        if self.plus and self.last_used_a is None or self.plus and t == 0:
            self.last_used_a = np.zeros_like(agent.Q)
        done = t + 1 == T
        # Update the model naively by storing transition
        self.model[(s, a)] = (r, s_, done)

        # Record the timestep at which this action was last
        # performed
        if self.plus:
            self.last_used_a[s, a] = t

        # Perform actual planning
        for _ in range(self.plan_steps):
            # Sample random S, A pair from the model
            s, a = self.rng.choice(list(self.model))
            # Retreive reward last experienced from S, A
            r, s_, done = self.model[(s, a)]
            # If running Dyna+, add the exploration bonus
            if self.plus:
                # How long it has been since the action was last
                # chosen
                tau = t - self.last_used_a[s, a]
                # The Dyna+ exploration bonus
                if tau < 0:
                    print(tau)
                bonus = self.kappa * np.sqrt(tau)
            # Aside from Dyna-Q, this Dyna method also works with other update
            # methods, like Sarsa, in which case there is a need for providing
            # A_{t+1} as well
            a_ = self.selector(agent, s_, t)
            # Planning involves simply learning from a simulated transition as if
            # it was actually experienced again, so the learner is called with this
            # synthetic data
            self.learner(agent, s, a, r, s_, a_, 0, 1 if done else inf, None, None)

################################
# Agent
################################

@dataclass
class Agent:
    """
    An RL agent that is to be trained in a specific environment,
    holding value estimates, an action selection strategy, update
    rules, and, optionally a planning strategy and hyperparameter
    update schedules.
    """
    num_states: int
    num_actions: int
    Q: np.ndarray = field(init=False)
    selector: ActionSelector
    learner: UpdateMethod
    schedules: list[UpdateMethod] = field(default_factory = list)
    planner: UpdateMethod = field(default_factory = NoPlanner)
    ep_lengths: list[int] = field(default_factory = list, init=False)
    ep_returns: list[int] = field(default_factory = list, init=False)

    def __post_init__(self):
        self.Q = np.zeros((self.num_states, self.num_actions))

    def save(self, filepath):
        with open(filepath, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, filepath) -> 'Agent':
        with open(filepath, "rb") as f:
            return pickle.load(f)

    @classmethod
    def combinations(
        cls,
        num_states: int,
        num_actions: int,
        selectors: list[ActionSelector],
        learners: list[UpdateMethod],
        scheduless: list[list[Schedule]],
        planners: list[UpdateMethod],
    ):
        param_space = product(selectors, learners, scheduless, planners)

        agents = [
            Agent(
                num_states=num_states,
                num_actions=num_actions,
                selector=selector,
                learner=learner,
                schedules=schedules,
                planner=planner,
            ) for (selector, learner, schedules, planner) in param_space
        ]

        return agents

    ### Evaulation methods ###
    def smoothed_ep_lengths(self, trail_length: int) -> tuple[list[int], list[int]]:
        """
        Return xs and ys for plotting smoothed out episode lengths.

        Args:
            trail_length: Used to calculate the update weight, 1/trail_length
        """
        xs = np.arange(len(self.ep_lengths))
        ys = np.zeros(len(self.ep_lengths))
        ys[0] = self.ep_lengths[0]

        weight = 1 / trail_length

        for i in range(1, len(ys)):
            ys[i] = weight * self.ep_lengths[i] + (1 - weight) * ys[i-1]

        return xs, ys

    def smoothed_ep_returns(self, trail_length: int) -> tuple[list[int], list[int]]:
        """
        Return xs and ys for plotting smoothed out episode returns.

        Args:
            trail_length: Used to calculate the update weight, 1/trail_length
        """

        xs = np.arange(len(self.ep_returns))
        ys = np.zeros(len(self.ep_returns))
        ys[0] = self.ep_returns[0]

        weight = 1.0 / trail_length
        assert 0 < weight <= 1

        for i in range(1, len(ys)):
            ys[i] = weight * self.ep_returns[i] + (1 - weight) * ys[i-1]

        return xs, ys

    def cumulative_eps(self) -> tuple[list[int], list[int]]:
        """Return xs and ys for plotting cumulative episodes over timesteps."""
        xs = np.cumsum(self.ep_lengths)
        ys = list(range(len(self.ep_lengths)))

        return xs, ys

    def cumulative_returns(self) -> tuple[list[int], list[int]]:
        """Return xs and ys for plotting cumulative rewards over episodes."""
        xs = list(range(len(self.ep_lengths)))
        ys = np.cumsum(self.ep_returns)

        return xs, ys

    def play_episode(self, env, max_steps=100):
        s = env.reset()
        a = self.selector(self, s, 0)
        states = [s]
        actions = [a]
        rewards = [0]
        for t in range(max_steps):
            s, r, done, *info = env.step(a)
            a = self.selector(self, s, t)
            rewards.append(r)
            states.append(s)
            actions.append(a)
            if done:
                break

        return states, actions, rewards

################################
# Training loop
################################

class TrainingInterrupt(Exception):
    pass


@dataclass
class Trainer:
    """
    Trains a given agent in a given environment,
    while keeping track of the progress along the way.
    """
    agent: Agent
    env: object
    episodes: int = field(default=0, init=False)

    def train(self, num_episodes, quiet=False, early_stop=-1):
        """
        Run the training loop for a given number of episodes, while
        allowing for early stopping and continuing, as well as muting
        the output, which is useful during parameter studies.
        """
        # Iterator that keeps track of progress even when training
        # loop is interrupted.
        train_iter = tqdm(
            range(self.episodes, num_episodes),
            desc="Episodes",
            initial=self.episodes,
            total=num_episodes,
            disable=quiet,
        )
        env = self.env
        agent = self.agent
        try:
            for ep in train_iter:
                self.episodes = ep
                # Interrupt the training process if our early stopping
                # point was reached. Ensure that training is only
                # continued if the early stopping point is removed.
                if early_stop != -1 and ep == early_stop:
                    raise TrainingInterrupt

                # Initialize the beginning of the episodes
                s = env.reset()
                a = agent.selector(agent, s, 0)
                # The terminal time step, used for some update methods
                T = inf

                # Keep track of the episode's return
                ret = 0
                discount = 1

                for t in count():

                    # Take a step in the environment and record the reward
                    s_, r, done, *info = env.step(a)
                    ret += discount * r
                    discount *= agent.learner.gamma

                    # Select a next action to take, unless the episode
                    # has finished, in which case set the terminal time
                    # step to its true value.
                    if done:
                        T = t + 1
                    else:
                        a_ = agent.selector(agent, s_, t)

                    # Update the agent's value estimates through direct LR
                    agent.learner(agent, s, a, r, s_, a_, t, T, ep, num_episodes)

                    # Apply the agent's parameter update schedules
                    for schedule in agent.schedules:
                        schedule(agent, s, a, r, s_, a_, t, T, ep, num_episodes)

                    # Execute the agent's planning strategy
                    if agent.planner:
                        agent.planner(agent, s, a, r, s_, a_, t, T, ep, num_episodes)

                    if done:
                        agent.ep_lengths.append(T)
                        agent.ep_returns.append(ret)
                        break

                    # Update values for the next time step
                    s, a = s_, a_

        except (KeyboardInterrupt, TrainingInterrupt):
            print(f"Training paused after {self.episodes} episodes.")