
Commit f17db88

Updates RL libraries training performance comparison (isaac-sim#4109)
# Description

> Reopens a pending PR that was closed when the cleanup and removal of the internal repository was performed.

This PR updates the agent configurations (making them as similar as possible) for the `Isaac-Humanoid-v0` task to ensure a more accurate comparison of the RL libraries when generating the [Training Performance](https://isaac-sim.github.io/IsaacLab/main/source/overview/reinforcement-learning/rl_frameworks.html#training-performance) table. To this end:

1. A common training-time summary (e.g. `Training time: XXX.YY seconds`) is printed when running the existing `train.py` scripts. Currently the RL libraries output training information in different formats and to different extents.
2. A note is added to the involved agent configurations to ensure that any modification is propagated to the other agent configuration files.
3. The commands used to benchmark the RL libraries are added to the docs, for clarity and reproducibility.

## Screenshots

Difference between the current agent configuration (red) and the new agent configuration (green), showing that the new configuration does not represent a radical change in learning:

<img width="1230" height="880" alt="Screenshot from 2025-11-28 13-19-14" src="https://github.com/user-attachments/assets/12a098c1-c169-4e09-b60f-b5f105341fbd" />

## Checklist

- [x] I have read and understood the [contribution guidelines](https://isaac-sim.github.io/IsaacLab/main/source/refs/contributing.html)
- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with `./isaaclab.sh --format`
- [x] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] I have updated the changelog and the corresponding version in the extension's `config/extension.toml` file
- [x] I have added my name to the `CONTRIBUTORS.md` or my name already exists there
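The common timing summary described in item 1 boils down to two lines added around each library's training call. A minimal sketch of the pattern (the `run_with_timing` wrapper is illustrative only; the PR adds the equivalent lines inline in each `train.py`):

```python
import time

def run_with_timing(train_fn):
    """Run a training entry point and print the common timing line.

    Illustrative only: the PR adds the same two lines inline to each
    library's train.py rather than using a wrapper like this one.
    """
    start_time = time.time()
    train_fn()
    # Same format across all train.py scripts, e.g. "Training time: 123.45 seconds"
    print(f"Training time: {round(time.time() - start_time, 2)} seconds")
```

Because every script emits the same line, the benchmark table can be filled in by grepping the terminal output for `Training time:`.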
1 parent aec36d9 commit f17db88

File tree

9 files changed: +80 additions, −19 deletions

docs/source/overview/reinforcement-learning/rl_frameworks.rst

Lines changed: 15 additions & 7 deletions
```diff
@@ -71,18 +71,26 @@ Training Performance
 --------------------
 
 We performed training with each RL library on the same ``Isaac-Humanoid-v0`` environment
-with ``--headless`` on a single RTX PRO 6000 GPU using 4096 environments
-and logged the total training time for 65.5M steps for each RL library.
-
+with ``--headless`` on a single NVIDIA GeForce RTX 4090 and logged the total training time
+for 65.5M steps (4096 environments x 32 rollout steps x 500 iterations).
 
 +--------------------+-----------------+
 | RL Library         | Time in seconds |
 +====================+=================+
-| RL-Games           | 207             |
+| RL-Games           | 201             |
 +--------------------+-----------------+
-| SKRL               | 208             |
+| SKRL               | 201             |
 +--------------------+-----------------+
-| RSL RL             | 199             |
+| RSL RL             | 198             |
 +--------------------+-----------------+
-| Stable-Baselines3  | 322             |
+| Stable-Baselines3  | 287             |
 +--------------------+-----------------+
+
+Training commands (check for the *'Training time: XXX seconds'* line in the terminal output):
+
+.. code:: bash
+
+   python scripts/reinforcement_learning/rl_games/train.py --task Isaac-Humanoid-v0 --max_iterations 500 --headless
+   python scripts/reinforcement_learning/skrl/train.py --task Isaac-Humanoid-v0 --max_iterations 500 --headless
+   python scripts/reinforcement_learning/rsl_rl/train.py --task Isaac-Humanoid-v0 --max_iterations 500 --headless
+   python scripts/reinforcement_learning/sb3/train.py --task Isaac-Humanoid-v0 --max_iterations 500 --headless
```
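As a quick sanity check of the step count quoted in the updated docs, the three factors in the parenthetical multiply out to the "65.5M steps" in the table:

```python
num_envs = 4096        # parallel environments
rollout_steps = 32     # per-iteration rollout length shared by all four configs
iterations = 500       # --max_iterations in the benchmark commands

total_steps = num_envs * rollout_steps * iterations
print(total_steps)  # 65536000, reported as "65.5M steps"
```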

scripts/reinforcement_learning/rl_games/train.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -67,6 +67,7 @@
 import math
 import os
 import random
+import time
 from datetime import datetime
 
 from rl_games.common import env_configurations, vecenv
@@ -201,6 +202,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print_dict(video_kwargs, nesting=4)
         env = gym.wrappers.RecordVideo(env, **video_kwargs)
 
+    start_time = time.time()
+
     # wrap around environment for rl-games
     env = RlGamesVecEnvWrapper(env, rl_device, clip_obs, clip_actions, obs_groups, concate_obs_groups)
 
@@ -250,6 +253,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
     else:
         runner.run({"train": True, "play": False, "sigma": train_sigma})
 
+    print(f"Training time: {round(time.time() - start_time, 2)} seconds")
+
     # close the simulator
     env.close()
```
scripts/reinforcement_learning/rsl_rl/train.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -78,6 +78,7 @@
 import gymnasium as gym
 import logging
 import os
+import time
 import torch
 from datetime import datetime
 
@@ -187,6 +188,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print_dict(video_kwargs, nesting=4)
         env = gym.wrappers.RecordVideo(env, **video_kwargs)
 
+    start_time = time.time()
+
     # wrap around environment for rsl-rl
     env = RslRlVecEnvWrapper(env, clip_actions=agent_cfg.clip_actions)
 
@@ -212,6 +215,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
     # run training
     runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)
 
+    print(f"Training time: {round(time.time() - start_time, 2)} seconds")
+
     # close the simulator
     env.close()
```

scripts/reinforcement_learning/sb3/train.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -80,6 +80,7 @@ def cleanup_pbar(*args):
 import numpy as np
 import os
 import random
+import time
 from datetime import datetime
 
 from stable_baselines3 import PPO
@@ -176,6 +177,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print_dict(video_kwargs, nesting=4)
         env = gym.wrappers.RecordVideo(env, **video_kwargs)
 
+    start_time = time.time()
+
     # wrap around environment for stable baselines
     env = Sb3VecEnvWrapper(env, fast_variant=not args_cli.keep_all_info)
 
@@ -223,6 +226,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print("Saving normalization")
         env.save(os.path.join(log_dir, "model_vecnormalize.pkl"))
 
+    print(f"Training time: {round(time.time() - start_time, 2)} seconds")
+
     # close the simulator
     env.close()
```

scripts/reinforcement_learning/skrl/train.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -78,6 +78,7 @@
 import logging
 import os
 import random
+import time
 from datetime import datetime
 
 import skrl
@@ -214,6 +215,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
         print_dict(video_kwargs, nesting=4)
         env = gym.wrappers.RecordVideo(env, **video_kwargs)
 
+    start_time = time.time()
+
     # wrap around environment for skrl
     env = SkrlVecEnvWrapper(env, ml_framework=args_cli.ml_framework)  # same as: `wrap_env(env, wrapper="auto")`
 
@@ -229,6 +232,8 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen
     # run training
     runner.run()
 
+    print(f"Training time: {round(time.time() - start_time, 2)} seconds")
+
     # close the simulator
     env.close()
```

source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rl_games_ppo_cfg.yaml

Lines changed: 11 additions & 3 deletions
```diff
@@ -3,6 +3,14 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+# ========================================= IMPORTANT NOTICE =========================================
+#
+# This file defines the agent configuration used to generate the "Training Performance" table in
+# https://isaac-sim.github.io/IsaacLab/main/source/overview/reinforcement-learning/rl_frameworks.html.
+# Ensure that the configurations for the other RL libraries are updated if this one is modified.
+#
+# ====================================================================================================
+
 params:
   seed: 42
 
@@ -50,13 +58,13 @@ params:
     device_name: 'cuda:0'
     multi_gpu: False
     ppo: True
-    mixed_precision: True
+    mixed_precision: False
     normalize_input: True
     normalize_value: True
     value_bootstrap: True
     num_actors: -1
     reward_shaper:
-      scale_value: 0.6
+      scale_value: 1.0
     normalize_advantage: True
     gamma: 0.99
     tau: 0.95
@@ -72,7 +80,7 @@ params:
     truncate_grads: True
     e_clip: 0.2
     horizon_length: 32
-    minibatch_size: 32768
+    minibatch_size: 32768 # num_envs * horizon_length / num_mini_batches
     mini_epochs: 5
     critic_coef: 4
     clip_value: True
```
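The new inline comment on `minibatch_size` can be checked with one line of arithmetic. Note that `num_mini_batches` does not appear in this diff, so the value of 4 below is an assumption chosen to make the formula come out to the configured size:

```python
num_envs = 4096
horizon_length = 32
num_mini_batches = 4  # assumption: not shown in this diff

# num_envs * horizon_length / num_mini_batches, per the new comment
minibatch_size = num_envs * horizon_length // num_mini_batches
print(minibatch_size)  # 32768, matching the config value
```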

source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/rsl_rl_ppo_cfg.py

Lines changed: 15 additions & 4 deletions
```diff
@@ -3,6 +3,17 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+"""
+========================================= IMPORTANT NOTICE =========================================
+
+This file defines the agent configuration used to generate the "Training Performance" table in
+https://isaac-sim.github.io/IsaacLab/main/source/overview/reinforcement-learning/rl_frameworks.html.
+Ensure that the configurations for the other RL libraries are updated if this one is modified.
+
+====================================================================================================
+"""
+
+
 from isaaclab.utils import configclass
 
 from isaaclab_rl.rsl_rl import RslRlOnPolicyRunnerCfg, RslRlPpoActorCriticCfg, RslRlPpoAlgorithmCfg
@@ -12,18 +23,18 @@
 class HumanoidPPORunnerCfg(RslRlOnPolicyRunnerCfg):
     num_steps_per_env = 32
     max_iterations = 1000
-    save_interval = 50
+    save_interval = 100
     experiment_name = "humanoid"
     policy = RslRlPpoActorCriticCfg(
         init_noise_std=1.0,
-        actor_obs_normalization=False,
-        critic_obs_normalization=False,
+        actor_obs_normalization=True,
+        critic_obs_normalization=True,
         actor_hidden_dims=[400, 200, 100],
         critic_hidden_dims=[400, 200, 100],
         activation="elu",
     )
     algorithm = RslRlPpoAlgorithmCfg(
-        value_loss_coef=1.0,
+        value_loss_coef=2.0,
         use_clipped_value_loss=True,
         clip_param=0.2,
         entropy_coef=0.0,
```

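One way to read the coefficient changes above is that they align the effective value-loss weight across libraries. The rl_games entry below assumes that library's usual internal 0.5 scaling of the critic loss; treat that factor as an assumption, not something stated in this diff:

```python
# Effective value-loss weight after this PR, per each library's config
effective_value_weight = {
    "rsl_rl": 2.0,        # value_loss_coef=2.0
    "sb3": 2.0,           # vf_coef=2.0
    "skrl": 2.0,          # value_loss_scale=2.0
    "rl_games": 4 * 0.5,  # critic_coef=4, assuming rl_games' internal 0.5 factor
}
# all four libraries end up with the same effective weight
assert len(set(effective_value_weight.values())) == 1
```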
source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/sb3_ppo_cfg.yaml

Lines changed: 9 additions & 2 deletions
```diff
@@ -3,7 +3,14 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
-# Adapted from rsl_rl config
+# ========================================= IMPORTANT NOTICE =========================================
+#
+# This file defines the agent configuration used to generate the "Training Performance" table in
+# https://isaac-sim.github.io/IsaacLab/main/source/overview/reinforcement-learning/rl_frameworks.html.
+# Ensure that the configurations for the other RL libraries are updated if this one is modified.
+#
+# ====================================================================================================
+
 seed: 42
 policy: "MlpPolicy"
 n_timesteps: !!float 5e7
@@ -18,7 +25,7 @@ clip_range: 0.2
 n_epochs: 5
 gae_lambda: 0.95
 max_grad_norm: 1.0
-vf_coef: 0.5
+vf_coef: 2.0
 policy_kwargs:
   activation_fn: 'nn.ELU'
   net_arch: [400, 200, 100]
```

source/isaaclab_tasks/isaaclab_tasks/manager_based/classic/humanoid/agents/skrl_ppo_cfg.yaml

Lines changed: 10 additions & 3 deletions
```diff
@@ -3,6 +3,14 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
+# ========================================= IMPORTANT NOTICE =========================================
+#
+# This file defines the agent configuration used to generate the "Training Performance" table in
+# https://isaac-sim.github.io/IsaacLab/main/source/overview/reinforcement-learning/rl_frameworks.html.
+# Ensure that the configurations for the other RL libraries are updated if this one is modified.
+#
+# ====================================================================================================
+
 seed: 42
 
 
@@ -67,14 +75,13 @@ agent:
   entropy_loss_scale: 0.0
   value_loss_scale: 2.0
   kl_threshold: 0.0
-  rewards_shaper_scale: 0.6
   time_limit_bootstrap: False
   # logging and checkpoint
   experiment:
     directory: "humanoid"
    experiment_name: ""
-    write_interval: auto
-    checkpoint_interval: auto
+    write_interval: 32
+    checkpoint_interval: 3200
 
 
 # Sequential trainer
```

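The fixed skrl logging values replace the previous `auto` settings and, if read in environment timesteps (an assumption about skrl's interval units, not stated in this diff), line up with the 32-step rollout shared by all the configs:

```python
rollout_steps = 32                         # rollout length shared by the configs
write_interval = rollout_steps * 1         # log once per rollout -> 32
checkpoint_interval = rollout_steps * 100  # checkpoint every 100 rollouts -> 3200
```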