Skip to content

Commit 2d357a9

Browse files
committed
Merge remote-tracking branch 'upstream/main' into chtruong/remove-transition-msg
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
2 parents dadfa55 + f85ad03 commit 2d357a9

File tree

549 files changed

+128085
-33077
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

549 files changed

+128085
-33077
lines changed

.github/actions/action.yml

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,28 +11,28 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
name: 'Test Template'
15-
description: 'Template for running NeMo tests in a containerized environment'
14+
name: "Test Template"
15+
description: "Template for running NeMo tests in a containerized environment"
1616

1717
inputs:
1818
container-image:
19-
description: 'Container image to use for test'
19+
description: "Container image to use for test"
2020
required: true
2121
timeout:
22-
description: 'Max runtime of test in minutes'
22+
description: "Max runtime of test in minutes"
2323
required: false
24-
default: '30'
24+
default: "30"
2525
script:
26-
description: 'Test script to execute'
26+
description: "Test script to execute"
2727
required: true
2828
is-optional:
29-
description: 'Pass this job on failure.'
29+
description: "Pass this job on failure."
3030
required: false
31-
default: 'false'
31+
default: "false"
3232
is_unit_test:
33-
description: 'Upload coverage as unit test'
33+
description: "Upload coverage as unit test"
3434
required: false
35-
default: 'false'
35+
default: "false"
3636
tag:
3737
description: Latest or legacy test suite
3838
required: true
@@ -43,11 +43,14 @@ inputs:
4343
description: Model to launch
4444
required: false
4545
PAT:
46-
description: 'GitHub Personal Access Token'
46+
description: "GitHub Personal Access Token"
47+
required: true
48+
is_ci_workload:
49+
description: "Is CI workload"
4750
required: true
4851

4952
runs:
50-
using: 'composite'
53+
using: "composite"
5154
steps:
5255
- name: Checkout repository
5356
uses: actions/checkout@v2
@@ -77,6 +80,7 @@ runs:
7780
7881
export PYTHONPATH=$(pwd)
7982
export NEMORUN_HOME=$(pwd)
83+
export NCCL_DEBUG=INFO
8084
pip install --no-cache-dir uv
8185
uv sync --only-group test
8286
uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
@@ -119,9 +123,11 @@ runs:
119123
id: has-run-functional-tests-label
120124
env:
121125
GH_TOKEN: ${{ github.token }}
126+
IS_CI_WORKLOAD: ${{ inputs.is_ci_workload }}
122127
run: |
123128
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
124-
HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "false"
129+
HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "$IS_CI_WORKLOAD"
130+
HAS_RUN_FUNCTIONAL_TESTS_LABEL=${HAS_RUN_FUNCTIONAL_TESTS_LABEL:-$IS_CI_WORKLOAD}
125131
echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
126132
127133
- name: Create run-script (e2e test)

.github/copy-pr-bot.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
enabled: true
22
auto_sync_draft: false
33
auto_sync_ready: true
4-
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "jaredcasper", "jenchen13", "jiemingz", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]
4+
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hxbai", "jalbericiola", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]

.github/oncall_schedule.json

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,50 @@
11
[
22
{
3-
"user": "Phlip79",
4-
"date": "2026-01-07"
3+
"user": "maanug-nv",
4+
"date": "2026-01-21"
55
},
66
{
7-
"user": "BoxiangW",
8-
"date": "2026-01-14"
7+
"user": "dimapihtar",
8+
"date": "2026-01-28"
99
},
1010
{
11-
"user": "maanug-nv",
12-
"date": "2026-01-21"
11+
"user": "gautham-kollu",
12+
"date": "2026-02-04"
13+
},
14+
{
15+
"user": "janEbert",
16+
"date": "2026-02-11"
17+
},
18+
{
19+
"user": "Phlip79",
20+
"date": "2026-02-18"
1321
},
1422
{
1523
"user": "asolergi-nv",
16-
"date": "2026-01-28"
24+
"date": "2026-02-25"
25+
},
26+
{
27+
"user": "BoxiangW",
28+
"date": "2026-03-04"
29+
},
30+
{
31+
"user": "maanug-nv",
32+
"date": "2026-03-11"
1733
},
1834
{
1935
"user": "dimapihtar",
20-
"date": "2026-02-04"
36+
"date": "2026-03-18"
2137
},
2238
{
2339
"user": "gautham-kollu",
24-
"date": "2026-02-11"
40+
"date": "2026-03-25"
2541
},
2642
{
2743
"user": "janEbert",
28-
"date": "2026-02-18"
44+
"date": "2026-04-01"
45+
},
46+
{
47+
"user": "maanug-nv",
48+
"date": "2026-04-08"
2949
}
3050
]

.github/scripts/oncall_manager.py

Lines changed: 170 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,21 @@
1919
import argparse
2020
from datetime import datetime, timedelta, timezone
2121

22+
from slack_sdk import WebClient
23+
from slack_sdk.errors import SlackApiError
24+
2225
# Constants
2326
GITHUB_API_URL = "https://api.github.com"
2427
SCHEDULE_FILE = ".github/oncall_schedule.json"
2528
ROTATION_TEAM_SLUG = "mcore-oncall-rotation"
2629
ACTIVE_ONCALL_TEAM_SLUG = "mcore-oncall"
30+
SLACK_USERGROUP_HANDLE = "mcore-oncall"
2731
TARGET_WEEKS = 12
2832

33+
# Caches for email and Slack lookups
34+
_email_cache = {}
35+
_slack_id_cache = {}
36+
2937
def get_headers():
3038
token = os.environ.get("GH_TOKEN")
3139
if not token:
@@ -74,6 +82,159 @@ def get_team_members(org, team_slug):
7482

7583
return members
7684

85+
def get_user_email(username):
    """Return the best-known email address for a GitHub user.

    Lookup order:
      1. The user's public profile email (``GET /users/{username}``).
      2. Author emails on up to 10 recent commits by the user in the repository
         named by ``GITHUB_REPOSITORY`` (default ``NVIDIA/Megatron-LM``).

    An ``@nvidia.com`` address is returned as soon as one is seen. Otherwise the
    first non-noreply address found is used, and failing that
    ``{username}@users.noreply.github.com``. Whatever is returned is memoized in
    ``_email_cache``.

    Args:
        username: GitHub login to resolve.

    Returns:
        The selected email address (always a string).
    """
    if username in _email_cache:
        return _email_cache[username]

    noreply_suffix = "@users.noreply.github.com"
    headers = get_headers()
    public_email = None

    def _remember(email):
        # Single place that writes the cache so every exit path memoizes.
        _email_cache[username] = email
        return email

    try:
        # 1. Try to get user's public profile email first
        resp = requests.get(f"{GITHUB_API_URL}/users/{username}", headers=headers)
        if resp.status_code == 200:
            email = (resp.json() or {}).get('email')
            if email and not email.endswith(noreply_suffix):
                if email.endswith("@nvidia.com"):
                    return _remember(email)
                # Store non-nvidia email as fallback
                public_email = email

        # 2. Check recent commits in the repository for @nvidia.com email
        repo_env = os.environ.get("GITHUB_REPOSITORY", "NVIDIA/Megatron-LM")
        commits_url = f"{GITHUB_API_URL}/repos/{repo_env}/commits?author={username}&per_page=10"
        resp = requests.get(commits_url, headers=headers)

        if resp.status_code == 200:
            for commit in resp.json():
                # ``commit`` / ``author`` may be present but null in the payload;
                # ``or {}`` keeps the chained lookups from raising on None.
                commit_data = commit.get('commit') or {}
                author_data = commit_data.get('author') or {}
                email = author_data.get('email')

                if not email or email.endswith(noreply_suffix):
                    continue
                if email.endswith("@nvidia.com"):
                    print(f"Found @nvidia.com email for {username} from commits: {email}")
                    return _remember(email)
                if public_email is None:
                    public_email = email
    except Exception as e:
        # Best-effort lookup: report the failure, then fall through so an email
        # discovered before the failure is still used instead of being discarded.
        print(f"Warning: Could not get email for {username}: {e}")

    # 3. Use public email if found, otherwise fallback
    if public_email:
        print(f"Using public email for {username}: {public_email}")
        return _remember(public_email)

    # Fallback to noreply email
    fallback = f"{username}{noreply_suffix}"
    print(f"Warning: No email found for {username}, using fallback: {fallback}")
    return _remember(fallback)
149+
150+
def get_slack_client():
    """Build a Slack ``WebClient`` from the ``SLACK_TOKEN`` environment variable.

    Returns:
        A ``WebClient`` created with the token, or ``None`` when ``SLACK_TOKEN``
        is unset or empty.
    """
    token = os.environ.get("SLACK_TOKEN")
    return WebClient(token=token) if token else None
157+
158+
def get_slack_user_id(slack_client, email):
    """Resolve an email address to a Slack user ID.

    Results are memoized in ``_slack_id_cache``; a lookup that Slack rejects is
    cached as ``None`` so it is not retried.

    Args:
        slack_client: Client exposing ``users_lookupByEmail``, or a falsy value.
        email: Address to look up.

    Returns:
        The Slack user ID, or ``None`` when no client is given or Slack reports
        an error for the lookup.
    """
    if not slack_client:
        return None

    cached = _slack_id_cache.get(email) if email in _slack_id_cache else None
    if email in _slack_id_cache:
        return cached

    try:
        lookup = slack_client.users_lookupByEmail(email=email)
        resolved = lookup["user"]["id"]
    except SlackApiError as e:
        print(f"Warning: Could not find Slack user for {email}: {e.response['error']}")
        resolved = None

    _slack_id_cache[email] = resolved
    return resolved
175+
176+
def get_slack_usergroup_id(slack_client, handle):
    """Look up a Slack usergroup by handle and return its ID and members.

    Args:
        slack_client: Client exposing ``usergroups_list``, or a falsy value.
        handle: Usergroup handle to match exactly.

    Returns:
        A ``(usergroup_id, member_ids)`` tuple. ``(None, [])`` is returned when no
        client is given, no usergroup has the handle, or Slack reports an error.
    """
    # Always hand back a 2-tuple: callers unpack the result unconditionally,
    # so a bare ``None`` here would raise TypeError at the call site.
    if not slack_client:
        return None, []

    try:
        response = slack_client.usergroups_list(include_users=True)
    except SlackApiError as e:
        print(f"Warning: Could not list Slack usergroups: {e.response['error']}")
        return None, []

    for usergroup in response.get("usergroups", []):
        if usergroup.get("handle") == handle:
            return usergroup.get("id"), usergroup.get("users", [])

    print(f"Warning: Slack usergroup '{handle}' not found")
    return None, []
191+
192+
def update_slack_usergroup(new_oncall_username, old_members_usernames):
    """Make the new oncall the only member of the Slack usergroup.

    The new oncall is first added alongside the group's current members, and
    only then is the membership replaced with just the new oncall (the original
    author's note: usergroups need at least one member).

    The update is skipped, with a printed message, when no Slack token is
    configured, the new oncall's Slack ID cannot be resolved, or the usergroup
    is not found. Slack API errors during the update are printed, not raised.

    Args:
        new_oncall_username: GitHub login of the incoming oncall.
        old_members_usernames: Accepted for the caller's benefit but not read
            here; the members to replace are taken from Slack itself.
    """
    client = get_slack_client()
    if not client:
        print("Slack token not configured, skipping Slack usergroup update")
        return

    # Resolve the incoming oncall: GitHub login -> email -> Slack user ID.
    new_email = get_user_email(new_oncall_username)
    new_slack_id = get_slack_user_id(client, new_email)
    if not new_slack_id:
        print(f"Could not find Slack user ID for {new_oncall_username} ({new_email}), skipping Slack update")
        return

    # Fetch the target usergroup together with its present membership.
    group_id, existing_members = get_slack_usergroup_id(client, SLACK_USERGROUP_HANDLE)
    if not group_id:
        print(f"Could not find Slack usergroup '{SLACK_USERGROUP_HANDLE}', skipping Slack update")
        return

    try:
        # Phase 1: widen the group to include the new oncall without dropping anyone.
        if new_slack_id not in existing_members:
            widened = list(set(existing_members + [new_slack_id]))
            client.usergroups_users_update(usergroup=group_id, users=widened)
            print(f"Added {new_oncall_username} to Slack usergroup '{SLACK_USERGROUP_HANDLE}'")

        # Phase 2: narrow the group down to the new oncall alone.
        client.usergroups_users_update(usergroup=group_id, users=[new_slack_id])
        print(f"Updated Slack usergroup '{SLACK_USERGROUP_HANDLE}' to contain only {new_oncall_username}")
    except SlackApiError as e:
        print(f"Failed to update Slack usergroup: {e.response['error']}")
237+
77238
def load_schedule():
78239
if not os.path.exists(SCHEDULE_FILE):
79240
return []
@@ -111,14 +272,19 @@ def update_active_oncall_team(org, new_oncall):
111272
print(f"Failed to add {new_oncall} to {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}")
112273

113274
# 3. Remove everyone else
275+
old_members = []
114276
for member in current_members:
115277
if member not in [new_oncall, 'svcnvidia-nemo-ci']:
278+
old_members.append(member)
116279
url = f"{GITHUB_API_URL}/orgs/{org}/teams/{ACTIVE_ONCALL_TEAM_SLUG}/memberships/{member}"
117280
resp = requests.delete(url, headers=get_headers())
118281
if resp.status_code == 204:
119282
print(f"Removed {member} from {ACTIVE_ONCALL_TEAM_SLUG}")
120283
else:
121284
print(f"Failed to remove {member} from {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}")
285+
286+
# 4. Update Slack usergroup (add new oncall first, then remove old members)
287+
update_slack_usergroup(new_oncall, old_members)
122288

123289
def rotate_schedule(repo_owner, dry_run=False):
124290
schedule = load_schedule()
@@ -225,25 +391,16 @@ def ensure_schedule_filled(schedule, repo_owner):
225391
print(f"Appended: {new_entry}")
226392

227393
def assign_reviewer(pr_number):
228-
"""Assigns the current oncall as the reviewer for the PR."""
229-
schedule = load_schedule()
230-
if not schedule:
231-
print("Error: Schedule is empty. Cannot assign reviewer.")
232-
sys.exit(1)
233-
234-
current_entry = schedule[0]
235-
current_oncall = current_entry['user']
236-
print(f"Current oncall: {current_oncall} (Since {current_entry['date']})")
237-
394+
"""Assigns the mcore-oncall team as the reviewer for the PR."""
238395
owner, repo = get_repo_info()
239396
url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/pulls/{pr_number}/requested_reviewers"
240397

241-
# We can assign the user directly
242-
data = {"reviewers": [current_oncall]}
398+
# Assign the oncall team as reviewer
399+
data = {"team_reviewers": [ACTIVE_ONCALL_TEAM_SLUG]}
243400
resp = requests.post(url, headers=get_headers(), json=data)
244401

245402
if resp.status_code in [201, 200]:
246-
print(f"Successfully requested review from {current_oncall}")
403+
print(f"Successfully requested review from team NVIDIA/{ACTIVE_ONCALL_TEAM_SLUG}")
247404
else:
248405
print(f"Failed to request review: {resp.status_code} {resp.text}")
249406
sys.exit(1)

0 commit comments

Comments
 (0)