Skip to content

Commit 353c8dd

Browse files
feat: Support LSF job arrays (#491)
* fix: Support LSF job arrays * This fix includes the necessary changes in cgroup regex and bjobs handling to account job arrays in LSF Signed-off-by: Mahendra Paipuri <mahendra.paipuri@gmail.com> * style: Fix lint issues Signed-off-by: Mahendra Paipuri <mahendra.paipuri@gmail.com> --------- Signed-off-by: Mahendra Paipuri <mahendra.paipuri@gmail.com>
1 parent a73775b commit 353c8dd

7 files changed

Lines changed: 101 additions & 27 deletions

File tree

pkg/collector/cgroup.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,11 @@ var (
9898
/*
9999
For v1 possibilities are /cpuacct/lsf/cluster_name/job.42.34071.1775289779
100100
/memory/lsf/cluster_name/job.42.34071.1775289779
101+
/memory/lsf/cluster_name/job.42[2].34071.1775289779 for job arrays
101102
102103
For v2 possibilities are /lsf/cluster1/job.9.30675.1775295256
103104
/lsf/cluster1/job.9.30675.1775295256/leaf
105+
/lsf/cluster1/job.9[1].30675.1775295256/leaf for job arrays
104106
105107
Seems like there are no cgroups for each MPI task created by LSF. At least I am not
106108
able to find proper settings to get them created
@@ -112,8 +114,8 @@ var (
112114
/bin/bash /home/usr1/.lsbatch/1775296755.13.shell
113115
*/
114116
var (
115-
lsfCgroupV1PathRegex = regexp.MustCompile(`^.*/lsf/(?:(?P<cluster_name>.*?))/job\.(?P<id>[0-9]+)\.(?:(?P<int_id>[0-9]+))\.(?:(?P<ts>[0-9]+))(?:.*$)`)
116-
lsfCgroupV2PathRegex = regexp.MustCompile(`^.*/lsf/(?:(?P<cluster_name>.*?))/job\.(?P<id>[0-9]+)\.(?:(?P<int_id>[0-9]+))\.(?:(?P<ts>[0-9]+))/leaf(?:.*$)`)
117+
lsfCgroupV1PathRegex = regexp.MustCompile(`^.*/lsf/(?:(?P<cluster_name>.*?))/job\.(?P<id>[0-9]+(?:\[[0-9]+\])?)\.(?:(?P<int_id>[0-9]+))\.(?:(?P<ts>[0-9]+))(?:.*$)`)
118+
lsfCgroupV2PathRegex = regexp.MustCompile(`^.*/lsf/(?:(?P<cluster_name>.*?))/job\.(?P<id>[0-9]+(?:\[[0-9]+\])?)\.(?:(?P<int_id>[0-9]+))\.(?:(?P<ts>[0-9]+))/leaf(?:.*$)`)
117119
lsfIgnoreProcsRegex = regexp.MustCompile("(.*)/.lsbatch/(.*)")
118120
)
119121

pkg/collector/helper.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,7 @@ func readProcEnvirons(data any) error {
488488
var targetEnvVarsToFetch []string
489489

490490
for _, targetEnvVar := range d.targetEnvVars {
491-
if _, ok := d.targetEnvVarValues[targetEnvVar]; !ok {
491+
if val, ok := d.targetEnvVarValues[targetEnvVar]; !ok || val == "" {
492492
targetEnvVarsToFetch = append(targetEnvVarsToFetch, targetEnvVar)
493493
}
494494
}
@@ -510,8 +510,12 @@ func readProcEnvirons(data any) error {
510510
// When env var entry found, get all necessary env vars
511511
for _, env := range environments {
512512
for _, targetEnvVar := range targetEnvVarsToFetch {
513-
if strings.Contains(env, targetEnvVar) {
514-
d.targetEnvVarValues[targetEnvVar] = strings.Split(env, "=")[1]
513+
// Split env var name and value and compare the name with targetEnvVar.
514+
// This is because if we have targetEnvVar as CUDA_VISIBLE_DEVICES and
515+
// env is CUDA_VISIBLE_DEVICES_SOMEPREFIX, we will match targetEnvVar with
516+
// CUDA_VISIBLE_DEVICES_SOMEPREFIX which is not we want.
517+
if p := strings.Split(env, "="); len(p) == 2 && p[0] == targetEnvVar {
518+
d.targetEnvVarValues[targetEnvVar] = p[1]
515519
}
516520
}
517521
}

pkg/collector/helper_test.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,6 @@ func TestReadProcEnvirons(t *testing.T) {
255255
targetEnvVars: []string{"CUDA_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES1"},
256256
},
257257
expected: map[string]string{
258-
"CUDA_VISIBLE_DEVICES": "MIG-GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1/0,MIG-GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7/1/0",
259258
"CUDA_VISIBLE_DEVICES1": "MIG-GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1/0,MIG-GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7/1/0",
260259
},
261260
},
@@ -266,7 +265,6 @@ func TestReadProcEnvirons(t *testing.T) {
266265
targetEnvVars: []string{"CUDA_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES1"},
267266
},
268267
expected: map[string]string{
269-
"CUDA_VISIBLE_DEVICES": "MIG-GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1/0,MIG-GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7/1/0",
270268
"CUDA_VISIBLE_DEVICES1": "MIG-GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1/0,MIG-GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7/1/0",
271269
},
272270
},

pkg/collector/lsf.go

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,11 @@ const (
5050
lsfReadProcCtx = "lsf_read_procs"
5151
)
5252

53-
// Regex to get physical GPU UUID and GPU_I_ID, GPU_C_ID from MIG device IDs.
5453
var (
54+
// Regex to get physical GPU UUID and GPU_I_ID, GPU_C_ID from MIG device IDs.
5555
migDeviceIDRegex = regexp.MustCompile("^MIG-(?P<GPU_UUID>.*?)/(?P<GPU_I_ID>[0-9]+?)/(?P<GPU_C_ID>[0-9]+)$")
56+
// Regex to extract job index from job ID.
57+
jobIndexRegex = regexp.MustCompile(`^(?P<id>[0-9]+)(?:\[(?P<index>[0-9]+)\])?$`)
5658
)
5759

5860
// Cache interval.
@@ -68,6 +70,70 @@ type LSFJobRecord struct {
6870
GPUSlot string `json:"GPU_ALLOC"`
6971
}
7072

73+
// UnmarshalJSON unmarshals byte array into LSFJobRecord.
74+
func (r *LSFJobRecord) UnmarshalJSON(b []byte) error {
75+
// Define a temporary type to avoid infinite looping
76+
type LSFJobRecordTmp LSFJobRecord
77+
78+
type tmp struct {
79+
LSFJobRecordTmp
80+
81+
Index string `json:"JOBINDEX"`
82+
}
83+
84+
var s tmp
85+
86+
err := json.Unmarshal(b, &s)
87+
if err != nil {
88+
return err
89+
}
90+
91+
*r = LSFJobRecord(s.LSFJobRecordTmp)
92+
93+
// If Index exists, attach it to job ID. So, if id is 6 and index is 2,
94+
// the final id will be "6[2]" as appaears in cgroup paths
95+
if s.Index != "" {
96+
r.ID = fmt.Sprintf("%s[%s]", r.ID, s.Index)
97+
}
98+
99+
return nil
100+
}
101+
102+
// MarshalJSON marshals LSFJobRecord into byte array.
103+
func (r LSFJobRecord) MarshalJSON() ([]byte, error) {
104+
// Define a temporary type to avoid infinite looping
105+
type LSFJobRecordTmp LSFJobRecord
106+
107+
// Another tmp struct that contains job index
108+
type tmp struct {
109+
LSFJobRecordTmp
110+
111+
Index string `json:"JOBINDEX"`
112+
}
113+
114+
var s tmp
115+
116+
s.LSFJobRecordTmp = LSFJobRecordTmp(r)
117+
118+
// If jobID has index inside it, extract it
119+
match := jobIndexRegex.FindStringSubmatch(r.ID)
120+
// If no matches found, return
121+
if len(match) > 0 {
122+
// Get index of the job
123+
for i, name := range jobIndexRegex.SubexpNames() {
124+
if name == "id" {
125+
s.ID = strings.TrimSpace(match[i])
126+
}
127+
128+
if name == "index" {
129+
s.Index = strings.TrimSpace(match[i])
130+
}
131+
}
132+
}
133+
134+
return json.Marshal(s)
135+
}
136+
71137
// LSFJobsList contains list of all job records.
72138
type LSFJobsList struct {
73139
Command string `json:"COMMAND"`
@@ -379,7 +445,7 @@ func (c *lsfCollector) jobResources(cgroups []cgroup) {
379445
// Get job resources from bjobs command
380446
cmdOut, err := osexec.ExecuteContext(
381447
ctx, c.bjobsExePath,
382-
[]string{"-u", "all", "-m", c.hostname, "-json", "-o", "jobid alloc_slot nalloc_slot gpu_alloc"},
448+
[]string{"-u", "all", "-m", c.hostname, "-json", "-o", "jobid jobindex alloc_slot nalloc_slot gpu_alloc"},
383449
nil,
384450
)
385451
if err != nil {
@@ -563,7 +629,8 @@ func (c *lsfCollector) jobGPUInstanceResources(uuid string, procs []procfs.Proc,
563629
dataPtr := &readProcSecurityCtxData{
564630
procs: procs,
565631
ignoreProc: func(envs []string) bool {
566-
return !slices.Contains(envs, "LSB_JOBID="+uuid)
632+
// LSB_JOBID WILL NOT HAVE job index. LSB_BATCH_JID is more realiable
633+
return !slices.Contains(envs, "LSB_BATCH_JID="+uuid)
567634
},
568635
// In the case of AMD GPUs, GPU_DEVICE_ORDINAL and GPU_DEVICE_ORDINAL1 are
569636
// exported with "real" GPU minor numbers. But at the same time, bjobs

pkg/collector/lsf_test.go

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -145,10 +145,10 @@ func TestLSFJobDevices(t *testing.T) {
145145
}
146146

147147
func writeLSFEnvironFile(procFS string, jobid string, val string) error {
148-
envs := []string{"LSB_JOBID=" + jobid, "CUDA_VISIBLE_DEVICES=" + val}
148+
envs := []string{"LSB_BATCH_JID=" + jobid, "CUDA_VISIBLE_DEVICES=" + val}
149149

150150
return os.WriteFile(
151-
procFS+"/"+jobid+"/environ",
151+
procFS+"/"+strings.ReplaceAll(strings.ReplaceAll(jobid, "[", ""), "]", "")+"/environ",
152152
[]byte(strings.Join(envs, "\000")+"\000"),
153153
0o600,
154154
)
@@ -443,20 +443,20 @@ func TestLSFJobDevicesCaching(t *testing.T) {
443443
require.NoError(t, err)
444444
}
445445

446-
for i := 19; i < 40; i++ {
447-
dir := fmt.Sprintf("%s/cpuacct/lsf/cluster1/job.%d.12345.123443", cgroupsPath, i)
446+
for i := 1; i <= 20; i++ {
447+
dir := fmt.Sprintf("%s/cpuacct/lsf/cluster1/job.19[%d].12345.123443", cgroupsPath, i)
448448

449449
err = os.MkdirAll(dir, 0o750)
450450
require.NoError(t, err)
451451

452452
err = os.WriteFile(
453453
dir+"/cgroup.procs",
454-
fmt.Appendf(nil, "%d\n", i),
454+
fmt.Appendf(nil, "19%d\n", i),
455455
0o600,
456456
)
457457
require.NoError(t, err)
458458

459-
procDir := fmt.Sprintf("%s/%d", procFS, i)
459+
procDir := fmt.Sprintf("%s/19%d", procFS, i)
460460

461461
err = os.MkdirAll(procDir, 0o750)
462462
require.NoError(t, err)
@@ -539,32 +539,32 @@ func TestLSFJobDevicesCaching(t *testing.T) {
539539
NumJobs: 4,
540540
Records: []LSFJobRecord{
541541
{
542-
ID: "19",
542+
ID: "19[1]",
543543
AllocSlot: "testhost-1.example.com:testhost-1.example.com:testhost-1.example.com:testhost-1.example.com",
544544
GPUSlot: "testhost-1.example.com:0,0,0,0",
545545
},
546546
{
547-
ID: "20",
547+
ID: "19[2]",
548548
AllocSlot: "testhost-1.example.com:testhost-1.example.com:testhost-1.example.com:testhost-1.example.com",
549549
GPUSlot: "testhost-1.example.com:2,2,2,2",
550550
},
551551
{
552-
ID: "21",
552+
ID: "19[3]",
553553
AllocSlot: "testhost-1.example.com:testhost-1.example.com:testhost-2.example.com:testhost-2.example.com",
554554
GPUSlot: `testhost-1.example.com:1:1\/0;testhost-2.example.com:0:1\/0`,
555555
},
556556
{
557-
ID: "22",
557+
ID: "19[4]",
558558
AllocSlot: "testhost-1.example.com:testhost-1.example.com",
559559
GPUSlot: "testhost-1.example.com:0,2",
560560
},
561561
{
562-
ID: "23",
562+
ID: "19[5]",
563563
AllocSlot: "testhost-1.example.com:testhost-1.example.com",
564564
GPUSlot: `testhost-1.example.com:3:13\/1`,
565565
},
566566
{
567-
ID: "24",
567+
ID: "19[6]",
568568
AllocSlot: "testhost-1.example.com:testhost-1.example.com",
569569
GPUSlot: `testhost-1.example.com:3:13\/1`,
570570
},
@@ -610,10 +610,10 @@ func TestLSFJobDevicesCaching(t *testing.T) {
610610

611611
// New expected jobs
612612
expected = map[string][]ComputeUnit{
613-
"0": {{UUID: "19", NumShares: 1}, {UUID: "22", NumShares: 1}},
614-
"1": {{UUID: "21", NumShares: 1}},
615-
"2": {{UUID: "20", NumShares: 1}, {UUID: "22", NumShares: 1}},
616-
"5": {{UUID: "23", NumShares: 1}, {UUID: "24", NumShares: 1}},
613+
"0": {{UUID: "19[1]", NumShares: 1}, {UUID: "19[4]", NumShares: 1}},
614+
"1": {{UUID: "19[3]", NumShares: 1}},
615+
"2": {{UUID: "19[2]", NumShares: 1}, {UUID: "19[4]", NumShares: 1}},
616+
"5": {{UUID: "19[5]", NumShares: 1}, {UUID: "19[6]", NumShares: 1}},
617617
}
618618

619619
for _, gpu := range c.gpuSMI.Devices {

pkg/collector/testdata/bjobs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,21 @@ echo -e '{
66
"RECORDS":[
77
{
88
"JOBID":"1009248",
9+
"JOBINDEX":"",
910
"ALLOC_SLOT":"testhost-1.example.com:testhost-1.example.com:testhost-1.example.com:testhost-1.example.com:testhost-1.example.com:testhost-1.example.com",
1011
"NALLOC_SLOT":"6",
1112
"GPU_ALLOC":"testhost-1.example.com:2:1\/0,3:1\/0,2:1\/0,3:1\/0,2:1\/0,3:1\/0"
1213
},
1314
{
1415
"JOBID":"1009249",
16+
"JOBINDEX":"",
1517
"ALLOC_SLOT":"testhost-1.example.com:testhost-1.example.com:testhost-1.example.com:testhost-1.example.com:testhost-2.example.com:testhost-2.example.com:testhost-2.example.com:testhost-2.example.com",
1618
"NALLOC_SLOT":"8",
1719
"GPU_ALLOC":"testhost-1.example.com:0,0,1,1;testhost-2.example.com:0,0,1,1"
1820
},
1921
{
2022
"JOBID":"1009250",
23+
"JOBINDEX":"",
2124
"ALLOC_SLOT":"testhost-1.example.com:testhost-1.example.com:testhost-1.example.com:testhost-1.example.com",
2225
"NALLOC_SLOT":"4",
2326
"GPU_ALLOC":"testhost-1.example.com:1,4,1,4"

pkg/collector/testdata/proc.ttar

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9920,7 +9920,7 @@ SymlinkTo: /usr/bin
99209920
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
99219921
Path: proc/46281/environ
99229922
Lines: 1
9923-
PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTEENABLE_PROFILING=1NULLBYTESLURM_JOB_GPUS=2NULLBYTESLURM_JOB_ID=1009248NULLBYTESLURM_SHARDS_ON_NODE=5NULLBYTELSB_JOBID=1009248NULLBYTECUDA_VISIBLE_DEVICES1=MIG-GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1/0,MIG-GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7/1/0NULLBYTE
9923+
PATH=/go/bin:/usr/local/go/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/binNULLBYTEHOSTNAME=cd24e11f73a5NULLBYTETERM=xtermNULLBYTEGOLANG_VERSION=1.12.5NULLBYTEGOPATH=/goNULLBYTEHOME=/rootNULLBYTEENABLE_PROFILING=1NULLBYTESLURM_JOB_GPUS=2NULLBYTESLURM_JOB_ID=1009248NULLBYTESLURM_SHARDS_ON_NODE=5NULLBYTELSB_JOBID=1009248NULLBYTECUDA_VISIBLE_DEVICES1=MIG-GPU-956348bc-d43d-23ed-53d4-857749fa2b67/1/0,MIG-GPU-feba7e40-d724-01ff-b00f-3a439a28a6c7/1/0NULLBYTECUDA_VISIBLE_DEVICES1_ORIG=NULLBYTELSB_BATCH_JID=1009248NULLBYTE
99249924
Mode: 644
99259925
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
99269926
Path: proc/46281/exe

0 commit comments

Comments
 (0)