Skip to content

Commit ce29387

Browse files
committed
Add LCOW assigned device logs; increase timeout
Lower the level of the logs emitted when iterating over host devices from info to trace. Add more logs (and a span) to the `AddAssignedDevice` function for tracing where time is spent when assigning devices. Increase the `AddAssignedDevice` timeout from 10 seconds to 60 seconds to allow more time when adding a large number of devices or waiting on the host to make them available. Add a log when adding the OCI runtime hook for the NVIDIA tool. Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>
1 parent cbc0126 commit ce29387

File tree

2 files changed

+31
-10
lines changed

2 files changed

+31
-10
lines changed

internal/guest/runtime/hcsv2/nvidia_utils.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@ import (
1313

1414
oci "github.com/opencontainers/runtime-spec/specs-go"
1515
"github.com/pkg/errors"
16+
"github.com/sirupsen/logrus"
1617

1718
"github.com/Microsoft/hcsshim/cmd/gcstools/generichook"
1819
"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
1920
"github.com/Microsoft/hcsshim/internal/hooks"
21+
"github.com/Microsoft/hcsshim/internal/log"
2022
"github.com/Microsoft/hcsshim/pkg/annotations"
2123
)
2224

@@ -69,6 +71,9 @@ func addNvidiaDeviceHook(ctx context.Context, spec *oci.Spec, ociBundlePath stri
6971
hookEnv := append(updateEnvWithNvidiaVariables(), hookLogDebugFileEnvOpt)
7072

7173
nvidiaHook := hooks.NewOCIHook(genericHookPath, args, hookEnv)
74+
if logrus.IsLevelEnabled(logrus.DebugLevel) {
75+
log.G(ctx).WithField("hook", log.Format(ctx, nvidiaHook)).Debug("adding nvidia device runtime hook")
76+
}
7277
return hooks.AddOCIHook(spec, hooks.CreateRuntime, nvidiaHook)
7378
}
7479

internal/guest/spec/spec_devices.go

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818

1919
"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
2020
"github.com/Microsoft/hcsshim/internal/log"
21+
"github.com/Microsoft/hcsshim/internal/oc"
2122
)
2223

2324
const (
@@ -37,34 +38,48 @@ const (
3738
// into the resulting container by the runtime.
3839
//
3940
// GPU devices are skipped, since they are handled in [addNvidiaDeviceHook].
40-
func AddAssignedDevice(ctx context.Context, spec *oci.Spec) error {
41+
func AddAssignedDevice(ctx context.Context, spec *oci.Spec) (err error) {
42+
ctx, span := oc.StartSpan(ctx, "AddAssignedDevice")
43+
defer span.End()
44+
defer func() { oc.SetSpanStatus(span, err) }()
45+
4146
// Add an explicit timeout before we try to find the dev nodes so we
4247
// aren't waiting forever.
43-
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
48+
ctx, cancel := context.WithTimeout(ctx, 60*time.Second)
4449
defer cancel()
4550

4651
for _, d := range spec.Windows.Devices {
52+
entry := log.G(ctx).WithField("windows-device", log.Format(ctx, d))
53+
4754
switch d.IDType {
4855
case vpciDeviceIDTypeLegacy, vpciDeviceIDType:
56+
entry.Trace("adding vPCI device")
4957
// validate that the device is available
5058
fullPCIPath, err := pci.FindDeviceFullPath(ctx, d.ID)
5159
if err != nil {
5260
return errors.Wrapf(err, "failed to find device pci path for device %v", d)
5361
}
62+
63+
entry.WithField("path", fullPCIPath).Trace("found PCI path for Windows device")
5464
// find the device nodes that link to the pci path we just got
5565
devs, err := devicePathsFromPCIPath(ctx, fullPCIPath)
5666
if err != nil {
57-
return errors.Wrapf(err, "failed to find dev node for device %v", d)
67+
return errors.Wrapf(err, "failed to find dev node for device %v with path %q", d, fullPCIPath)
68+
}
69+
70+
if logrus.IsLevelEnabled(logrus.DebugLevel) {
71+
entry.WithFields(logrus.Fields{
72+
"pci-path": fullPCIPath,
73+
"host-devices": log.Format(ctx, devs),
74+
}).Debug("adding host devices associated with Windows device")
5875
}
5976
for _, dev := range devs {
6077
AddLinuxDeviceToSpec(ctx, dev, spec, true)
6178
}
6279
case gpuDeviceIDType:
80+
entry.Trace("skipping GPU device")
6381
default:
64-
log.G(ctx).WithFields(logrus.Fields{
65-
"type": d.IDType,
66-
"id": d.ID,
67-
}).Warn("unknown device type")
82+
entry.Warn("unknown device type")
6883
}
6984
}
7085

@@ -98,11 +113,12 @@ func devicePathsFromPCIPath(ctx context.Context, pciPath string) ([]*config.Devi
98113

99114
// find corresponding entries in sysfs
100115
for _, d := range hostDevices {
116+
entry := log.G(ctx).WithField("host-device", log.Format(ctx, d))
117+
entry.Trace("looking at host device")
118+
101119
major := d.Major
102120
minor := d.Minor
103121

104-
log.G(ctx).WithField("device", d).Infof("looking at device: %+v", d)
105-
106122
deviceTypeString := ""
107123
switch d.Type {
108124
case config.BlockDevice:
@@ -118,7 +134,7 @@ func devicePathsFromPCIPath(ctx context.Context, pciPath string) ([]*config.Devi
118134
if err != nil {
119135
// Some drivers will make dev nodes that do not have a matching block or
120136
// char device -- skip those.
121-
log.G(ctx).WithError(err).Debugf("failed to find sysfs path for device %s", d.Path)
137+
entry.WithError(err).Debugf("failed to find sysfs path for device %s", d.Path)
122138
continue
123139
}
124140
if strings.HasPrefix(sysfsFullPath, pciFullPath) {

0 commit comments

Comments
 (0)