11from collections .abc import Callable
2+ import itertools
23
34import torch
45from torch .profiler import ProfilerActivity
@@ -34,6 +35,65 @@ def time_cpu(fn: Callable, args: tuple, warmup: int = 25, rep: int = 100) -> flo
3435 return torch .mean (times ).item ()
3536
3637
# Based on Intel XPU Triton backend benchmarks.
def time_xpu(fn: Callable, args: tuple, warmup: int = 25, rep: int = 100) -> float:
    """Measure execution time of the provided function on XPU.

    Args:
        fn: Function to measure
        args: Arguments to pass to the function
        warmup: Warmup iterations
        rep: Measurement iterations

    Returns:
        Mean runtime in microseconds

    Raises:
        AssertionError: If the profiler did not record exactly ``rep``
            non-empty kernel groups for the profiled function.
    """

    # A device buffer used to clear L2 cache between kernel runs, so every
    # measured iteration starts from a cold cache.
    cache_size = 256 * 1024 * 1024
    cache = torch.empty(cache_size, dtype=torch.int8, device=torch.device("xpu"))

    for _ in range(warmup):
        fn(*args)
    torch.accelerator.synchronize()

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.XPU]) as prof:
        for _ in range(rep):
            # Clear L2 cache.
            cache.zero_()
            torch.accelerator.synchronize()

            with record_function("profiled_fn"):
                fn(*args)
        # Ensure all measurements are recorded.
        torch.accelerator.synchronize()

    def extract_kernels(funcs):
        """Traverse event tree recursively to extract device kernels."""
        # Children-first accumulation order is preserved: all kernels found
        # under child events precede the direct kernels of `funcs`.
        kernels = []
        for func in funcs:
            kernels.extend(extract_kernels(func.cpu_children))
        kernels.extend(itertools.chain.from_iterable(func.kernels for func in funcs))
        return kernels

    # One event per `record_function("profiled_fn")` region; collect the
    # device kernels launched inside each region.
    events = [e for e in prof.events() if e.name.startswith("profiled_fn")]
    kernels = [extract_kernels(event.cpu_children) for event in events]
    kernels = [kernel for kernel in kernels if kernel]
    if len(kernels) != rep:
        raise AssertionError("Unexpected number of profiled kernels")

    # Total device time per measured iteration (profiler durations are in
    # microseconds).
    times = torch.tensor(
        [sum(k.duration for k in kernel) for kernel in kernels], dtype=torch.float
    )

    # Trim extremes (fastest and slowest sample) if there are enough
    # measurements.
    if len(times) >= 10:
        times = torch.sort(times).values[1:-1]

    return torch.mean(times).item()
96+
3797def time (
3898 fn : Callable ,
3999 args : tuple ,
@@ -53,4 +113,6 @@ def time(
53113 """
54114 if not device or device .type == "cpu" :
55115 return time_cpu (fn , args , warmup = warmup , rep = rep )
116+ if device .type == "xpu" :
117+ return time_xpu (fn , args , warmup = warmup , rep = rep )
56118 raise ValueError ("Unsupported device for timing" )
0 commit comments