Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 52 additions & 6 deletions .github/workflows/all.yml
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,11 @@ jobs:
LD_PRELOAD=$(clang -print-file-name=libclang_rt.asan-x86_64.so) \
tox

run-benchmarks:
strategy:
matrix:
os: ['ubuntu-latest']
runs-on: ${{ matrix.os }}
run-python-benchmarks:
runs-on: ubuntu-latest
continue-on-error: true
if: github.event_name == 'pull_request'
name: Run benchmarks on ${{ matrix.os }}
name: Run Python benchmarks
steps:
- name: Set up Python
uses: actions/setup-python@v3
Expand All @@ -244,6 +241,55 @@ jobs:
asv machine --yes
asv continuous --sort name --no-only-changed refs/remotes/origin/main ${{ github.sha }} | tee >(sed '1,/All benchmarks:/d' > $GITHUB_STEP_SUMMARY)

run-java-benchmarks:
runs-on: ubuntu-latest
continue-on-error: true
if: github.event_name == 'pull_request'
name: Run Java benchmarks
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Set up JDK 17
uses: actions/setup-java@v3
with:
java-version: '17'
distribution: 'corretto'

# Benchmark the base branch first
- name: Checkout base branch
run: git checkout ${{ github.event.pull_request.base.sha }}
- name: Build native library (base)
working-directory: java
run: make
continue-on-error: true
- name: Compile benchmarks (base)
working-directory: java
run: mvn --batch-mode test-compile
continue-on-error: true
- name: Run benchmarks (base)
working-directory: java
run: mvn --batch-mode exec:exec -Dexec.executable=java -Dexec.classpathScope=test -Dexec.args="-cp %classpath org.openjdk.jmh.Main -rf json -rff /tmp/base-results.json -f 1 -wi 2 -i 3 -w 2s -r 2s -jvmArgs -Xms2g -jvmArgs -Xmx2g"
continue-on-error: true

# Benchmark the PR branch
- name: Checkout PR branch
run: git checkout ${{ github.sha }}
- name: Build native library (PR)
working-directory: java
run: make clean && make
- name: Compile benchmarks (PR)
working-directory: java
run: mvn clean --batch-mode test-compile
- name: Run benchmarks (PR)
working-directory: java
run: mvn --batch-mode exec:exec -Dexec.executable=java -Dexec.classpathScope=test -Dexec.args="-cp %classpath org.openjdk.jmh.Main -rf json -rff /tmp/pr-results.json -f 1 -wi 2 -i 3 -w 2s -r 2s -jvmArgs -Xms2g -jvmArgs -Xmx2g"

# Compare and report
- name: Compare benchmarks
run: python3 java/scripts/compare_benchmarks.py /tmp/base-results.json /tmp/pr-results.json >> $GITHUB_STEP_SUMMARY
continue-on-error: true

build-python-sdist:
needs: [run-python-tests, run-python-tests-with-address-sanitizer]
continue-on-error: false
Expand Down
19 changes: 19 additions & 0 deletions java/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,18 @@
<version>1.3.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-core</artifactId>
<version>1.37</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-generator-annprocess</artifactId>
<version>1.37</version>
<scope>test</scope>
</dependency>
</dependencies>

<build>
Expand Down Expand Up @@ -134,6 +146,13 @@
<configuration>
<source>1.8</source>
<target>1.8</target>
<annotationProcessorPaths>
<path>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-generator-annprocess</artifactId>
<version>1.37</version>
</path>
</annotationProcessorPaths>
</configuration>
</plugin>
<plugin>
Expand Down
129 changes: 129 additions & 0 deletions java/scripts/compare_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""Compare two JMH JSON benchmark result files and output a markdown summary.

Usage:
python3 compare_benchmarks.py <base_results.json> <pr_results.json>

If the base results file does not exist (e.g. when benchmarks are first added),
only the PR results are printed.

Uses only Python standard library (json, sys, os).
"""

import json
import os
import sys


def load_results(path):
    """Load JMH JSON results and return a dict keyed by benchmark name + params.

    Each value is a dict with keys ``score``, ``error`` and ``unit``. ``score``
    and ``error`` are always floats: JMH emits the JSON *string* ``"NaN"`` for
    ``scoreError`` when too few iterations were run to compute a confidence
    interval, which would otherwise crash the ``{error:.3f}`` formatting later
    on — such values are coerced to 0.0 here.
    """
    with open(path) as f:
        data = json.load(f)

    results = {}
    for entry in data:
        benchmark = entry["benchmark"]
        # Extract short method name from fully qualified name
        short_name = benchmark.rsplit(".", 1)[-1]

        params = entry.get("params", {})
        param_key = ", ".join(f"{k}={v}" for k, v in sorted(params.items()))

        key = f"{short_name}({param_key})" if param_key else short_name

        metric = entry["primaryMetric"]
        score = float(metric["score"])
        # JMH writes "NaN" (a string) for scoreError on single-sample runs;
        # treat anything non-numeric or NaN as "no error margin".
        try:
            error = float(metric["scoreError"])
        except (TypeError, ValueError):
            error = 0.0
        if error != error:  # NaN never equals itself
            error = 0.0

        results[key] = {"score": score, "error": error, "unit": metric["scoreUnit"]}

    return results


def format_score(score, error):
    """Render a benchmark score together with its error margin, e.g. '1.500 ± 0.250'."""
    return "%.3f \u00b1 %.3f" % (score, error)


def main():
    """Entry point: compare base vs PR JMH results and print a markdown summary.

    Exits with status 1 on bad usage or a missing PR results file. A missing
    base file is tolerated (first PR that adds benchmarks) and produces a
    PR-only table instead of a comparison. Output goes to stdout so the CI
    workflow can redirect it into $GITHUB_STEP_SUMMARY.
    """
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <base_results.json> <pr_results.json>", file=sys.stderr)
        sys.exit(1)

    base_path = sys.argv[1]
    pr_path = sys.argv[2]

    if not os.path.exists(pr_path):
        print("Error: PR results file not found.", file=sys.stderr)
        sys.exit(1)

    pr_results = load_results(pr_path)

    if not os.path.exists(base_path):
        # Base results don't exist yet (first PR adding benchmarks)
        print("## Java Benchmark Results\n")
        print("_No base branch results available for comparison._\n")
        print("| Benchmark | Score | Unit |")
        print("|-----------|-------|------|")
        for name in sorted(pr_results.keys()):
            r = pr_results[name]
            print(f"| {name} | {format_score(r['score'], r['error'])} | {r['unit']} |")
        return

    base_results = load_results(base_path)

    print("## Java Benchmark Comparison\n")
    print("| Benchmark | Base | PR | Delta | Status |")
    print("|-----------|------|-----|-------|--------|")

    # Union of keys so benchmarks added or removed by the PR still show up.
    all_keys = sorted(set(list(base_results.keys()) + list(pr_results.keys())))

    for name in all_keys:
        if name not in base_results:
            # Benchmark added in this PR: nothing to compare against.
            r = pr_results[name]
            print(
                f"| {name} | _new_ | {format_score(r['score'], r['error'])} {r['unit']}"
                f" | - | \U0001f195 |"  # U+1F195 is the "NEW" emoji
            )
            continue

        if name not in pr_results:
            # Benchmark removed by this PR: show the old numbers only.
            r = base_results[name]
            print(
                f"| {name} | {format_score(r['score'], r['error'])} {r['unit']}"
                f" | _removed_ | - | - |"
            )
            continue

        base = base_results[name]
        pr = pr_results[name]

        # Guard against division by zero for a (degenerate) zero baseline score.
        if base["score"] == 0:
            delta_pct = 0.0
        else:
            delta_pct = ((pr["score"] - base["score"]) / base["score"]) * 100

        # Determine if the change is significant by comparing against combined error margins
        combined_error = base["error"] + pr["error"]
        abs_diff = abs(pr["score"] - base["score"])

        if abs_diff > combined_error:
            # For time-based benchmarks, lower is better
            if pr["score"] < base["score"]:
                status = "\u2705 faster"  # green check mark
            else:
                status = "\u26a0\ufe0f slower"  # warning sign
        else:
            status = "\u2194\ufe0f unchanged"  # left-right arrow

        print(
            f"| {name}"
            f" | {format_score(base['score'], base['error'])}"
            f" | {format_score(pr['score'], pr['error'])}"
            f" | {delta_pct:+.1f}%"
            f" | {status} |"
        )


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*-
* -\-\-
* voyager
* --
* Copyright (C) 2016 - 2023 Spotify AB
* --
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* -/-/-
*/

package com.spotify.voyager.jni;

import java.io.IOException;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.*;

/**
 * JMH benchmarks for index creation performance.
 *
 * <p>Mirrors the Python benchmark in benchmarks/index_creation.py. Measures the time to add
 * {@code numElements} random vectors of {@code numDimensions} dimensions to a fresh index,
 * parameterized over space type and storage data type.
 */
@State(Scope.Benchmark)
@BenchmarkMode(Mode.SingleShotTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@Fork(2)
@Warmup(iterations = 3)
@Measurement(iterations = 5)
public class IndexCreationBenchmark {

  @Param({"256"})
  public int numDimensions;

  @Param({"1024"})
  public int numElements;

  @Param({"Euclidean", "InnerProduct", "Cosine"})
  public String spaceType;

  @Param({"Float32", "Float8", "E4M3"})
  public String storageDataType;

  @Param({"24"})
  public int efConstruction;

  /** HNSW connectivity parameter passed to the Index constructor. */
  private static final int M = 20;

  /** Seed passed to the Index constructor so graph construction is reproducible. */
  private static final long RANDOM_SEED = 4321;

  /** Seed for the input-vector generator, fixed so every trial sees identical data. */
  private static final long DATA_SEED = 1234;

  private float[][] inputData;
  private Index index;

  /**
   * Generates the random input vectors once per trial. Values are drawn uniformly from [-1, 1);
   * for Float8 storage they are pre-snapped to the 1/127 grid so the measurement is not skewed
   * by values that the storage type cannot represent exactly.
   */
  @Setup(Level.Trial)
  public void generateData() {
    Random rng = new Random(DATA_SEED);
    inputData = new float[numElements][numDimensions];
    boolean isFloat8 = "Float8".equals(storageDataType);

    for (int i = 0; i < numElements; i++) {
      for (int j = 0; j < numDimensions; j++) {
        float val = rng.nextFloat() * 2 - 1;
        if (isFloat8) {
          // Quantize to the representable signed-8-bit grid: round(val * 127) / 127.
          val = Math.round(val * 127f) / 127f;
        }
        inputData[i][j] = val;
      }
    }
  }

  /** Creates a brand-new index before every invocation so each measurement starts empty. */
  @Setup(Level.Invocation)
  public void createFreshIndex() {
    Index.SpaceType space = Index.SpaceType.valueOf(spaceType);
    Index.StorageDataType storage = Index.StorageDataType.valueOf(storageDataType);
    index = new Index(space, numDimensions, M, efConstruction, RANDOM_SEED, numElements, storage);
  }

  /** Releases the native resources held by the per-invocation index. */
  @TearDown(Level.Invocation)
  public void closeIndex() throws IOException {
    if (index != null) {
      index.close();
    }
  }

  /** The measured operation: bulk-insert all generated vectors into the fresh index. */
  @Benchmark
  public void addItems() {
    // NOTE(review): second argument is presumably a thread count — confirm against
    // Index.addItems before relying on it.
    index.addItems(inputData, 1);
  }
}
Loading
Loading