Skip to content

Commit 7300b1c

Browse files
authored
Use ScoreTracker to avoid wasteful searching for very large k (#384)
* clarify
* use scoreTracker to short-circuit new edge evaluation once we hit a local maximum
1 parent dcb4139 commit 7300b1c

File tree

2 files changed

+13
-15
lines changed

2 files changed

+13
-15
lines changed

jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -264,11 +264,8 @@ private SearchResult resume(int initialVisited, int topK, int rerankK, float thr
264264
rerankedResults.setMaxSize(topK);
265265

266266
int numVisited = initialVisited;
267-
// A bound that holds the minimum similarity to the query vector that a candidate vector must
268-
// have to be considered -- will be set to the lowest score in the results queue once the queue is full.
269-
var minAcceptedSimilarity = Float.NEGATIVE_INFINITY;
270267
// track scores to predict when we are done with threshold queries
271-
var scoreTracker = threshold > 0 ? new ScoreTracker.TwoPhaseTracker(threshold) : ScoreTracker.NO_OP;
268+
var scoreTracker = threshold > 0 ? new ScoreTracker.TwoPhaseTracker(threshold) : new ScoreTracker.TwoPhaseTracker(1.0);
272269
VectorFloat<?> similarities = null;
273270

274271
// add evicted results from the last call back to the candidates
@@ -283,37 +280,36 @@ private SearchResult resume(int initialVisited, int topK, int rerankK, float thr
283280
while (candidates.size() > 0) {
284281
// we're done when we have K results and the best candidate is worse than the worst result so far
285282
float topCandidateScore = candidates.topScore();
286-
if (topCandidateScore < minAcceptedSimilarity) {
283+
if (approximateResults.size() >= rerankK && topCandidateScore < approximateResults.topScore()) {
287284
break;
288285
}
289286
// when querying by threshold, also stop when we are probabilistically unlikely to find more qualifying results
290-
if (scoreTracker.shouldStop()) {
287+
if (threshold > 0 && scoreTracker.shouldStop()) {
291288
break;
292289
}
293290

294291
// process the top candidate
295292
int topCandidateNode = candidates.pop();
296293
if (acceptOrds.get(topCandidateNode) && topCandidateScore >= threshold) {
297294
addTopCandidate(topCandidateNode, topCandidateScore, rerankK);
298-
299-
// update minAcceptedSimilarity if we've found K results
300-
if (approximateResults.size() >= rerankK) {
301-
minAcceptedSimilarity = approximateResults.topScore();
302-
}
303295
}
304296

305297
// if this candidate came from evictedResults, we don't need to evaluate its neighbors again
306298
if (previouslyEvicted.get(topCandidateNode)) {
307299
continue;
308300
}
309301

302+
// skip edge loading if we've found a local maximum and we have enough results
303+
if (scoreTracker.shouldStop() && candidates.size() >= rerankK - approximateResults.size()) {
304+
continue;
305+
}
306+
310307
// score the neighbors of the top candidate and add them to the queue
311308
var scoreFunction = scoreProvider.scoreFunction();
312309
var useEdgeLoading = scoreFunction.supportsEdgeLoadingSimilarity();
313310
if (useEdgeLoading) {
314311
similarities = scoreFunction.edgeLoadingSimilarityTo(topCandidateNode);
315312
}
316-
317313
var it = view.getNeighborsIterator(topCandidateNode);
318314
for (int i = 0; i < it.size(); i++) {
319315
var friendOrd = it.nextInt();

jvector-base/src/main/java/io/github/jbellis/jvector/graph/ScoreTracker.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ class TwoPhaseTracker implements ScoreTracker {
5555
private int recentEntryIndex;
5656

5757
// Heap of the best scores seen so far
58-
AbstractLongHeap bestScores;
58+
BoundedLongHeap bestScores;
5959

6060
// observation count
6161
private int observationCount;
@@ -87,8 +87,10 @@ public boolean shouldStop() {
8787
return false;
8888
}
8989

90-
// we're in phase 2 if the 99th percentile of the recent scores is worse than the best score
91-
// (paper suggests median, but experimentally that is too prone to false positives.
90+
// We're in phase 2 if the 99th percentile of the recent scores evaluated is lower
91+
// than the worst of the best scores seen.
92+
//
93+
// (paper suggests using the median of recent scores, but experimentally that is too prone to false positives.
9294
// 90th does seem to be enough, but 99th doesn't result in much extra work, so we'll be conservative)
9395
double windowMedian = StatUtils.percentile(recentScores, 99);
9496
double worstBest = sortableIntToFloat((int) bestScores.top());

0 commit comments

Comments
 (0)