Skip to content

Commit 0f0f34e

Browse files
committed
feat: AI keyword extraction for search using gpt-5-nano
1 parent a0489f6 commit 0f0f34e

File tree

3 files changed

+128
-0
lines changed

3 files changed

+128
-0
lines changed

server/server/api/search.get.ts

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import type { VectorizeIndex } from '@cloudflare/workers-types'
2+
import type { Collections } from '@nuxt/content'
3+
import type { BMDocument } from 'okapibm25'
4+
import { openai } from '@ai-sdk/openai'
5+
import { queryCollectionSearchSections } from '@nuxt/content/server'
6+
import { embed } from 'ai'
7+
import BM25 from 'okapibm25'
8+
import { z } from 'zod'
9+
10+
/**
 * Validation schema for the search endpoint's query string.
 *
 * - `q`: the search text; surrounding whitespace is stripped first, so a blank query is rejected
 *   instead of being sent on to keyword extraction and embedding.
 * - `module`: documentation collection to search, or `all` (default) for every collection.
 * - `limit`: maximum number of results, a whole number from 1 to 50 (default 10).
 */
const querySchema = z.object({
  q: z.string().trim().min(1).describe('Search query'),
  module: z.enum(['webClient', 'protocol', 'nodes', 'rpc', 'nimiqUtils', 'hub', 'all']).optional().default('all'),
  // Whole numbers only: the handler multiplies `limit` and compares it against result counts.
  limit: z.coerce.number().int().min(1).max(50).optional().default(10),
})
15+
16+
/**
 * Hybrid documentation search.
 *
 * Loads the search sections of the requested collection(s), ranks their pages twice - by BM25
 * over AI-extracted keywords and, when a Vectorize binding is present, by embedding similarity -
 * and merges both rankings with reciprocal rank fusion.
 *
 * @returns At most `limit` results, best first; an empty array when the collections have no sections.
 */
export default defineEventHandler(async (event) => {
  const { q, module, limit } = await getValidatedQuery(event, querySchema.parse)

  const modules: (keyof Collections)[] = module === 'all'
    ? ['webClient', 'protocol', 'nodes', 'rpc', 'nimiqUtils', 'hub']
    : [module]

  const allSections = (await Promise.all(
    modules.map(m => queryCollectionSearchSections(event, m, { ignoredTags: ['code'] })),
  )).flat()

  if (!allSections.length)
    return []

  // Each ranker may return up to twice `limit` pages so the fused list can still fill `limit`.
  const candidateCount = limit * 2

  const vectorize = (event.context.cloudflare?.env as Record<string, unknown> | undefined)?.VECTORIZE as VectorizeIndex | undefined

  // The two rankers do not depend on each other, so their network calls run concurrently.
  const [keywordRanking, semanticPaths] = await Promise.all([
    rankByKeywords(allSections, q, candidateCount),
    rankBySemantics(vectorize, q, candidateCount),
  ])

  // First section of every page: the snippet source for pages that only the semantic ranker found.
  const firstSectionByPath = new Map<string, SearchSection>()
  for (const section of allSections) {
    const path = pagePath(section.id)
    if (!firstSectionByPath.has(path))
      firstSectionByPath.set(path, section)
  }

  // RRF fusion of both rankings
  const fusedPaths = rrf([keywordRanking.paths, semanticPaths])

  // Build results from fused ranking
  const results: SearchResult[] = []
  for (const path of fusedPaths) {
    // Prefer the section that actually matched the keywords over an unrelated one from the same page.
    const section = keywordRanking.bestSectionByPath.get(path) ?? firstSectionByPath.get(path)
    if (!section)
      continue // path known to the vector index but absent from the loaded collections
    results.push({
      path,
      title: section.titles[0] || section.title,
      section: section.title,
      content: section.content.slice(0, 200),
      url: `https://nimiq.com/developers${path}`,
    })
    if (results.length >= limit)
      break
  }

  return results
})

/** One searchable section as returned by `queryCollectionSearchSections`. */
type SearchSection = Awaited<ReturnType<typeof queryCollectionSearchSections>>[number]

/** Shape of a single entry in the endpoint's response. */
interface SearchResult {
  path: string
  title: string
  section: string
  content: string
  url: string
}

/** Outcome of the keyword ranker: page paths best first, plus the top-scoring section of each page. */
interface KeywordRanking {
  paths: string[]
  bestSectionByPath: Map<string, SearchSection>
}

/** Returns the part of a section id before the first `#`, i.e. the path of the page it belongs to. */
function pagePath(id: string): string {
  const hashAt = id.indexOf('#')
  return hashAt === -1 ? id : id.slice(0, hashAt)
}

/**
 * Ranks pages with BM25 over the keywords extracted from `query`.
 *
 * @param sections Sections to search; must be non-empty
 * @param query Raw user query, handed to `extractKeywords`
 * @param maxPaths Maximum number of distinct page paths to return
 */
async function rankByKeywords(sections: SearchSection[], query: string, maxPaths: number): Promise<KeywordRanking> {
  // Keywords arrive lowercased, so the corpus is lowercased as well; otherwise capitalised
  // words in the docs may never match. NOTE(review): assumes BM25 compares case-sensitively.
  const corpus = sections.map(s => `${s.titles.join(' ')} ${s.title} ${s.content}`.toLowerCase())

  // Document text -> indices of the sections that produced it. Gives an O(1) way back from a
  // scored document to its section and keeps sections with identical text apart.
  const indicesByDocument = new Map<string, number[]>()
  corpus.forEach((doc, index) => {
    const indices = indicesByDocument.get(doc)
    if (indices)
      indices.push(index)
    else
      indicesByDocument.set(doc, [index])
  })

  const keywords = await extractKeywords(query)
  const scored = BM25(corpus, keywords, { k1: 1.5, b: 0.75 }, (a, b) => b.score - a.score) as BMDocument[]

  const paths: string[] = []
  const bestSectionByPath = new Map<string, SearchSection>()
  for (const item of scored) {
    if (item.score <= 0)
      continue
    const index = indicesByDocument.get(item.document)?.shift()
    if (index === undefined)
      continue
    const section = sections[index]
    if (!section)
      continue
    const path = pagePath(section.id)
    // Documents are visited best first, so the first section seen for a page is its best match.
    if (bestSectionByPath.has(path))
      continue
    bestSectionByPath.set(path, section)
    paths.push(path)
    if (paths.length >= maxPaths)
      break
  }

  return { paths, bestSectionByPath }
}

/**
 * Ranks pages by embedding similarity using the Vectorize index.
 *
 * Best effort: without a binding, or when the embedding or index query fails, it returns an
 * empty ranking so the request can still be answered from the keyword ranking alone.
 *
 * @param vectorize Vectorize binding, or `undefined` when not configured
 * @param query Raw user query to embed
 * @param topK Number of nearest matches to request
 */
async function rankBySemantics(vectorize: VectorizeIndex | undefined, query: string, topK: number): Promise<string[]> {
  if (!vectorize)
    return []

  try {
    const { embedding } = await embed({ model: openai.embedding('text-embedding-3-small'), value: query })
    const { matches } = await vectorize.query(embedding, { topK })

    // A Set drops repeated pages while keeping the order in which they were first returned.
    const paths = new Set<string>()
    for (const match of matches) {
      const metadataPath = match.metadata?.path
      const path = typeof metadataPath === 'string' && metadataPath ? metadataPath : pagePath(match.id)
      if (path)
        paths.add(path)
    }
    return [...paths]
  }
  catch (error) {
    console.warn('[search] semantic search failed, continuing with keyword results only:', error)
    return []
  }
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import { openai } from '@ai-sdk/openai'
2+
import { generateObject } from 'ai'
3+
import { z } from 'zod'
4+
5+
/**
 * Turns a free-text search query into search terms.
 *
 * Asks `gpt-5-nano` for terms (with synonyms and expanded abbreviations). Whatever the model
 * returns is normalised - trimmed, lowercased, blanks and duplicates removed - so callers get
 * the format the prompt asks for even when the model does not follow it. If the model call
 * fails or yields no usable term, the whitespace-separated words of the query are used instead.
 *
 * @param query Raw user search query
 * @returns Lowercase, de-duplicated, non-empty terms; empty only when `query` is blank
 */
export async function extractKeywords(query: string): Promise<string[]> {
  try {
    const { object } = await generateObject({
      model: openai('gpt-5-nano'),
      output: 'array',
      schema: z.string(),
      system: `<role>Search keyword extractor for Nimiq blockchain documentation</role>
<task>Extract search terms from user query. Include synonyms and expand abbreviations.</task>
<format>Lowercase single words or short phrases. No duplicates.</format>`,
      prompt: query,
    })
    const keywords = normalizeKeywords(object)
    // An empty answer would leave the caller with nothing to search for; use the raw words instead.
    return keywords.length ? keywords : normalizeKeywords(query.split(/\s+/))
  }
  catch (error) {
    // Extraction is an enhancement, not a requirement: record the failure and degrade gracefully.
    console.warn('[search] keyword extraction failed, falling back to query words:', error)
    return normalizeKeywords(query.split(/\s+/))
  }
}

/** Trims and lowercases each term, dropping empty terms and repeats while keeping first-seen order. */
function normalizeKeywords(terms: readonly string[]): string[] {
  const unique = new Set<string>()
  for (const term of terms) {
    const cleaned = term.trim().toLowerCase()
    if (cleaned)
      unique.add(cleaned)
  }
  return [...unique]
}

server/server/utils/rrf.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/**
 * Reciprocal Rank Fusion - combines multiple ranked lists into a single ranking.
 *
 * Every list contributes `1 / (k + position)` to an ID's score, where `position` is the ID's
 * 1-based place in that list. An ID that occurs more than once in the same list is counted
 * only at its best position, so a list cannot vote for the same ID twice.
 *
 * @param rankings Array of ranked ID lists (best first)
 * @param k Ranking constant (default 60, standard RRF value)
 * @returns Fused ranking of IDs sorted by combined score; IDs with equal scores keep the order
 * in which they were first encountered
 */
export function rrf(rankings: string[][], k = 60): string[] {
  const scores = new Map<string, number>()
  for (const ranking of rankings) {
    // A Set keeps the first occurrence of each ID and preserves list order.
    const distinctIds = [...new Set(ranking)]
    distinctIds.forEach((id, rank) => {
      scores.set(id, (scores.get(id) ?? 0) + 1 / (k + rank + 1))
    })
  }
  return [...scores.entries()].sort((a, b) => b[1] - a[1]).map(([id]) => id)
}

0 commit comments

Comments
 (0)