feat: AI keyword extraction for search using gpt-5-nano

onmax · onmax · commit 0f0f34ebd0a0 · 2025-12-08T19:38:17.000+01:00
diff --git a/server/server/api/search.get.ts b/server/server/api/search.get.ts
@@ -0,0 +1,158 @@
+import type { VectorizeIndex } from '@cloudflare/workers-types'
+import type { Collections } from '@nuxt/content'
+import { openai } from '@ai-sdk/openai'
+import { queryCollectionSearchSections } from '@nuxt/content/server'
+import { embed } from 'ai'
+import BM25 from 'okapibm25'
+import { z } from 'zod'
+
+/** Documentation collections that can be searched; `module=all` fans out to every one of them. */
+const SEARCH_MODULES = ['webClient', 'protocol', 'nodes', 'rpc', 'nimiqUtils', 'hub'] as const
+
+const querySchema = z.object({
+  // Trimmed first so a whitespace-only query is rejected rather than searched.
+  q: z.string().trim().min(1).describe('Search query'),
+  module: z.enum([...SEARCH_MODULES, 'all']).optional().default('all'),
+  // Integer only: the value is doubled and passed on as the vector query's `topK`.
+  limit: z.coerce.number().int().min(1).max(50).optional().default(10),
+})
+
+/** One page-level hit returned by the search endpoint. */
+interface SearchResult {
+  path: string
+  title: string
+  section: string
+  content: string
+  url: string
+}
+
+/** Page path of a section or vector id: everything before the first `#`. */
+function pagePath(id: string): string {
+  return id.split('#')[0] ?? id
+}
+
+/**
+ * Lowercases text and collapses every run of non-alphanumeric characters into one space.
+ * Applied to both corpus and keywords so matching is case-insensitive and no punctuation
+ * from the user's query reaches the BM25 library.
+ * NOTE(review): okapibm25 appears to match case-sensitively and to build a RegExp from each
+ * keyword (so raw input such as `c++` would throw) - confirm against the library source.
+ */
+function normalizeForMatching(text: string): string {
+  return text.toLowerCase().replace(/[^\p{L}\p{N}]+/gu, ' ').trim()
+}
+
+/**
+ * Ranks page paths by embedding similarity using the Vectorize index.
+ * @param index Vectorize binding, or `undefined` when it is not configured
+ * @param query Raw user query to embed
+ * @param topK Maximum number of vector matches to request
+ * @returns Unique page paths, best match first; empty when the index is missing or the lookup fails
+ */
+async function semanticPagePaths(index: VectorizeIndex | undefined, query: string, topK: number): Promise<string[]> {
+  if (!index)
+    return []
+  try {
+    const { embedding } = await embed({ model: openai.embedding('text-embedding-3-small'), value: query })
+    // NOTE(review): Vectorize returns metadata only when `returnMetadata` is requested, so this
+    // most likely always falls back to the vector id - confirm how ids are built at indexing time.
+    const { matches } = await index.query(embedding, { topK })
+    const paths = new Set<string>()
+    for (const match of matches) {
+      const metadataPath = match.metadata?.path
+      const path = typeof metadataPath === 'string' && metadataPath ? metadataPath : pagePath(match.id)
+      if (path)
+        paths.add(path)
+    }
+    return [...paths]
+  }
+  catch (error) {
+    // Keyword ranking still works on its own, so degrade instead of failing the whole request.
+    console.warn('[search] semantic search failed, using keyword ranking only', error)
+    return []
+  }
+}
+
+/**
+ * GET search handler: hybrid documentation search.
+ * Ranks pages with BM25 over AI-extracted keywords and with embedding similarity, fuses both
+ * rankings with Reciprocal Rank Fusion and returns up to `limit` page-level results.
+ * Query parameters: `q` (required), `module` (collection name or `all`, default `all`),
+ * `limit` (1-50, default 10). Invalid parameters make `querySchema.parse` throw.
+ */
+export default defineEventHandler(async (event): Promise<SearchResult[]> => {
+  const { q, module: moduleFilter, limit } = await getValidatedQuery(event, querySchema.parse)
+
+  const modules: readonly (keyof Collections)[] = moduleFilter === 'all' ? SEARCH_MODULES : [moduleFilter]
+
+  const allSections = (await Promise.all(
+    modules.map(m => queryCollectionSearchSections(event, m, { ignoredTags: ['code'] })),
+  )).flat()
+
+  if (!allSections.length)
+    return []
+
+  // Over-fetch from each ranker so fusion still has enough pages after de-duplication.
+  const candidateCount = limit * 2
+  const vectorize = (event.context.cloudflare?.env as Record<string, unknown> | undefined)?.VECTORIZE as VectorizeIndex | undefined
+
+  // Keyword extraction and the embedding lookup are independent network calls: run them together.
+  const [keywords, semanticPaths] = await Promise.all([
+    extractKeywords(q),
+    semanticPagePaths(vectorize, q, candidateCount),
+  ])
+
+  // BM25 keyword search over normalized section text
+  const corpus = allSections.map(s => normalizeForMatching(`${s.titles.join(' ')} ${s.title} ${s.content}`))
+  const terms = [...new Set(keywords.map(normalizeForMatching).filter(Boolean))]
+  // Called without a sorter, BM25 returns one score per document in corpus order, so a score
+  // maps back to its section by index even when two sections have identical text.
+  const scores = terms.length ? (BM25(corpus, terms, { k1: 1.5, b: 0.75 }) as number[]) : []
+  const rankedHits = scores
+    .map((score, index) => ({ score, index }))
+    .filter(hit => hit.score > 0) // also discards NaN
+    .sort((a, b) => b.score - a.score)
+
+  // Best-scoring section of each page, in rank order (a Map iterates in insertion order)
+  const bestSectionByPath = new Map<string, (typeof allSections)[number]>()
+  for (const { index } of rankedHits) {
+    const section = allSections[index]
+    if (!section)
+      continue
+    const path = pagePath(section.id)
+    if (bestSectionByPath.has(path))
+      continue
+    bestSectionByPath.set(path, section)
+    if (bestSectionByPath.size >= candidateCount)
+      break
+  }
+  const bm25Paths = [...bestSectionByPath.keys()]
+
+  // First section of every page: used for pages that were found by semantic search only
+  const firstSectionByPath = new Map<string, (typeof allSections)[number]>()
+  for (const section of allSections) {
+    const path = pagePath(section.id)
+    if (!firstSectionByPath.has(path))
+      firstSectionByPath.set(path, section)
+  }
+
+  // RRF fusion of both rankings, then build results in fused order
+  const results: SearchResult[] = []
+  for (const path of rrf([bm25Paths, semanticPaths])) {
+    const section = bestSectionByPath.get(path) ?? firstSectionByPath.get(path)
+    // A semantic hit may point at a page outside the requested modules: skip it.
+    if (!section)
+      continue
+    results.push({
+      path,
+      title: section.titles[0] || section.title,
+      section: section.title,
+      content: section.content.slice(0, 200),
+      url: `https://nimiq.com/developers${path}`,
+    })
+    if (results.length >= limit)
+      break
+  }
+
+  return results
+})
diff --git a/server/server/utils/extract-keywords.ts b/server/server/utils/extract-keywords.ts
@@ -0,0 +1,36 @@
+import { openai } from '@ai-sdk/openai'
+import { generateObject } from 'ai'
+import { z } from 'zod'
+
+/** Lowercases, trims and de-duplicates keywords, dropping empty entries. */
+function normalizeKeywords(keywords: readonly string[]): string[] {
+  return [...new Set(keywords.map(keyword => keyword.trim().toLowerCase()).filter(Boolean))]
+}
+
+/**
+ * Turns a free-text search query into search terms by asking gpt-5-nano for keywords.
+ * Falls back to splitting the query on whitespace when the model call fails or yields no usable term.
+ * @param query Raw user search query
+ * @returns Lowercase, de-duplicated search terms; empty only when `query` contains no non-whitespace text
+ */
+export async function extractKeywords(query: string): Promise<string[]> {
+  const fallback = (): string[] => normalizeKeywords(query.split(/\s+/))
+  try {
+    const { object } = await generateObject({
+      model: openai('gpt-5-nano'),
+      output: 'array',
+      schema: z.string(),
+      system: `<role>Search keyword extractor for Nimiq blockchain documentation</role>
+<task>Extract search terms from user query. Include synonyms and expand abbreviations.</task>
+<format>Lowercase single words or short phrases. No duplicates.</format>`,
+      prompt: query,
+    })
+    // The prompt asks for lowercase unique terms, but model output is not guaranteed to comply.
+    const keywords = normalizeKeywords(object)
+    return keywords.length ? keywords : fallback()
+  }
+  catch (error) {
+    console.warn('[search] keyword extraction failed, falling back to whitespace tokens', error)
+    return fallback()
+  }
+}
diff --git a/server/server/utils/rrf.ts b/server/server/utils/rrf.ts
@@ -0,0 +1,19 @@
+/**
+ * Reciprocal Rank Fusion - combines multiple ranked lists into a single ranking.
+ * Each id scores `1 / (k + rank)` per list (rank starting at 1) and the scores are summed
+ * across lists. An id repeated within one list only counts once, at its first position.
+ * @param rankings Array of ranked ID lists (best first)
+ * @param k Ranking constant (default 60, standard RRF value)
+ * @returns Fused ranking of IDs sorted by combined score, highest first
+ */
+export function rrf(rankings: readonly (readonly string[])[], k = 60): string[] {
+  const scores = new Map<string, number>()
+  for (const ranking of rankings) {
+    // A Set keeps first-occurrence order, so duplicates cannot inflate an id's score.
+    const uniqueIds = [...new Set(ranking)]
+    uniqueIds.forEach((id, rank) => {
+      scores.set(id, (scores.get(id) ?? 0) + 1 / (k + rank + 1))
+    })
+  }
+  return [...scores.entries()].sort((a, b) => b[1] - a[1]).map(([id]) => id)
+}