|
| 1 | +import type { VectorizeIndex } from '@cloudflare/workers-types' |
| 2 | +import type { Collections } from '@nuxt/content' |
| 3 | +import type { BMDocument } from 'okapibm25' |
| 4 | +import { openai } from '@ai-sdk/openai' |
| 5 | +import { queryCollectionSearchSections } from '@nuxt/content/server' |
| 6 | +import { embed } from 'ai' |
| 7 | +import BM25 from 'okapibm25' |
| 8 | +import { z } from 'zod' |
| 9 | + |
/**
 * Validation schema for the search endpoint's query-string parameters.
 *
 * - `q`: the search text. Surrounding whitespace is trimmed before the
 *   length check, so a whitespace-only query is rejected instead of being
 *   searched as an empty string.
 * - `module`: the single content collection to search, or `all`
 *   (the default when the parameter is omitted).
 * - `limit`: maximum number of results. The raw value is coerced to a number
 *   and must be an integer between 1 and 50 (default 10). Requiring an
 *   integer keeps values derived from it (such as `limit * 2`) integral.
 */
const querySchema = z.object({
  q: z.string().trim().min(1).describe('Search query'),
  module: z.enum(['webClient', 'protocol', 'nodes', 'rpc', 'nimiqUtils', 'hub', 'all']).optional().default('all'),
  limit: z.coerce.number().int().min(1).max(50).optional().default(10),
})
| 15 | + |
/**
 * Returns the page part of a section or vector id: everything before the
 * first `#`, or the whole id when it contains no `#`.
 */
function pagePathOf(id: string): string {
  const hashAt = id.indexOf('#')
  return hashAt === -1 ? id : id.slice(0, hashAt)
}

/**
 * Hybrid documentation search endpoint.
 *
 * Steps:
 * 1. Validate the query string with `querySchema` (`q`, `module`, `limit`).
 * 2. Load the search sections of the selected collection(s).
 * 3. Rank pages by BM25 over "<parent titles> <title> <content>" of every
 *    section, using the keywords produced by `extractKeywords`.
 * 4. If a `VECTORIZE` binding is present on the Cloudflare env, embed the
 *    query and rank pages by the returned vector matches.
 * 5. Merge both rankings with `rrf` and return up to `limit` results,
 *    at most one per page.
 *
 * NOTE(review): `defineEventHandler`, `getValidatedQuery`, `extractKeywords`
 * and `rrf` are not imported in the visible part of this file; they are
 * presumably provided elsewhere (auto-imports or sibling utilities). `rrf`
 * is assumed to return an iterable of page paths in fused rank order —
 * confirm against its definition.
 *
 * @returns An array of `{ path, title, section, content, url }` objects;
 *   `content` is truncated to the first 200 characters of the section.
 */
export default defineEventHandler(async (event) => {
  const { q, module, limit } = await getValidatedQuery(event, querySchema.parse)

  const modules: (keyof Collections)[] = module === 'all'
    ? ['webClient', 'protocol', 'nodes', 'rpc', 'nimiqUtils', 'hub']
    : [module]

  const allSections = (await Promise.all(
    modules.map(m => queryCollectionSearchSections(event, m, { ignoredTags: ['code'] })),
  )).flat()

  if (!allSections.length)
    return []

  type Section = typeof allSections[number]

  // One searchable text per section, index-aligned with `allSections`.
  const corpus = allSections.map(s => `${s.titles.join(' ')} ${s.title} ${s.content}`)

  // First section of every page, used to describe pages that only the
  // semantic ranking surfaced. (Building the map straight from entries would
  // keep the LAST section of each page, because later duplicate keys win.)
  const firstSectionByPath = new Map<string, Section>()
  for (const section of allSections) {
    const path = pagePathOf(section.id)
    if (!firstSectionByPath.has(path))
      firstSectionByPath.set(path, section)
  }

  // Each ranking contributes up to twice the requested amount so the fusion
  // step has spare candidates to work with.
  const candidateCount = limit * 2

  // BM25 keyword search. Without a sorter BM25 returns one score per corpus
  // entry in corpus order, so each score maps back to its section by index
  // (no text lookup, and identical texts on different pages stay distinct).
  const keywords = await extractKeywords(q)
  const scores = BM25(corpus, keywords, { k1: 1.5, b: 0.75 }) as number[]
  const rankedHits = scores
    .map((score, index) => ({ score, index }))
    .filter(hit => hit.score > 0)
    .sort((a, b) => b.score - a.score)

  const bm25Paths: string[] = []
  // Highest-scoring section of each page found by BM25.
  const bestSectionByPath = new Map<string, Section>()
  for (const { index } of rankedHits) {
    const section = allSections[index]
    if (!section)
      continue
    const path = pagePathOf(section.id)
    // Hits arrive best-first, so the first section seen for a page is its best.
    if (bestSectionByPath.has(path))
      continue
    bestSectionByPath.set(path, section)
    bm25Paths.push(path)
    if (bm25Paths.length >= candidateCount)
      break
  }

  // Semantic search via Vectorize (optional: skipped when the binding is absent).
  const vectorize = (event.context.cloudflare?.env as Record<string, unknown> | undefined)?.VECTORIZE as VectorizeIndex | undefined
  const semanticPaths: string[] = []

  if (vectorize) {
    try {
      const { embedding } = await embed({ model: openai.embedding('text-embedding-3-small'), value: q })
      const { matches } = await vectorize.query(embedding, { topK: candidateCount })
      const seenSemantic = new Set<string>()
      for (const match of matches) {
        // Prefer an explicit `path` in the vector metadata; otherwise derive
        // the page path from the vector id.
        const metaPath = match.metadata?.path
        const path = typeof metaPath === 'string' && metaPath.length > 0
          ? metaPath
          : pagePathOf(match.id)
        if (!path || seenSemantic.has(path))
          continue
        seenSemantic.add(path)
        semanticPaths.push(path)
      }
    }
    catch (error: unknown) {
      // Semantic ranking is an enhancement; keep serving keyword results
      // when the embedding or vector query fails.
      console.warn('[search] semantic search failed, falling back to BM25 only', error)
    }
  }

  // RRF fusion of both rankings
  const fusedPaths = rrf([bm25Paths, semanticPaths])

  // Build results from fused ranking
  const results: { path: string, title: string, section: string, content: string, url: string }[] = []
  for (const path of fusedPaths) {
    // Show the section that matched the keywords when there is one, otherwise
    // the page's first section. Paths with no loaded section (for example a
    // semantic hit from a collection that was not selected) are dropped.
    const section = bestSectionByPath.get(path) ?? firstSectionByPath.get(path)
    if (!section)
      continue
    results.push({
      path,
      title: section.titles[0] || section.title,
      section: section.title,
      content: section.content.slice(0, 200),
      url: `https://nimiq.com/developers${path}`,
    })
    if (results.length >= limit)
      break
  }

  return results
})
0 commit comments