Skip to content

Commit 546b117

Browse files
authored
Tweak token prioritization in Typesense (#5776)
* Tweak token prioritization in Typesense * tweaks * allow configuring max_candidates * tweak max_candidates * final changes
1 parent 3d5f29a commit 546b117

File tree

3 files changed

+325
-14
lines changed

3 files changed

+325
-14
lines changed

apps/labrinth/.env.local

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ DATABASE_URL=postgresql://labrinth:labrinth@localhost/labrinth
1616
DATABASE_MIN_CONNECTIONS=0
1717
DATABASE_MAX_CONNECTIONS=16
1818

19-
SEARCH_BACKEND=meilisearch
19+
SEARCH_BACKEND=typesense
2020

2121
# Meilisearch configuration
2222
MEILISEARCH_READ_ADDR=http://localhost:7700

apps/labrinth/src/search/backend/typesense/mod.rs

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,16 @@ pub struct RequestConfig {
8383
pub prioritize_exact_match: bool,
8484
#[serde(default = "default_prioritize_num_matching_fields")]
8585
pub prioritize_num_matching_fields: bool,
86+
#[serde(default = "default_prioritize_token_positions")]
87+
pub prioritize_token_positions: bool,
88+
#[serde(default = "default_drop_tokens_threshold")]
89+
pub drop_tokens_threshold: usize,
8690
#[serde(default)]
8791
pub text_match_type: TextMatchType,
8892
#[serde(default)]
8993
pub bucketing: Bucketing,
94+
#[serde(default = "default_max_candidates")]
95+
pub max_candidates: usize,
9096
}
9197

9298
impl Default for RequestConfig {
@@ -98,32 +104,38 @@ impl Default for RequestConfig {
98104
prioritize_exact_match: default_prioritize_exact_match(),
99105
prioritize_num_matching_fields:
100106
default_prioritize_num_matching_fields(),
107+
prioritize_token_positions: default_prioritize_token_positions(),
108+
drop_tokens_threshold: default_drop_tokens_threshold(),
101109
text_match_type: TextMatchType::default(),
102110
bucketing: Bucketing::default(),
111+
max_candidates: default_max_candidates(),
103112
}
104113
}
105114
}
106115

107116
fn default_query_by() -> Vec<String> {
108-
[
109-
"name",
110-
"indexed_name",
111-
"slug",
112-
"author",
113-
"indexed_author",
114-
"summary",
115-
]
116-
.into_iter()
117-
.map(str::to_string)
118-
.collect()
117+
// [
118+
// "name",
119+
// "indexed_name",
120+
// "slug",
121+
// "author",
122+
// "indexed_author",
123+
// "summary",
124+
// ]
125+
["name", "indexed_name", "slug", "author", "indexed_author"]
126+
.into_iter()
127+
.map(str::to_string)
128+
.collect()
119129
}
120130

121131
fn default_query_by_weights() -> Vec<u8> {
122-
vec![15, 15, 10, 3, 3, 1]
132+
// vec![15, 15, 10, 3, 3, 1]
133+
vec![15, 15, 10, 3, 3]
123134
}
124135

125136
fn default_prefix() -> Vec<bool> {
126-
vec![true, true, true, true, true, true]
137+
// vec![true, true, true, true, true, true]
138+
vec![true, true, true, true, true]
127139
}
128140

129141
const fn default_prioritize_exact_match() -> bool {
@@ -134,6 +146,20 @@ const fn default_prioritize_num_matching_fields() -> bool {
134146
false
135147
}
136148

149+
const fn default_prioritize_token_positions() -> bool {
150+
// true
151+
false
152+
}
153+
154+
const fn default_drop_tokens_threshold() -> usize {
155+
// 0
156+
1
157+
}
158+
159+
const fn default_max_candidates() -> usize {
160+
8
161+
}
162+
137163
impl TypesenseConfig {
138164
pub fn new(meta_namespace: Option<String>) -> Self {
139165
Self {
@@ -696,6 +722,14 @@ impl SearchBackend for Typesense {
696722
.prioritize_num_matching_fields
697723
.to_string(),
698724
),
725+
(
726+
"prioritize_token_positions",
727+
info.typesense_config.prioritize_token_positions.to_string(),
728+
),
729+
(
730+
"drop_tokens_threshold",
731+
info.typesense_config.drop_tokens_threshold.to_string(),
732+
),
699733
(
700734
"text_match_type",
701735
info.typesense_config.text_match_type.as_str().to_string(),
@@ -707,6 +741,10 @@ impl SearchBackend for Typesense {
707741
("group_limit", "1".to_string()),
708742
("facet_by", "project_id".to_string()),
709743
("max_facet_values", "0".to_string()),
744+
(
745+
"max_candidates",
746+
info.typesense_config.max_candidates.to_string(),
747+
),
710748
];
711749
if let Some(query_by_weights) =
712750
Self::query_by_weights(&info.typesense_config)

scripts/import-projects.py

Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Search projects on api.modrinth.com and import results into the local database
4+
with correct author names.
5+
6+
Modes:
7+
search - Import top N results for a text query
8+
top - Import the top N projects by total downloads (for building a
9+
representative corpus that mirrors prod IDF distributions)
10+
11+
Usage:
12+
python3 scripts/import-projects.py search <query> [limit]
13+
python3 scripts/import-projects.py top [count]
14+
15+
Examples:
16+
python3 scripts/import-projects.py search "sodium" 5
17+
python3 scripts/import-projects.py top 1000
18+
"""
19+
20+
import json
21+
import subprocess
22+
import sys
23+
import time
24+
import urllib.parse
25+
import urllib.request
26+
27+
ADMIN_USER_ID = 103587649610509
28+
DB_CONTAINER = "labrinth-postgres"
29+
DB_USER = "labrinth"
30+
DB_NAME = "labrinth"
31+
API_BASE = "https://api.modrinth.com/v2"
32+
HEADERS = {"User-Agent": "import-projects-script/1.0"}
33+
34+
seen_slugs = set()
35+
author_user_ids = {}
36+
next_user_id = 200_000_000_000_000
37+
38+
39+
def api_get(url):
40+
req = urllib.request.Request(url, headers=HEADERS)
41+
with urllib.request.urlopen(req) as resp:
42+
return json.loads(resp.read().decode())
43+
44+
45+
def psql(sql):
46+
result = subprocess.run(
47+
[
48+
"podman",
49+
"exec",
50+
DB_CONTAINER,
51+
"psql",
52+
"-U",
53+
DB_USER,
54+
"-d",
55+
DB_NAME,
56+
"-c",
57+
sql,
58+
],
59+
capture_output=True,
60+
text=True,
61+
)
62+
if result.returncode != 0:
63+
print(f" DB error: {result.stderr.strip()}", file=sys.stderr)
64+
return False
65+
return True
66+
67+
68+
def sql_escape(s):
69+
return s.replace("'", "''")
70+
71+
72+
def get_or_create_author_user(author_name):
73+
global next_user_id
74+
if author_name in author_user_ids:
75+
return author_user_ids[author_name]
76+
uid = next_user_id
77+
next_user_id += 1
78+
name_e = sql_escape(author_name)
79+
sql = f"""
80+
INSERT INTO users (id, username, email, created, role)
81+
VALUES ({uid}, '{name_e}', '{name_e}@imported.local', NOW(), 'developer')
82+
ON CONFLICT (id) DO NOTHING;
83+
"""
84+
if psql(sql):
85+
author_user_ids[author_name] = uid
86+
else:
87+
author_user_ids[author_name] = ADMIN_USER_ID
88+
return author_user_ids[author_name]
89+
90+
91+
def import_project(hit, counter):
92+
slug = hit.get("slug", "")
93+
if slug in seen_slugs:
94+
return False
95+
seen_slugs.add(slug)
96+
97+
title = hit.get("title", "")
98+
summary = hit.get("description", "")[:2048]
99+
project_id_api = hit.get("project_id", "")
100+
downloads = hit.get("downloads", 0)
101+
follows = hit.get("follows", 0)
102+
icon_url = hit.get("icon_url") or None
103+
author_name = hit.get("author", "Unknown")
104+
105+
print(f" Fetching: {title}")
106+
try:
107+
project_data = api_get(f"{API_BASE}/project/{project_id_api}")
108+
description = (project_data.get("body") or "")[:65536]
109+
icon_url = project_data.get("icon_url") or icon_url
110+
except Exception:
111+
description = summary
112+
113+
author_id = get_or_create_author_user(author_name)
114+
115+
base = int(time.time() * 1e9) % 900_000_000_000_000 + 100_000_000_000_000
116+
mod_id = base + counter * 5
117+
team_id = base + counter * 5 + 1
118+
member_id = base + counter * 5 + 2
119+
version_id = base + counter * 5 + 3
120+
121+
title_e = sql_escape(title)
122+
summary_e = sql_escape(summary)
123+
description_e = sql_escape(description)
124+
slug_e = sql_escape(slug)
125+
icon_col = f"'{sql_escape(icon_url)}'" if icon_url else "NULL"
126+
127+
print(
128+
f" Importing: {title} (author={author_name}, downloads={downloads}, followers={follows})"
129+
)
130+
131+
sql = f"""
132+
BEGIN;
133+
134+
INSERT INTO teams (id) VALUES ({team_id});
135+
136+
INSERT INTO mods (
137+
id, team_id, name, summary, description,
138+
published, downloads, follows,
139+
status, license, side_types_migration_review_status,
140+
components, monetization_status, slug,
141+
icon_url, raw_icon_url
142+
) VALUES (
143+
{mod_id},
144+
{team_id},
145+
'{title_e}',
146+
'{summary_e}',
147+
'{description_e}',
148+
NOW(),
149+
{downloads},
150+
{follows},
151+
'approved',
152+
'LicenseRef-All-Rights-Reserved',
153+
'reviewed',
154+
'{{}}'::jsonb,
155+
'monetized',
156+
LOWER('{slug_e}'),
157+
{icon_col},
158+
{icon_col}
159+
);
160+
161+
INSERT INTO team_members (
162+
id, team_id, user_id, role, permissions,
163+
accepted, payouts_split, ordering, is_owner
164+
) VALUES (
165+
{member_id},
166+
{team_id},
167+
{author_id},
168+
'Owner',
169+
1275068466,
170+
true,
171+
1.00000000000000000000,
172+
0,
173+
true
174+
);
175+
176+
INSERT INTO versions (
177+
id, mod_id, name, version_number, version_type,
178+
author_id, downloads, changelog, status, components
179+
) VALUES (
180+
{version_id},
181+
{mod_id},
182+
'1.0.0',
183+
'1.0.0',
184+
'release',
185+
{author_id},
186+
{downloads},
187+
'',
188+
'listed',
189+
'{{}}'::jsonb
190+
);
191+
192+
INSERT INTO loaders_versions (loader_id, version_id) VALUES (2, {version_id});
193+
194+
COMMIT;
195+
"""
196+
return psql(sql)
197+
198+
199+
def mode_search(query, limit=5):
200+
encoded_query = urllib.parse.quote(query)
201+
search_url = f"{API_BASE}/search?query={encoded_query}&limit={limit}&facets=[]"
202+
print(f"Searching Modrinth for: {query} (limit: {limit})")
203+
204+
search_data = api_get(search_url)
205+
hits = search_data.get("hits", [])
206+
207+
if not hits:
208+
print("No results found.")
209+
return
210+
211+
imported = 0
212+
for i, hit in enumerate(hits):
213+
if import_project(hit, i):
214+
imported += 1
215+
216+
print(f"Done. Imported {imported} project(s).")
217+
218+
219+
def mode_top(count=1000):
220+
print(f"Fetching top {count} projects by downloads from Modrinth...")
221+
222+
imported = 0
223+
batch_size = 50
224+
counter = 0
225+
226+
for offset in range(0, count, batch_size):
227+
limit = min(batch_size, count - offset)
228+
url = (
229+
f"{API_BASE}/search?limit={limit}&offset={offset}&index=downloads&facets=[]"
230+
)
231+
print(f"\n Batch offset={offset}, limit={limit}")
232+
233+
data = api_get(url)
234+
hits = data.get("hits", [])
235+
236+
if not hits:
237+
break
238+
239+
for hit in hits:
240+
if import_project(hit, counter):
241+
imported += 1
242+
counter += 1
243+
244+
time.sleep(1)
245+
246+
print(f"\nDone. Imported {imported} project(s).")
247+
248+
249+
def main():
250+
if len(sys.argv) < 2:
251+
print(f"Usage: {sys.argv[0]} search <query> [limit]")
252+
print(f" {sys.argv[0]} top [count]")
253+
sys.exit(1)
254+
255+
mode = sys.argv[1]
256+
257+
if mode == "search":
258+
if len(sys.argv) < 3:
259+
print("Usage: {sys.argv[0]} search <query> [limit]")
260+
sys.exit(1)
261+
query = sys.argv[2]
262+
limit = int(sys.argv[3]) if len(sys.argv) > 3 else 5
263+
mode_search(query, limit)
264+
elif mode == "top":
265+
count = int(sys.argv[2]) if len(sys.argv) > 2 else 1000
266+
mode_top(count)
267+
else:
268+
print(f"Unknown mode: {mode}. Use 'search' or 'top'.")
269+
sys.exit(1)
270+
271+
272+
if __name__ == "__main__":
273+
main()

0 commit comments

Comments
 (0)