-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
106 lines (90 loc) · 3.33 KB
/
main.py
File metadata and controls
106 lines (90 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import json
# Import-time setup: ES client, index name, and the embedding model.
# NOTE(review): assumes a local Elasticsearch is reachable at import time — confirm.
es = Elasticsearch("http://localhost:9200")
index_name = "translations_with_vectors"
# all-MiniLM-L12-v2 produces 384-dim embeddings (matches the dense_vector dims below).
model = SentenceTransformer('all-MiniLM-L12-v2')
# Generalized index compatible with the field names of the JSON documents.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "source_language": {"type": "keyword"},
                "target_language": {"type": "keyword"},
                "sentence": {"type": "text"},  # source sentence
                "translation": {"type": "text"},  # translated sentence
                "sentence_vector": {"type": "dense_vector", "dims": 384}  # embedding of the source sentence
            }
        }
    })
def get_embedding(sentence: str) -> list[float]:
    """Encode *sentence* into a 384-dim embedding vector.

    Args:
        sentence: The text to embed.

    Returns:
        The embedding as a plain list of floats (JSON-serializable for ES).

    Raises:
        HTTPException: 500 if the model yields an unexpected vector length.
    """
    embedding = model.encode(sentence).tolist()
    # A dimension mismatch means the server's model disagrees with the index
    # mapping (dims=384) — that is a server-side misconfiguration, not a bad
    # client request, so report 500 rather than 400.
    if len(embedding) != 384:
        raise HTTPException(status_code=500, detail="Embedding length mismatch")
    return embedding
# FastAPI application instance; route handlers below register themselves via decorators.
app = FastAPI()
class TranslationPair(BaseModel):
    """Request body for POST /pairs: one parallel sentence pair to index."""
    source_language: str  # language code/name of the source sentence
    target_language: str  # language code/name of the translation
    sentence: str         # source-language sentence
    translation: str      # its translation in the target language
class TranslationRequest(BaseModel):
    """Schema for a prompt-generation request.

    NOTE(review): not referenced by the visible endpoints — GET /prompt takes
    these same fields as query parameters instead. Possibly used elsewhere or
    dead code; verify before removing.
    """
    source_language: str
    target_language: str
    query_sentence: str
@app.post("/pairs")
def add_translation(pair: TranslationPair):
    """Store a new parallel sentence pair, embedding the source sentence first."""
    document = {
        "source_language": pair.source_language,
        "target_language": pair.target_language,
        "sentence": pair.sentence,
        "translation": pair.translation,
        # Embedding of the source sentence, used later for similarity search.
        "sentence_vector": get_embedding(pair.sentence),
    }
    es.index(index=index_name, body=document)
    return {"status": "ok"}
@app.get("/prompt")
def get_translation_prompt(source_language: str, target_language: str, query_sentence: str):
    """Build a few-shot translation prompt from the most similar stored pairs."""
    embedding = get_embedding(query_sentence)
    # Restrict candidates to the requested language pair.
    language_filter = {
        "bool": {
            "filter": [
                {"term": {"source_language": source_language}},
                {"term": {"target_language": target_language}}
            ]
        }
    }
    # Rank the filtered documents by cosine similarity to the query embedding.
    search_body = {
        "query": {
            "script_score": {
                "query": language_filter,
                "script": {
                    "source": "Math.max(0, cosineSimilarity(params.query_vector, doc['sentence_vector']))",
                    "params": {"query_vector": embedding}
                }
            }
        },
        "size": 4
    }
    matches = es.search(index=index_name, body=search_body)["hits"]["hits"]
    if not matches:
        return {"prompt": f"No similar sentences found for '{query_sentence}'."}
    # Format each retrieved pair as a few-shot example line.
    example_lines = []
    for match in matches:
        doc = match["_source"]
        example_lines.append(f"Source: {doc['sentence']}\nTranslation: {doc['translation']}")
    examples = "\n".join(example_lines)
    prompt = (
        f"These are example parallel sentences with their translations from {source_language} to {target_language}:\n"
        f"{examples}\n\n"
        f"Now, based on these examples, please translate the following sentence:\n{query_sentence}"
    )
    return {"prompt": prompt}