-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnaam.py
More file actions
109 lines (91 loc) · 3.51 KB
/
naam.py
File metadata and controls
109 lines (91 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import spacy
from transformers import pipeline
from openai import OpenAI
import warnings
# Suppress all warnings for the whole process.
# NOTE(review): this is global, so it also hides deprecation warnings raised
# by unrelated code that imports this module.
warnings.filterwarnings("ignore")

# Initialize clients.
# SECURITY: never commit an API key to source control. When no api_key is
# passed, the OpenAI client reads it from the OPENAI_API_KEY environment
# variable. The key that used to be hard-coded here has been exposed in the
# repository history and should be revoked.
openai_client = OpenAI()

# Load spaCy model
nlp = spacy.load("en_core_web_trf")

# Load Hugging Face NER pipeline
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")
def preprocess_text(text):
    """Normalize whitespace in *text*.

    Leading/trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.

    :param text: Value to normalize; non-string values are converted with
        ``str()``. ``None`` is treated as missing text.
    :return: The normalized string, or ``""`` when *text* is ``None``.
    """
    # str(None) would yield the literal word "None", which would then be fed
    # to the name detectors as if it were real memo content.
    if text is None:
        return ""
    return ' '.join(str(text).split())
def detect_names_spacy(text):
    """Return the text of every entity that the spaCy model labels PERSON.

    :param text: Text to run through the module-level ``nlp`` pipeline.
    :return: List of entity strings, in document order.
    """
    people = []
    for entity in nlp(text).ents:
        # Skip everything that is not a person entity.
        if entity.label_ != "PERSON":
            continue
        people.append(entity.text)
    return people
def detect_names_bert(text):
    """Return person names found by the Hugging Face NER pipeline.

    The module-level ``ner_pipeline`` yields one dict per tagged token; this
    function reads its ``'entity'`` label and ``'word'`` text and stitches
    consecutive person tokens back into full names:

    * a word starting with ``##`` is a word-piece continuation and is glued
      onto the current name without a space;
    * an ``I-PER`` token extends the current name with a separating space;
    * a ``B-PER`` token (or an ``I-PER`` with nothing to extend) starts a
      new name;
    * any non-person entity closes the current name.

    :param text: Text to analyze.
    :return: List of detected names, e.g. ``["John Smith"]``.
    """
    names = []
    current_name = ""
    previous_index = None

    for entity in ner_pipeline(text):
        label = entity['entity']

        if label not in ('B-PER', 'I-PER'):
            # A non-person entity terminates whatever name was being built.
            if current_name:
                names.append(current_name.strip())
                current_name = ""
            previous_index = None
            continue

        word = entity['word']
        # Token position, when the pipeline reports one. Used so an I-PER
        # token only extends a name it directly follows; if the position is
        # unavailable we assume adjacency.
        index = entity.get('index')
        adjacent = (
            previous_index is None
            or index is None
            or index == previous_index + 1
        )

        if word.startswith("##"):
            # Word-piece continuation of the previous token.
            current_name += word[2:]
        elif label == 'I-PER' and current_name and adjacent:
            # Next word of the same person's name ("John" + "Smith").
            # Previously every new word started a new name, so multi-word
            # names were split into single words.
            current_name += " " + word
        else:
            if current_name:
                names.append(current_name.strip())
            current_name = word

        previous_index = index

    if current_name:
        names.append(current_name.strip())
    return names
def detect_names_llm(text):
    """Ask the OpenAI chat model to list the person names in *text*.

    The model is instructed to answer with a comma-separated list, or with
    "No names found". The reply is parsed leniently: surrounding quotes or a
    trailing period on the "No names found" sentinel are tolerated, and empty
    fragments produced by stray commas are dropped.

    NOTE(review): *text* is interpolated into the prompt verbatim, so memo
    content can influence the model's instructions.

    :param text: Text to analyze.
    :return: List of names reported by the model; ``[]`` when none are
        reported, when the reply is empty, or when the API call fails.
    """
    prompt = f"""
    Analyze the following text and extract any PERSON/HUMAN names mentioned, including names from various cultures:
    "{text}"
    Provide the names in a comma-separated list, or respond with "No names found" if no names are present.
    """
    try:
        completion = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts person names from text, including names from diverse cultural backgrounds."},
                {"role": "user", "content": prompt},
            ],
        )
    except Exception as e:
        # Deliberate best-effort boundary: a failed API call must not abort
        # the whole detection run, it just contributes no names.
        print(f"Error in OpenAI API: {str(e)}")
        return []

    if not completion.choices:
        return []
    content = completion.choices[0].message.content
    if not content:
        return []

    reply = content.strip()
    # Accept variants such as 'No names found.' or '"No names found"'.
    if reply.strip('."\'').lower() == "no names found":
        return []
    # Drop empty fragments (e.g. from a trailing comma).
    return [part.strip() for part in reply.split(',') if part.strip()]
def is_valid_name(name, text):
    """Decide whether *name* looks like a full person name.

    A candidate is accepted only if it has at least two whitespace-separated
    words and is not made up solely of honorifics ("Mr", "Dr.", ...).

    :param name: Candidate name string.
    :param text: The text the name came from. Currently unused; kept so the
        signature stays compatible with existing callers.
    :return: ``True`` if the candidate passes both checks, else ``False``.
    """
    tokens = name.split()
    if len(tokens) < 2:
        return False

    titles = {'mr', 'mrs', 'ms', 'dr', 'prof'}
    # The previous check compared the whole name against single-word titles,
    # which could never match once the two-word rule had passed. Compare
    # token by token instead, ignoring a trailing period ("Dr.").
    if all(token.lower().rstrip('.') in titles for token in tokens):
        return False
    return True
def detect_names(text):
    """Report whether *text* contains a person name.

    A name counts only if it passes ``is_valid_name`` and is reported, as the
    exact same string, by at least two of the three detectors (spaCy, BERT
    NER, OpenAI LLM).

    The LLM is consulted only when its answer can change the outcome: it is
    skipped when the two local detectors already agree on a name, and when
    neither produced a valid candidate (the LLM alone can never reach the
    two-detector threshold). This avoids needless API calls and avoids
    sending text to an external service when it is not required.

    :param text: Raw text to inspect (e.g. a transaction memo).
    :return: ``True`` if at least one name is confirmed, else ``False``.
    """
    preprocessed_text = preprocess_text(text)
    if not preprocessed_text:
        # Nothing to analyze.
        return False

    spacy_names = {
        name
        for name in detect_names_spacy(preprocessed_text)
        if is_valid_name(name, preprocessed_text)
    }
    bert_names = {
        name
        for name in detect_names_bert(preprocessed_text)
        if is_valid_name(name, preprocessed_text)
    }

    # Both local detectors agree: threshold already met.
    if spacy_names & bert_names:
        return True

    local_candidates = spacy_names | bert_names
    if not local_candidates:
        # No valid local candidate, so the LLM could contribute at most one
        # vote per name, which is below the threshold.
        return False

    # The LLM acts as the tie-breaker for names seen by exactly one local
    # detector.
    llm_names = set(detect_names_llm(preprocessed_text))
    return not local_candidates.isdisjoint(llm_names)
def filter_transactions_without_names(transactions):
    """
    Partition transactions by whether their memo mentions a human name.

    Each transaction's ``memo`` attribute is passed to ``detect_names``; the
    input order is preserved within each group.

    :param transactions: Iterable of transaction objects exposing ``memo``
    :return: Tuple of (transactions_without_names, transactions_with_names)
    """
    named, unnamed = [], []
    for txn in transactions:
        # Pick the destination list first, then append once.
        bucket = named if detect_names(txn.memo) else unnamed
        bucket.append(txn)
    return unnamed, named