-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtransaction.py
More file actions
45 lines (39 loc) · 1.82 KB
/
transaction.py
File metadata and controls
45 lines (39 loc) · 1.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# transaction.py
import re
from constants import BUSINESS_SUFFIXES, BANK_TERMS, US_STATES
from nltk.corpus import stopwords
import nltk
# Fetch the required NLTK resources when this module is imported;
# quiet=True keeps the downloader from printing progress output.
# The 'stopwords' corpus backs the stopwords.words('english') lookup used by
# Transaction when cleaning memos.
# NOTE(review): 'punkt' is not referenced by anything visible in this file —
# confirm it is still needed.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
class Transaction:
    """A transaction record built from arbitrary keyword fields.

    Every keyword argument becomes an instance attribute. If ``memo`` and/or
    ``name`` are supplied, cleaned versions are computed once at construction
    time and stored as ``cleaned_memo`` / ``cleaned_name``.

    Attributes set by ``__init__``:
        cleaned_memo: result of ``clean_memo()``, or ``None`` if no ``memo``.
        cleaned_name: result of ``clean_name()``, or ``None`` if no ``name``.
        matching_steps: starts as an empty dict.
        predicted_category: starts as ``None``.
    """

    # Words treated as location fragments in addition to US_STATES.
    _COMMON_CITY_WORDS = frozenset({'LAS', 'NEW', 'SAN', 'LOS', 'SAINT', 'ST'})

    # Compiled once; used to strip URLs and e-mail-like tokens from memos.
    _URL_PATTERN = re.compile(r'https?://\S+')
    _EMAIL_PATTERN = re.compile(r'\S+@\S+')

    # Lazily filled by _english_stop_words() so the corpus is read only once.
    _stop_words_cache = None

    def __init__(self, **kwargs):
        """Store every keyword argument as an attribute and derive cleaned fields."""
        for key, value in kwargs.items():
            setattr(self, key, value)
        self.cleaned_memo = self.clean_memo() if hasattr(self, 'memo') else None
        self.cleaned_name = self.clean_name() if hasattr(self, 'name') else None
        self.matching_steps = {}
        self.predicted_category = None

    @classmethod
    def _english_stop_words(cls):
        """Return the English stop-word set, loading it on first use only."""
        if Transaction._stop_words_cache is None:
            Transaction._stop_words_cache = frozenset(stopwords.words('english'))
        return Transaction._stop_words_cache

    def clean_memo(self):
        """Return the memo with noise tokens removed, or ``None``.

        Returns ``None`` when the instance has no ``memo`` attribute or the
        memo is not a string. Otherwise URLs and e-mail-like tokens are
        stripped, the text is split on whitespace, and a token is dropped if
        it is an English stop word (case-insensitive), has two characters or
        fewer, is a location word, is a transaction detail, or (upper-cased)
        is in ``BUSINESS_SUFFIXES``. The surviving tokens are joined with
        single spaces.
        """
        memo = getattr(self, 'memo', None)
        if not isinstance(memo, str):
            return None

        memo = self._URL_PATTERN.sub('', memo)
        memo = self._EMAIL_PATTERN.sub('', memo)

        stop_words = self._english_stop_words()
        kept = [
            word for word in memo.split()
            if len(word) > 2
            and word.lower() not in stop_words
            and not self.is_location(word)
            and not self.is_transaction_detail(word)
            and word.upper() not in BUSINESS_SUFFIXES
        ]
        return ' '.join(kept)

    def clean_name(self):
        """Return the name with location, bank and suffix tokens removed.

        Returns ``None`` when the instance has no ``name`` attribute or the
        name is not a string. Tokens are compared upper-cased against
        ``BANK_TERMS`` and ``BUSINESS_SUFFIXES``; the remaining tokens are
        joined with single spaces.
        """
        name = getattr(self, 'name', None)
        if not isinstance(name, str):
            return None

        kept = [
            word for word in name.split()
            if not self.is_location(word)
            and word.upper() not in BANK_TERMS
            and word.upper() not in BUSINESS_SUFFIXES
        ]
        return ' '.join(kept)

    @staticmethod
    def is_location(word):
        """Return True if ``word`` (case-insensitive) is in ``US_STATES`` or
        is one of the common city-name words."""
        upper = word.upper()
        return upper in US_STATES or upper in Transaction._COMMON_CITY_WORDS

    @staticmethod
    def is_transaction_detail(word):
        """Return True if ``word`` is all digits or more than half digits.

        An empty string returns False; without this guard the digit-ratio
        computation would divide by zero.
        """
        if not word:
            return False
        if word.isdigit():
            return True
        digit_count = sum(ch.isdigit() for ch in word)
        return digit_count / len(word) > 0.5