-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataProcesor.py
More file actions
executable file
·42 lines (36 loc) · 1.46 KB
/
DataProcesor.py
File metadata and controls
executable file
·42 lines (36 loc) · 1.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from nltk import WordNetLemmatizer
import nltk
import re
import string
class DataProcesor:
    """Tokenize and normalize text rows for downstream NLP work.

    ``data`` is expected to be an iterable of indexable rows where
    index 1 holds a title string and index 3 holds a description
    string — NOTE(review): inferred from getStopWords(); confirm
    against the caller's row schema.

    (Class name keeps the original spelling "DataProcesor" for
    backward compatibility with existing callers.)
    """

    def __init__(self, data):
        self.data = data
        # Built once here and reused by cleanUp() so the (relatively
        # expensive) NLTK model setup is not repeated per call.
        self.wordLemmatizer = WordNetLemmatizer()
        self.stem = nltk.stem.SnowballStemmer('english')

    def getStopWords(self, useDescriptionInsteadOfTitle=False, threshoold=0):
        """Return the set of words occurring more than ``threshoold`` times.

        Words are lower-cased and tokenized with NLTK across either the
        title column (index 1, default) or the description column
        (index 3) of every row in ``self.data``. High-frequency words
        are treated as corpus-specific stop words.

        NOTE: the parameter keeps the original misspelling
        ``threshoold`` so existing keyword callers are not broken.
        """
        if useDescriptionInsteadOfTitle:
            texts = [row[3] for row in self.data]
        else:
            texts = [row[1] for row in self.data]
        counts = {}
        for sentence in texts:
            for word in nltk.tokenize.word_tokenize(sentence.lower()):
                counts[word] = counts.get(word, 0) + 1
        return {word for word, n in counts.items() if n > threshoold}

    def cleanUp(self, s, stopwords):
        """Normalize sentence ``s`` into a cleaned, stemmed string.

        Pipeline: lower-case -> tokenize -> drop short tokens (<= 2
        chars) -> strip digits -> drop punctuation-only/empty tokens ->
        lemmatize -> remove ``stopwords`` -> stem -> re-join with
        single spaces and drop any remaining periods.
        """
        s = s.lower()
        tokens = nltk.tokenize.word_tokenize(s)
        tokens = [t for t in tokens if len(t) > 2]
        # Strip digits from inside tokens; this can leave empty strings
        # (e.g. "2021" -> ""), which are filtered out just below.
        tokens = [re.sub(r'[0-9]', '', t) for t in tokens]
        # BUG FIX: the original used `t is not string.punctuation`, an
        # identity comparison against the entire punctuation string,
        # which is always True — punctuation tokens were never removed.
        # Drop tokens that are empty or made up entirely of punctuation.
        tokens = [t for t in tokens
                  if t and not all(c in string.punctuation for c in t)]
        tokens = [self.wordLemmatizer.lemmatize(t) for t in tokens]
        tokens = [t for t in tokens if t not in stopwords]
        tokens = [self.stem.stem(t) for t in tokens]
        result = " ".join(tokens)
        # Collapse any repeated spaces and strip leftover periods.
        return re.sub(' +', ' ', result).replace('.', '')