-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDatasetAnalysis.py
More file actions
114 lines (76 loc) · 2.77 KB
/
Copy pathDatasetAnalysis.py
File metadata and controls
114 lines (76 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
'''
This module, 'DatasetAnalysis.py', measures the length in sentences of each text file in the Corpus folder and plots the distribution of those lengths.
The functions in this file can be called at any time to generate statistics for the dataset.
'''
import Preprocessing as pp
import os
import matplotlib.pyplot as plt
def build_length_dict(root_dir="Corpus"):
    '''
    Creates a dictionary for the lengths of each text file in a corpus folder.
    The key of each entry is the path of the text file (root_dir joined with the file
    name), and the value is the number of items pp.preprocess returns for the file's
    contents, which this module treats as the number of sentences in the file.
    Only regular files whose names end in ".txt" are read; every other entry,
    including a directory that happens to be named "*.txt", is skipped.
    @param root_dir - the folder to scan for text files. Defaults to "Corpus".
    @return - a dictionary containing the lengths of every text file.
    '''
    lengths = {}
    # Sort the listing so the dictionary order is reproducible; os.listdir
    # returns entries in arbitrary order.
    for name in sorted(os.listdir(root_dir)):
        path = os.path.join(root_dir, name)
        # Skip anything that is not a regular .txt file, so open() cannot fail
        # on a directory whose name ends in ".txt".
        if not name.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, mode='r', encoding='utf-8') as text_file:
            sentences = pp.preprocess(text_file.read())
        lengths[path] = len(sentences)
    return lengths
def analyze_dataset(length_by_file=None, long_threshold=617):
    '''
    Generates and prints summary statistics for the corpus based on document length in sentences.
    First prints the key of every file whose length is at least long_threshold, then
    prints the number of files, the maximum length of a file, the minimum length of a file,
    the average length of a file, and the sorted list of every length.
    If there are no files, only the file count is printed.
    @param length_by_file - an optional dictionary mapping each file to its length, in the
    form returned by build_length_dict(). When omitted, build_length_dict() is called.
    @param long_threshold - files at least this long are listed by key. Defaults to 617.
    '''
    if length_by_file is None:
        length_by_file = build_length_dict()
    # List the unusually long files before the summary
    for path, length in length_by_file.items():
        if length >= long_threshold:
            print(path)
    # Sort the lengths to gather summary statistics
    lengths = sorted(length_by_file.values())
    print("File count: " + str(len(lengths)))
    if not lengths:
        # Max, min and average are undefined for an empty corpus; stop here
        # instead of failing on lengths[-1] or dividing by zero.
        return
    print("Max len: " + str(lengths[-1]))
    print("Min length: " + str(lengths[0]))
    print("Average length: " + str(sum(lengths) / len(lengths)))
    print("Length of every file:")
    print(lengths)
def plot_distribution(lengths=None, outlier_count=10, bin_count=200):
    '''
    Graphically plots the distribution of lengths of documents in a histogram.
    The outlier_count largest lengths are left out of the plot. If no lengths remain
    after that, a message is printed and nothing is plotted.
    @param lengths - an optional iterable of document lengths. When omitted, the values
    of build_length_dict() are used.
    @param outlier_count - how many of the largest lengths to exclude. Defaults to 10.
    @param bin_count - the number of histogram bins. Defaults to 200.
    @raise ValueError - if outlier_count is negative.
    '''
    if outlier_count < 0:
        raise ValueError("outlier_count must not be negative")
    if lengths is None:
        lengths = build_length_dict().values()
    ordered = sorted(lengths)
    # Exclude outliers. Slice by an explicit end index: ordered[:-0] would be
    # empty rather than the whole list when outlier_count is 0.
    kept = ordered[:max(len(ordered) - outlier_count, 0)]
    if not kept:
        # Too few documents to survive the outlier cut; there is nothing to draw.
        print("No documents left to plot after excluding " + str(outlier_count) + " outliers.")
        return
    plt.xlabel("Size of Document (# Sentences)")
    plt.ylabel("Frequency")
    plt.title("Distribution of Document Size")
    plt.hist(kept, bin_count)
    plt.show()
def main():
    '''
    Entry point used when this file is run as a script.
    Shows the histogram of document lengths.
    '''
    # Swap this call for analyze_dataset() to print summary statistics instead.
    plot_distribution()


if __name__ == "__main__":
    main()