-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWordList.cpp
More file actions
211 lines (201 loc) · 5.52 KB
/
WordList.cpp
File metadata and controls
211 lines (201 loc) · 5.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#include "WordList.h"
WordList::WordList()
{
}
WordList::WordList(const string &name) :mFileName(name)
{
}
bool WordList::load()
{
std::ifstream file(mFileName);
if (!file)
{
// Simple error handling without exceptions
cerr << "Error opening file: " << mFileName << endl;
return false;
}
// Clear the data structures
mWords.clear();
mWords.shrink_to_fit();
mMasks.clear();
mMasks.shrink_to_fit();
mEditDistanceOneGraph.clear();
mEditDistanceOneGraph.shrink_to_fit();
string line;
while (std::getline(file, line))
{
addWord(line);
}
file.close();
cout << "Loaded " << mNumWords << " words from " << mFileName << endl;
for (size_t i = 0; i < mWords.size(); ++i)
{
cout << "Words of length " << i + 1 << ": " << mWords[i].size() << endl;
}
// Create the graphs for words of different lengths
for (size_t i = 0; i < mWords.size(); ++i)
{
auto &masks = mMasks[i];
mEditDistanceOneGraph.push_back(Graph(mWords[i].size()));
for (const auto& [mask, wordIds] : masks)
{
// Go through all pairs of words with the same mask and add an edge between them
for (size_t j = 0; j < wordIds.size(); ++j)
{
for (size_t k = j + 1; k < wordIds.size(); ++k)
{
mEditDistanceOneGraph[i].addEdge(wordIds[j], wordIds[k]);
}
}
}
}
return true;
}
size_t WordList::addWord(const string &word)
{
// Add word to the list
int len = word.size();
if (len > mWords.size())
{
mWords.resize(len);
mMasks.resize(len);
}
mNumWords++;
int word_id = mWords[len - 1].size();
mWords[len - 1].push_back(word);
// Add masks for a word
for (size_t i = 0; i < word.size(); ++i)
{
// Generate a mask by replacing the i-th character with '*'
std::string mask = word;
mask[i] = '*';
// Store the word index in the vector of words with the same mask
mMasks[len - 1][mask].push_back(word_id);
}
return word_id;
}
void WordList::addWordToGraph(const string &word)
{
// Method assumes it follows addWord for a given word
size_t word_len_id = word.size()-1;
auto& masks = mMasks[word_len_id];
auto& words = mWords[word_len_id];
mEditDistanceOneGraph[word_len_id].addNode(words.size()-1);
for (const auto& [mask, wordIds] : masks)
{
if (wordIds.size() < 2)
continue;
// Just added word is the last one always
int word_id = wordIds[wordIds.size() - 1];
if (words[word_id] != word)
continue;
for (size_t i = 0; i < wordIds.size() - 1; ++i)
{
mEditDistanceOneGraph[word_len_id].addEdge(wordIds[i], word_id);
}
}
}
StringList WordList::findTransform(const string &a, const string &b)
{
// Search for tranform is straightforward
// Just use the graph to find the shortest path between nodes corresponding to words
if (a.size() != b.size())
{
std::cerr << "Words must be of the same length" << std::endl;
return StringList();
}
int a_id = -1;
int b_id = -1;
int word_len_id = a.size()-1;
for (size_t i = 0; i< mWords[word_len_id].size(); ++i)
{
auto &word = mWords[word_len_id][i];
if (word == a)
a_id = i;
if (word == b)
b_id = i;
}
if (a_id == -1)
{
cout << "Word "<< a<<" is not in the word list, adding it to the list" << endl;
a_id = addWord(a);
addWordToGraph(a);
}
if (b_id == -1)
{
cout << "Word " << b << " is not in the word list, adding it to the list" << endl;
b_id = addWord(b);
addWordToGraph(b);
}
auto path = mEditDistanceOneGraph[word_len_id].getShortestPath(a_id, b_id);
if (path.empty())
{
cerr << "No transform between "<<a<<" and "<<b<<" found" << endl;
return StringList();
}
StringList words;
for (auto& id : path)
{
words.push_back(mWords[word_len_id][id]);
}
cout << "Transform between " << a << " and " << b << " was found :" << endl;
return words;
}
StringList WordList::findRandomTransform()
{
std::random_device r;
std::default_random_engine e1(r());
std::uniform_int_distribution<int> uniform_dist_len(4, 12);
// Select random words from "bigger"(4-12 in enable.txt list) sets of words
int word_len_id = uniform_dist_len(e1);
int a_id = 0;
int b_id = 0;
while (a_id == b_id)
{
std::uniform_int_distribution<int> uniform_dist_word(0, mWords[word_len_id].size());
a_id = uniform_dist_word(e1);
b_id = uniform_dist_word(e1);
}
return findTransform(mWords[word_len_id][a_id], mWords[word_len_id][b_id]);
}
void WordList::printMemoryUsage() const
{
// It doesn't show perfectly exact calculation
// Just takes the main contributors to memory usage into account
cout << "Memory usage estimation (main contributors, not exact):" << endl;
size_t size_graph = 0;
for (size_t i = 0; i < mWords.size(); ++i)
{
size_graph += mEditDistanceOneGraph[i].getMemoryUsage();
}
cout << "All graphs consume: " << size_graph << " bytes" << endl;
size_t size_words = 0;
for (size_t i = 0; i < mWords.size(); ++i)
{
//just estimate size of actually stored strings as the main contributor to memory usage
for (const auto& word : mWords[i])
{
size_words += word.capacity() * sizeof(char);
}
}
cout << "All words consume: " << size_words << " bytes" << endl;
size_t size_masks = 0;
for (size_t i = 0; i < mMasks.size(); ++i)
{
// estimate hash map table usage
size_masks += mMasks[i].bucket_count() * sizeof(void*);
// estimate entries
for (const auto& [mask, wordIds] : mMasks[i])
{
// key is a string
size_masks += mask.capacity() * sizeof(char);
// value for given key is a vector of integers
size_masks += wordIds.capacity() * sizeof(int);
}
}
cout << "All masks consume: " << size_masks << " bytes" << endl;
cout << "Total memory usage: " << size_graph + size_words + size_masks << " bytes" << endl;
}
WordList::~WordList()
{
}