Cooperbot/dataset.py at master · jmiller656/Cooperbot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import re
import numpy as np
import collections


def getText(end=10000):
	"""Read the start of 'coopertext.txt' and return it as character codes.

	At most `end` lines are read from 'coopertext.txt' in the current
	working directory. Within each line, runs of spaces are collapsed to a
	single space and the newline is replaced by a space, so the lines are
	joined into one continuous string.

	Args:
		end: Maximum number of lines to read. Values <= 0 yield empty output.

	Returns:
		A 1-D float32 numpy array holding ord() of every character of the
		cleaned text, in order.

	Raises:
		OSError: if 'coopertext.txt' cannot be opened.
	"""
	pieces = []
	# The context manager closes the file even if reading or cleaning fails;
	# the previous version leaked the handle.
	with open('coopertext.txt','r') as f:
		for i, line in enumerate(f):
			if i >= end:
				break
			line = re.sub(' +',' ',line)
			line = re.sub('\n',' ',line)
			pieces.append(line)
	# Join once instead of growing a string with += on every line.
	codes = [ord(ch) for ch in ''.join(pieces)]
	return np.asarray(codes).astype(np.float32,copy=False)

def get_words():
	"""Return every word of 'coopertext.txt' as a list of lowercase strings.

	Each line is cleaned before splitting: runs of spaces are collapsed,
	the newline becomes a space, every character that is neither a word
	character (\\w) nor whitespace is removed, and the result is lowercased.
	The cleaned lines are joined and split on whitespace.

	Returns:
		A list of words in file order (empty if the file has no words).

	Raises:
		OSError: if 'coopertext.txt' cannot be opened.
	"""
	pieces = []
	# The context manager closes the file even on error; the previous
	# version never closed the handle.
	with open('coopertext.txt','r') as f:
		for line in f:
			line = re.sub(' +',' ',line)
			line = re.sub('\n',' ',line)
			# Strip punctuation so "foo-bar" becomes "foobar" and "hi!" becomes "hi".
			line = re.sub(r"[^\w\s]+", "", line)
			pieces.append(line.lower())
	# Join once rather than concatenating per line, then split on whitespace.
	return ''.join(pieces).split()


# Step 2: Build the dictionary and replace rare words with UNK token.
def build_dataset(words, vocabulary_size):
	"""Encode `words` as integer ids over a frequency-limited vocabulary.

	The vocabulary is the token 'UNK' (id 0) followed by the
	`vocabulary_size - 1` most common words, numbered in that order.
	Words outside the vocabulary are encoded as 0.

	Args:
		words: Sequence of hashable tokens (iterated more than once).
		vocabulary_size: Total vocabulary size, including the 'UNK' slot.

	Returns:
		A tuple (data, count, dictionary, reverse_dictionary):
		data is the list of ids, one per input word; count is a list whose
		first entry is ['UNK', <number of out-of-vocabulary words>] followed
		by (word, frequency) pairs; dictionary maps word -> id; and
		reverse_dictionary maps id -> word.
	"""
	frequent = collections.Counter(words).most_common(vocabulary_size - 1)
	count = [['UNK', -1]] + frequent

	# Ids are handed out in insertion order: each new word gets the current
	# size of the mapping.
	dictionary = {}
	for entry in count:
		dictionary[entry[0]] = len(dictionary)

	data = []
	unk_count = 0
	for token in words:
		index = dictionary.get(token)
		if index is None:
			# Out-of-vocabulary word: use the 'UNK' id and tally it.
			index = 0
			unk_count += 1
		data.append(index)

	# Replace the -1 placeholder with the real number of unknown words.
	count[0][1] = unk_count
	reverse_dictionary = {idx: token for token, idx in dictionary.items()}
	return data, count, dictionary, reverse_dictionary

def get_data_vectors(vec_len=10,end=1):
	"""Build (input window, one-hot next character) training pairs.

	The text comes from getText(end=end). For each offset i, the input is
	the `vec_len` character codes starting at i and the target is a
	128-element one-hot vector for the character code that follows.

	Args:
		vec_len: Number of character codes in each input window.
		end: Maximum number of lines of the text file to use. This used to
			be ignored (the call was hard-coded to one line); it is now
			forwarded to getText. The default of 1 keeps the old behaviour.

	Returns:
		A two-element list [x, y]: x is a list of float32 arrays of length
		`vec_len`, and y is a list of 128-element one-hot arrays (numpy's
		default float dtype), one per window.

	Raises:
		IndexError: if a target character code is 128 or greater, since the
			one-hot vector only has 128 slots.
	"""
	text = getText(end=end)
	x = []
	y = []
	# NOTE(review): windows advance one character at a time, but the number of
	# windows is (len(text)-1)//vec_len, so only the start of the text is
	# covered. Kept as-is to leave the produced dataset unchanged; confirm
	# whether len(text)-vec_len was intended.
	for i in range((len(text)-1)//vec_len):
		in_var = text[i:vec_len+i]
		out_var = text[i+vec_len]
		x.append(in_var)
		ov = np.zeros(128)
		ov[int(out_var)] = 1
		y.append(ov)
	# The former np.asarray(...).astype(...) statements discarded their results
	# and had no effect, so they were removed; the lists are returned as before.
	return [x,y]