-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathexperiment_2.py
More file actions
78 lines (57 loc) · 2.62 KB
/
experiment_2.py
File metadata and controls
78 lines (57 loc) · 2.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""Experiment 2 in the paper."""
import numpy as np
import torch
from torch.autograd import Variable
from tqdm import tqdm
from conll.mlp import Perceptron, train
from conll.helpers import load_featurizers_ortho, to_csv
from conll.data import load_data
from sklearn.metrics import pairwise_distances
torch.cuda.set_device(1)

# Number of rows pushed through the network per forward pass; the same value
# is handed to train() as its batch size.
_BATCH_SIZE = 250
# Number of nearest neighbours whose cosine distance is averaged into the
# density score of a word.
_N_NEIGHBORS = 20


def _run_in_batches(fn, X, batch_size=_BATCH_SIZE):
    """Apply ``fn`` to consecutive row slices of ``X`` on the GPU.

    Parameters
    ----------
    fn : callable
        Maps a CUDA tensor holding one slice of ``X`` to a tensor.
    X : numpy.ndarray
        Input matrix; it is sliced along its first axis.
    batch_size : int
        Number of rows per slice.

    Returns
    -------
    numpy.ndarray
        The per-slice outputs, moved to the CPU and concatenated along the
        first axis.

    """
    outputs = []
    # Pure inference: no_grad avoids building an autograd graph per batch.
    with torch.no_grad():
        for start in range(0, len(X), batch_size):
            batch = torch.from_numpy(X[start:start + batch_size]).cuda()
            outputs.append(fn(batch).detach().cpu().numpy())
    return np.concatenate(outputs)


def _neighborhood_density(hidden, n_neighbors=_N_NEIGHBORS):
    """Return the mean cosine distance of each row to its nearest neighbours.

    Parameters
    ----------
    hidden : numpy.ndarray
        One representation per row.
    n_neighbors : int
        How many nearest neighbours to average over.

    Returns
    -------
    numpy.ndarray
        One score per row of ``hidden``.

    """
    dist = pairwise_distances(hidden, metric="cosine")
    # Select the n_neighbors + 1 smallest distances per row without sorting
    # the full row. Needs more than n_neighbors + 1 rows, else numpy raises.
    closest = np.partition(dist, axis=1, kth=n_neighbors + 1)
    closest = closest[:, :n_neighbors + 1]
    # After sorting, column 0 is the smallest distance (the row's distance
    # to itself), so it is dropped before averaging.
    return np.sort(closest, 1)[:, 1:].mean(1)


if __name__ == "__main__":
    # NOTE(review): only NumPy's RNG is seeded here; torch's RNG is not, so
    # training may not be reproducible across runs -- confirm whether
    # conll.mlp seeds torch itself.
    np.random.seed(44)
    print("started")

    # Identifiers of the featurizers to evaluate; anything else returned by
    # load_featurizers_ortho is skipped.
    req = [('LinearTransformer', 'fourteen'),
           ('LinearTransformer', 'one hot'),
           ('WeightedOpenBigramTransformer', 'weighted bigrams'),
           ('WickelTransformer', 'wickelfeatures')]

    for lang in ("nld", "fra", "eng-uk"):
        words, rt_data, subset_words = load_data(lang)

        # Per-word columns for the subset that ends up in the CSV.
        freqs = [x['frequency'] for x in subset_words]
        lengths = [len(x['orthography']) for x in subset_words]
        rts = [rt_data[x['orthography']] for x in subset_words]
        ortho_w = [x['orthography'] for x in subset_words]

        available = load_featurizers_ortho(words)
        featurizers, ids = zip(*[(x, y) for x, y in available if y in req])
        ids = list(ids)

        estims = []
        for feat_id, featurizer in tqdm(zip(ids, featurizers),
                                        total=len(featurizers)):
            X = featurizer.fit_transform(words).astype(np.float32)
            # Every word is its own class: row i should be predicted as i.
            y = np.arange(X.shape[0])

            # 500 is presumably the hidden-layer width and 1000 the number
            # of training epochs -- confirm against conll.mlp.
            p = Perceptron(X.shape[1], 500, X.shape[0])
            p.cuda()
            train(p, 1000, X, batch_size=_BATCH_SIZE)

            # Training accuracy: argmax over the outputs versus row index.
            predicted = _run_in_batches(lambda b: torch.max(p(b), 1)[1], X)
            corr = int(np.sum(predicted == y))
            print("Accuracy: {} {} {}".format(lang,
                                              feat_id,
                                              corr / X.shape[0]))

            # Hidden representations of the subset words only.
            X_subset = featurizer.transform(subset_words).astype(np.float32)
            hid = _run_in_batches(p.hidden, X_subset)

            # Estimate density
            s = _neighborhood_density(hid)
            estims.append(list(zip(s, freqs, lengths, rts, ortho_w)))

        # NOTE(review): np.array over tuples mixing numbers and strings
        # yields a string-typed array; kept as-is because to_csv's handling
        # of other inputs is not visible here.
        sample_results = np.array(estims)
        to_csv("data/experiment_mlp_{}_all_words.csv".format(lang),
               dict(zip(ids, sample_results)),
               ("score", "freq", "length", "rt", "ortho_form"))