-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathstrip-pcsh.py
More file actions
55 lines (47 loc) · 1.35 KB
/
strip-pcsh.py
File metadata and controls
55 lines (47 loc) · 1.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import sys
from dotenv import load_dotenv
from pipeline.config import Config
import json
# Substrings identifying the vocabularies whose presence among a type's
# equivalents is enough to keep that type in the merged cache.
VOCABS = ("wikidata.org", "getty.edu", "art.yale", "ycba-lux", "images.peabody")

# The running kill count is printed each time it reaches a multiple of this.
PROGRESS_EVERY = 50000


def count_vocab_matches(equivs, vocabs=VOCABS):
    """Count how many (equivalent, vocabulary) pairs match by substring.

    Args:
        equivs: Sized iterable of equivalent identifier strings for one type.
        vocabs: Substrings to look for inside each equivalent.

    Returns:
        The number of pairs where the vocabulary substring occurs in the
        equivalent. An equivalent containing several vocabulary substrings
        is counted once per substring. Returns 0 when ``equivs`` has two or
        fewer entries.
    """
    # min is self and token: two or fewer entries means nothing external.
    if len(equivs) <= 2:
        return 0
    return sum(1 for equiv in equivs for vocab in vocabs if vocab in equiv)


def write_id_list(path, ids):
    """Write ``ids`` to ``path`` as a single JSON array."""
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(json.dumps(ids))


def main():
    """Strip types from the merged record cache that lack a known-vocabulary equivalent.

    Walks the ILS index, resolves each record id to its internal identifier
    through the id map, and deletes the merged-cache entry when none of its
    equivalents mention one of ``VOCABS``. The identifiers removed and
    retained are written to ``killed.json`` and ``kept.json`` in the current
    working directory.
    """
    load_dotenv()
    basepath = os.getenv("LUX_BASEPATH", "")
    cfgs = Config(basepath=basepath)
    idmap = cfgs.get_idmap()
    cfgs.cache_globals()
    cfgs.instantiate_all()

    # Run through Types in merged and strip out any that are PCSH without
    # other equivalents.
    merged = cfgs.results["merged"]["recordcache"]
    idx = cfgs.internal["ils"]["indexLoader"].load_index()

    killed = []
    kept = []
    # NOTE(review): assumes more than one index record can resolve to the same
    # yuid; deciding each yuid once avoids a second delete of the same cache
    # entry and duplicate entries in the reports -- confirm against the id map.
    decided = set()

    try:
        for recid in idx:
            sys.stdout.write(".")
            sys.stdout.flush()

            try:
                yuid = idmap[f"{recid}##quaType"]
                equivs = idmap[yuid]
            except Exception:
                # Best effort: records the id map cannot resolve are skipped.
                # Kept broad because the id map's failure mode is not visible
                # from this script.
                continue

            if yuid in decided:
                continue
            decided.add(yuid)

            okay = count_vocab_matches(equivs)
            if not okay:
                killed.append(yuid)
                # The cache is keyed by the last path segment of the yuid.
                del merged[yuid.rsplit("/", 1)[1]]
                # Report only when a kill lands on the milestone, rather than
                # on every record while the count happens to sit on one.
                if len(killed) % PROGRESS_EVERY == 0:
                    print(len(killed))
            else:
                print(f"{recid} / {yuid} has {okay} equivalents: {equivs}")
                kept.append(yuid)
    finally:
        # Deletions have already been applied to the cache, so record what was
        # done even if the loop aborted part-way through.
        write_id_list("killed.json", killed)
        write_id_list("kept.json", kept)


if __name__ == "__main__":
    main()