-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvalidator.py
More file actions
182 lines (154 loc) · 8.3 KB
/
Copy pathvalidator.py
File metadata and controls
182 lines (154 loc) · 8.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
from lxml import etree
import subprocess
import click
import glob
import os
import re
rootdir = os.path.dirname(os.path.abspath(__file__))
ns = {"tei": "http://www.tei-c.org/ns/1.0"}
n = 0
# https://stackoverflow.com/questions/8236107/how-to-replace-node-values-in-xml-with-python
# XPATH : target = root.xpath(".//tei:bibl", namespaces=ns)
# SEE README.md OF THIS REPOSITORY FOR A LIST OF # NECESSARY # ENHANCEMENTS
@click.group()
def validator():
"""
command line interface to check the validity of XML files against a relaxng schema
and update the files with corrected data
"""
@validator.command("errlogger")
def errlogger():
"""
function to validate xml files against XML relaxng using a rng validator CLI called 'jingtrang'
https://pypi.org/project/jingtrang/
:return: list of distinct rng errors
"""
errlog = [] # list of distinct error messages
# iterate through all matching files
for dir in glob.iglob("*"):
if os.path.isdir(dir) and re.match(r"[A-Z]{3}_à_corriger", dir):
curdir = os.path.abspath(dir) # the directory currently being scanned by the script
for fpath in glob.iglob(f"{curdir}/*.xml"):
# validate the file against rng schema using jingtrang in a shell subprocess
out, err = subprocess.Popen(f"pyjing ../_schemas/odd_katabase.rng {fpath}",
shell=True, encoding="utf-8", stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()
# build a list of errors
try:
out = out.splitlines() # store stdout (relaxng error log) as a list
except Exception as e:
click.echo(f"# EXCEPTION --- {e}")
# build a list of distinct errors
for o in out:
o = str(o).replace(f"{fpath}", "")
o = re.sub(r"^:\d+:\d+: error: ", "", o)
if o not in errlog:
errlog.append(o)
click.echo(errlog)
return errlog
@validator.command("corrector")
def corrector():
"""
function to correct XML files of New_OutputData
:return:
"""
# create output directories
outclean = os.path.join(rootdir, "out_clean") # directory to store clean files
if not os.path.isdir(outclean):
os.makedirs(outclean)
outerr = os.path.join(rootdir, "out_a_corriger") # directory to store weird files
if not os.path.isdir(outerr):
os.makedirs(outerr)
# iterate through all matching files
for dir in glob.iglob("*"):
if os.path.isdir(dir) and re.match(r"[A-Z]{3}_à_corriger", dir):
curdir = os.path.abspath(dir) # the directory currently being scanned by the script
for fpath in glob.iglob(f"{curdir}/*.xml"): # fpath = filepath of the currently used file
basename = os.path.basename(fpath) # basename = name of the file without its path
errcount = 0 # number of errors in the XML file
# open and prepare file
with open(fpath, mode="r") as f:
tree = etree.parse(f)
root = tree.getroot()
# update tei:bibl with an @ana attribute
bibl = root.xpath(".//tei:sourceDesc/tei:bibl", namespaces=ns)
bibl = bibl[0] # an xpath returns a list => select the first and only
# item of the list
if bibl.xpath("not(./@ana)"):
# get value to use for @ana from the parent directory (LAV, LAC...)
# and insert @ana in val
attr = re.sub(r"_à_corriger", r"", dir)
bibl.set("ana", attr)
# update tei:name with an @type attribute (the type of name mentionned
# for each catalog item) ;
# for each file, name returns a list of tei:name with no @type attribute
# context returns the tei:desc of the tei:item associated with each tei:name
mapping = {} # dicitionnary matching to each name its desc
name = root.xpath(".//tei:item/tei:name[not(./@type)]", namespaces=ns)
context = root.xpath(".//tei:item[not(tei:name/@type)]/tei:desc", namespaces=ns)
# in this weird case (empty tei:desc or tei:name), move the files
# in another directory to do the cleaning by hand
if len(name) != len(context):
errcount += 1
# if the files are aldready clean, move them to the destination directory
elif len(name) == 0 and len(context) == 0:
errcount += 1
# if tei:name or tei:desc is None
# "normal error case" : @type is missing from tei:name, but there is
# a description for each missing tei:item with a missing @type
else:
# for each invalid tei:item in the file
for n in name:
# create a dictionnary mapping to each tei:name its tei:desc
idx = name.index(n)
mapping[n.text] = context[idx].text
# additional exception handling: if there are several invalid
# tei:items but in a file but one or more
# tei:name / tei:desc are empty
if n.text is not None and mapping[n.text] is not None:
# prompt user with name and description and ask them to add a
# valid @type for the tei:name
click.echo(basename)
click.echo(f"--- NAME --- \n {n.text}")
click.echo(f"--- DESC --- \n {mapping[n.text]}")
# get @type's value from user input and check its validity
data = click.prompt("value of tei:name's @type (author|recipient|other)")
if not re.match(r"^(author|recipient|other)$", data):
data = click.prompt("ERROR ; value of tei:name's @type (author|recipient|other)")
# set attribute value
n.set("type", data)
else:
errcount += 1
# write it to the proper directory
if errcount != 0:
writer(input_xml=tree, output_dir=outerr, fname=basename)
else:
writer(input_xml=tree, output_dir=outclean, fname=basename)
# TO DO LATER: ADD THING TO DELETE FILES THAT HAVE BEEN WRITTEN TO
# OUTPUT DIRECTORIES ; CAN BE DONE IN WRITER()
def writer(input_xml, output_dir, fname):
"""
function to write the updated xml to the proper directory
:param input_xml: input xml, parsed with lxml and updated in corrector()
:param output_dir: path to the directory where the file should be written
:param fname: name of the file to be written (base name only, not the path)
:return:
"""
with open(f"{output_dir}/{fname}", mode="wb") as out:
input_xml.write(out, encoding="utf-8", xml_declaration=True, pretty_print=True)
if __name__ == "__main__":
validator()
# OUTPUT OF ERROR LOG:
"""['value of attribute "quantity" is invalid;
must be a string matching the regular expression "(\\-?[\\d]+/\\-?[\\d]+)",
must be a decimal number or must be a floating-point number',
'element "name" missing required attribute "type"']
"""
# À PARTIR DE LÀ :
# - parser out pour n'avoir que des données pertinentes ;
# - si out correspond à certaine erreurs répandues, imprimer les erreurs
# - ensuite, imprimer le xml correspondant
# - enfin modifier le xml avec lxml en fonction de ces erreurs
# (si erreur: name[not(@type)] et contenu de name = truc, alors ajouter tel attrib @type
# - enregistrer les nv fichiers xml (modifiés avec lxml) dans un nv dossier 'out'
# - faire une dernière vérif avec pyjing