-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathmain.py
More file actions
130 lines (111 loc) · 5.69 KB
/
main.py
File metadata and controls
130 lines (111 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from udi_grammar_py import Chart, Op, rolling
import pandas as pd
import sys
import template_generation
import process_datapackage
import insert_reference_values
import template_expansion
import paraphraser
import upload_to_huggingface
import export_sqlite
import json
# Make the project root importable when this file is run as a script.
# NOTE(review): this runs after the local imports above, so it presumably only
# matters for imports performed inside those modules — confirm placement.
sys.path.append('.')
# Pipeline feature flags. They default to False and are overridden from the
# command-line arguments parsed in the __main__ block at the bottom of the file.
# default all to False, use command line args to set them to True
UPDATE_SCHEMA = False # Set to True to update the data package schema
SAVE_HUGGINGFACE_LOCAL = False # Saves the data locally in a format similar to the HF upload
UPLOAD_TO_HUGGINGFACE = False # Set to True if you want to upload the training data to Hugging Face
PERFORM_PARAPHRASING = False # paraphrasing is time consuming, so skipping makes it easier to test the rest of the pipeline
ONLY_CACHED = False # if True, only cached data is used for paraphrasing; only matters if PERFORM_PARAPHRASING is True
GENERATE_SQLITE = False # Set to True if you want to export the data to SQLite DB
GENERATE_JSON = False # Set to True if you want to export the data to JSON
SAMPLE_SQLITE = False # Set to True if you want to subsample the data for SQLite DB
GENERATE_PARQUET = False # Set to True if you want to export the data to parquet
def main():
    """Run the full training-data generation pipeline.

    Steps: (1) generate question templates, (2) optionally refresh the data
    package schema, (3) expand templates against the dataset catalogue,
    (4) optionally paraphrase with an LLM, (5) sanity-check row counts, and
    (6) export to the configured targets (SQLite / JSON / parquet / HF).
    Behavior is driven by the module-level flag globals set in __main__.
    """
    print_header("1. Generate templates")
    df = template_generation.generate()
    template_question_count = df.shape[0]

    # update data schema based on files in ./datasets folder and export updated data packages
    if UPDATE_SCHEMA:
        print('Updating data schema')
        process_datapackage.main()
        insert_reference_values.main()

    print_header("2. Contextualize templates with real entity names and fields")
    # Contextualize the template training data by putting in real entity names
    # and fields if they satisfy the constraints.
    with open('./datasets/output_catalogue.json', encoding='utf-8') as f:
        schema_list = json.load(f)
    df = template_expansion.expand(df, schema_list)

    print_header("3. Paraphrase the contextualized templates")
    # The paraphraser will use an LLM to paraphrase query_base into several options.
    expanded_question_count = df.shape[0]
    if PERFORM_PARAPHRASING:
        if ONLY_CACHED:
            print('Using only cached data for paraphrasing, will not call LLM.')
        df = paraphraser.paraphrase(df, schema_list, ONLY_CACHED)
    else:
        print('Skipping paraphrasing, using only the original query_base.')
        df['query'] = df['query_base']
        # -1 marks "no paraphrasing metadata available" for these columns.
        df['expertise'] = -1
        df['formality'] = -1
    paraphrased_question_count = df.shape[0]

    # Sanity Check output
    print_header("4. Sanity Check output dimensions")
    print(f"Generated {template_question_count:,} templates and expanded to {expanded_question_count:,} questions and paraphrased to {paraphrased_question_count:,}.")

    print_header("5. Export data")
    if GENERATE_SQLITE:
        print_header('Exporting data to SQLite DB')
        export_sqlite.export('./out/database.sqlite', df, sample=SAMPLE_SQLITE)
    if GENERATE_JSON:
        print_header("exporting ./out/training_data.json...")
        df.to_json('./out/training_data.json', orient='records')
    if GENERATE_PARQUET:
        print_header("exporting ./out/training_data.parquet...")
        # "solution" presumably holds nested objects parquet cannot serialize,
        # so it is dropped before export — TODO confirm against the schema.
        df.drop(["solution"], axis=1).to_parquet('./out/training_data.parquet')

    # Upload data to Hugging Face and/or save it locally in the HF layout.
    if UPLOAD_TO_HUGGINGFACE or SAVE_HUGGINGFACE_LOCAL:
        if UPLOAD_TO_HUGGINGFACE and SAVE_HUGGINGFACE_LOCAL:
            # Consistency fix: the sibling branches below use print_header,
            # but this branch used a bare print().
            print_header('Uploading to Hugging Face and saving locally')
        elif UPLOAD_TO_HUGGINGFACE:
            print_header('Uploading to Hugging Face')
        elif SAVE_HUGGINGFACE_LOCAL:
            print_header('Saving data locally in format for Hugging Face')
        upload_to_huggingface.save(
            df,
            './datasets/output_catalogue.json',
            './datasets/UDIGrammarSchema.json',
            './datasets/multi_step_links.json',
            './datasets/reviews.json',
            './datasets/hf_readme.md',
            './out/huggingface/',
            'HIDIVE/DQVis',
            save_local=SAVE_HUGGINGFACE_LOCAL,
            push_to_hub=UPLOAD_TO_HUGGINGFACE
        )
def print_header(message: str) -> None:
    """Print *message* framed in an 80-column '#' banner.

    Fix: the original computed ``" " * (77 - len(message))``, which goes
    negative for messages longer than 77 characters and silently drops the
    padding; clamping at zero keeps the trailing '|' intact in all cases.
    """
    print("\n" + "#" * 80)
    # 2 chars of left border + message + pad + 1 char right border == 80 cols
    print("| " + message + " " * max(0, 77 - len(message)) + "|")
    print("#" * 80)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Generate training data for UDI Grammar')
parser.add_argument('--schema', action='store_true', help='Update the data package schema based on files in ./datasets folder')
parser.add_argument('--upload', action='store_true', help='Upload the training data to Hugging Face')
parser.add_argument('--hf_local', action='store_true', help='Save the training data locally in a format similar to the HF upload')
parser.add_argument('--paraphrase', action='store_true', help='Perform paraphrasing')
parser.add_argument('--only_cached', action='store_true', help='Use only cached data for paraphrasing')
parser.add_argument('--sqlite', action='store_true', help='Export the data to SQLite DB')
parser.add_argument('--sample', action='store_true', help='Sample the data for SQLite DB')
parser.add_argument('--json', action='store_true', help='Export the data to JSON')
parser.add_argument('--parquet', action='store_true', help='Export the data to parquet')
args = parser.parse_args()
UPDATE_SCHEMA = args.schema
UPLOAD_TO_HUGGINGFACE = args.upload
SAVE_HUGGINGFACE_LOCAL = args.hf_local
PERFORM_PARAPHRASING = args.paraphrase
GENERATE_SQLITE = args.sqlite
SAMPLE_SQLITE = args.sample
GENERATE_JSON = args.json
GENERATE_PARQUET = args.parquet
ONLY_CACHED = args.only_cached
main()