Skip to content

Commit 17c2534

Browse files
authored
Added the reconstruct sub-command (#51)
* Add subcommand reconstruct * Add a default reconstruct home folder to store compiled graph * Copy and load enriched files to Blazegraph * Implement code to query ASCT+B graph * Implement code to load the graph * Implement transformRecords to convert query result into ASCT+B table format * Fix carriage return split * Make sure the column order is correct * Shorten concept IRI * Fix the order between LABEL and ID * Add header to the constructed table * Fix references to remove duplicates and add PMID, if present * Fix getting the reference ordering * Normalized PubMed identifier * Make ORCID field as string type * Append only clonality and clone_id if they are present * Implement reconstruct OMAP data * Uncomment loadGraph * Don't create any columns if the BM entry is empty * Add at least a set of REF columns (e.g., REF/1, REF/1/ID) if no reference is present * Implement reconstruct CTAnn data * Implement reconstruct Ref-Organ data * Get label from UBERON for accuracy * Fix the data type for part_of_illustration property * Implement reconstruct 2D FTU data * Implement reconstruct Collection data * Use Papaparse library to create the CSV content * Remove quoteIfNeeded since we use Papaparse to handle quotes * Move reconstruction artifacts to individual DO directories * Cleanup reconstruction artifacts after generating the final output Use the CLEANUP_ARTIFACTS flag to enable or disable this feature * Convert query output from JSON to CSV using json2csv * Change the output records to CSV format * Fix missing general publications * Remove unnecessary BIND(STR()) * Implement CSV parsing for ASCT+B DO outputs with Papaparse * Remove the unnecessary BIND(STR()) for the rest of query files * Implement CSV parsing for the rest DO outputs with Papaparse * Add reference notes into the graph * Add FTU columns into the graph * Add FTU and reference notes to reconstruction process * Fix sorting logic to handle numbers properly * Fix malformed headers when reconstructing ASCT+B 
tables * Add CT notes to the graph for use in reconstruction * Remove unused folder to store the reconstruction artifacts * Replace constant flag with --keep-artifacts CLI argument * Extract queryGraph() into a reusable function * Minor fix removing unnecessary quotes when reformatting date * Add validation for reconstructed collection digital objects * Fix array comparison to ignore item order * Extract validation logging into its own module * Implement validation technique through comparing CSV tables * Reformat the warning log message * Fix bug on locating the row number * Reformat error log message * Rename file * Implement CTAnn validation that includes checks for table metadata section * Add helper function to retrieve raw data name * Undo comment out * Implement OMAP validation * Fix missing rationale for 'binds to antibody' relationships * Add the Notes columns for AS and BM * Make ASCT+B general publication query optional * Modify ASCT+B record construction to include biomarker type for disambiguating duplicate entries * Implement soft validation for the table header * Enhance value comparison logic to handle array-like strings * Cleanup the directory after validation * Fix reference name and notes weren't captured properly * Fix issue where protein presence is not retained in normalized form * Add support for wildcard '*' to select columns for soft validation * Ensure all columns from reconstructed CSV are present in the original CSV * Skip row comparison if row sizes differ * Rename variables * Replace soft validation with per-cell comparison for unmatched rows * Rename variables
1 parent 9e4b6ae commit 17c2534

32 files changed

+2727
-48
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,3 +299,6 @@ dist
299299

300300
# VSCode settings
301301
.vscode
302+
303+
# MacOS specific ignores
304+
.DS_Store

package-lock.json

Lines changed: 43 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"commander": "^10.0.0",
3434
"glob": "^9.2.1",
3535
"js-yaml": "^4.1.0",
36+
"jsonld-cli": "^0.2.0",
3637
"mime-types": "^2.1.35",
3738
"node-fetch": "^3.3.0",
3839
"nunjucks": "^3.2.4",
@@ -41,9 +42,9 @@
4142
"rdflib": "^2.2.34-1",
4243
"semver": "^7.5.3",
4344
"shelljs": "^0.8.5",
45+
"sparqljson-parse": "^3.1.0",
4446
"tslib": "^2.5.0",
45-
"validate-iri": "^1.0.1",
46-
"jsonld-cli": "^0.2.0"
47+
"validate-iri": "^1.0.1"
4748
},
4849
"devDependencies": {
4950
"eslint": "^8.35.0",

schemas/src/digital-objects/2d-ftu.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,5 +175,6 @@ slots:
175175
description: >-
176176
Indicates the part of the illustration to which this node belongs,
177177
connecting it to the overall FTU structure.
178+
range: EntityID
178179
annotations:
179180
owl: AnnotationProperty, AnnotationAssertion

schemas/src/digital-objects/asct-b.yaml

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -137,12 +137,8 @@ classes:
137137
- lipid_marker_list
138138
- metabolites_marker_list
139139
- proteoforms_marker_list
140-
- references
141-
slot_usage:
142-
references:
143-
range: string
144-
annotations:
145-
owl: AnnotationAssertion, AnnotationProperty
140+
- ftu_list
141+
- reference_list
146142

147143
AnatomicalStructureRecord:
148144
title: Anatomical Structure Record
@@ -156,6 +152,7 @@ classes:
156152
slots:
157153
- ccf_pref_label
158154
- source_concept
155+
- notes
159156
- record_number
160157
- order_number
161158

@@ -171,6 +168,7 @@ classes:
171168
slots:
172169
- ccf_pref_label
173170
- source_concept
171+
- notes
174172
- record_number
175173
- order_number
176174

@@ -187,6 +185,39 @@ classes:
187185
- ccf_pref_label
188186
- ccf_biomarker_type
189187
- source_concept
188+
- notes
189+
- record_number
190+
- order_number
191+
192+
FtuRecord:
193+
title: Functional Tissue Unit Record
194+
description: >-
195+
A functional tissue unit (FTU) record represents a specific value in
196+
the ASCT+B table, located in the "FTU" column, identified by its row
197+
(record number) and column (order number).
198+
mixins:
199+
- Named
200+
- Instance
201+
slots:
202+
- ccf_pref_label
203+
- source_concept
204+
- notes
205+
- record_number
206+
- order_number
207+
208+
ReferenceRecord:
209+
title: Reference Record
210+
description: >-
211+
A reference record represents a specific value in the ASCT+B table,
212+
located in the "REF" column, identified by its row (record number) and
213+
column (order number).
214+
mixins:
215+
- Named
216+
- Instance
217+
slots:
218+
- doi
219+
- external_id
220+
- notes
190221
- record_number
191222
- order_number
192223

@@ -473,6 +504,42 @@ slots:
473504
slot_uri: ccf:proteoform_marker
474505
annotations:
475506
owl: AnnotationAssertion, AnnotationProperty
507+
ftu_list:
508+
title: functional tissue unit
509+
description: List of FTUs referenced within ASCT+B records.
510+
required: false
511+
multivalued: true
512+
inlined_as_list: true
513+
range: FtuRecord
514+
slot_uri: ccf:ftu_types
515+
annotations:
516+
owl: AnnotationAssertion, AnnotationProperty
517+
reference_list:
518+
title: reference list
519+
description: List of references for the ASCT+B record.
520+
required: false
521+
multivalued: true
522+
inlined_as_list: true
523+
range: ReferenceRecord
524+
slot_uri: ccf:reference
525+
annotations:
526+
owl: AnnotationAssertion, AnnotationProperty
527+
doi:
528+
title: doi
529+
description: Digital Object Identifier for the reference record.
530+
required: true
531+
slot_uri: ccf:doi
532+
range: uriorcurie
533+
annotations:
534+
owl: AnnotationAssertion, AnnotationProperty
535+
external_id:
536+
title: external id
537+
description: External identifier for the reference record.
538+
required: false
539+
slot_uri: ccf:external_id
540+
range: string
541+
annotations:
542+
owl: AnnotationAssertion, AnnotationProperty
476543
primary_cell_type:
477544
title: primary cell type
478545
description: The cell type in the cell marker descriptor.
@@ -503,3 +570,10 @@ slots:
503570
range: AsctbRecord
504571
annotations:
505572
owl: AnnotationAssertion, AnnotationProperty
573+
notes:
574+
title: notes
575+
description: >-
576+
Reference to the notes column in the ASCT+B table
577+
slot_uri: ccf:notes
578+
annotations:
579+
owl: AnnotationAssertion, AnnotationProperty

schemas/src/digital-objects/omap.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,6 @@ slots:
364364
title: author ORCID
365365
description: Reference to the author by the ORCID identifier.
366366
required: false
367-
range: Named
368367
inlined: false
369368
multivalued: true
370369
annotations:

src/cli.js

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import { newDraft } from './drafting/new-draft.js';
99
import { enrich } from './enrichment/enrich.js';
1010
import { finalize } from './finalizing/finalize.js';
1111
import { validate } from './validation/validate.js';
12+
import { reconstruct } from './reconstruction/reconstruct.js';
1213
import { deployDoiXml } from './finalizing/misc-files.js';
1314
import { genAsctbCollectionJson } from './gen-asctb-collection-json.js';
1415
import { list } from './list.js';
@@ -118,6 +119,15 @@ program
118119
enrich(getContext(program, command, str));
119120
});
120121

122+
program
123+
.command('reconstruct')
124+
.description('Reconstructs a given digital object from the enriched graph data')
125+
.argument('<digital-object-path>', 'Path to the digital object relative to DO_HOME')
126+
.option('--keep-artifacts', 'Keep intermediate files for debugging (default: cleanup artifacts)', false)
127+
.action((str, _options, command) => {
128+
reconstruct(getContext(program, command, str));
129+
});
130+
121131
program
122132
.command('build')
123133
.description('Given a Digital Object, checks for and runs normalization, enrichment, and packaging in one command.')

src/normalization/asct-b-utils/api.model.js

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,16 @@ export const objectFieldMap = {
135135
NOTE: 'notes',
136136
};
137137

138+
export function getBiomarkerColumnName(biomarkerType) {
139+
switch (biomarkerType) {
140+
case "gene": return "BG";
141+
case "protein": return "BP";
142+
case "lipids": return "BL";
143+
case "metabolites": return "BM";
144+
case "proteoforms": return "BF";
145+
}
146+
}
147+
138148
export function createObject(name, structureType) {
139149
switch (structureType) {
140150
case 'REF':
@@ -146,11 +156,15 @@ export function createObject(name, structureType) {
146156
}
147157

148158
export class Reference {
149-
constructor(id) {
150-
this.id = id;
159+
constructor(name) {
160+
this.name = name;
151161
}
152162
isValid() {
153-
return !!this.id || !!this.doi || !!this.notes;
163+
return !!this.id;
164+
}
165+
checkIsDoi(str) {
166+
const doiRegex = /(10\.\d{4,9}\/[\w\-._;()/:]+)/i;
167+
return doiRegex.test(str);
154168
}
155169
}
156170

@@ -219,6 +233,15 @@ export class Row {
219233
this.cell_types = this.cell_types.filter((s) => s.isValid());
220234
this.ftu_types = this.ftu_types.filter((s) => s.isValid());
221235
this.references = this.references.filter((s) => s.isValid());
236+
237+
// Remove duplicates based on 'id' property
238+
const seenIds = new Set();
239+
this.references = this.references.filter((s) => {
240+
if (s.id && seenIds.has(s.id)) return false;
241+
if (s.id) seenIds.add(s.id);
242+
return true;
243+
});
244+
222245
this.biomarkers_gene = this.biomarkers_gene.filter((s) => s.isValid());
223246
this.biomarkers_protein = this.biomarkers_protein.filter((s) => s.isValid());
224247
this.biomarkers_lipids = this.biomarkers_lipids.filter((s) => s.isValid());

0 commit comments

Comments
 (0)