Skip to content

Commit 47d186e

Browse files
committed
edge case name and doi handling
1 parent 259c7d3 commit 47d186e

File tree

3 files changed

+24
-2
lines changed

3 files changed

+24
-2
lines changed

dvcurator/dataverse.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,21 @@ def get_metadata(doi, token=None, host=None):
1515
:return: Metadata block from Dataverse API, or None if error
1616
:rtype: list[str]
1717
"""
18-
import requests, dvcurator.hosts
18+
import re, requests, dvcurator.hosts
1919

2020
doi = doi.strip()
2121
if not doi.startswith("doi:"):
2222
print("Error: DOIs should start with \"doi:\"")
2323
return None
2424

25+
# check for invalid DOI
26+
doi_suffix = doi[4:] # Remove the "doi:" prefix
27+
doi_pattern = r"^10\.\d{4,9}/[-._;()/:A-Z0-9]+$"
28+
if not re.match(doi_pattern, doi_suffix, re.IGNORECASE):
29+
print(f"Error: Invalid DOI format: {doi}")
30+
return None
31+
32+
2533
# Scrape data and metadata from dataverse
2634
host = dvcurator.hosts.qdr_dataverse if not host else host
2735
dataset_url = host + '/api/datasets/:persistentId/?persistentId=' + doi

dvcurator/rename.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def project_name(citation):
1515
:rtype: String
1616
1717
"""
18-
import re
18+
import re, unicodedata
1919

2020
author1_last_name = citation['author'][0]['authorName']['value'].split(', ')[0]
2121
title = citation['title']
@@ -40,6 +40,10 @@ def project_name(citation):
4040
# replace multiple spaces with one space
4141
folder_name = re.sub(r'\s+', ' ', folder_name).strip()
4242
folder_name = folder_name.strip() # Remove leading/trailing whitespace
43+
44+
# revert accented characters to their ascii equivalents.
45+
# any characters that are still non-ASCII after normalization are dropped entirely
46+
folder_name = unicodedata.normalize('NFKD', folder_name).encode('ascii', 'ignore').decode('ascii')
4347

4448
return folder_name
4549

test.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,12 @@ def test_new_step(self):
3737
class TestDataverseAPI(unittest.TestCase):
3838

3939
def test_citation(self):
40+
# these lookups should all fail, i.e. get_metadata should return None
4041
self.assertIsNone(dataverse.get_metadata("foobar", host=harvard_host))
4142
self.assertIsNone(dataverse.get_metadata("doi:foobar", host=harvard_host))
43+
self.assertIsNone(dataverse.get_metadata("doi:10.5064/F6FHTB9H"))
44+
self.assertIsNone(dataverse.get_metadata("doi:doi:doi:10.5064/F6FHTB9E"))
45+
# now let's test a success case
4246
metadata = dataverse.get_metadata(harvard_doi, host=harvard_host)
4347
citation = dataverse.get_citation(metadata)
4448
self.assertIsNotNone(citation)
@@ -76,6 +80,7 @@ def test_check(self):
7680

7781
def test_search(self):
7882
self.assertTrue(github.search_existing("Karcher - Anonymous Peer Review", repo="QualitativeDataRepository/testing-demos"))
83+
self.assertFalse(github.search_existing("Nobody - Project that doesnt exist", repo="QualitativeDataRepository/testing-demos"))
7984

8085
def test_version(self):
8186
self.assertFalse(github.check_version())
@@ -105,6 +110,11 @@ def test_projectname(self):
105110
os.makedirs(new_folder_path)
106111
self.assertTrue(os.path.exists(new_folder_path)) # Ensure it was successfully made
107112
d.cleanup()
113+
# test accented character removal
114+
metadata = dataverse.get_metadata("doi:10.5064/F6FHTB9E")
115+
citation = dataverse.get_citation(metadata)
116+
self.assertIsNotNone(citation)
117+
self.assertEqual(rename.project_name(citation), "Rabello Sodre - Memories about Colegio Sao Vicente")
108118

109119
def test_rename(self):
110120
f = tempfile.TemporaryDirectory()

0 commit comments

Comments
 (0)