Skip to content

Commit 5c8fb3a

Browse files
committed
bugfix for batch retrieval
1 parent 11b8d86 commit 5c8fb3a

File tree

3 files changed

+56
-35
lines changed

3 files changed

+56
-35
lines changed

server/preprocessing/other-scripts/base.R

Lines changed: 26 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -156,37 +156,32 @@ get_papers <- function(query, params,
156156
has_custom_clustering_annotation <- unlist(lapply(metadata$subject_orig, function(x) grepl(paste0(cc, ":"), x, fixed=TRUE)))
157157
metadata <- metadata[has_custom_clustering_annotation,]
158158
}}
159-
# don't deduplicate if params$deduplicate_base is set to FALSE
160-
if (!is.null(params$deduplicate_base) && params$deduplicate_base != FALSE) {
161-
# log to skip deduplication
162-
blog$info(paste("vis_id:", .GlobalEnv$VIS_ID, "Deduplication skipped"))
163-
} else {
164-
while (nrow(metadata) - sum(metadata$is_duplicate) < limit && attr(res_raw, "numFound") > offset+120 && r < req_limit) {
165-
offset <- offset+120
166-
res_raw <- get_raw_data(limit,
167-
base_query,
168-
return_fields,
169-
sortby_string,
170-
filter,
171-
repo,
172-
coll,
173-
retry_opts,
174-
offset,
175-
non_public)
176-
res <- bind_rows(res, res_raw$docs)
177-
metadata <- etl(res, repo, non_public)
178-
metadata <- unique(metadata, by = "id")
179-
metadata <- sanitize_abstract(metadata)
180-
metadata <- mark_duplicates(metadata)
181-
metadata$has_dataset <- unlist(lapply(metadata$resulttype, function(x) "Dataset" %in% x))
182-
# check if custom clustering annotation param is in metadata
183-
if (!is.null(cc)) {
184-
if (!(cc %in% names(fieldmapper))) {
185-
has_custom_clustering_annotation <- unlist(lapply(metadata$subject_orig, function(x) grepl(paste0(cc, ":"), x, fixed=TRUE)))
186-
metadata <- metadata[has_custom_clustering_annotation,]
187-
}}
188-
r <- r+1
189-
}
159+
160+
while (nrow(metadata) - sum(metadata$is_duplicate) < limit && attr(res_raw, "numFound") > offset+120 && r < req_limit) {
161+
offset <- offset+120
162+
res_raw <- get_raw_data(limit,
163+
base_query,
164+
return_fields,
165+
sortby_string,
166+
filter,
167+
repo,
168+
coll,
169+
retry_opts,
170+
offset,
171+
non_public)
172+
res <- bind_rows(res, res_raw$docs)
173+
metadata <- etl(res, repo, non_public)
174+
metadata <- unique(metadata, by = "id")
175+
metadata <- sanitize_abstract(metadata)
176+
metadata <- mark_duplicates(metadata)
177+
metadata$has_dataset <- unlist(lapply(metadata$resulttype, function(x) "Dataset" %in% x))
178+
# check if custom clustering annotation param is in metadata
179+
if (!is.null(cc)) {
180+
if (!(cc %in% names(fieldmapper))) {
181+
has_custom_clustering_annotation <- unlist(lapply(metadata$subject_orig, function(x) grepl(paste0(cc, ":"), x, fixed=TRUE)))
182+
metadata <- metadata[has_custom_clustering_annotation,]
183+
}}
184+
r <- r+1
190185
}
191186
# check if custom clustering annotation param is in metadata
192187
if (!is.null(cc)) {

server/workers/base/src/base.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from datetime import datetime
2929
import dateparser
3030
import sys
31+
from typing import Dict
3132
from common.rate_limiter import RateLimiter
3233

3334
logger = logging.getLogger(__name__)
@@ -86,7 +87,7 @@ def execute_search(self, params):
8687
else:
8788
metadata = pd.DataFrame(raw_metadata)
8889
metadata = self.sanitize_metadata(metadata)
89-
metadata = filter_duplicates(metadata, original_service)
90+
metadata = filter_duplicates(metadata, original_service, params)
9091
metadata = pd.concat(
9192
[metadata, parse_annotations_for_all(metadata, "subject_orig")],
9293
axis=1,
@@ -240,7 +241,11 @@ def handle_contentproviders(self, request_id, params):
240241
pattern_annotations = re.compile(r"([A-Za-z]+:[\w'\- ]+);?")
241242

242243

243-
def filter_duplicates(df, service):
244+
def filter_duplicates(df, service, params):
245+
if logger.isEnabledFor(logging.DEBUG):
246+
logger.debug(f"Filtering duplicates for service: {service}")
247+
logger.debug(f"Initial number of records: {len(df)}")
248+
_log_dataframe(df, params, "initial_records")
244249
df.drop_duplicates("id", inplace=True, keep="first")
245250
df["is_anchor"] = False
246251
df["doi_duplicate"] = False
@@ -303,6 +308,9 @@ def filter_duplicates(df, service):
303308
if c in filtered.columns:
304309
filtered.drop(c, axis=1, inplace=True)
305310

311+
if logger.isEnabledFor(logging.DEBUG):
312+
logger.debug(f"Number of records after filtering: {len(filtered)}")
313+
_log_dataframe(filtered, params, "filtered_records")
306314
return filtered
307315

308316

@@ -362,3 +370,22 @@ def sanitize_year(year_str):
362370
sanitized_year = year_str # here we keep the original string
363371

364372
return sanitized_year
373+
374+
def _log_dataframe(df: pd.DataFrame, params: Dict[str, str], name: str, ):
375+
vis_id = params.get('vis_id')
376+
377+
columns_to_print = ['id', 'title', 'doi', 'merged_dois', 'paper_abstract', 'link', 'subject', 'subject_orig', 'oa_state']
378+
379+
available_columns = df.columns.tolist()
380+
columns_to_print = [col for col in columns_to_print if col in available_columns]
381+
382+
transformed = df.copy().reindex(columns=columns_to_print)
383+
384+
transformed = transformed.fillna(value='missing')
385+
386+
# create folder
387+
folder = f'./output/{vis_id}'
388+
if not os.path.exists(folder):
389+
os.makedirs(folder)
390+
file_path = f"{folder}/{name}.csv"
391+
transformed.to_csv(file_path, index=False)

server/workers/orcid/src/orcid_service.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def _log_dataframe(self, df: pd.DataFrame, params: Dict[str, str], name: str, ):
152152

153153
def request_base_metadata(self, dois: List[str], params: Dict[str, str]) -> pd.DataFrame:
154154
orcid = params.get('orcid')
155-
batch_size = 15
155+
batch_size = 20
156156
batches = [dois[i:i + batch_size] for i in range(0, len(dois), batch_size)]
157157
base_metadata = pd.DataFrame(dtype=object)
158158

@@ -185,7 +185,6 @@ def request_base_metadata(self, dois: List[str], params: Dict[str, str]) -> pd.D
185185
'vis_id': request_id,
186186
'limit': 360,
187187
'list_size': 360,
188-
'deduplicate_base': 'false',
189188
'exclude_date_filters': 'true',
190189
'q_advanced_only': 'true'
191190
},

0 commit comments

Comments (0)