66import os
77import json
88import zipfile
9+ import time
10+ from tqdm import tqdm
911from .utils import create_output_folder , check_download_args
1012# Import global variables
1113from .utils import ZENODO_COMMUNITY_URL
1214
# Retry policy for Zenodo HTTP requests.
MAX_RETRIES = 3
DEFAULT_RETRY_WAIT = 20  # seconds

# Custom headers to avoid User-Agent based rate limiting.
HEADERS = {'User-Agent': 'YACHT'}
1322# Configure Loguru logger
1423logger .remove ()
1524logger .add (
def fetch_zenodo_records():
    """Fetch all records from the Zenodo community 'yacht', paging until done.

    Each page is retried up to ``MAX_RETRIES`` times. HTTP 429 responses are
    honoured by sleeping for the server's ``retry-after`` value (falling back
    to ``DEFAULT_RETRY_WAIT``); other request failures back off linearly.

    Returns:
        list: Every record dict fetched. If a page ultimately fails, the
        records collected so far are returned (an empty list if none).
    """
    logger.info("Fetching list of files from Zenodo community 'yacht'")
    all_records = []
    page = 1

    while True:
        url = f"{ZENODO_COMMUNITY_URL}&page={page}"
        page_fetched = False

        # Retry logic for rate limiting and transient request failures.
        for attempt in range(MAX_RETRIES):
            try:
                response = requests.get(url, headers=HEADERS)

                # Handle rate limiting (HTTP 429).
                if response.status_code == 429:
                    retry_after = int(response.headers.get('retry-after', DEFAULT_RETRY_WAIT))
                    logger.warning(f"Rate limited by Zenodo (429). Waiting {retry_after} seconds before retry (attempt {attempt + 1}/{MAX_RETRIES})...")
                    time.sleep(retry_after)
                    continue

                response.raise_for_status()
                data = response.json()
                hits = data.get("hits", {}).get("hits", [])
                if not hits:
                    # An empty page means we have walked past the last record.
                    logger.info(f"Fetched {len(all_records)} records from Zenodo")
                    return all_records
                all_records.extend(hits)

                # Stop early once the reported total has been collected.
                total = data.get("hits", {}).get("total", 0)
                if len(all_records) >= total:
                    logger.info(f"Fetched {len(all_records)} records from Zenodo")
                    return all_records

                page += 1
                page_fetched = True
                break  # Success, move to next page

            except requests.exceptions.RequestException as e:
                if attempt < MAX_RETRIES - 1:
                    wait_time = DEFAULT_RETRY_WAIT * (attempt + 1)  # Linear backoff
                    logger.warning(f"Request failed: {e}. Retrying in {wait_time} seconds (attempt {attempt + 1}/{MAX_RETRIES})...")
                    time.sleep(wait_time)
                else:
                    logger.error(f"Error fetching data from Zenodo after {MAX_RETRIES} attempts: {e}")

        if not page_fetched:
            # All retries for this page were exhausted (repeated 429s or
            # request errors). BUG FIX: the previous version fell back into
            # the while-loop when no records had been collected yet, retrying
            # the same page forever; always bail out here instead.
            logger.error(f"Failed to fetch page {page} from Zenodo after {MAX_RETRIES} attempts")
            if all_records:
                logger.warning(f"Returning {len(all_records)} records collected before failure")
            return all_records
65105
66106
67107def generate_download_url (args ):
def download_file(url, output_path):
    """Download ``url`` to ``output_path`` with streaming, progress, retries.

    Skips the download when the destination file already exists. HTTP 429
    responses are honoured via the server's ``retry-after`` header; other
    request failures back off linearly up to ``MAX_RETRIES`` attempts.
    A partially written file is removed on failure so a later run cannot
    mistake a truncated download for a complete one.

    Args:
        url (str): Source URL to download.
        output_path (str): Destination file path.

    Returns:
        bool: True on success (or when the file already exists), else False.
    """
    if os.path.exists(output_path):
        logger.info(f"File {output_path} already exists. Skipping download.")
        return True

    for attempt in range(MAX_RETRIES):
        try:
            logger.info(f"Starting download from {url}")

            # Stream so large archives are never held fully in memory.
            # BUG FIX: with stream=True the response must be closed; the
            # with-block guarantees the connection is released on every path.
            with requests.get(url, stream=True, headers=HEADERS) as response:

                # Handle rate limiting (HTTP 429).
                if response.status_code == 429:
                    retry_after = int(response.headers.get('retry-after', DEFAULT_RETRY_WAIT))
                    logger.warning(f"Rate limited by Zenodo (429). Waiting {retry_after} seconds before retry (attempt {attempt + 1}/{MAX_RETRIES})...")
                    time.sleep(retry_after)
                    continue

                response.raise_for_status()

                # Get file size for progress bar (0 if the server omits it).
                total_size = int(response.headers.get('content-length', 0))
                chunk_size = 8192  # 8KB chunks

                # Download with progress bar.
                with open(output_path, "wb") as file, tqdm(
                    total=total_size,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=os.path.basename(output_path),
                    disable=total_size == 0,  # Disable if size unknown
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))

            logger.success(f"Downloaded successfully and saved to {output_path}")
            return True

        except requests.exceptions.RequestException as e:
            # BUG FIX: remove any partial file so the "already exists" fast
            # path above cannot treat a truncated download as complete on a
            # subsequent run.
            if os.path.exists(output_path):
                try:
                    os.remove(output_path)
                except OSError:
                    logger.warning(f"Could not remove partial file {output_path}")
            if attempt < MAX_RETRIES - 1:
                wait_time = DEFAULT_RETRY_WAIT * (attempt + 1)  # Linear backoff
                logger.warning(f"Download failed: {e}. Retrying in {wait_time} seconds (attempt {attempt + 1}/{MAX_RETRIES})...")
                time.sleep(wait_time)
            else:
                logger.error(f"Failed to download {url} after {MAX_RETRIES} attempts: {e}")
                return False

    # All attempts were consumed by 429 responses.
    return False
101179
102180def update_config_file (file_path ):
103181 try :
@@ -184,11 +262,16 @@ def main(args):
184262 None ,
185263 )
186264
187- if file_url and download_file (file_url , output_path ):
188- unzip_file (file_name_to_search , args .outfolder )
189- update_config_file (output_path )
190- else :
191- logger .warning (f"File '{ file_name_to_search } ' not found in Zenodo records." )
265+ if not file_url :
266+ logger .error (f"File '{ file_name_to_search } ' not found in Zenodo records." )
267+ sys .exit (1 )
268+
269+ if not download_file (file_url , output_path ):
270+ logger .error (f"Failed to download '{ file_name_to_search } ' from Zenodo." )
271+ sys .exit (1 )
272+
273+ unzip_file (file_name_to_search , args .outfolder )
274+ update_config_file (output_path )
192275
193276
194277if __name__ == "__main__" :
0 commit comments