Commit 286d142

Added Custom Headers to fix User-Agent based rate limiting
1 parent ae04da8 commit 286d142

1 file changed: +119 −36 lines

src/yacht/download_pretrained_ref_db.py

Lines changed: 119 additions & 36 deletions
@@ -6,10 +6,19 @@
 import os
 import json
 import zipfile
+import time
+from tqdm import tqdm
 from .utils import create_output_folder, check_download_args
 # Import global variables
 from .utils import ZENODO_COMMUNITY_URL
 
+# Constants for retry logic
+MAX_RETRIES = 3
+DEFAULT_RETRY_WAIT = 20  # seconds
+
+# Custom headers to avoid User-Agent based rate limiting
+HEADERS = {'User-Agent': 'YACHT'}
+
 # Configure Loguru logger
 logger.remove()
 logger.add(
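
The new HEADERS constant replaces the default python-requests User-Agent, which is what the commit message targets: Zenodo appears to throttle the stock python-requests/x.y.z agent string. A minimal sketch of attaching such a header to a single request; the endpoint is only illustrative and not taken from this module:

import requests

# Same User-Agent string the module now sends; the URL is a placeholder.
HEADERS = {'User-Agent': 'YACHT'}

response = requests.get("https://zenodo.org/api/records", headers=HEADERS)
print(response.status_code, response.request.headers['User-Agent'])

An alternative would be a shared requests.Session with session.headers.update(HEADERS); passing headers= on each call, as the functions below do, keeps the change minimal.
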
@@ -42,26 +51,57 @@ def fetch_zenodo_records():
     logger.info("Fetching list of files from Zenodo community 'yacht'")
     all_records = []
     page = 1
-    try:
-        while True:
-            url = f"{ZENODO_COMMUNITY_URL}&page={page}"
-            response = requests.get(url)
-            response.raise_for_status()
-            data = response.json()
-            hits = data.get("hits", {}).get("hits", [])
-            if not hits:
-                break
-            all_records.extend(hits)
-            # Check if we've fetched all records
-            total = data.get("hits", {}).get("total", 0)
-            if len(all_records) >= total:
-                break
-            page += 1
-        logger.info(f"Fetched {len(all_records)} records from Zenodo")
-        return all_records
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching data from Zenodo: {e}")
-        return []
+
+    while True:
+        url = f"{ZENODO_COMMUNITY_URL}&page={page}"
+
+        # Retry logic for rate limiting
+        for attempt in range(MAX_RETRIES):
+            try:
+                response = requests.get(url, headers=HEADERS)
+
+                # Handle rate limiting (HTTP 429)
+                if response.status_code == 429:
+                    retry_after = int(response.headers.get('retry-after', DEFAULT_RETRY_WAIT))
+                    logger.warning(f"Rate limited by Zenodo (429). Waiting {retry_after} seconds before retry (attempt {attempt + 1}/{MAX_RETRIES})...")
+                    time.sleep(retry_after)
+                    continue
+
+                response.raise_for_status()
+                data = response.json()
+                hits = data.get("hits", {}).get("hits", [])
+                if not hits:
+                    logger.info(f"Fetched {len(all_records)} records from Zenodo")
+                    return all_records
+                all_records.extend(hits)
+                # Check if we've fetched all records
+                total = data.get("hits", {}).get("total", 0)
+                if len(all_records) >= total:
+                    logger.info(f"Fetched {len(all_records)} records from Zenodo")
+                    return all_records
+                page += 1
+                break  # Success, move to next page
+
+            except requests.exceptions.RequestException as e:
+                if attempt < MAX_RETRIES - 1:
+                    wait_time = DEFAULT_RETRY_WAIT * (attempt + 1)  # Exponential backoff
+                    logger.warning(f"Request failed: {e}. Retrying in {wait_time} seconds (attempt {attempt + 1}/{MAX_RETRIES})...")
+                    time.sleep(wait_time)
+                else:
+                    logger.error(f"Error fetching data from Zenodo after {MAX_RETRIES} attempts: {e}")
+                    # Return whatever records we collected so far
+                    if all_records:
+                        logger.warning(f"Returning {len(all_records)} records collected before failure")
+                    return all_records
+        else:
+            # All retries exhausted for this page
+            logger.error(f"Failed to fetch page {page} from Zenodo after {MAX_RETRIES} attempts")
+            # Return whatever records we collected so far (page 1 might have succeeded)
+            if all_records:
+                logger.warning(f"Returning {len(all_records)} records collected before failure")
+            return all_records
+
+    return all_records
 
 
 def generate_download_url(args):
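
Taken together, the rewritten fetch loop honours the Retry-After header when Zenodo answers 429, waits progressively longer after other request failures, and uses a for/else so that exhausting every attempt on a page is handled in one place while still returning whatever records were already collected. A stripped-down sketch of the same retry shape, reduced to a single request; the function name is hypothetical and not part of the module:

import time
import requests

MAX_RETRIES = 3
DEFAULT_RETRY_WAIT = 20  # seconds
HEADERS = {'User-Agent': 'YACHT'}

def get_json_with_retries(url):
    """Illustrative only: GET a URL, honouring 429 Retry-After and backing off on errors."""
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, headers=HEADERS)
            if response.status_code == 429:
                # The server says how long to back off; fall back to a default.
                wait = int(response.headers.get('retry-after', DEFAULT_RETRY_WAIT))
                time.sleep(wait)
                continue
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException:
            if attempt < MAX_RETRIES - 1:
                # Wait a little longer after each failed attempt.
                time.sleep(DEFAULT_RETRY_WAIT * (attempt + 1))
    return None  # every attempt was rate limited or failed

The real function wraps this pattern in the while True pagination loop shown above and keeps partial results instead of returning None.
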
@@ -87,17 +127,55 @@ def download_file(url, output_path):
     if os.path.exists(output_path):
         logger.info(f"File {output_path} already exists. Skipping download.")
         return True
-    try:
-        logger.info(f"Starting download from {url}")
-        response = requests.get(url)
-        response.raise_for_status()
-        with open(output_path, "wb") as file:
-            file.write(response.content)
-        logger.success(f"Downloaded successfully and saved to {output_path}")
-        return True
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Failed to download {url}: {e}")
-        return False
+
+    for attempt in range(MAX_RETRIES):
+        try:
+            logger.info(f"Starting download from {url}")
+
+            # Use streaming to avoid loading entire file into memory
+            response = requests.get(url, stream=True, headers=HEADERS)
+
+            # Handle rate limiting (HTTP 429)
+            if response.status_code == 429:
+                retry_after = int(response.headers.get('retry-after', DEFAULT_RETRY_WAIT))
+                logger.warning(f"Rate limited by Zenodo (429). Waiting {retry_after} seconds before retry (attempt {attempt + 1}/{MAX_RETRIES})...")
+                time.sleep(retry_after)
+                continue
+
+            response.raise_for_status()
+
+            # Get file size for progress bar
+            total_size = int(response.headers.get('content-length', 0))
+            chunk_size = 8192  # 8KB chunks
+
+            # Download with progress bar
+            with open(output_path, "wb") as file:
+                with tqdm(
+                    total=total_size,
+                    unit='B',
+                    unit_scale=True,
+                    unit_divisor=1024,
+                    desc=os.path.basename(output_path),
+                    disable=total_size == 0  # Disable if size unknown
+                ) as pbar:
+                    for chunk in response.iter_content(chunk_size=chunk_size):
+                        if chunk:
+                            file.write(chunk)
+                            pbar.update(len(chunk))
+
+            logger.success(f"Downloaded successfully and saved to {output_path}")
+            return True
+
+        except requests.exceptions.RequestException as e:
+            if attempt < MAX_RETRIES - 1:
+                wait_time = DEFAULT_RETRY_WAIT * (attempt + 1)  # Exponential backoff
+                logger.warning(f"Download failed: {e}. Retrying in {wait_time} seconds (attempt {attempt + 1}/{MAX_RETRIES})...")
+                time.sleep(wait_time)
+            else:
+                logger.error(f"Failed to download {url} after {MAX_RETRIES} attempts: {e}")
+                return False
+
+    return False
 
 def update_config_file(file_path):
     try:
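
The new download path streams the response instead of holding the whole archive in memory (the old version wrote response.content in one go) and reports progress with a tqdm bar sized from the Content-Length header, disabling the bar when the size is unknown. A self-contained sketch of that streaming pattern; the URL and output filename are placeholders:

import os
import requests
from tqdm import tqdm

url = "https://example.org/large_file.zip"  # placeholder
output_path = "large_file.zip"              # placeholder

with requests.get(url, stream=True, headers={'User-Agent': 'YACHT'}) as response:
    response.raise_for_status()
    total_size = int(response.headers.get('content-length', 0))
    with open(output_path, "wb") as file, tqdm(
        total=total_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
        desc=os.path.basename(output_path),
        disable=total_size == 0,  # hide the bar if the size is unknown
    ) as pbar:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip keep-alive chunks
                file.write(chunk)
                pbar.update(len(chunk))

Writing in 8 KB chunks keeps memory use flat no matter how large the pretrained reference database archive is.
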
@@ -184,11 +262,16 @@ def main(args):
         None,
     )
 
-    if file_url and download_file(file_url, output_path):
-        unzip_file(file_name_to_search, args.outfolder)
-        update_config_file(output_path)
-    else:
-        logger.warning(f"File '{file_name_to_search}' not found in Zenodo records.")
+    if not file_url:
+        logger.error(f"File '{file_name_to_search}' not found in Zenodo records.")
+        sys.exit(1)
+
+    if not download_file(file_url, output_path):
+        logger.error(f"Failed to download '{file_name_to_search}' from Zenodo.")
+        sys.exit(1)
+
+    unzip_file(file_name_to_search, args.outfolder)
+    update_config_file(output_path)
 
 
 if __name__ == "__main__":
