Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions sportsdataverse/cfb/cfb_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List, Callable, Iterator, Union, Optional
from sportsdataverse.config import CFB_BASE_URL, CFB_ROSTER_URL, CFB_TEAM_LOGO_URL, CFB_TEAM_SCHEDULE_URL, CFB_TEAM_INFO_URL
from sportsdataverse.errors import SeasonNotFoundError
from sportsdataverse.dl_utils import download
from sportsdataverse.dl_utils import download, valid_url

def load_cfb_pbp(seasons: List[int]) -> pd.DataFrame:
"""Load college football play by play data going back to 2003
Expand All @@ -20,14 +20,16 @@ def load_cfb_pbp(seasons: List[int]) -> pd.DataFrame:
pd.DataFrame: Pandas dataframe containing the play-by-plays available for the requested seasons.

Raises:
ValueError: If `season` is less than 2003.
SeasonNotFoundError: If `season` is less than 2003 or data cannot be found.
"""
data = pd.DataFrame()
if type(seasons) is int:
seasons = [seasons]
for i in tqdm(seasons):
if int(i) < 2003:
raise SeasonNotFoundError("season cannot be less than 2003")
if not valid_url(CFB_BASE_URL.format(season=i)):
raise SeasonNotFoundError(f"We don't seem to have data for the {i} season.")
i_data = pd.read_parquet(CFB_BASE_URL.format(season=i), engine='auto', columns=None)
#data = data.append(i_data)
data = pd.concat([data,i_data],ignore_index=True)
Expand All @@ -48,14 +50,16 @@ def load_cfb_schedule(seasons: List[int]) -> pd.DataFrame:
pd.DataFrame: Pandas dataframe containing the schedule for the requested seasons.

Raises:
ValueError: If `season` is less than 2002.
SeasonNotFoundError: If `season` is less than 2002 or data cannot be found.
"""
data = pd.DataFrame()
if type(seasons) is int:
seasons = [seasons]
for i in tqdm(seasons):
if int(i) < 2002:
raise SeasonNotFoundError("season cannot be less than 2002")
if not valid_url(CFB_TEAM_SCHEDULE_URL.format(season=i)):
raise SeasonNotFoundError(f"We don't seem to have data for the {i} season.")
i_data = pd.read_parquet(CFB_TEAM_SCHEDULE_URL.format(season = i), engine='auto', columns=None)
#data = data.append(i_data)
data = pd.concat([data,i_data],ignore_index=True)
Expand All @@ -77,7 +81,7 @@ def load_cfb_rosters(seasons: List[int]) -> pd.DataFrame:
pd.DataFrame: Pandas dataframe containing rosters available for the requested seasons.

Raises:
ValueError: If `season` is less than 2014.
SeasonNotFoundError: If `season` is less than 2014.
"""
data = pd.DataFrame()
if type(seasons) is int:
Expand Down Expand Up @@ -105,7 +109,7 @@ def load_cfb_team_info(seasons: List[int]) -> pd.DataFrame:
pd.DataFrame: Pandas dataframe containing the team info available for the requested seasons.

Raises:
ValueError: If `season` is less than 2002.
SeasonNotFoundError: If `season` is less than 2002.
"""
data = pd.DataFrame()
if type(seasons) is int:
Expand Down
24 changes: 23 additions & 1 deletion sportsdataverse/dl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,4 +183,26 @@ def camelize(string, uppercase_first_letter=True):
if uppercase_first_letter:
return re.sub(r"(?:^|_)(.)", lambda m: m.group(1).upper(), string)
else:
return string[0].lower() + camelize(string)[1:]
return string[0].lower() + camelize(string)[1:]

def valid_url(url, params=None, num_retries=15):
    """Check whether a URL responds successfully to an HTTP HEAD request.

    A response with a status code below 400 (success or redirect) counts as
    valid. 5xx responses and connection timeouts are treated as transient
    and retried, but only up to `num_retries` additional times so that a
    persistently failing server cannot keep this function looping.

    Args:
        url (str): The URL to probe.
        params (dict, optional): Unused; kept so existing callers that pass
            it keep working.
        num_retries (int): Maximum number of retries after the first
            attempt. At least one request is always made.

    Returns:
        bool: True if the URL answered with a status code below 400.
        False for any other status code, for a response without a
        `status_code`, when the request raised a non-retryable error, or
        when the retries were exhausted.
    """
    # NOTE(review): requests.head is called without a timeout, so a server
    # that accepts the connection but never answers can still block here.
    retries_left = num_retries
    while True:
        try:
            req = requests.head(url)
        except requests.ConnectTimeout:
            # Per the requests docs, requests that produced this error are
            # safe to retry.
            if retries_left <= 0:
                return False
            retries_left -= 1
            continue
        except Exception:
            print('Invalid URL:', url)
            return False

        status = getattr(req, 'status_code', None)
        if status is None:
            return False
        if status < 400:
            # Successful responses and redirections are valid.
            return True
        if 500 <= status < 600:
            # Server-side errors may be transient: back off briefly and
            # retry, but give up once the retry budget is spent.
            if retries_left <= 0:
                return False
            retries_left -= 1
            time.sleep(2)
            continue
        # 4xx (and any other non-success) responses are definitive.
        return False