diff --git a/sportsdataverse/cfb/cfb_loaders.py b/sportsdataverse/cfb/cfb_loaders.py index 415dc436..868854c9 100644 --- a/sportsdataverse/cfb/cfb_loaders.py +++ b/sportsdataverse/cfb/cfb_loaders.py @@ -5,7 +5,7 @@ from typing import List, Callable, Iterator, Union, Optional from sportsdataverse.config import CFB_BASE_URL, CFB_ROSTER_URL, CFB_TEAM_LOGO_URL, CFB_TEAM_SCHEDULE_URL, CFB_TEAM_INFO_URL from sportsdataverse.errors import SeasonNotFoundError -from sportsdataverse.dl_utils import download +from sportsdataverse.dl_utils import download, valid_url def load_cfb_pbp(seasons: List[int]) -> pd.DataFrame: """Load college football play by play data going back to 2003 @@ -20,7 +20,7 @@ def load_cfb_pbp(seasons: List[int]) -> pd.DataFrame: pd.DataFrame: Pandas dataframe containing the play-by-plays available for the requested seasons. Raises: - ValueError: If `season` is less than 2003. + SeasonNotFoundError: If `season` is less than 2003 or data cannot be found. """ data = pd.DataFrame() if type(seasons) is int: @@ -28,6 +28,8 @@ def load_cfb_pbp(seasons: List[int]) -> pd.DataFrame: for i in tqdm(seasons): if int(i) < 2003: raise SeasonNotFoundError("season cannot be less than 2003") + if not valid_url(CFB_BASE_URL.format(season=i)): + raise SeasonNotFoundError(f"We don't seem to have data for the {i} season.") i_data = pd.read_parquet(CFB_BASE_URL.format(season=i), engine='auto', columns=None) #data = data.append(i_data) data = pd.concat([data,i_data],ignore_index=True) @@ -48,7 +50,7 @@ def load_cfb_schedule(seasons: List[int]) -> pd.DataFrame: pd.DataFrame: Pandas dataframe containing the schedule for the requested seasons. Raises: - ValueError: If `season` is less than 2002. + SeasonNotFoundError: If `season` is less than 2002 or data cannot be found. """ data = pd.DataFrame() if type(seasons) is int: @@ -56,6 +58,8 @@ def load_cfb_schedule(seasons: List[int]) -> pd.DataFrame: for i in tqdm(seasons): if int(i) < 2002: raise SeasonNotFoundError("season cannot be less than 2002") + if not valid_url(CFB_TEAM_SCHEDULE_URL.format(season=i)): + raise SeasonNotFoundError(f"We don't seem to have data for the {i} season.") i_data = pd.read_parquet(CFB_TEAM_SCHEDULE_URL.format(season = i), engine='auto', columns=None) #data = data.append(i_data) data = pd.concat([data,i_data],ignore_index=True) @@ -77,7 +81,7 @@ def load_cfb_rosters(seasons: List[int]) -> pd.DataFrame: pd.DataFrame: Pandas dataframe containing rosters available for the requested seasons. Raises: - ValueError: If `season` is less than 2014. + SeasonNotFoundError: If `season` is less than 2014. """ data = pd.DataFrame() if type(seasons) is int: @@ -105,7 +109,7 @@ def load_cfb_team_info(seasons: List[int]) -> pd.DataFrame: pd.DataFrame: Pandas dataframe containing the team info available for the requested seasons. Raises: - ValueError: If `season` is less than 2002. + SeasonNotFoundError: If `season` is less than 2002. """ data = pd.DataFrame() if type(seasons) is int: diff --git a/sportsdataverse/dl_utils.py b/sportsdataverse/dl_utils.py index 56bc8cf9..fc95a6fa 100644 --- a/sportsdataverse/dl_utils.py +++ b/sportsdataverse/dl_utils.py @@ -183,4 +183,26 @@ def camelize(string, uppercase_first_letter=True): if uppercase_first_letter: return re.sub(r"(?:^|_)(.)", lambda m: m.group(1).upper(), string) else: - return string[0].lower() + camelize(string)[1:] \ No newline at end of file + return string[0].lower() + camelize(string)[1:] + +def valid_url(url, params = {}, num_retries=15): + try: + req = requests.head(url) + if hasattr(req, 'status_code'): + if req.status_code < 400: + # Successful responses and redirections are valid + return True + elif 500 <= req.status_code < 600: + time.sleep(2) + # recursively retry 5xx HTTP errors + return valid_url(url, params, num_retries=num_retries-1) + else: + # 4xx HTTP responses + return False + except requests.ConnectTimeout as e: + # Per docs, requests that produced this error are safe to retry. + return valid_url(url, params, num_retries=num_retries-1) + except Exception: + print('Invalid URL:', url) + return False + return False \ No newline at end of file