-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
61 lines (53 loc) · 2.05 KB
/
scraper.py
File metadata and controls
61 lines (53 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import re
import requests
from bs4 import BeautifulSoup
from config import CINEMA_URLS, HEADERS, DAYS_ABBR
from datetime import datetime
def scrape(output_dir="text/"):
    """
    Scrape the configured cinemas and save each page as plain text.

    For every entry in ``CINEMA_URLS``, fetch the page (up to 3 attempts),
    strip boilerplate tags, lowercase the text, expand day abbreviations
    from ``DAYS_ABBR``, and write the result to
    ``<output_dir>/<cinema>_w<ISO week>.txt``.

    Scraping is skipped entirely when every existing file in *output_dir*
    was last modified during the current ISO week (i.e. the cache is fresh).

    Parameters:
        output_dir: Directory where the text files are cached. Created if
            it does not exist.
    """
    # Create the cache directory BEFORE listing it — the original listed
    # first and crashed with FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    current_week = datetime.now().isocalendar()[1]

    # Re-scrape when the cache is empty, or any cached file was written
    # in a different ISO week than the current one.
    do_scrape = not os.listdir(output_dir)
    if not do_scrape:
        for filename in os.listdir(output_dir):
            file_path = os.path.join(output_dir, filename)
            if os.path.isfile(file_path):
                file_week = datetime.fromtimestamp(os.path.getmtime(file_path)).isocalendar()[1]
                if file_week != current_week:
                    do_scrape = True
                    break
    if not do_scrape:
        return

    for cinema in CINEMA_URLS:
        print(f"Scraping {cinema}...")
        soup = None
        for _ in range(3):
            try:
                response = requests.get(CINEMA_URLS[cinema], headers=HEADERS, timeout=5)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")
                break
            # Timeout is a subclass of RequestException, so a single
            # catch covers connection errors, timeouts, and HTTP errors.
            except requests.exceptions.RequestException as e:
                print(f"Attempt failed: {e}")
        # soup is only assigned after a fully successful fetch + parse,
        # so it alone signals whether all attempts failed.
        if soup is None:
            print(f"Failed to scrape {cinema} after 3 attempts.")
            continue

        # Drop boilerplate markup so get_text() returns listings only.
        for tag in ["head", "footer", "nav", "script"]:
            for element in soup.find_all(tag):
                element.decompose()
        plain_text = soup.get_text(separator="\n", strip=True).lower()

        # Expand day abbreviations (e.g. "mon" -> "monday") on word
        # boundaries so substrings inside other words are left alone.
        for abbr, full in DAYS_ABBR.items():
            plain_text = re.sub(
                r"\b" + re.escape(abbr) + r"\b", full.lower(), plain_text
            )

        with open(
            os.path.join(output_dir, f"{cinema}_w{current_week}.txt"), "w", encoding="utf-8"
        ) as file:
            file.write(plain_text)
        print(f"Scraped {cinema} successfully!")