-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
45 lines (35 loc) · 1.17 KB
/
utils.py
File metadata and controls
45 lines (35 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""
Author:
Amin Dziri
"""
def read_initial_urls(path: str) -> list:
    """
    Reads the initial URLs from the initial_urls.txt file.

    Args:
        path (str): relative path to the file

    Returns:
        initial_urls (list): The initial URLs from the file, one entry per
                             line with newline characters removed

    Raises:
        FileNotFoundError: if no file exists at ``path``
        OSError: if the file cannot be read
    """
    # Explicit encoding avoids depending on the platform default; iterating
    # the file object yields one line at a time (the loop variable is a
    # line of text, not a file handle).
    with open(path, 'r', encoding='utf-8') as urls_file:
        initial_urls = [line.replace("\n", "") for line in urls_file]
    return initial_urls
def filter_urls(urls: list, target_site: str) -> list:
    """
    Filters the URLs-List to only contain URLs to webpages with textual
    information and stay on the target webpage.

    Args:
        urls (list): List of all crawled URLs
        target_site (str): Site on which the crawler should focus

    Returns:
        filtered_urls (list): All URLs that stay on the target site and do
                              not point to a known non-text asset
                              (image/script/style/audio file)

    Raises:
        /
    """
    # frozenset gives O(1) membership tests; comparing the lowercased
    # extension also rejects uppercase variants such as ".PNG" or ".JPG",
    # which the original case-sensitive check let through.
    file_endings = frozenset(
        ('png', 'js', 'jpg', 'jpeg', 'gif', 'css', 'php', 'mp3'))
    filtered_urls = []
    for url in urls:
        url_ending = url.split('.')[-1].lower()
        if (url_ending not in file_endings) and (target_site in url):
            filtered_urls.append(url)
    return filtered_urls