-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
45 lines (35 loc) · 1.17 KB
/
utils.py
File metadata and controls
45 lines (35 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""
Author:
Amin Dziri
"""
def read_initial_urls(path: str) -> list:
    """
    Reads the initial URLs from the initial_urls.txt file.

    Args:
        path (str): relative path to the file

    Returns:
        initial_urls (list): The initial URLs from the file, one entry per
                             line with newline characters removed

    Raises:
        FileNotFoundError: if no file exists at ``path``
        OSError: if the file cannot be read
    """
    # Explicit encoding avoids depending on the platform default; iterating
    # the file object yields one line at a time (the loop variable is a
    # line of text, not a file handle).
    with open(path, 'r', encoding='utf-8') as urls_file:
        initial_urls = [line.replace("\n", "") for line in urls_file]
    return initial_urls
def filter_urls(urls: list, target_site: str) -> list:
    """
    Filters the URLs-List to only contain URLs to webpages with textual
    information and stay on the target webpage.

    Args:
        urls (list): List of all crawled URLs
        target_site (str): Site on which the crawler should focus

    Returns:
        filtered_urls (list): All URLs that stay on the target site and do
                              not point to a known non-text asset
                              (image/script/style/audio file)

    Raises:
        /
    """
    # frozenset gives O(1) membership tests; comparing the lowercased
    # extension also rejects uppercase variants such as ".PNG" or ".JPG",
    # which the original case-sensitive check let through.
    file_endings = frozenset(
        ('png', 'js', 'jpg', 'jpeg', 'gif', 'css', 'php', 'mp3'))
    filtered_urls = []
    for url in urls:
        url_ending = url.split('.')[-1].lower()
        if (url_ending not in file_endings) and (target_site in url):
            filtered_urls.append(url)
    return filtered_urls