-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebtree.py
More file actions
92 lines (78 loc) · 2.8 KB
/
webtree.py
File metadata and controls
92 lines (78 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import requests
from bs4 import BeautifulSoup
import urllib3
import logging
import datetime
import os
import click
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.basicConfig(filename=os.path.join(os.getcwd(), f'webtree.log'), level=logging.INFO, format='')
class Node:
# node_type: 0=root, 1=file, 2=directory, 3=failure
def __init__(self, name, node_type, predecessor):
self.name = name
self.node_type = node_type
self.predecessor = predecessor
self.successors = list()
def add_successor(self, successor):
self.successors.append(successor)
def traverse(node):
request_url = build_url(node)
logging.info(request_url)
fetch_result = fetch_url(request_url)
if not fetch_result[0]:
node.add_successor(Node("? FAILED TO FETCH ?", 3, node))
return node
soup = BeautifulSoup(fetch_result[1], 'html5lib')
all_links = soup.find_all("a")
for link in all_links:
href = str(link.attrs['href'])
if(href.startswith('?') or href.startswith('/') or href.startswith('../')):
# link is for sorting or parent
continue
elif(href.endswith('/')):
# link is to directory
successor = Node(href, 2, node)
node.add_successor(traverse(successor))
else:
# is file
node.add_successor(Node(href, 1, node))
return node
def fetch_url(url):
for i in range(1,3):
try:
resp = requests.get(url, verify=False)
if resp.status_code == 200:
return (True, resp.content)
except:
pass
return (False, "")
# Builds url from a node and its assigned predecessors
def build_url(node):
url = ""
current_node = node
while current_node.predecessor is not None:
url = f"{current_node.name}{ '/' if not current_node.name.endswith('/') else ''}{url}"
current_node = current_node.predecessor
url = f"{current_node.name}{ '/' if not current_node.name.endswith('/') else ''}{url}"
return url
def print_tree(node, prefix=''):
output = f'{prefix}{node.name}\n'
for child in node.successors:
output = output + print_tree(child, "| " + prefix)
return output
@click.command()
@click.argument('url')
@click.argument('outfile')
def entrypoint(url, outfile):
"""Recursively explores the directory listing at URL and writes the resulting tree to OUTFILE"""
logging.info(f'Starting new run at: {datetime.datetime.now()}')
root = Node(url, 0, None)
root= traverse(root)
visualization = print_tree(root)
with open(outfile, 'w+') as f:
f.write(visualization)
print(visualization)
logging.info(f'Finished current run at: {datetime.datetime.now()}')
if __name__ == "__main__":
entrypoint()