-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler.py
More file actions
213 lines (165 loc) · 7.54 KB
/
crawler.py
File metadata and controls
213 lines (165 loc) · 7.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
"""
AI-powered web crawler that can navigate websites intelligently
Uses Playwright for browser automation and LiteLLM for AI decisions
"""
import asyncio
import json
from typing import List, Dict, Any, Optional
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright, Page, Browser
from bs4 import BeautifulSoup
from loguru import logger
import litellm
from config import settings
class AIWebCrawler:
    """
    Intelligent web crawler that uses AI to navigate and extract information.
    Unlike simple HTML fetchers, this actually navigates the site like a browser.

    Use as an async context manager::

        async with AIWebCrawler() as crawler:
            data = await crawler.crawl_website("https://example.com")
    """

    def __init__(self):
        # Chromium browser handle; created in __aenter__, None until then.
        self.browser: Optional[Browser] = None
        # Playwright driver handle, kept so __aexit__ can stop it.
        # (Previously this was a local in __aenter__ and the driver
        # process leaked because .stop() was never called.)
        self._playwright = None
        # URLs already fetched; reset at the start of each crawl_website call.
        self.visited_urls: set = set()

    async def __aenter__(self):
        """Async context manager entry: start Playwright and launch headless Chromium."""
        self._playwright = await async_playwright().start()
        self.browser = await self._playwright.chromium.launch(headless=True)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close the browser and stop the Playwright driver."""
        if self.browser:
            await self.browser.close()
            self.browser = None
        if self._playwright:
            # Stopping the driver terminates its subprocess; without this the
            # driver keeps running after the crawler is done.
            await self._playwright.stop()
            self._playwright = None

    async def crawl_website(self, url: str, max_pages: int = 10) -> Dict[str, Any]:
        """
        Intelligently crawl a website using AI to navigate.

        Args:
            url: Starting URL to crawl
            max_pages: Maximum number of pages to visit

        Returns:
            Dictionary containing extracted information. On failure the
            partial result is returned with an "error" key added.

        Raises:
            RuntimeError: If called outside the async context manager.
        """
        logger.info(f"Starting AI-powered crawl of {url}")

        if not self.browser:
            raise RuntimeError("Browser not initialized. Use async context manager.")

        self.visited_urls = set()
        crawl_data = {
            "url": url,
            "pages_crawled": [],
            "main_content": "",
            "metadata": {}
        }

        try:
            page = await self.browser.new_page()
            try:
                await page.set_extra_http_headers({"User-Agent": settings.user_agent})
                # Start with the homepage; _crawl_page recurses into selected links.
                await self._crawl_page(page, url, crawl_data, max_pages)
            finally:
                # Always release the page, even when _crawl_page raises
                # (the original only closed it on the success path).
                await page.close()
        except Exception as e:
            logger.error(f"Error crawling {url}: {e}")
            crawl_data["error"] = str(e)

        return crawl_data

    async def _crawl_page(self, page: Page, url: str, crawl_data: Dict, max_pages: int):
        """Recursively crawl pages using AI navigation.

        Mutates crawl_data in place: appends a per-page record and accumulates
        text into "main_content". Stops when max_pages is reached or the URL
        was already visited.
        """
        if len(self.visited_urls) >= max_pages or url in self.visited_urls:
            return

        self.visited_urls.add(url)
        logger.info(f"Crawling page {len(self.visited_urls)}/{max_pages}: {url}")

        try:
            # Navigate to the page; crawl_timeout is in seconds, goto wants ms.
            response = await page.goto(url, timeout=settings.crawl_timeout * 1000, wait_until="networkidle")
            if not response or response.status != 200:
                logger.warning(f"Failed to load {url}: Status {response.status if response else 'None'}")
                return

            # Give late JS-rendered content a moment to appear.
            await page.wait_for_timeout(2000)

            # Extract page content.
            content = await page.content()
            soup = BeautifulSoup(content, 'lxml')

            # Remove scripts, styles, and navigation elements so only the
            # main body text is kept.
            for tag in soup(['script', 'style', 'nav', 'header', 'footer']):
                tag.decompose()

            # Get text content and normalize whitespace to single spaces.
            text_content = soup.get_text(separator=' ', strip=True)
            text_content = ' '.join(text_content.split())

            # Store page data.
            page_data = {
                "url": url,
                "title": await page.title(),
                "content": text_content[:5000],  # First 5000 chars
                "full_content": text_content
            }
            crawl_data["pages_crawled"].append(page_data)

            # Accumulate main content across all pages.
            crawl_data["main_content"] += "\n\n" + text_content

            # Use AI to decide which links to follow.
            if len(self.visited_urls) < max_pages:
                important_links = await self._ai_select_links(page, url, soup)

                # Follow at most 3 links, never exceeding the page budget.
                for link in important_links[:min(3, max_pages - len(self.visited_urls))]:
                    if len(self.visited_urls) < max_pages:
                        await self._crawl_page(page, link, crawl_data, max_pages)

        except Exception as e:
            logger.error(f"Error crawling page {url}: {e}")

    async def _ai_select_links(self, page: Page, current_url: str, soup: BeautifulSoup) -> List[str]:
        """
        Use AI to intelligently select which links to follow.
        This is what makes it "AI-powered navigation".

        Returns up to 3 same-domain, not-yet-visited URLs. Falls back to
        simple keyword scoring when the LLM call or its output parsing fails.
        """
        # Extract all candidate links.
        links = []
        base_domain = urlparse(current_url).netloc

        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            full_url = urljoin(current_url, href)
            parsed = urlparse(full_url)
            # Only follow links on the same domain that we haven't seen yet.
            if parsed.netloc == base_domain and full_url not in self.visited_urls:
                link_text = a_tag.get_text(strip=True)
                links.append({
                    "url": full_url,
                    "text": link_text,
                    "context": str(a_tag.parent)[:200] if a_tag.parent else ""
                })

        if not links:
            return []

        # Limit to most promising links to keep the prompt small.
        links = links[:20]

        # Use AI to rank links by importance.
        try:
            prompt = f"""You are helping to crawl a company website to understand their values and business.
Current page: {current_url}
Available links to follow:
{json.dumps(links[:15], indent=2)}
Select the 3 MOST IMPORTANT links to follow to learn about:
- Company values, mission, vision
- About us / Company information
- Services or products
- Team or culture pages
Return ONLY a JSON array of the 3 most important URLs, like:
["url1", "url2", "url3"]
"""
            response = litellm.completion(
                model=settings.llm_model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=200
            )
            result = response.choices[0].message.content.strip()

            # Models often wrap JSON in markdown fences; strip them first.
            if result.startswith("```"):
                result = result.strip("`")
                if result.lower().startswith("json"):
                    result = result[4:]
                result = result.strip()

            # Parse JSON response and validate it: only accept same-domain
            # string URLs we have not visited, so a hallucinated URL cannot
            # derail the crawl. Anything else falls through to the fallback.
            if result.startswith('['):
                selected_urls = json.loads(result)
                valid_urls = [
                    u for u in selected_urls
                    if isinstance(u, str)
                    and urlparse(u).netloc == base_domain
                    and u not in self.visited_urls
                ]
                if valid_urls:
                    return valid_urls[:3]
        except Exception as e:
            logger.warning(f"AI link selection failed, using fallback: {e}")

        # Fallback: score links by how many navigation keywords they contain.
        keywords = ['about', 'values', 'company', 'team', 'culture', 'mission', 'who-we-are']
        scored_links = []
        for link in links:
            score = sum(1 for kw in keywords if kw in link['url'].lower() or kw in link['text'].lower())
            scored_links.append((score, link['url']))

        scored_links.sort(reverse=True, key=lambda x: x[0])
        return [url for _, url in scored_links[:3]]
async def crawl_single_site(url: str, max_pages: int = 8) -> Dict[str, Any]:
    """Crawl a single site with a freshly created AIWebCrawler.

    Args:
        url: Starting URL to crawl.
        max_pages: Maximum number of pages to visit (default 8, matching
            the previous hard-coded limit; now overridable by callers).

    Returns:
        The crawl-result dictionary produced by AIWebCrawler.crawl_website.
    """
    async with AIWebCrawler() as crawler:
        return await crawler.crawl_website(url, max_pages=max_pages)