-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler.py
More file actions
213 lines (165 loc) · 7.54 KB
/
crawler.py
File metadata and controls
213 lines (165 loc) · 7.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
"""
AI-powered web crawler that can navigate websites intelligently
Uses Playwright for browser automation and LiteLLM for AI decisions
"""
import asyncio
import json
from typing import List, Dict, Any, Optional
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright, Page, Browser
from bs4 import BeautifulSoup
from loguru import logger
import litellm
from config import settings
class AIWebCrawler:
    """
    Intelligent web crawler that uses AI to navigate and extract information.
    Unlike simple HTML fetchers, this actually navigates the site like a browser.

    Use as an async context manager::

        async with AIWebCrawler() as crawler:
            data = await crawler.crawl_website("https://example.com")
    """

    def __init__(self):
        # Chromium browser handle; created in __aenter__, None until then.
        self.browser: Optional[Browser] = None
        # Playwright driver handle, kept so __aexit__ can stop it.
        # (Previously this was a local in __aenter__ and the driver
        # process leaked because .stop() was never called.)
        self._playwright = None
        # URLs already fetched; reset at the start of each crawl_website call.
        self.visited_urls: set = set()

    async def __aenter__(self):
        """Async context manager entry: start Playwright and launch headless Chromium."""
        self._playwright = await async_playwright().start()
        self.browser = await self._playwright.chromium.launch(headless=True)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close the browser and stop the Playwright driver."""
        if self.browser:
            await self.browser.close()
            self.browser = None
        if self._playwright:
            # Stopping the driver terminates its subprocess; without this the
            # driver keeps running after the crawler is done.
            await self._playwright.stop()
            self._playwright = None

    async def crawl_website(self, url: str, max_pages: int = 10) -> Dict[str, Any]:
        """
        Intelligently crawl a website using AI to navigate.

        Args:
            url: Starting URL to crawl
            max_pages: Maximum number of pages to visit

        Returns:
            Dictionary containing extracted information. On failure the
            partial result is returned with an "error" key added.

        Raises:
            RuntimeError: If called outside the async context manager.
        """
        logger.info(f"Starting AI-powered crawl of {url}")

        if not self.browser:
            raise RuntimeError("Browser not initialized. Use async context manager.")

        self.visited_urls = set()
        crawl_data = {
            "url": url,
            "pages_crawled": [],
            "main_content": "",
            "metadata": {}
        }

        try:
            page = await self.browser.new_page()
            try:
                await page.set_extra_http_headers({"User-Agent": settings.user_agent})
                # Start with the homepage; _crawl_page recurses into selected links.
                await self._crawl_page(page, url, crawl_data, max_pages)
            finally:
                # Always release the page, even when _crawl_page raises
                # (the original only closed it on the success path).
                await page.close()
        except Exception as e:
            logger.error(f"Error crawling {url}: {e}")
            crawl_data["error"] = str(e)

        return crawl_data

    async def _crawl_page(self, page: Page, url: str, crawl_data: Dict, max_pages: int):
        """Recursively crawl pages using AI navigation.

        Mutates crawl_data in place: appends a per-page record and accumulates
        text into "main_content". Stops when max_pages is reached or the URL
        was already visited.
        """
        if len(self.visited_urls) >= max_pages or url in self.visited_urls:
            return

        self.visited_urls.add(url)
        logger.info(f"Crawling page {len(self.visited_urls)}/{max_pages}: {url}")

        try:
            # Navigate to the page; crawl_timeout is in seconds, goto wants ms.
            response = await page.goto(url, timeout=settings.crawl_timeout * 1000, wait_until="networkidle")
            if not response or response.status != 200:
                logger.warning(f"Failed to load {url}: Status {response.status if response else 'None'}")
                return

            # Give late JS-rendered content a moment to appear.
            await page.wait_for_timeout(2000)

            # Extract page content.
            content = await page.content()
            soup = BeautifulSoup(content, 'lxml')

            # Remove scripts, styles, and navigation elements so only the
            # main body text is kept.
            for tag in soup(['script', 'style', 'nav', 'header', 'footer']):
                tag.decompose()

            # Get text content and normalize whitespace to single spaces.
            text_content = soup.get_text(separator=' ', strip=True)
            text_content = ' '.join(text_content.split())

            # Store page data.
            page_data = {
                "url": url,
                "title": await page.title(),
                "content": text_content[:5000],  # First 5000 chars
                "full_content": text_content
            }
            crawl_data["pages_crawled"].append(page_data)

            # Accumulate main content across all pages.
            crawl_data["main_content"] += "\n\n" + text_content

            # Use AI to decide which links to follow.
            if len(self.visited_urls) < max_pages:
                important_links = await self._ai_select_links(page, url, soup)

                # Follow at most 3 links, never exceeding the page budget.
                for link in important_links[:min(3, max_pages - len(self.visited_urls))]:
                    if len(self.visited_urls) < max_pages:
                        await self._crawl_page(page, link, crawl_data, max_pages)

        except Exception as e:
            logger.error(f"Error crawling page {url}: {e}")

    async def _ai_select_links(self, page: Page, current_url: str, soup: BeautifulSoup) -> List[str]:
        """
        Use AI to intelligently select which links to follow.
        This is what makes it "AI-powered navigation".

        Returns up to 3 same-domain, not-yet-visited URLs. Falls back to
        simple keyword scoring when the LLM call or its output parsing fails.
        """
        # Extract all candidate links.
        links = []
        base_domain = urlparse(current_url).netloc

        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            full_url = urljoin(current_url, href)
            parsed = urlparse(full_url)
            # Only follow links on the same domain that we haven't seen yet.
            if parsed.netloc == base_domain and full_url not in self.visited_urls:
                link_text = a_tag.get_text(strip=True)
                links.append({
                    "url": full_url,
                    "text": link_text,
                    "context": str(a_tag.parent)[:200] if a_tag.parent else ""
                })

        if not links:
            return []

        # Limit to most promising links to keep the prompt small.
        links = links[:20]

        # Use AI to rank links by importance.
        try:
            prompt = f"""You are helping to crawl a company website to understand their values and business.
Current page: {current_url}
Available links to follow:
{json.dumps(links[:15], indent=2)}
Select the 3 MOST IMPORTANT links to follow to learn about:
- Company values, mission, vision
- About us / Company information
- Services or products
- Team or culture pages
Return ONLY a JSON array of the 3 most important URLs, like:
["url1", "url2", "url3"]
"""
            response = litellm.completion(
                model=settings.llm_model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=200
            )
            result = response.choices[0].message.content.strip()

            # Models often wrap JSON in markdown fences; strip them first.
            if result.startswith("```"):
                result = result.strip("`")
                if result.lower().startswith("json"):
                    result = result[4:]
                result = result.strip()

            # Parse JSON response and validate it: only accept same-domain
            # string URLs we have not visited, so a hallucinated URL cannot
            # derail the crawl. Anything else falls through to the fallback.
            if result.startswith('['):
                selected_urls = json.loads(result)
                valid_urls = [
                    u for u in selected_urls
                    if isinstance(u, str)
                    and urlparse(u).netloc == base_domain
                    and u not in self.visited_urls
                ]
                if valid_urls:
                    return valid_urls[:3]
        except Exception as e:
            logger.warning(f"AI link selection failed, using fallback: {e}")

        # Fallback: score links by how many navigation keywords they contain.
        keywords = ['about', 'values', 'company', 'team', 'culture', 'mission', 'who-we-are']
        scored_links = []
        for link in links:
            score = sum(1 for kw in keywords if kw in link['url'].lower() or kw in link['text'].lower())
            scored_links.append((score, link['url']))

        scored_links.sort(reverse=True, key=lambda x: x[0])
        return [url for _, url in scored_links[:3]]
async def crawl_single_site(url: str, max_pages: int = 8) -> Dict[str, Any]:
    """Crawl a single site with a freshly created AIWebCrawler.

    Args:
        url: Starting URL to crawl.
        max_pages: Maximum number of pages to visit (default 8, matching
            the previous hard-coded limit; now overridable by callers).

    Returns:
        The crawl-result dictionary produced by AIWebCrawler.crawl_website.
    """
    async with AIWebCrawler() as crawler:
        return await crawler.crawl_website(url, max_pages=max_pages)