-
-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathconfig.default
More file actions
329 lines (310 loc) · 14.4 KB
/
config.default
File metadata and controls
329 lines (310 loc) · 14.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
---
# ==========================
# CROWler LOCAL config (annotated)
# ==========================
version: "1.0.0"  # Freeform version of YOUR config (semver pattern); quoted so tooling never retypes it
author: default  # Who authored/maintains this config
description: Default LOCAL configuration for The CROWler
created_at: "2025-08-13T00:00:00"  # Accepts multiple date/time formats per schema regex; quoted to stay a string
# --------------------------
# Database (required in LOCAL mode)
# --------------------------
database:
  type: postgres  # DB engine: postgres | mysql | sqlite (postgres recommended)
  host: ${DOCKER_DB_HOST}  # DB host (FQDN/IP/env-var allowed by regex in schema)
  port: 5432  # DB port (1..65535)
  user: ${CROWLER_DB_USER}  # DB username
  password: ${CROWLER_DB_PASSWORD}  # DB password (can use envsubst at runtime)
  dbname: SitesIndex  # DB name (example value from schema)
  retry_time: 10  # Seconds before retrying a failed DB connection (>=5)
  ping_time: 10  # Seconds between liveness pings (>=5)
  sslmode: disable  # enable | disable | "" (empty) per enum
  optimize_for: write  # write | query | none | "" - pick for workload pattern
  max_conns: 50  # Max open connections (>=25)
  max_idle_conns: 30  # Idle pool size (>=25; ~25-30% of max_conns is typical)
# --------------------------
# Crawler Engine behavior
# --------------------------
crawler:
  query_timer: 30  # Seconds between DB polls for new Sources
  workers: 3  # Concurrent workers; use >=3 when network discovery is enabled
  platform: desktop  # desktop | mobile (mobile is experimental per docs)
  browser_platform: linux  # darwin | windows | linux | random - affects UA selection
  interval: random(1, 3)  # Human Behavior Simulation pacing; supports ExprTerpreter strings
  timeout: 30  # Page fetch/render timeout (seconds)
  crawling_interval: 3 days  # Re-crawl cadence per Source (e.g., "1 hour", "3 days")
  crawling_if_error: 15 minutes  # Retry-after when a crawl fails (string)
  crawling_if_ok: 3 days  # Retry-after when a crawl succeeds (string)
  maintenance: 120  # Engine's internal housekeeping tick (seconds)
  source_screenshot: false  # Save a screenshot per seed/source page
  full_site_screenshot: false  # Attempt broader site screenshots (heavier)
  max_depth: 3  # Max DOM/URL traversal depth; 0 often means unlimited
  #max_links: 0  # Limit links per page
  #max_sources: 0  # Limit number of sources processed per cycle
  delay: random(1, 3)  # Extra per-action delay (ExprTerpreter supported)
  browsing_mode: recursive  # human | fuzzing | recursive
  max_retries: 3  # Per-URL retry budget
  max_requests: 0  # Global cap per run (0 = unlimited)
  change_useragent: never  # never | random | per_source (varies by build)
  force_sec_fetch_site_same_origin: false  # Force Sec-Fetch-Site header to same-origin (anti-fingerprinting)
  reset_cookies_policy: on_start  # on_start | per_domain | never (depends on build)
  no_third_party_cookies: true  # Block third-party cookies in sessions
  request_images: true  # Allow image requests
  request_css: true  # Allow CSS requests
  request_scripts: true  # Allow JS requests (turn off to reduce JS-heavy sites load)
  request_plugins: false  # Allow browser plugins (usually off)
  request_frames: false  # Allow iframes (off reduces surface/fingerprinting)
  prevent_duplicate_urls: true  # De-duplicate URLs during crawl
  collect_html: true  # Persist raw HTML
  collect_images: true  # Persist discovered image metadata/links
  collect_files: false  # Persist non-image file metadata/links
  collect_content: true  # Persist text content extraction
  collect_keywords: true  # Extract keyword metrics
  collect_metatags: true  # Extract <meta> tags
  collect_performance: false  # Collect timing/navigation metrics
  collect_events: false  # Collect DOM/browser events during crawl
  collect_xhr: false  # Collect XHR/fetch requests
  filter_xhr:  # (Optional) deny types for XHRs collection
    - text/empty
    - text/css
    - text/html
    - image/png
    - image/jpeg
    - image/gif
    - image/x-icon
    - image/svg+xml
    - audio/mpeg
    - video/mp4
    - font/woff2
    - font/woff
    - font/ttf
    - application/xhtml+xml
    - application/javascript
    - application/pdf
    - application/msword
    - application/vnd.openxmlformats-officedocument.wordprocessingml.document
    - application/vnd.ms-excel
    - application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
    - application/vnd.ms-powerpoint
    - application/vnd.openxmlformats-officedocument.presentationml.presentation
    - application/zip
    - application/x-tar
    - application/gzip
    - application/x-bzip2
    - application/x-rar-compressed
    - application/x-msdownload
    - application/x-msi
    - application/x-debian-package
    - application/x-rpm
    - application/vnd.android.package-archive
    - application/x-iso9660-image
  collect_links: true  # Collect outgoing links for graph/crawl queue
  create_event_when_done: false  # Emit a final "crawl done" event automatically
  # Control API for the Engine (local admin interface)
  # NOTE(review): nested under crawler per section-comment placement - confirm against schema
  control:
    host: 0.0.0.0  # Bind address for control endpoint
    port: 8443  # Control API port (1..65535)
    timeout: 20  # Control API request timeout (seconds)
    sslmode: disable  # enable | disable
    cert_file: ""  # Required if sslmode=enable
    key_file: ""  # Required if sslmode=enable
    rate_limit: "100,100"  # e.g., "100,100" max 100 req per second with a max burst of 100
    readheader_timeout: 30  # Seconds to read headers
    write_timeout: 30  # Seconds for writes
# --------------------------
# General API (query/search/trigger)
# --------------------------
api:
  host: 0.0.0.0  # Bind address
  port: 8080  # Public API port
  timeout: 30  # Request timeout (seconds)
  sslmode: disable  # enable | disable
  cert_file: ""  # If sslmode=enable
  key_file: ""  # If sslmode=enable
  rate_limit: "1000,1000"  # Throttle (string allowed) 1000 requests per second with a burst of 1000 requests
  readheader_timeout: 30  # Seconds to read headers
  write_timeout: 30  # Seconds to write responses
  enable_console: true  # Enable built-in API console (dev only)
  return_404: false  # Respond 404 for unknown endpoints when true
  content_search: false  # Allow fulltext-like searches on collected content
  return_content: false  # Include raw page content in API results (heavier)
# --------------------------
# Selenium / VDI pool
# Provide 1+ entries; names should match docker-compose service names
# --------------------------
selenium:
  - type: chrome  # chrome | chromium | firefox (depends on images available)
    host: localhost  # Selenium host/container DNS
    port: 4444  # Selenium hub/standalone port
    sslmode: disable  # enable | disable (TLS to Selenium)
    headless: false  # Headless browser sessions
    name: crowler-vdi-1  # Logical VDI name (used by Engine pinning)
    location: ""  # Freeform location tag (e.g., "eu-west")
    language: ""  # Preferred UI language/locale
    path: ""  # Optional: path to browser binary (usually empty in containers)
    driver_path: ""  # Optional: custom driver binary path
    use_service: false  # Use OS/browser service manager where applicable
    download_path: "./downloads"  # Temporary download dir inside the container (exists in image)
    proxy_url: ""  # http(s)://user:pass@host:port - route all traffic via proxy
    sys_manager:  # Optional sidecar/system manager for the VDI
      port: 4445  # Manager port
      timeout: 30  # Request timeout (seconds)
      sslmode: disable  # TLS to manager
# --------------------------
# Network information (DNS/WHOIS/Net/Geo/Service scout)
# Keep disabled unless you need them; some actions can be noisy.
# --------------------------
network_info:
  dns:
    enabled: false  # Enable DNS info gathering (recursive/authoritative)
    timeout: 60  # Seconds per request
    rate_limit: "20/s"  # Throttle expression/string (quoted for consistency with other rate limits)
  whois:
    enabled: false  # WHOIS lookups (can be slow)
    timeout: 60
    rate_limit: "20/s"
  netlookup:
    enabled: false  # General network lookups (reverse, ptr, etc.)
    timeout: 60
    rate_limit: "20/s"
  geo_localization:
    enabled: false  # IP geolocation enrichment (local DB or remote API)
    type: maxmind  # "" | maxmind | ip2location (depends on your setup)
    path: ""  # Path to local DB (e.g., MaxMind). Required if local mode.
    timeout: 60  # Seconds per lookup
    #api_key: ''  # If using a remote API provider
    #sslmode: ''  # "" | enable | disable (TLS to provider)
  service_scout:
    enabled: false  # Network/service scanner (use with caution)
    timeout: 60  # Seconds per scan (>=5)
    #idle_scan:
    #  host: localhost  # Zombie host (only if you know what you're doing)
    #  port: 80  # 1..65535
    ping_scan: false  # ICMP discovery
    connect_scan: false  # TCP connect() scan
    syn_scan: false  # TCP SYN scan
    udp_scan: false  # UDP scan
    no_dns_resolution: false  # Don't resolve names during scan
    service_detection: false  # Try to fingerprint services/versions
    service_db: ""  # Optional DB path for service sigs
    os_finger_print: false  # Try to fingerprint OS
    aggressive_scan: false  # Faster/noisier timings
    script_scan: []  # list of scripts/categories (e.g. default, vuln)
    excluded_hosts: []  # Never scan these (CIDRs/IPs)
    timing_template: T3  # Nmap-like T0..T5; T3 is balanced
    host_timeout: "60"  # Strings allowed; works with ExprTerpreter (e.g., "random(30,90)")
    min_rate: "10"  # Strings allowed; desired packets/sec
    max_retries: 0  # Integer retries
    #source_port: 0  # set specific port as string in some builds (see docs)
    interface: ""  # Network interface name (e.g., eth0)
    spoof_ip: ""  # Source IP spoofing (advanced)
    randomize_hosts: true  # Randomize target order
    data_length: 0  # Payload length (bytes)
    delay: "1"  # Inter-packet delay (string/ExprTerpreter)
    mtu_discovery: true  # Try to discover MTU (evasion)
    scan_flags: SYN  # Raw flags (advanced; e.g., SYN)
    ip_fragment: false  # Fragment IP packets (evasion)
    min_port_number: 1  # Minimum port to scan (>=1)
    max_port_number: 9000  # Maximum port (<=65535; defaults to 9000)
    max_parallelism: 10  # Parallel probes per target
    dns_servers: []  # Optional list of DNS servers (e.g., ["1.1.1.1"])
    proxies: []  # Optional list of proxies for scanning (strings)
# --------------------------
# Events Manager (CEMI)
# --------------------------
events:
  host: 0.0.0.0  # Listen address for Events Manager API
  port: 8082  # Port for events API
  timeout: 30  # Request timeout (seconds)
  sslmode: disable  # enable | disable
  cert_file: ""  # If sslmode=enable
  key_file: ""  # If sslmode=enable
  automatic_events_removal: "on_success"  # When to prune processed events
  rate_limit: "1200,1200"  # Throttle (string allowed) 1200 requests per second with a burst of 1200 requests
  readheader_timeout: 30  # Seconds to read headers
  write_timeout: 30  # Seconds to write responses
# --------------------------
# Storage backends
# --------------------------
image_storage:
  type: local  # local | http | s3-like (depends on deployment)
  path: "./data/images"  # Base path (local) or bucket/prefix (remote)
  timeout: 30  # Request timeout (remote backends)
  #host: ''  # If remote
  #port: 0  # If remote
  #region: ''  # If remote
  #token: ''  # If remote: auth token
  #secret: ''  # If remote: auth secret
  #sslmode: enable  # TLS to storage backend
file_storage:
  type: local  # local | http | s3 (depends on deployment)
  path: "./data/files"
  timeout: 30
  #host: ''  # if remote
  #port: 0  # if remote
  #region: ''  # if remote
  #token: ''  # if remote: auth token
  #secret: ''  # if remote: auth secret
  #sslmode: enable  # TLS to storage backend
# --------------------------
# HTTP header discovery helper (optional)
# --------------------------
http_headers:
  enabled: false  # When true, perform HTTP header/SSL discovery passes
  timeout: 30  # Seconds per request
  follow_redirects: true  # Follow 3xx during discovery
  ssl_discovery:
    enabled: false  # Probe TLS/SSL info (certs, params)
    proxies: []  # Optional list of HTTP proxies (objects if enabled; keep empty here)
# --------------------------
# Rulesets loader (where to load YAML rules)
# --------------------------
rulesets_schema_path: "./schemas/crowler-rulesets-schema.json"  # Path to ruleset JSON Schema (used for rules validation)
rulesets:
  - type: local  # local | http | s3 (you can mix multiple entries)
    path:
      - "./rules/*.yaml"  # Supports wildcards; multiple paths allowed (quoted - bare * is a YAML alias sigil)
    timeout: 30  # Remote fetch timeout (applies to http/s3 types)
    sslmode: disable  # TLS to ruleset distributor (http/s3)
    refresh: 0  # Optional: seconds between refresh checks (0=off)
# --------------------------
# Agents loader (automation flows)
# --------------------------
agents:
  - type: local  # local | http | s3 - where agent files are located
    path:
      - "./agents/*.yaml"  # One or more globs/paths (quoted - bare * is a YAML alias sigil)
    global_parameters: {}  # K/V injected into every agent at runtime
    timeout: 30  # Loader timeout for remote sources
    agents_timeout: 30  # Per-agent execution timeout baseline
    plugins_timeout: 30  # Per-plugin execution timeout baseline
    #sslmode: disable  # if remote
    #host: ''  # For http/s3
    #port: 0  # if remote
    #region: ''  # if remote
    #token: ''  # if remote: auth token
    #secret: ''  # if remote: auth secret
    refresh: 0  # Optional: seconds between refresh checks (0=off)
# --------------------------
# Plugins loader (engine/vdi/event plugins)
# --------------------------
plugins:
  global_parameters: {}  # K/V available to every plugin (e.g., API keys)
  timeout: 30
  plugins_timeout: 30
  locations:
    - type: local  # local | http | s3
      path:
        - "./plugins/"  # Directory or specific files/globs
      timeout: 30
      #sslmode: disable  # TLS to plugin backend
      #host: ''  # If remote repository
      #port: 0  # If remote
      #region: ''  # If remote
      #token: ''  # If remote: auth token
      #secret: ''  # If remote: auth secret
      refresh: 0
# --------------------------
# Debugging
# --------------------------
debug_level: 0 # 0 = silent; higher values increase verbosity (logs can grow quickly)