# NOTE(review): SOURCE was a scraped GitHub diff page (fused line numbers,
# +/- markers, "0 commit comments" footer), not a runnable file. Reconstructed
# below is the NEW side of the hunk — the env vars and `run:` script of the
# broken-link-check step — with these concrete fixes applied:
#   1. $blc_exit_code was captured but never consulted (dead variable): a blc
#      crash with no parseable BROKEN lines previously reported success.
#   2. Restored the old code's rewrite of the temporary 127.0.0.1 URL to the
#      public site URL in *reports* (verification still hits the reachable
#      temporary URL).
#   3. Replaced non-portable `echo -e` + hand-built "\n" strings with printf
#      and real newlines.
#   4. Quoted expansions; `local http_code`; grep -F for a fixed string.
# Indentation of the YAML context lines is assumed — TODO confirm against the
# full workflow file.
          TEMPORARY_WEBSITE_URL: 'http://127.0.0.1:8080'
          ACTUAL_WEBSITE_URL: 'https://ddmal.ca/Neon/'
        run: |
          echo "Running broken link check with rate limiting..."

          # blc flags:
          #   --filter-level 3  check all link types, including metadata links
          #   --ordered         check links sequentially (less bursty per host)
          #   --get             GET instead of HEAD (some servers reject HEAD)
          #   --user-agent      realistic browser UA (some hosts block defaults)
          #   --host-requests 1 one in-flight request per host (avoids HTTP 429)
          set +e  # blc exits non-zero on broken links; we handle that ourselves
          blc "$TEMPORARY_WEBSITE_URL" \
            --filter-level 3 \
            --ordered \
            --get \
            --user-agent "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \
            --host-requests 1 \
            --recursive \
            --verbose \
            > /tmp/blc_output.txt 2>&1
          blc_exit_code=$?
          set -e

          # Display the full checker output for the CI log.
          cat /tmp/blc_output.txt

          # All lines blc flagged as broken (fixed string, so grep -F).
          all_broken_links=$(grep -F "├─BROKEN─" /tmp/blc_output.txt || true)

          # FIX: blc_exit_code used to be captured and then ignored. If blc
          # itself failed without reporting any broken links (crash, bad args,
          # site unreachable), fail the step instead of silently passing.
          if [ "$blc_exit_code" -ne 0 ] && [ -z "$all_broken_links" ]; then
            echo "❌ blc exited with code $blc_exit_code but reported no broken links — aborting"
            exit "$blc_exit_code"
          fi

          echo ""
          echo "=== Broken Links Found by blc ==="
          if [ -n "$all_broken_links" ]; then
            # Report the public site URL, not the temporary local server.
            echo "$all_broken_links" | sed "s|$TEMPORARY_WEBSITE_URL|$ACTUAL_WEBSITE_URL|g"
          else
            echo "None"
          fi

          # verify_with_curl URL
          # Re-check one URL with curl (3 retries, 5 s apart, browser UA).
          # Returns 0 on success; also returns 0 on HTTP 429, which is treated
          # as "rate limited, not broken" and recorded in $rate_limited_urls.
          # Returns 1 (a verified failure) for anything else.
          verify_with_curl() {
            local url="$1"
            echo "  🔄 Verifying: $url"

            local http_code
            http_code=$(curl -L -s -o /dev/null -w "%{http_code}" \
              -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \
              --connect-timeout 30 --max-time 60 --insecure \
              --retry 3 --retry-delay 5 --retry-all-errors \
              "$url" 2>/dev/null || echo "000")

            case "$http_code" in
              200|301|302|303)
                echo "    ✅ Success: HTTP $http_code"
                return 0
                ;;
              429)
                echo "    ⚠️ HTTP 429 (rate limited)"
                rate_limited_urls="${rate_limited_urls}${url}"$'\n'
                return 0
                ;;
              *)
                echo "    ❌ Failed: HTTP $http_code"
                return 1
                ;;
            esac
          }

          # Second pass: confirm each blc-reported breakage with curl so that
          # transient failures / bot-blocking do not fail CI spuriously.
          verified_failures=""
          rate_limited_urls=""

          if [ -n "$all_broken_links" ]; then
            echo ""
            echo "=== Verifying Links with curl ==="

            # Pull the raw URLs out of blc's tree-formatted output.
            urls_to_verify=$(echo "$all_broken_links" | sed -n 's/.*├─BROKEN─ \(https\?:\/\/[^[:space:]]*\).*/\1/p')

            while IFS= read -r url; do
              [ -z "$url" ] && continue
              if ! verify_with_curl "$url"; then
                verified_failures="${verified_failures}${url}"$'\n'
              fi
            done <<< "$urls_to_verify"
          fi

          # Final decision: fail only on curl-confirmed breakages.
          echo ""
          if [ -n "$verified_failures" ]; then
            echo "❌ CI Failed: The following links failed:"
            # FIX: was `echo -e` over hand-built "\n" escapes; the list already
            # contains real newlines, and localhost URLs are mapped to the
            # public site for the report.
            printf '%s' "$verified_failures" | sed "s|$TEMPORARY_WEBSITE_URL|$ACTUAL_WEBSITE_URL|g"
            exit 1
          else
            if [ -n "$rate_limited_urls" ]; then
              echo "⚠️ Note: These links returned HTTP 429 (rate limited, not broken):"
              printf '%s' "$rate_limited_urls"
            fi
            echo "✅ CI Passed: All links verified successfully"
            exit 0
          fi