Skip to content

Commit 9d82e11

Browse files
committed
refactor: clean up link checker and curl verification
1 parent 6350d12 commit 9d82e11

File tree

1 file changed

+77
-97
lines changed

1 file changed

+77
-97
lines changed

.github/workflows/broken-link-checker.yml

Lines changed: 77 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -40,115 +40,95 @@ jobs:
4040
TEMPORARY_WEBSITE_URL: 'http://127.0.0.1:8080'
4141
ACTUAL_WEBSITE_URL: 'https://ddmal.ca/Neon/'
4242
run: |
43-
# Function to retry URLs with retryable errors
44-
retry_urls() {
45-
local urls="$1"
46-
while IFS= read -r url; do
47-
[ -z "$url" ] && continue
48-
echo "🔄 Retrying: $url"
49-
50-
for attempt in 1 2 3; do
51-
echo " Attempt $attempt/3..."
52-
http_code=$(curl -L -s -o /dev/null -w "%{http_code}" \
53-
-H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \
54-
--connect-timeout 30 --max-time 60 --insecure "$url" 2>/dev/null)
55-
56-
if echo "$http_code" | grep -E "^(200|301|302|303)$" > /dev/null; then
57-
echo " ✅ Success! HTTP $http_code"
58-
echo "RETRY_SUCCESS:$url" >> /tmp/retry_results
59-
break
60-
elif [ $attempt -eq 3 ]; then
61-
echo " ❌ Failed after 3 attempts (HTTP $http_code)"
62-
echo "RETRY_FAILED:$url" >> /tmp/retry_results
63-
else
64-
echo " ⏳ Failed with HTTP $http_code, retrying in 5 seconds..."
65-
sleep 5
66-
fi
67-
done
68-
echo ""
69-
done <<< "$urls"
70-
}
71-
72-
# Initialize retry results file
73-
> /tmp/retry_results
74-
75-
# Run broken link checker and filter output
76-
echo "Running broken link check..."
77-
output=$(blc $TEMPORARY_WEBSITE_URL --filter-level=3 | \
78-
grep -v -E '├───OK───|└───OK───' | \
79-
awk '
80-
BEGIN { buf="" }
81-
/^Getting links from:/ { buf=$0; next }
82-
/^Finished!.*0 broken\./ {
83-
if (length(buf)>0) { buf=""; next }
84-
}
85-
{
86-
if(length(buf)>0) print buf
87-
if (NF > 0) print
88-
buf=""
89-
}
90-
/^Finished!/ { print "" }
91-
' | sed "s|$TEMPORARY_WEBSITE_URL|$ACTUAL_WEBSITE_URL|g")
92-
93-
echo "Initial link check results:"
94-
echo "$output"
95-
96-
# Handle retryable errors
97-
retryable_urls=$(echo "$output" | grep -E "(BLC_UNKNOWN|HTTP_429)" | \
98-
sed -n 's/.*├─BROKEN─ \(https\?:\/\/[^[:space:]]*\).*/\1/p')
99-
100-
if [ -n "$retryable_urls" ]; then
101-
echo ""
102-
echo "🔄 Found URLs with retryable errors, starting retry process..."
103-
retry_urls "$retryable_urls"
echo "Running broken link check with rate limiting..."

# Run blc with CLI options to avoid rate limiting
# --filter-level 3: Check all link types including metadata
# --ordered: Check links sequentially (helps avoid rate limiting)
# --get: Use GET requests instead of HEAD (more compatible)
# --user-agent: Use realistic browser user agent
# --host-requests 1: Limit to 1 concurrent request per host (key for avoiding 429)
set +e  # Don't exit on blc failure, we'll handle it
# Quote the URL (SC2086): env values should never be word-split or globbed.
blc "$TEMPORARY_WEBSITE_URL" \
  --filter-level 3 \
  --ordered \
  --get \
  --user-agent "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \
  --host-requests 1 \
  --recursive \
  --verbose \
  > /tmp/blc_output.txt 2>&1
# NOTE(review): blc_exit_code is captured for debugging/future use only;
# pass/fail is decided below from curl re-verification, not from blc itself.
blc_exit_code=$?
set -e

# Display the output
cat /tmp/blc_output.txt

# Get all broken links (|| true: no matches is a valid, non-fatal outcome)
all_broken_links=$(grep -E "├─BROKEN─" /tmp/blc_output.txt || true)
10469
105-
# Show retry summary
106-
success_count=$(grep -c "^RETRY_SUCCESS:" /tmp/retry_results 2>/dev/null || echo "0")
107-
failed_count=$(grep -c "^RETRY_FAILED:" /tmp/retry_results 2>/dev/null || echo "0")
108-
echo "📊 Retry Summary: $success_count succeeded, $failed_count failed"
# Report everything blc flagged, before curl re-verification.
echo ""
echo "=== Broken Links Found by blc ==="
# Empty (or unset) result prints the literal "None" via default expansion.
echo "${all_broken_links:-None}"
11077
111-
# Determine final status
112-
has_errors=false
# Re-check a single URL with curl (follows redirects, browser user agent,
# built-in retries). Appends rate-limited URLs to the caller's
# rate_limited_urls variable.
# Returns: 0 when the URL answers 200/301/302/303 or 429 (rate limited,
#          treated as "not broken"); 1 for a confirmed failure.
verify_with_curl() {
  local url="$1"
  local http_code
  echo " 🔄 Verifying: $url"

  # Bug fix: curl prints the -w "%{http_code}" output (000) even when the
  # transfer fails AND exits non-zero, so the old `|| echo "000"` fallback
  # could produce a two-line "000\n000" value. Capture first, default after.
  http_code=$(curl -L -s -o /dev/null -w "%{http_code}" \
    -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \
    --connect-timeout 30 --max-time 60 --insecure \
    --retry 3 --retry-delay 5 --retry-all-errors \
    "$url" 2>/dev/null) || true
  http_code=${http_code:-000}

  if echo "$http_code" | grep -E "^(200|301|302|303)$" > /dev/null; then
    echo " ✅ Success: HTTP $http_code"
    return 0
  elif [ "$http_code" = "429" ]; then
    # Rate limiting is not a broken link; record it for the final summary.
    echo " ⚠️ HTTP 429 (rate limited)"
    rate_limited_urls="${rate_limited_urls}${url}\n"
    return 0
  else
    echo " ❌ Failed: HTTP $http_code"
    return 1
  fi
}
113101
# Re-verify every blc-reported broken link with curl before failing CI.
verified_failures=""
rate_limited_urls=""

if [ -n "$all_broken_links" ]; then
  echo ""
  echo "=== Verifying Links with curl ==="

  # Pull the bare URLs out of blc's "├─BROKEN─ <url>" report lines.
  urls_to_verify=$(printf '%s\n' "$all_broken_links" | sed -n 's/.*├─BROKEN─ \(https\?:\/\/[^[:space:]]*\).*/\1/p')

  # Here-string keeps the loop in the current shell, so the
  # verified_failures accumulator survives the loop.
  while IFS= read -r link; do
    [ -z "$link" ] && continue
    verify_with_curl "$link" || verified_failures="${verified_failures}${link}\n"
  done <<< "$urls_to_verify"
fi
140120
# Final decision
echo ""
# Guard clause: any curl-confirmed failure fails the job immediately.
if [ -n "$verified_failures" ]; then
  echo "❌ CI Failed: The following links failed:"
  echo -e "$verified_failures"
  exit 1
fi

# Rate-limited links are reported but do not fail the build.
if [ -n "$rate_limited_urls" ]; then
  echo "⚠️ Note: These links returned HTTP 429 (rate limited, not broken):"
  echo -e "$rate_limited_urls"
fi
echo "✅ CI Passed: All links verified successfully"
exit 0

0 commit comments

Comments
 (0)