1 | 1 | name: Check for Broken Links |
2 | | -on: [push, pull_request] |
| 2 | +on: |
| 3 | + pull_request: |
| 4 | + push: |
| 5 | + branches: |
| 6 | + - main |
| 7 | + - master |
| 8 | + - develop |
| 9 | + |
3 | 10 | jobs: |
4 | 11 | build_and_check: |
5 | 12 | runs-on: ubuntu-latest |
@@ -33,115 +40,101 @@ jobs: |
33 | 40 | TEMPORARY_WEBSITE_URL: 'http://127.0.0.1:8080' |
34 | 41 | ACTUAL_WEBSITE_URL: 'https://ddmal.ca/Neon/' |
35 | 42 | run: | |
36 | | - # Function to retry URLs with retryable errors |
37 | | - retry_urls() { |
38 | | - local urls="$1" |
39 | | - while IFS= read -r url; do |
40 | | - [ -z "$url" ] && continue |
41 | | - echo "🔄 Retrying: $url" |
42 | | -
|
43 | | - for attempt in 1 2 3; do |
44 | | - echo " Attempt $attempt/3..." |
45 | | - http_code=$(curl -L -s -o /dev/null -w "%{http_code}" \ |
46 | | - -H "User-Agent: Mozilla/5.0 (compatible; BrokenLinkChecker)" \ |
47 | | - --connect-timeout 30 --max-time 60 "$url" 2>/dev/null) |
48 | | -
|
49 | | - if echo "$http_code" | grep -E "^(200|301|302|303)$" > /dev/null; then |
50 | | - echo " ✅ Success! HTTP $http_code" |
51 | | - echo "RETRY_SUCCESS:$url" >> /tmp/retry_results |
52 | | - break |
53 | | - elif [ $attempt -eq 3 ]; then |
54 | | - echo " ❌ Failed after 3 attempts (HTTP $http_code)" |
55 | | - echo "RETRY_FAILED:$url" >> /tmp/retry_results |
56 | | - else |
57 | | - echo " ⏳ Failed with HTTP $http_code, retrying in 5 seconds..." |
58 | | - sleep 5 |
59 | | - fi |
60 | | - done |
61 | | - echo "" |
62 | | - done <<< "$urls" |
63 | | - } |
64 | | -
|
65 | | - # Initialize retry results file |
66 | | - > /tmp/retry_results |
67 | | -
|
68 | | - # Run broken link checker and filter output |
69 | | - echo "Running broken link check..." |
70 | | - output=$(blc $TEMPORARY_WEBSITE_URL --filter-level=3 | \ |
71 | | - grep -v -E '├───OK───|└───OK───' | \ |
72 | | - awk ' |
73 | | - BEGIN { buf="" } |
74 | | - /^Getting links from:/ { buf=$0; next } |
75 | | - /^Finished!.*0 broken\./ { |
76 | | - if (length(buf)>0) { buf=""; next } |
77 | | - } |
78 | | - { |
79 | | - if(length(buf)>0) print buf |
80 | | - if (NF > 0) print |
81 | | - buf="" |
82 | | - } |
83 | | - /^Finished!/ { print "" } |
84 | | - ' | sed "s|$TEMPORARY_WEBSITE_URL|$ACTUAL_WEBSITE_URL|g") |
85 | | -
|
86 | | - echo "Initial link check results:" |
87 | | - echo "$output" |
88 | | -
|
89 | | - # Handle retryable errors |
90 | | - retryable_urls=$(echo "$output" | grep -E "(BLC_UNKNOWN|HTTP_429)" | \ |
91 | | - sed -n 's/.*├─BROKEN─ \(https\?:\/\/[^[:space:]]*\).*/\1/p') |
92 | | -
|
93 | | - if [ -n "$retryable_urls" ]; then |
94 | | - echo "" |
95 | | - echo "🔄 Found URLs with retryable errors, starting retry process..." |
96 | | - retry_urls "$retryable_urls" |
| 43 | + echo "Running broken link check with rate limiting..." |
| 44 | +
|
| 45 | + # Run blc with CLI options to avoid rate limiting |
| 46 | + # --filter-level 3: Check all link types including metadata |
| 47 | + # --ordered: Check links sequentially (helps avoid rate limiting) |
| 48 | + # --get: Use GET requests instead of HEAD (more compatible) |
| 49 | + # --user-agent: Use realistic browser user agent |
| 50 | + # --host-requests 1: Limit to 1 concurrent request per host (key for avoiding 429) |
| 51 | + set +e # Don't exit on blc failure, we'll handle it |
| 52 | + blc $TEMPORARY_WEBSITE_URL \ |
| 53 | + --filter-level 3 \ |
| 54 | + --ordered \ |
| 55 | + --get \ |
| 56 | + --user-agent "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \ |
| 57 | + --host-requests 1 \ |
| 58 | + --recursive \ |
| 59 | + --verbose \ |
| 60 | + > /tmp/blc_output.txt 2>&1 |
| 61 | + blc_exit_code=$? |
| 62 | + set -e |
| 63 | +
|
| 64 | + # Display the output |
| 65 | + cat /tmp/blc_output.txt |
| 66 | +
|
| 67 | + # Get all broken links |
| 68 | + all_broken_links=$(grep -E "├─BROKEN─" /tmp/blc_output.txt || true) |
97 | 69 |
|
98 | | - # Show retry summary |
99 | | - success_count=$(grep -c "^RETRY_SUCCESS:" /tmp/retry_results 2>/dev/null || echo "0") |
100 | | - failed_count=$(grep -c "^RETRY_FAILED:" /tmp/retry_results 2>/dev/null || echo "0") |
101 | | - echo "📊 Retry Summary: $success_count succeeded, $failed_count failed" |
| 70 | + echo "" |
| 71 | + echo "=== Broken Links Found by blc ===" |
| 72 | + if [ -n "$all_broken_links" ]; then |
| 73 | + echo "$all_broken_links" |
| 74 | + else |
| 75 | + echo "None" |
102 | 76 | fi |
103 | 77 |
|
104 | | - # Determine final status |
105 | | - has_errors=false |
| 78 | + # Function to verify links with curl |
| 79 | + verify_with_curl() { |
| 80 | + local url="$1" |
| 81 | + echo " 🔄 Verifying: $url" |
| 82 | +
|
| 83 | + # Use temp file instead of /dev/null to avoid truncation errors on retry |
| 84 | + temp_body=$(mktemp) |
| 85 | +
|
| 86 | + http_code=$(curl -L -s -o "$temp_body" -w "%{http_code}" \ |
| 87 | + -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \ |
| 88 | + --connect-timeout 30 --max-time 60 --insecure \ |
| 89 | + --retry 3 --retry-delay 5 --retry-all-errors \ |
| 90 | + "$url" 2>/dev/null || echo "000") |
| 91 | +
|
| 92 | + rm -f "$temp_body" |
| 93 | +
|
| 94 | + if echo "$http_code" | grep -E "^(200|301|302|303)$" > /dev/null; then |
| 95 | + echo " ✅ Success: HTTP $http_code" |
| 96 | + return 0 |
| 97 | + elif [ "$http_code" = "429" ]; then |
| 98 | + echo " ⚠️ HTTP 429 (rate limited)" |
| 99 | + rate_limited_urls="${rate_limited_urls}${url}\n" |
| 100 | + return 0 |
| 101 | + else |
| 102 | + echo " ❌ Failed: HTTP $http_code" |
| 103 | + failed_http_code="$http_code" |
| 104 | + return 1 |
| 105 | + fi |
| 106 | + } |
106 | 107 |
|
107 | | - # Check for 4xx errors not resolved by retries |
108 | | - if echo "$output" | grep -Eq 'HTTP_4[0-9]{2}'; then |
109 | | - successful_urls=$(grep "^RETRY_SUCCESS:" /tmp/retry_results 2>/dev/null | cut -d: -f2- || echo "") |
| 108 | + # Verify all broken links with curl |
| 109 | + verified_failures="" |
| 110 | + rate_limited_urls="" |
110 | 111 |
|
111 | | - unresolved_4xx=$(echo "$output" | grep 'HTTP_4[0-9]{2}' | while read -r line; do |
112 | | - url=$(echo "$line" | sed -n 's/.*├─BROKEN─ \(https\?:\/\/[^[:space:]]*\).*/\1/p') |
113 | | - if [ -n "$url" ] && ! echo "$successful_urls" | grep -Fxq "$url"; then |
114 | | - echo "$line" |
115 | | - fi |
116 | | - done) |
| 112 | + if [ -n "$all_broken_links" ]; then |
| 113 | + echo "" |
| 114 | + echo "=== Verifying Links with curl ===" |
117 | 115 |
|
118 | | - if [ -n "$unresolved_4xx" ]; then |
119 | | - echo "" |
120 | | - echo "❌ Unresolved HTTP 4xx errors:" |
121 | | - echo "$unresolved_4xx" |
122 | | - has_errors=true |
123 | | - fi |
| 116 | + # Extract URLs and verify them |
| 117 | + urls_to_verify=$(echo "$all_broken_links" | sed -n 's/.*├─BROKEN─ \(https\?:\/\/[^[:space:]]*\).*/\1/p') |
124 | 118 |
|
125 | | - # Check for failed retries |
126 | | - if grep -q "^RETRY_FAILED:" /tmp/retry_results 2>/dev/null; then |
127 | | - echo "" |
128 | | - echo "❌ URLs that failed after retries:" |
129 | | - grep "^RETRY_FAILED:" /tmp/retry_results | cut -d: -f2- |
130 | | - has_errors=true |
131 | | - fi |
| 119 | + while IFS= read -r url; do |
| 120 | + [ -z "$url" ] && continue |
| 121 | + if ! verify_with_curl "$url"; then |
| 122 | + verified_failures="${verified_failures}${url} (HTTP ${failed_http_code})\n" |
| 123 | + fi |
| 124 | + done <<< "$urls_to_verify" |
132 | 125 | fi |
133 | 126 |
|
134 | | - # Final result |
| 127 | + # Final results |
135 | 128 | echo "" |
136 | | - if [ "$has_errors" = true ]; then |
137 | | - echo "❌ Broken links found that could not be resolved." |
| 129 | + if [ -n "$verified_failures" ]; then |
| 130 | + echo "❌ CI Failed: The following links failed:" |
| 131 | + echo -e "$verified_failures" |
138 | 132 | exit 1 |
139 | 133 | else |
140 | | - if grep -q "^RETRY_SUCCESS:" /tmp/retry_results 2>/dev/null; then |
141 | | - echo "✅ All broken links resolved via retries! Successfully fixed:" |
142 | | - grep "^RETRY_SUCCESS:" /tmp/retry_results | cut -d: -f2- | sed 's/^/ - /' |
143 | | - else |
144 | | - echo "✅ No broken links found." |
| 134 | + if [ -n "$rate_limited_urls" ]; then |
| 135 | + echo "⚠️ Note: These links returned HTTP 429 (rate limited, not broken):" |
| 136 | + echo -e "$rate_limited_urls" |
145 | 137 | fi |
| 138 | + echo "✅ CI Passed: All links verified successfully" |
146 | 139 | exit 0 |
147 | 140 | fi |