@@ -222,115 +222,40 @@ else
222222fi
223223log " Downloaded runner binary"
224224
# Create job tracking scripts - these are called by GitHub runner hooks.
# Both heredocs use a quoted delimiter, so nothing inside them is expanded
# while the files are written; the only value filled in at generation time
# is the log-prefix placeholder, substituted by sed afterwards.

# job-started-hook.sh is called when a job starts.
# It appends its output to /tmp/job-started-hook.log, writes a
# <run id>-<job>-<runner index>.job marker under /var/run/github-runner-jobs
# and touches the last-activity and has-run-job files.
cat > "$B/job-started-hook.sh" << 'EOFS'
#!/bin/bash
exec >> /tmp/job-started-hook.log 2>&1
I="${RUNNER_INDEX:-0}"
echo "[$(date)] Runner-$I: JOB_STARTED_LOG Runner-$I: ${GITHUB_JOB}"
mkdir -p /var/run/github-runner-jobs
echo '{"status":"running","runner":"'$I'"}' > /var/run/github-runner-jobs/${GITHUB_RUN_ID}-${GITHUB_JOB}-$I.job
touch /var/run/github-runner-last-activity /var/run/github-runner-has-run-job
EOFS

# Replace JOB_STARTED_LOG with actual prefix.
# '/', '&' and '\' are special in a sed replacement, so escape them first;
# otherwise a prefix containing any of them breaks or corrupts the substitution.
escaped_prefix=$(printf '%s' "${log_prefix_job_started}" | sed -e 's|[\\/&]|\\&|g')
sed -i "s/JOB_STARTED_LOG/${escaped_prefix}/g" "$B/job-started-hook.sh"

# job-completed-hook.sh is called when a job completes.
# It appends its output to /tmp/job-completed-hook.log, removes the job's
# marker file and touches the last-activity file.
cat > "$B/job-completed-hook.sh" << 'EOFC'
#!/bin/bash
exec >> /tmp/job-completed-hook.log 2>&1
I="${RUNNER_INDEX:-0}"
echo "[$(date)] Runner-$I: JOB_COMPLETED_LOG ${GITHUB_JOB}"
rm -f /var/run/github-runner-jobs/${GITHUB_RUN_ID}-${GITHUB_JOB}-$I.job
touch /var/run/github-runner-last-activity
EOFC

# Replace JOB_COMPLETED_LOG with actual prefix (escaped as above).
escaped_prefix=$(printf '%s' "${log_prefix_job_completed}" | sed -e 's|[\\/&]|\\&|g')
sed -i "s/JOB_COMPLETED_LOG/${escaped_prefix}/g" "$B/job-completed-hook.sh"
unset escaped_prefix

253- # check-runner-termination.sh is called periodically to check if the instance should terminate
254- cat > $B /check-runner-termination.sh << 'EOFT '
255- #!/bin/bash
256- exec >> /tmp/termination-check.log 2>&1
257- source /usr/local/bin/runner-common.sh
258- A="/var/run/github-runner-last-activity"
259- J="/var/run/github-runner-jobs"
260- H="/var/run/github-runner-has-run-job"
261-
262- # Check if any runners are actually running
263- RUNNER_PROCS=$(pgrep -f "Runner.Listener" | wc -l)
264- if [ $RUNNER_PROCS -eq 0 ]; then
265- # No runner processes, check if we have stale job files
266- if ls $J/*.job 2>/dev/null | grep -q .; then
267- log "WARNING: Found job files but no runner processes - cleaning up stale jobs"
268- rm -f $J/*.job
# Helper function to fetch scripts
#
# Downloads one script from ${BASE_URL}/<name> to ${B}/<name>, using curl
# when it is installed and falling back to wget.
#
# Globals:   BASE_URL (read) - URL prefix the script name is appended to
#            B        (read) - directory the script is written into
# Arguments: $1 - file name of the script
# Returns:   0 when a non-empty file was downloaded. On any failure it calls
#            log_error and terminate_instance and then returns 1, so that a
#            caller does not go on to use a missing script if
#            terminate_instance returns.
fetch_script() {
  local script_name="$1"
  local url="${BASE_URL}/${script_name}"
  local dest="${B}/${script_name}"
  # Download next to the destination and rename on success, so a failed or
  # truncated transfer never leaves a partial file at the final path.
  local partial="${dest}.part"
  local status=0

  if command -v curl > /dev/null 2>&1; then
    curl -fsSL "$url" -o "$partial" || status=$?
  elif command -v wget > /dev/null 2>&1; then
    wget -q "$url" -O "$partial" || status=$?
  else
    log_error "Neither curl nor wget found. Cannot download scripts."
    terminate_instance "No download tool available"
    return 1
  fi

  # wget -O creates the output file even when the request fails, so an empty
  # result is treated as a failed download as well.
  if [ "$status" -ne 0 ] || [ ! -s "$partial" ]; then
    rm -f -- "$partial"
    log_error "Failed to fetch $script_name"
    terminate_instance "Failed to download $script_name"
    return 1
  fi

  mv -f -- "$partial" "$dest"
}
271246
272- # Check job files and update timestamps for active runners
273- # This creates a heartbeat mechanism to detect stuck/failed job completion
274- for job_file in $J/*.job; do
275- [ -f "$job_file" ] || continue
276- if grep -q '"status":"running"' "$job_file" 2>/dev/null; then
277- # Extract runner number from job file name (format: RUNID-JOBNAME-RUNNER.job)
278- runner_num=$(basename "$job_file" .job | rev | cut -d- -f1 | rev)
279-
280- # For a job to be truly running, we need BOTH Listener AND Worker processes
281- # Listener alone means the runner is idle/waiting, not actually running a job
282- listener_alive=$(pgrep -f "runner-${runner_num}/.*Runner.Listener" 2>/dev/null | wc -l)
283- worker_alive=$(pgrep -f "runner-${runner_num}/.*Runner.Worker" 2>/dev/null | wc -l)
284-
285- if [ $listener_alive -gt 0 ] && [ $worker_alive -gt 0 ]; then
286- # Both processes exist, job is truly running - update heartbeat
287- touch "$job_file" 2>/dev/null || true
288- elif [ $listener_alive -gt 0 ] && [ $worker_alive -eq 0 ]; then
289- # Listener exists but no Worker - job has likely failed/completed but hook couldn't run
290- job_age=$((N - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
291- log "WARNING: Runner $runner_num Listener alive but Worker dead - job likely completed (file age: ${job_age}s)"
292- rm -f "$job_file"
293- touch "$A" # Update last activity since we just cleaned up a job
294- else
295- # No Listener at all - runner is completely dead
296- job_age=$((N - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
297- log "WARNING: Job file $(basename $job_file) exists but runner $runner_num is dead (file age: ${job_age}s)"
298- rm -f "$job_file"
299- fi
300- fi
301- done
# Fetch job tracking scripts from GitHub
# These scripts are called by GitHub runner hooks
log "Fetching runner hook scripts"
# The path segment after the repository name comes from ${action_sha};
# fetch_script appends "/<script name>" to this prefix.
BASE_URL="https://raw.githubusercontent.com/Open-Athena/ec2-gha/${action_sha}/src/ec2_gha/scripts"
302251
303- # Now check for stale job files that couldn't be touched (e.g., disk full)
304- # With polling every ${RUNNER_POLL_INTERVAL:-10}s, files should never be older than ~30s
305- # If they are, something is preventing the touch (likely disk full)
306- STALE_THRESHOLD=$((${RUNNER_POLL_INTERVAL:-10} * 3)) # 3x the poll interval
307- for job_file in $J/*.job; do
308- [ -f "$job_file" ] || continue
309- if grep -q '"status":"running"' "$job_file" 2>/dev/null; then
310- job_age=$((N - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
311- if [ $job_age -gt $STALE_THRESHOLD ]; then
312- log "ERROR: Job file $(basename $job_file) is stale (${job_age}s old, threshold ${STALE_THRESHOLD}s)"
313- log "Touch must be failing (disk full?) - removing stale job file"
314- rm -f "$job_file"
315- fi
316- fi
317- done
# Download the two runner hook scripts and the termination check script.
# Names must not carry surrounding whitespace: fetch_script uses them
# verbatim in both the URL and the destination path.
for hook_script in \
  job-started-hook.sh \
  job-completed-hook.sh \
  check-runner-termination.sh; do
  fetch_script "$hook_script"
done
unset hook_script
318255
319- [ ! -f "$A" ] && touch "$A"
320- L=$(stat -c %Y "$A" 2>/dev/null || echo 0)
321- N=$(date +%s)
322- I=$((N-L))
323- [ -f "$H" ] && G=${RUNNER_GRACE_PERIOD:-60} || G=${RUNNER_INITIAL_GRACE_PERIOD:-180}
324- R=$(grep -l '"status":"running"' $J/*.job 2>/dev/null | wc -l || echo 0)
325- if [ $R -eq 0 ] && [ $I -gt $G ]; then
326- log "TERMINATING: idle $I > grace $G"
327- deregister_all_runners
328- flush_cloudwatch_logs
329- debug_sleep_and_shutdown
330- else
331- [ $R -gt 0 ] && log "$R job(s) running" || log "Idle $I/$G sec"
332- fi
333- EOFT
# Replace log prefix placeholders with actual values
#
# _replace_log_placeholder PLACEHOLDER VALUE FILE
#   Substitutes every occurrence of PLACEHOLDER in FILE (in place) with VALUE.
#   VALUE is escaped first because '/', '&' and '\' are special in a sed
#   replacement; unescaped, a prefix containing one of them would break the
#   expression or be rewritten incorrectly.
#   Returns the exit status of sed.
_replace_log_placeholder() {
  local placeholder="$1"
  local value="$2"
  local file="$3"
  local escaped
  escaped=$(printf '%s' "$value" | sed -e 's|[\\/&]|\\&|g') || return
  sed -i "s/${placeholder}/${escaped}/g" "$file"
}

_replace_log_placeholder LOG_PREFIX_JOB_STARTED "${log_prefix_job_started}" "$B/job-started-hook.sh"
_replace_log_placeholder LOG_PREFIX_JOB_COMPLETED "${log_prefix_job_completed}" "$B/job-completed-hook.sh"

chmod +x "$B/job-started-hook.sh" "$B/job-completed-hook.sh" "$B/check-runner-termination.sh"
336261
@@ -347,6 +272,7 @@ After=network.target
347272Type=oneshot
348273Environment="RUNNER_GRACE_PERIOD=$runner_grace_period "
349274Environment="RUNNER_INITIAL_GRACE_PERIOD=$runner_initial_grace_period "
275+ Environment="RUNNER_POLL_INTERVAL=$runner_poll_interval "
350276ExecStart=$B /check-runner-termination.sh
351277EOF
352278
0 commit comments