Skip to content

Commit 8bee5fb

Browse files
committed
factor some hooks out of runner-setup.sh
1 parent 3ecd651 commit 8bee5fb

File tree

4 files changed

+167
-105
lines changed

4 files changed

+167
-105
lines changed
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/bin/bash
2+
# Periodic check for GitHub Actions runner termination conditions
3+
# Called by systemd timer to determine if the instance should shut down
4+
5+
exec >> /tmp/termination-check.log 2>&1
6+
7+
# Source common functions and variables
8+
source /usr/local/bin/runner-common.sh
9+
10+
# File paths for tracking
11+
A="/var/run/github-runner-last-activity"
12+
J="/var/run/github-runner-jobs"
13+
H="/var/run/github-runner-has-run-job"
14+
15+
# Current timestamp
16+
N=$(date +%s)
17+
18+
# Check if any runners are actually running
19+
RUNNER_PROCS=$(pgrep -f "Runner.Listener" | wc -l)
20+
if [ $RUNNER_PROCS -eq 0 ]; then
21+
# No runner processes, check if we have stale job files
22+
if ls $J/*.job 2>/dev/null | grep -q .; then
23+
log "WARNING: Found job files but no runner processes - cleaning up stale jobs"
24+
rm -f $J/*.job
25+
fi
26+
fi
27+
28+
# Check job files and update timestamps for active runners
29+
# This creates a heartbeat mechanism to detect stuck/failed job completion
30+
for job_file in $J/*.job; do
31+
[ -f "$job_file" ] || continue
32+
if grep -q '"status":"running"' "$job_file" 2>/dev/null; then
33+
# Extract runner number from job file name (format: RUNID-JOBNAME-RUNNER.job)
34+
runner_num=$(basename "$job_file" .job | rev | cut -d- -f1 | rev)
35+
36+
# For a job to be truly running, we need BOTH Listener AND Worker processes
37+
# Listener alone means the runner is idle/waiting, not actually running a job
38+
listener_alive=$(pgrep -f "runner-${runner_num}/.*Runner.Listener" 2>/dev/null | wc -l)
39+
worker_alive=$(pgrep -f "runner-${runner_num}/.*Runner.Worker" 2>/dev/null | wc -l)
40+
41+
if [ $listener_alive -gt 0 ] && [ $worker_alive -gt 0 ]; then
42+
# Both processes exist, job is truly running - update heartbeat
43+
touch "$job_file" 2>/dev/null || true
44+
elif [ $listener_alive -gt 0 ] && [ $worker_alive -eq 0 ]; then
45+
# Listener exists but no Worker - job has likely failed/completed but hook couldn't run
46+
job_age=$((N - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
47+
log "WARNING: Runner $runner_num Listener alive but Worker dead - job likely completed (file age: ${job_age}s)"
48+
rm -f "$job_file"
49+
touch "$A" # Update last activity since we just cleaned up a job
50+
else
51+
# No Listener at all - runner is completely dead
52+
job_age=$((N - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
53+
log "WARNING: Job file $(basename $job_file) exists but runner $runner_num is dead (file age: ${job_age}s)"
54+
rm -f "$job_file"
55+
fi
56+
fi
57+
done
58+
59+
# Now check for stale job files that couldn't be touched (e.g., disk full)
60+
# With polling every ${RUNNER_POLL_INTERVAL:-10}s, files should never be older than ~30s
61+
# If they are, something is preventing the touch (likely disk full)
62+
STALE_THRESHOLD=$((${RUNNER_POLL_INTERVAL:-10} * 3)) # 3x the poll interval
63+
for job_file in $J/*.job; do
64+
[ -f "$job_file" ] || continue
65+
if grep -q '"status":"running"' "$job_file" 2>/dev/null; then
66+
job_age=$((N - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
67+
if [ $job_age -gt $STALE_THRESHOLD ]; then
68+
log "ERROR: Job file $(basename $job_file) is stale (${job_age}s old, threshold ${STALE_THRESHOLD}s)"
69+
log "Touch must be failing (disk full?) - removing stale job file"
70+
rm -f "$job_file"
71+
fi
72+
fi
73+
done
74+
75+
# Ensure activity file exists and get its timestamp
76+
[ ! -f "$A" ] && touch "$A"
77+
L=$(stat -c %Y "$A" 2>/dev/null || echo 0)
78+
79+
# Calculate idle time
80+
I=$((N-L))
81+
82+
# Determine grace period based on whether any job has run yet
83+
[ -f "$H" ] && G=${RUNNER_GRACE_PERIOD:-60} || G=${RUNNER_INITIAL_GRACE_PERIOD:-180}
84+
85+
# Count running jobs
86+
# Number of job files still marked as running. When no file matches, grep
# prints nothing and wc -l yields 0, so no extra fallback is needed.
R=$(grep -l '"status":"running"' "$J"/*.job 2>/dev/null | wc -l)
87+
88+
# Check if we should terminate
89+
if [ $R -eq 0 ] && [ $I -gt $G ]; then
90+
log "TERMINATING: idle $I > grace $G"
91+
deregister_all_runners
92+
flush_cloudwatch_logs
93+
debug_sleep_and_shutdown
94+
else
95+
# Explicit if/else: with `A && B || C` the idle message would also be
# logged whenever the first log call returned non-zero.
if [ "$R" -gt 0 ]; then
  log "$R job(s) running"
else
  log "Idle $I/$G sec"
fi
96+
fi
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
2+
# GitHub Actions runner job-completed hook
3+
# Called when a job finishes (success or failure) on this runner
4+
# Environment variables provided by GitHub Actions runner
5+
6+
exec >> /tmp/job-completed-hook.log 2>&1
7+
8+
# Get runner index from environment (defaults to 0 for single-runner instances)
9+
I="${RUNNER_INDEX:-0}"
10+
11+
# Log the job completion with a specific prefix for CloudWatch filtering
12+
# The LOG_PREFIX_* placeholder in the echo below is replaced with the real prefix by runner-setup.sh (via sed) during setup
13+
echo "[$(date)] Runner-$I: LOG_PREFIX_JOB_COMPLETED ${GITHUB_JOB}"
14+
15+
# Remove the job tracking file to indicate this runner no longer has an active job
16+
# Quoted so the path stays a single argument whatever the environment
# values contain; `--` stops a leading dash being parsed as an option.
rm -f -- "/var/run/github-runner-jobs/${GITHUB_RUN_ID}-${GITHUB_JOB}-${I}.job"
17+
18+
# Update activity timestamp to reset the idle timer
19+
touch /var/run/github-runner-last-activity
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
# GitHub Actions runner job-started hook
3+
# Called when a job starts running on this runner
4+
# Environment variables provided by GitHub Actions runner
5+
6+
exec >> /tmp/job-started-hook.log 2>&1
7+
8+
# Get runner index from environment (defaults to 0 for single-runner instances)
9+
I="${RUNNER_INDEX:-0}"
10+
11+
# Log the job start with a specific prefix for CloudWatch filtering
12+
# The LOG_PREFIX_* placeholder in the echo below is replaced with the real prefix by runner-setup.sh (via sed) during setup
13+
echo "[$(date)] Runner-$I: LOG_PREFIX_JOB_STARTED Runner-$I: ${GITHUB_JOB}"
14+
15+
# Create a job tracking file to indicate this runner has an active job
16+
# Format: RUNID-JOBNAME-RUNNER.job
17+
mkdir -p /var/run/github-runner-jobs
18+
# Write the tracking record. The redirect target is quoted because an
# unquoted target that word-splits makes bash fail with "ambiguous redirect".
printf '{"status":"running","runner":"%s"}\n' "$I" \
  > "/var/run/github-runner-jobs/${GITHUB_RUN_ID}-${GITHUB_JOB}-${I}.job"
19+
20+
# Update activity timestamps to reset the idle timer
21+
touch /var/run/github-runner-last-activity /var/run/github-runner-has-run-job

src/ec2_gha/scripts/runner-setup.sh

Lines changed: 31 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -222,115 +222,40 @@ else
222222
fi
223223
log "Downloaded runner binary"
224224
225-
# Create job tracking scripts - these are called by GitHub runner hooks
226-
# job-started-hook.sh is called when a job starts
227-
cat > $B/job-started-hook.sh << 'EOFS'
228-
#!/bin/bash
229-
exec >> /tmp/job-started-hook.log 2>&1
230-
I="${RUNNER_INDEX:-0}"
231-
echo "[$(date)] Runner-$I: JOB_STARTED_LOG Runner-$I: ${GITHUB_JOB}"
232-
mkdir -p /var/run/github-runner-jobs
233-
echo '{"status":"running","runner":"'$I'"}' > /var/run/github-runner-jobs/${GITHUB_RUN_ID}-${GITHUB_JOB}-$I.job
234-
touch /var/run/github-runner-last-activity /var/run/github-runner-has-run-job
235-
EOFS
236-
237-
# Replace JOB_STARTED_LOG with actual prefix
238-
sed -i "s/JOB_STARTED_LOG/${log_prefix_job_started}/g" $B/job-started-hook.sh
239-
240-
# job-completed-hook.sh is called when a job completes
241-
cat > $B/job-completed-hook.sh << 'EOFC'
242-
#!/bin/bash
243-
exec >> /tmp/job-completed-hook.log 2>&1
244-
I="${RUNNER_INDEX:-0}"
245-
echo "[$(date)] Runner-$I: JOB_COMPLETED_LOG ${GITHUB_JOB}"
246-
rm -f /var/run/github-runner-jobs/${GITHUB_RUN_ID}-${GITHUB_JOB}-$I.job
247-
touch /var/run/github-runner-last-activity
248-
EOFC
249-
250-
# Replace JOB_COMPLETED_LOG with actual prefix
251-
sed -i "s/JOB_COMPLETED_LOG/${log_prefix_job_completed}/g" $B/job-completed-hook.sh
252-
253-
# check-runner-termination.sh is called periodically to check if the instance should terminate
254-
cat > $B/check-runner-termination.sh << 'EOFT'
255-
#!/bin/bash
256-
exec >> /tmp/termination-check.log 2>&1
257-
source /usr/local/bin/runner-common.sh
258-
A="/var/run/github-runner-last-activity"
259-
J="/var/run/github-runner-jobs"
260-
H="/var/run/github-runner-has-run-job"
261-
262-
# Check if any runners are actually running
263-
RUNNER_PROCS=$(pgrep -f "Runner.Listener" | wc -l)
264-
if [ $RUNNER_PROCS -eq 0 ]; then
265-
# No runner processes, check if we have stale job files
266-
if ls $J/*.job 2>/dev/null | grep -q .; then
267-
log "WARNING: Found job files but no runner processes - cleaning up stale jobs"
268-
rm -f $J/*.job
225+
# Helper function to fetch scripts
226+
fetch_script() {
227+
local script_name="$1"
228+
local url="${BASE_URL}/${script_name}"
229+
local dest="${B}/${script_name}"
230+
231+
if command -v curl >/dev/null 2>&1; then
232+
curl -fsSL "$url" -o "$dest" || {
233+
log_error "Failed to fetch $script_name"
234+
terminate_instance "Failed to download $script_name"
235+
}
236+
elif command -v wget >/dev/null 2>&1; then
237+
wget -q "$url" -O "$dest" || {
238+
log_error "Failed to fetch $script_name"
239+
terminate_instance "Failed to download $script_name"
240+
}
241+
else
242+
log_error "Neither curl nor wget found. Cannot download scripts."
243+
terminate_instance "No download tool available"
269244
fi
270-
fi
245+
}
271246
272-
# Check job files and update timestamps for active runners
273-
# This creates a heartbeat mechanism to detect stuck/failed job completion
274-
for job_file in $J/*.job; do
275-
[ -f "$job_file" ] || continue
276-
if grep -q '"status":"running"' "$job_file" 2>/dev/null; then
277-
# Extract runner number from job file name (format: RUNID-JOBNAME-RUNNER.job)
278-
runner_num=$(basename "$job_file" .job | rev | cut -d- -f1 | rev)
279-
280-
# For a job to be truly running, we need BOTH Listener AND Worker processes
281-
# Listener alone means the runner is idle/waiting, not actually running a job
282-
listener_alive=$(pgrep -f "runner-${runner_num}/.*Runner.Listener" 2>/dev/null | wc -l)
283-
worker_alive=$(pgrep -f "runner-${runner_num}/.*Runner.Worker" 2>/dev/null | wc -l)
284-
285-
if [ $listener_alive -gt 0 ] && [ $worker_alive -gt 0 ]; then
286-
# Both processes exist, job is truly running - update heartbeat
287-
touch "$job_file" 2>/dev/null || true
288-
elif [ $listener_alive -gt 0 ] && [ $worker_alive -eq 0 ]; then
289-
# Listener exists but no Worker - job has likely failed/completed but hook couldn't run
290-
job_age=$((N - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
291-
log "WARNING: Runner $runner_num Listener alive but Worker dead - job likely completed (file age: ${job_age}s)"
292-
rm -f "$job_file"
293-
touch "$A" # Update last activity since we just cleaned up a job
294-
else
295-
# No Listener at all - runner is completely dead
296-
job_age=$((N - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
297-
log "WARNING: Job file $(basename $job_file) exists but runner $runner_num is dead (file age: ${job_age}s)"
298-
rm -f "$job_file"
299-
fi
300-
fi
301-
done
247+
# Fetch job tracking scripts from GitHub
248+
# These scripts are called by GitHub runner hooks
249+
log "Fetching runner hook scripts"
250+
BASE_URL="https://raw.githubusercontent.com/Open-Athena/ec2-gha/${action_sha}/src/ec2_gha/scripts"
302251
303-
# Now check for stale job files that couldn't be touched (e.g., disk full)
304-
# With polling every ${RUNNER_POLL_INTERVAL:-10}s, files should never be older than ~30s
305-
# If they are, something is preventing the touch (likely disk full)
306-
STALE_THRESHOLD=$((${RUNNER_POLL_INTERVAL:-10} * 3)) # 3x the poll interval
307-
for job_file in $J/*.job; do
308-
[ -f "$job_file" ] || continue
309-
if grep -q '"status":"running"' "$job_file" 2>/dev/null; then
310-
job_age=$((N - $(stat -c %Y "$job_file" 2>/dev/null || echo 0)))
311-
if [ $job_age -gt $STALE_THRESHOLD ]; then
312-
log "ERROR: Job file $(basename $job_file) is stale (${job_age}s old, threshold ${STALE_THRESHOLD}s)"
313-
log "Touch must be failing (disk full?) - removing stale job file"
314-
rm -f "$job_file"
315-
fi
316-
fi
317-
done
252+
fetch_script "job-started-hook.sh"
253+
fetch_script "job-completed-hook.sh"
254+
fetch_script "check-runner-termination.sh"
318255
319-
[ ! -f "$A" ] && touch "$A"
320-
L=$(stat -c %Y "$A" 2>/dev/null || echo 0)
321-
N=$(date +%s)
322-
I=$((N-L))
323-
[ -f "$H" ] && G=${RUNNER_GRACE_PERIOD:-60} || G=${RUNNER_INITIAL_GRACE_PERIOD:-180}
324-
R=$(grep -l '"status":"running"' $J/*.job 2>/dev/null | wc -l || echo 0)
325-
if [ $R -eq 0 ] && [ $I -gt $G ]; then
326-
log "TERMINATING: idle $I > grace $G"
327-
deregister_all_runners
328-
flush_cloudwatch_logs
329-
debug_sleep_and_shutdown
330-
else
331-
[ $R -gt 0 ] && log "$R job(s) running" || log "Idle $I/$G sec"
332-
fi
333-
EOFT
256+
# Replace log prefix placeholders with actual values
257+
# Escape \, / and & in the prefix first: they are special in a sed
# replacement and would otherwise break or corrupt the substitution.
sed -i "s/LOG_PREFIX_JOB_STARTED/$(printf '%s' "${log_prefix_job_started}" | sed -e 's/[\/&]/\\&/g')/g" "$B/job-started-hook.sh"
258+
# Escape \, / and & in the prefix first: they are special in a sed
# replacement and would otherwise break or corrupt the substitution.
sed -i "s/LOG_PREFIX_JOB_COMPLETED/$(printf '%s' "${log_prefix_job_completed}" | sed -e 's/[\/&]/\\&/g')/g" "$B/job-completed-hook.sh"
334259
335260
chmod +x $B/job-started-hook.sh $B/job-completed-hook.sh $B/check-runner-termination.sh
336261
@@ -347,6 +272,7 @@ After=network.target
347272
Type=oneshot
348273
Environment="RUNNER_GRACE_PERIOD=$runner_grace_period"
349274
Environment="RUNNER_INITIAL_GRACE_PERIOD=$runner_initial_grace_period"
275+
Environment="RUNNER_POLL_INTERVAL=$runner_poll_interval"
350276
ExecStart=$B/check-runner-termination.sh
351277
EOF
352278

0 commit comments

Comments
 (0)