diff --git a/cmd/main.go b/cmd/main.go index a5870d4..b2e2ffc 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -265,12 +265,10 @@ func runRunner() error { } } - // Ensure the `fduty` CLI is in the bundled-tools dir AND resolves on the bash - // PATH. Hard-fails startup rather than serving a runner that 127s every fduty - // call; no-op staging when the cloud image / install.sh already placed it. - if err = ensureFdutyCLI(); err != nil { - return fmt.Errorf("fduty CLI not ready: %w", err) - } + // Best-effort: stage the `fduty` CLI onto the bash PATH and log a manual- + // install hint if it is missing. Deliberately non-fatal — the runner must + // start even without fduty (it can still do non-fduty work); see ensureFdutyCLI. + ensureFdutyCLI() checker := permission.NewChecker(map[string]string{"*": "allow"}) diff --git a/cmd/provision.go b/cmd/provision.go index b966a96..f984cdc 100644 --- a/cmd/provision.go +++ b/cmd/provision.go @@ -24,13 +24,14 @@ import ( // local testing). var cliInstallURL = "" -// ensureFdutyCLI guarantees the `fduty` CLI resolves by bare name through the -// EXACT environment the bash tool runs with, then HARD-FAILS startup if it does -// not. A runner that boots "healthy" but 127s every fduty call is worse than one -// that refuses to start, so any unrecoverable problem here returns an error that -// aborts serve/run. +// ensureFdutyCLI makes a BEST-EFFORT attempt to put the `fduty` CLI on the bash +// tool's PATH, then reports whether it is usable. It NEVER aborts startup: a +// runner missing fduty can still do non-fduty work, and a missing CLI is far +// easier to diagnose on a running, loudly-logging runner than on one that +// refuses to boot. When fduty is unavailable it logs an actionable manual- +// install hint; agent commands that call `fduty` will 127 until it is resolved. // -// Resolution order for getting fduty into the bundled-tools dir (the dir +// Auto-staging order for getting fduty into the bundled-tools dir (the dir // environment.BundledToolsDir prepends to every bash PATH — writable by // construction, see its doc comment): // 1. already present there (cloud image bakes it; install.sh stages the bundled @@ -39,20 +40,24 @@ var cliInstallURL = "" // network); // 3. CDN install.sh fallback, when a URL is configured. // -// Regardless of which branch ran — including the no-op "already provisioned" -// path — the functional self-check below runs and gates startup. -func ensureFdutyCLI() error { - dir := environment.BundledToolsDir() - if dir == "" { - return fmt.Errorf("fduty CLI provisioning failed: cannot resolve bundled-tools dir (set FLASHDUTY_RUNNER_HOME or FLASHDUTY_RUNNER_BIN_DIR)") +// A staging miss is not fatal — fduty may already be reachable elsewhere on the +// bash PATH (e.g. /usr/local/bin from a binary-only install). verifyFdutyOnPath +// is the single source of truth for "is fduty usable", and it logs the outcome. +func ensureFdutyCLI() { + if dir := environment.BundledToolsDir(); dir != "" { + if err := provisionFduty(dir, filepath.Join(dir, "fduty")); err != nil { + // Quiet on purpose: the PATH self-check below decides usability, and + // an fduty installed elsewhere on PATH makes this miss irrelevant. + slog.Debug("fduty auto-stage skipped; relying on the bash-PATH self-check", "error", err) + } } - target := filepath.Join(dir, "fduty") - if err := provisionFduty(dir, target); err != nil { - return err + if err := verifyFdutyOnPath(); err != nil { + slog.Info("fduty CLI is NOT available on the bash PATH — the runner will start, "+ + "but agent commands that call `fduty` will fail until you install it. To fix: place an "+ + "executable `fduty` in the runner's tools dir ($FLASHDUTY_RUNNER_HOME/bin, default ~/.flashduty/bin) "+ + "or anywhere on PATH (e.g. /usr/local/bin), then restart the runner.", "error", err) } - - return verifyFdutyOnPath() } // provisionFduty places an fduty binary at target (inside the bundled-tools diff --git a/cmd/provision_test.go b/cmd/provision_test.go index 7b4f304..d3a44bb 100644 --- a/cmd/provision_test.go +++ b/cmd/provision_test.go @@ -80,6 +80,29 @@ func TestProvisionFduty_NoSourceErrors(t *testing.T) { assert.Contains(t, err.Error(), "no install URL configured") } +// ensureFdutyCLI must NEVER abort the process, even when fduty cannot be +// auto-provisioned (no staged copy, no bundled-next-to-exe, no install URL). +// The runner has to start regardless — it can still do non-fduty work, and a +// missing CLI is logged for the operator, not turned into a boot failure. +// Reaching the line after the call IS the assertion: a reintroduced +// os.Exit/log.Fatal would kill this test binary right here. +func TestEnsureFdutyCLI_NeverFatalWhenUnprovisionable(t *testing.T) { + binDir := t.TempDir() // empty: no fduty staged, so provisioning has no source + t.Setenv("FLASHDUTY_RUNNER_BIN_DIR", binDir) + t.Setenv("FLASHDUTY_CLI_INSTALL_URL", "") + // Scrub PATH to the base system dirs (bash present, fduty absent) so the + // self-check deterministically misses fduty and we exercise the non-fatal + // warning branch — not whatever fduty the dev box has on its real PATH. + if runtime.GOOS != "windows" { + t.Setenv("PATH", "/usr/bin:/bin") + } + saved := cliInstallURL + cliInstallURL = "" + t.Cleanup(func() { cliInstallURL = saved }) + + ensureFdutyCLI() // reaching here is the assertion — os.Exit/log.Fatal would kill the binary first +} + // verifyFdutyOnPath runs `fduty version` through the bash tool env and gates on // exit 0. With a fake fduty placed in the tools dir (FLASHDUTY_RUNNER_BIN_DIR), // the bundled-tools dir is first on PATH, so bare `fduty` resolves to ours. diff --git a/cmd/serve.go b/cmd/serve.go index 7096133..47f725e 100644 --- a/cmd/serve.go +++ b/cmd/serve.go @@ -62,12 +62,10 @@ func runServe() error { return fmt.Errorf("pin runner home: %w", err) } - // Ensure the `fduty` CLI is in the bundled-tools dir AND resolves on the bash - // PATH. Hard-fails startup rather than serving a runner that 127s every fduty - // call; no-op staging when the cloud image / install.sh already placed it. - if err := ensureFdutyCLI(); err != nil { - return fmt.Errorf("fduty CLI not ready: %w", err) - } + // Best-effort: stage the `fduty` CLI onto the bash PATH and log a manual- + // install hint if it is missing. Deliberately non-fatal — the runner must + // start even without fduty (it can still do non-fduty work); see ensureFdutyCLI. + ensureFdutyCLI() checker := permission.NewChecker(map[string]string{"*": "allow"}) wspace, err := environment.New(workspaceRoot, checker) diff --git a/install.sh b/install.sh index 766a3cf..bd6fad3 100755 --- a/install.sh +++ b/install.sh @@ -376,20 +376,41 @@ ensure_workdir() { chmod 0750 "$STATE_DIR" "$WORKSPACE_DIR" "$BIN_DIR" "$RUNNER_BIN_DIR" } -# install_bundled_fduty stages the fduty CLI that ships inside the release -# tarball (extracted to $TMPDIR_ by download_and_verify) into the runner's -# writable tools dir, so the runner never has to curl|sh the CDN installer at -# boot. Best-effort: if the archive predates bundling (no fduty member), the -# runner falls back to its CDN install path at first run. -install_bundled_fduty() { +# stage_bundled_fduty stages the fduty CLI that ships inside the release tarball +# (extracted to $TMPDIR_ by download_and_verify) into dir, so the runner never +# has to curl|sh the CDN installer at boot. Best-effort: if the archive predates +# bundling (no fduty member), the runner falls back to its CDN install path at +# first run. The chown to the service user is tolerated-fail so the binary-only +# path (no service user) still works. +# +# Service installs stage into $BIN_DIR (the env-pinned runtime tools dir). +# Binary-only / darwin installs stage into $INSTALL_DIR instead: that is on the +# user's PATH and, on darwin, is the dir the runner's bundledFdutyNextToExe() +# probes (os.Executable() there resolves to the /usr/local/bin symlink, not its +# target) — so the runner self-stages it into the per-user tools dir at boot. +stage_bundled_fduty() { + dir="$1" src="${TMPDIR_}/fduty" if [ ! -f "$src" ]; then info "Release archive has no bundled fduty; runner will provision it at first run" return fi - install -m 0755 "$src" "${BIN_DIR}/fduty" - chown "$SERVICE_USER":"$SERVICE_USER" "${BIN_DIR}/fduty" 2>/dev/null || true - info "Installed bundled fduty: ${BIN_DIR}/fduty" + install -m 0755 "$src" "${dir}/fduty" + chown "$SERVICE_USER":"$SERVICE_USER" "${dir}/fduty" 2>/dev/null || true + info "Installed bundled fduty: ${dir}/fduty" +} + +# fduty_present reports whether a staged fduty already sits where this host's +# install path puts it (service → $BIN_DIR, binary-only → $INSTALL_DIR). Used to +# keep the "already up to date" short-circuit from declaring success while the +# bundled CLI is still missing — which would strand the runner exactly as a +# fresh install with the staging step skipped did. +fduty_present() { + if [ "$NO_SERVICE" = "true" ] || [ "$OS" = "darwin" ]; then + [ -x "${INSTALL_DIR}/fduty" ] + else + [ -x "${BIN_DIR}/fduty" ] + fi } ensure_token() { @@ -554,7 +575,7 @@ do_install() { needs_migration=true fi # Binary reports "0.0.5"; VERSION carries "v0.0.5" — strip the leading "v" for comparison. - if [ -n "$installed" ] && [ "$installed" = "${VERSION#v}" ] && [ "$needs_migration" = "false" ]; then + if [ -n "$installed" ] && [ "$installed" = "${VERSION#v}" ] && [ "$needs_migration" = "false" ] && fduty_present; then info "Already at ${VERSION}, nothing to do." return fi @@ -567,6 +588,10 @@ do_install() { install_binary if [ "$NO_SERVICE" = "true" ] || [ "$OS" = "darwin" ]; then + # Stage fduty next to the runner symlink: $INSTALL_DIR is on the user's + # PATH and is the dir darwin's bundledFdutyNextToExe() probes, so a + # manually-launched runner finds it without an env file or service user. + stage_bundled_fduty "$INSTALL_DIR" info "Binary-only install complete." info "Start manually: ${INSTALL_DIR}/${BINARY_NAME} run --token " return @@ -574,7 +599,7 @@ do_install() { ensure_user ensure_workdir - install_bundled_fduty + stage_bundled_fduty "$BIN_DIR" ensure_token write_env_file