project_harrier/scripts/train.py at main · RobotFlow-Labs/project_harrier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
"""HARRIER YOLO26 training entry point.

Respects the ANIMA training rules:
    - Dual backend: ``--backend auto|cuda|mlx|cpu``
    - Artifacts disk only: everything goes under ``/mnt/artifacts-datai``
    - ``--dry-run`` validates the plan without importing ultralytics
    - ``--resume PATH`` for checkpoint resume

Usage:

    python scripts/train.py --experiment eo_daynight_to_eo_day --dry-run
    python scripts/train.py --experiment ir_daynight_to_ir_night \\
        --backend cuda --batch 32
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

# Parent of the directory that contains this script (resolved to an absolute
# path); the rest of the script treats it as the repository root.
_REPO_ROOT = Path(__file__).resolve().parents[1]
# Prepend <root>/src to the import search path. This has to run before the
# anima_harrier imports that follow, which is why those carry E402 suppressions.
sys.path.insert(0, str(_REPO_ROOT / "src"))

from anima_harrier.experiments import YOLO26_EXPERIMENTS  # noqa: E402
from anima_harrier.training import (  # noqa: E402
    HarrierTrainer,
    build_training_plan,
    render_plan_report,
)


def _parse_args(argv: list[str] | None) -> argparse.Namespace:
    """Parse the command-line options of the HARRIER training script.

    Args:
        argv: Argument strings to parse. ``None`` is handed to argparse
            unchanged, which then reads ``sys.argv[1:]``.

    Returns:
        Namespace carrying ``experiment``, ``configs_dir``, ``artifact_root``,
        ``run_name``, ``backend``, ``model``, ``batch``, ``resume`` and
        ``dry_run``.
    """
    # One (flag, add_argument keyword arguments) entry per option, listed in
    # the order the options should show up in ``--help``.
    option_table: list[tuple[str, dict[str, object]]] = [
        (
            "--experiment",
            {
                "required": True,
                "choices": sorted(YOLO26_EXPERIMENTS),
                "help": "HARRIER YOLO26 experiment name.",
            },
        ),
        ("--configs-dir", {"type": Path, "default": _REPO_ROOT / "configs"}),
        (
            "--artifact-root",
            {
                "type": Path,
                "default": None,
                "help": "Override the artifacts disk root (defaults to /mnt/artifacts-datai).",
            },
        ),
        (
            "--run-name",
            {
                "type": str,
                "default": None,
                "help": "Optional run name; otherwise auto-derived from experiment + model.",
            },
        ),
        ("--backend", {"default": "auto", "choices": ["auto", "cuda", "mlx", "cpu"]}),
        ("--model", {"default": None, "help": "Override the YOLO26 model."}),
        ("--batch", {"type": int, "default": 16}),
        ("--resume", {"type": Path, "default": None}),
        (
            "--dry-run",
            {
                "action": "store_true",
                "help": "Render the plan and persist it to the artifacts disk without training.",
            },
        ),
    ]

    cli = argparse.ArgumentParser(prog="anima-harrier-train")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    """Build the training plan, print its report, then dry-run or train.

    Args:
        argv: Command-line arguments, forwarded unchanged to ``_parse_args``.

    Returns:
        Process exit code: ``0`` when the outcome reports success, ``1`` when
        a dry run reports failure, ``2`` when a training run reports failure.
    """
    opts = _parse_args(argv)
    plan = build_training_plan(
        experiment_name=opts.experiment,
        configs_dir=opts.configs_dir,
        artifact_root=opts.artifact_root,
        run_name=opts.run_name,
        backend=opts.backend,
        model=opts.model,
        batch_size=opts.batch,
        resume_from=opts.resume,
    )
    # The plan report is printed in both modes, before the trainer is created.
    report = render_plan_report(plan)
    print(report)

    trainer = HarrierTrainer(plan)
    if opts.dry_run:
        result = trainer.dry_run()
        print(f"\n[dry-run] {result.message}")
        exit_code = 0 if result.success else 1
    else:
        result = trainer.run()
        print(f"\n[train] {result.message} (success={result.success})")
        if result.best_checkpoint:
            print(f"[train] best_checkpoint={result.best_checkpoint}")
        # A failed training run exits with 2, distinct from a failed dry run.
        exit_code = 0 if result.success else 2
    return exit_code


# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())