diff --git a/Jenkinsfile b/Jenkinsfile index a064bdc17f8..db495967056 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -114,7 +114,7 @@ pipeline { ["build", "scons"], ["flash", "cd scripts/ && ./reflash_internal_panda.py"], ["flash jungle", "cd board/jungle && ./flash.py --all"], - ["test", "cd tests/hitl && pytest --durations=0 2*.py [5-9]*.py"], + ["test", "cd tests/hitl && pytest -v --durations=0 -o 'addopts=' 10*.py"], ]) } } @@ -126,7 +126,7 @@ pipeline { ["build", "scons"], ["flash", "cd scripts/ && ./reflash_internal_panda.py"], ["flash jungle", "cd board/jungle && ./flash.py --all"], - ["test", "cd tests/hitl && pytest --durations=0 2*.py [5-9]*.py"], + ["test", "cd tests/hitl && pytest -v --durations=0 -o 'addopts=' 10*.py"], ]) } } diff --git a/tests/hitl/10_can_errors.py b/tests/hitl/10_can_errors.py new file mode 100644 index 00000000000..013c24d6221 --- /dev/null +++ b/tests/hitl/10_can_errors.py @@ -0,0 +1,187 @@ +import time +import pytest + +from opendbc.car.structs import CarParams +from panda.python.spi import PandaSpiException +from panda.tests.hitl.conftest import SPEED_NORMAL +from panda.tests.hitl.helpers import clear_can_buffers + +SPEED_MISMATCH = 250 + + +def _set_speed_mismatch(p, panda_jungle): + """Set panda and jungle to different CAN speeds to induce errors.""" + for bus in range(3): + p.set_can_speed_kbps(bus, SPEED_NORMAL) + panda_jungle.set_can_speed_kbps(bus, SPEED_MISMATCH) + + +def _reset_speeds(p, panda_jungle): + """Restore matching CAN speeds and clear buffers.""" + for bus in range(3): + panda_jungle.set_can_speed_kbps(bus, SPEED_NORMAL) + try: + p.reset(reconnect=True) + for bus in range(3): + p.set_can_speed_kbps(bus, SPEED_NORMAL) + clear_can_buffers(p) + except PandaSpiException: + p.reconnect() + + +def _health_check(p): + """Get health, return None if SPI fails (lockup detected).""" + try: + return p.health() + except PandaSpiException: + return None + + +def _poll_health_during_errors(p, panda_jungle, duration, send_from_panda=False): + """Send CAN at mismatched speed while polling health. Returns stats.""" + msg = b"\xaa" * 8 + start = time.monotonic() + health_count = 0 + spi_failures = 0 + max_gap = 0.0 + gap_sum = 0.0 + gap_count = 0 + last_response = start + + while time.monotonic() - start < duration: + for bus in range(3): + if send_from_panda: + try: + p.can_send(0x456, msg, bus) + except PandaSpiException: + spi_failures += 1 + continue + else: + panda_jungle.can_send(0x123, msg, bus) + + h = _health_check(p) + now = time.monotonic() + gap = now - last_response + if gap > max_gap: + max_gap = gap + gap_sum += gap + gap_count += 1 + last_response = now + + if h is not None: + health_count += 1 + else: + spi_failures += 1 + + avg_gap = (gap_sum / gap_count) if gap_count else 0 + + return { + 'health_count': health_count, + 'spi_failures': spi_failures, + 'max_gap': max_gap, + 'avg_gap': avg_gap, + } + + +def _print_stats(stats): + mg = stats['max_gap'] * 1000 + ag = stats['avg_gap'] * 1000 + print(f"health={stats['health_count']} spi_fail={stats['spi_failures']} max_gap={mg:.1f}ms avg={ag:.1f}ms") + + +def _print_can_health(p): + for bus in range(3): + ch = p.can_health(bus) + print(f" bus {bus}: errs={ch['total_error_cnt']} irq0={ch['irq0_call_rate']} busoff={ch['bus_off_cnt']} resets={ch['can_core_reset_count']}") + + +@pytest.mark.panda_expect_can_error +@pytest.mark.timeout(60) +class TestCanErrorResilience: + """Verify panda stays responsive over SPI during CAN error conditions.""" + + def test_spi_responsive_during_can_errors(self, p, panda_jungle): + """Speed mismatch causes CAN error interrupts; SPI must stay responsive.""" + p.set_safety_mode(CarParams.SafetyModel.allOutput) + _set_speed_mismatch(p, panda_jungle) + + stats = _poll_health_during_errors(p, panda_jungle, duration=8.0) + _print_stats(stats) + + if stats['spi_failures'] == 0: + _print_can_health(p) + + assert stats['spi_failures'] == 0, f"SPI failed {stats['spi_failures']} times (panda locked up)" + assert stats['max_gap'] < 0.250, f"SPI gap too large: {stats['max_gap']*1000:.1f}ms" + + def test_spi_responsive_during_bus_off(self, p, panda_jungle): + """TX with no ACK -> bus-off -> must not block SPI.""" + p.set_safety_mode(CarParams.SafetyModel.allOutput) + _set_speed_mismatch(p, panda_jungle) + + stats = _poll_health_during_errors(p, panda_jungle, duration=5.0, send_from_panda=True) + _print_stats(stats) + + if stats['spi_failures'] == 0: + _print_can_health(p) + + assert stats['spi_failures'] == 0, f"SPI failed {stats['spi_failures']} times (panda locked up)" + assert stats['max_gap'] < 0.250, f"SPI gap too large: {stats['max_gap']*1000:.1f}ms" + + def test_sustained_error_storm(self, p, panda_jungle): + """Sustained CAN errors for 15s must not degrade SPI responsiveness.""" + p.set_safety_mode(CarParams.SafetyModel.allOutput) + _set_speed_mismatch(p, panda_jungle) + + stats = _poll_health_during_errors(p, panda_jungle, duration=15.0) + _print_stats(stats) + + assert stats['spi_failures'] == 0, f"SPI failed {stats['spi_failures']} times" + assert stats['max_gap'] < 0.250, f"SPI max gap: {stats['max_gap']*1000:.1f}ms" + assert stats['health_count'] > 100, f"Too few responses: {stats['health_count']}" + + def test_can_recovery_after_errors(self, p, panda_jungle): + """After CAN errors, normal communication must resume.""" + p.set_safety_mode(CarParams.SafetyModel.allOutput) + _set_speed_mismatch(p, panda_jungle) + + msg = b"\xdd" * 8 + for _ in range(100): + for bus in range(3): + panda_jungle.can_send(0xabc, msg, bus) + time.sleep(1.0) + + _reset_speeds(p, panda_jungle) + time.sleep(0.5) + + test_msg = b"\xee" * 8 + for bus in range(3): + panda_jungle.can_send(0x100, test_msg, bus) + time.sleep(0.5) + + msgs = p.can_recv() + buses_received = {m[2] for m in msgs if m[0] == 0x100} + print(f"Received on buses: {buses_received}") + assert len(buses_received) == 3, f"CAN didn't recover on all buses, only got: {buses_received}" + + def test_no_faults_during_errors(self, p, panda_jungle): + """CAN errors should not trigger interrupt rate faults.""" + p.set_safety_mode(CarParams.SafetyModel.allOutput) + _set_speed_mismatch(p, panda_jungle) + + msg = b"\xff" * 8 + for _ in range(200): + for bus in range(3): + panda_jungle.can_send(0x555, msg, bus) + time.sleep(0.001) + + time.sleep(2.0) + + h = _health_check(p) + assert h is not None, "Panda unresponsive after CAN errors" + print(f"faults: 0x{h['faults']:x}, interrupt_load: {h['interrupt_load']}") + for bus in range(3): + ch = p.can_health(bus) + print(f" bus {bus}: irq0_rate={ch['irq0_call_rate']}, irq1_rate={ch['irq1_call_rate']}") + + assert h['faults'] == 0, f"Faults during CAN errors: 0x{h['faults']:x}"