Skip to content

Commit 7a4bb93

Browse files
iduartgomezclaude
andauthored
fix(bench): improve benchmark stability and output formatting (#2597)
Co-authored-by: Claude <noreply@anthropic.com>
1 parent ccbc21b commit 7a4bb93

File tree

11 files changed

+1485
-329
lines changed

11 files changed

+1485
-329
lines changed

.github/workflows/benchmarks-extended.yml

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ jobs:
3232
name: Extended Benchmarks
3333
runs-on: ubuntu-latest
3434
timeout-minutes: 60 # Extended benchmarks take longer
35-
# Don't fail workflow if regressions detected
36-
continue-on-error: true
35+
# Job will fail on severe regressions (>25%) for nightly/main builds
36+
# PRs only get informational comments, not failures
3737

3838
env:
3939
CARGO_TARGET_DIR: ${{ github.workspace }}/target
@@ -120,26 +120,43 @@ jobs:
120120
echo "" >> $GITHUB_STEP_SUMMARY
121121
122122
# Run extended benchmark suite
123-
cargo bench --bench transport_extended --features bench 2>&1 | tee bench_output.txt || true
123+
# Use --color=never to avoid ANSI escape codes in output that break markdown
124+
cargo bench --bench transport_extended --features bench --color=never 2>&1 | tee bench_output.txt || true
124125
125126
# Parse results with structured script
126127
- name: Parse Benchmark Results
127128
id: parse_results
128129
run: |
129130
python3 --version
130131
131-
if python3 scripts/parse_bench_output.py bench_output.txt > parsed_output.txt 2>&1; then
132+
# Run parser - captures exit code
133+
# Exit codes: 0=ok, 1=regressions, 2=severe regressions
134+
set +e
135+
python3 scripts/parse_bench_output.py bench_output.txt > parsed_output.txt 2>&1
136+
PARSE_EXIT=$?
137+
set -e
138+
139+
if [ $PARSE_EXIT -eq 0 ]; then
132140
echo "regression_detected=false" >> $GITHUB_OUTPUT
133-
else
141+
echo "severe_regression=false" >> $GITHUB_OUTPUT
142+
elif [ $PARSE_EXIT -eq 1 ]; then
143+
echo "regression_detected=true" >> $GITHUB_OUTPUT
144+
echo "severe_regression=false" >> $GITHUB_OUTPUT
145+
elif [ $PARSE_EXIT -eq 2 ]; then
134146
echo "regression_detected=true" >> $GITHUB_OUTPUT
147+
echo "severe_regression=true" >> $GITHUB_OUTPUT
148+
else
149+
echo "regression_detected=false" >> $GITHUB_OUTPUT
150+
echo "severe_regression=false" >> $GITHUB_OUTPUT
135151
fi
136152
137153
# Append parsed results
138154
echo "" >> $GITHUB_STEP_SUMMARY
139155
cat bench_summary.md >> $GITHUB_STEP_SUMMARY 2>/dev/null || {
140156
echo "⚠️ Failed to parse results" >> $GITHUB_STEP_SUMMARY
141157
echo '```' >> $GITHUB_STEP_SUMMARY
142-
tail -100 bench_output.txt >> $GITHUB_STEP_SUMMARY
158+
# Strip ANSI escape codes for clean markdown output
159+
tail -100 bench_output.txt | sed 's/\x1b\[[0-9;]*m//g' >> $GITHUB_STEP_SUMMARY
143160
echo '```' >> $GITHUB_STEP_SUMMARY
144161
}
145162
@@ -211,7 +228,20 @@ jobs:
211228
target/criterion/**/report/index.html
212229
retention-days: 90 # Keep extended results longer
213230

214-
# Summary job that always succeeds
231+
# Fail nightly/main builds on severe throughput regressions (>25%)
232+
# PRs only get informational comments, not failures
233+
- name: Fail on Severe Regressions
234+
if: steps.parse_results.outputs.severe_regression == 'true' && github.event_name != 'pull_request'
235+
run: |
236+
echo "🚨 SEVERE THROUGHPUT REGRESSION DETECTED" >> $GITHUB_STEP_SUMMARY
237+
echo "" >> $GITHUB_STEP_SUMMARY
238+
echo "One or more throughput benchmarks regressed by more than 25%." >> $GITHUB_STEP_SUMMARY
239+
echo "This indicates a significant performance problem that should be investigated." >> $GITHUB_STEP_SUMMARY
240+
echo "" >> $GITHUB_STEP_SUMMARY
241+
echo "See the benchmark results above for details." >> $GITHUB_STEP_SUMMARY
242+
exit 1
243+
244+
# Summary job - reports status but doesn't block PRs
215245
benchmark-summary:
216246
name: Extended Benchmark Summary
217247
runs-on: ubuntu-latest
@@ -223,7 +253,12 @@ jobs:
223253
- name: Check Status
224254
run: |
225255
if [ "${{ needs.extended-benchmark.result }}" == "failure" ]; then
226-
echo "⚠️ Extended benchmarks detected regressions (non-blocking)"
256+
if [ "${{ github.event_name }}" == "pull_request" ]; then
257+
echo "⚠️ Extended benchmarks detected regressions (informational, not blocking PR)"
258+
else
259+
echo "🚨 Extended benchmarks failed - severe throughput regression detected (>25%)"
260+
echo "This indicates a significant performance problem that should be investigated."
261+
fi
227262
elif [ "${{ needs.extended-benchmark.result }}" == "cancelled" ]; then
228263
echo "⚠️ Extended benchmarks were cancelled"
229264
else

.github/workflows/benchmarks.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,8 @@ jobs:
135135
136136
# Run transport benchmarks (requires bench feature)
137137
# Uses transport_ci benchmark suite - fast, deterministic subset for CI
138-
cargo bench --bench transport_ci --features bench 2>&1 | tee bench_output.txt
138+
# Use --color=never to avoid ANSI escape codes in output that break markdown
139+
cargo bench --bench transport_ci --features bench --color=never 2>&1 | tee bench_output.txt
139140
140141
# Parse benchmark results with structured script
141142
- name: Parse Benchmark Results
@@ -159,7 +160,8 @@ jobs:
159160
cat bench_summary.md >> $GITHUB_STEP_SUMMARY 2>/dev/null || {
160161
echo "⚠️ Failed to parse benchmark results" >> $GITHUB_STEP_SUMMARY
161162
echo '```' >> $GITHUB_STEP_SUMMARY
162-
tail -50 bench_output.txt >> $GITHUB_STEP_SUMMARY
163+
# Strip ANSI escape codes for clean markdown output
164+
tail -50 bench_output.txt | sed 's/\x1b\[[0-9;]*m//g' >> $GITHUB_STEP_SUMMARY
163165
echo '```' >> $GITHUB_STEP_SUMMARY
164166
}
165167

crates/core/Cargo.toml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,19 @@ ureq = { version = "3.1", features = ["json"] }
127127
which = "8.0"
128128
regex = "1"
129129

130-
# CI-optimized benchmark suite (~8 min, deterministic)
130+
# CI-optimized benchmark suite (~5 min, deterministic)
131131
[[bench]]
132132
name = "transport_ci"
133133
harness = false
134134
required-features = ["bench"]
135135

136+
# Extended benchmark suite (~30-45 min, nightly)
137+
# Includes high-latency paths, packet loss, large transfers, and micro-benchmarks
138+
[[bench]]
139+
name = "transport_extended"
140+
harness = false
141+
required-features = ["bench"]
142+
136143
# LEDBAT validation benchmarks (manual, ~3-5 min)
137144
[[bench]]
138145
name = "transport_ledbat"

crates/core/benches/transport/blackbox.rs

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -91,28 +91,53 @@ pub fn bench_message_throughput(c: &mut Criterion) {
9191
(Arc::new(DashMap::new()) as Channels, message)
9292
},
9393
|(channels, message)| async move {
94-
// Create peers
95-
let (peer_a_pub, mut peer_a, peer_a_addr) =
96-
create_mock_peer(PacketDropPolicy::ReceiveAll, channels.clone())
97-
.await
98-
.unwrap();
94+
// Create peers - handle errors gracefully
95+
let (peer_a_pub, mut peer_a, peer_a_addr) = match create_mock_peer(
96+
PacketDropPolicy::ReceiveAll,
97+
channels.clone(),
98+
)
99+
.await
100+
{
101+
Ok(p) => p,
102+
Err(e) => {
103+
eprintln!("throughput peer_a creation failed: {:?}", e);
104+
return;
105+
}
106+
};
99107
let (peer_b_pub, mut peer_b, peer_b_addr) =
100-
create_mock_peer(PacketDropPolicy::ReceiveAll, channels)
101-
.await
102-
.unwrap();
108+
match create_mock_peer(PacketDropPolicy::ReceiveAll, channels).await {
109+
Ok(p) => p,
110+
Err(e) => {
111+
eprintln!("throughput peer_b creation failed: {:?}", e);
112+
return;
113+
}
114+
};
103115

104116
// Connect
105117
let (conn_a_inner, conn_b_inner) = futures::join!(
106118
peer_a.connect(peer_b_pub, peer_b_addr),
107119
peer_b.connect(peer_a_pub, peer_a_addr),
108120
);
109121
let (conn_a, conn_b) = futures::join!(conn_a_inner, conn_b_inner);
110-
let (mut conn_a, mut conn_b) = (conn_a.unwrap(), conn_b.unwrap());
122+
let (mut conn_a, mut conn_b) = match (conn_a, conn_b) {
123+
(Ok(a), Ok(b)) => (a, b),
124+
(Err(e), _) | (_, Err(e)) => {
125+
eprintln!("throughput connection failed: {:?}", e);
126+
return;
127+
}
128+
};
111129

112130
// Send and receive message (this is what we're measuring)
113-
conn_a.send(message).await.unwrap();
114-
let received: Vec<u8> = conn_b.recv().await.unwrap();
115-
std_black_box(received);
131+
if let Err(e) = conn_a.send(message).await {
132+
eprintln!("throughput send failed: {:?}", e);
133+
return;
134+
}
135+
match conn_b.recv().await {
136+
Ok(received) => {
137+
std_black_box(received);
138+
}
139+
Err(e) => eprintln!("throughput recv failed: {:?}", e),
140+
}
116141
},
117142
BatchSize::SmallInput,
118143
);

crates/core/benches/transport/common.rs

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,16 @@ pub struct PeerPair {
5858
pub peer_b_pub: TransportPublicKey,
5959
pub peer_b: OutboundConnectionHandler<MockSocket>,
6060
pub peer_b_addr: SocketAddr,
61+
/// Keep channels alive - dropping this closes MockSocket inbound channels
62+
#[allow(dead_code)]
63+
channels: Channels,
6164
}
6265

6366
/// Connected peer pair ready for data transfer
6467
///
65-
/// Encapsulates both connections and their peer handlers. The peer handlers
66-
/// MUST be kept alive - dropping them closes the `inbound_packet_sender`
67-
/// channel which causes `ConnectionClosed` errors.
68+
/// Encapsulates both connections and their peer handlers. The channels map
69+
/// MUST be kept alive - dropping it closes the MockSocket inbound channels
70+
/// which causes the listener tasks to exit and `ConnectionClosed` errors.
6871
///
6972
/// ## Example
7073
///
@@ -76,12 +79,15 @@ pub struct PeerPair {
7679
pub struct ConnectedPeerPair {
7780
pub conn_a: PeerConnection<MockSocket>,
7881
pub conn_b: PeerConnection<MockSocket>,
79-
/// Must keep peer_a alive - it holds the inbound_packet_sender channel
82+
/// Must keep peer_a alive for send_queue channel
8083
#[allow(dead_code)]
8184
peer_a: OutboundConnectionHandler<MockSocket>,
82-
/// Must keep peer_b alive - it holds the inbound_packet_sender channel
85+
/// Must keep peer_b alive for send_queue channel
8386
#[allow(dead_code)]
8487
peer_b: OutboundConnectionHandler<MockSocket>,
88+
/// Keep channels alive - dropping this closes MockSocket inbound channels
89+
#[allow(dead_code)]
90+
channels: Channels,
8591
}
8692

8793
impl PeerPair {
@@ -102,6 +108,7 @@ impl PeerPair {
102108
conn_b,
103109
peer_a: self.peer_a,
104110
peer_b: self.peer_b,
111+
channels: self.channels,
105112
}
106113
}
107114
}
@@ -112,18 +119,32 @@ impl ConnectedPeerPair {
112119
/// Performs `iterations` send/receive cycles to allow LEDBAT congestion
113120
/// control to reach steady state before measurements begin.
114121
///
122+
/// Returns the number of successful warmup iterations. If warmup fails
123+
/// early, benchmarks can still proceed (the connection may be less stable).
124+
///
115125
/// ## Example
116126
///
117127
/// ```rust,ignore
118128
/// let mut peers = create_connected_peers().await;
119-
/// peers.warmup(5, 65536).await; // 5 x 64KB warmup transfers
129+
/// let completed = peers.warmup(5, 65536).await; // 5 x 64KB warmup transfers
120130
/// ```
121-
pub async fn warmup(&mut self, iterations: usize, message_size: usize) {
122-
for _ in 0..iterations {
131+
pub async fn warmup(&mut self, iterations: usize, message_size: usize) -> usize {
132+
let mut completed = 0;
133+
for i in 0..iterations {
123134
let msg = vec![0xABu8; message_size];
124-
self.conn_a.send(msg).await.expect("warmup send");
125-
let _: Vec<u8> = self.conn_b.recv().await.expect("warmup recv");
135+
if let Err(e) = self.conn_a.send(msg).await {
136+
eprintln!("Warmup send {} failed: {:?}", i, e);
137+
break;
138+
}
139+
match self.conn_b.recv().await {
140+
Ok(_) => completed += 1,
141+
Err(e) => {
142+
eprintln!("Warmup recv {} failed: {:?}", i, e);
143+
break;
144+
}
145+
}
126146
}
147+
completed
127148
}
128149

129150
/// Get mutable references to both connections
@@ -147,7 +168,7 @@ pub async fn create_peer_pair(channels: Channels) -> PeerPair {
147168
.expect("create peer A");
148169

149170
let (peer_b_pub, peer_b, peer_b_addr) =
150-
create_mock_peer(PacketDropPolicy::ReceiveAll, channels)
171+
create_mock_peer(PacketDropPolicy::ReceiveAll, channels.clone())
151172
.await
152173
.expect("create peer B");
153174

@@ -158,6 +179,7 @@ pub async fn create_peer_pair(channels: Channels) -> PeerPair {
158179
peer_b_pub,
159180
peer_b,
160181
peer_b_addr,
182+
channels,
161183
}
162184
}
163185

@@ -174,7 +196,7 @@ pub async fn create_peer_pair_with_delay(channels: Channels, delay: Duration) ->
174196
let (peer_b_pub, peer_b, peer_b_addr) = create_mock_peer_with_delay(
175197
PacketDropPolicy::ReceiveAll,
176198
PacketDelayPolicy::Fixed(delay),
177-
channels,
199+
channels.clone(),
178200
)
179201
.await
180202
.expect("create peer B");
@@ -186,6 +208,7 @@ pub async fn create_peer_pair_with_delay(channels: Channels, delay: Duration) ->
186208
peer_b_pub,
187209
peer_b,
188210
peer_b_addr,
211+
channels,
189212
}
190213
}
191214

0 commit comments

Comments
 (0)