Skip to content

Commit 2db0389

Browse files
sanityclaude
andauthored
fix: use Kleinberg-optimal gap_target for bootstrap connections (#3503)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent de08c91 commit 2db0389

File tree

2 files changed

+153
-35
lines changed

2 files changed

+153
-35
lines changed

crates/core/src/operations/connect.rs

Lines changed: 144 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2179,49 +2179,93 @@ pub(crate) async fn join_ring_request(
21792179
}
21802180

21812181
let own = op_manager.ring.connection_manager.own_location();
2182-
// Use the joiner's own location as the desired_location for the connect request.
2183-
//
2184-
// History: PR #2907 fixed a critical issue where using `gateway.location()` caused
2185-
// 99.7% of join requests to converge on a single acceptor. That PR intended to use
2186-
// the joiner's own location but implemented `Location::random()` instead, which
2187-
// broke simulation determinism for certain seeds (issues #3028, #3030).
2188-
//
2189-
// Using `own.location()` is correct because:
2190-
// - Each joiner has a unique location → requests spread across gateway neighbors
2191-
// - The gateway routes toward the joiner's ring neighborhood → better initial placement
2192-
// - Deterministic in simulation tests (same seed → same location → same topology)
2193-
// - Falls back to random only if own address is unknown (NAT before ObservedAddress)
21942182
let base_location = own.location().unwrap_or_else(Location::random);
21952183

21962184
// Capture now once so the jitter failure counter and any other time-sensitive
21972185
// calls in this function share a consistent timestamp.
21982186
let now = Instant::now();
21992187

2200-
// Apply location jitter on retry to explore different ring regions.
2188+
let current_connections = op_manager.ring.connection_manager.connection_count();
2189+
2190+
// Helper: apply jitter to base_location for NAT-compatible acceptor exploration.
22012191
// After consecutive hole-punch failures, the same location routes to the same
2202-
// acceptors. Jittering the target location causes routing to converge on
2203-
// different ring peers, increasing the chance of finding a NAT-compatible acceptor.
2204-
let failures = op_manager
2205-
.ring
2206-
.connection_manager
2207-
.increment_connect_jitter_failures(now);
2208-
let desired_location = if failures > 1 {
2209-
// Jitter magnitude: 0.05 * 2^(failures-2), capped at 0.25
2210-
let magnitude = (0.05 * (1u32 << (failures - 2).min(3)) as f64).min(0.25);
2211-
// Generate uniform random offset in [-magnitude, +magnitude]
2212-
let uniform_01 = (GlobalRng::random_u64() as f64) / (u64::MAX as f64);
2213-
let offset = magnitude * (2.0 * uniform_01 - 1.0);
2214-
let jittered = (base_location.as_f64() + offset).rem_euclid(1.0);
2215-
tracing::info!(
2216-
failures,
2217-
magnitude,
2218-
base = %base_location,
2219-
jittered,
2220-
"connect: applying location jitter after consecutive failures"
2221-
);
2222-
Location::new(jittered)
2192+
// acceptors. Jittering explores different ring regions.
2193+
let apply_bootstrap_jitter = |base: Location| -> Location {
2194+
let failures = op_manager
2195+
.ring
2196+
.connection_manager
2197+
.increment_connect_jitter_failures(now);
2198+
if failures > 1 {
2199+
// Jitter magnitude: 0.05 * 2^(failures-2), capped at 0.25
2200+
let magnitude = (0.05 * (1u32 << (failures - 2).min(3)) as f64).min(0.25);
2201+
let uniform_01 = (GlobalRng::random_u64() as f64) / (u64::MAX as f64);
2202+
let offset = magnitude * (2.0 * uniform_01 - 1.0);
2203+
let jittered = (base.as_f64() + offset).rem_euclid(1.0);
2204+
tracing::info!(
2205+
failures,
2206+
magnitude,
2207+
base = %base,
2208+
jittered,
2209+
"connect: applying location jitter after consecutive failures"
2210+
);
2211+
Location::new(jittered)
2212+
} else {
2213+
base
2214+
}
2215+
};
2216+
2217+
// Choose desired_location based on whether we have enough connections for
2218+
// gap-based targeting (Kleinberg-optimal) or need bootstrap targeting.
2219+
//
2220+
// With connections: use gap_target to find the largest gap in our connection
2221+
// distribution in log-distance space and target its midpoint. This produces
2222+
// Kleinberg-optimal 1/d-distributed connections.
2223+
//
2224+
// Without connections (true bootstrap): target own location with jitter.
2225+
// Own-location targeting spreads requests across gateway neighbors (each
2226+
// joiner has a unique location). Jitter after failures explores different
2227+
// ring regions to find NAT-compatible acceptors.
2228+
// Minimum connections needed for meaningful gap analysis. Independent of
2229+
// min_connections because this is about statistical sample size for the 1/d
2230+
// distribution, not connection targets. 3 points are sufficient to identify
2231+
// the largest gap in log-distance space.
2232+
const GAP_TARGET_THRESHOLD: usize = 3;
2233+
let desired_location = if current_connections >= GAP_TARGET_THRESHOLD {
2234+
// Use gap_target: find the largest gap in our current connections
2235+
// and target its midpoint for Kleinberg-optimal placement.
2236+
if let Some(my_loc) = own.location() {
2237+
let neighbor_distances: Vec<f64> = op_manager
2238+
.ring
2239+
.connection_manager
2240+
.location_for_all_peers()
2241+
.into_iter()
2242+
.map(|peer_loc| my_loc.distance(peer_loc).as_f64())
2243+
.collect();
2244+
if neighbor_distances.len() >= GAP_TARGET_THRESHOLD {
2245+
let target = crate::topology::small_world_rand::gap_target(
2246+
my_loc,
2247+
neighbor_distances.into_iter(),
2248+
);
2249+
tracing::info!(
2250+
current_connections,
2251+
own_location = %my_loc,
2252+
target = %target,
2253+
distance = my_loc.distance(target).as_f64(),
2254+
"connect: using gap_target for Kleinberg-optimal placement"
2255+
);
2256+
target
2257+
} else {
2258+
// Fewer unique neighbor locations than connections (e.g. sybil
2259+
// masking collapses peers to same ring location). Fall back to
2260+
// jitter-based exploration to avoid retrying the same acceptors.
2261+
apply_bootstrap_jitter(base_location)
2262+
}
2263+
} else {
2264+
apply_bootstrap_jitter(base_location)
2265+
}
22232266
} else {
2224-
base_location
2267+
// Bootstrap mode: target own location with jitter after failures.
2268+
apply_bootstrap_jitter(base_location)
22252269
};
22262270
let ttl = op_manager
22272271
.ring
@@ -4389,4 +4433,69 @@ mod tests {
43894433
"unconnected peer should not be in the visited filter"
43904434
);
43914435
}
4436+
4437+
/// Verify that gap_target produces shorter connections than own-location+jitter.
4438+
///
4439+
/// This is a regression test for the bootstrap topology bug: before the fix,
4440+
/// join_ring_request always targeted the joiner's own location with up to ±0.25
4441+
/// jitter, producing median connection distance ~0.12 instead of the Kleinberg-
4442+
/// optimal ~0.02. Now, with ≥3 existing connections, gap_target is used instead.
4443+
#[test]
4444+
fn test_gap_target_produces_shorter_connections_than_jitter() {
4445+
use crate::topology::small_world_rand::gap_target;
4446+
4447+
let _guard = GlobalRng::seed_guard(42);
4448+
4449+
let my_location = Location::new(0.5);
4450+
4451+
// Simulate a peer with 5 existing connections at various distances
4452+
let existing_connections = vec![
4453+
Location::new(0.501), // very close
4454+
Location::new(0.51), // close
4455+
Location::new(0.55), // nearby
4456+
Location::new(0.7), // medium
4457+
Location::new(0.9), // far
4458+
];
4459+
4460+
let distances: Vec<f64> = existing_connections
4461+
.iter()
4462+
.map(|loc| my_location.distance(*loc).as_f64())
4463+
.collect();
4464+
4465+
// Generate 100 gap_target samples and 100 jitter samples
4466+
let mut gap_target_distances = Vec::new();
4467+
let mut jitter_distances = Vec::new();
4468+
4469+
for _ in 0..100 {
4470+
let target = gap_target(my_location, distances.iter().copied());
4471+
gap_target_distances.push(my_location.distance(target).as_f64());
4472+
4473+
// Simulate jitter with failures=3 (magnitude=0.10)
4474+
let magnitude = 0.10;
4475+
let uniform_01 = (GlobalRng::random_u64() as f64) / (u64::MAX as f64);
4476+
let offset = magnitude * (2.0 * uniform_01 - 1.0);
4477+
let jittered = (my_location.as_f64() + offset).rem_euclid(1.0);
4478+
jitter_distances.push(my_location.distance(Location::new(jittered)).as_f64());
4479+
}
4480+
4481+
gap_target_distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
4482+
jitter_distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
4483+
4484+
let gap_median = gap_target_distances[50];
4485+
let jitter_median = jitter_distances[50];
4486+
4487+
// gap_target should produce significantly shorter median distances
4488+
// because it targets gaps in the 1/d distribution (Kleinberg-optimal),
4489+
// while jitter produces roughly uniform distances around the peer.
4490+
assert!(
4491+
gap_median < jitter_median,
4492+
"gap_target median ({gap_median:.4}) should be shorter than jitter median ({jitter_median:.4})"
4493+
);
4494+
4495+
// gap_target median should be well under 0.05 (typically ~0.001-0.01)
4496+
assert!(
4497+
gap_median < 0.05,
4498+
"gap_target median ({gap_median:.4}) should be < 0.05 (Kleinberg-optimal produces short connections)"
4499+
);
4500+
}
43924501
}

crates/core/src/ring/connection_manager.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,15 @@ impl ConnectionManager {
731731
}
732732
}
733733

734+
/// Returns all ring-connected peer locations.
735+
pub fn location_for_all_peers(&self) -> Vec<Location> {
736+
self.connections_by_location
737+
.read()
738+
.keys()
739+
.copied()
740+
.collect()
741+
}
742+
734743
/// Returns our own socket address if set.
735744
pub fn get_own_addr(&self) -> Option<SocketAddr> {
736745
*self.own_addr.lock()

0 commit comments

Comments
 (0)