Skip to content

Commit 55151e6

Browse files
authored
Aligning timeouts to reflect real-world scenarios (#399)
* Changing error messages in case of node disconnection; * cleaning up unused properties * removing request_limit and the logic attached to that since we don't actually handle multiple in-flight requests to binary-port * Removing the possibility to define "infinite" as a valid retry amount in node client connector since it can lead to deadlocks. That allowed removal of RpcServerConfigTarget, NodeClientConfigTarget, ExponentialBackoffConfigTarget and MaxAttemptsTarget since we don't need custom code for deserialization of the config file. * Added some metrics to track unwanted events (timeouts on connection/sending/receiving data from binary port, detecting response id mismatch) * Changed bucket definitions in RESPONSE_TIME_MS_BUCKETS constant * Added MAX_COMPONENT_STARTUP_TIMEOUT_SECS guard in case one of the components hangs on startup * Making keepalive loop use the standard mechanism of sending messages to gain retries and id-checks * Aligning message_timeout_secs
1 parent 7a22f99 commit 55151e6

File tree

14 files changed

+194
-375
lines changed

14 files changed

+194
-375
lines changed

Cargo.lock

Lines changed: 12 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,8 @@ address = '0.0.0.0:28101'
208208
max_message_size_bytes = 4_194_304
209209
request_limit = 3
210210
request_buffer_size = 16
211-
message_timeout_secs = 30
212-
client_access_timeout_secs = 2
211+
message_timeout_secs = 10
212+
client_access_timeout_secs = 10
213213

214214
[rpc_server.speculative_exec_server]
215215
enable_server = true

metrics/src/rpc.rs

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ const RESPONSE_SIZE_BUCKETS: &[f64; 8] = &[
88
5e+2_f64, 1e+3_f64, 2e+3_f64, 5e+3_f64, 5e+4_f64, 5e+5_f64, 5e+6_f64, 5e+7_f64,
99
];
1010

11-
const RESPONSE_TIME_MS_BUCKETS: &[f64; 8] = &[
12-
1_f64, 5_f64, 10_f64, 30_f64, 50_f64, 100_f64, 200_f64, 300_f64,
11+
const RESPONSE_TIME_MS_BUCKETS: &[f64; 9] = &[
12+
1_f64, 5_f64, 10_f64, 30_f64, 50_f64, 100_f64, 300_f64, 1000_f64, 3000_f64,
1313
];
1414

1515
static ENDPOINT_CALLS: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -24,6 +24,21 @@ static ENDPOINT_CALLS: Lazy<IntCounterVec> = Lazy::new(|| {
2424
counter
2525
});
2626

27+
static TIMEOUT_COUNTERS: Lazy<IntCounterVec> = Lazy::new(|| {
28+
let counter = IntCounterVec::new(
29+
Opts::new(
30+
"rpc_server_timeout_counts",
31+
"Counters for how many of the requests failed due to internal timeout",
32+
),
33+
&["timer"],
34+
)
35+
.unwrap();
36+
REGISTRY
37+
.register(Box::new(counter.clone()))
38+
.expect("cannot register metric");
39+
counter
40+
});
41+
2742
static RESPONSE_TIMES_MS: Lazy<HistogramVec> = Lazy::new(|| {
2843
let histogram = HistogramVec::new(
2944
HistogramOpts {
@@ -56,6 +71,18 @@ static RECONNECT_TIMES_MS: Lazy<Histogram> = Lazy::new(|| {
5671
histogram
5772
});
5873

74+
static MISMATCHED_IDS: Lazy<IntGauge> = Lazy::new(|| {
75+
let counter = IntGauge::new(
76+
"rpc_server_mismatched_ids",
77+
"Number of mismatched ID events observed in responses from binary port",
78+
)
79+
.expect("rpc_server_mismatched_ids metric can't be created");
80+
REGISTRY
81+
.register(Box::new(counter.clone()))
82+
.expect("cannot register metric");
83+
counter
84+
});
85+
5986
static DISCONNECT_EVENTS: Lazy<IntGauge> = Lazy::new(|| {
6087
let counter = IntGauge::new(
6188
"rpc_server_disconnects",
@@ -108,3 +135,11 @@ pub fn register_request_size(method: &str, payload_size: f64) {
108135
.with_label_values(&[method])
109136
.observe(payload_size);
110137
}
138+
139+
pub fn register_timeout(timer_name: &str) {
140+
TIMEOUT_COUNTERS.with_label_values(&[timer_name]).inc();
141+
}
142+
143+
pub fn register_mismatched_id() {
144+
MISMATCHED_IDS.inc();
145+
}

resources/example_configs/EXAMPLE_NCTL_CONFIG.toml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,8 @@ cors_origin = ""
1818
ip_address = "0.0.0.0"
1919
port = 28102
2020
max_message_size_bytes = 4194304
21-
request_limit = 3
22-
request_buffer_size = 16
23-
message_timeout_secs = 30
24-
client_access_timeout_secs = 2
21+
message_timeout_secs = 10
22+
client_access_timeout_secs = 10
2523
keepalive_timeout_ms = 10_000
2624

2725
[rpc_server.node_client.exponential_backoff]

resources/example_configs/EXAMPLE_NCTL_POSTGRES_CONFIG.toml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,8 @@ cors_origin = ""
1818
ip_address = "0.0.0.0"
1919
port = 28102
2020
max_message_size_bytes = 4194304
21-
request_limit = 3
22-
request_buffer_size = 16
23-
message_timeout_secs = 30
24-
client_access_timeout_secs = 2
21+
message_timeout_secs = 10
22+
client_access_timeout_secs = 10
2523
keepalive_timeout_ms = 10_000
2624

2725
[rpc_server.node_client.exponential_backoff]

resources/example_configs/EXAMPLE_NODE_CONFIG.toml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,8 @@ cors_origin = ""
1818
ip_address = "3.20.57.210"
1919
port = 7777
2020
max_message_size_bytes = 4194304
21-
request_limit = 10
22-
request_buffer_size = 50
23-
message_timeout_secs = 60
24-
client_access_timeout_secs = 60
21+
message_timeout_secs = 10
22+
client_access_timeout_secs = 10
2523
keepalive_timeout_ms = 10_000
2624

2725
[rpc_server.node_client.exponential_backoff]

resources/example_configs/default_debian_config.toml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,10 @@ ip_address = '127.0.0.1'
7171
port = 7779
7272
# Maximum size of a message in bytes.
7373
max_message_size_bytes = 4_194_304
74-
# Maximum number of in-flight node requests.
75-
request_limit = 3
76-
# Number of node requests that can be buffered.
77-
request_buffer_size = 16
7874
# Timeout for a node request in seconds.
79-
message_timeout_secs = 30
75+
message_timeout_secs = 10
8076
# Timeout specifying how long to wait for binary port client to be available.
81-
client_access_timeout_secs = 2
77+
client_access_timeout_secs = 10
8278
# The amount of time in milliseconds to wait between sending keepalive requests.
8379
keepalive_timeout_ms = 10_000
8480

resources/example_configs/default_rpc_only_config.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@ request_limit = 3
7676
# Number of node requests that can be buffered.
7777
request_buffer_size = 16
7878
# Timeout for a node request in seconds.
79-
message_timeout_secs = 30
79+
message_timeout_secs = 10
8080
# Timeout specifying how long to wait for binary port client to be available.
81-
client_access_timeout_secs = 2
81+
client_access_timeout_secs = 10
8282
# The amount of time in milliseconds to wait between sending keepalive requests.
8383
keepalive_timeout_ms = 10_000
8484

resources/example_configs/default_sse_only_config.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ port = 18888
3232
max_concurrent_requests = 50
3333
max_requests_per_second = 50
3434

35-
[admin_server]
35+
[admin_api_server]
3636
enable_server = true
3737
port = 18887
3838
max_concurrent_requests = 1

0 commit comments

Comments
 (0)