From 8001dcf2eb17a77b6322ecd706da638a926a9f54 Mon Sep 17 00:00:00 2001 From: Olivia Lee Date: Sun, 11 Aug 2024 15:40:51 -0700 Subject: [PATCH] backoff from offline servers in all outgoing federation requests Only marking M_UNKNOWN errors as a hard failure if they are in the standard error format is conservative, and might cause us to miss some offline servers. For example, a server might configure a load balancer to send a standard-looking { errcode: ..., ... } response when the backend is down, with a custom errcode. We wouldn't catch this. TODO: evaluate whether this comes up in practice by running the changes on computer.surgery --- src/service/sending.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/service/sending.rs b/src/service/sending.rs index 1c3b6ecb..07a72972 100644 --- a/src/service/sending.rs +++ b/src/service/sending.rs @@ -679,6 +679,10 @@ impl Service { debug!("Waiting for permit"); let permit = self.maximum_requests.acquire().await; debug!("Got permit"); + + let backoff_guard = + services().server_backoff.server_ready(destination)?; + let response = tokio::time::timeout( Duration::from_secs(2 * 60), server_server::send_request( @@ -692,9 +696,27 @@ impl Service { .map_err(|_| { warn!("Timeout waiting for server response"); Error::BadServerResponse("Timeout waiting for server response") - })?; + }) + .and_then(|result| result); drop(permit); + match &response { + Err(Error::Federation(_, error)) => { + if error.error_kind().is_some() { + // Other errors may occur during normal operation with a + // healthy server, so don't increment the failure counter. + backoff_guard.soft_failure(); + } else { + // The error wasn't in the expected format for matrix API + // responses. This almost certainly indicates the server + // is unhealthy or offline. + backoff_guard.hard_failure(); + } + } + Err(_) => backoff_guard.hard_failure(), + Ok(_) => backoff_guard.success(), + } + response }