backoff from offline servers in all outgoing federation requests

Only marking M_UNKNOWN errors as a hard failure if they are in the
standard error format is conservative, and might cause us to miss some
offline servers. For example, a server might configure a load balancer
to send a standard-looking { errcode: ..., ... } response when the
backend is down, with a custom errcode. We wouldn't catch this.

TODO: evaluate whether this comes up in practice by running the changes
on computer.surgery
This commit is contained in:
Olivia Lee 2024-08-11 15:40:51 -07:00
parent 9b22c9b40b
commit 8001dcf2eb
No known key found for this signature in database
GPG key ID: 54D568A15B9CD1F9

View file

@ -679,6 +679,10 @@ impl Service {
debug!("Waiting for permit");
let permit = self.maximum_requests.acquire().await;
debug!("Got permit");
let backoff_guard =
services().server_backoff.server_ready(destination)?;
let response = tokio::time::timeout(
Duration::from_secs(2 * 60),
server_server::send_request(
@ -692,9 +696,27 @@ impl Service {
.map_err(|_| {
warn!("Timeout waiting for server response");
Error::BadServerResponse("Timeout waiting for server response")
})?;
})
.and_then(|result| result);
drop(permit);
match &response {
Err(Error::Federation(_, error)) => {
if error.error_kind().is_some() {
// Other errors may occur during normal operation with a
// healthy server, so don't increment the failure counter.
backoff_guard.soft_failure();
} else {
// The error wasn't in the expected format for matrix API
// responses. This almost certainly indicates the server
// is unhealthy or offline.
backoff_guard.hard_failure();
}
}
Err(_) => backoff_guard.hard_failure(),
Ok(_) => backoff_guard.success(),
}
response
}