diff --git a/src/observability.rs b/src/observability.rs index f2503f61..6b5330ab 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -290,6 +290,9 @@ pub(crate) struct Metrics { /// Number of entries in an /// [`OnDemandHashMap`](crate::utils::on_demand_hashmap::OnDemandHashMap) on_demand_hashmap_size: opentelemetry::metrics::Gauge, + + /// Number of known remote servers in each state (online or offline) + remote_server_count: opentelemetry::metrics::Gauge, } impl Metrics { @@ -345,11 +348,17 @@ impl Metrics { .with_description("Number of entries in OnDemandHashMap") .init(); + let remote_server_count = meter + .u64_gauge("remote_server_count") + .with_description("Number of known remote servers") + .init(); + Metrics { otel_state: (registry, provider), http_requests_histogram, lookup, on_demand_hashmap_size, + remote_server_count, } } @@ -384,6 +393,18 @@ impl Metrics { &[KeyValue::new("name", name)], ); } + + /// Record number of remote servers marked online or offline. + pub(crate) fn record_remote_server_count( + &self, + online_count: u64, + offline_count: u64, + ) { + self.remote_server_count + .record(online_count, &[KeyValue::new("state", "online")]); + self.remote_server_count + .record(offline_count, &[KeyValue::new("state", "offline")]); + } } /// Track HTTP metrics by converting this into an [`axum`] layer diff --git a/src/service/server_backoff.rs b/src/service/server_backoff.rs index 0cfe39a7..6326ce6d 100644 --- a/src/service/server_backoff.rs +++ b/src/service/server_backoff.rs @@ -1,6 +1,6 @@ use std::{ collections::HashMap, - sync::{Arc, RwLock}, + sync::{Arc, Mutex, RwLock}, time::{Duration, Instant}, }; @@ -8,7 +8,7 @@ use rand::{thread_rng, Rng}; use ruma::{OwnedServerName, ServerName}; use tracing::{debug, info, instrument}; -use crate::{services, Error, Result}; +use crate::{observability::METRICS, services, Error, Result}; /// Service to handle backing off requests to offline servers. /// @@ -36,6 +36,8 @@ use crate::{services, Error, Result}; /// the server is only briefly offline. pub(crate) struct Service { servers: RwLock>>>, + + server_counts: Mutex, } /// Guard to record the result of an attempted request to a server. @@ -75,10 +77,27 @@ struct BackoffState { jitter_coeff: f64, } +/// State transitions for a single server +#[derive(Debug, Copy, Clone)] +enum Transition { + /// A new server, marked as online by default + New, + OnlineToOffline, + OfflineToOnline, +} + +/// Counts of known servers in each state, used for metrics +#[derive(Debug, Copy, Clone, Default)] +struct ServerCounts { + online_count: u64, + offline_count: u64, +} + impl Service { pub(crate) fn build() -> Arc { Arc::new(Service { servers: RwLock::default(), + server_counts: Mutex::default(), }) } @@ -113,6 +132,45 @@ impl Service { }) } + fn record_transition( + &self, + server_name: &ServerName, + transition: Transition, + ) { + let mut counts = self.server_counts.lock().unwrap(); + + match transition { + Transition::New => { + info!( + %server_name, + "new remote server, marked as online by default" + ); + counts.online_count += 1; + } + Transition::OnlineToOffline => { + info!( + %server_name, + "remote server transitioned from online to offline" + ); + counts.online_count -= 1; + counts.offline_count += 1; + } + Transition::OfflineToOnline => { + info!( + %server_name, + "remote server transitioned from offline to online" + ); + counts.offline_count -= 1; + counts.online_count += 1; + } + } + + METRICS.record_remote_server_count( + counts.online_count, + counts.offline_count, + ); + } + fn server_state( &self, server_name: &ServerName, @@ -134,6 +192,7 @@ impl Service { server_name.to_owned(), ))); servers.insert(server_name.to_owned(), Arc::clone(&state)); + self.record_transition(server_name, Transition::New); state } } @@ -193,9 +252,12 @@ impl BackoffGuard { state.failure_count = 0; - if state.is_online() != was_online { - let server_name = &state.server_name; - info!(%server_name, "server transitioned from offline to online"); + // Server is always online after setting failure_count = 0 + if !was_online { + services().server_backoff.record_transition( + &state.server_name, + Transition::OfflineToOnline, + ); } } @@ -223,8 +285,10 @@ impl BackoffGuard { ); if state.is_online() != was_online { - let server_name = &state.server_name; - info!(%server_name, "server transitioned from online to offline"); + services().server_backoff.record_transition( + &state.server_name, + Transition::OnlineToOffline, + ); } } }