metrics for online and offline remote server count

This commit is contained in:
Olivia Lee 2024-08-23 15:45:10 -07:00
parent 5b6aaa19b9
commit 56f025cb47
No known key found for this signature in database
GPG key ID: 54D568A15B9CD1F9
2 changed files with 92 additions and 7 deletions

View file

@ -290,6 +290,9 @@ pub(crate) struct Metrics {
/// Number of entries in an /// Number of entries in an
/// [`OnDemandHashMap`](crate::utils::on_demand_hashmap::OnDemandHashMap) /// [`OnDemandHashMap`](crate::utils::on_demand_hashmap::OnDemandHashMap)
on_demand_hashmap_size: opentelemetry::metrics::Gauge<u64>, on_demand_hashmap_size: opentelemetry::metrics::Gauge<u64>,
/// Number of known remote servers in each state (online or offline)
remote_server_count: opentelemetry::metrics::Gauge<u64>,
} }
impl Metrics { impl Metrics {
@ -345,11 +348,17 @@ impl Metrics {
.with_description("Number of entries in OnDemandHashMap") .with_description("Number of entries in OnDemandHashMap")
.init(); .init();
let remote_server_count = meter
.u64_gauge("remote_server_count")
.with_description("Number of known remote servers")
.init();
Metrics { Metrics {
otel_state: (registry, provider), otel_state: (registry, provider),
http_requests_histogram, http_requests_histogram,
lookup, lookup,
on_demand_hashmap_size, on_demand_hashmap_size,
remote_server_count,
} }
} }
@ -384,6 +393,18 @@ impl Metrics {
&[KeyValue::new("name", name)], &[KeyValue::new("name", name)],
); );
} }
/// Publish the current number of known remote servers, split by state.
///
/// Both values are written to the `remote_server_count` gauge and are
/// told apart by a `state` attribute, which is `"online"` for
/// `online_count` and `"offline"` for `offline_count`.
pub(crate) fn record_remote_server_count(
    &self,
    online_count: u64,
    offline_count: u64,
) {
    // One gauge sample per state, online first.
    let samples = [("online", online_count), ("offline", offline_count)];
    for (state, count) in samples {
        self.remote_server_count
            .record(count, &[KeyValue::new("state", state)]);
    }
}
} }
/// Track HTTP metrics by converting this into an [`axum`] layer /// Track HTTP metrics by converting this into an [`axum`] layer

View file

@ -1,6 +1,6 @@
use std::{ use std::{
collections::HashMap, collections::HashMap,
sync::{Arc, RwLock}, sync::{Arc, Mutex, RwLock},
time::{Duration, Instant}, time::{Duration, Instant},
}; };
@ -8,7 +8,7 @@ use rand::{thread_rng, Rng};
use ruma::{OwnedServerName, ServerName}; use ruma::{OwnedServerName, ServerName};
use tracing::{debug, info, instrument}; use tracing::{debug, info, instrument};
use crate::{services, Error, Result}; use crate::{observability::METRICS, services, Error, Result};
/// Service to handle backing off requests to offline servers. /// Service to handle backing off requests to offline servers.
/// ///
@ -36,6 +36,8 @@ use crate::{services, Error, Result};
/// the server is only briefly offline. /// the server is only briefly offline.
pub(crate) struct Service { pub(crate) struct Service {
servers: RwLock<HashMap<OwnedServerName, Arc<RwLock<BackoffState>>>>, servers: RwLock<HashMap<OwnedServerName, Arc<RwLock<BackoffState>>>>,
server_counts: Mutex<ServerCounts>,
} }
/// Guard to record the result of an attempted request to a server. /// Guard to record the result of an attempted request to a server.
@ -75,10 +77,27 @@ struct BackoffState {
jitter_coeff: f64, jitter_coeff: f64,
} }
/// State transitions for a single server
///
/// Each variant describes one change that `Service::record_transition` logs
/// and applies to the online/offline server counts.
#[derive(Debug, Copy, Clone)]
enum Transition {
/// A new server, marked as online by default
New,
/// A known server went from online to offline: one less online server,
/// one more offline server.
OnlineToOffline,
/// A known server went from offline back to online: one less offline
/// server, one more online server.
OfflineToOnline,
}
/// Counts of known servers in each state, used for metrics
///
/// Both counts start at zero (via `Default`) and are adjusted whenever a
/// server transition is recorded.
#[derive(Debug, Copy, Clone, Default)]
struct ServerCounts {
/// Number of known servers currently counted as online.
online_count: u64,
/// Number of known servers currently counted as offline.
offline_count: u64,
}
impl Service { impl Service {
pub(crate) fn build() -> Arc<Service> { pub(crate) fn build() -> Arc<Service> {
Arc::new(Service { Arc::new(Service {
servers: RwLock::default(), servers: RwLock::default(),
server_counts: Mutex::default(),
}) })
} }
@ -113,6 +132,45 @@ impl Service {
}) })
} }
fn record_transition(
&self,
server_name: &ServerName,
transition: Transition,
) {
let mut counts = self.server_counts.lock().unwrap();
match transition {
Transition::New => {
info!(
%server_name,
"new remote server, marked as online by default"
);
counts.online_count += 1;
}
Transition::OnlineToOffline => {
info!(
%server_name,
"remote server transitioned from online to offline"
);
counts.online_count -= 1;
counts.offline_count += 1;
}
Transition::OfflineToOnline => {
info!(
%server_name,
"remote server transitioned from offline to online"
);
counts.offline_count -= 1;
counts.online_count += 1;
}
}
METRICS.record_remote_server_count(
counts.online_count,
counts.offline_count,
);
}
fn server_state( fn server_state(
&self, &self,
server_name: &ServerName, server_name: &ServerName,
@ -134,6 +192,7 @@ impl Service {
server_name.to_owned(), server_name.to_owned(),
))); )));
servers.insert(server_name.to_owned(), Arc::clone(&state)); servers.insert(server_name.to_owned(), Arc::clone(&state));
self.record_transition(server_name, Transition::New);
state state
} }
} }
@ -193,9 +252,12 @@ impl BackoffGuard {
state.failure_count = 0; state.failure_count = 0;
if state.is_online() != was_online { // Server is always online after setting failure_count = 0
let server_name = &state.server_name; if !was_online {
info!(%server_name, "server transitioned from offline to online"); services().server_backoff.record_transition(
&state.server_name,
Transition::OfflineToOnline,
);
} }
} }
@ -223,8 +285,10 @@ impl BackoffGuard {
); );
if state.is_online() != was_online { if state.is_online() != was_online {
let server_name = &state.server_name; services().server_backoff.record_transition(
info!(%server_name, "server transitioned from online to offline"); &state.server_name,
Transition::OnlineToOffline,
);
} }
} }
} }