metrics for online and offline remote server count

This commit is contained in:
Olivia Lee 2024-08-23 15:45:10 -07:00
parent 5b6aaa19b9
commit 56f025cb47
No known key found for this signature in database
GPG key ID: 54D568A15B9CD1F9
2 changed files with 92 additions and 7 deletions

View file

@ -290,6 +290,9 @@ pub(crate) struct Metrics {
/// Number of entries in an
/// [`OnDemandHashMap`](crate::utils::on_demand_hashmap::OnDemandHashMap)
on_demand_hashmap_size: opentelemetry::metrics::Gauge<u64>,
/// Number of known remote servers in each state (online or offline)
remote_server_count: opentelemetry::metrics::Gauge<u64>,
}
impl Metrics {
@ -345,11 +348,17 @@ impl Metrics {
.with_description("Number of entries in OnDemandHashMap")
.init();
let remote_server_count = meter
.u64_gauge("remote_server_count")
.with_description("Number of known remote servers")
.init();
Metrics {
otel_state: (registry, provider),
http_requests_histogram,
lookup,
on_demand_hashmap_size,
remote_server_count,
}
}
@ -384,6 +393,18 @@ impl Metrics {
&[KeyValue::new("name", name)],
);
}
/// Publish the current number of remote servers in each state.
///
/// Both values are written to the same `remote_server_count` gauge and are
/// told apart by a `state` attribute: `online_count` is recorded under
/// `state = "online"` and `offline_count` under `state = "offline"`.
pub(crate) fn record_remote_server_count(
    &self,
    online_count: u64,
    offline_count: u64,
) {
    // Keep the "online" sample first so the recording order is stable.
    for (state, count) in
        [("online", online_count), ("offline", offline_count)]
    {
        self.remote_server_count
            .record(count, &[KeyValue::new("state", state)]);
    }
}
}
/// Track HTTP metrics by converting this into an [`axum`] layer

View file

@ -1,6 +1,6 @@
use std::{
collections::HashMap,
sync::{Arc, RwLock},
sync::{Arc, Mutex, RwLock},
time::{Duration, Instant},
};
@ -8,7 +8,7 @@ use rand::{thread_rng, Rng};
use ruma::{OwnedServerName, ServerName};
use tracing::{debug, info, instrument};
use crate::{services, Error, Result};
use crate::{observability::METRICS, services, Error, Result};
/// Service to handle backing off requests to offline servers.
///
@ -36,6 +36,8 @@ use crate::{services, Error, Result};
/// the server is only briefly offline.
pub(crate) struct Service {
servers: RwLock<HashMap<OwnedServerName, Arc<RwLock<BackoffState>>>>,
server_counts: Mutex<ServerCounts>,
}
/// Guard to record the result of an attempted request to a server.
@ -75,10 +77,27 @@ struct BackoffState {
jitter_coeff: f64,
}
/// A change in the online/offline bookkeeping for one remote server.
#[derive(Clone, Copy, Debug)]
enum Transition {
    /// The server was seen for the first time; new servers are counted as
    /// online by default.
    New,
    /// A server that was counted as online is now counted as offline.
    OnlineToOffline,
    /// A server that was counted as offline is now counted as online.
    OfflineToOnline,
}
/// Running tally of known remote servers per state, kept so the totals can
/// be reported as metrics.
///
/// The derived [`Default`] starts both counters at zero.
#[derive(Clone, Copy, Debug, Default)]
struct ServerCounts {
    /// Servers currently counted as online.
    online_count: u64,
    /// Servers currently counted as offline.
    offline_count: u64,
}
impl Service {
pub(crate) fn build() -> Arc<Service> {
Arc::new(Service {
servers: RwLock::default(),
server_counts: Mutex::default(),
})
}
@ -113,6 +132,45 @@ impl Service {
})
}
fn record_transition(
&self,
server_name: &ServerName,
transition: Transition,
) {
let mut counts = self.server_counts.lock().unwrap();
match transition {
Transition::New => {
info!(
%server_name,
"new remote server, marked as online by default"
);
counts.online_count += 1;
}
Transition::OnlineToOffline => {
info!(
%server_name,
"remote server transitioned from online to offline"
);
counts.online_count -= 1;
counts.offline_count += 1;
}
Transition::OfflineToOnline => {
info!(
%server_name,
"remote server transitioned from offline to online"
);
counts.offline_count -= 1;
counts.online_count += 1;
}
}
METRICS.record_remote_server_count(
counts.online_count,
counts.offline_count,
);
}
fn server_state(
&self,
server_name: &ServerName,
@ -134,6 +192,7 @@ impl Service {
server_name.to_owned(),
)));
servers.insert(server_name.to_owned(), Arc::clone(&state));
self.record_transition(server_name, Transition::New);
state
}
}
@ -193,9 +252,12 @@ impl BackoffGuard {
state.failure_count = 0;
if state.is_online() != was_online {
let server_name = &state.server_name;
info!(%server_name, "server transitioned from offline to online");
// Server is always online after setting failure_count = 0
if !was_online {
services().server_backoff.record_transition(
&state.server_name,
Transition::OfflineToOnline,
);
}
}
@ -223,8 +285,10 @@ impl BackoffGuard {
);
if state.is_online() != was_online {
let server_name = &state.server_name;
info!(%server_name, "server transitioned from online to offline");
services().server_backoff.record_transition(
&state.server_name,
Transition::OnlineToOffline,
);
}
}
}