mirror of
https://gitlab.computer.surgery/matrix/grapevine.git
synced 2025-12-17 15:51:23 +01:00
metrics for online and offline remote server count
This commit is contained in:
parent
5b6aaa19b9
commit
56f025cb47
2 changed files with 92 additions and 7 deletions
|
|
@ -290,6 +290,9 @@ pub(crate) struct Metrics {
|
|||
/// Number of entries in an
|
||||
/// [`OnDemandHashMap`](crate::utils::on_demand_hashmap::OnDemandHashMap)
|
||||
on_demand_hashmap_size: opentelemetry::metrics::Gauge<u64>,
|
||||
|
||||
/// Number of known remote servers in each state (online or offline)
|
||||
remote_server_count: opentelemetry::metrics::Gauge<u64>,
|
||||
}
|
||||
|
||||
impl Metrics {
|
||||
|
|
@ -345,11 +348,17 @@ impl Metrics {
|
|||
.with_description("Number of entries in OnDemandHashMap")
|
||||
.init();
|
||||
|
||||
let remote_server_count = meter
|
||||
.u64_gauge("remote_server_count")
|
||||
.with_description("Number of known remote servers")
|
||||
.init();
|
||||
|
||||
Metrics {
|
||||
otel_state: (registry, provider),
|
||||
http_requests_histogram,
|
||||
lookup,
|
||||
on_demand_hashmap_size,
|
||||
remote_server_count,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -384,6 +393,18 @@ impl Metrics {
|
|||
&[KeyValue::new("name", name)],
|
||||
);
|
||||
}
|
||||
|
||||
/// Record number of remote servers marked online or offline.
|
||||
pub(crate) fn record_remote_server_count(
|
||||
&self,
|
||||
online_count: u64,
|
||||
offline_count: u64,
|
||||
) {
|
||||
self.remote_server_count
|
||||
.record(online_count, &[KeyValue::new("state", "online")]);
|
||||
self.remote_server_count
|
||||
.record(offline_count, &[KeyValue::new("state", "offline")]);
|
||||
}
|
||||
}
|
||||
|
||||
/// Track HTTP metrics by converting this into an [`axum`] layer
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, RwLock},
|
||||
sync::{Arc, Mutex, RwLock},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
|
|
@ -8,7 +8,7 @@ use rand::{thread_rng, Rng};
|
|||
use ruma::{OwnedServerName, ServerName};
|
||||
use tracing::{debug, info, instrument};
|
||||
|
||||
use crate::{services, Error, Result};
|
||||
use crate::{observability::METRICS, services, Error, Result};
|
||||
|
||||
/// Service to handle backing off requests to offline servers.
|
||||
///
|
||||
|
|
@ -36,6 +36,8 @@ use crate::{services, Error, Result};
|
|||
/// the server is only briefly offline.
|
||||
pub(crate) struct Service {
|
||||
servers: RwLock<HashMap<OwnedServerName, Arc<RwLock<BackoffState>>>>,
|
||||
|
||||
server_counts: Mutex<ServerCounts>,
|
||||
}
|
||||
|
||||
/// Guard to record the result of an attempted request to a server.
|
||||
|
|
@ -75,10 +77,27 @@ struct BackoffState {
|
|||
jitter_coeff: f64,
|
||||
}
|
||||
|
||||
/// The kinds of online/offline state change that can be recorded for a
/// single server
#[derive(Clone, Copy, Debug)]
enum Transition {
    /// A new server, marked as online by default
    New,
    /// A server that was online is now offline
    OnlineToOffline,
    /// A server that was offline is now online
    OfflineToOnline,
}
|
||||
|
||||
/// Tally of known servers per state, kept so the totals can be reported as
/// metrics
#[derive(Clone, Copy, Debug, Default)]
struct ServerCounts {
    /// Number of servers currently counted as online
    online_count: u64,
    /// Number of servers currently counted as offline
    offline_count: u64,
}
|
||||
|
||||
impl Service {
|
||||
pub(crate) fn build() -> Arc<Service> {
|
||||
Arc::new(Service {
|
||||
servers: RwLock::default(),
|
||||
server_counts: Mutex::default(),
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -113,6 +132,45 @@ impl Service {
|
|||
})
|
||||
}
|
||||
|
||||
fn record_transition(
|
||||
&self,
|
||||
server_name: &ServerName,
|
||||
transition: Transition,
|
||||
) {
|
||||
let mut counts = self.server_counts.lock().unwrap();
|
||||
|
||||
match transition {
|
||||
Transition::New => {
|
||||
info!(
|
||||
%server_name,
|
||||
"new remote server, marked as online by default"
|
||||
);
|
||||
counts.online_count += 1;
|
||||
}
|
||||
Transition::OnlineToOffline => {
|
||||
info!(
|
||||
%server_name,
|
||||
"remote server transitioned from online to offline"
|
||||
);
|
||||
counts.online_count -= 1;
|
||||
counts.offline_count += 1;
|
||||
}
|
||||
Transition::OfflineToOnline => {
|
||||
info!(
|
||||
%server_name,
|
||||
"remote server transitioned from offline to online"
|
||||
);
|
||||
counts.offline_count -= 1;
|
||||
counts.online_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
METRICS.record_remote_server_count(
|
||||
counts.online_count,
|
||||
counts.offline_count,
|
||||
);
|
||||
}
|
||||
|
||||
fn server_state(
|
||||
&self,
|
||||
server_name: &ServerName,
|
||||
|
|
@ -134,6 +192,7 @@ impl Service {
|
|||
server_name.to_owned(),
|
||||
)));
|
||||
servers.insert(server_name.to_owned(), Arc::clone(&state));
|
||||
self.record_transition(server_name, Transition::New);
|
||||
state
|
||||
}
|
||||
}
|
||||
|
|
@ -193,9 +252,12 @@ impl BackoffGuard {
|
|||
|
||||
state.failure_count = 0;
|
||||
|
||||
if state.is_online() != was_online {
|
||||
let server_name = &state.server_name;
|
||||
info!(%server_name, "server transitioned from offline to online");
|
||||
// Server is always online after setting failure_count = 0
|
||||
if !was_online {
|
||||
services().server_backoff.record_transition(
|
||||
&state.server_name,
|
||||
Transition::OfflineToOnline,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -223,8 +285,10 @@ impl BackoffGuard {
|
|||
);
|
||||
|
||||
if state.is_online() != was_online {
|
||||
let server_name = &state.server_name;
|
||||
info!(%server_name, "server transitioned from online to offline");
|
||||
services().server_backoff.record_transition(
|
||||
&state.server_name,
|
||||
Transition::OnlineToOffline,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue