mirror of
https://gitlab.computer.surgery/matrix/grapevine.git
synced 2025-12-17 15:51:23 +01:00
metrics for online and offline remote server count
This commit is contained in:
parent
5b6aaa19b9
commit
56f025cb47
2 changed files with 92 additions and 7 deletions
|
|
@ -290,6 +290,9 @@ pub(crate) struct Metrics {
|
||||||
/// Number of entries in an
|
/// Number of entries in an
|
||||||
/// [`OnDemandHashMap`](crate::utils::on_demand_hashmap::OnDemandHashMap)
|
/// [`OnDemandHashMap`](crate::utils::on_demand_hashmap::OnDemandHashMap)
|
||||||
on_demand_hashmap_size: opentelemetry::metrics::Gauge<u64>,
|
on_demand_hashmap_size: opentelemetry::metrics::Gauge<u64>,
|
||||||
|
|
||||||
|
/// Number of known remote servers in each state (online or offline)
|
||||||
|
remote_server_count: opentelemetry::metrics::Gauge<u64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Metrics {
|
impl Metrics {
|
||||||
|
|
@ -345,11 +348,17 @@ impl Metrics {
|
||||||
.with_description("Number of entries in OnDemandHashMap")
|
.with_description("Number of entries in OnDemandHashMap")
|
||||||
.init();
|
.init();
|
||||||
|
|
||||||
|
let remote_server_count = meter
|
||||||
|
.u64_gauge("remote_server_count")
|
||||||
|
.with_description("Number of known remote servers")
|
||||||
|
.init();
|
||||||
|
|
||||||
Metrics {
|
Metrics {
|
||||||
otel_state: (registry, provider),
|
otel_state: (registry, provider),
|
||||||
http_requests_histogram,
|
http_requests_histogram,
|
||||||
lookup,
|
lookup,
|
||||||
on_demand_hashmap_size,
|
on_demand_hashmap_size,
|
||||||
|
remote_server_count,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -384,6 +393,18 @@ impl Metrics {
|
||||||
&[KeyValue::new("name", name)],
|
&[KeyValue::new("name", name)],
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Record number of remote servers marked online or offline.
|
||||||
|
pub(crate) fn record_remote_server_count(
|
||||||
|
&self,
|
||||||
|
online_count: u64,
|
||||||
|
offline_count: u64,
|
||||||
|
) {
|
||||||
|
self.remote_server_count
|
||||||
|
.record(online_count, &[KeyValue::new("state", "online")]);
|
||||||
|
self.remote_server_count
|
||||||
|
.record(offline_count, &[KeyValue::new("state", "offline")]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Track HTTP metrics by converting this into an [`axum`] layer
|
/// Track HTTP metrics by converting this into an [`axum`] layer
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
sync::{Arc, RwLock},
|
sync::{Arc, Mutex, RwLock},
|
||||||
time::{Duration, Instant},
|
time::{Duration, Instant},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -8,7 +8,7 @@ use rand::{thread_rng, Rng};
|
||||||
use ruma::{OwnedServerName, ServerName};
|
use ruma::{OwnedServerName, ServerName};
|
||||||
use tracing::{debug, info, instrument};
|
use tracing::{debug, info, instrument};
|
||||||
|
|
||||||
use crate::{services, Error, Result};
|
use crate::{observability::METRICS, services, Error, Result};
|
||||||
|
|
||||||
/// Service to handle backing off requests to offline servers.
|
/// Service to handle backing off requests to offline servers.
|
||||||
///
|
///
|
||||||
|
|
@ -36,6 +36,8 @@ use crate::{services, Error, Result};
|
||||||
/// the server is only briefly offline.
|
/// the server is only briefly offline.
|
||||||
pub(crate) struct Service {
|
pub(crate) struct Service {
|
||||||
servers: RwLock<HashMap<OwnedServerName, Arc<RwLock<BackoffState>>>>,
|
servers: RwLock<HashMap<OwnedServerName, Arc<RwLock<BackoffState>>>>,
|
||||||
|
|
||||||
|
server_counts: Mutex<ServerCounts>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Guard to record the result of an attempted request to a server.
|
/// Guard to record the result of an attempted request to a server.
|
||||||
|
|
@ -75,10 +77,27 @@ struct BackoffState {
|
||||||
jitter_coeff: f64,
|
jitter_coeff: f64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A change in the tracked state of a single remote server.
#[derive(Clone, Copy, Debug)]
enum Transition {
    /// A new server, marked as online by default
    New,
    /// A server that was considered online is now considered offline
    OnlineToOffline,
    /// A server that was considered offline is now considered online
    OfflineToOnline,
}
|
||||||
|
|
||||||
|
/// Running tally of known servers per state, kept so the totals can be
/// reported as metrics.
#[derive(Clone, Copy, Debug, Default)]
struct ServerCounts {
    /// Number of known servers currently counted as online
    online_count: u64,
    /// Number of known servers currently counted as offline
    offline_count: u64,
}
|
||||||
|
|
||||||
impl Service {
|
impl Service {
|
||||||
pub(crate) fn build() -> Arc<Service> {
|
pub(crate) fn build() -> Arc<Service> {
|
||||||
Arc::new(Service {
|
Arc::new(Service {
|
||||||
servers: RwLock::default(),
|
servers: RwLock::default(),
|
||||||
|
server_counts: Mutex::default(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -113,6 +132,45 @@ impl Service {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn record_transition(
|
||||||
|
&self,
|
||||||
|
server_name: &ServerName,
|
||||||
|
transition: Transition,
|
||||||
|
) {
|
||||||
|
let mut counts = self.server_counts.lock().unwrap();
|
||||||
|
|
||||||
|
match transition {
|
||||||
|
Transition::New => {
|
||||||
|
info!(
|
||||||
|
%server_name,
|
||||||
|
"new remote server, marked as online by default"
|
||||||
|
);
|
||||||
|
counts.online_count += 1;
|
||||||
|
}
|
||||||
|
Transition::OnlineToOffline => {
|
||||||
|
info!(
|
||||||
|
%server_name,
|
||||||
|
"remote server transitioned from online to offline"
|
||||||
|
);
|
||||||
|
counts.online_count -= 1;
|
||||||
|
counts.offline_count += 1;
|
||||||
|
}
|
||||||
|
Transition::OfflineToOnline => {
|
||||||
|
info!(
|
||||||
|
%server_name,
|
||||||
|
"remote server transitioned from offline to online"
|
||||||
|
);
|
||||||
|
counts.offline_count -= 1;
|
||||||
|
counts.online_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
METRICS.record_remote_server_count(
|
||||||
|
counts.online_count,
|
||||||
|
counts.offline_count,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
fn server_state(
|
fn server_state(
|
||||||
&self,
|
&self,
|
||||||
server_name: &ServerName,
|
server_name: &ServerName,
|
||||||
|
|
@ -134,6 +192,7 @@ impl Service {
|
||||||
server_name.to_owned(),
|
server_name.to_owned(),
|
||||||
)));
|
)));
|
||||||
servers.insert(server_name.to_owned(), Arc::clone(&state));
|
servers.insert(server_name.to_owned(), Arc::clone(&state));
|
||||||
|
self.record_transition(server_name, Transition::New);
|
||||||
state
|
state
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -193,9 +252,12 @@ impl BackoffGuard {
|
||||||
|
|
||||||
state.failure_count = 0;
|
state.failure_count = 0;
|
||||||
|
|
||||||
if state.is_online() != was_online {
|
// Server is always online after setting failure_count = 0
|
||||||
let server_name = &state.server_name;
|
if !was_online {
|
||||||
info!(%server_name, "server transitioned from offline to online");
|
services().server_backoff.record_transition(
|
||||||
|
&state.server_name,
|
||||||
|
Transition::OfflineToOnline,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -223,8 +285,10 @@ impl BackoffGuard {
|
||||||
);
|
);
|
||||||
|
|
||||||
if state.is_online() != was_online {
|
if state.is_online() != was_online {
|
||||||
let server_name = &state.server_name;
|
services().server_backoff.record_transition(
|
||||||
info!(%server_name, "server transitioned from online to offline");
|
&state.server_name,
|
||||||
|
Transition::OnlineToOffline,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue