From 72860d98db5f2d90506f8eaad329488b9671ad83 Mon Sep 17 00:00:00 2001 From: Charles Hall Date: Tue, 4 Jun 2024 13:26:23 -0700 Subject: [PATCH] record FoundIn with metrics instead of traces This is much more efficient in terms of network use and data storage, and also easier to visualize. --- src/api/server_server.rs | 11 +++-- src/database/key_value/rooms/auth_chain.rs | 14 ++++-- src/database/key_value/rooms/short.rs | 49 +++++++++++------- src/database/key_value/rooms/state_cache.rs | 18 ++++--- src/database/key_value/rooms/timeline.rs | 25 ++++++---- src/observability.rs | 55 +++++++++++++++++---- src/service/rooms/state_accessor.rs | 21 +++++--- src/service/rooms/state_compressor.rs | 13 +++-- 8 files changed, 139 insertions(+), 67 deletions(-) diff --git a/src/api/server_server.rs b/src/api/server_server.rs index f9d56705..81e89863 100644 --- a/src/api/server_server.rs +++ b/src/api/server_server.rs @@ -67,7 +67,7 @@ use tracing::{debug, error, field, warn}; use crate::{ api::client_server::{self, claim_keys_helper, get_keys_helper}, - observability::FoundIn, + observability::{FoundIn, FoundKind, METRICS}, service::pdu::{gen_event_id_canonical_json, PduBuilder}, services, utils, Ar, Error, PduEvent, Ra, Result, }; @@ -128,7 +128,7 @@ impl FedDest { } } -#[tracing::instrument(skip(request), fields(destination_cache_result, url))] +#[tracing::instrument(skip(request), fields(url))] pub(crate) async fn send_request( destination: &ServerName, request: T, @@ -159,7 +159,7 @@ where .cloned(); let (actual_destination, host) = if let Some(result) = cached_result { - FoundIn::Cache.record("destination_cache_result"); + METRICS.record_lookup(FoundKind::FederationDestination, FoundIn::Cache); result } else { write_destination_to_cache = true; @@ -298,7 +298,10 @@ where let response = T::IncomingResponse::try_from_http_response(http_response); if response.is_ok() && write_destination_to_cache { - FoundIn::Remote.record("destination_cache_result"); + METRICS.record_lookup( + FoundKind::FederationDestination, + FoundIn::Remote, + ); services() .globals .actual_destination_cache diff --git a/src/database/key_value/rooms/auth_chain.rs b/src/database/key_value/rooms/auth_chain.rs index f1e23491..146c0697 100644 --- a/src/database/key_value/rooms/auth_chain.rs +++ b/src/database/key_value/rooms/auth_chain.rs @@ -1,19 +1,23 @@ use std::{collections::HashSet, mem::size_of, sync::Arc}; use crate::{ - database::KeyValueDatabase, observability::FoundIn, service, utils, Result, + database::KeyValueDatabase, + observability::{FoundIn, FoundKind, METRICS}, + service, utils, Result, }; impl service::rooms::auth_chain::Data for KeyValueDatabase { - #[tracing::instrument(skip(self, key), fields(cache_result))] + #[tracing::instrument(skip(self, key))] fn get_cached_eventid_authchain( &self, key: &[u64], ) -> Result>>> { + let found_kind = FoundKind::AuthChain; + // Check RAM cache if let Some(result) = self.auth_chain_cache.lock().unwrap().get_mut(key) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(Some(Arc::clone(result))); } @@ -34,7 +38,7 @@ impl service::rooms::auth_chain::Data for KeyValueDatabase { }); if let Some(chain) = chain { - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); let chain = Arc::new(chain); // Cache in RAM @@ -47,7 +51,7 @@ impl service::rooms::auth_chain::Data for KeyValueDatabase { } } - FoundIn::Nothing.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Nothing); Ok(None) } diff --git a/src/database/key_value/rooms/short.rs b/src/database/key_value/rooms/short.rs index 4cc7fc47..3f264d88 100644 --- a/src/database/key_value/rooms/short.rs +++ b/src/database/key_value/rooms/short.rs @@ -3,30 +3,33 @@ use std::sync::Arc; use ruma::{events::StateEventType, EventId, RoomId}; use crate::{ - database::KeyValueDatabase, observability::FoundIn, service, services, - utils, Error, Result, + database::KeyValueDatabase, + observability::{FoundIn, FoundKind, METRICS}, + service, services, utils, Error, Result, }; impl service::rooms::short::Data for KeyValueDatabase { - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] fn get_or_create_shorteventid(&self, event_id: &EventId) -> Result { + let found_kind = FoundKind::EventIdToShort; + if let Some(short) = self.eventidshort_cache.lock().unwrap().get_mut(event_id) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(*short); } let short = if let Some(shorteventid) = self.eventid_shorteventid.get(event_id.as_bytes())? { - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); utils::u64_from_bytes(&shorteventid).map_err(|_| { Error::bad_database("Invalid shorteventid in db.") })? } else { - FoundIn::Nothing.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Nothing); let shorteventid = services().globals.next_count()?; self.eventid_shorteventid @@ -50,13 +53,15 @@ impl service::rooms::short::Data for KeyValueDatabase { event_type: &StateEventType, state_key: &str, ) -> Result> { + let found_kind = FoundKind::StateKeyToShort; + if let Some(short) = self .statekeyshort_cache .lock() .unwrap() .get_mut(&(event_type.clone(), state_key.to_owned())) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(Some(*short)); } @@ -75,32 +80,34 @@ impl service::rooms::short::Data for KeyValueDatabase { .transpose()?; if let Some(s) = short { - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); self.statekeyshort_cache .lock() .unwrap() .insert((event_type.clone(), state_key.to_owned()), s); } else { - FoundIn::Nothing.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Nothing); } Ok(short) } - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] fn get_or_create_shortstatekey( &self, event_type: &StateEventType, state_key: &str, ) -> Result { + let found_kind = FoundKind::CreateStateKeyToShort; + if let Some(short) = self .statekeyshort_cache .lock() .unwrap() .get_mut(&(event_type.clone(), state_key.to_owned())) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(*short); } @@ -111,13 +118,13 @@ impl service::rooms::short::Data for KeyValueDatabase { let short = if let Some(shortstatekey) = self.statekey_shortstatekey.get(&db_key)? { - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); utils::u64_from_bytes(&shortstatekey).map_err(|_| { Error::bad_database("Invalid shortstatekey in db.") })? } else { - FoundIn::Nothing.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Nothing); let shortstatekey = services().globals.next_count()?; self.statekey_shortstatekey @@ -135,15 +142,17 @@ impl service::rooms::short::Data for KeyValueDatabase { Ok(short) } - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] fn get_eventid_from_short( &self, shorteventid: u64, ) -> Result> { + let found_kind = FoundKind::ShortToEventId; + if let Some(id) = self.shorteventid_cache.lock().unwrap().get_mut(&shorteventid) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(Arc::clone(id)); } @@ -165,7 +174,7 @@ impl service::rooms::short::Data for KeyValueDatabase { Error::bad_database("EventId in shorteventid_eventid is invalid.") })?; - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); self.shorteventid_cache .lock() @@ -175,15 +184,17 @@ impl service::rooms::short::Data for KeyValueDatabase { Ok(event_id) } - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] fn get_statekey_from_short( &self, shortstatekey: u64, ) -> Result<(StateEventType, String)> { + let found_kind = FoundKind::ShortToStateKey; + if let Some(id) = self.shortstatekey_cache.lock().unwrap().get_mut(&shortstatekey) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(id.clone()); } @@ -218,7 +229,7 @@ impl service::rooms::short::Data for KeyValueDatabase { let result = (event_type, state_key); - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); self.shortstatekey_cache .lock() diff --git a/src/database/key_value/rooms/state_cache.rs b/src/database/key_value/rooms/state_cache.rs index 84a8f668..f8cf62e1 100644 --- a/src/database/key_value/rooms/state_cache.rs +++ b/src/database/key_value/rooms/state_cache.rs @@ -8,7 +8,7 @@ use ruma::{ use crate::{ database::KeyValueDatabase, - observability::FoundIn, + observability::{FoundIn, FoundKind, METRICS}, service::{self, appservice::RegistrationInfo}, services, utils, Error, Result, }; @@ -171,19 +171,21 @@ impl service::rooms::state_cache::Data for KeyValueDatabase { Ok(()) } - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] fn get_our_real_users( &self, room_id: &RoomId, ) -> Result>> { + let found_kind = FoundKind::OurRealUsers; + let maybe = self.our_real_users_cache.read().unwrap().get(room_id).cloned(); if let Some(users) = maybe { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); Ok(users) } else { self.update_joined_count(room_id)?; - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); Ok(Arc::clone( self.our_real_users_cache.read().unwrap().get(room_id).unwrap(), )) @@ -192,13 +194,15 @@ impl service::rooms::state_cache::Data for KeyValueDatabase { #[tracing::instrument( skip(self, appservice), - fields(cache_result, appservice_id = appservice.registration.id), + fields(appservice_id = appservice.registration.id), )] fn appservice_in_room( &self, room_id: &RoomId, appservice: &RegistrationInfo, ) -> Result { + let found_kind = FoundKind::AppserviceInRoom; + let maybe = self .appservice_in_room_cache .read() @@ -208,7 +212,7 @@ impl service::rooms::state_cache::Data for KeyValueDatabase { .copied(); if let Some(b) = maybe { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); Ok(b) } else { let bridge_user_id = UserId::parse_with_server_name( @@ -225,7 +229,7 @@ impl service::rooms::state_cache::Data for KeyValueDatabase { }) }); - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); self.appservice_in_room_cache .write() .unwrap() diff --git a/src/database/key_value/rooms/timeline.rs b/src/database/key_value/rooms/timeline.rs index 3bfbff8d..309b8765 100644 --- a/src/database/key_value/rooms/timeline.rs +++ b/src/database/key_value/rooms/timeline.rs @@ -8,17 +8,20 @@ use service::rooms::timeline::PduCount; use tracing::error; use crate::{ - database::KeyValueDatabase, observability::FoundIn, service, services, - utils, Error, PduEvent, Result, + database::KeyValueDatabase, + observability::{FoundIn, FoundKind, METRICS}, + service, services, utils, Error, PduEvent, Result, }; impl service::rooms::timeline::Data for KeyValueDatabase { - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] fn last_timeline_count( &self, sender_user: &UserId, room_id: &RoomId, ) -> Result { + let found_kind = FoundKind::LastTimelineCount; + match self .lasttimelinecount_cache .lock() @@ -35,15 +38,15 @@ impl service::rooms::timeline::Data for KeyValueDatabase { r.ok() }) { - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); Ok(*v.insert(last_count.0)) } else { - FoundIn::Nothing.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Nothing); Ok(PduCount::Normal(0)) } } hash_map::Entry::Occupied(o) => { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); Ok(*o.get()) } } @@ -125,10 +128,12 @@ impl service::rooms::timeline::Data for KeyValueDatabase { /// Returns the pdu. /// /// Checks the `eventid_outlierpdu` Tree if not found in the timeline. - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] fn get_pdu(&self, event_id: &EventId) -> Result>> { + let found_kind = FoundKind::Pdu; + if let Some(p) = self.pdu_cache.lock().unwrap().get_mut(event_id) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(Some(Arc::clone(p))); } @@ -149,14 +154,14 @@ impl service::rooms::timeline::Data for KeyValueDatabase { )? .map(Arc::new) { - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); self.pdu_cache .lock() .unwrap() .insert(event_id.to_owned(), Arc::clone(&pdu)); Ok(Some(pdu)) } else { - FoundIn::Nothing.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Nothing); Ok(None) } } diff --git a/src/observability.rs b/src/observability.rs index 820487d5..546f9bdf 100644 --- a/src/observability.rs +++ b/src/observability.rs @@ -18,7 +18,7 @@ use opentelemetry_sdk::{ metrics::{new_view, Aggregation, Instrument, SdkMeterProvider, Stream}, Resource, }; -use strum::AsRefStr; +use strum::{AsRefStr, IntoStaticStr}; use tokio::time::Instant; use tracing_flame::{FlameLayer, FlushGuard}; use tracing_subscriber::{layer::SubscriberExt, EnvFilter, Layer, Registry}; @@ -41,8 +41,31 @@ impl Drop for Guard { } } +/// A kind of data that gets looked up +/// +/// See also [`Metrics::record_lookup`]. +// Keep variants sorted +#[allow(clippy::missing_docs_in_private_items)] +#[derive(Clone, Copy, AsRefStr, IntoStaticStr)] +pub(crate) enum FoundKind { + AppserviceInRoom, + AuthChain, + CreateStateKeyToShort, + EventIdToShort, + FederationDestination, + LastTimelineCount, + OurRealUsers, + Pdu, + ShortToEventId, + ShortToStateKey, + StateInfo, + StateKeyToShort, + VisibilityForServer, + VisibilityForUser, +} + /// Type to record cache performance in a tracing span field. -#[derive(Clone, Copy, AsRefStr)] +#[derive(Clone, Copy, AsRefStr, IntoStaticStr)] pub(crate) enum FoundIn { /// Found in cache Cache, @@ -55,14 +78,6 @@ pub(crate) enum FoundIn { Nothing, } -impl FoundIn { - /// Record the current value to the current [`tracing::Span`] - // TODO: use tracing::Value instead if it ever becomes accessible - pub(crate) fn record(self, field: &str) { - tracing::Span::current().record(field, self.as_ref()); - } -} - /// Initialize observability pub(crate) fn init(config: &Config) -> Result { let config_filter_layer = || EnvFilter::try_new(&config.log); @@ -137,6 +152,9 @@ pub(crate) struct Metrics { /// Histogram of HTTP requests http_requests_histogram: opentelemetry::metrics::Histogram, + + /// Counts where data is found from + lookup: opentelemetry::metrics::Counter, } impl Metrics { @@ -182,9 +200,15 @@ impl Metrics { .with_description("Histogram of HTTP requests") .init(); + let lookup = meter + .u64_counter("lookup") + .with_description("Counts where data is found from") + .init(); + Metrics { otel_state: (registry, provider), http_requests_histogram, + lookup, } } @@ -194,6 +218,17 @@ impl Metrics { .encode_to_string(&self.otel_state.0.gather()) .expect("should be able to encode metrics") } + + /// Record that some data was found in a particular storage location + pub(crate) fn record_lookup(&self, kind: FoundKind, r#in: FoundIn) { + self.lookup.add( + 1, + &[ + KeyValue::new("kind", <&str>::from(kind)), + KeyValue::new("in", <&str>::from(r#in)), + ], + ); + } } /// Track HTTP metrics by converting this into an [`axum`] layer diff --git a/src/service/rooms/state_accessor.rs b/src/service/rooms/state_accessor.rs index 67ebf8ec..f3b07c4c 100644 --- a/src/service/rooms/state_accessor.rs +++ b/src/service/rooms/state_accessor.rs @@ -28,8 +28,9 @@ use tokio::sync::MutexGuard; use tracing::{error, warn}; use crate::{ - observability::FoundIn, service::pdu::PduBuilder, services, Error, - PduEvent, Result, + observability::{FoundIn, FoundKind, METRICS}, + service::pdu::PduBuilder, + services, Error, PduEvent, Result, }; pub(crate) struct Service { @@ -123,13 +124,15 @@ impl Service { /// Whether a server is allowed to see an event through federation, based on /// the room's history_visibility at that event's state. - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] pub(crate) fn server_can_see_event( &self, origin: &ServerName, room_id: &RoomId, event_id: &EventId, ) -> Result { + let found_kind = FoundKind::VisibilityForServer; + let Some(shortstatehash) = self.pdu_shortstatehash(event_id)? else { return Ok(true); }; @@ -140,7 +143,7 @@ impl Service { .unwrap() .get_mut(&(origin.to_owned(), shortstatehash)) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(*visibility); } @@ -191,7 +194,7 @@ impl Service { } }; - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); self.server_visibility_cache .lock() .unwrap() @@ -202,13 +205,15 @@ impl Service { /// Whether a user is allowed to see an event, based on /// the room's history_visibility at that event's state. - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] pub(crate) fn user_can_see_event( &self, user_id: &UserId, room_id: &RoomId, event_id: &EventId, ) -> Result { + let found_kind = FoundKind::VisibilityForUser; + let Some(shortstatehash) = self.pdu_shortstatehash(event_id)? else { return Ok(true); }; @@ -219,7 +224,7 @@ impl Service { .unwrap() .get_mut(&(user_id.to_owned(), shortstatehash)) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(*visibility); } @@ -262,7 +267,7 @@ impl Service { } }; - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); self.user_visibility_cache .lock() .unwrap() diff --git a/src/service/rooms/state_compressor.rs b/src/service/rooms/state_compressor.rs index 942dd67c..79af0433 100644 --- a/src/service/rooms/state_compressor.rs +++ b/src/service/rooms/state_compressor.rs @@ -10,7 +10,10 @@ use lru_cache::LruCache; use ruma::{EventId, RoomId}; use self::data::StateDiff; -use crate::{observability::FoundIn, services, utils, Result}; +use crate::{ + observability::{FoundIn, FoundKind, METRICS}, + services, utils, Result, +}; #[derive(Clone)] pub(crate) struct CompressedStateLayer { @@ -33,15 +36,17 @@ impl Service { /// Returns a stack with info on shortstatehash, full state, added diff and /// removed diff for the selected shortstatehash and each parent layer. #[allow(clippy::type_complexity)] - #[tracing::instrument(skip(self), fields(cache_result))] + #[tracing::instrument(skip(self))] pub(crate) fn load_shortstatehash_info( &self, shortstatehash: u64, ) -> Result> { + let found_kind = FoundKind::StateInfo; + if let Some(r) = self.stateinfo_cache.lock().unwrap().get_mut(&shortstatehash) { - FoundIn::Cache.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Cache); return Ok(r.clone()); } @@ -76,7 +81,7 @@ impl Service { }] }; - FoundIn::Database.record("cache_result"); + METRICS.record_lookup(found_kind, FoundIn::Database); self.stateinfo_cache .lock() .unwrap()