record FoundIn with metrics instead of traces

This is much more efficient in terms of network use and data storage,
and also easier to visualize.
This commit is contained in:
Charles Hall 2024-06-04 13:26:23 -07:00
parent 9364d44ce2
commit 0c2094a56f
No known key found for this signature in database
GPG key ID: 7B8E0645816E07CF
8 changed files with 142 additions and 68 deletions

View file

@ -67,7 +67,7 @@ use tracing::{debug, error, field, warn};
use crate::{
api::client_server::{self, claim_keys_helper, get_keys_helper},
observability::FoundIn,
observability::{FoundIn, Lookup, METRICS},
service::pdu::{gen_event_id_canonical_json, PduBuilder},
services, utils, Ar, Error, PduEvent, Ra, Result,
};
@ -128,7 +128,7 @@ impl FedDest {
}
}
#[tracing::instrument(skip(request), fields(destination_cache_result, url))]
#[tracing::instrument(skip(request), fields(url))]
pub(crate) async fn send_request<T>(
destination: &ServerName,
request: T,
@ -159,7 +159,7 @@ where
.cloned();
let (actual_destination, host) = if let Some(result) = cached_result {
FoundIn::Cache.record("destination_cache_result");
METRICS.record_lookup(Lookup::FederationDestination, FoundIn::Cache);
result
} else {
write_destination_to_cache = true;
@ -298,7 +298,10 @@ where
let response =
T::IncomingResponse::try_from_http_response(http_response);
if response.is_ok() && write_destination_to_cache {
FoundIn::Remote.record("destination_cache_result");
METRICS.record_lookup(
Lookup::FederationDestination,
FoundIn::Remote,
);
services()
.globals
.actual_destination_cache

View file

@ -1,19 +1,23 @@
use std::{collections::HashSet, mem::size_of, sync::Arc};
use crate::{
database::KeyValueDatabase, observability::FoundIn, service, utils, Result,
database::KeyValueDatabase,
observability::{FoundIn, Lookup, METRICS},
service, utils, Result,
};
impl service::rooms::auth_chain::Data for KeyValueDatabase {
#[tracing::instrument(skip(self, key), fields(cache_result))]
#[tracing::instrument(skip(self, key))]
fn get_cached_eventid_authchain(
&self,
key: &[u64],
) -> Result<Option<Arc<HashSet<u64>>>> {
let lookup = Lookup::AuthChain;
// Check RAM cache
if let Some(result) = self.auth_chain_cache.lock().unwrap().get_mut(key)
{
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(Some(Arc::clone(result)));
}
@ -34,7 +38,7 @@ impl service::rooms::auth_chain::Data for KeyValueDatabase {
});
if let Some(chain) = chain {
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
let chain = Arc::new(chain);
// Cache in RAM
@ -47,7 +51,7 @@ impl service::rooms::auth_chain::Data for KeyValueDatabase {
}
}
FoundIn::Nothing.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Nothing);
Ok(None)
}

View file

@ -3,30 +3,33 @@ use std::sync::Arc;
use ruma::{events::StateEventType, EventId, RoomId};
use crate::{
database::KeyValueDatabase, observability::FoundIn, service, services,
utils, Error, Result,
database::KeyValueDatabase,
observability::{FoundIn, Lookup, METRICS},
service, services, utils, Error, Result,
};
impl service::rooms::short::Data for KeyValueDatabase {
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
fn get_or_create_shorteventid(&self, event_id: &EventId) -> Result<u64> {
let lookup = Lookup::CreateEventIdToShort;
if let Some(short) =
self.eventidshort_cache.lock().unwrap().get_mut(event_id)
{
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(*short);
}
let short = if let Some(shorteventid) =
self.eventid_shorteventid.get(event_id.as_bytes())?
{
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
utils::u64_from_bytes(&shorteventid).map_err(|_| {
Error::bad_database("Invalid shorteventid in db.")
})?
} else {
FoundIn::Nothing.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Nothing);
let shorteventid = services().globals.next_count()?;
self.eventid_shorteventid
@ -50,13 +53,15 @@ impl service::rooms::short::Data for KeyValueDatabase {
event_type: &StateEventType,
state_key: &str,
) -> Result<Option<u64>> {
let lookup = Lookup::StateKeyToShort;
if let Some(short) = self
.statekeyshort_cache
.lock()
.unwrap()
.get_mut(&(event_type.clone(), state_key.to_owned()))
{
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(Some(*short));
}
@ -75,32 +80,34 @@ impl service::rooms::short::Data for KeyValueDatabase {
.transpose()?;
if let Some(s) = short {
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
self.statekeyshort_cache
.lock()
.unwrap()
.insert((event_type.clone(), state_key.to_owned()), s);
} else {
FoundIn::Nothing.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Nothing);
}
Ok(short)
}
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
fn get_or_create_shortstatekey(
&self,
event_type: &StateEventType,
state_key: &str,
) -> Result<u64> {
let lookup = Lookup::CreateStateKeyToShort;
if let Some(short) = self
.statekeyshort_cache
.lock()
.unwrap()
.get_mut(&(event_type.clone(), state_key.to_owned()))
{
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(*short);
}
@ -111,13 +118,13 @@ impl service::rooms::short::Data for KeyValueDatabase {
let short = if let Some(shortstatekey) =
self.statekey_shortstatekey.get(&db_key)?
{
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
utils::u64_from_bytes(&shortstatekey).map_err(|_| {
Error::bad_database("Invalid shortstatekey in db.")
})?
} else {
FoundIn::Nothing.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Nothing);
let shortstatekey = services().globals.next_count()?;
self.statekey_shortstatekey
@ -135,15 +142,17 @@ impl service::rooms::short::Data for KeyValueDatabase {
Ok(short)
}
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
fn get_eventid_from_short(
&self,
shorteventid: u64,
) -> Result<Arc<EventId>> {
let lookup = Lookup::ShortToEventId;
if let Some(id) =
self.shorteventid_cache.lock().unwrap().get_mut(&shorteventid)
{
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(Arc::clone(id));
}
@ -165,7 +174,7 @@ impl service::rooms::short::Data for KeyValueDatabase {
Error::bad_database("EventId in shorteventid_eventid is invalid.")
})?;
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
self.shorteventid_cache
.lock()
@ -175,15 +184,17 @@ impl service::rooms::short::Data for KeyValueDatabase {
Ok(event_id)
}
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
fn get_statekey_from_short(
&self,
shortstatekey: u64,
) -> Result<(StateEventType, String)> {
let lookup = Lookup::ShortToStateKey;
if let Some(id) =
self.shortstatekey_cache.lock().unwrap().get_mut(&shortstatekey)
{
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(id.clone());
}
@ -218,7 +229,7 @@ impl service::rooms::short::Data for KeyValueDatabase {
let result = (event_type, state_key);
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
self.shortstatekey_cache
.lock()

View file

@ -8,7 +8,7 @@ use ruma::{
use crate::{
database::KeyValueDatabase,
observability::FoundIn,
observability::{FoundIn, Lookup, METRICS},
service::{self, appservice::RegistrationInfo},
services, utils, Error, Result,
};
@ -171,19 +171,21 @@ impl service::rooms::state_cache::Data for KeyValueDatabase {
Ok(())
}
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
fn get_our_real_users(
&self,
room_id: &RoomId,
) -> Result<Arc<HashSet<OwnedUserId>>> {
let lookup = Lookup::OurRealUsers;
let maybe =
self.our_real_users_cache.read().unwrap().get(room_id).cloned();
if let Some(users) = maybe {
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
Ok(users)
} else {
self.update_joined_count(room_id)?;
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
Ok(Arc::clone(
self.our_real_users_cache.read().unwrap().get(room_id).unwrap(),
))
@ -192,13 +194,15 @@ impl service::rooms::state_cache::Data for KeyValueDatabase {
#[tracing::instrument(
skip(self, appservice),
fields(cache_result, appservice_id = appservice.registration.id),
fields(appservice_id = appservice.registration.id),
)]
fn appservice_in_room(
&self,
room_id: &RoomId,
appservice: &RegistrationInfo,
) -> Result<bool> {
let lookup = Lookup::AppserviceInRoom;
let maybe = self
.appservice_in_room_cache
.read()
@ -208,7 +212,7 @@ impl service::rooms::state_cache::Data for KeyValueDatabase {
.copied();
if let Some(b) = maybe {
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
Ok(b)
} else {
let bridge_user_id = UserId::parse_with_server_name(
@ -225,7 +229,7 @@ impl service::rooms::state_cache::Data for KeyValueDatabase {
})
});
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
self.appservice_in_room_cache
.write()
.unwrap()

View file

@ -8,17 +8,20 @@ use service::rooms::timeline::PduCount;
use tracing::error;
use crate::{
database::KeyValueDatabase, observability::FoundIn, service, services,
utils, Error, PduEvent, Result,
database::KeyValueDatabase,
observability::{FoundIn, Lookup, METRICS},
service, services, utils, Error, PduEvent, Result,
};
impl service::rooms::timeline::Data for KeyValueDatabase {
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
fn last_timeline_count(
&self,
sender_user: &UserId,
room_id: &RoomId,
) -> Result<PduCount> {
let lookup = Lookup::LastTimelineCount;
match self
.lasttimelinecount_cache
.lock()
@ -35,15 +38,15 @@ impl service::rooms::timeline::Data for KeyValueDatabase {
r.ok()
})
{
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
Ok(*v.insert(last_count.0))
} else {
FoundIn::Nothing.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Nothing);
Ok(PduCount::Normal(0))
}
}
hash_map::Entry::Occupied(o) => {
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
Ok(*o.get())
}
}
@ -125,10 +128,12 @@ impl service::rooms::timeline::Data for KeyValueDatabase {
/// Returns the pdu.
///
/// Checks the `eventid_outlierpdu` Tree if not found in the timeline.
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
fn get_pdu(&self, event_id: &EventId) -> Result<Option<Arc<PduEvent>>> {
let lookup = Lookup::Pdu;
if let Some(p) = self.pdu_cache.lock().unwrap().get_mut(event_id) {
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(Some(Arc::clone(p)));
}
@ -149,14 +154,14 @@ impl service::rooms::timeline::Data for KeyValueDatabase {
)?
.map(Arc::new)
{
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
self.pdu_cache
.lock()
.unwrap()
.insert(event_id.to_owned(), Arc::clone(&pdu));
Ok(Some(pdu))
} else {
FoundIn::Nothing.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Nothing);
Ok(None)
}
}

View file

@ -18,7 +18,7 @@ use opentelemetry_sdk::{
metrics::{new_view, Aggregation, Instrument, SdkMeterProvider, Stream},
Resource,
};
use strum::AsRefStr;
use strum::{AsRefStr, IntoStaticStr};
use tokio::time::Instant;
use tracing_flame::{FlameLayer, FlushGuard};
use tracing_subscriber::{layer::SubscriberExt, EnvFilter, Layer, Registry};
@ -41,8 +41,33 @@ impl Drop for Guard {
}
}
/// Type to record cache performance in a tracing span field.
#[derive(Clone, Copy, AsRefStr)]
/// A kind of data that gets looked up
///
/// See also [`Metrics::record_lookup`].
// Keep variants sorted
#[allow(clippy::missing_docs_in_private_items)]
#[derive(Clone, Copy, AsRefStr, IntoStaticStr)]
pub(crate) enum Lookup {
AppserviceInRoom,
AuthChain,
CreateEventIdToShort,
CreateStateKeyToShort,
FederationDestination,
LastTimelineCount,
OurRealUsers,
Pdu,
ShortToEventId,
ShortToStateKey,
StateInfo,
StateKeyToShort,
VisibilityForServer,
VisibilityForUser,
}
/// Locations where a [`Lookup`] value may be found
///
/// Not all of these variants are used for each value of [`Lookup`].
#[derive(Clone, Copy, AsRefStr, IntoStaticStr)]
pub(crate) enum FoundIn {
/// Found in cache
Cache,
@ -55,14 +80,6 @@ pub(crate) enum FoundIn {
Nothing,
}
impl FoundIn {
/// Record the current value to the current [`tracing::Span`]
// TODO: use tracing::Value instead if it ever becomes accessible
pub(crate) fn record(self, field: &str) {
tracing::Span::current().record(field, self.as_ref());
}
}
/// Initialize observability
pub(crate) fn init(config: &Config) -> Result<Guard, error::Observability> {
let config_filter_layer = || EnvFilter::try_new(&config.log);
@ -137,6 +154,9 @@ pub(crate) struct Metrics {
/// Histogram of HTTP requests
http_requests_histogram: opentelemetry::metrics::Histogram<f64>,
/// Counts where data is found from
lookup: opentelemetry::metrics::Counter<u64>,
}
impl Metrics {
@ -182,9 +202,15 @@ impl Metrics {
.with_description("Histogram of HTTP requests")
.init();
let lookup = meter
.u64_counter("lookup")
.with_description("Counts where data is found from")
.init();
Metrics {
otel_state: (registry, provider),
http_requests_histogram,
lookup,
}
}
@ -194,6 +220,17 @@ impl Metrics {
.encode_to_string(&self.otel_state.0.gather())
.expect("should be able to encode metrics")
}
/// Record that some data was found in a particular storage location
pub(crate) fn record_lookup(&self, lookup: Lookup, found_in: FoundIn) {
self.lookup.add(
1,
&[
KeyValue::new("lookup", <&str>::from(lookup)),
KeyValue::new("found_in", <&str>::from(found_in)),
],
);
}
}
/// Track HTTP metrics by converting this into an [`axum`] layer

View file

@ -28,8 +28,9 @@ use tokio::sync::MutexGuard;
use tracing::{error, warn};
use crate::{
observability::FoundIn, service::pdu::PduBuilder, services, Error,
PduEvent, Result,
observability::{FoundIn, Lookup, METRICS},
service::pdu::PduBuilder,
services, Error, PduEvent, Result,
};
pub(crate) struct Service {
@ -123,13 +124,15 @@ impl Service {
/// Whether a server is allowed to see an event through federation, based on
/// the room's history_visibility at that event's state.
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
pub(crate) fn server_can_see_event(
&self,
origin: &ServerName,
room_id: &RoomId,
event_id: &EventId,
) -> Result<bool> {
let lookup = Lookup::VisibilityForServer;
let Some(shortstatehash) = self.pdu_shortstatehash(event_id)? else {
return Ok(true);
};
@ -140,7 +143,7 @@ impl Service {
.unwrap()
.get_mut(&(origin.to_owned(), shortstatehash))
{
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(*visibility);
}
@ -191,7 +194,7 @@ impl Service {
}
};
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
self.server_visibility_cache
.lock()
.unwrap()
@ -202,13 +205,15 @@ impl Service {
/// Whether a user is allowed to see an event, based on
/// the room's history_visibility at that event's state.
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
pub(crate) fn user_can_see_event(
&self,
user_id: &UserId,
room_id: &RoomId,
event_id: &EventId,
) -> Result<bool> {
let lookup = Lookup::VisibilityForUser;
let Some(shortstatehash) = self.pdu_shortstatehash(event_id)? else {
return Ok(true);
};
@ -219,7 +224,7 @@ impl Service {
.unwrap()
.get_mut(&(user_id.to_owned(), shortstatehash))
{
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(*visibility);
}
@ -262,7 +267,7 @@ impl Service {
}
};
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
self.user_visibility_cache
.lock()
.unwrap()

View file

@ -10,7 +10,10 @@ use lru_cache::LruCache;
use ruma::{EventId, RoomId};
use self::data::StateDiff;
use crate::{observability::FoundIn, services, utils, Result};
use crate::{
observability::{FoundIn, Lookup, METRICS},
services, utils, Result,
};
#[derive(Clone)]
pub(crate) struct CompressedStateLayer {
@ -33,15 +36,17 @@ impl Service {
/// Returns a stack with info on shortstatehash, full state, added diff and
/// removed diff for the selected shortstatehash and each parent layer.
#[allow(clippy::type_complexity)]
#[tracing::instrument(skip(self), fields(cache_result))]
#[tracing::instrument(skip(self))]
pub(crate) fn load_shortstatehash_info(
&self,
shortstatehash: u64,
) -> Result<Vec<CompressedStateLayer>> {
let lookup = Lookup::StateInfo;
if let Some(r) =
self.stateinfo_cache.lock().unwrap().get_mut(&shortstatehash)
{
FoundIn::Cache.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Cache);
return Ok(r.clone());
}
@ -76,7 +81,7 @@ impl Service {
}]
};
FoundIn::Database.record("cache_result");
METRICS.record_lookup(lookup, FoundIn::Database);
self.stateinfo_cache
.lock()
.unwrap()