use ruma::RoomId; use crate::{ database::KeyValueDatabase, service::{ self, rooms::{short::ShortRoomId, timeline::PduId}, }, services, utils, Result, }; /// Splits a string into tokens used as keys in the search inverted index /// /// This may be used to tokenize both message bodies (for indexing) or search /// queries (for querying). fn tokenize(body: &str) -> impl Iterator + '_ { body.split_terminator(|c: char| !c.is_alphanumeric()) .filter(|s| !s.is_empty()) .filter(|word| word.len() <= 50) .map(str::to_lowercase) } impl service::rooms::search::Data for KeyValueDatabase { #[tracing::instrument(skip(self))] fn index_pdu( &self, shortroomid: ShortRoomId, pdu_id: &PduId, message_body: &str, ) -> Result<()> { let mut batch = tokenize(message_body).map(|word| { let mut key = shortroomid.get().to_be_bytes().to_vec(); key.extend_from_slice(word.as_bytes()); key.push(0xFF); // TODO: currently we save the room id a second time here key.extend_from_slice(pdu_id.as_bytes()); (key, Vec::new()) }); self.tokenids.insert_batch(&mut batch) } #[tracing::instrument(skip(self))] fn deindex_pdu( &self, shortroomid: ShortRoomId, pdu_id: &PduId, message_body: &str, ) -> Result<()> { let batch = tokenize(message_body).map(|word| { let mut key = shortroomid.get().to_be_bytes().to_vec(); key.extend_from_slice(word.as_bytes()); key.push(0xFF); // TODO: currently we save the room id a second time here key.extend_from_slice(pdu_id.as_bytes()); key }); for token in batch { self.tokenids.remove(&token)?; } Ok(()) } #[tracing::instrument(skip(self))] #[expect(clippy::type_complexity)] fn search_pdus<'a>( &'a self, room_id: &RoomId, search_string: &str, ) -> Result + 'a>, Vec)>> { let prefix = services() .rooms .short .get_shortroomid(room_id)? .expect("room exists") .get() .to_be_bytes() .to_vec(); let words: Vec<_> = tokenize(search_string).collect(); let iterators = words.clone().into_iter().map(move |word| { let mut prefix2 = prefix.clone(); prefix2.extend_from_slice(word.as_bytes()); prefix2.push(0xFF); let prefix3 = prefix2.clone(); let mut last_possible_id = prefix2.clone(); last_possible_id.extend_from_slice(&u64::MAX.to_be_bytes()); self.tokenids // Newest pdus first .iter_from(&last_possible_id, true) .take_while(move |(k, _)| k.starts_with(&prefix2)) .map(move |(key, _)| PduId::new(key[prefix3.len()..].to_vec())) }); // We compare b with a because we reversed the iterator earlier let Some(common_elements) = utils::common_elements(iterators, |a, b| { b.as_bytes().cmp(a.as_bytes()) }) else { return Ok(None); }; Ok(Some((Box::new(common_elements), words))) } }