grapevine/src/database/key_value/rooms/search.rs
2024-09-28 11:27:05 +00:00

112 lines
3.4 KiB
Rust

use ruma::RoomId;
use crate::{
database::KeyValueDatabase,
service::{
self,
rooms::{short::ShortRoomId, timeline::PduId},
},
services, utils, Result,
};
/// Breaks text into the lowercase words used as keys in the search inverted
/// index.
///
/// The same tokenization is meant for message bodies (when indexing) and for
/// search queries (when querying), so both sides produce matching keys.
///
/// Words are the maximal runs of alphanumeric characters in `body`; every
/// other character acts as a separator. A word whose UTF-8 encoding is longer
/// than 50 bytes is skipped entirely.
fn tokenize(body: &str) -> impl Iterator<Item = String> + '_ {
    body.split(|c: char| !c.is_alphanumeric()).filter_map(|word| {
        // Adjacent separators produce empty pieces; those are never indexed.
        let indexable = !word.is_empty() && word.len() <= 50;
        indexable.then(|| word.to_lowercase())
    })
}
/// Builds the part of a token key shared by every PDU containing `word` in a
/// room: the room id bytes, the word's bytes, then a `0xFF` delimiter.
fn token_prefix(room_prefix: &[u8], word: &str) -> Vec<u8> {
    let mut prefix = room_prefix.to_vec();
    prefix.extend_from_slice(word.as_bytes());
    prefix.push(0xFF);
    prefix
}

/// Builds the full token key for one `(word, pdu)` pair: the token prefix
/// followed by the bytes of the PDU id.
fn token_key(room_prefix: &[u8], word: &str, pdu_id: &PduId) -> Vec<u8> {
    let mut key = token_prefix(room_prefix, word);
    // TODO: currently we save the room id a second time here
    key.extend_from_slice(pdu_id.as_bytes());
    key
}

impl service::rooms::search::Data for KeyValueDatabase {
    /// Adds one entry to `tokenids` for every token of `message_body`, keyed
    /// by room, token and PDU id. The stored values are empty.
    #[tracing::instrument(skip(self))]
    fn index_pdu(
        &self,
        shortroomid: ShortRoomId,
        pdu_id: &PduId,
        message_body: &str,
    ) -> Result<()> {
        let room_prefix = shortroomid.get().to_be_bytes();

        let mut entries = tokenize(message_body)
            .map(|word| (token_key(&room_prefix, &word, pdu_id), Vec::new()));

        self.tokenids.insert_batch(&mut entries)
    }

    /// Removes from `tokenids` the entry of every token of `message_body`
    /// for the given room and PDU id, stopping at the first failed removal.
    #[tracing::instrument(skip(self))]
    fn deindex_pdu(
        &self,
        shortroomid: ShortRoomId,
        pdu_id: &PduId,
        message_body: &str,
    ) -> Result<()> {
        let room_prefix = shortroomid.get().to_be_bytes();

        for word in tokenize(message_body) {
            let key = token_key(&room_prefix, &word, pdu_id);
            self.tokenids.remove(&key)?;
        }

        Ok(())
    }

    /// Looks up the PDUs of `room_id` that contain every token of
    /// `search_string`.
    ///
    /// On a match set, returns the iterator of PDU ids together with the
    /// tokens that were searched for. Returns `Ok(None)` when
    /// `utils::common_elements` yields nothing to iterate.
    ///
    /// # Panics
    ///
    /// Panics if no short room id is known for `room_id`.
    #[tracing::instrument(skip(self))]
    #[expect(clippy::type_complexity)]
    fn search_pdus<'a>(
        &'a self,
        room_id: &RoomId,
        search_string: &str,
    ) -> Result<Option<(Box<dyn Iterator<Item = PduId> + 'a>, Vec<String>)>>
    {
        let room_prefix = services()
            .rooms
            .short
            .get_shortroomid(room_id)?
            .expect("room exists")
            .get()
            .to_be_bytes()
            .to_vec();

        let words: Vec<_> = tokenize(search_string).collect();

        // One iterator of PDU ids per search word. `words` is cloned because
        // the original list is handed back to the caller below.
        let per_word_matches = words.clone().into_iter().map(move |word| {
            let word_prefix = token_prefix(&room_prefix, &word);
            // Everything after the prefix in a key is the PDU id.
            let pdu_id_offset = word_prefix.len();

            let mut scan_start = word_prefix.clone();
            scan_start.extend_from_slice(&u64::MAX.to_be_bytes());

            self.tokenids
                // Newest pdus first
                .iter_from(&scan_start, true)
                .take_while(move |(key, _)| key.starts_with(&word_prefix))
                .map(move |(key, _)| {
                    PduId::new(key[pdu_id_offset..].to_vec())
                })
        });

        // We compare rhs with lhs because we reversed the iterator earlier
        let shared = utils::common_elements(per_word_matches, |lhs, rhs| {
            rhs.as_bytes().cmp(lhs.as_bytes())
        });

        match shared {
            Some(pdu_ids) => Ok(Some((Box::new(pdu_ids), words))),
            None => Ok(None),
        }
    }
}