From cc5a9d3440a5769436a98aae158a7cc2e08a91d1 Mon Sep 17 00:00:00 2001 From: Benjamin Lee Date: Wed, 5 Jun 2024 21:52:37 -0700 Subject: [PATCH] factor search tokenization out into a function This ensures that the tokenization algorithm will remain in sync between querying, indexing, and deindexing. The existing code had slightly different behavior for querying, because it did not discard words with >50 bytes. This was inconsequential, because >50 byte tokens are never present in the index. --- src/database/key_value/rooms/search.rs | 38 ++++++++++++++------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/src/database/key_value/rooms/search.rs b/src/database/key_value/rooms/search.rs index bd14f312..8975c74d 100644 --- a/src/database/key_value/rooms/search.rs +++ b/src/database/key_value/rooms/search.rs @@ -2,6 +2,17 @@ use ruma::RoomId; use crate::{database::KeyValueDatabase, service, services, utils, Result}; +/// Splits a string into tokens used as keys in the search inverted index +/// +/// This may be used to tokenize both message bodies (for indexing) or search +/// queries (for querying). 
+fn tokenize(body: &str) -> impl Iterator<Item = String> + '_ { + body.split_terminator(|c: char| !c.is_alphanumeric()) + .filter(|s| !s.is_empty()) + .filter(|word| word.len() <= 50) + .map(str::to_lowercase) +} + impl service::rooms::search::Data for KeyValueDatabase { #[tracing::instrument(skip(self))] fn index_pdu( @@ -10,19 +21,14 @@ impl service::rooms::search::Data for KeyValueDatabase { pdu_id: &[u8], message_body: &str, ) -> Result<()> { - let mut batch = message_body - .split_terminator(|c: char| !c.is_alphanumeric()) - .filter(|s| !s.is_empty()) - .filter(|word| word.len() <= 50) - .map(str::to_lowercase) - .map(|word| { - let mut key = shortroomid.to_be_bytes().to_vec(); - key.extend_from_slice(word.as_bytes()); - key.push(0xFF); - // TODO: currently we save the room id a second time here - key.extend_from_slice(pdu_id); - (key, Vec::new()) - }); + let mut batch = tokenize(message_body).map(|word| { + let mut key = shortroomid.to_be_bytes().to_vec(); + key.extend_from_slice(word.as_bytes()); + key.push(0xFF); + // TODO: currently we save the room id a second time here + key.extend_from_slice(pdu_id); + (key, Vec::new()) + }); self.tokenids.insert_batch(&mut batch) } @@ -43,11 +49,7 @@ impl service::rooms::search::Data for KeyValueDatabase { .to_be_bytes() .to_vec(); - let words: Vec<_> = search_string - .split_terminator(|c: char| !c.is_alphanumeric()) - .filter(|s| !s.is_empty()) - .map(str::to_lowercase) - .collect(); + let words: Vec<_> = tokenize(search_string).collect(); let iterators = words.clone().into_iter().map(move |word| { let mut prefix2 = prefix.clone();