mirror of
https://gitlab.computer.surgery/matrix/grapevine.git
synced 2025-12-18 00:01:24 +01:00
factor search tokenization out into a function
This ensures that the tokenization algorithm remains in sync between querying, indexing, and deindexing. The existing code behaved slightly differently for querying, because it did not discard words longer than 50 bytes. This was inconsequential, because tokens longer than 50 bytes are never present in the index.
This commit is contained in:
parent
0c2094a56f
commit
cc5a9d3440
1 changed files with 20 additions and 18 deletions
|
|
@ -2,6 +2,17 @@ use ruma::RoomId;
|
||||||
|
|
||||||
use crate::{database::KeyValueDatabase, service, services, utils, Result};
|
use crate::{database::KeyValueDatabase, service, services, utils, Result};
|
||||||
|
|
||||||
|
/// Splits a string into the tokens used as keys in the search inverted index.
///
/// Used for tokenizing message bodies (when indexing and deindexing) and
/// search queries (when querying), so all three stay in sync.
fn tokenize(body: &str) -> impl Iterator<Item = String> + '_ {
    // Split on every non-alphanumeric character, drop empty fragments and
    // overly long words (> 50 bytes, measured before lowercasing), and
    // normalize the survivors to lowercase.
    body.split_terminator(|c: char| !c.is_alphanumeric())
        .filter_map(|word| {
            let keep = !word.is_empty() && word.len() <= 50;
            keep.then(|| word.to_lowercase())
        })
}
|
||||||
|
|
||||||
impl service::rooms::search::Data for KeyValueDatabase {
|
impl service::rooms::search::Data for KeyValueDatabase {
|
||||||
#[tracing::instrument(skip(self))]
|
#[tracing::instrument(skip(self))]
|
||||||
fn index_pdu(
|
fn index_pdu(
|
||||||
|
|
@ -10,19 +21,14 @@ impl service::rooms::search::Data for KeyValueDatabase {
|
||||||
pdu_id: &[u8],
|
pdu_id: &[u8],
|
||||||
message_body: &str,
|
message_body: &str,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut batch = message_body
|
let mut batch = tokenize(message_body).map(|word| {
|
||||||
.split_terminator(|c: char| !c.is_alphanumeric())
|
let mut key = shortroomid.to_be_bytes().to_vec();
|
||||||
.filter(|s| !s.is_empty())
|
key.extend_from_slice(word.as_bytes());
|
||||||
.filter(|word| word.len() <= 50)
|
key.push(0xFF);
|
||||||
.map(str::to_lowercase)
|
// TODO: currently we save the room id a second time here
|
||||||
.map(|word| {
|
key.extend_from_slice(pdu_id);
|
||||||
let mut key = shortroomid.to_be_bytes().to_vec();
|
(key, Vec::new())
|
||||||
key.extend_from_slice(word.as_bytes());
|
});
|
||||||
key.push(0xFF);
|
|
||||||
// TODO: currently we save the room id a second time here
|
|
||||||
key.extend_from_slice(pdu_id);
|
|
||||||
(key, Vec::new())
|
|
||||||
});
|
|
||||||
|
|
||||||
self.tokenids.insert_batch(&mut batch)
|
self.tokenids.insert_batch(&mut batch)
|
||||||
}
|
}
|
||||||
|
|
@ -43,11 +49,7 @@ impl service::rooms::search::Data for KeyValueDatabase {
|
||||||
.to_be_bytes()
|
.to_be_bytes()
|
||||||
.to_vec();
|
.to_vec();
|
||||||
|
|
||||||
let words: Vec<_> = search_string
|
let words: Vec<_> = tokenize(search_string).collect();
|
||||||
.split_terminator(|c: char| !c.is_alphanumeric())
|
|
||||||
.filter(|s| !s.is_empty())
|
|
||||||
.map(str::to_lowercase)
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let iterators = words.clone().into_iter().map(move |word| {
|
let iterators = words.clone().into_iter().map(move |word| {
|
||||||
let mut prefix2 = prefix.clone();
|
let mut prefix2 = prefix.clone();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue