From 471442257dee1009f6851b44b59da4a9d91ed14d Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Wed, 22 May 2024 22:09:16 -0700 Subject: [PATCH] extract field_chunk_tokens --- crates/tabby-common/src/index/code/mod.rs | 6 +----- crates/tabby-common/src/index/doc.rs | 10 ++++------ crates/tabby-common/src/index/mod.rs | 5 ++--- crates/tabby-scheduler/src/index.rs | 5 ++++- crates/tabby-scheduler/src/web/mod.rs | 3 +-- 5 files changed, 12 insertions(+), 17 deletions(-) diff --git a/crates/tabby-common/src/index/code/mod.rs b/crates/tabby-common/src/index/code/mod.rs index 262a8a8bf4fe..9f09c741ebe6 100644 --- a/crates/tabby-common/src/index/code/mod.rs +++ b/crates/tabby-common/src/index/code/mod.rs @@ -35,11 +35,7 @@ impl CodeSearchSchema { let subqueries: Vec> = tokens .iter() .map(|text| { - let mut term = Term::from_field_text( - schema.field_chunk_tokens, - &text - ); - term.append_type_and_str(text.as_ref()); + let term = Term::from_field_text(schema.field_chunk_tokens, text); let term_query: Box = Box::new(TermQuery::new(term, IndexRecordOption::Basic)); diff --git a/crates/tabby-common/src/index/doc.rs b/crates/tabby-common/src/index/doc.rs index 6d8946635d2b..31bcc7dbcbdf 100644 --- a/crates/tabby-common/src/index/doc.rs +++ b/crates/tabby-common/src/index/doc.rs @@ -2,7 +2,9 @@ use std::borrow::Cow; use lazy_static::lazy_static; use tantivy::{ - query::{BooleanQuery, ExistsQuery, Occur, TermQuery}, schema::{Field, JsonObjectOptions, Schema, TextFieldIndexing, FAST, INDEXED, STORED, STRING}, tokenizer::TokenizerManager, Term + query::{BooleanQuery, ExistsQuery, Occur, TermQuery}, + schema::{Field, JsonObjectOptions, Schema, TextFieldIndexing, FAST, INDEXED, STORED, STRING}, + Term, }; use super::new_multiterms_const_query; @@ -104,11 +106,7 @@ impl DocSearchSchema { ) -> BooleanQuery { let iter = DocSearchSchema::binarize_embedding(embedding).map(Cow::Owned); - new_multiterms_const_query( - self.field_chunk_tokens, - embedding_dims, - iter, - ) + new_multiterms_const_query(self.field_chunk_tokens, embedding_dims, iter) } /// Build a query to find the document with the given `doc_id`. diff --git a/crates/tabby-common/src/index/mod.rs b/crates/tabby-common/src/index/mod.rs index a49c77300781..83a8a9a9f036 100644 --- a/crates/tabby-common/src/index/mod.rs +++ b/crates/tabby-common/src/index/mod.rs @@ -18,7 +18,7 @@ fn new_multiterms_const_query<'a>( ) -> BooleanQuery { let subqueries: Vec> = terms .map(|text| { - let mut term = Term::from_field_text(field, text.as_ref()); + let term = Term::from_field_text(field, text.as_ref()); let term_query: Box = Box::new(TermQuery::new(term, IndexRecordOption::Basic)); @@ -34,10 +34,9 @@ fn new_multiterms_const_query<'a>( #[cfg(test)] mod tests { - use serde_json::json; + use tantivy::{ collector::TopDocs, - doc, query::Query, schema::{Schema, STRING}, Index, IndexWriter, TantivyDocument, diff --git a/crates/tabby-scheduler/src/index.rs b/crates/tabby-scheduler/src/index.rs index 47aa74fdb9ed..0abd0172ce6a 100644 --- a/crates/tabby-scheduler/src/index.rs +++ b/crates/tabby-scheduler/src/index.rs @@ -9,7 +9,10 @@ pub trait DocumentBuilder: Send + Sync { fn format_id(&self, id: &str) -> String; async fn build_id(&self, document: &T) -> String; async fn build_attributes(&self, document: &T) -> serde_json::Value; - async fn build_chunk_attributes(&self, document: &T) -> BoxStream<(Vec, serde_json::Value)>; + async fn build_chunk_attributes( + &self, + document: &T, + ) -> BoxStream<(Vec, serde_json::Value)>; } pub struct DocIndex { diff --git a/crates/tabby-scheduler/src/web/mod.rs b/crates/tabby-scheduler/src/web/mod.rs index dbd7077838d3..b902dc9e141e 100644 --- a/crates/tabby-scheduler/src/web/mod.rs +++ b/crates/tabby-scheduler/src/web/mod.rs @@ -74,10 +74,9 @@ impl DocumentBuilder for WebBuilder { } let chunk = json!({ - // FIXME: tokenize chunk text webdoc::fields::CHUNK_TEXT: chunk_text, }); - + yield (chunk_embedding_tokens, chunk) } };