Commit 4886f92: extract field_tokens

wsxiaoys committed on May 23, 2024
Parent: 6f54c08

Showing 6 changed files with 40 additions and 50 deletions.

crates/tabby-common/src/index/code/mod.rs (7 changes: 3 additions & 4 deletions)

```diff
@@ -35,10 +35,9 @@ impl CodeSearchSchema {
         let subqueries: Vec<Box<dyn Query>> = tokens
             .iter()
             .map(|text| {
-                let mut term = Term::from_field_json_path(
-                    schema.field_chunk_attributes,
-                    webcode::fields::CHUNK_TOKENIZED_BODY,
-                    false,
+                let mut term = Term::from_field_text(
+                    schema.field_chunk_tokens,
+                    &text
                 );
                 term.append_type_and_str(text.as_ref());
                 let term_query: Box<dyn Query> =
```
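
For context, a minimal sketch of what the new query construction amounts to in tantivy: each code token becomes an exact-match term on the dedicated `chunk_tokens` STRING field, with no JSON path involved. The helper name, field handle, and token list below are stand-ins for the commit's actual schema plumbing, not code from this change.

```rust
use tantivy::{
    query::{BooleanQuery, Query, TermQuery},
    schema::{Field, IndexRecordOption},
    Term,
};

// Hypothetical helper mirroring the hunk above: one TermQuery per token,
// OR-ed together. `field_chunk_tokens` is assumed to be a STRING field.
fn tokens_query(field_chunk_tokens: Field, tokens: &[String]) -> BooleanQuery {
    let subqueries: Vec<Box<dyn Query>> = tokens
        .iter()
        .map(|text| {
            let term = Term::from_field_text(field_chunk_tokens, text);
            Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box<dyn Query>
        })
        .collect();
    BooleanQuery::union(subqueries)
}
```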

crates/tabby-common/src/index/doc.rs (21 changes: 8 additions & 13 deletions)

```diff
@@ -2,23 +2,16 @@ use std::borrow::Cow;
 
 use lazy_static::lazy_static;
 use tantivy::{
-    query::{BooleanQuery, ExistsQuery, Occur, TermQuery},
-    schema::{Field, JsonObjectOptions, Schema, TextFieldIndexing, FAST, INDEXED, STORED, STRING},
-    Term,
+    query::{BooleanQuery, ExistsQuery, Occur, TermQuery}, schema::{Field, JsonObjectOptions, Schema, TextFieldIndexing, FAST, INDEXED, STORED, STRING}, tokenizer::TokenizerManager, Term
 };
 
-use super::new_multiterms_const_query_with_path;
+use super::new_multiterms_const_query;
 
 pub mod webdoc {
     pub mod fields {
         pub const TITLE: &str = "title";
         pub const LINK: &str = "link";
         pub const CHUNK_TEXT: &str = "chunk_text";
-
-        // Binarized embedding tokens with the following mapping:
-        // * [-1, 0] -> 0
-        // * (0, 1] -> 1
-        pub const CHUNK_EMBEDDING: &str = "chunk_embedding";
     }
 }
@@ -27,7 +20,6 @@ pub mod webcode {
         pub const CHUNK_GIT_URL: &str = "chunk_git_url";
         pub const CHUNK_FILEPATH: &str = "chunk_filepath";
         pub const CHUNK_LANGUAGE: &str = "chunk_language";
-        pub const CHUNK_TOKENIZED_BODY: &str = "chunk_tokenized_body";
         pub const CHUNK_BODY: &str = "chunk_body";
         pub const CHUNK_START_LINE: &str = "chunk_start_line";
     }
@@ -46,6 +38,8 @@ pub struct DocSearchSchema {
     // === Fields for chunk ===
     pub field_chunk_id: Field,
     pub field_chunk_attributes: Field,
+
+    pub field_chunk_tokens: Field,
 }
 
 const FIELD_CHUNK_ID: &str = "chunk_id";
@@ -76,6 +70,7 @@ impl DocSearchSchema {
             ),
         );
 
+        let field_chunk_tokens = builder.add_text_field("chunk_tokens", STRING);
         let schema = builder.build();
 
         Self {
@@ -86,6 +81,7 @@ impl DocSearchSchema {
 
             field_chunk_id,
             field_chunk_attributes,
+            field_chunk_tokens,
         }
     }
 
@@ -108,10 +104,9 @@ impl DocSearchSchema {
     ) -> BooleanQuery {
         let iter = DocSearchSchema::binarize_embedding(embedding).map(Cow::Owned);
 
-        new_multiterms_const_query_with_path(
-            self.field_chunk_attributes,
+        new_multiterms_const_query(
+            self.field_chunk_tokens,
             embedding_dims,
-            webdoc::fields::CHUNK_EMBEDDING,
             iter,
         )
     }
```
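
The deleted CHUNK_EMBEDDING comment documented the binarization rule ([-1, 0] -> 0, (0, 1] -> 1) that still feeds `binarize_embedding` above. A hedged sketch of how such a function might turn an embedding into tokens for the new `chunk_tokens` field; the real implementation is not visible in this diff, so the `embedding_{i}_{bit}` token shape below is an assumption.

```rust
// Sketch only: DocSearchSchema::binarize_embedding is not shown in this diff.
// Binarization rule taken from the removed comment: [-1, 0] -> 0, (0, 1] -> 1.
// The "embedding_{i}_{bit}" token encoding is assumed for illustration.
fn binarize_embedding(embedding: &[f32]) -> impl Iterator<Item = String> + '_ {
    embedding.iter().enumerate().map(|(i, v)| {
        let bit = if *v > 0.0 { 1 } else { 0 };
        format!("embedding_{i}_{bit}")
    })
}

fn main() {
    let tokens: Vec<String> = binarize_embedding(&[-0.3, 0.7, 0.0]).collect();
    assert_eq!(tokens, ["embedding_0_0", "embedding_1_1", "embedding_2_0"]);
}
```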

crates/tabby-common/src/index/mod.rs (35 changes: 13 additions & 22 deletions)

```diff
@@ -11,16 +11,14 @@ use tantivy::{
     Term,
 };
 
-fn new_multiterms_const_query_with_path<'a>(
+fn new_multiterms_const_query<'a>(
     field: Field,
     embedding_dims: usize,
-    path: &str,
     terms: impl Iterator<Item = Cow<'a, str>> + 'a,
 ) -> BooleanQuery {
     let subqueries: Vec<Box<dyn Query>> = terms
         .map(|text| {
-            let mut term = Term::from_field_json_path(field, path, false);
-            term.append_type_and_str(text.as_ref());
+            let mut term = Term::from_field_text(field, text.as_ref());
             let term_query: Box<dyn Query> =
                 Box::new(TermQuery::new(term, IndexRecordOption::Basic));
 
@@ -42,36 +40,31 @@ mod tests {
         doc,
         query::Query,
         schema::{Schema, STRING},
-        Index, IndexWriter,
+        Index, IndexWriter, TantivyDocument,
     };
 
     use super::*;
 
-    const PATH: &str = "attr";
-
     #[test]
     fn test_new_multiterms_const_query() -> anyhow::Result<()> {
        let mut schema_builder = Schema::builder();
-        let field1 = schema_builder.add_json_field("field1", STRING);
+        let field1 = schema_builder.add_text_field("field1", STRING);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
             let mut index_writer: IndexWriter = index.writer(15_000_000)?;
 
             // doc1
-            let doc = doc!(
-                field1 => json!({
-                    PATH: vec!["value1", "value2", "value3"]
-                })
-            );
+            let mut doc = TantivyDocument::new();
+            doc.add_text(field1, "value1");
+            doc.add_text(field1, "value2");
+            doc.add_text(field1, "value3");
             index_writer.add_document(doc)?;
 
             // doc2
-            let doc = doc!(
-                field1 => json!({
-                    PATH: vec!["value2", "value4"]
-                })
-            );
+            let mut doc = TantivyDocument::new();
+            doc.add_text(field1, "value2");
+            doc.add_text(field1, "value4");
             index_writer.add_document(doc)?;
 
             index_writer.commit()?;
@@ -80,10 +73,9 @@
         let searcher = reader.searcher();
 
         {
-            let query = new_multiterms_const_query_with_path(
+            let query = new_multiterms_const_query(
                 field1,
                 4,
-                PATH,
                 vec!["value1", "value3"].into_iter().map(Cow::Borrowed),
             );
 
@@ -95,10 +87,9 @@
         }
 
         {
-            let query = new_multiterms_const_query_with_path(
+            let query = new_multiterms_const_query(
                 field1,
                 4,
-                PATH,
                 vec!["value1", "value2", "value3"]
                     .into_iter()
                     .map(Cow::Borrowed),
```
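
The updated test exercises the same behavior through a plain multi-valued STRING field. As a standalone illustration (field name and values invented here, tantivy API as used in the test above): repeated `add_text` calls on one document index several exact-match terms under a single field, which is what replaces the old `json!` array.

```rust
use tantivy::{
    collector::TopDocs,
    query::TermQuery,
    schema::{IndexRecordOption, Schema, STRING},
    Index, IndexWriter, TantivyDocument, Term,
};

fn main() -> tantivy::Result<()> {
    let mut builder = Schema::builder();
    let field = builder.add_text_field("tokens", STRING);
    let index = Index::create_in_ram(builder.build());

    let mut writer: IndexWriter = index.writer(15_000_000)?;
    // Two values on the same field of one document: both become searchable terms.
    let mut doc = TantivyDocument::new();
    doc.add_text(field, "value1");
    doc.add_text(field, "value2");
    writer.add_document(doc)?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = TermQuery::new(
        Term::from_field_text(field, "value2"),
        IndexRecordOption::Basic,
    );
    let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
    assert_eq!(hits.len(), 1);
    Ok(())
}
```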

crates/tabby-scheduler/src/code/mod.rs (8 changes: 4 additions & 4 deletions)

```diff
@@ -67,7 +67,7 @@ impl DocumentBuilder<SourceCode> for CodeBuilder {
     async fn build_chunk_attributes(
         &self,
         source_file: &SourceCode,
-    ) -> BoxStream<serde_json::Value> {
+    ) -> BoxStream<(Vec<String>, serde_json::Value)> {
         let text = match source_file.read_content() {
             Ok(content) => content,
             Err(e) => {
@@ -84,14 +84,14 @@ impl DocumentBuilder<SourceCode> for CodeBuilder {
         let s = stream! {
             let intelligence = CodeIntelligence::default();
             for (start_line, body) in intelligence.chunks(&text) {
-                yield json!({
+                let tokens = CodeSearchSchema::tokenize_code(body);
+                yield (tokens, json!({
                     webcode::fields::CHUNK_FILEPATH: source_file.filepath,
                     webcode::fields::CHUNK_GIT_URL: source_file.git_url,
                     webcode::fields::CHUNK_LANGUAGE: source_file.language,
-                    webcode::fields::CHUNK_TOKENIZED_BODY: CodeSearchSchema::tokenize_code(body),
                     webcode::fields::CHUNK_BODY: body,
                     webcode::fields::CHUNK_START_LINE: start_line,
-                });
+                }));
             }
         };
 
```
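
To make the new contract concrete, here is a hedged, self-contained sketch of a builder body that yields `(tokens, attributes)` pairs the way `build_chunk_attributes` now does. The chunk list and the whitespace tokenizer are invented stand-ins for `CodeIntelligence::chunks` and `CodeSearchSchema::tokenize_code`, and the `futures` BoxStream alias stands in for tabby's.

```rust
use async_stream::stream;
use futures::stream::BoxStream;
use serde_json::json;

// Invented stand-in for the builder above: tokens travel beside the JSON
// attributes instead of being buried inside them as a tokenized body field.
fn build_chunks<'a>(
    chunks: Vec<(usize, String)>,
) -> BoxStream<'a, (Vec<String>, serde_json::Value)> {
    Box::pin(stream! {
        for (start_line, body) in chunks {
            // Whitespace split is a placeholder for the real code tokenizer.
            let tokens: Vec<String> = body.split_whitespace().map(str::to_owned).collect();
            yield (tokens, json!({
                "chunk_body": body,
                "chunk_start_line": start_line,
            }));
        }
    })
}
```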

crates/tabby-scheduler/src/index.rs (12 changes: 9 additions & 3 deletions)

```diff
@@ -9,7 +9,7 @@ pub trait DocumentBuilder<T>: Send + Sync {
     fn format_id(&self, id: &str) -> String;
     async fn build_id(&self, document: &T) -> String;
     async fn build_attributes(&self, document: &T) -> serde_json::Value;
-    async fn build_chunk_attributes(&self, document: &T) -> BoxStream<serde_json::Value>;
+    async fn build_chunk_attributes(&self, document: &T) -> BoxStream<(Vec<String>, serde_json::Value)>;
 }
 
 pub struct DocIndex<T> {
@@ -74,13 +74,19 @@ impl<T> DocIndex<T> {
             .build_chunk_attributes(&document)
             .await
             .enumerate()
-            .map(move |(chunk_id, chunk_attributes)| {
-                doc! {
+            .map(move |(chunk_id, (tokens, chunk_attributes))| {
+                let mut doc = doc! {
                     schema.field_id => id,
                     schema.field_updated_at => updated_at,
                     schema.field_chunk_id => format!("{}-{}", id, chunk_id),
                     schema.field_chunk_attributes => chunk_attributes,
-                }
+                };
+
+                for token in tokens {
+                    doc.add_text(schema.field_chunk_tokens, token);
+                }
+
+                doc
            })
     }
 
```
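
The `doc!` macro has no slot for a loop, so the change binds the document first and then appends one value per token. A minimal sketch of that pattern under the same tantivy API (schema and field names re-declared locally for illustration, not the commit's schema):

```rust
use tantivy::{
    doc,
    schema::{Schema, STORED, STRING},
    TantivyDocument,
};

fn main() {
    let mut builder = Schema::builder();
    let field_id = builder.add_text_field("id", STRING | STORED);
    let field_chunk_tokens = builder.add_text_field("chunk_tokens", STRING);
    let _schema = builder.build();

    // Start from the macro-built document, then append the multi-valued tokens.
    let tokens = vec!["fn".to_string(), "main".to_string()];
    let mut doc: TantivyDocument = doc! {
        field_id => "doc-1"
    };
    for token in tokens {
        doc.add_text(field_chunk_tokens, token);
    }
    // `doc` now carries the id value plus one entry per token.
}
```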

crates/tabby-scheduler/src/web/mod.rs (7 changes: 3 additions & 4 deletions)

```diff
@@ -53,7 +53,7 @@ impl DocumentBuilder<SourceDocument> for WebBuilder {
     async fn build_chunk_attributes(
         &self,
         document: &SourceDocument,
-    ) -> BoxStream<serde_json::Value> {
+    ) -> BoxStream<(Vec<String>, serde_json::Value)> {
         let splitter = TextSplitter::default().with_trim_chunks(true);
         let embedding = self.embedding.clone();
         let content = document.body.clone();
@@ -76,10 +76,9 @@
                 let chunk = json!({
                     // FIXME: tokenize chunk text
                     webdoc::fields::CHUNK_TEXT: chunk_text,
-                    webdoc::fields::CHUNK_EMBEDDING: chunk_embedding_tokens,
                 });
 
-                yield chunk
+                yield (chunk_embedding_tokens, chunk)
             }
         };
 
```
