
Commit

extract field_chunk_tokens
wsxiaoys committed May 23, 2024
1 parent 56116b0 commit 894e319
Showing 5 changed files with 12 additions and 17 deletions.
6 changes: 1 addition & 5 deletions crates/tabby-common/src/index/code/mod.rs
@@ -35,11 +35,7 @@ impl CodeSearchSchema {
         let subqueries: Vec<Box<dyn Query>> = tokens
             .iter()
             .map(|text| {
-                let mut term = Term::from_field_text(
-                    schema.field_chunk_tokens,
-                    &text
-                );
-                term.append_type_and_str(text.as_ref());
+                let term = Term::from_field_text(schema.field_chunk_tokens, text);
                 let term_query: Box<dyn Query> =
                     Box::new(TermQuery::new(term, IndexRecordOption::Basic));
 
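For context, the pattern in this hunk builds one TermQuery per chunk token and ORs them into a single BooleanQuery. A minimal sketch of that shape against the tantivy API, with an illustrative field and function name rather than Tabby's own:

    use tantivy::{
        query::{BooleanQuery, Occur, Query, TermQuery},
        schema::{Field, IndexRecordOption},
        Term,
    };

    /// Sketch: OR together one TermQuery per token against a single field.
    fn tokens_to_query(field: Field, tokens: &[String]) -> BooleanQuery {
        let subqueries: Vec<(Occur, Box<dyn Query>)> = tokens
            .iter()
            .map(|text| {
                let term = Term::from_field_text(field, text);
                let query: Box<dyn Query> =
                    Box::new(TermQuery::new(term, IndexRecordOption::Basic));
                (Occur::Should, query)
            })
            .collect();
        BooleanQuery::new(subqueries)
    }
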
10 changes: 4 additions & 6 deletions crates/tabby-common/src/index/doc.rs
@@ -2,7 +2,9 @@ use std::borrow::Cow;
 
 use lazy_static::lazy_static;
 use tantivy::{
-    query::{BooleanQuery, ExistsQuery, Occur, TermQuery}, schema::{Field, JsonObjectOptions, Schema, TextFieldIndexing, FAST, INDEXED, STORED, STRING}, tokenizer::TokenizerManager, Term
+    query::{BooleanQuery, ExistsQuery, Occur, TermQuery},
+    schema::{Field, JsonObjectOptions, Schema, TextFieldIndexing, FAST, INDEXED, STORED, STRING},
+    Term,
 };
 
 use super::new_multiterms_const_query;
@@ -104,11 +106,7 @@ impl DocSearchSchema {
     ) -> BooleanQuery {
         let iter = DocSearchSchema::binarize_embedding(embedding).map(Cow::Owned);
 
-        new_multiterms_const_query(
-            self.field_chunk_tokens,
-            embedding_dims,
-            iter,
-        )
+        new_multiterms_const_query(self.field_chunk_tokens, embedding_dims, iter)
     }
 
     /// Build a query to find the document with the given `doc_id`.
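
The chunk-embedding query now goes through the shared new_multiterms_const_query helper: the embedding is binarized into string tokens, and each token is matched against field_chunk_tokens. The diff does not show binarize_embedding, so the encoding below is only a guess at the general idea, not Tabby's actual token format:

    /// Hypothetical sketch: one token per embedding dimension, encoding the sign,
    /// e.g. "embedding_12_1" for a positive value in dimension 12.
    fn embedding_to_tokens(embedding: &[f32]) -> impl Iterator<Item = String> + '_ {
        embedding.iter().enumerate().map(|(i, v)| {
            let bit = u8::from(*v > 0.0);
            format!("embedding_{i}_{bit}")
        })
    }
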
5 changes: 2 additions & 3 deletions crates/tabby-common/src/index/mod.rs
@@ -18,7 +18,7 @@ fn new_multiterms_const_query<'a>(
 ) -> BooleanQuery {
     let subqueries: Vec<Box<dyn Query>> = terms
         .map(|text| {
-            let mut term = Term::from_field_text(field, text.as_ref());
+            let term = Term::from_field_text(field, text.as_ref());
             let term_query: Box<dyn Query> =
                 Box::new(TermQuery::new(term, IndexRecordOption::Basic));
 
@@ -34,10 +34,9 @@ fn new_multiterms_const_query<'a>(
 
 #[cfg(test)]
 mod tests {
-    use serde_json::json;
-
     use tantivy::{
         collector::TopDocs,
+        doc,
         query::Query,
         schema::{Schema, STRING},
         Index, IndexWriter, TantivyDocument,
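
The adjusted test imports (TopDocs, doc, Index, IndexWriter) point at a small in-memory index exercising the helper. A self-contained sketch of such a harness, assuming nothing about the real test beyond the imports shown:

    use tantivy::{
        collector::TopDocs,
        doc,
        query::{BooleanQuery, Query, TermQuery},
        schema::{IndexRecordOption, Schema, STRING},
        Index, IndexWriter, Term,
    };

    fn main() -> tantivy::Result<()> {
        // Tiny in-memory index with one raw-string (untokenized) field.
        let mut builder = Schema::builder();
        let tokens = builder.add_text_field("chunk_tokens", STRING);
        let schema = builder.build();

        let index = Index::create_in_ram(schema);
        let mut writer: IndexWriter = index.writer(50_000_000)?;
        writer.add_document(doc!(tokens => "embedding_0_1"))?;
        writer.add_document(doc!(tokens => "embedding_1_0"))?;
        writer.commit()?;

        // OR two term queries together, mirroring the multi-term construction above.
        let query = BooleanQuery::union(vec![
            Box::new(TermQuery::new(
                Term::from_field_text(tokens, "embedding_0_1"),
                IndexRecordOption::Basic,
            )) as Box<dyn Query>,
            Box::new(TermQuery::new(
                Term::from_field_text(tokens, "embedding_1_0"),
                IndexRecordOption::Basic,
            )),
        ]);

        let searcher = index.reader()?.searcher();
        let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
        assert_eq!(hits.len(), 2);
        Ok(())
    }
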
5 changes: 4 additions & 1 deletion crates/tabby-scheduler/src/index.rs
@@ -9,7 +9,10 @@ pub trait DocumentBuilder<T>: Send + Sync {
     fn format_id(&self, id: &str) -> String;
     async fn build_id(&self, document: &T) -> String;
     async fn build_attributes(&self, document: &T) -> serde_json::Value;
-    async fn build_chunk_attributes(&self, document: &T) -> BoxStream<(Vec<String>, serde_json::Value)>;
+    async fn build_chunk_attributes(
+        &self,
+        document: &T,
+    ) -> BoxStream<(Vec<String>, serde_json::Value)>;
 }
 
 pub struct DocIndex<T> {
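
With this signature, each chunk is emitted together with its embedding tokens instead of carrying them inside the chunk's JSON attributes. A rough sketch of a producer with that return shape, assuming futures::stream::BoxStream and made-up chunking, tokenization, and the "chunk_text" attribute name:

    use futures::stream::{self, BoxStream, StreamExt};
    use serde_json::json;

    /// Hypothetical producer: split a document body into chunks and emit,
    /// per chunk, its embedding tokens alongside its JSON attributes.
    fn chunks_stream(body: String) -> BoxStream<'static, (Vec<String>, serde_json::Value)> {
        let chunks: Vec<String> = body.split("\n\n").map(|s| s.to_owned()).collect();
        stream::iter(chunks.into_iter().map(|chunk_text| {
            // Placeholder tokenization; the real embedding tokens come from elsewhere.
            let tokens: Vec<String> = chunk_text
                .split_whitespace()
                .map(|s| s.to_owned())
                .collect();
            let attrs = json!({ "chunk_text": chunk_text });
            (tokens, attrs)
        }))
        .boxed()
    }
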
3 changes: 1 addition & 2 deletions crates/tabby-scheduler/src/web/mod.rs
@@ -74,10 +74,9 @@ impl DocumentBuilder<SourceDocument> for WebBuilder {
            }

            let chunk = json!({
                // FIXME: tokenize chunk text
                webdoc::fields::CHUNK_TEXT: chunk_text,
            });

            yield (chunk_embedding_tokens, chunk)
        }
    };
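
The FIXME notes that chunk_text is still stored untokenized at this point. One plausible direction, sketched with tantivy's stock tokenizer chain (the specific filters are arbitrary choices for the example, not what Tabby settles on):

    use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

    /// Sketch: lowercase tokenization on word boundaries, dropping very long tokens.
    fn tokenize(text: &str) -> Vec<String> {
        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(RemoveLongFilter::limit(64))
            .filter(LowerCaser)
            .build();
        let mut tokens = Vec::new();
        analyzer
            .token_stream(text)
            .process(&mut |token| tokens.push(token.text.clone()));
        tokens
    }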
