Commit 4886f92: extract field_tokens

wsxiaoys committed on May 23, 2024
Parent: 6f54c08

Showing 6 changed files with 40 additions and 50 deletions.

crates/tabby-common/src/index/code/mod.rs (7 changes: 3 additions & 4 deletions)

```diff
@@ -35,10 +35,9 @@ impl CodeSearchSchema {
         let subqueries: Vec<Box<dyn Query>> = tokens
             .iter()
             .map(|text| {
-                let mut term = Term::from_field_json_path(
-                    schema.field_chunk_attributes,
-                    webcode::fields::CHUNK_TOKENIZED_BODY,
-                    false,
+                let mut term = Term::from_field_text(
+                    schema.field_chunk_tokens,
+                    &text
                 );
                 term.append_type_and_str(text.as_ref());
                 let term_query: Box<dyn Query> =
```
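
For context, a minimal sketch of what the new query construction amounts to in tantivy: each code token becomes an exact-match term on the dedicated `chunk_tokens` STRING field, with no JSON path involved. The helper name, field handle, and token list below are stand-ins for the commit's actual schema plumbing, not code from this change.

```rust
use tantivy::{
    query::{BooleanQuery, Query, TermQuery},
    schema::{Field, IndexRecordOption},
    Term,
};

// Hypothetical helper mirroring the hunk above: one TermQuery per token,
// OR-ed together. `field_chunk_tokens` is assumed to be a STRING field.
fn tokens_query(field_chunk_tokens: Field, tokens: &[String]) -> BooleanQuery {
    let subqueries: Vec<Box<dyn Query>> = tokens
        .iter()
        .map(|text| {
            let term = Term::from_field_text(field_chunk_tokens, text);
            Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box<dyn Query>
        })
        .collect();
    BooleanQuery::union(subqueries)
}
```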

crates/tabby-common/src/index/doc.rs (21 changes: 8 additions & 13 deletions)

```diff
@@ -2,23 +2,16 @@ use std::borrow::Cow;
 
 use lazy_static::lazy_static;
 use tantivy::{
-    query::{BooleanQuery, ExistsQuery, Occur, TermQuery},
-    schema::{Field, JsonObjectOptions, Schema, TextFieldIndexing, FAST, INDEXED, STORED, STRING},
-    Term,
+    query::{BooleanQuery, ExistsQuery, Occur, TermQuery}, schema::{Field, JsonObjectOptions, Schema, TextFieldIndexing, FAST, INDEXED, STORED, STRING}, tokenizer::TokenizerManager, Term
 };
 
-use super::new_multiterms_const_query_with_path;
+use super::new_multiterms_const_query;
 
 pub mod webdoc {
     pub mod fields {
         pub const TITLE: &str = "title";
         pub const LINK: &str = "link";
         pub const CHUNK_TEXT: &str = "chunk_text";
-
-        // Binarized embedding tokens with the following mapping:
-        // * [-1, 0] -> 0
-        // * (0, 1] -> 1
-        pub const CHUNK_EMBEDDING: &str = "chunk_embedding";
     }
 }
@@ -27,7 +20,6 @@ pub mod webcode {
         pub const CHUNK_GIT_URL: &str = "chunk_git_url";
         pub const CHUNK_FILEPATH: &str = "chunk_filepath";
         pub const CHUNK_LANGUAGE: &str = "chunk_language";
-        pub const CHUNK_TOKENIZED_BODY: &str = "chunk_tokenized_body";
         pub const CHUNK_BODY: &str = "chunk_body";
         pub const CHUNK_START_LINE: &str = "chunk_start_line";
     }
@@ -46,6 +38,8 @@ pub struct DocSearchSchema {
     // === Fields for chunk ===
     pub field_chunk_id: Field,
     pub field_chunk_attributes: Field,
+
+    pub field_chunk_tokens: Field,
 }
 
 const FIELD_CHUNK_ID: &str = "chunk_id";
@@ -76,6 +70,7 @@ impl DocSearchSchema {
             ),
         );
 
+        let field_chunk_tokens = builder.add_text_field("chunk_tokens", STRING);
         let schema = builder.build();
 
         Self {
@@ -86,6 +81,7 @@ impl DocSearchSchema {
 
             field_chunk_id,
             field_chunk_attributes,
+            field_chunk_tokens,
         }
     }
 
@@ -108,10 +104,9 @@ impl DocSearchSchema {
     ) -> BooleanQuery {
         let iter = DocSearchSchema::binarize_embedding(embedding).map(Cow::Owned);
 
-        new_multiterms_const_query_with_path(
-            self.field_chunk_attributes,
+        new_multiterms_const_query(
+            self.field_chunk_tokens,
             embedding_dims,
-            webdoc::fields::CHUNK_EMBEDDING,
             iter,
         )
     }
```
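
The deleted CHUNK_EMBEDDING comment documented the binarization rule ([-1, 0] -> 0, (0, 1] -> 1) that still feeds `binarize_embedding` above. A hedged sketch of how such a function might turn an embedding into tokens for the new `chunk_tokens` field; the real implementation is not visible in this diff, so the `embedding_{i}_{bit}` token shape below is an assumption.

```rust
// Sketch only: DocSearchSchema::binarize_embedding is not shown in this diff.
// Binarization rule taken from the removed comment: [-1, 0] -> 0, (0, 1] -> 1.
// The "embedding_{i}_{bit}" token encoding is assumed for illustration.
fn binarize_embedding(embedding: &[f32]) -> impl Iterator<Item = String> + '_ {
    embedding.iter().enumerate().map(|(i, v)| {
        let bit = if *v > 0.0 { 1 } else { 0 };
        format!("embedding_{i}_{bit}")
    })
}

fn main() {
    let tokens: Vec<String> = binarize_embedding(&[-0.3, 0.7, 0.0]).collect();
    assert_eq!(tokens, ["embedding_0_0", "embedding_1_1", "embedding_2_0"]);
}
```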

crates/tabby-common/src/index/mod.rs (35 changes: 13 additions & 22 deletions)

```diff
@@ -11,16 +11,14 @@ use tantivy::{
     Term,
 };
 
-fn new_multiterms_const_query_with_path<'a>(
+fn new_multiterms_const_query<'a>(
     field: Field,
     embedding_dims: usize,
-    path: &str,
     terms: impl Iterator<Item = Cow<'a, str>> + 'a,
 ) -> BooleanQuery {
     let subqueries: Vec<Box<dyn Query>> = terms
         .map(|text| {
-            let mut term = Term::from_field_json_path(field, path, false);
-            term.append_type_and_str(text.as_ref());
+            let mut term = Term::from_field_text(field, text.as_ref());
             let term_query: Box<dyn Query> =
                 Box::new(TermQuery::new(term, IndexRecordOption::Basic));
 
@@ -42,36 +40,31 @@ mod tests {
         doc,
         query::Query,
         schema::{Schema, STRING},
-        Index, IndexWriter,
+        Index, IndexWriter, TantivyDocument,
     };
 
     use super::*;
 
-    const PATH: &str = "attr";
-
     #[test]
     fn test_new_multiterms_const_query() -> anyhow::Result<()> {
        let mut schema_builder = Schema::builder();
-        let field1 = schema_builder.add_json_field("field1", STRING);
+        let field1 = schema_builder.add_text_field("field1", STRING);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
             let mut index_writer: IndexWriter = index.writer(15_000_000)?;
 
             // doc1
-            let doc = doc!(
-                field1 => json!({
-                    PATH: vec!["value1", "value2", "value3"]
-                })
-            );
+            let mut doc = TantivyDocument::new();
+            doc.add_text(field1, "value1");
+            doc.add_text(field1, "value2");
+            doc.add_text(field1, "value3");
             index_writer.add_document(doc)?;
 
             // doc2
-            let doc = doc!(
-                field1 => json!({
-                    PATH: vec!["value2", "value4"]
-                })
-            );
+            let mut doc = TantivyDocument::new();
+            doc.add_text(field1, "value2");
+            doc.add_text(field1, "value4");
             index_writer.add_document(doc)?;
 
             index_writer.commit()?;
@@ -80,10 +73,9 @@
         let searcher = reader.searcher();
 
         {
-            let query = new_multiterms_const_query_with_path(
+            let query = new_multiterms_const_query(
                 field1,
                 4,
-                PATH,
                 vec!["value1", "value3"].into_iter().map(Cow::Borrowed),
             );
 
@@ -95,10 +87,9 @@
         }
 
         {
-            let query = new_multiterms_const_query_with_path(
+            let query = new_multiterms_const_query(
                 field1,
                 4,
-                PATH,
                 vec!["value1", "value2", "value3"]
                     .into_iter()
                     .map(Cow::Borrowed),
```
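
The updated test exercises the same behavior through a plain multi-valued STRING field. As a standalone illustration (field name and values invented here, tantivy API as used in the test above): repeated `add_text` calls on one document index several exact-match terms under a single field, which is what replaces the old `json!` array.

```rust
use tantivy::{
    collector::TopDocs,
    query::TermQuery,
    schema::{IndexRecordOption, Schema, STRING},
    Index, IndexWriter, TantivyDocument, Term,
};

fn main() -> tantivy::Result<()> {
    let mut builder = Schema::builder();
    let field = builder.add_text_field("tokens", STRING);
    let index = Index::create_in_ram(builder.build());

    let mut writer: IndexWriter = index.writer(15_000_000)?;
    // Two values on the same field of one document: both become searchable terms.
    let mut doc = TantivyDocument::new();
    doc.add_text(field, "value1");
    doc.add_text(field, "value2");
    writer.add_document(doc)?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let query = TermQuery::new(
        Term::from_field_text(field, "value2"),
        IndexRecordOption::Basic,
    );
    let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
    assert_eq!(hits.len(), 1);
    Ok(())
}
```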

crates/tabby-scheduler/src/code/mod.rs (8 changes: 4 additions & 4 deletions)

```diff
@@ -67,7 +67,7 @@ impl DocumentBuilder<SourceCode> for CodeBuilder {
     async fn build_chunk_attributes(
         &self,
         source_file: &SourceCode,
-    ) -> BoxStream<serde_json::Value> {
+    ) -> BoxStream<(Vec<String>, serde_json::Value)> {
         let text = match source_file.read_content() {
             Ok(content) => content,
             Err(e) => {
@@ -84,14 +84,14 @@ impl DocumentBuilder<SourceCode> for CodeBuilder {
         let s = stream! {
             let intelligence = CodeIntelligence::default();
             for (start_line, body) in intelligence.chunks(&text) {
-                yield json!({
+                let tokens = CodeSearchSchema::tokenize_code(body);
+                yield (tokens, json!({
                     webcode::fields::CHUNK_FILEPATH: source_file.filepath,
                     webcode::fields::CHUNK_GIT_URL: source_file.git_url,
                     webcode::fields::CHUNK_LANGUAGE: source_file.language,
-                    webcode::fields::CHUNK_TOKENIZED_BODY: CodeSearchSchema::tokenize_code(body),
                     webcode::fields::CHUNK_BODY: body,
                     webcode::fields::CHUNK_START_LINE: start_line,
-                });
+                }));
             }
         };
 
```
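
To make the new contract concrete, here is a hedged, self-contained sketch of a builder body that yields `(tokens, attributes)` pairs the way `build_chunk_attributes` now does. The chunk list and the whitespace tokenizer are invented stand-ins for `CodeIntelligence::chunks` and `CodeSearchSchema::tokenize_code`, and the `futures` BoxStream alias stands in for tabby's.

```rust
use async_stream::stream;
use futures::stream::BoxStream;
use serde_json::json;

// Invented stand-in for the builder above: tokens travel beside the JSON
// attributes instead of being buried inside them as a tokenized body field.
fn build_chunks<'a>(
    chunks: Vec<(usize, String)>,
) -> BoxStream<'a, (Vec<String>, serde_json::Value)> {
    Box::pin(stream! {
        for (start_line, body) in chunks {
            // Whitespace split is a placeholder for the real code tokenizer.
            let tokens: Vec<String> = body.split_whitespace().map(str::to_owned).collect();
            yield (tokens, json!({
                "chunk_body": body,
                "chunk_start_line": start_line,
            }));
        }
    })
}
```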

crates/tabby-scheduler/src/index.rs (12 changes: 9 additions & 3 deletions)

```diff
@@ -9,7 +9,7 @@ pub trait DocumentBuilder<T>: Send + Sync {
     fn format_id(&self, id: &str) -> String;
     async fn build_id(&self, document: &T) -> String;
     async fn build_attributes(&self, document: &T) -> serde_json::Value;
-    async fn build_chunk_attributes(&self, document: &T) -> BoxStream<serde_json::Value>;
+    async fn build_chunk_attributes(&self, document: &T) -> BoxStream<(Vec<String>, serde_json::Value)>;
 }
 
 pub struct DocIndex<T> {
@@ -74,13 +74,19 @@ impl<T> DocIndex<T> {
             .build_chunk_attributes(&document)
             .await
             .enumerate()
-            .map(move |(chunk_id, chunk_attributes)| {
-                doc! {
+            .map(move |(chunk_id, (tokens, chunk_attributes))| {
+                let mut doc = doc! {
                     schema.field_id => id,
                     schema.field_updated_at => updated_at,
                     schema.field_chunk_id => format!("{}-{}", id, chunk_id),
                     schema.field_chunk_attributes => chunk_attributes,
-                }
+                };
+
+                for token in tokens {
+                    doc.add_text(schema.field_chunk_tokens, token);
+                }
+
+                doc
            })
     }
 
```
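
The `doc!` macro has no slot for a loop, so the change binds the document first and then appends one value per token. A minimal sketch of that pattern under the same tantivy API (schema and field names re-declared locally for illustration, not the commit's schema):

```rust
use tantivy::{
    doc,
    schema::{Schema, STORED, STRING},
    TantivyDocument,
};

fn main() {
    let mut builder = Schema::builder();
    let field_id = builder.add_text_field("id", STRING | STORED);
    let field_chunk_tokens = builder.add_text_field("chunk_tokens", STRING);
    let _schema = builder.build();

    // Start from the macro-built document, then append the multi-valued tokens.
    let tokens = vec!["fn".to_string(), "main".to_string()];
    let mut doc: TantivyDocument = doc! {
        field_id => "doc-1"
    };
    for token in tokens {
        doc.add_text(field_chunk_tokens, token);
    }
    // `doc` now carries the id value plus one entry per token.
}
```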

crates/tabby-scheduler/src/web/mod.rs (7 changes: 3 additions & 4 deletions)

```diff
@@ -53,7 +53,7 @@ impl DocumentBuilder<SourceDocument> for WebBuilder {
     async fn build_chunk_attributes(
         &self,
         document: &SourceDocument,
-    ) -> BoxStream<serde_json::Value> {
+    ) -> BoxStream<(Vec<String>, serde_json::Value)> {
         let splitter = TextSplitter::default().with_trim_chunks(true);
         let embedding = self.embedding.clone();
         let content = document.body.clone();
@@ -76,10 +76,9 @@
                 let chunk = json!({
                     // FIXME: tokenize chunk text
                     webdoc::fields::CHUNK_TEXT: chunk_text,
-                    webdoc::fields::CHUNK_EMBEDDING: chunk_embedding_tokens,
                 });
 
-                yield chunk
+                yield (chunk_embedding_tokens, chunk)
             }
         };
 
```
