Skip to content

Commit

Permalink
feat(index): add indexed author_email field to issue and pull (#3513)
Browse files Browse the repository at this point in the history
* chore: add indexed author_email field to issue and pull

Signed-off-by: Wei Zhang <[email protected]>

* chore: author_email optional

Signed-off-by: Wei Zhang <[email protected]>

* chore: use new func to get option json from index

Signed-off-by: Wei Zhang <[email protected]>

---------

Signed-off-by: Wei Zhang <[email protected]>
  • Loading branch information
zwpaper authored Dec 5, 2024
1 parent 97561aa commit 36331a9
Show file tree
Hide file tree
Showing 10 changed files with 209 additions and 23 deletions.
35 changes: 35 additions & 0 deletions crates/tabby-common/src/api/structured_doc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ pub struct DocSearchWebDocument {
/// Issue document produced by doc search.
///
/// Instances are built from the JSON attributes stored on the indexed document
/// (see the `FromTantivyDocument` implementation for this type).
pub struct DocSearchIssueDocument {
/// Value of the `title` attribute.
pub title: String,
/// Value of the `link` attribute.
pub link: String,
/// Value of the `author_email` attribute; `None` when the attribute is
/// absent from the indexed document or is not a string.
pub author_email: Option<String>,
/// Value of the `body` attribute.
pub body: String,
/// Value of the `closed` attribute.
pub closed: bool,
}
Expand All @@ -70,6 +71,7 @@ pub struct DocSearchIssueDocument {
pub struct DocSearchPullDocument {
pub title: String,
pub link: String,
pub author_email: Option<String>,
pub body: String,
pub diff: String,
pub merged: bool,
Expand Down Expand Up @@ -139,6 +141,11 @@ impl FromTantivyDocument for DocSearchIssueDocument {
schema.field_attributes,
structured_doc::fields::issue::LINK,
);
let author_email = get_json_option_text_field(
doc,
schema.field_attributes,
structured_doc::fields::issue::AUTHOR_EMAIL,
);
let body = get_json_text_field(
doc,
schema.field_attributes,
Expand All @@ -152,6 +159,7 @@ impl FromTantivyDocument for DocSearchIssueDocument {
Some(Self {
title: title.into(),
link: link.into(),
author_email: author_email.map(Into::into),
body: body.into(),
closed,
})
Expand All @@ -171,6 +179,11 @@ impl FromTantivyDocument for DocSearchPullDocument {
schema.field_attributes,
structured_doc::fields::pull::LINK,
);
let author_email = get_json_option_text_field(
doc,
schema.field_attributes,
structured_doc::fields::pull::AUTHOR_EMAIL,
);
let body = get_json_text_field(
doc,
schema.field_attributes,
Expand All @@ -189,6 +202,7 @@ impl FromTantivyDocument for DocSearchPullDocument {
Some(Self {
title: title.into(),
link: link.into(),
author_email: author_email.map(Into::into),
body: body.into(),
diff: diff.into(),
merged,
Expand Down Expand Up @@ -217,3 +231,24 @@ fn get_json_bool_field(doc: &TantivyDocument, field: schema::Field, name: &str)
/// Read the attribute `name` from the JSON value stored in `field` of `doc`
/// and return it as a string slice borrowed from the document.
///
/// # Panics
///
/// Panics when the attribute value is not a string. Lookup failures are
/// handled by `get_json_field` (presumably by panicking as well — confirm
/// against its definition).
fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
    let value = get_json_field(doc, field, name);
    value.as_str().unwrap()
}

/// Look up the attribute `name` inside the JSON object stored in `field` of `doc`.
///
/// Returns `None` when the document has no value for `field`, when that value
/// is not an object, or when the object has no entry named `name`.
fn get_json_option_field<'a>(
    doc: &'a TantivyDocument,
    field: schema::Field,
    name: &str,
) -> Option<CompactDocValue<'a>> {
    // Iterate over the `(key, value)` entries of the stored JSON object.
    let mut entries = doc.get_first(field)?.as_object()?;
    entries
        .find(|(key, _)| *key == name)
        .map(|(_, value)| value)
}

/// Read the optional text attribute `name` from the JSON object stored in
/// `field` of `doc`.
///
/// Returns `None` when the attribute cannot be found or when its value is not
/// a string.
fn get_json_option_text_field<'a>(
    doc: &'a TantivyDocument,
    field: schema::Field,
    name: &str,
) -> Option<&'a str> {
    let value = get_json_option_field(doc, field, name)?;
    value.as_str()
}
44 changes: 40 additions & 4 deletions crates/tabby-common/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,14 @@ pub struct IndexSchema {
/// Last updated time for the document in index.
pub field_updated_at: Field,

/// Number of failed chunks during indexing.
pub field_failed_chunks_count: Field,
// ==========================================

// === Fields for document ===
/// JSON attributes for the document, it's only stored but not indexed.
/// JSON attributes for the document, it's indexed and stored.
pub field_attributes: Field,

/// Number of failed chunks during indexing.
pub field_failed_chunks_count: Field,
// ===========================

// === Fields for chunk ===
Expand All @@ -76,6 +77,7 @@ const FIELD_CHUNK_ID: &str = "chunk_id";
const FIELD_UPDATED_AT: &str = "updated_at";
const FIELD_FAILED_CHUNKS_COUNT: &str = "failed_chunks_count";
pub const FIELD_SOURCE_ID: &str = "source_id";
pub const FIELD_ATTRIBUTES: &str = "attributes";

pub mod corpus {
pub const CODE: &str = "code";
Expand Down Expand Up @@ -103,7 +105,18 @@ impl IndexSchema {
let field_updated_at = builder.add_date_field(FIELD_UPDATED_AT, INDEXED | STORED);
let field_failed_chunks_count =
builder.add_u64_field(FIELD_FAILED_CHUNKS_COUNT, INDEXED | FAST | STORED);
let field_attributes = builder.add_text_field("attributes", STORED);
let field_attributes = builder.add_json_field(
FIELD_ATTRIBUTES,
JsonObjectOptions::default()
.set_stored()
.set_fast(Some("raw"))
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(tantivy::schema::IndexRecordOption::Basic)
.set_fieldnorms(true),
),
);

let field_chunk_id = builder.add_text_field(FIELD_CHUNK_ID, STRING | FAST | STORED);
let field_chunk_attributes = builder.add_json_field(
Expand Down Expand Up @@ -228,6 +241,29 @@ impl IndexSchema {
])
}

/// Build a query that matches the document `doc_id` of `corpus` only when its
/// attributes contain the given `field`.
pub fn doc_has_attribute_field(&self, corpus: &str, doc_id: &str, field: &str) -> impl Query {
    // Clause restricting the search to the requested document id.
    let id_term = Term::from_field_text(self.field_id, doc_id);
    let matches_doc_id = TermQuery::new(id_term, tantivy::schema::IndexRecordOption::Basic);

    // The attribute is addressed as a sub-path of the attributes field,
    // i.e. `attributes.<field>`.
    let attribute_path = format!("{}.{}", FIELD_ATTRIBUTES, field);
    let attribute_exists = ExistsQuery::new_exists_query(attribute_path);

    // Every clause is required: corpus, document id and attribute presence.
    BooleanQuery::new(vec![
        (Occur::Must, self.corpus_query(corpus)),
        (Occur::Must, Box::new(matches_doc_id)),
        (Occur::Must, Box::new(attribute_exists)),
    ])
}

/// Build a query to find the document with the given `doc_id`, include chunks.
pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
let doc_id_query = TermQuery::new(
Expand Down
2 changes: 2 additions & 0 deletions crates/tabby-common/src/index/structured_doc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ pub mod fields {
/// Names of the JSON attributes used for issue documents.
pub mod issue {
pub const TITLE: &str = "title";
pub const LINK: &str = "link";
// Optional attribute: readers look it up with an `Option`-returning accessor.
pub const AUTHOR_EMAIL: &str = "author_email";
pub const BODY: &str = "body";
pub const CLOSED: &str = "closed";
}

pub mod pull {
pub const TITLE: &str = "title";
pub const LINK: &str = "link";
pub const AUTHOR_EMAIL: &str = "author_email";
pub const BODY: &str = "body";
pub const DIFF: &str = "diff";
pub const MERGED: &str = "merged";
Expand Down
18 changes: 15 additions & 3 deletions crates/tabby-index/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,9 @@ impl Indexer {
!docs.is_empty()
}

/// Get the failed_chunks_count field for a document.
/// tracks the number of embedding indexing failed chunks for a document.
/// Check whether the document has failed chunks.
///
/// return 0 if the field is not found.
/// The failed chunks count tracks the number of chunks whose embedding indexing failed for a document.
pub fn has_failed_chunks(&self, id: &str) -> bool {
let schema = IndexSchema::instance();
let query = schema.doc_has_failed_chunks(&self.corpus, id);
Expand All @@ -278,6 +277,19 @@ impl Indexer {

!docs.is_empty()
}

/// Check whether the document `id` has the attribute `field` in the index.
///
/// A search error is logged at debug level and reported as `false`.
pub fn has_attribute_field(&self, id: &str, field: &str) -> bool {
    let schema = IndexSchema::instance();
    let query = schema.doc_has_attribute_field(&self.corpus, id, field);

    // A single hit is enough to prove the attribute exists.
    self.searcher
        .search(&query, &TopDocs::with_limit(1))
        .map(|hits| !hits.is_empty())
        .unwrap_or_else(|e| {
            debug!("query tantivy error: {}", e);
            false
        })
}
}

pub struct IndexGarbageCollector {
Expand Down
67 changes: 52 additions & 15 deletions crates/tabby-index/src/indexer_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ mod structured_doc_tests {
use std::sync::Arc;

use serial_test::file_serial;
use tabby_common::index::corpus;
use tabby_common::index::{corpus, structured_doc::fields as StructuredDocIndexFields};
use temp_testdir::TempDir;

use super::mock_embedding::MockEmbedding;
Expand Down Expand Up @@ -63,6 +63,7 @@ mod structured_doc_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: id.to_owned(),
title: "title".to_owned(),
author_email: Some("author_email".to_owned()),
body: "body".to_owned(),
closed: false,
}),
Expand All @@ -86,13 +87,7 @@ mod structured_doc_tests {
indexer.commit();

let validator = Indexer::new(corpus::STRUCTURED_DOC);
// Wait for up to 60s for the document to be indexed.
for _ in 0..10 {
if validator.is_indexed(id) {
break;
}
std::thread::sleep(std::time::Duration::from_secs(1));
}

assert!(validator.is_indexed(id));
assert!(validator.has_failed_chunks(id));

Expand All @@ -115,6 +110,7 @@ mod structured_doc_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: id.to_owned(),
title: "title".to_owned(),
author_email: Some("author_email".to_owned()),
body: "body".to_owned(),
closed: false,
}),
Expand All @@ -138,18 +134,57 @@ mod structured_doc_tests {
indexer.commit();

let validator = Indexer::new(corpus::STRUCTURED_DOC);
// Wait for up to 60s for the document to be indexed.
for _ in 0..10 {
if validator.is_indexed(id) {
break;
}
std::thread::sleep(std::time::Duration::from_secs(1));
}

assert!(validator.is_indexed(id));
assert!(!validator.has_failed_chunks(id));

tabby_common::path::set_tabby_root(root);
}

#[test]
#[file_serial(set_tabby_root)]
fn test_structured_doc_has_attribute_field() {
    // Redirect the tabby root to a scratch directory for the duration of the
    // test; the previous root is restored on the last line.
    let original_root = tabby_common::path::tabby_root();
    let scratch_dir = TempDir::default();
    tabby_common::path::set_tabby_root(scratch_dir.to_owned());

    let doc_id = "structured_doc_has_attribute_field";
    let embedding = Arc::new(MockEmbedding::new(vec![1.0], false));
    let indexer = StructuredDocIndexer::new(embedding.clone());

    // An issue that carries an author email, so the attribute should be indexed.
    let issue = StructuredDocIssueFields {
        link: doc_id.to_owned(),
        title: "title".to_owned(),
        author_email: Some("author_email".to_owned()),
        body: "body".to_owned(),
        closed: false,
    };
    let doc = StructuredDoc {
        source_id: "source".to_owned(),
        fields: StructuredDocFields::Issue(issue),
    };

    let state = StructuredDocState {
        updated_at: chrono::Utc::now(),
        deleted: false,
    };
    let synced = tokio::runtime::Runtime::new()
        .unwrap()
        .block_on(async { indexer.sync(state, doc).await });
    assert!(synced);
    indexer.commit();

    // Inspect the committed index through a separate indexer instance.
    let validator = Indexer::new(corpus::STRUCTURED_DOC);

    assert!(validator.is_indexed(doc_id));
    assert!(validator.has_attribute_field(doc_id, StructuredDocIndexFields::issue::AUTHOR_EMAIL));

    tabby_common::path::set_tabby_root(original_root);
}
}

mod builder_tests {
Expand Down Expand Up @@ -239,6 +274,7 @@ mod builder_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: test_id.to_owned(),
title: "title".to_owned(),
author_email: Some("author_email".to_owned()),
body: "body".to_owned(),
closed: false,
}),
Expand Down Expand Up @@ -300,6 +336,7 @@ mod builder_tests {
fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: test_id.to_owned(),
title: "title".to_owned(),
author_email: Some("author_email".to_owned()),
body: "body".to_owned(),
closed: false,
}),
Expand Down
35 changes: 34 additions & 1 deletion crates/tabby-index/src/structured_doc/public.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::sync::Arc;
use async_stream::stream;
use chrono::{DateTime, Utc};
use futures::StreamExt;
use tabby_common::index::corpus;
use tabby_common::index::{corpus, structured_doc::fields as StructuredDocIndexFields};
use tabby_inference::Embedding;

pub use super::types::{
Expand Down Expand Up @@ -85,6 +85,10 @@ impl StructuredDocIndexer {
return false;
}

if self.should_backfill(document) {
return true;
}

if self.indexer.is_indexed_after(document.id(), updated_at)
&& !self.indexer.has_failed_chunks(document.id())
{
Expand All @@ -93,4 +97,33 @@ impl StructuredDocIndexer {

true
}

/// Decide whether `document` has to be synced again to backfill an attribute.
///
/// Returns `true` when the incoming issue or pull document carries an author
/// email but the index has no `author_email` attribute for its id. Every other
/// case yields `false`.
fn should_backfill(&self, document: &StructuredDoc) -> bool {
    // v0.22.0 added the author field to the issue and pull documents.
    let (has_author_email, attribute) = match &document.fields {
        StructuredDocFields::Issue(issue) => (
            issue.author_email.is_some(),
            StructuredDocIndexFields::issue::AUTHOR_EMAIL,
        ),
        StructuredDocFields::Pull(pull) => (
            pull.author_email.is_some(),
            StructuredDocIndexFields::pull::AUTHOR_EMAIL,
        ),
        // Other document kinds have nothing to backfill.
        _ => return false,
    };

    // The index is only queried when the document actually has an email.
    has_author_email && !self.indexer.has_attribute_field(document.id(), attribute)
}
}
2 changes: 2 additions & 0 deletions crates/tabby-index/src/structured_doc/types/issue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use super::{build_tokens, BuildStructuredDoc};
/// Issue document to be indexed; its fields are written into the document's
/// JSON attributes under the keys defined in `fields::issue`.
pub struct IssueDocument {
/// Stored under `fields::issue::LINK`.
pub link: String,
/// Stored under `fields::issue::TITLE`.
pub title: String,
/// Stored under `fields::issue::AUTHOR_EMAIL`; optional.
pub author_email: Option<String>,
/// Stored under `fields::issue::BODY`.
pub body: String,
/// Stored under `fields::issue::CLOSED`.
pub closed: bool,
}
Expand All @@ -28,6 +29,7 @@ impl BuildStructuredDoc for IssueDocument {
json!({
fields::issue::LINK: self.link,
fields::issue::TITLE: self.title,
fields::issue::AUTHOR_EMAIL: self.author_email,
fields::issue::BODY: self.body,
fields::issue::CLOSED: self.closed,
})
Expand Down
Loading

0 comments on commit 36331a9

Please sign in to comment.