Skip to content

Commit

Permalink
chore: skip chunk if embedding failed
Browse files Browse the repository at this point in the history
Signed-off-by: Wei Zhang <[email protected]>
  • Loading branch information
zwpaper committed Nov 29, 2024
1 parent 32cc5cd commit 5432bc9
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 66 deletions.
2 changes: 1 addition & 1 deletion crates/tabby-common/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ impl IndexSchema {

let field_updated_at = builder.add_date_field(FIELD_UPDATED_AT, INDEXED | STORED);
let field_failed_chunks_count =
builder.add_u64_field(FIELD_FAILED_CHUNKS_COUNT, INDEXED | FAST);
builder.add_u64_field(FIELD_FAILED_CHUNKS_COUNT, INDEXED | FAST | STORED);
let field_attributes = builder.add_text_field("attributes", STORED);

let field_chunk_id = builder.add_text_field(FIELD_CHUNK_ID, STRING | FAST | STORED);
Expand Down
14 changes: 12 additions & 2 deletions crates/tabby-index/src/code/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ async fn add_changed_documents(

let id = SourceCode::to_index_id(&repository.source_id, &key).id;

if cloned_index.is_indexed(&id) {
// Skip if already indexed
// Skip if already indexed and has no failed chunks
if !require_updates(cloned_index.clone(), &id) {
continue;
}

Expand All @@ -110,6 +110,8 @@ async fn add_changed_documents(
}

let (_, s) = builder.build(code).await;
// must delete before adding, otherwise some fields like failed_chunks_count will remain
cloned_index.delete(&id);
for await task in s {
yield task;
}
Expand All @@ -133,6 +135,14 @@ async fn add_changed_documents(
count_docs
}

/// Reports whether the document identified by `id` has to be (re)indexed.
///
/// The document may be skipped only when `is_indexed` reports it as present
/// and `has_failed_chunks` reports `false` for it; every other combination
/// requires an update.
fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool {
    // De Morgan form of `!(is_indexed && !has_failed_chunks)`: the calls are
    // evaluated in the same order and short-circuit the same way.
    !indexer.is_indexed(id) || indexer.has_failed_chunks(id)
}

fn is_valid_file(file: &SourceCode) -> bool {
file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD
&& file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD
Expand Down
26 changes: 2 additions & 24 deletions crates/tabby-index/src/code/intelligence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,34 +247,12 @@ mod metrics {

#[cfg(test)]
pub mod tests {
use std::path::PathBuf;

use serial_test::file_serial;
use tabby_common::{
config::{config_index_to_id, CodeRepository},
path::set_tabby_root,
};
use tabby_common::path::set_tabby_root;
use tracing_test::traced_test;

use super::*;

/// Returns the `testdata` directory under this crate's manifest directory.
pub fn get_tabby_root() -> PathBuf {
    // CARGO_MANIFEST_DIR is resolved at compile time by `env!`.
    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("testdata")
}

/// Builds the `CodeRepository` test fixture for the TabbyML/tabby git URL,
/// paired with the id produced by `config_index_to_id(0)`.
pub fn get_repository_config() -> CodeRepository {
    let id = config_index_to_id(0);
    CodeRepository::new("https://github.com/TabbyML/tabby", &id)
}

/// Returns the path of the `rust.rs` fixture inside the test Tabby root:
/// `<tabby root>/repositories/https_github.com_TabbyML_tabby/rust.rs`.
pub fn get_rust_source_file() -> PathBuf {
    get_tabby_root()
        .join("repositories")
        .join("https_github.com_TabbyML_tabby")
        .join("rust.rs")
}
use crate::testutils::{get_repository_config, get_rust_source_file, get_tabby_root};

#[test]
#[traced_test]
Expand Down
1 change: 0 additions & 1 deletion crates/tabby-index/src/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ async fn build_binarize_embedding_tokens(
let embedding = match embedding.embed(body).await {
Ok(x) => x,
Err(err) => {
warn!("Failed to embed chunk text: {}", err);
bail!("Failed to embed chunk text: {}", err);
}
};
Expand Down
32 changes: 8 additions & 24 deletions crates/tabby-index/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,15 @@ impl<T: ToIndexId> TantivyDocBuilder<T> {
let mut failed_count: u64 = 0;
for await chunk_doc in self.build_chunks(cloned_id, source_id.clone(), updated_at, document).await {
match chunk_doc.await {
Ok((Some(doc), ok)) => {
if !ok {
failed_count += 1;
}
Ok(Ok(doc)) => {
yield tokio::spawn(async move { Some(doc) });
}
Ok((None, _)) => {
Ok(Err(e)) => {
warn!("Failed to build chunk for document '{}': {}", doc_id, e);
failed_count += 1;
}
Err(e) => {
warn!("Failed to build chunk for document '{}': {}", doc_id, e);
warn!("Failed to call build chunk '{}': {}", doc_id, e);
failed_count += 1;
}
}
Expand Down Expand Up @@ -118,32 +116,18 @@ impl<T: ToIndexId> TantivyDocBuilder<T> {
source_id: String,
updated_at: tantivy::DateTime,
document: T,
) -> impl Stream<Item = JoinHandle<(Option<TantivyDocument>, bool)>> + '_ {
) -> impl Stream<Item = JoinHandle<Result<TantivyDocument>>> + '_ {
let kind = self.corpus;
stream! {
let schema = IndexSchema::instance();
for await (chunk_id, task) in self.builder.build_chunk_attributes(&document).await.enumerate() {
let id = id.clone();
let source_id = source_id.clone();

// The tokens may be empty if the embedding call fails,
// but the attributes remain useful.
// Therefore, we return:
// the document, and
// a flag indicating whether the tokens were created successfully.
yield tokio::spawn(async move {
let Ok(built_chunk_attributes_result) = task.await else {
// Join error, there is no attr, return None and false
return (None, false);
};
let built_chunk_attributes_result = task.await?;
let (tokens, chunk_attributes) = built_chunk_attributes_result?;

let (tokens, chunk_attributes) = match built_chunk_attributes_result{
Ok((tokens, chunk_attributes)) => (tokens, chunk_attributes),
Err(e) => {
warn!("Failed to build chunk attributes for document '{}': {}", id, e);
return (None, false);
}
};
let mut doc = doc! {
schema.field_id => id,
schema.field_source_id => source_id,
Expand All @@ -157,7 +141,7 @@ impl<T: ToIndexId> TantivyDocBuilder<T> {
doc.add_text(schema.field_chunk_tokens, token);
}

(Some(doc), !tokens.is_empty())
Ok(doc)
});
}
}
Expand Down
26 changes: 12 additions & 14 deletions crates/tabby-index/src/indexer_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,18 +163,13 @@ mod builder_tests {

use super::mock_embedding::MockEmbedding;
use crate::{
code::{
create_code_builder,
intelligence::{
tests::{get_repository_config, get_rust_source_file, get_tabby_root},
CodeIntelligence,
},
},
code::{create_code_builder, intelligence::CodeIntelligence},
indexer::{TantivyDocBuilder, ToIndexId},
structured_doc::{
public::{StructuredDoc, StructuredDocFields, StructuredDocIssueFields},
StructuredDocBuilder,
},
testutils::{get_repository_config, get_rust_source_file, get_tabby_root},
};

#[test]
Expand Down Expand Up @@ -204,7 +199,9 @@ mod builder_tests {
.await
});

assert_eq!(res.len(), 4);
// every chunk fails because no embedding is provided,
// so the only (and last) element is the document itself
assert_eq!(res.len(), 1);
let doc = res.last().unwrap().as_ref().unwrap().as_ref().unwrap();

let schema = IndexSchema::instance();
Expand All @@ -213,8 +210,7 @@ mod builder_tests {
.and_then(|v| v.as_u64())
.unwrap();

// the last element is the document itself
// the first three are the chunks and should be failed as no embedding is provided
// the first three are the chunks and failed, counted as 3
assert_eq!(failed_count, 3);

tabby_common::path::set_tabby_root(origin_root);
Expand Down Expand Up @@ -258,10 +254,12 @@ mod builder_tests {
.await
});

// the last element is the document itself
// the rest are the chunks
assert_eq!(res.len(), 2);
let doc = res[1].as_ref().unwrap().as_ref().unwrap();
// The last element is the document itself,
// while the preceding elements are the chunks.
// Given that the embedding is empty,
// all chunks should be considered failed and skipped.
assert_eq!(res.len(), 1);
let doc = res.last().unwrap().as_ref().unwrap().as_ref().unwrap();

let schema = IndexSchema::instance();
let failed_count = doc
Expand Down
3 changes: 3 additions & 0 deletions crates/tabby-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ mod code;
mod indexer;
mod tantivy_utils;

#[cfg(test)]
mod testutils;

use indexer::{IndexAttributeBuilder, Indexer};

mod structured_doc;
Expand Down
21 changes: 21 additions & 0 deletions crates/tabby-index/src/testutils.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
use std::path::PathBuf;

use tabby_common::config::{config_index_to_id, CodeRepository};

/// Returns the `testdata` directory under this crate's manifest directory.
pub fn get_tabby_root() -> PathBuf {
    // CARGO_MANIFEST_DIR is resolved at compile time by `env!`.
    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("testdata")
}

/// Builds the `CodeRepository` test fixture for the TabbyML/tabby git URL,
/// paired with the id produced by `config_index_to_id(0)`.
pub fn get_repository_config() -> CodeRepository {
    let id = config_index_to_id(0);
    CodeRepository::new("https://github.com/TabbyML/tabby", &id)
}

/// Returns the path of the `rust.rs` fixture inside the test Tabby root:
/// `<tabby root>/repositories/https_github.com_TabbyML_tabby/rust.rs`.
pub fn get_rust_source_file() -> PathBuf {
    get_tabby_root()
        .join("repositories")
        .join("https_github.com_TabbyML_tabby")
        .join("rust.rs")
}

0 comments on commit 5432bc9

Please sign in to comment.