Skip to content

Commit

Permalink
fix(index): return attr even when embedding failed
Browse files Browse the repository at this point in the history
Signed-off-by: Wei Zhang <[email protected]>
  • Loading branch information
zwpaper committed Nov 28, 2024
1 parent d75a30e commit 0c27f3e
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 48 deletions.
13 changes: 5 additions & 8 deletions crates/tabby-index/src/code/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::sync::Arc;

use anyhow::{bail, Result};
use anyhow::{anyhow, bail, Result};
use async_stream::stream;
use async_trait::async_trait;
use futures::stream::BoxStream;
Expand Down Expand Up @@ -69,7 +69,7 @@ impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
async fn build_chunk_attributes<'a>(
&self,
source_code: &'a SourceCode,
) -> BoxStream<'a, JoinHandle<Result<(Vec<String>, serde_json::Value)>>> {
) -> BoxStream<'a, JoinHandle<(Result<Vec<String>>, serde_json::Value)>> {
let text = match source_code.read_content() {
Ok(content) => content,
Err(e) => {
Expand All @@ -81,7 +81,7 @@ impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
return Box::pin(stream! {
let path = source_code.filepath.clone();
yield tokio::spawn(async move {
bail!("Failed to read content of '{}': {}", path, e);
(Err(anyhow!("Failed to read content of '{}': {}", path, e)), json!({}))
});
});

Check warning on line 86 in crates/tabby-index/src/code/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/mod.rs#L81-L86

Added lines #L81 - L86 were not covered by tests
}
Expand All @@ -91,7 +91,7 @@ impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
warn!("No embedding service found for code indexing");
return Box::pin(stream! {
yield tokio::spawn(async move {
bail!("No embedding service found for code indexing");
(Err(anyhow!("No embedding service found for code indexing")), json!({}))
});
});

Check warning on line 96 in crates/tabby-index/src/code/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/mod.rs#L92-L96

Added lines #L92 - L96 were not covered by tests
};
Expand All @@ -110,10 +110,7 @@ impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
let embedding = embedding.clone();
let rewritten_body = format!("```{}\n{}\n```", source_code.filepath, body);
yield tokio::spawn(async move {
match build_binarize_embedding_tokens(embedding.clone(), &rewritten_body).await {
Ok(tokens) => Ok((tokens, attributes)),
Err(err) => Err(err),
}
(build_binarize_embedding_tokens(embedding.clone(), &rewritten_body).await, attributes)
});
}
};
Expand Down
26 changes: 13 additions & 13 deletions crates/tabby-index/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ pub trait IndexAttributeBuilder<T>: Send + Sync {
async fn build_chunk_attributes<'a>(
&self,
document: &'a T,
) -> BoxStream<'a, JoinHandle<Result<(Vec<String>, serde_json::Value)>>>;
) -> BoxStream<'a, JoinHandle<(Result<Vec<String>>, serde_json::Value)>>;
}

pub struct TantivyDocBuilder<T> {
Expand Down Expand Up @@ -132,18 +132,11 @@ impl<T: ToIndexId> TantivyDocBuilder<T> {
// the document, and
// a flag indicating whether the tokens were created successfully.
yield tokio::spawn(async move {
let Ok(built_chunk_attributes_result) = task.await else {
let Ok((tokens, chunk_attributes)) = task.await else {
// Join error, there is no attr, return None and false
return (None, false);
};

let (tokens, chunk_attributes) = match built_chunk_attributes_result{
Ok((tokens, chunk_attributes)) => (tokens, chunk_attributes),
Err(e) => {
warn!("Failed to build chunk attributes for document '{}': {}", id, e);
return (None, false);
}
};
let mut doc = doc! {
schema.field_id => id,
schema.field_source_id => source_id,
Expand All @@ -153,11 +146,18 @@ impl<T: ToIndexId> TantivyDocBuilder<T> {
schema.field_chunk_attributes => chunk_attributes,
};

for token in &tokens {
doc.add_text(schema.field_chunk_tokens, token);
match tokens {
Ok(tokens) => {
for token in &tokens {
doc.add_text(schema.field_chunk_tokens, token);
}
(Some(doc), true)
},
Err(e) => {
warn!("Failed to build tokens for document '{}': {}", id, e);
(Some(doc), false)
}
}

(Some(doc), !tokens.is_empty())
});
}
}
Expand Down
2 changes: 1 addition & 1 deletion crates/tabby-index/src/structured_doc/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ impl IndexAttributeBuilder<StructuredDoc> for StructuredDocBuilder {
async fn build_chunk_attributes<'a>(
&self,
document: &'a StructuredDoc,
) -> BoxStream<'a, JoinHandle<Result<(Vec<String>, serde_json::Value)>>> {
) -> BoxStream<'a, JoinHandle<(Result<Vec<String>>, serde_json::Value)>> {
let embedding = self.embedding.clone();
document.build_chunk_attributes(embedding).await
}
Expand Down
4 changes: 2 additions & 2 deletions crates/tabby-index/src/structured_doc/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pub trait BuildStructuredDoc {
async fn build_chunk_attributes(
&self,
embedding: Arc<dyn Embedding>,
) -> BoxStream<JoinHandle<Result<(Vec<String>, serde_json::Value)>>>;
) -> BoxStream<JoinHandle<(Result<Vec<String>>, serde_json::Value)>>;
}

pub enum StructuredDocFields {
Expand Down Expand Up @@ -83,7 +83,7 @@ impl BuildStructuredDoc for StructuredDoc {
async fn build_chunk_attributes(
&self,
embedding: Arc<dyn Embedding>,
) -> BoxStream<JoinHandle<Result<(Vec<String>, serde_json::Value)>>> {
) -> BoxStream<JoinHandle<(Result<Vec<String>>, serde_json::Value)>> {
match &self.fields {
StructuredDocFields::Web(doc) => doc.build_chunk_attributes(embedding).await,
StructuredDocFields::Issue(doc) => doc.build_chunk_attributes(embedding).await,
Expand Down
11 changes: 3 additions & 8 deletions crates/tabby-index/src/structured_doc/types/issue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,13 @@ impl BuildStructuredDoc for IssueDocument {
async fn build_chunk_attributes(
&self,
embedding: Arc<dyn Embedding>,
) -> BoxStream<JoinHandle<Result<(Vec<String>, serde_json::Value)>>> {
) -> BoxStream<JoinHandle<(Result<Vec<String>>, serde_json::Value)>> {
let text = format!("{}\n\n{}", self.title, self.body);
let s = stream! {
yield tokio::spawn(async move {
let tokens = match build_tokens(embedding, &text).await{
Ok(tokens) => tokens,
Err(e) => {
return Err(anyhow::anyhow!("Failed to build tokens for text: {}", e));
}
};
let tokens = build_tokens(embedding, &text).await;
let chunk_attributes = json!({});
Ok((tokens, chunk_attributes))
(tokens, chunk_attributes)
})
};

Expand Down
12 changes: 3 additions & 9 deletions crates/tabby-index/src/structured_doc/types/pull.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,13 @@ impl BuildStructuredDoc for PullDocument {
async fn build_chunk_attributes(
&self,
embedding: Arc<dyn Embedding>,
) -> BoxStream<JoinHandle<Result<(Vec<String>, serde_json::Value)>>> {
) -> BoxStream<JoinHandle<(Result<Vec<String>>, serde_json::Value)>> {

Check warning on line 46 in crates/tabby-index/src/structured_doc/types/pull.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/structured_doc/types/pull.rs#L46

Added line #L46 was not covered by tests
// currently not indexing the diff
let text = format!("{}\n\n{}", self.title, self.body);
let s = stream! {
yield tokio::spawn(async move {
let tokens = match build_tokens(embedding, &text).await{
Ok(tokens) => tokens,
Err(e) => {
return Err(anyhow::anyhow!("Failed to build tokens for text: {}", e));
}
};
let chunk_attributes = json!({});
Ok((tokens, chunk_attributes))
let tokens = build_tokens(embedding, &text).await;
(tokens, json!({}))

Check warning on line 52 in crates/tabby-index/src/structured_doc/types/pull.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/structured_doc/types/pull.rs#L52

Added line #L52 was not covered by tests
})
};

Expand Down
14 changes: 7 additions & 7 deletions crates/tabby-index/src/structured_doc/types/web.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ impl BuildStructuredDoc for WebDocument {
async fn build_chunk_attributes(
&self,
embedding: Arc<dyn Embedding>,
) -> BoxStream<JoinHandle<Result<(Vec<String>, serde_json::Value)>>> {
) -> BoxStream<JoinHandle<(Result<Vec<String>>, serde_json::Value)>> {

Check warning on line 37 in crates/tabby-index/src/structured_doc/types/web.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/structured_doc/types/web.rs#L37

Added line #L37 was not covered by tests
let chunks: Vec<_> = TextSplitter::new(2048)
.chunks(&self.body)
.map(|x| x.to_owned())
Expand All @@ -45,7 +45,7 @@ impl BuildStructuredDoc for WebDocument {
Err(e) => {
return Box::pin(stream! {
yield tokio::spawn(async move {
Err(anyhow::anyhow!("Failed to build tokens for title: {}", e))
(Err(anyhow::anyhow!("Failed to build tokens for title: {}", e)), json!({}))
});
});

Check warning on line 50 in crates/tabby-index/src/structured_doc/types/web.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/structured_doc/types/web.rs#L43-L50

Added lines #L43 - L50 were not covered by tests
}
Expand All @@ -55,19 +55,19 @@ impl BuildStructuredDoc for WebDocument {
let title_embedding_tokens = title_embedding_tokens.clone();
let embedding = embedding.clone();
yield tokio::spawn(async move {
let chunk = json!({
fields::web::CHUNK_TEXT: chunk_text.clone(),

Check warning on line 59 in crates/tabby-index/src/structured_doc/types/web.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/structured_doc/types/web.rs#L59

Added line #L59 was not covered by tests
});
let chunk_embedding_tokens = match build_tokens(embedding.clone(), &chunk_text).await {
Ok(tokens) => tokens,
Err(e) => {
return Err(anyhow::anyhow!("Failed to build tokens for chunk: {}", e));
return (Err(anyhow::anyhow!("Failed to build tokens for chunk: {}", e)), chunk);
}
};

Check warning on line 66 in crates/tabby-index/src/structured_doc/types/web.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/structured_doc/types/web.rs#L61-L66

Added lines #L61 - L66 were not covered by tests
let chunk = json!({
fields::web::CHUNK_TEXT: chunk_text,
});

// Title embedding tokens are merged with chunk embedding tokens to enhance the search results.
let tokens = merge_tokens(vec![title_embedding_tokens, chunk_embedding_tokens]);
Ok((tokens, chunk))
(Ok(tokens), chunk)

Check warning on line 70 in crates/tabby-index/src/structured_doc/types/web.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/structured_doc/types/web.rs#L70

Added line #L70 was not covered by tests
});
}
};
Expand Down

0 comments on commit 0c27f3e

Please sign in to comment.