From a52c4e6fe7cbd7e95a327926bf11c8c4998848f0 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Wed, 13 Nov 2024 20:13:51 -0800 Subject: [PATCH] refactor(index): migrate corpus::WEB to corpus::STRUCTURED_DOC (#3352) * add structured doc * [autofix.ci] apply automated fixes * chore: implement structured_doc::DocService * refactor(index): refactored `web_crawler.rs` to use updated `StructuredDoc` and `StructuredDocFields` types. run make fix * switch doc search * chore: adapt frontend * delete doc related files * run make fix * add deprecation notes for corpus::WEB * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- crates/tabby-common/src/api/doc.rs | 46 ----- crates/tabby-common/src/api/mod.rs | 2 +- crates/tabby-common/src/api/structured_doc.rs | 169 ++++++++++++++++++ crates/tabby-common/src/config.rs | 2 +- crates/tabby-common/src/index/doc.rs | 5 - crates/tabby-common/src/index/mod.rs | 8 +- .../tabby-common/src/index/structured_doc.rs | 16 ++ crates/tabby-index/src/code/mod.rs | 6 +- crates/tabby-index/src/doc/mod.rs | 99 ---------- crates/tabby-index/src/doc/public.rs | 125 ------------- crates/tabby-index/src/indexer.rs | 6 +- crates/tabby-index/src/lib.rs | 7 +- crates/tabby-index/src/structured_doc/mod.rs | 51 ++++++ .../tabby-index/src/structured_doc/public.rs | 53 ++++++ .../tabby-index/src/structured_doc/types.rs | 102 +++++++++++ .../src/structured_doc/types/issue.rs | 50 ++++++ .../src/structured_doc/types/web.rs | 67 +++++++ crates/tabby/src/serve.rs | 4 +- crates/tabby/src/services/mod.rs | 2 +- .../services/{doc => structured_doc}/mod.rs | 2 +- .../{doc => structured_doc}/serper.rs | 7 +- .../{doc => structured_doc}/tantivy.rs | 44 +---- ee/tabby-db/src/lib.rs | 3 +- ee/tabby-db/src/threads.rs | 17 +- ee/tabby-schema/graphql/schema.graphql | 11 +- ee/tabby-schema/src/dao.rs | 43 ++++- ee/tabby-schema/src/schema/thread/types.rs | 36 +++- .../components/assistant-message-section.tsx | 3 +- .../components/message-markdown/index.tsx | 4 +- ee/tabby-ui/lib/hooks/use-thread-run.ts | 30 +++- ee/tabby-ui/lib/tabby/query.ts | 15 +- ee/tabby-ui/lib/utils/index.ts | 13 +- ee/tabby-webserver/src/service/answer.rs | 58 +++--- .../src/service/answer/testutils/mod.rs | 25 +-- .../background_job/third_party_integration.rs | 8 +- .../third_party_integration/issues.rs | 53 ++---- .../src/service/background_job/web_crawler.rs | 17 +- ee/tabby-webserver/src/service/thread.rs | 2 +- ee/tabby-webserver/src/webserver.rs | 2 +- 39 files changed, 773 insertions(+), 440 deletions(-) delete mode 100644 crates/tabby-common/src/api/doc.rs create mode 100644 crates/tabby-common/src/api/structured_doc.rs delete mode 100644 crates/tabby-common/src/index/doc.rs create mode 100644 crates/tabby-common/src/index/structured_doc.rs delete mode 100644 crates/tabby-index/src/doc/mod.rs delete mode 100644 crates/tabby-index/src/doc/public.rs create mode 100644 crates/tabby-index/src/structured_doc/mod.rs create mode 100644 crates/tabby-index/src/structured_doc/public.rs create mode 100644 crates/tabby-index/src/structured_doc/types.rs create mode 100644 crates/tabby-index/src/structured_doc/types/issue.rs create mode 100644 crates/tabby-index/src/structured_doc/types/web.rs rename crates/tabby/src/services/{doc => structured_doc}/mod.rs (88%) rename crates/tabby/src/services/{doc => structured_doc}/serper.rs (93%) rename crates/tabby/src/services/{doc => structured_doc}/tantivy.rs (76%) diff --git a/crates/tabby-common/src/api/doc.rs b/crates/tabby-common/src/api/doc.rs deleted file mode 100644 index 379c0f7cba34..000000000000 --- a/crates/tabby-common/src/api/doc.rs +++ /dev/null @@ -1,46 +0,0 @@ -use async_trait::async_trait; -use thiserror::Error; - -pub struct DocSearchResponse { - pub hits: Vec, -} - -pub struct DocSearchHit { - pub score: f32, - pub doc: DocSearchDocument, -} - -#[derive(Clone)] -pub struct DocSearchDocument { - pub title: String, - pub link: String, - pub snippet: String, -} - -#[derive(Error, Debug)] -pub enum DocSearchError { - #[error("index not ready")] - NotReady, - - #[error(transparent)] - QueryParserError(#[from] tantivy::query::QueryParserError), - - #[error(transparent)] - TantivyError(#[from] tantivy::TantivyError), - - #[error(transparent)] - Other(#[from] anyhow::Error), -} - -#[async_trait] -pub trait DocSearch: Send + Sync { - /// Search docs from underlying index. - /// - /// * `source_ids`: Filter documents by source IDs, when empty, search all sources. - async fn search( - &self, - source_ids: &[String], - q: &str, - limit: usize, - ) -> Result; -} diff --git a/crates/tabby-common/src/api/mod.rs b/crates/tabby-common/src/api/mod.rs index 46581c56dd7a..885aa4450b14 100644 --- a/crates/tabby-common/src/api/mod.rs +++ b/crates/tabby-common/src/api/mod.rs @@ -1,4 +1,4 @@ pub mod code; -pub mod doc; pub mod event; pub mod server_setting; +pub mod structured_doc; diff --git a/crates/tabby-common/src/api/structured_doc.rs b/crates/tabby-common/src/api/structured_doc.rs new file mode 100644 index 000000000000..03d43f004dbc --- /dev/null +++ b/crates/tabby-common/src/api/structured_doc.rs @@ -0,0 +1,169 @@ +use async_trait::async_trait; +use tantivy::{ + schema::{self, document::CompactDocValue, Value}, + TantivyDocument, +}; +use thiserror::Error; + +use crate::index::{structured_doc, IndexSchema}; + +pub struct DocSearchResponse { + pub hits: Vec, +} + +pub struct DocSearchHit { + pub score: f32, + pub doc: DocSearchDocument, +} + +#[derive(Clone)] +pub enum DocSearchDocument { + Web(DocSearchWebDocument), + Issue(DocSearchIssueDocument), +} + +#[derive(Error, Debug)] +pub enum DocSearchError { + #[error("index not ready")] + NotReady, + + #[error(transparent)] + QueryParserError(#[from] tantivy::query::QueryParserError), + + #[error(transparent)] + TantivyError(#[from] tantivy::TantivyError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +#[async_trait] +pub trait DocSearch: Send + Sync { + /// Search docs from underlying index. + /// + /// * `source_ids`: Filter documents by source IDs, when empty, search all sources. + async fn search( + &self, + source_ids: &[String], + q: &str, + limit: usize, + ) -> Result; +} + +#[derive(Clone)] +pub struct DocSearchWebDocument { + pub title: String, + pub link: String, + pub snippet: String, +} + +#[derive(Clone)] +pub struct DocSearchIssueDocument { + pub title: String, + pub link: String, + pub body: String, + pub closed: bool, +} + +pub trait FromTantivyDocument { + fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option + where + Self: Sized; +} + +impl FromTantivyDocument for DocSearchDocument { + fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option { + let schema = IndexSchema::instance(); + let kind = get_json_text_field(doc, schema.field_attributes, structured_doc::fields::KIND); + + match kind { + "web" => { + DocSearchWebDocument::from_tantivy_document(doc, chunk).map(DocSearchDocument::Web) + } + "issue" => DocSearchIssueDocument::from_tantivy_document(doc, chunk) + .map(DocSearchDocument::Issue), + _ => None, + } + } +} + +impl FromTantivyDocument for DocSearchWebDocument { + fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option { + let schema = IndexSchema::instance(); + let title = get_json_text_field( + doc, + schema.field_attributes, + structured_doc::fields::web::TITLE, + ); + let link = get_json_text_field( + doc, + schema.field_attributes, + structured_doc::fields::web::LINK, + ); + let snippet = get_json_text_field( + chunk, + schema.field_chunk_attributes, + structured_doc::fields::web::CHUNK_TEXT, + ); + + Some(Self { + title: title.into(), + link: link.into(), + snippet: snippet.into(), + }) + } +} + +impl FromTantivyDocument for DocSearchIssueDocument { + fn from_tantivy_document(doc: &TantivyDocument, _: &TantivyDocument) -> Option { + let schema = IndexSchema::instance(); + let title = get_json_text_field( + doc, + schema.field_attributes, + structured_doc::fields::issue::TITLE, + ); + let link = get_json_text_field( + doc, + schema.field_attributes, + structured_doc::fields::issue::LINK, + ); + let body = get_json_text_field( + doc, + schema.field_attributes, + structured_doc::fields::issue::BODY, + ); + let closed = get_json_bool_field( + doc, + schema.field_attributes, + structured_doc::fields::issue::CLOSED, + ); + Some(Self { + title: title.into(), + link: link.into(), + body: body.into(), + closed, + }) + } +} + +fn get_json_field<'a>( + doc: &'a TantivyDocument, + field: schema::Field, + name: &str, +) -> CompactDocValue<'a> { + doc.get_first(field) + .unwrap() + .as_object() + .unwrap() + .find(|(k, _)| *k == name) + .unwrap() + .1 +} + +fn get_json_bool_field(doc: &TantivyDocument, field: schema::Field, name: &str) -> bool { + get_json_field(doc, field, name).as_bool().unwrap() +} + +fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str { + get_json_field(doc, field, name).as_str().unwrap() +} diff --git a/crates/tabby-common/src/config.rs b/crates/tabby-common/src/config.rs index d52c51b4c585..7f0df26a1349 100644 --- a/crates/tabby-common/src/config.rs +++ b/crates/tabby-common/src/config.rs @@ -487,7 +487,7 @@ mod tests { } assert!( - matches!(Config::validate_model_config(&config.model.completion), Err(ref e) if true) + matches!(Config::validate_model_config(&config.model.completion), Err(ref _e) if true) ); assert!(Config::validate_model_config(&config.model.chat).is_ok()); } diff --git a/crates/tabby-common/src/index/doc.rs b/crates/tabby-common/src/index/doc.rs deleted file mode 100644 index 6489c22c84df..000000000000 --- a/crates/tabby-common/src/index/doc.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub mod fields { - pub const TITLE: &str = "title"; - pub const LINK: &str = "link"; - pub const CHUNK_TEXT: &str = "chunk_text"; -} diff --git a/crates/tabby-common/src/index/mod.rs b/crates/tabby-common/src/index/mod.rs index e212d98a8366..08da621c926b 100644 --- a/crates/tabby-common/src/index/mod.rs +++ b/crates/tabby-common/src/index/mod.rs @@ -1,5 +1,5 @@ pub mod code; -pub mod doc; +pub mod structured_doc; use std::borrow::Cow; @@ -75,6 +75,12 @@ pub const FIELD_SOURCE_ID: &str = "source_id"; pub mod corpus { pub const CODE: &str = "code"; + pub const STRUCTURED_DOC: &str = "structured_doc"; + + #[deprecated( + since = "0.20.0", + note = "The web corpus is deprecated and will be removed during the version upgrade." + )] pub const WEB: &str = "web"; } diff --git a/crates/tabby-common/src/index/structured_doc.rs b/crates/tabby-common/src/index/structured_doc.rs new file mode 100644 index 000000000000..9dceabbe7506 --- /dev/null +++ b/crates/tabby-common/src/index/structured_doc.rs @@ -0,0 +1,16 @@ +pub mod fields { + pub const KIND: &str = "kind"; + + pub mod web { + pub const TITLE: &str = "title"; + pub const LINK: &str = "link"; + pub const CHUNK_TEXT: &str = "chunk_text"; + } + + pub mod issue { + pub const TITLE: &str = "title"; + pub const LINK: &str = "link"; + pub const BODY: &str = "body"; + pub const CLOSED: &str = "closed"; + } +} diff --git a/crates/tabby-index/src/code/mod.rs b/crates/tabby-index/src/code/mod.rs index 427f6e35f79e..39d43f36976b 100644 --- a/crates/tabby-index/src/code/mod.rs +++ b/crates/tabby-index/src/code/mod.rs @@ -65,10 +65,10 @@ impl IndexAttributeBuilder for CodeBuilder { json!({}) } - async fn build_chunk_attributes( + async fn build_chunk_attributes<'a>( &self, - source_code: &SourceCode, - ) -> BoxStream, serde_json::Value)>> { + source_code: &'a SourceCode, + ) -> BoxStream<'a, JoinHandle<(Vec, serde_json::Value)>> { let text = match source_code.read_content() { Ok(content) => content, Err(e) => { diff --git a/crates/tabby-index/src/doc/mod.rs b/crates/tabby-index/src/doc/mod.rs deleted file mode 100644 index 44011d5fe51a..000000000000 --- a/crates/tabby-index/src/doc/mod.rs +++ /dev/null @@ -1,99 +0,0 @@ -pub mod public; - -use std::{collections::HashSet, sync::Arc}; - -use async_stream::stream; -use async_trait::async_trait; -use futures::stream::BoxStream; -use public::WebDocument; -use serde_json::json; -use tabby_common::index::{self, corpus, doc}; -use tabby_inference::Embedding; -use tantivy::doc; -use text_splitter::TextSplitter; -use tokio::task::JoinHandle; -use tracing::warn; - -use crate::{indexer::TantivyDocBuilder, IndexAttributeBuilder}; - -const CHUNK_SIZE: usize = 2048; - -pub struct DocBuilder { - embedding: Arc, -} - -impl DocBuilder { - fn new(embedding: Arc) -> Self { - Self { embedding } - } -} - -#[async_trait] -impl IndexAttributeBuilder for DocBuilder { - async fn build_attributes(&self, document: &WebDocument) -> serde_json::Value { - json!({ - doc::fields::TITLE: document.title, - doc::fields::LINK: document.link, - }) - } - - /// This function splits the document into chunks and computes the embedding for each chunk. It then converts the embeddings - /// into binarized tokens by thresholding on zero. - async fn build_chunk_attributes( - &self, - document: &WebDocument, - ) -> BoxStream, serde_json::Value)>> { - let embedding = self.embedding.clone(); - let chunks: Vec<_> = TextSplitter::new(CHUNK_SIZE) - .chunks(&document.body) - .map(|x| x.to_owned()) - .collect(); - - let title_embedding_tokens = build_tokens(embedding.clone(), &document.title).await; - let s = stream! { - for chunk_text in chunks { - let title_embedding_tokens = title_embedding_tokens.clone(); - let embedding = embedding.clone(); - yield tokio::spawn(async move { - let chunk_embedding_tokens = build_tokens(embedding.clone(), &chunk_text).await; - let chunk = json!({ - doc::fields::CHUNK_TEXT: chunk_text, - }); - - // Title embedding tokens are merged with chunk embedding tokens to enhance the search results. - let tokens = merge_tokens(vec![title_embedding_tokens, chunk_embedding_tokens]); - (tokens, chunk) - }); - } - }; - - Box::pin(s) - } -} - -async fn build_tokens(embedding: Arc, text: &str) -> Vec { - let embedding = match embedding.embed(text).await { - Ok(embedding) => embedding, - Err(err) => { - warn!("Failed to embed chunk text: {}", err); - return vec![]; - } - }; - - let mut chunk_embedding_tokens = vec![]; - for token in index::binarize_embedding(embedding.iter()) { - chunk_embedding_tokens.push(token); - } - - chunk_embedding_tokens -} - -fn create_web_builder(embedding: Arc) -> TantivyDocBuilder { - let builder = DocBuilder::new(embedding); - TantivyDocBuilder::new(corpus::WEB, builder) -} - -pub fn merge_tokens(tokens: Vec>) -> Vec { - let tokens = tokens.into_iter().flatten().collect::>(); - tokens.into_iter().collect() -} diff --git a/crates/tabby-index/src/doc/public.rs b/crates/tabby-index/src/doc/public.rs deleted file mode 100644 index 8e19971d4617..000000000000 --- a/crates/tabby-index/src/doc/public.rs +++ /dev/null @@ -1,125 +0,0 @@ -use std::sync::Arc; - -use async_stream::stream; -use chrono::{DateTime, Utc}; -use futures::StreamExt; -use tabby_common::index::corpus; -use tabby_inference::Embedding; - -use super::create_web_builder; -use crate::{ - indexer::{IndexId, TantivyDocBuilder, ToIndexId}, - Indexer, -}; - -pub struct DocIndexer { - builder: TantivyDocBuilder, - indexer: Indexer, -} - -pub struct WebDocument { - pub id: String, - pub source_id: String, - pub link: String, - pub title: String, - pub body: String, -} - -impl ToIndexId for WebDocument { - fn to_index_id(&self) -> IndexId { - IndexId { - source_id: self.source_id.clone(), - id: self.id.clone(), - } - } -} - -impl DocIndexer { - pub fn new(embedding: Arc) -> Self { - let builder = create_web_builder(embedding); - let indexer = Indexer::new(corpus::WEB); - Self { indexer, builder } - } - - pub async fn add(&self, updated_at: DateTime, document: WebDocument) -> bool { - let is_document_empty = document.body.trim().is_empty(); - if is_document_empty || self.indexer.is_indexed_after(&document.id, updated_at) { - return false; - }; - - stream! { - let (id, s) = self.builder.build(document).await; - self.indexer.delete(&id); - - for await doc in s.buffer_unordered(std::cmp::max(std::thread::available_parallelism().unwrap().get() * 2, 32)) { - if let Ok(Some(doc)) = doc { - self.indexer.add(doc).await; - } - } - }.count().await; - true - } - - pub fn commit(self) { - self.indexer.commit(); - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use async_trait::async_trait; - use serial_test::serial; - use tabby_common::path::set_tabby_root; - use temp_testdir::TempDir; - - use super::*; - - struct FakeEmbedding; - - #[async_trait] - impl Embedding for FakeEmbedding { - async fn embed(&self, _prompt: &str) -> anyhow::Result> { - Ok(vec![0.0; 16]) - } - } - - fn create_testing_document() -> WebDocument { - WebDocument { - id: "1".to_string(), - source_id: "1".to_string(), - link: "https://example.com".to_string(), - title: "Example".to_string(), - body: "Hello, world!".to_string(), - } - } - - #[tokio::test] - #[serial(set_tabby_root)] - async fn test_add() { - let tmp_dir = TempDir::default(); - set_tabby_root(tmp_dir.to_path_buf()); - let embedding = Arc::new(FakeEmbedding); - let indexer = DocIndexer::new(embedding.clone()); - let updated_at = Utc::now(); - - // Insert a new document - assert!(indexer.add(updated_at, create_testing_document()).await); - indexer.commit(); - - // For document with the same id, and the updated_at is not newer, it should not be added. - let indexer = DocIndexer::new(embedding); - assert!(!indexer.add(updated_at, create_testing_document()).await); - - // For document with the same id, and the updated_at is newer, it should be added. - assert!( - indexer - .add( - updated_at + chrono::Duration::seconds(1), - create_testing_document() - ) - .await - ); - } -} diff --git a/crates/tabby-index/src/indexer.rs b/crates/tabby-index/src/indexer.rs index 646a917b1367..d3d33b1189ee 100644 --- a/crates/tabby-index/src/indexer.rs +++ b/crates/tabby-index/src/indexer.rs @@ -40,10 +40,10 @@ pub trait IndexAttributeBuilder: Send + Sync { async fn build_attributes(&self, document: &T) -> serde_json::Value; /// Build chunk level attributes, these attributes are stored and indexed. - async fn build_chunk_attributes( + async fn build_chunk_attributes<'a>( &self, - document: &T, - ) -> BoxStream, serde_json::Value)>>; + document: &'a T, + ) -> BoxStream<'a, JoinHandle<(Vec, serde_json::Value)>>; } pub struct TantivyDocBuilder { diff --git a/crates/tabby-index/src/lib.rs b/crates/tabby-index/src/lib.rs index 1bb14d0dc8d8..dc87050a2b04 100644 --- a/crates/tabby-index/src/lib.rs +++ b/crates/tabby-index/src/lib.rs @@ -7,7 +7,7 @@ mod tantivy_utils; use indexer::{IndexAttributeBuilder, Indexer}; -mod doc; +mod structured_doc; pub mod public { use indexer::IndexGarbageCollector; @@ -15,7 +15,10 @@ pub mod public { use super::*; pub use super::{ code::CodeIndexer, - doc::public::{DocIndexer, WebDocument}, + structured_doc::public::{ + StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocIssueFields, + StructuredDocWebFields, + }, }; pub fn run_index_garbage_collection(active_sources: Vec) -> anyhow::Result<()> { diff --git a/crates/tabby-index/src/structured_doc/mod.rs b/crates/tabby-index/src/structured_doc/mod.rs new file mode 100644 index 000000000000..fcf5757d9c08 --- /dev/null +++ b/crates/tabby-index/src/structured_doc/mod.rs @@ -0,0 +1,51 @@ +pub mod public; +mod types; + +use std::sync::Arc; + +use async_trait::async_trait; +use futures::stream::BoxStream; +use serde_json::json; +use tabby_common::index::{corpus, structured_doc}; +use tabby_inference::Embedding; +use tokio::task::JoinHandle; +use types::{BuildStructuredDoc, StructuredDoc}; + +use crate::{indexer::TantivyDocBuilder, IndexAttributeBuilder}; + +pub struct StructuredDocBuilder { + embedding: Arc, +} + +impl StructuredDocBuilder { + fn new(embedding: Arc) -> Self { + Self { embedding } + } +} + +#[async_trait] +impl IndexAttributeBuilder for StructuredDocBuilder { + async fn build_attributes(&self, document: &StructuredDoc) -> serde_json::Value { + let mut attributes = document.build_attributes().await; + attributes + .as_object_mut() + .unwrap() + .insert(structured_doc::fields::KIND.into(), json!(document.kind())); + attributes + } + + async fn build_chunk_attributes<'a>( + &self, + document: &'a StructuredDoc, + ) -> BoxStream<'a, JoinHandle<(Vec, serde_json::Value)>> { + let embedding = self.embedding.clone(); + document.build_chunk_attributes(embedding).await + } +} + +fn create_structured_doc_builder( + embedding: Arc, +) -> TantivyDocBuilder { + let builder = StructuredDocBuilder::new(embedding); + TantivyDocBuilder::new(corpus::STRUCTURED_DOC, builder) +} diff --git a/crates/tabby-index/src/structured_doc/public.rs b/crates/tabby-index/src/structured_doc/public.rs new file mode 100644 index 000000000000..56e45fadb50a --- /dev/null +++ b/crates/tabby-index/src/structured_doc/public.rs @@ -0,0 +1,53 @@ +use std::sync::Arc; + +use async_stream::stream; +use chrono::{DateTime, Utc}; +use futures::StreamExt; +use tabby_common::index::corpus; +use tabby_inference::Embedding; + +pub use super::types::{ + issue::IssueDocument as StructuredDocIssueFields, web::WebDocument as StructuredDocWebFields, + StructuredDoc, StructuredDocFields, +}; +use super::{create_structured_doc_builder, types::BuildStructuredDoc}; +use crate::{indexer::TantivyDocBuilder, Indexer}; + +pub struct StructuredDocIndexer { + builder: TantivyDocBuilder, + indexer: Indexer, +} + +impl StructuredDocIndexer { + pub fn new(embedding: Arc) -> Self { + let builder = create_structured_doc_builder(embedding); + let indexer = Indexer::new(corpus::STRUCTURED_DOC); + Self { indexer, builder } + } + + pub async fn add(&self, updated_at: DateTime, document: StructuredDoc) -> bool { + if document.should_skip() { + return false; + } + + if self.indexer.is_indexed_after(document.id(), updated_at) { + return false; + }; + + stream! { + let (id, s) = self.builder.build(document).await; + self.indexer.delete(&id); + + for await doc in s.buffer_unordered(std::cmp::max(std::thread::available_parallelism().unwrap().get() * 2, 32)) { + if let Ok(Some(doc)) = doc { + self.indexer.add(doc).await; + } + } + }.count().await; + true + } + + pub fn commit(self) { + self.indexer.commit(); + } +} diff --git a/crates/tabby-index/src/structured_doc/types.rs b/crates/tabby-index/src/structured_doc/types.rs new file mode 100644 index 000000000000..f447354878cc --- /dev/null +++ b/crates/tabby-index/src/structured_doc/types.rs @@ -0,0 +1,102 @@ +pub mod issue; +pub mod web; + +use std::sync::Arc; + +use async_trait::async_trait; +use futures::stream::BoxStream; +use tabby_inference::Embedding; +use tokio::task::JoinHandle; +use tracing::warn; + +use crate::indexer::{IndexId, ToIndexId}; + +pub struct StructuredDoc { + pub source_id: String, + pub fields: StructuredDocFields, +} + +impl StructuredDoc { + pub fn id(&self) -> &str { + match &self.fields { + StructuredDocFields::Web(web) => &web.link, + StructuredDocFields::Issue(issue) => &issue.link, + } + } + + pub fn kind(&self) -> &'static str { + match &self.fields { + StructuredDocFields::Web(_) => "web", + StructuredDocFields::Issue(_) => "issue", + } + } +} + +impl ToIndexId for StructuredDoc { + fn to_index_id(&self) -> IndexId { + IndexId { + source_id: self.source_id.clone(), + id: self.id().to_owned(), + } + } +} + +#[async_trait] +pub trait BuildStructuredDoc { + fn should_skip(&self) -> bool; + + async fn build_attributes(&self) -> serde_json::Value; + async fn build_chunk_attributes( + &self, + embedding: Arc, + ) -> BoxStream, serde_json::Value)>>; +} + +pub enum StructuredDocFields { + Web(web::WebDocument), + Issue(issue::IssueDocument), +} + +#[async_trait] +impl BuildStructuredDoc for StructuredDoc { + fn should_skip(&self) -> bool { + match &self.fields { + StructuredDocFields::Web(doc) => doc.should_skip(), + StructuredDocFields::Issue(doc) => doc.should_skip(), + } + } + + async fn build_attributes(&self) -> serde_json::Value { + match &self.fields { + StructuredDocFields::Web(doc) => doc.build_attributes().await, + StructuredDocFields::Issue(doc) => doc.build_attributes().await, + } + } + + async fn build_chunk_attributes( + &self, + embedding: Arc, + ) -> BoxStream, serde_json::Value)>> { + match &self.fields { + StructuredDocFields::Web(doc) => doc.build_chunk_attributes(embedding).await, + StructuredDocFields::Issue(doc) => doc.build_chunk_attributes(embedding).await, + } + } +} + +async fn build_tokens(embedding: Arc, text: &str) -> Vec { + let embedding = match embedding.embed(text).await { + Ok(embedding) => embedding, + Err(err) => { + warn!("Failed to embed chunk text: {}", err); + return vec![]; + } + }; + + let mut chunk_embedding_tokens = vec![]; + for token in tabby_common::index::binarize_embedding(embedding.iter()) { + chunk_embedding_tokens.push(token); + } + + chunk_embedding_tokens +} diff --git a/crates/tabby-index/src/structured_doc/types/issue.rs b/crates/tabby-index/src/structured_doc/types/issue.rs new file mode 100644 index 000000000000..d760ad17f309 --- /dev/null +++ b/crates/tabby-index/src/structured_doc/types/issue.rs @@ -0,0 +1,50 @@ +use std::sync::Arc; + +use async_stream::stream; +use async_trait::async_trait; +use futures::stream::BoxStream; +use serde_json::json; +use tabby_common::index::structured_doc::fields; +use tabby_inference::Embedding; +use tokio::task::JoinHandle; + +use super::{build_tokens, BuildStructuredDoc}; + +pub struct IssueDocument { + pub link: String, + pub title: String, + pub body: String, + pub closed: bool, +} + +#[async_trait] +impl BuildStructuredDoc for IssueDocument { + fn should_skip(&self) -> bool { + false + } + + async fn build_attributes(&self) -> serde_json::Value { + json!({ + fields::issue::LINK: self.link, + fields::issue::TITLE: self.title, + fields::issue::BODY: self.body, + fields::issue::CLOSED: self.closed, + }) + } + + async fn build_chunk_attributes( + &self, + embedding: Arc, + ) -> BoxStream, serde_json::Value)>> { + let text = format!("{}\n\n{}", self.title, self.body); + let s = stream! { + yield tokio::spawn(async move { + let tokens = build_tokens(embedding, &text).await; + let chunk_attributes = json!({}); + (tokens, chunk_attributes) + }) + }; + + Box::pin(s) + } +} diff --git a/crates/tabby-index/src/structured_doc/types/web.rs b/crates/tabby-index/src/structured_doc/types/web.rs new file mode 100644 index 000000000000..5565258a6149 --- /dev/null +++ b/crates/tabby-index/src/structured_doc/types/web.rs @@ -0,0 +1,67 @@ +use std::{collections::HashSet, sync::Arc}; + +use async_stream::stream; +use async_trait::async_trait; +use futures::stream::BoxStream; +use serde_json::json; +use tabby_common::index::structured_doc::fields; +use tabby_inference::Embedding; +use text_splitter::TextSplitter; +use tokio::task::JoinHandle; + +use super::{build_tokens, BuildStructuredDoc}; + +pub struct WebDocument { + pub link: String, + pub title: String, + pub body: String, +} + +#[async_trait] +impl BuildStructuredDoc for WebDocument { + fn should_skip(&self) -> bool { + self.body.trim().is_empty() + } + + async fn build_attributes(&self) -> serde_json::Value { + json!({ + fields::web::TITLE: self.title, + fields::web::LINK: self.link, + }) + } + + async fn build_chunk_attributes( + &self, + embedding: Arc, + ) -> BoxStream, serde_json::Value)>> { + let chunks: Vec<_> = TextSplitter::new(2048) + .chunks(&self.body) + .map(|x| x.to_owned()) + .collect(); + + let title_embedding_tokens = build_tokens(embedding.clone(), &self.title).await; + let s = stream! { + for chunk_text in chunks { + let title_embedding_tokens = title_embedding_tokens.clone(); + let embedding = embedding.clone(); + yield tokio::spawn(async move { + let chunk_embedding_tokens = build_tokens(embedding.clone(), &chunk_text).await; + let chunk = json!({ + fields::web::CHUNK_TEXT: chunk_text, + }); + + // Title embedding tokens are merged with chunk embedding tokens to enhance the search results. + let tokens = merge_tokens(vec![title_embedding_tokens, chunk_embedding_tokens]); + (tokens, chunk) + }); + } + }; + + Box::pin(s) + } +} + +pub fn merge_tokens(tokens: Vec>) -> Vec { + let tokens = tokens.into_iter().flatten().collect::>(); + tokens.into_iter().collect() +} diff --git a/crates/tabby/src/serve.rs b/crates/tabby/src/serve.rs index 6fbab8ae136c..23a6cf5fe9b5 100644 --- a/crates/tabby/src/serve.rs +++ b/crates/tabby/src/serve.rs @@ -150,7 +150,7 @@ pub async fn main(config: &Config, args: &ServeArgs) { } let index_reader_provider = Arc::new(IndexReaderProvider::default()); - let docsearch = Arc::new(services::doc::create( + let docsearch = Arc::new(services::structured_doc::create( embedding.clone(), index_reader_provider.clone(), )); @@ -195,7 +195,7 @@ pub async fn main(config: &Config, args: &ServeArgs) { chat, completion_stream, docsearch, - |x| Box::new(services::doc::create_serper(x)), + |x| Box::new(services::structured_doc::create_serper(x)), ) .await; api = new_api; diff --git a/crates/tabby/src/services/mod.rs b/crates/tabby/src/services/mod.rs index 0c7d5892b252..2d572751a124 100644 --- a/crates/tabby/src/services/mod.rs +++ b/crates/tabby/src/services/mod.rs @@ -1,8 +1,8 @@ pub mod code; pub mod completion; -pub mod doc; pub mod embedding; pub mod event; pub mod health; pub mod model; +pub mod structured_doc; pub mod tantivy; diff --git a/crates/tabby/src/services/doc/mod.rs b/crates/tabby/src/services/structured_doc/mod.rs similarity index 88% rename from crates/tabby/src/services/doc/mod.rs rename to crates/tabby/src/services/structured_doc/mod.rs index 128542af5150..c13d64ee40cc 100644 --- a/crates/tabby/src/services/doc/mod.rs +++ b/crates/tabby/src/services/structured_doc/mod.rs @@ -3,7 +3,7 @@ mod tantivy; use std::sync::Arc; -use tabby_common::api::doc::DocSearch; +use tabby_common::api::structured_doc::DocSearch; use tabby_inference::Embedding; use super::tantivy::IndexReaderProvider; diff --git a/crates/tabby/src/services/doc/serper.rs b/crates/tabby/src/services/structured_doc/serper.rs similarity index 93% rename from crates/tabby/src/services/doc/serper.rs rename to crates/tabby/src/services/structured_doc/serper.rs index 3cdd59185b11..515ad3b10609 100644 --- a/crates/tabby/src/services/doc/serper.rs +++ b/crates/tabby/src/services/structured_doc/serper.rs @@ -1,7 +1,8 @@ use async_trait::async_trait; use serde::{Deserialize, Serialize}; -use tabby_common::api::doc::{ +use tabby_common::api::structured_doc::{ DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse, + DocSearchWebDocument, }; use tracing::warn; @@ -77,11 +78,11 @@ impl DocSearch for SerperService { .into_iter() .map(|hit| DocSearchHit { score: 0.0, - doc: DocSearchDocument { + doc: DocSearchDocument::Web(DocSearchWebDocument { title: hit.title, link: hit.link, snippet: hit.snippet, - }, + }), }) .collect(); diff --git a/crates/tabby/src/services/doc/tantivy.rs b/crates/tabby/src/services/structured_doc/tantivy.rs similarity index 76% rename from crates/tabby/src/services/doc/tantivy.rs rename to crates/tabby/src/services/structured_doc/tantivy.rs index cecc98841c03..762104efe284 100644 --- a/crates/tabby/src/services/doc/tantivy.rs +++ b/crates/tabby/src/services/structured_doc/tantivy.rs @@ -3,8 +3,11 @@ use std::{collections::HashSet, sync::Arc}; use anyhow::Result; use async_trait::async_trait; use tabby_common::{ - api::doc::{DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse}, - index::{self, corpus, doc}, + api::structured_doc::{ + DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse, + FromTantivyDocument, + }, + index::{self, corpus}, }; use tabby_inference::Embedding; use tantivy::{ @@ -40,7 +43,7 @@ impl DocSearchImpl { let embedding = self.embedding.embed(q).await?; let embedding_tokens_query = index::embedding_tokens_query(embedding.len(), embedding.iter()); - let corpus_query = schema.corpus_query(corpus::WEB); + let corpus_query = schema.corpus_query(corpus::STRUCTURED_DOC); let mut query_clauses: Vec<(Occur, Box)> = vec![ ( @@ -94,13 +97,7 @@ impl DocSearchImpl { score, chunk, }| { - let chunk_text = get_json_text_field( - chunk, - schema.field_chunk_attributes, - doc::fields::CHUNK_TEXT, - ); - - let doc_query = schema.doc_query(corpus::WEB, doc_id); + let doc_query = schema.doc_query(corpus::STRUCTURED_DOC, doc_id); let top_docs = match searcher.search(&doc_query, &TopDocs::with_limit(1)) { Err(err) => { warn!("Failed to search doc `{}`: `{}`", doc_id, err); @@ -110,19 +107,8 @@ impl DocSearchImpl { }; let (_, doc_address) = top_docs.first()?; let doc: TantivyDocument = searcher.doc(*doc_address).ok()?; - let title = - get_json_text_field(&doc, schema.field_attributes, doc::fields::TITLE); - let link = - get_json_text_field(&doc, schema.field_attributes, doc::fields::LINK); - - Some(DocSearchHit { - doc: DocSearchDocument { - title: title.to_string(), - link: link.to_string(), - snippet: chunk_text.to_string(), - }, - score: *score, - }) + DocSearchDocument::from_tantivy_document(&doc, chunk) + .map(|doc| DocSearchHit { score: *score, doc }) }, ) .filter(|x| x.score >= EMBEDDING_SCORE_THRESHOLD) @@ -137,18 +123,6 @@ fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str { doc.get_first(field).unwrap().as_str().unwrap() } -fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str { - doc.get_first(field) - .unwrap() - .as_object() - .unwrap() - .find(|(k, _)| *k == name) - .unwrap() - .1 - .as_str() - .unwrap() -} - struct ScoredChunk { doc_id: String, score: f32, diff --git a/ee/tabby-db/src/lib.rs b/ee/tabby-db/src/lib.rs index dd9c724a357d..a288d678185b 100644 --- a/ee/tabby-db/src/lib.rs +++ b/ee/tabby-db/src/lib.rs @@ -15,7 +15,8 @@ pub use server_setting::ServerSettingDAO; use sqlx::{query, query_scalar, sqlite::SqliteQueryResult, Pool, Sqlite, SqlitePool}; pub use threads::{ ThreadDAO, ThreadMessageAttachmentClientCode, ThreadMessageAttachmentCode, - ThreadMessageAttachmentDoc, ThreadMessageDAO, + ThreadMessageAttachmentDoc, ThreadMessageAttachmentIssueDoc, ThreadMessageAttachmentWebDoc, + ThreadMessageDAO, }; use tokio::sync::Mutex; use user_completions::UserCompletionDailyStatsDAO; diff --git a/ee/tabby-db/src/threads.rs b/ee/tabby-db/src/threads.rs index ede3a48290c1..01d2e0292853 100644 --- a/ee/tabby-db/src/threads.rs +++ b/ee/tabby-db/src/threads.rs @@ -32,12 +32,27 @@ pub struct ThreadMessageDAO { } #[derive(Serialize, Deserialize)] -pub struct ThreadMessageAttachmentDoc { +#[serde(untagged)] // Mark the serde serialization format as untagged for backward compatibility: https://serde.rs/enum-representations.html#untagged +pub enum ThreadMessageAttachmentDoc { + Web(ThreadMessageAttachmentWebDoc), + Issue(ThreadMessageAttachmentIssueDoc), +} + +#[derive(Serialize, Deserialize)] +pub struct ThreadMessageAttachmentWebDoc { pub title: String, pub link: String, pub content: String, } +#[derive(Serialize, Deserialize)] +pub struct ThreadMessageAttachmentIssueDoc { + pub title: String, + pub link: String, + pub body: String, + pub closed: bool, +} + #[derive(Serialize, Deserialize)] pub struct ThreadMessageAttachmentCode { pub git_url: String, diff --git a/ee/tabby-schema/graphql/schema.graphql b/ee/tabby-schema/graphql/schema.graphql index b029d731d089..085bac638402 100644 --- a/ee/tabby-schema/graphql/schema.graphql +++ b/ee/tabby-schema/graphql/schema.graphql @@ -509,7 +509,14 @@ type MessageAttachmentCodeScores { embedding: Float! } -type MessageAttachmentDoc { +type MessageAttachmentIssueDoc { + title: String! + link: String! + body: String! + closed: Boolean! +} + +type MessageAttachmentWebDoc { title: String! link: String! content: String! @@ -893,6 +900,8 @@ type WebContextSource implements ContextSourceId & ContextSource { sourceName: String! } +union MessageAttachmentDoc = MessageAttachmentWebDoc | MessageAttachmentIssueDoc + """ Schema of thread run stream. diff --git a/ee/tabby-schema/src/dao.rs b/ee/tabby-schema/src/dao.rs index 2638f6988dcb..dc0da9b3bc3b 100644 --- a/ee/tabby-schema/src/dao.rs +++ b/ee/tabby-schema/src/dao.rs @@ -4,7 +4,8 @@ use lazy_static::lazy_static; use tabby_db::{ EmailSettingDAO, IntegrationDAO, InvitationDAO, JobRunDAO, OAuthCredentialDAO, ServerSettingDAO, ThreadDAO, ThreadMessageAttachmentClientCode, ThreadMessageAttachmentCode, - ThreadMessageAttachmentDoc, ThreadMessageDAO, UserEventDAO, + ThreadMessageAttachmentDoc, ThreadMessageAttachmentIssueDoc, ThreadMessageAttachmentWebDoc, + ThreadMessageDAO, UserEventDAO, }; use crate::{ @@ -229,20 +230,44 @@ impl From<&thread::MessageAttachmentCodeInput> for ThreadMessageAttachmentClient impl From for thread::MessageAttachmentDoc { fn from(value: ThreadMessageAttachmentDoc) -> Self { - Self { - title: value.title, - link: value.link, - content: value.content, + match value { + ThreadMessageAttachmentDoc::Web(val) => { + thread::MessageAttachmentDoc::Web(thread::MessageAttachmentWebDoc { + title: val.title, + link: val.link, + content: val.content, + }) + } + ThreadMessageAttachmentDoc::Issue(val) => { + thread::MessageAttachmentDoc::Issue(thread::MessageAttachmentIssueDoc { + title: val.title, + link: val.link, + body: val.body, + closed: val.closed, + }) + } } } } impl From<&thread::MessageAttachmentDoc> for ThreadMessageAttachmentDoc { fn from(val: &thread::MessageAttachmentDoc) -> Self { - ThreadMessageAttachmentDoc { - title: val.title.clone(), - link: val.link.clone(), - content: val.content.clone(), + match val { + thread::MessageAttachmentDoc::Web(val) => { + ThreadMessageAttachmentDoc::Web(ThreadMessageAttachmentWebDoc { + title: val.title.clone(), + link: val.link.clone(), + content: val.content.clone(), + }) + } + thread::MessageAttachmentDoc::Issue(val) => { + ThreadMessageAttachmentDoc::Issue(ThreadMessageAttachmentIssueDoc { + title: val.title.clone(), + link: val.link.clone(), + body: val.body.clone(), + closed: val.closed, + }) + } } } } diff --git a/ee/tabby-schema/src/schema/thread/types.rs b/ee/tabby-schema/src/schema/thread/types.rs index afcdb907d886..f89cc850575f 100644 --- a/ee/tabby-schema/src/schema/thread/types.rs +++ b/ee/tabby-schema/src/schema/thread/types.rs @@ -3,7 +3,7 @@ use juniper::{GraphQLEnum, GraphQLInputObject, GraphQLObject, GraphQLUnion, ID}; use serde::Serialize; use tabby_common::api::{ code::{CodeSearchDocument, CodeSearchHit, CodeSearchScores}, - doc::{DocSearchDocument, DocSearchHit}, + structured_doc::{DocSearchDocument, DocSearchHit}, }; use validator::Validate; @@ -121,19 +121,43 @@ impl From for MessageCodeSearchHit { } } +#[derive(GraphQLUnion, Clone)] +pub enum MessageAttachmentDoc { + Web(MessageAttachmentWebDoc), + Issue(MessageAttachmentIssueDoc), +} + #[derive(GraphQLObject, Clone)] -pub struct MessageAttachmentDoc { +pub struct MessageAttachmentWebDoc { pub title: String, pub link: String, pub content: String, } +#[derive(GraphQLObject, Clone)] +pub struct MessageAttachmentIssueDoc { + pub title: String, + pub link: String, + pub body: String, + pub closed: bool, +} + impl From for MessageAttachmentDoc { fn from(doc: DocSearchDocument) -> Self { - Self { - title: doc.title, - link: doc.link, - content: doc.snippet, + match doc { + DocSearchDocument::Web(web) => MessageAttachmentDoc::Web(MessageAttachmentWebDoc { + title: web.title, + link: web.link, + content: web.snippet, + }), + DocSearchDocument::Issue(issue) => { + MessageAttachmentDoc::Issue(MessageAttachmentIssueDoc { + title: issue.title, + link: issue.link, + body: issue.body, + closed: issue.closed, + }) + } } } } diff --git a/ee/tabby-ui/app/search/components/assistant-message-section.tsx b/ee/tabby-ui/app/search/components/assistant-message-section.tsx index 44f5734ea53e..50e3670b3636 100644 --- a/ee/tabby-ui/app/search/components/assistant-message-section.tsx +++ b/ee/tabby-ui/app/search/components/assistant-message-section.tsx @@ -22,6 +22,7 @@ import { import { cn, formatLineHashForCodeBrowser, + getContent, getRangeFromAttachmentCode, getRangeTextFromAttachmentCode } from '@/lib/utils' @@ -491,7 +492,7 @@ function SourceCard({ } )} > - {normalizedText(source.content)} + {normalizedText(getContent(source))}

diff --git a/ee/tabby-ui/components/message-markdown/index.tsx b/ee/tabby-ui/components/message-markdown/index.tsx index cb41ee5aa211..48cfa20be4a9 100644 --- a/ee/tabby-ui/components/message-markdown/index.tsx +++ b/ee/tabby-ui/components/message-markdown/index.tsx @@ -15,7 +15,7 @@ import { MessageAttachmentDoc } from '@/lib/gql/generates/graphql' import { AttachmentCodeItem, AttachmentDocItem } from '@/lib/types' -import { cn } from '@/lib/utils' +import { cn, getContent } from '@/lib/utils' import { CodeBlock, CodeBlockProps } from '@/components/ui/codeblock' import { HoverCard, @@ -398,7 +398,7 @@ function RelevantDocumentBadge({ {relevantDocument.title}

- {normalizedText(relevantDocument.content)} + {normalizedText(getContent(relevantDocument))}

diff --git a/ee/tabby-ui/lib/hooks/use-thread-run.ts b/ee/tabby-ui/lib/hooks/use-thread-run.ts index eb97f860b70c..e8daa2f7ad06 100644 --- a/ee/tabby-ui/lib/hooks/use-thread-run.ts +++ b/ee/tabby-ui/lib/hooks/use-thread-run.ts @@ -54,9 +54,18 @@ const CreateThreadAndRunSubscription = graphql(/* GraphQL */ ` ... on ThreadAssistantMessageAttachmentsDoc { hits { doc { - title - link - content + __typename + ... on MessageAttachmentWebDoc { + title + link + content + } + ... on MessageAttachmentIssueDoc { + title + link + body + closed + } } score } @@ -106,9 +115,18 @@ const CreateThreadRunSubscription = graphql(/* GraphQL */ ` ... on ThreadAssistantMessageAttachmentsDoc { hits { doc { - title - link - content + __typename + ... on MessageAttachmentWebDoc { + title + link + content + } + ... on MessageAttachmentIssueDoc { + title + link + body + closed + } } score } diff --git a/ee/tabby-ui/lib/tabby/query.ts b/ee/tabby-ui/lib/tabby/query.ts index a39af1dda9a2..2821a05235ee 100644 --- a/ee/tabby-ui/lib/tabby/query.ts +++ b/ee/tabby-ui/lib/tabby/query.ts @@ -413,9 +413,18 @@ export const listThreadMessages = graphql(/* GraphQL */ ` startLine } doc { - title - link - content + __typename + ... on MessageAttachmentWebDoc { + title + link + content + } + ... on MessageAttachmentIssueDoc { + title + link + body + closed + } } } } diff --git a/ee/tabby-ui/lib/utils/index.ts b/ee/tabby-ui/lib/utils/index.ts index 8abaeb9c9303..c908130beada 100644 --- a/ee/tabby-ui/lib/utils/index.ts +++ b/ee/tabby-ui/lib/utils/index.ts @@ -3,7 +3,7 @@ import { compact, isNil } from 'lodash-es' import { customAlphabet } from 'nanoid' import { twMerge } from 'tailwind-merge' -import { AttachmentCodeItem } from '@/lib/types' +import { AttachmentCodeItem, AttachmentDocItem } from '@/lib/types' export * from './chat' @@ -121,3 +121,14 @@ export function getRangeTextFromAttachmentCode(code: AttachmentCodeItem) { const { startLine, endLine } = getRangeFromAttachmentCode(code) return formatLineHashForCodeBrowser({ start: startLine, end: endLine }) } + +export function getContent(item: AttachmentDocItem) { + switch (item.__typename) { + case 'MessageAttachmentWebDoc': + return item.content + case 'MessageAttachmentIssueDoc': + return item.body + } + + return '' +} diff --git a/ee/tabby-webserver/src/service/answer.rs b/ee/tabby-webserver/src/service/answer.rs index 8f0944f9d638..ec2343401cfe 100644 --- a/ee/tabby-webserver/src/service/answer.rs +++ b/ee/tabby-webserver/src/service/answer.rs @@ -22,7 +22,7 @@ use tabby_common::{ CodeSearch, CodeSearchError, CodeSearchHit, CodeSearchParams, CodeSearchQuery, CodeSearchScores, }, - doc::{DocSearch, DocSearchError, DocSearchHit}, + structured_doc::{DocSearch, DocSearchError, DocSearchHit}, }, config::AnswerConfig, }; @@ -33,9 +33,9 @@ use tabby_schema::{ repository::{Repository, RepositoryService}, thread::{ self, CodeQueryInput, CodeSearchParamsOverrideInput, DocQueryInput, MessageAttachment, - ThreadAssistantMessageAttachmentsCode, ThreadAssistantMessageAttachmentsDoc, - ThreadAssistantMessageContentDelta, ThreadRelevantQuestions, ThreadRunItem, - ThreadRunOptionsInput, + MessageAttachmentDoc, ThreadAssistantMessageAttachmentsCode, + ThreadAssistantMessageAttachmentsDoc, ThreadAssistantMessageContentDelta, + ThreadRelevantQuestions, ThreadRunItem, ThreadRunOptionsInput, }, }; use tracing::{debug, error, warn}; @@ -317,7 +317,7 @@ impl AnswerService { attachment .doc .iter() - .map(|doc| format!("```\n{}\n```", doc.content)), + .map(|doc| format!("```\n{}\n```", get_content(doc))), ) .collect(); @@ -465,7 +465,7 @@ fn build_user_prompt( let snippets: Vec = assistant_attachment .doc .iter() - .map(|doc| format!("```\n{}\n```", doc.content)) + .map(|doc| format!("```\n{}\n```", get_content(doc))) .chain( user_attachment_input .map(|x| &x.code) @@ -599,6 +599,13 @@ fn count_lines(path: &Path) -> std::io::Result { Ok(count) } +fn get_content(doc: &MessageAttachmentDoc) -> &str { + match doc { + MessageAttachmentDoc::Web(web) => &web.content, + MessageAttachmentDoc::Issue(issue) => &issue.body, + } +} + #[cfg(test)] pub mod testutils; @@ -613,7 +620,7 @@ mod tests { code::{ CodeSearch, CodeSearchDocument, CodeSearchHit, CodeSearchParams, CodeSearchScores, }, - doc::DocSearch, + structured_doc::{DocSearch, DocSearchDocument}, }, config::AnswerConfig, }; @@ -700,15 +707,22 @@ mod tests { } } + fn get_title(doc: &DocSearchDocument) -> &str { + match doc { + DocSearchDocument::Web(web_doc) => &web_doc.title, + DocSearchDocument::Issue(issue_doc) => &issue_doc.title, + } + } + #[test] fn test_build_user_prompt() { let user_input = "What is the purpose of this code?"; let assistant_attachment = tabby_schema::thread::MessageAttachment { - doc: vec![tabby_schema::thread::MessageAttachmentDoc { + doc: vec![tabby_schema::thread::MessageAttachmentDoc::Web(tabby_schema::thread::MessageAttachmentWebDoc { title: "Documentation".to_owned(), content: "This code implements a basic web server.".to_owned(), link: "https://example.com/docs".to_owned(), - }], + })], code: vec![tabby_schema::thread::MessageAttachmentCode { git_url: "https://github.com/".to_owned(), filepath: "server.py".to_owned(), @@ -735,11 +749,13 @@ mod tests { fn test_convert_messages_to_chat_completion_request() { // Fake assistant attachment let attachment = tabby_schema::thread::MessageAttachment { - doc: vec![tabby_schema::thread::MessageAttachmentDoc { - title: "1. Example Document".to_owned(), - content: "This is an example".to_owned(), - link: "https://example.com".to_owned(), - }], + doc: vec![tabby_schema::thread::MessageAttachmentDoc::Web( + tabby_schema::thread::MessageAttachmentWebDoc { + title: "1. Example Document".to_owned(), + content: "This is an example".to_owned(), + link: "https://example.com".to_owned(), + }, + )], code: vec![tabby_schema::thread::MessageAttachmentCode { git_url: "https://github.com".to_owned(), filepath: "server.py".to_owned(), @@ -919,11 +935,13 @@ mod tests { ); let attachment = MessageAttachment { - doc: vec![tabby_schema::thread::MessageAttachmentDoc { - title: "1. Example Document".to_owned(), - content: "This is an example".to_owned(), - link: "https://example.com".to_owned(), - }], + doc: vec![tabby_schema::thread::MessageAttachmentDoc::Web( + tabby_schema::thread::MessageAttachmentWebDoc { + title: "1. Example Document".to_owned(), + content: "This is an example".to_owned(), + link: "https://example.com".to_owned(), + }, + )], code: vec![tabby_schema::thread::MessageAttachmentCode { git_url: "https://github.com".to_owned(), filepath: "server.py".to_owned(), @@ -987,7 +1005,7 @@ mod tests { assert_eq!(hits.len(), 10, "Expected 10 hits from the doc search"); assert!( - hits.iter().any(|hit| hit.doc.title == "Document 1"), + hits.iter().any(|hit| get_title(&hit.doc) == "Document 1"), "Expected to find a hit with title 'Document 1'" ); } diff --git a/ee/tabby-webserver/src/service/answer/testutils/mod.rs b/ee/tabby-webserver/src/service/answer/testutils/mod.rs index 287838c62691..6d784ce4f141 100644 --- a/ee/tabby-webserver/src/service/answer/testutils/mod.rs +++ b/ee/tabby-webserver/src/service/answer/testutils/mod.rs @@ -12,7 +12,10 @@ use axum::async_trait; use juniper::ID; use tabby_common::api::{ code::{CodeSearch, CodeSearchError, CodeSearchParams, CodeSearchQuery, CodeSearchResponse}, - doc::{DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse}, + structured_doc::{ + DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse, + DocSearchWebDocument, + }, }; use tabby_db::DbConn; use tabby_inference::ChatCompletionStream; @@ -159,43 +162,43 @@ impl DocSearch for FakeDocSearch { let hits = vec![ DocSearchHit { score: 1.0, - doc: DocSearchDocument { + doc: DocSearchDocument::Web(DocSearchWebDocument { title: "Document 1".to_string(), link: "https://example.com/doc1".to_string(), snippet: "Snippet for Document 1".to_string(), - }, + }), }, DocSearchHit { score: 0.9, - doc: DocSearchDocument { + doc: DocSearchDocument::Web(DocSearchWebDocument { title: "Document 2".to_string(), link: "https://example.com/doc2".to_string(), snippet: "Snippet for Document 2".to_string(), - }, + }), }, DocSearchHit { score: 0.8, - doc: DocSearchDocument { + doc: DocSearchDocument::Web(DocSearchWebDocument { title: "Document 3".to_string(), link: "https://example.com/doc3".to_string(), snippet: "Snippet for Document 3".to_string(), - }, + }), }, DocSearchHit { score: 0.7, - doc: DocSearchDocument { + doc: DocSearchDocument::Web(DocSearchWebDocument { title: "Document 4".to_string(), link: "https://example.com/doc4".to_string(), snippet: "Snippet for Document 4".to_string(), - }, + }), }, DocSearchHit { score: 0.6, - doc: DocSearchDocument { + doc: DocSearchDocument::Web(DocSearchWebDocument { title: "Document 5".to_string(), link: "https://example.com/doc5".to_string(), snippet: "Snippet for Document 5".to_string(), - }, + }), }, ]; Ok(DocSearchResponse { hits }) diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs index ec65ae6a3145..76c670a5dbce 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs @@ -7,7 +7,7 @@ use issues::{list_github_issues, list_gitlab_issues}; use juniper::ID; use serde::{Deserialize, Serialize}; use tabby_common::config::CodeRepository; -use tabby_index::public::{CodeIndexer, DocIndexer, WebDocument}; +use tabby_index::public::{CodeIndexer, StructuredDoc, StructuredDocIndexer}; use tabby_inference::Embedding; use tabby_schema::{ integration::{Integration, IntegrationKind, IntegrationService}, @@ -112,7 +112,7 @@ impl SchedulerGithubGitlabJob { "Indexing documents for repository {}", repository.display_name ); - let index = DocIndexer::new(embedding); + let index = StructuredDocIndexer::new(embedding); let s = match fetch_all_issues(&integration, &repository).await { Ok(s) => s, Err(e) => { @@ -169,8 +169,8 @@ impl SchedulerGithubGitlabJob { async fn fetch_all_issues( integration: &Integration, repository: &ProvidedRepository, -) -> tabby_schema::Result, WebDocument)>> { - let s: BoxStream<(DateTime, WebDocument)> = match &integration.kind { +) -> tabby_schema::Result, StructuredDoc)>> { + let s: BoxStream<(DateTime, StructuredDoc)> = match &integration.kind { IntegrationKind::Github | IntegrationKind::GithubSelfHosted => list_github_issues( &repository.source_id(), integration.api_base(), diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs index 949f381524ad..84bf63c7203f 100644 --- a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs +++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs @@ -2,10 +2,10 @@ use anyhow::{anyhow, Result}; use async_stream::stream; use chrono::{DateTime, Utc}; use futures::Stream; -use gitlab::api::{issues::ProjectIssues, projects::merge_requests::MergeRequests, AsyncQuery}; +use gitlab::api::{issues::ProjectIssues, AsyncQuery}; use octocrab::Octocrab; use serde::Deserialize; -use tabby_index::public::WebDocument; +use tabby_index::public::{StructuredDoc, StructuredDocFields, StructuredDocIssueFields}; use crate::service::create_gitlab_client; @@ -14,7 +14,7 @@ pub async fn list_github_issues( api_base: &str, full_name: &str, access_token: &str, -) -> Result, WebDocument)>> { +) -> Result, StructuredDoc)>> { let octocrab = Octocrab::builder() .personal_token(access_token.to_string()) .base_uri(api_base)? @@ -47,12 +47,14 @@ pub async fn list_github_issues( let pages = response.number_of_pages().unwrap_or_default(); for issue in response.items { - let doc = WebDocument { + let doc = StructuredDoc { source_id: source_id.to_string(), - id: issue.html_url.to_string(), - link: issue.html_url.to_string(), - title: issue.title, - body: issue.body.unwrap_or_default(), + fields: StructuredDocFields::Issue(StructuredDocIssueFields { + link: issue.html_url.to_string(), + title: issue.title, + body: issue.body.unwrap_or_default(), + closed: issue.state == octocrab::models::IssueState::Closed, + }) }; yield (issue.updated_at, doc); } @@ -73,6 +75,7 @@ struct GitlabIssue { description: Option, web_url: String, updated_at: DateTime, + state: String, } pub async fn list_gitlab_issues( @@ -80,7 +83,7 @@ pub async fn list_gitlab_issues( api_base: &str, full_name: &str, access_token: &str, -) -> Result, WebDocument)>> { +) -> Result, StructuredDoc)>> { let gitlab = create_gitlab_client(api_base, access_token).await?; let source_id = source_id.to_owned(); @@ -101,40 +104,16 @@ pub async fn list_gitlab_issues( }; for issue in issues { - let doc = WebDocument { + let doc = StructuredDoc { source_id: source_id.to_owned(), - id: issue.web_url.clone(), + fields: StructuredDocFields::Issue(StructuredDocIssueFields { link: issue.web_url, title: issue.title, body: issue.description.unwrap_or_default(), - }; + closed: issue.state == "closed", + })}; yield (issue.updated_at, doc); } - - let merge_requests: Vec = match gitlab::api::paged( - MergeRequests::builder().project(&full_name).build().expect("Failed to build request"), - gitlab::api::Pagination::All, - ) - .query_async(&gitlab) - .await { - Ok(x) => x, - Err(e) => { - logkit::error!("Failed to fetch merge requests: {}", e); - return; - } - }; - - for merge_request in merge_requests { - let doc = WebDocument { - source_id: source_id.to_owned(), - id: merge_request.web_url.clone(), - link: merge_request.web_url, - title: merge_request.title, - body: merge_request.description.unwrap_or_default(), - }; - yield (merge_request.updated_at, doc); - } - }; Ok(s) diff --git a/ee/tabby-webserver/src/service/background_job/web_crawler.rs b/ee/tabby-webserver/src/service/background_job/web_crawler.rs index 3bd3a626be8d..8d4450310309 100644 --- a/ee/tabby-webserver/src/service/background_job/web_crawler.rs +++ b/ee/tabby-webserver/src/service/background_job/web_crawler.rs @@ -4,7 +4,9 @@ use chrono::Utc; use futures::StreamExt; use serde::{Deserialize, Serialize}; use tabby_crawler::crawl_pipeline; -use tabby_index::public::{DocIndexer, WebDocument}; +use tabby_index::public::{ + StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocWebFields, +}; use tabby_inference::Embedding; use super::helper::Job; @@ -35,18 +37,19 @@ impl WebCrawlerJob { logkit::info!("Starting doc index pipeline for {}", self.url); let embedding = embedding.clone(); let mut num_docs = 0; - let indexer = DocIndexer::new(embedding.clone()); + let indexer = StructuredDocIndexer::new(embedding.clone()); let url_prefix = self.url_prefix.as_ref().unwrap_or(&self.url); let mut pipeline = Box::pin(crawl_pipeline(&self.url, url_prefix).await?); while let Some(doc) = pipeline.next().await { logkit::info!("Fetching {}", doc.url); - let source_doc = WebDocument { + let source_doc = StructuredDoc { source_id: self.source_id.clone(), - id: doc.url.clone(), - title: doc.metadata.title.unwrap_or_default(), - link: doc.url, - body: doc.markdown, + fields: StructuredDocFields::Web(StructuredDocWebFields { + title: doc.metadata.title.unwrap_or_default(), + link: doc.url, + body: doc.markdown, + }), }; num_docs += 1; diff --git a/ee/tabby-webserver/src/service/thread.rs b/ee/tabby-webserver/src/service/thread.rs index 27fcd53c6acf..55f762842c5a 100644 --- a/ee/tabby-webserver/src/service/thread.rs +++ b/ee/tabby-webserver/src/service/thread.rs @@ -289,7 +289,7 @@ mod tests { use tabby_common::{ api::{ code::{CodeSearch, CodeSearchParams}, - doc::DocSearch, + structured_doc::DocSearch, }, config::AnswerConfig, }; diff --git a/ee/tabby-webserver/src/webserver.rs b/ee/tabby-webserver/src/webserver.rs index b5b5eeec77a3..a7e360955908 100644 --- a/ee/tabby-webserver/src/webserver.rs +++ b/ee/tabby-webserver/src/webserver.rs @@ -4,8 +4,8 @@ use axum::Router; use tabby_common::{ api::{ code::CodeSearch, - doc::DocSearch, event::{ComposedLogger, EventLogger}, + structured_doc::DocSearch, }, config::Config, };