From a52c4e6fe7cbd7e95a327926bf11c8c4998848f0 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 13 Nov 2024 20:13:51 -0800
Subject: [PATCH] refactor(index): migrate corpus::WEB to
corpus::STRUCTURED_DOC (#3352)
* add structured doc
* [autofix.ci] apply automated fixes
* chore: implement structured_doc::DocService
* refactor(index): refactored `web_crawler.rs` to use updated `StructuredDoc` and `StructuredDocFields` types.
run make fix
* switch doc search
* chore: adapt frontend
* delete doc related files
* run make fix
* add deprecation notes for corpus::WEB
* [autofix.ci] apply automated fixes
* [autofix.ci] apply automated fixes (attempt 2/3)
---------
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
---
crates/tabby-common/src/api/doc.rs | 46 -----
crates/tabby-common/src/api/mod.rs | 2 +-
crates/tabby-common/src/api/structured_doc.rs | 169 ++++++++++++++++++
crates/tabby-common/src/config.rs | 2 +-
crates/tabby-common/src/index/doc.rs | 5 -
crates/tabby-common/src/index/mod.rs | 8 +-
.../tabby-common/src/index/structured_doc.rs | 16 ++
crates/tabby-index/src/code/mod.rs | 6 +-
crates/tabby-index/src/doc/mod.rs | 99 ----------
crates/tabby-index/src/doc/public.rs | 125 -------------
crates/tabby-index/src/indexer.rs | 6 +-
crates/tabby-index/src/lib.rs | 7 +-
crates/tabby-index/src/structured_doc/mod.rs | 51 ++++++
.../tabby-index/src/structured_doc/public.rs | 53 ++++++
.../tabby-index/src/structured_doc/types.rs | 102 +++++++++++
.../src/structured_doc/types/issue.rs | 50 ++++++
.../src/structured_doc/types/web.rs | 67 +++++++
crates/tabby/src/serve.rs | 4 +-
crates/tabby/src/services/mod.rs | 2 +-
.../services/{doc => structured_doc}/mod.rs | 2 +-
.../{doc => structured_doc}/serper.rs | 7 +-
.../{doc => structured_doc}/tantivy.rs | 44 +----
ee/tabby-db/src/lib.rs | 3 +-
ee/tabby-db/src/threads.rs | 17 +-
ee/tabby-schema/graphql/schema.graphql | 11 +-
ee/tabby-schema/src/dao.rs | 43 ++++-
ee/tabby-schema/src/schema/thread/types.rs | 36 +++-
.../components/assistant-message-section.tsx | 3 +-
.../components/message-markdown/index.tsx | 4 +-
ee/tabby-ui/lib/hooks/use-thread-run.ts | 30 +++-
ee/tabby-ui/lib/tabby/query.ts | 15 +-
ee/tabby-ui/lib/utils/index.ts | 13 +-
ee/tabby-webserver/src/service/answer.rs | 58 +++---
.../src/service/answer/testutils/mod.rs | 25 +--
.../background_job/third_party_integration.rs | 8 +-
.../third_party_integration/issues.rs | 53 ++----
.../src/service/background_job/web_crawler.rs | 17 +-
ee/tabby-webserver/src/service/thread.rs | 2 +-
ee/tabby-webserver/src/webserver.rs | 2 +-
39 files changed, 773 insertions(+), 440 deletions(-)
delete mode 100644 crates/tabby-common/src/api/doc.rs
create mode 100644 crates/tabby-common/src/api/structured_doc.rs
delete mode 100644 crates/tabby-common/src/index/doc.rs
create mode 100644 crates/tabby-common/src/index/structured_doc.rs
delete mode 100644 crates/tabby-index/src/doc/mod.rs
delete mode 100644 crates/tabby-index/src/doc/public.rs
create mode 100644 crates/tabby-index/src/structured_doc/mod.rs
create mode 100644 crates/tabby-index/src/structured_doc/public.rs
create mode 100644 crates/tabby-index/src/structured_doc/types.rs
create mode 100644 crates/tabby-index/src/structured_doc/types/issue.rs
create mode 100644 crates/tabby-index/src/structured_doc/types/web.rs
rename crates/tabby/src/services/{doc => structured_doc}/mod.rs (88%)
rename crates/tabby/src/services/{doc => structured_doc}/serper.rs (93%)
rename crates/tabby/src/services/{doc => structured_doc}/tantivy.rs (76%)
diff --git a/crates/tabby-common/src/api/doc.rs b/crates/tabby-common/src/api/doc.rs
deleted file mode 100644
index 379c0f7cba34..000000000000
--- a/crates/tabby-common/src/api/doc.rs
+++ /dev/null
@@ -1,46 +0,0 @@
-use async_trait::async_trait;
-use thiserror::Error;
-
-pub struct DocSearchResponse {
- pub hits: Vec,
-}
-
-pub struct DocSearchHit {
- pub score: f32,
- pub doc: DocSearchDocument,
-}
-
-#[derive(Clone)]
-pub struct DocSearchDocument {
- pub title: String,
- pub link: String,
- pub snippet: String,
-}
-
-#[derive(Error, Debug)]
-pub enum DocSearchError {
- #[error("index not ready")]
- NotReady,
-
- #[error(transparent)]
- QueryParserError(#[from] tantivy::query::QueryParserError),
-
- #[error(transparent)]
- TantivyError(#[from] tantivy::TantivyError),
-
- #[error(transparent)]
- Other(#[from] anyhow::Error),
-}
-
-#[async_trait]
-pub trait DocSearch: Send + Sync {
- /// Search docs from underlying index.
- ///
- /// * `source_ids`: Filter documents by source IDs, when empty, search all sources.
- async fn search(
- &self,
- source_ids: &[String],
- q: &str,
- limit: usize,
- ) -> Result;
-}
diff --git a/crates/tabby-common/src/api/mod.rs b/crates/tabby-common/src/api/mod.rs
index 46581c56dd7a..885aa4450b14 100644
--- a/crates/tabby-common/src/api/mod.rs
+++ b/crates/tabby-common/src/api/mod.rs
@@ -1,4 +1,4 @@
pub mod code;
-pub mod doc;
pub mod event;
pub mod server_setting;
+pub mod structured_doc;
diff --git a/crates/tabby-common/src/api/structured_doc.rs b/crates/tabby-common/src/api/structured_doc.rs
new file mode 100644
index 000000000000..03d43f004dbc
--- /dev/null
+++ b/crates/tabby-common/src/api/structured_doc.rs
@@ -0,0 +1,169 @@
+use async_trait::async_trait;
+use tantivy::{
+ schema::{self, document::CompactDocValue, Value},
+ TantivyDocument,
+};
+use thiserror::Error;
+
+use crate::index::{structured_doc, IndexSchema};
+
+pub struct DocSearchResponse {
+    pub hits: Vec<DocSearchHit>, // scored results returned by a `DocSearch::search` call
+}
+
+pub struct DocSearchHit { // one search result: a document paired with its score
+ pub score: f32, // score assigned by the search backend
+ pub doc: DocSearchDocument, // the matched document (web page or issue)
+}
+
+#[derive(Clone)]
+pub enum DocSearchDocument { // document payload of a search hit, one variant per kind
+ Web(DocSearchWebDocument), // a web page: title, link and matched snippet
+ Issue(DocSearchIssueDocument), // an issue: title, link, body and closed flag
+}
+
+#[derive(Error, Debug)]
+pub enum DocSearchError { // failure modes surfaced by `DocSearch::search`
+ #[error("index not ready")]
+ NotReady, // the index cannot be queried yet
+
+ #[error(transparent)]
+ QueryParserError(#[from] tantivy::query::QueryParserError), // forwarded from tantivy's query parser
+
+ #[error(transparent)]
+ TantivyError(#[from] tantivy::TantivyError), // any other tantivy failure
+
+ #[error(transparent)]
+ Other(#[from] anyhow::Error), // catch-all; converted via `?` from anyhow errors
+}
+
+#[async_trait]
+pub trait DocSearch: Send + Sync {
+    /// Search docs from underlying index.
+    ///
+    /// * `source_ids`: Filter documents by source IDs, when empty, search all sources.
+    async fn search(
+        &self,
+        source_ids: &[String],
+        q: &str, // query text
+        limit: usize,
+    ) -> Result<DocSearchResponse, DocSearchError>;
+}
+
+#[derive(Clone)]
+pub struct DocSearchWebDocument { // web-page search result
+ pub title: String, // read from the document-level `title` attribute
+ pub link: String, // read from the document-level `link` attribute
+ pub snippet: String, // text of the matched chunk (`chunk_text` attribute)
+}
+
+#[derive(Clone)]
+pub struct DocSearchIssueDocument { // issue search result; all fields come from document-level attributes
+ pub title: String, // `title` attribute
+ pub link: String, // `link` attribute
+ pub body: String, // `body` attribute
+ pub closed: bool, // `closed` attribute
+}
+
+pub trait FromTantivyDocument { // builds `Self` from an indexed document plus one of its chunks
+    fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self>
+    where
+        Self: Sized; // required because the method returns `Self` by value
+}
+
+impl FromTantivyDocument for DocSearchDocument {
+    fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self> {
+        let schema = IndexSchema::instance();
+        let kind = get_json_text_field(doc, schema.field_attributes, structured_doc::fields::KIND);
+
+        match kind { // dispatch on the stored `kind` attribute
+            "web" => {
+                DocSearchWebDocument::from_tantivy_document(doc, chunk).map(DocSearchDocument::Web)
+            }
+            "issue" => DocSearchIssueDocument::from_tantivy_document(doc, chunk)
+                .map(DocSearchDocument::Issue),
+            _ => None, // unrecognized kind: no document is produced
+        }
+    }
+}
+
+impl FromTantivyDocument for DocSearchWebDocument {
+    fn from_tantivy_document(doc: &TantivyDocument, chunk: &TantivyDocument) -> Option<Self> {
+        let schema = IndexSchema::instance();
+        let title = get_json_text_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::web::TITLE,
+        );
+        let link = get_json_text_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::web::LINK,
+        );
+        let snippet = get_json_text_field(
+            chunk, // the snippet is read from the chunk, not from the parent document
+            schema.field_chunk_attributes,
+            structured_doc::fields::web::CHUNK_TEXT,
+        );
+
+        Some(Self {
+            title: title.into(),
+            link: link.into(),
+            snippet: snippet.into(),
+        })
+    }
+}
+
+impl FromTantivyDocument for DocSearchIssueDocument {
+    fn from_tantivy_document(doc: &TantivyDocument, _: &TantivyDocument) -> Option<Self> {
+        let schema = IndexSchema::instance(); // the chunk argument is ignored: issues read only document attributes
+        let title = get_json_text_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::issue::TITLE,
+        );
+        let link = get_json_text_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::issue::LINK,
+        );
+        let body = get_json_text_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::issue::BODY,
+        );
+        let closed = get_json_bool_field(
+            doc,
+            schema.field_attributes,
+            structured_doc::fields::issue::CLOSED,
+        );
+        Some(Self {
+            title: title.into(),
+            link: link.into(),
+            body: body.into(),
+            closed,
+        })
+    }
+}
+
+fn get_json_field<'a>( // looks up key `name` inside the JSON object stored in `field` of `doc`
+ doc: &'a TantivyDocument,
+ field: schema::Field,
+ name: &str,
+) -> CompactDocValue<'a> {
+ doc.get_first(field)
+ .unwrap() // panics if the document has no value for `field`
+ .as_object()
+ .unwrap() // panics if that value is not a JSON object
+ .find(|(k, _)| *k == name)
+ .unwrap() // panics if the object has no key equal to `name`
+ .1 // keep the value half of the (key, value) pair
+}
+
+fn get_json_bool_field(doc: &TantivyDocument, field: schema::Field, name: &str) -> bool {
+ get_json_field(doc, field, name).as_bool().unwrap() // panics if the value is not a bool
+}
+
+fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
+ get_json_field(doc, field, name).as_str().unwrap() // panics if the value is not a string
+}
diff --git a/crates/tabby-common/src/config.rs b/crates/tabby-common/src/config.rs
index d52c51b4c585..7f0df26a1349 100644
--- a/crates/tabby-common/src/config.rs
+++ b/crates/tabby-common/src/config.rs
@@ -487,7 +487,7 @@ mod tests {
}
assert!(
- matches!(Config::validate_model_config(&config.model.completion), Err(ref e) if true)
+ matches!(Config::validate_model_config(&config.model.completion), Err(ref _e) if true)
);
assert!(Config::validate_model_config(&config.model.chat).is_ok());
}
diff --git a/crates/tabby-common/src/index/doc.rs b/crates/tabby-common/src/index/doc.rs
deleted file mode 100644
index 6489c22c84df..000000000000
--- a/crates/tabby-common/src/index/doc.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-pub mod fields {
- pub const TITLE: &str = "title";
- pub const LINK: &str = "link";
- pub const CHUNK_TEXT: &str = "chunk_text";
-}
diff --git a/crates/tabby-common/src/index/mod.rs b/crates/tabby-common/src/index/mod.rs
index e212d98a8366..08da621c926b 100644
--- a/crates/tabby-common/src/index/mod.rs
+++ b/crates/tabby-common/src/index/mod.rs
@@ -1,5 +1,5 @@
pub mod code;
-pub mod doc;
+pub mod structured_doc;
use std::borrow::Cow;
@@ -75,6 +75,12 @@ pub const FIELD_SOURCE_ID: &str = "source_id";
pub mod corpus {
pub const CODE: &str = "code";
+ pub const STRUCTURED_DOC: &str = "structured_doc";
+
+ #[deprecated(
+ since = "0.20.0",
+ note = "The web corpus is deprecated and will be removed during the version upgrade."
+ )]
pub const WEB: &str = "web";
}
diff --git a/crates/tabby-common/src/index/structured_doc.rs b/crates/tabby-common/src/index/structured_doc.rs
new file mode 100644
index 000000000000..9dceabbe7506
--- /dev/null
+++ b/crates/tabby-common/src/index/structured_doc.rs
@@ -0,0 +1,16 @@
+pub mod fields { // attribute key names used by structured documents in the index
+ pub const KIND: &str = "kind"; // key holding the document kind ("web" or "issue")
+
+ pub mod web { // keys for web documents
+ pub const TITLE: &str = "title";
+ pub const LINK: &str = "link";
+ pub const CHUNK_TEXT: &str = "chunk_text"; // chunk-level attribute holding the chunk's text
+ }
+
+ pub mod issue { // keys for issue documents
+ pub const TITLE: &str = "title";
+ pub const LINK: &str = "link";
+ pub const BODY: &str = "body";
+ pub const CLOSED: &str = "closed"; // stored as a JSON bool
+ }
+}
diff --git a/crates/tabby-index/src/code/mod.rs b/crates/tabby-index/src/code/mod.rs
index 427f6e35f79e..39d43f36976b 100644
--- a/crates/tabby-index/src/code/mod.rs
+++ b/crates/tabby-index/src/code/mod.rs
@@ -65,10 +65,10 @@ impl IndexAttributeBuilder for CodeBuilder {
json!({})
}
- async fn build_chunk_attributes(
+ async fn build_chunk_attributes<'a>(
&self,
- source_code: &SourceCode,
- ) -> BoxStream, serde_json::Value)>> {
+ source_code: &'a SourceCode,
+ ) -> BoxStream<'a, JoinHandle<(Vec, serde_json::Value)>> {
let text = match source_code.read_content() {
Ok(content) => content,
Err(e) => {
diff --git a/crates/tabby-index/src/doc/mod.rs b/crates/tabby-index/src/doc/mod.rs
deleted file mode 100644
index 44011d5fe51a..000000000000
--- a/crates/tabby-index/src/doc/mod.rs
+++ /dev/null
@@ -1,99 +0,0 @@
-pub mod public;
-
-use std::{collections::HashSet, sync::Arc};
-
-use async_stream::stream;
-use async_trait::async_trait;
-use futures::stream::BoxStream;
-use public::WebDocument;
-use serde_json::json;
-use tabby_common::index::{self, corpus, doc};
-use tabby_inference::Embedding;
-use tantivy::doc;
-use text_splitter::TextSplitter;
-use tokio::task::JoinHandle;
-use tracing::warn;
-
-use crate::{indexer::TantivyDocBuilder, IndexAttributeBuilder};
-
-const CHUNK_SIZE: usize = 2048;
-
-pub struct DocBuilder {
- embedding: Arc,
-}
-
-impl DocBuilder {
- fn new(embedding: Arc) -> Self {
- Self { embedding }
- }
-}
-
-#[async_trait]
-impl IndexAttributeBuilder for DocBuilder {
- async fn build_attributes(&self, document: &WebDocument) -> serde_json::Value {
- json!({
- doc::fields::TITLE: document.title,
- doc::fields::LINK: document.link,
- })
- }
-
- /// This function splits the document into chunks and computes the embedding for each chunk. It then converts the embeddings
- /// into binarized tokens by thresholding on zero.
- async fn build_chunk_attributes(
- &self,
- document: &WebDocument,
- ) -> BoxStream, serde_json::Value)>> {
- let embedding = self.embedding.clone();
- let chunks: Vec<_> = TextSplitter::new(CHUNK_SIZE)
- .chunks(&document.body)
- .map(|x| x.to_owned())
- .collect();
-
- let title_embedding_tokens = build_tokens(embedding.clone(), &document.title).await;
- let s = stream! {
- for chunk_text in chunks {
- let title_embedding_tokens = title_embedding_tokens.clone();
- let embedding = embedding.clone();
- yield tokio::spawn(async move {
- let chunk_embedding_tokens = build_tokens(embedding.clone(), &chunk_text).await;
- let chunk = json!({
- doc::fields::CHUNK_TEXT: chunk_text,
- });
-
- // Title embedding tokens are merged with chunk embedding tokens to enhance the search results.
- let tokens = merge_tokens(vec![title_embedding_tokens, chunk_embedding_tokens]);
- (tokens, chunk)
- });
- }
- };
-
- Box::pin(s)
- }
-}
-
-async fn build_tokens(embedding: Arc, text: &str) -> Vec {
- let embedding = match embedding.embed(text).await {
- Ok(embedding) => embedding,
- Err(err) => {
- warn!("Failed to embed chunk text: {}", err);
- return vec![];
- }
- };
-
- let mut chunk_embedding_tokens = vec![];
- for token in index::binarize_embedding(embedding.iter()) {
- chunk_embedding_tokens.push(token);
- }
-
- chunk_embedding_tokens
-}
-
-fn create_web_builder(embedding: Arc) -> TantivyDocBuilder {
- let builder = DocBuilder::new(embedding);
- TantivyDocBuilder::new(corpus::WEB, builder)
-}
-
-pub fn merge_tokens(tokens: Vec>) -> Vec {
- let tokens = tokens.into_iter().flatten().collect::>();
- tokens.into_iter().collect()
-}
diff --git a/crates/tabby-index/src/doc/public.rs b/crates/tabby-index/src/doc/public.rs
deleted file mode 100644
index 8e19971d4617..000000000000
--- a/crates/tabby-index/src/doc/public.rs
+++ /dev/null
@@ -1,125 +0,0 @@
-use std::sync::Arc;
-
-use async_stream::stream;
-use chrono::{DateTime, Utc};
-use futures::StreamExt;
-use tabby_common::index::corpus;
-use tabby_inference::Embedding;
-
-use super::create_web_builder;
-use crate::{
- indexer::{IndexId, TantivyDocBuilder, ToIndexId},
- Indexer,
-};
-
-pub struct DocIndexer {
- builder: TantivyDocBuilder,
- indexer: Indexer,
-}
-
-pub struct WebDocument {
- pub id: String,
- pub source_id: String,
- pub link: String,
- pub title: String,
- pub body: String,
-}
-
-impl ToIndexId for WebDocument {
- fn to_index_id(&self) -> IndexId {
- IndexId {
- source_id: self.source_id.clone(),
- id: self.id.clone(),
- }
- }
-}
-
-impl DocIndexer {
- pub fn new(embedding: Arc) -> Self {
- let builder = create_web_builder(embedding);
- let indexer = Indexer::new(corpus::WEB);
- Self { indexer, builder }
- }
-
- pub async fn add(&self, updated_at: DateTime, document: WebDocument) -> bool {
- let is_document_empty = document.body.trim().is_empty();
- if is_document_empty || self.indexer.is_indexed_after(&document.id, updated_at) {
- return false;
- };
-
- stream! {
- let (id, s) = self.builder.build(document).await;
- self.indexer.delete(&id);
-
- for await doc in s.buffer_unordered(std::cmp::max(std::thread::available_parallelism().unwrap().get() * 2, 32)) {
- if let Ok(Some(doc)) = doc {
- self.indexer.add(doc).await;
- }
- }
- }.count().await;
- true
- }
-
- pub fn commit(self) {
- self.indexer.commit();
- }
-}
-
-#[cfg(test)]
-mod tests {
- use std::sync::Arc;
-
- use async_trait::async_trait;
- use serial_test::serial;
- use tabby_common::path::set_tabby_root;
- use temp_testdir::TempDir;
-
- use super::*;
-
- struct FakeEmbedding;
-
- #[async_trait]
- impl Embedding for FakeEmbedding {
- async fn embed(&self, _prompt: &str) -> anyhow::Result> {
- Ok(vec![0.0; 16])
- }
- }
-
- fn create_testing_document() -> WebDocument {
- WebDocument {
- id: "1".to_string(),
- source_id: "1".to_string(),
- link: "https://example.com".to_string(),
- title: "Example".to_string(),
- body: "Hello, world!".to_string(),
- }
- }
-
- #[tokio::test]
- #[serial(set_tabby_root)]
- async fn test_add() {
- let tmp_dir = TempDir::default();
- set_tabby_root(tmp_dir.to_path_buf());
- let embedding = Arc::new(FakeEmbedding);
- let indexer = DocIndexer::new(embedding.clone());
- let updated_at = Utc::now();
-
- // Insert a new document
- assert!(indexer.add(updated_at, create_testing_document()).await);
- indexer.commit();
-
- // For document with the same id, and the updated_at is not newer, it should not be added.
- let indexer = DocIndexer::new(embedding);
- assert!(!indexer.add(updated_at, create_testing_document()).await);
-
- // For document with the same id, and the updated_at is newer, it should be added.
- assert!(
- indexer
- .add(
- updated_at + chrono::Duration::seconds(1),
- create_testing_document()
- )
- .await
- );
- }
-}
diff --git a/crates/tabby-index/src/indexer.rs b/crates/tabby-index/src/indexer.rs
index 646a917b1367..d3d33b1189ee 100644
--- a/crates/tabby-index/src/indexer.rs
+++ b/crates/tabby-index/src/indexer.rs
@@ -40,10 +40,10 @@ pub trait IndexAttributeBuilder: Send + Sync {
async fn build_attributes(&self, document: &T) -> serde_json::Value;
/// Build chunk level attributes, these attributes are stored and indexed.
- async fn build_chunk_attributes(
+ async fn build_chunk_attributes<'a>(
&self,
- document: &T,
- ) -> BoxStream, serde_json::Value)>>;
+ document: &'a T,
+ ) -> BoxStream<'a, JoinHandle<(Vec, serde_json::Value)>>;
}
pub struct TantivyDocBuilder {
diff --git a/crates/tabby-index/src/lib.rs b/crates/tabby-index/src/lib.rs
index 1bb14d0dc8d8..dc87050a2b04 100644
--- a/crates/tabby-index/src/lib.rs
+++ b/crates/tabby-index/src/lib.rs
@@ -7,7 +7,7 @@ mod tantivy_utils;
use indexer::{IndexAttributeBuilder, Indexer};
-mod doc;
+mod structured_doc;
pub mod public {
use indexer::IndexGarbageCollector;
@@ -15,7 +15,10 @@ pub mod public {
use super::*;
pub use super::{
code::CodeIndexer,
- doc::public::{DocIndexer, WebDocument},
+ structured_doc::public::{
+ StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocIssueFields,
+ StructuredDocWebFields,
+ },
};
pub fn run_index_garbage_collection(active_sources: Vec) -> anyhow::Result<()> {
diff --git a/crates/tabby-index/src/structured_doc/mod.rs b/crates/tabby-index/src/structured_doc/mod.rs
new file mode 100644
index 000000000000..fcf5757d9c08
--- /dev/null
+++ b/crates/tabby-index/src/structured_doc/mod.rs
@@ -0,0 +1,51 @@
+pub mod public;
+mod types;
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use futures::stream::BoxStream;
+use serde_json::json;
+use tabby_common::index::{corpus, structured_doc};
+use tabby_inference::Embedding;
+use tokio::task::JoinHandle;
+use types::{BuildStructuredDoc, StructuredDoc};
+
+use crate::{indexer::TantivyDocBuilder, IndexAttributeBuilder};
+
+pub struct StructuredDocBuilder {
+    embedding: Arc<dyn Embedding>, // shared embedding backend handed to each document's chunk builder
+}
+
+impl StructuredDocBuilder {
+    fn new(embedding: Arc<dyn Embedding>) -> Self { // private: built via `create_structured_doc_builder`
+        Self { embedding }
+    }
+}
+
+#[async_trait]
+impl IndexAttributeBuilder<StructuredDoc> for StructuredDocBuilder {
+    async fn build_attributes(&self, document: &StructuredDoc) -> serde_json::Value {
+        let mut attributes = document.build_attributes().await;
+        attributes
+            .as_object_mut()
+            .unwrap() // every document kind builds its attributes with `json!({ .. })`, i.e. an object
+            .insert(structured_doc::fields::KIND.into(), json!(document.kind()));
+        attributes // kind-specific attributes plus the `kind` discriminator
+    }
+
+    async fn build_chunk_attributes<'a>(
+        &self,
+        document: &'a StructuredDoc,
+    ) -> BoxStream<'a, JoinHandle<(Vec<String>, serde_json::Value)>> {
+        let embedding = self.embedding.clone();
+        document.build_chunk_attributes(embedding).await
+    }
+}
+
+fn create_structured_doc_builder(
+    embedding: Arc<dyn Embedding>,
+) -> TantivyDocBuilder<StructuredDoc> {
+    let builder = StructuredDocBuilder::new(embedding);
+    TantivyDocBuilder::new(corpus::STRUCTURED_DOC, builder) // documents are written under the structured_doc corpus
+}
diff --git a/crates/tabby-index/src/structured_doc/public.rs b/crates/tabby-index/src/structured_doc/public.rs
new file mode 100644
index 000000000000..56e45fadb50a
--- /dev/null
+++ b/crates/tabby-index/src/structured_doc/public.rs
@@ -0,0 +1,53 @@
+use std::sync::Arc;
+
+use async_stream::stream;
+use chrono::{DateTime, Utc};
+use futures::StreamExt;
+use tabby_common::index::corpus;
+use tabby_inference::Embedding;
+
+pub use super::types::{
+ issue::IssueDocument as StructuredDocIssueFields, web::WebDocument as StructuredDocWebFields,
+ StructuredDoc, StructuredDocFields,
+};
+use super::{create_structured_doc_builder, types::BuildStructuredDoc};
+use crate::{indexer::TantivyDocBuilder, Indexer};
+
+pub struct StructuredDocIndexer {
+    builder: TantivyDocBuilder<StructuredDoc>, // turns a StructuredDoc into tantivy documents
+    indexer: Indexer, // index handle used for add / delete / commit
+}
+
+impl StructuredDocIndexer {
+    pub fn new(embedding: Arc<dyn Embedding>) -> Self {
+        let builder = create_structured_doc_builder(embedding);
+        let indexer = Indexer::new(corpus::STRUCTURED_DOC);
+        Self { indexer, builder }
+    }
+
+    pub async fn add(&self, updated_at: DateTime<Utc>, document: StructuredDoc) -> bool {
+        if document.should_skip() {
+            return false; // the document kind asked not to be indexed
+        }
+
+        if self.indexer.is_indexed_after(document.id(), updated_at) {
+            return false; // the indexed copy is already up to date for `updated_at`
+        };
+
+        stream! {
+            let (id, s) = self.builder.build(document).await;
+            self.indexer.delete(&id); // drop the previous version before re-adding
+
+            for await doc in s.buffer_unordered(std::cmp::max(std::thread::available_parallelism().map_or(1, |n| n.get()) * 2, 32)) {
+                if let Ok(Some(doc)) = doc {
+                    self.indexer.add(doc).await;
+                }
+            }
+        }.count().await; // `count()` only drives the stream to completion
+        true
+    }
+
+    pub fn commit(self) {
+        self.indexer.commit();
+    }
+}
diff --git a/crates/tabby-index/src/structured_doc/types.rs b/crates/tabby-index/src/structured_doc/types.rs
new file mode 100644
index 000000000000..f447354878cc
--- /dev/null
+++ b/crates/tabby-index/src/structured_doc/types.rs
@@ -0,0 +1,102 @@
+pub mod issue;
+pub mod web;
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use futures::stream::BoxStream;
+use tabby_inference::Embedding;
+use tokio::task::JoinHandle;
+use tracing::warn;
+
+use crate::indexer::{IndexId, ToIndexId};
+
+pub struct StructuredDoc { // a document to index: its source plus kind-specific fields
+ pub source_id: String, // copied into the IndexId by `to_index_id`
+ pub fields: StructuredDocFields, // web or issue payload
+}
+
+impl StructuredDoc {
+ pub fn id(&self) -> &str { // for both kinds, the document's link is its id
+ match &self.fields {
+ StructuredDocFields::Web(web) => &web.link,
+ StructuredDocFields::Issue(issue) => &issue.link,
+ }
+ }
+
+ pub fn kind(&self) -> &'static str { // string tag naming the variant held in `fields`
+ match &self.fields {
+ StructuredDocFields::Web(_) => "web",
+ StructuredDocFields::Issue(_) => "issue",
+ }
+ }
+}
+
+impl ToIndexId for StructuredDoc {
+ fn to_index_id(&self) -> IndexId { // index identity = (source_id, id())
+ IndexId {
+ source_id: self.source_id.clone(),
+ id: self.id().to_owned(), // `id()` borrows the link, so it is copied here
+ }
+ }
+}
+
+#[async_trait]
+pub trait BuildStructuredDoc {
+ fn should_skip(&self) -> bool;
+
+ async fn build_attributes(&self) -> serde_json::Value;
+ async fn build_chunk_attributes(
+ &self,
+ embedding: Arc,
+ ) -> BoxStream, serde_json::Value)>>;
+}
+
+pub enum StructuredDocFields { // kind-specific payload of a StructuredDoc
+ Web(web::WebDocument), // reported as kind "web"
+ Issue(issue::IssueDocument), // reported as kind "issue"
+}
+
+#[async_trait]
+impl BuildStructuredDoc for StructuredDoc {
+ fn should_skip(&self) -> bool {
+ match &self.fields {
+ StructuredDocFields::Web(doc) => doc.should_skip(),
+ StructuredDocFields::Issue(doc) => doc.should_skip(),
+ }
+ }
+
+ async fn build_attributes(&self) -> serde_json::Value {
+ match &self.fields {
+ StructuredDocFields::Web(doc) => doc.build_attributes().await,
+ StructuredDocFields::Issue(doc) => doc.build_attributes().await,
+ }
+ }
+
+ async fn build_chunk_attributes(
+ &self,
+ embedding: Arc,
+ ) -> BoxStream, serde_json::Value)>> {
+ match &self.fields {
+ StructuredDocFields::Web(doc) => doc.build_chunk_attributes(embedding).await,
+ StructuredDocFields::Issue(doc) => doc.build_chunk_attributes(embedding).await,
+ }
+ }
+}
+
+async fn build_tokens(embedding: Arc<dyn Embedding>, text: &str) -> Vec<String> {
+    let embedding = match embedding.embed(text).await {
+        Ok(embedding) => embedding,
+        Err(err) => {
+            warn!("Failed to embed chunk text: {}", err);
+            return vec![]; // best effort: an embedding failure yields no tokens instead of an error
+        }
+    };
+
+    let mut chunk_embedding_tokens = vec![];
+    for token in tabby_common::index::binarize_embedding(embedding.iter()) {
+        chunk_embedding_tokens.push(token);
+    }
+
+    chunk_embedding_tokens
+}
diff --git a/crates/tabby-index/src/structured_doc/types/issue.rs b/crates/tabby-index/src/structured_doc/types/issue.rs
new file mode 100644
index 000000000000..d760ad17f309
--- /dev/null
+++ b/crates/tabby-index/src/structured_doc/types/issue.rs
@@ -0,0 +1,50 @@
+use std::sync::Arc;
+
+use async_stream::stream;
+use async_trait::async_trait;
+use futures::stream::BoxStream;
+use serde_json::json;
+use tabby_common::index::structured_doc::fields;
+use tabby_inference::Embedding;
+use tokio::task::JoinHandle;
+
+use super::{build_tokens, BuildStructuredDoc};
+
+pub struct IssueDocument { // issue payload; every field is written to the document attributes
+ pub link: String, // also serves as the document id (see `StructuredDoc::id`)
+ pub title: String,
+ pub body: String,
+ pub closed: bool,
+}
+
+#[async_trait]
+impl BuildStructuredDoc for IssueDocument {
+ fn should_skip(&self) -> bool {
+ false
+ }
+
+ async fn build_attributes(&self) -> serde_json::Value {
+ json!({
+ fields::issue::LINK: self.link,
+ fields::issue::TITLE: self.title,
+ fields::issue::BODY: self.body,
+ fields::issue::CLOSED: self.closed,
+ })
+ }
+
+ async fn build_chunk_attributes(
+ &self,
+ embedding: Arc,
+ ) -> BoxStream, serde_json::Value)>> {
+ let text = format!("{}\n\n{}", self.title, self.body);
+ let s = stream! {
+ yield tokio::spawn(async move {
+ let tokens = build_tokens(embedding, &text).await;
+ let chunk_attributes = json!({});
+ (tokens, chunk_attributes)
+ })
+ };
+
+ Box::pin(s)
+ }
+}
diff --git a/crates/tabby-index/src/structured_doc/types/web.rs b/crates/tabby-index/src/structured_doc/types/web.rs
new file mode 100644
index 000000000000..5565258a6149
--- /dev/null
+++ b/crates/tabby-index/src/structured_doc/types/web.rs
@@ -0,0 +1,67 @@
+use std::{collections::HashSet, sync::Arc};
+
+use async_stream::stream;
+use async_trait::async_trait;
+use futures::stream::BoxStream;
+use serde_json::json;
+use tabby_common::index::structured_doc::fields;
+use tabby_inference::Embedding;
+use text_splitter::TextSplitter;
+use tokio::task::JoinHandle;
+
+use super::{build_tokens, BuildStructuredDoc};
+
+pub struct WebDocument { // web page payload
+ pub link: String, // also serves as the document id (see `StructuredDoc::id`)
+ pub title: String,
+ pub body: String, // split into chunks for indexing; a blank body causes the document to be skipped
+}
+
+#[async_trait]
+impl BuildStructuredDoc for WebDocument {
+ fn should_skip(&self) -> bool {
+ self.body.trim().is_empty()
+ }
+
+ async fn build_attributes(&self) -> serde_json::Value {
+ json!({
+ fields::web::TITLE: self.title,
+ fields::web::LINK: self.link,
+ })
+ }
+
+ async fn build_chunk_attributes(
+ &self,
+ embedding: Arc,
+ ) -> BoxStream, serde_json::Value)>> {
+ let chunks: Vec<_> = TextSplitter::new(2048)
+ .chunks(&self.body)
+ .map(|x| x.to_owned())
+ .collect();
+
+ let title_embedding_tokens = build_tokens(embedding.clone(), &self.title).await;
+ let s = stream! {
+ for chunk_text in chunks {
+ let title_embedding_tokens = title_embedding_tokens.clone();
+ let embedding = embedding.clone();
+ yield tokio::spawn(async move {
+ let chunk_embedding_tokens = build_tokens(embedding.clone(), &chunk_text).await;
+ let chunk = json!({
+ fields::web::CHUNK_TEXT: chunk_text,
+ });
+
+ // Title embedding tokens are merged with chunk embedding tokens to enhance the search results.
+ let tokens = merge_tokens(vec![title_embedding_tokens, chunk_embedding_tokens]);
+ (tokens, chunk)
+ });
+ }
+ };
+
+ Box::pin(s)
+ }
+}
+
+pub fn merge_tokens(tokens: Vec<Vec<String>>) -> Vec<String> {
+    let tokens = tokens.into_iter().flatten().collect::<HashSet<_>>(); // set removes duplicate tokens
+    tokens.into_iter().collect() // order follows HashSet iteration, so it is unspecified
+}
diff --git a/crates/tabby/src/serve.rs b/crates/tabby/src/serve.rs
index 6fbab8ae136c..23a6cf5fe9b5 100644
--- a/crates/tabby/src/serve.rs
+++ b/crates/tabby/src/serve.rs
@@ -150,7 +150,7 @@ pub async fn main(config: &Config, args: &ServeArgs) {
}
let index_reader_provider = Arc::new(IndexReaderProvider::default());
- let docsearch = Arc::new(services::doc::create(
+ let docsearch = Arc::new(services::structured_doc::create(
embedding.clone(),
index_reader_provider.clone(),
));
@@ -195,7 +195,7 @@ pub async fn main(config: &Config, args: &ServeArgs) {
chat,
completion_stream,
docsearch,
- |x| Box::new(services::doc::create_serper(x)),
+ |x| Box::new(services::structured_doc::create_serper(x)),
)
.await;
api = new_api;
diff --git a/crates/tabby/src/services/mod.rs b/crates/tabby/src/services/mod.rs
index 0c7d5892b252..2d572751a124 100644
--- a/crates/tabby/src/services/mod.rs
+++ b/crates/tabby/src/services/mod.rs
@@ -1,8 +1,8 @@
pub mod code;
pub mod completion;
-pub mod doc;
pub mod embedding;
pub mod event;
pub mod health;
pub mod model;
+pub mod structured_doc;
pub mod tantivy;
diff --git a/crates/tabby/src/services/doc/mod.rs b/crates/tabby/src/services/structured_doc/mod.rs
similarity index 88%
rename from crates/tabby/src/services/doc/mod.rs
rename to crates/tabby/src/services/structured_doc/mod.rs
index 128542af5150..c13d64ee40cc 100644
--- a/crates/tabby/src/services/doc/mod.rs
+++ b/crates/tabby/src/services/structured_doc/mod.rs
@@ -3,7 +3,7 @@ mod tantivy;
use std::sync::Arc;
-use tabby_common::api::doc::DocSearch;
+use tabby_common::api::structured_doc::DocSearch;
use tabby_inference::Embedding;
use super::tantivy::IndexReaderProvider;
diff --git a/crates/tabby/src/services/doc/serper.rs b/crates/tabby/src/services/structured_doc/serper.rs
similarity index 93%
rename from crates/tabby/src/services/doc/serper.rs
rename to crates/tabby/src/services/structured_doc/serper.rs
index 3cdd59185b11..515ad3b10609 100644
--- a/crates/tabby/src/services/doc/serper.rs
+++ b/crates/tabby/src/services/structured_doc/serper.rs
@@ -1,7 +1,8 @@
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
-use tabby_common::api::doc::{
+use tabby_common::api::structured_doc::{
DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse,
+ DocSearchWebDocument,
};
use tracing::warn;
@@ -77,11 +78,11 @@ impl DocSearch for SerperService {
.into_iter()
.map(|hit| DocSearchHit {
score: 0.0,
- doc: DocSearchDocument {
+ doc: DocSearchDocument::Web(DocSearchWebDocument {
title: hit.title,
link: hit.link,
snippet: hit.snippet,
- },
+ }),
})
.collect();
diff --git a/crates/tabby/src/services/doc/tantivy.rs b/crates/tabby/src/services/structured_doc/tantivy.rs
similarity index 76%
rename from crates/tabby/src/services/doc/tantivy.rs
rename to crates/tabby/src/services/structured_doc/tantivy.rs
index cecc98841c03..762104efe284 100644
--- a/crates/tabby/src/services/doc/tantivy.rs
+++ b/crates/tabby/src/services/structured_doc/tantivy.rs
@@ -3,8 +3,11 @@ use std::{collections::HashSet, sync::Arc};
use anyhow::Result;
use async_trait::async_trait;
use tabby_common::{
- api::doc::{DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse},
- index::{self, corpus, doc},
+ api::structured_doc::{
+ DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse,
+ FromTantivyDocument,
+ },
+ index::{self, corpus},
};
use tabby_inference::Embedding;
use tantivy::{
@@ -40,7 +43,7 @@ impl DocSearchImpl {
let embedding = self.embedding.embed(q).await?;
let embedding_tokens_query =
index::embedding_tokens_query(embedding.len(), embedding.iter());
- let corpus_query = schema.corpus_query(corpus::WEB);
+ let corpus_query = schema.corpus_query(corpus::STRUCTURED_DOC);
let mut query_clauses: Vec<(Occur, Box)> = vec![
(
@@ -94,13 +97,7 @@ impl DocSearchImpl {
score,
chunk,
}| {
- let chunk_text = get_json_text_field(
- chunk,
- schema.field_chunk_attributes,
- doc::fields::CHUNK_TEXT,
- );
-
- let doc_query = schema.doc_query(corpus::WEB, doc_id);
+ let doc_query = schema.doc_query(corpus::STRUCTURED_DOC, doc_id);
let top_docs = match searcher.search(&doc_query, &TopDocs::with_limit(1)) {
Err(err) => {
warn!("Failed to search doc `{}`: `{}`", doc_id, err);
@@ -110,19 +107,8 @@ impl DocSearchImpl {
};
let (_, doc_address) = top_docs.first()?;
let doc: TantivyDocument = searcher.doc(*doc_address).ok()?;
- let title =
- get_json_text_field(&doc, schema.field_attributes, doc::fields::TITLE);
- let link =
- get_json_text_field(&doc, schema.field_attributes, doc::fields::LINK);
-
- Some(DocSearchHit {
- doc: DocSearchDocument {
- title: title.to_string(),
- link: link.to_string(),
- snippet: chunk_text.to_string(),
- },
- score: *score,
- })
+ DocSearchDocument::from_tantivy_document(&doc, chunk)
+ .map(|doc| DocSearchHit { score: *score, doc })
},
)
.filter(|x| x.score >= EMBEDDING_SCORE_THRESHOLD)
@@ -137,18 +123,6 @@ fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
doc.get_first(field).unwrap().as_str().unwrap()
}
-fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
- doc.get_first(field)
- .unwrap()
- .as_object()
- .unwrap()
- .find(|(k, _)| *k == name)
- .unwrap()
- .1
- .as_str()
- .unwrap()
-}
-
struct ScoredChunk {
doc_id: String,
score: f32,
diff --git a/ee/tabby-db/src/lib.rs b/ee/tabby-db/src/lib.rs
index dd9c724a357d..a288d678185b 100644
--- a/ee/tabby-db/src/lib.rs
+++ b/ee/tabby-db/src/lib.rs
@@ -15,7 +15,8 @@ pub use server_setting::ServerSettingDAO;
use sqlx::{query, query_scalar, sqlite::SqliteQueryResult, Pool, Sqlite, SqlitePool};
pub use threads::{
ThreadDAO, ThreadMessageAttachmentClientCode, ThreadMessageAttachmentCode,
- ThreadMessageAttachmentDoc, ThreadMessageDAO,
+ ThreadMessageAttachmentDoc, ThreadMessageAttachmentIssueDoc, ThreadMessageAttachmentWebDoc,
+ ThreadMessageDAO,
};
use tokio::sync::Mutex;
use user_completions::UserCompletionDailyStatsDAO;
diff --git a/ee/tabby-db/src/threads.rs b/ee/tabby-db/src/threads.rs
index ede3a48290c1..01d2e0292853 100644
--- a/ee/tabby-db/src/threads.rs
+++ b/ee/tabby-db/src/threads.rs
@@ -32,12 +32,27 @@ pub struct ThreadMessageDAO {
}
#[derive(Serialize, Deserialize)]
-pub struct ThreadMessageAttachmentDoc {
+#[serde(untagged)] // Mark the serde serialization format as untagged for backward compatibility: https://serde.rs/enum-representations.html#untagged
+pub enum ThreadMessageAttachmentDoc {
+ Web(ThreadMessageAttachmentWebDoc),
+ Issue(ThreadMessageAttachmentIssueDoc),
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ThreadMessageAttachmentWebDoc {
pub title: String,
pub link: String,
pub content: String,
}
+#[derive(Serialize, Deserialize)]
+pub struct ThreadMessageAttachmentIssueDoc {
+ pub title: String,
+ pub link: String,
+ pub body: String,
+ pub closed: bool,
+}
+
#[derive(Serialize, Deserialize)]
pub struct ThreadMessageAttachmentCode {
pub git_url: String,
diff --git a/ee/tabby-schema/graphql/schema.graphql b/ee/tabby-schema/graphql/schema.graphql
index b029d731d089..085bac638402 100644
--- a/ee/tabby-schema/graphql/schema.graphql
+++ b/ee/tabby-schema/graphql/schema.graphql
@@ -509,7 +509,14 @@ type MessageAttachmentCodeScores {
embedding: Float!
}
-type MessageAttachmentDoc {
+type MessageAttachmentIssueDoc {
+ title: String!
+ link: String!
+ body: String!
+ closed: Boolean!
+}
+
+type MessageAttachmentWebDoc {
title: String!
link: String!
content: String!
@@ -893,6 +900,8 @@ type WebContextSource implements ContextSourceId & ContextSource {
sourceName: String!
}
+union MessageAttachmentDoc = MessageAttachmentWebDoc | MessageAttachmentIssueDoc
+
"""
Schema of thread run stream.
diff --git a/ee/tabby-schema/src/dao.rs b/ee/tabby-schema/src/dao.rs
index 2638f6988dcb..dc0da9b3bc3b 100644
--- a/ee/tabby-schema/src/dao.rs
+++ b/ee/tabby-schema/src/dao.rs
@@ -4,7 +4,8 @@ use lazy_static::lazy_static;
use tabby_db::{
EmailSettingDAO, IntegrationDAO, InvitationDAO, JobRunDAO, OAuthCredentialDAO,
ServerSettingDAO, ThreadDAO, ThreadMessageAttachmentClientCode, ThreadMessageAttachmentCode,
- ThreadMessageAttachmentDoc, ThreadMessageDAO, UserEventDAO,
+ ThreadMessageAttachmentDoc, ThreadMessageAttachmentIssueDoc, ThreadMessageAttachmentWebDoc,
+ ThreadMessageDAO, UserEventDAO,
};
use crate::{
@@ -229,20 +230,44 @@ impl From<&thread::MessageAttachmentCodeInput> for ThreadMessageAttachmentClient
impl From for thread::MessageAttachmentDoc {
fn from(value: ThreadMessageAttachmentDoc) -> Self {
- Self {
- title: value.title,
- link: value.link,
- content: value.content,
+ match value {
+ ThreadMessageAttachmentDoc::Web(val) => {
+ thread::MessageAttachmentDoc::Web(thread::MessageAttachmentWebDoc {
+ title: val.title,
+ link: val.link,
+ content: val.content,
+ })
+ }
+ ThreadMessageAttachmentDoc::Issue(val) => {
+ thread::MessageAttachmentDoc::Issue(thread::MessageAttachmentIssueDoc {
+ title: val.title,
+ link: val.link,
+ body: val.body,
+ closed: val.closed,
+ })
+ }
}
}
}
impl From<&thread::MessageAttachmentDoc> for ThreadMessageAttachmentDoc {
fn from(val: &thread::MessageAttachmentDoc) -> Self {
- ThreadMessageAttachmentDoc {
- title: val.title.clone(),
- link: val.link.clone(),
- content: val.content.clone(),
+ match val {
+ thread::MessageAttachmentDoc::Web(val) => {
+ ThreadMessageAttachmentDoc::Web(ThreadMessageAttachmentWebDoc {
+ title: val.title.clone(),
+ link: val.link.clone(),
+ content: val.content.clone(),
+ })
+ }
+ thread::MessageAttachmentDoc::Issue(val) => {
+ ThreadMessageAttachmentDoc::Issue(ThreadMessageAttachmentIssueDoc {
+ title: val.title.clone(),
+ link: val.link.clone(),
+ body: val.body.clone(),
+ closed: val.closed,
+ })
+ }
}
}
}
diff --git a/ee/tabby-schema/src/schema/thread/types.rs b/ee/tabby-schema/src/schema/thread/types.rs
index afcdb907d886..f89cc850575f 100644
--- a/ee/tabby-schema/src/schema/thread/types.rs
+++ b/ee/tabby-schema/src/schema/thread/types.rs
@@ -3,7 +3,7 @@ use juniper::{GraphQLEnum, GraphQLInputObject, GraphQLObject, GraphQLUnion, ID};
use serde::Serialize;
use tabby_common::api::{
code::{CodeSearchDocument, CodeSearchHit, CodeSearchScores},
- doc::{DocSearchDocument, DocSearchHit},
+ structured_doc::{DocSearchDocument, DocSearchHit},
};
use validator::Validate;
@@ -121,19 +121,43 @@ impl From for MessageCodeSearchHit {
}
}
+#[derive(GraphQLUnion, Clone)]
+pub enum MessageAttachmentDoc {
+ Web(MessageAttachmentWebDoc),
+ Issue(MessageAttachmentIssueDoc),
+}
+
#[derive(GraphQLObject, Clone)]
-pub struct MessageAttachmentDoc {
+pub struct MessageAttachmentWebDoc {
pub title: String,
pub link: String,
pub content: String,
}
+#[derive(GraphQLObject, Clone)]
+pub struct MessageAttachmentIssueDoc {
+ pub title: String,
+ pub link: String,
+ pub body: String,
+ pub closed: bool,
+}
+
impl From for MessageAttachmentDoc {
fn from(doc: DocSearchDocument) -> Self {
- Self {
- title: doc.title,
- link: doc.link,
- content: doc.snippet,
+ match doc {
+ DocSearchDocument::Web(web) => MessageAttachmentDoc::Web(MessageAttachmentWebDoc {
+ title: web.title,
+ link: web.link,
+ content: web.snippet,
+ }),
+ DocSearchDocument::Issue(issue) => {
+ MessageAttachmentDoc::Issue(MessageAttachmentIssueDoc {
+ title: issue.title,
+ link: issue.link,
+ body: issue.body,
+ closed: issue.closed,
+ })
+ }
}
}
}
diff --git a/ee/tabby-ui/app/search/components/assistant-message-section.tsx b/ee/tabby-ui/app/search/components/assistant-message-section.tsx
index 44f5734ea53e..50e3670b3636 100644
--- a/ee/tabby-ui/app/search/components/assistant-message-section.tsx
+++ b/ee/tabby-ui/app/search/components/assistant-message-section.tsx
@@ -22,6 +22,7 @@ import {
import {
cn,
formatLineHashForCodeBrowser,
+ getContent,
getRangeFromAttachmentCode,
getRangeTextFromAttachmentCode
} from '@/lib/utils'
@@ -491,7 +492,7 @@ function SourceCard({
}
)}
>
- {normalizedText(source.content)}
+ {normalizedText(getContent(source))}
diff --git a/ee/tabby-ui/components/message-markdown/index.tsx b/ee/tabby-ui/components/message-markdown/index.tsx
index cb41ee5aa211..48cfa20be4a9 100644
--- a/ee/tabby-ui/components/message-markdown/index.tsx
+++ b/ee/tabby-ui/components/message-markdown/index.tsx
@@ -15,7 +15,7 @@ import {
MessageAttachmentDoc
} from '@/lib/gql/generates/graphql'
import { AttachmentCodeItem, AttachmentDocItem } from '@/lib/types'
-import { cn } from '@/lib/utils'
+import { cn, getContent } from '@/lib/utils'
import { CodeBlock, CodeBlockProps } from '@/components/ui/codeblock'
import {
HoverCard,
@@ -398,7 +398,7 @@ function RelevantDocumentBadge({
{relevantDocument.title}
- {normalizedText(relevantDocument.content)}
+ {normalizedText(getContent(relevantDocument))}
diff --git a/ee/tabby-ui/lib/hooks/use-thread-run.ts b/ee/tabby-ui/lib/hooks/use-thread-run.ts
index eb97f860b70c..e8daa2f7ad06 100644
--- a/ee/tabby-ui/lib/hooks/use-thread-run.ts
+++ b/ee/tabby-ui/lib/hooks/use-thread-run.ts
@@ -54,9 +54,18 @@ const CreateThreadAndRunSubscription = graphql(/* GraphQL */ `
... on ThreadAssistantMessageAttachmentsDoc {
hits {
doc {
- title
- link
- content
+ __typename
+ ... on MessageAttachmentWebDoc {
+ title
+ link
+ content
+ }
+ ... on MessageAttachmentIssueDoc {
+ title
+ link
+ body
+ closed
+ }
}
score
}
@@ -106,9 +115,18 @@ const CreateThreadRunSubscription = graphql(/* GraphQL */ `
... on ThreadAssistantMessageAttachmentsDoc {
hits {
doc {
- title
- link
- content
+ __typename
+ ... on MessageAttachmentWebDoc {
+ title
+ link
+ content
+ }
+ ... on MessageAttachmentIssueDoc {
+ title
+ link
+ body
+ closed
+ }
}
score
}
diff --git a/ee/tabby-ui/lib/tabby/query.ts b/ee/tabby-ui/lib/tabby/query.ts
index a39af1dda9a2..2821a05235ee 100644
--- a/ee/tabby-ui/lib/tabby/query.ts
+++ b/ee/tabby-ui/lib/tabby/query.ts
@@ -413,9 +413,18 @@ export const listThreadMessages = graphql(/* GraphQL */ `
startLine
}
doc {
- title
- link
- content
+ __typename
+ ... on MessageAttachmentWebDoc {
+ title
+ link
+ content
+ }
+ ... on MessageAttachmentIssueDoc {
+ title
+ link
+ body
+ closed
+ }
}
}
}
diff --git a/ee/tabby-ui/lib/utils/index.ts b/ee/tabby-ui/lib/utils/index.ts
index 8abaeb9c9303..c908130beada 100644
--- a/ee/tabby-ui/lib/utils/index.ts
+++ b/ee/tabby-ui/lib/utils/index.ts
@@ -3,7 +3,7 @@ import { compact, isNil } from 'lodash-es'
import { customAlphabet } from 'nanoid'
import { twMerge } from 'tailwind-merge'
-import { AttachmentCodeItem } from '@/lib/types'
+import { AttachmentCodeItem, AttachmentDocItem } from '@/lib/types'
export * from './chat'
@@ -121,3 +121,14 @@ export function getRangeTextFromAttachmentCode(code: AttachmentCodeItem) {
const { startLine, endLine } = getRangeFromAttachmentCode(code)
return formatLineHashForCodeBrowser({ start: startLine, end: endLine })
}
+
+export function getContent(item: AttachmentDocItem) {
+ switch (item.__typename) {
+ case 'MessageAttachmentWebDoc':
+ return item.content
+ case 'MessageAttachmentIssueDoc':
+ return item.body
+ }
+
+ return ''
+}
diff --git a/ee/tabby-webserver/src/service/answer.rs b/ee/tabby-webserver/src/service/answer.rs
index 8f0944f9d638..ec2343401cfe 100644
--- a/ee/tabby-webserver/src/service/answer.rs
+++ b/ee/tabby-webserver/src/service/answer.rs
@@ -22,7 +22,7 @@ use tabby_common::{
CodeSearch, CodeSearchError, CodeSearchHit, CodeSearchParams, CodeSearchQuery,
CodeSearchScores,
},
- doc::{DocSearch, DocSearchError, DocSearchHit},
+ structured_doc::{DocSearch, DocSearchError, DocSearchHit},
},
config::AnswerConfig,
};
@@ -33,9 +33,9 @@ use tabby_schema::{
repository::{Repository, RepositoryService},
thread::{
self, CodeQueryInput, CodeSearchParamsOverrideInput, DocQueryInput, MessageAttachment,
- ThreadAssistantMessageAttachmentsCode, ThreadAssistantMessageAttachmentsDoc,
- ThreadAssistantMessageContentDelta, ThreadRelevantQuestions, ThreadRunItem,
- ThreadRunOptionsInput,
+ MessageAttachmentDoc, ThreadAssistantMessageAttachmentsCode,
+ ThreadAssistantMessageAttachmentsDoc, ThreadAssistantMessageContentDelta,
+ ThreadRelevantQuestions, ThreadRunItem, ThreadRunOptionsInput,
},
};
use tracing::{debug, error, warn};
@@ -317,7 +317,7 @@ impl AnswerService {
attachment
.doc
.iter()
- .map(|doc| format!("```\n{}\n```", doc.content)),
+ .map(|doc| format!("```\n{}\n```", get_content(doc))),
)
.collect();
@@ -465,7 +465,7 @@ fn build_user_prompt(
let snippets: Vec = assistant_attachment
.doc
.iter()
- .map(|doc| format!("```\n{}\n```", doc.content))
+ .map(|doc| format!("```\n{}\n```", get_content(doc)))
.chain(
user_attachment_input
.map(|x| &x.code)
@@ -599,6 +599,13 @@ fn count_lines(path: &Path) -> std::io::Result {
Ok(count)
}
+fn get_content(doc: &MessageAttachmentDoc) -> &str {
+ match doc {
+ MessageAttachmentDoc::Web(web) => &web.content,
+ MessageAttachmentDoc::Issue(issue) => &issue.body,
+ }
+}
+
#[cfg(test)]
pub mod testutils;
@@ -613,7 +620,7 @@ mod tests {
code::{
CodeSearch, CodeSearchDocument, CodeSearchHit, CodeSearchParams, CodeSearchScores,
},
- doc::DocSearch,
+ structured_doc::{DocSearch, DocSearchDocument},
},
config::AnswerConfig,
};
@@ -700,15 +707,22 @@ mod tests {
}
}
+ fn get_title(doc: &DocSearchDocument) -> &str {
+ match doc {
+ DocSearchDocument::Web(web_doc) => &web_doc.title,
+ DocSearchDocument::Issue(issue_doc) => &issue_doc.title,
+ }
+ }
+
#[test]
fn test_build_user_prompt() {
let user_input = "What is the purpose of this code?";
let assistant_attachment = tabby_schema::thread::MessageAttachment {
- doc: vec![tabby_schema::thread::MessageAttachmentDoc {
+ doc: vec![tabby_schema::thread::MessageAttachmentDoc::Web(tabby_schema::thread::MessageAttachmentWebDoc {
title: "Documentation".to_owned(),
content: "This code implements a basic web server.".to_owned(),
link: "https://example.com/docs".to_owned(),
- }],
+ })],
code: vec![tabby_schema::thread::MessageAttachmentCode {
git_url: "https://github.com/".to_owned(),
filepath: "server.py".to_owned(),
@@ -735,11 +749,13 @@ mod tests {
fn test_convert_messages_to_chat_completion_request() {
// Fake assistant attachment
let attachment = tabby_schema::thread::MessageAttachment {
- doc: vec![tabby_schema::thread::MessageAttachmentDoc {
- title: "1. Example Document".to_owned(),
- content: "This is an example".to_owned(),
- link: "https://example.com".to_owned(),
- }],
+ doc: vec![tabby_schema::thread::MessageAttachmentDoc::Web(
+ tabby_schema::thread::MessageAttachmentWebDoc {
+ title: "1. Example Document".to_owned(),
+ content: "This is an example".to_owned(),
+ link: "https://example.com".to_owned(),
+ },
+ )],
code: vec![tabby_schema::thread::MessageAttachmentCode {
git_url: "https://github.com".to_owned(),
filepath: "server.py".to_owned(),
@@ -919,11 +935,13 @@ mod tests {
);
let attachment = MessageAttachment {
- doc: vec![tabby_schema::thread::MessageAttachmentDoc {
- title: "1. Example Document".to_owned(),
- content: "This is an example".to_owned(),
- link: "https://example.com".to_owned(),
- }],
+ doc: vec![tabby_schema::thread::MessageAttachmentDoc::Web(
+ tabby_schema::thread::MessageAttachmentWebDoc {
+ title: "1. Example Document".to_owned(),
+ content: "This is an example".to_owned(),
+ link: "https://example.com".to_owned(),
+ },
+ )],
code: vec![tabby_schema::thread::MessageAttachmentCode {
git_url: "https://github.com".to_owned(),
filepath: "server.py".to_owned(),
@@ -987,7 +1005,7 @@ mod tests {
assert_eq!(hits.len(), 10, "Expected 10 hits from the doc search");
assert!(
- hits.iter().any(|hit| hit.doc.title == "Document 1"),
+ hits.iter().any(|hit| get_title(&hit.doc) == "Document 1"),
"Expected to find a hit with title 'Document 1'"
);
}
diff --git a/ee/tabby-webserver/src/service/answer/testutils/mod.rs b/ee/tabby-webserver/src/service/answer/testutils/mod.rs
index 287838c62691..6d784ce4f141 100644
--- a/ee/tabby-webserver/src/service/answer/testutils/mod.rs
+++ b/ee/tabby-webserver/src/service/answer/testutils/mod.rs
@@ -12,7 +12,10 @@ use axum::async_trait;
use juniper::ID;
use tabby_common::api::{
code::{CodeSearch, CodeSearchError, CodeSearchParams, CodeSearchQuery, CodeSearchResponse},
- doc::{DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse},
+ structured_doc::{
+ DocSearch, DocSearchDocument, DocSearchError, DocSearchHit, DocSearchResponse,
+ DocSearchWebDocument,
+ },
};
use tabby_db::DbConn;
use tabby_inference::ChatCompletionStream;
@@ -159,43 +162,43 @@ impl DocSearch for FakeDocSearch {
let hits = vec![
DocSearchHit {
score: 1.0,
- doc: DocSearchDocument {
+ doc: DocSearchDocument::Web(DocSearchWebDocument {
title: "Document 1".to_string(),
link: "https://example.com/doc1".to_string(),
snippet: "Snippet for Document 1".to_string(),
- },
+ }),
},
DocSearchHit {
score: 0.9,
- doc: DocSearchDocument {
+ doc: DocSearchDocument::Web(DocSearchWebDocument {
title: "Document 2".to_string(),
link: "https://example.com/doc2".to_string(),
snippet: "Snippet for Document 2".to_string(),
- },
+ }),
},
DocSearchHit {
score: 0.8,
- doc: DocSearchDocument {
+ doc: DocSearchDocument::Web(DocSearchWebDocument {
title: "Document 3".to_string(),
link: "https://example.com/doc3".to_string(),
snippet: "Snippet for Document 3".to_string(),
- },
+ }),
},
DocSearchHit {
score: 0.7,
- doc: DocSearchDocument {
+ doc: DocSearchDocument::Web(DocSearchWebDocument {
title: "Document 4".to_string(),
link: "https://example.com/doc4".to_string(),
snippet: "Snippet for Document 4".to_string(),
- },
+ }),
},
DocSearchHit {
score: 0.6,
- doc: DocSearchDocument {
+ doc: DocSearchDocument::Web(DocSearchWebDocument {
title: "Document 5".to_string(),
link: "https://example.com/doc5".to_string(),
snippet: "Snippet for Document 5".to_string(),
- },
+ }),
},
];
Ok(DocSearchResponse { hits })
diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs
index ec65ae6a3145..76c670a5dbce 100644
--- a/ee/tabby-webserver/src/service/background_job/third_party_integration.rs
+++ b/ee/tabby-webserver/src/service/background_job/third_party_integration.rs
@@ -7,7 +7,7 @@ use issues::{list_github_issues, list_gitlab_issues};
use juniper::ID;
use serde::{Deserialize, Serialize};
use tabby_common::config::CodeRepository;
-use tabby_index::public::{CodeIndexer, DocIndexer, WebDocument};
+use tabby_index::public::{CodeIndexer, StructuredDoc, StructuredDocIndexer};
use tabby_inference::Embedding;
use tabby_schema::{
integration::{Integration, IntegrationKind, IntegrationService},
@@ -112,7 +112,7 @@ impl SchedulerGithubGitlabJob {
"Indexing documents for repository {}",
repository.display_name
);
- let index = DocIndexer::new(embedding);
+ let index = StructuredDocIndexer::new(embedding);
let s = match fetch_all_issues(&integration, &repository).await {
Ok(s) => s,
Err(e) => {
@@ -169,8 +169,8 @@ impl SchedulerGithubGitlabJob {
async fn fetch_all_issues(
integration: &Integration,
repository: &ProvidedRepository,
-) -> tabby_schema::Result, WebDocument)>> {
- let s: BoxStream<(DateTime, WebDocument)> = match &integration.kind {
+) -> tabby_schema::Result, StructuredDoc)>> {
+ let s: BoxStream<(DateTime, StructuredDoc)> = match &integration.kind {
IntegrationKind::Github | IntegrationKind::GithubSelfHosted => list_github_issues(
&repository.source_id(),
integration.api_base(),
diff --git a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs
index 949f381524ad..84bf63c7203f 100644
--- a/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs
+++ b/ee/tabby-webserver/src/service/background_job/third_party_integration/issues.rs
@@ -2,10 +2,10 @@ use anyhow::{anyhow, Result};
use async_stream::stream;
use chrono::{DateTime, Utc};
use futures::Stream;
-use gitlab::api::{issues::ProjectIssues, projects::merge_requests::MergeRequests, AsyncQuery};
+use gitlab::api::{issues::ProjectIssues, AsyncQuery};
use octocrab::Octocrab;
use serde::Deserialize;
-use tabby_index::public::WebDocument;
+use tabby_index::public::{StructuredDoc, StructuredDocFields, StructuredDocIssueFields};
use crate::service::create_gitlab_client;
@@ -14,7 +14,7 @@ pub async fn list_github_issues(
api_base: &str,
full_name: &str,
access_token: &str,
-) -> Result, WebDocument)>> {
+) -> Result, StructuredDoc)>> {
let octocrab = Octocrab::builder()
.personal_token(access_token.to_string())
.base_uri(api_base)?
@@ -47,12 +47,14 @@ pub async fn list_github_issues(
let pages = response.number_of_pages().unwrap_or_default();
for issue in response.items {
- let doc = WebDocument {
+ let doc = StructuredDoc {
source_id: source_id.to_string(),
- id: issue.html_url.to_string(),
- link: issue.html_url.to_string(),
- title: issue.title,
- body: issue.body.unwrap_or_default(),
+ fields: StructuredDocFields::Issue(StructuredDocIssueFields {
+ link: issue.html_url.to_string(),
+ title: issue.title,
+ body: issue.body.unwrap_or_default(),
+ closed: issue.state == octocrab::models::IssueState::Closed,
+ })
};
yield (issue.updated_at, doc);
}
@@ -73,6 +75,7 @@ struct GitlabIssue {
description: Option,
web_url: String,
updated_at: DateTime,
+ state: String,
}
pub async fn list_gitlab_issues(
@@ -80,7 +83,7 @@ pub async fn list_gitlab_issues(
api_base: &str,
full_name: &str,
access_token: &str,
-) -> Result, WebDocument)>> {
+) -> Result, StructuredDoc)>> {
let gitlab = create_gitlab_client(api_base, access_token).await?;
let source_id = source_id.to_owned();
@@ -101,40 +104,16 @@ pub async fn list_gitlab_issues(
};
for issue in issues {
- let doc = WebDocument {
+ let doc = StructuredDoc {
source_id: source_id.to_owned(),
- id: issue.web_url.clone(),
+ fields: StructuredDocFields::Issue(StructuredDocIssueFields {
link: issue.web_url,
title: issue.title,
body: issue.description.unwrap_or_default(),
- };
+ closed: issue.state == "closed",
+ })};
yield (issue.updated_at, doc);
}
-
- let merge_requests: Vec = match gitlab::api::paged(
- MergeRequests::builder().project(&full_name).build().expect("Failed to build request"),
- gitlab::api::Pagination::All,
- )
- .query_async(&gitlab)
- .await {
- Ok(x) => x,
- Err(e) => {
- logkit::error!("Failed to fetch merge requests: {}", e);
- return;
- }
- };
-
- for merge_request in merge_requests {
- let doc = WebDocument {
- source_id: source_id.to_owned(),
- id: merge_request.web_url.clone(),
- link: merge_request.web_url,
- title: merge_request.title,
- body: merge_request.description.unwrap_or_default(),
- };
- yield (merge_request.updated_at, doc);
- }
-
};
Ok(s)
diff --git a/ee/tabby-webserver/src/service/background_job/web_crawler.rs b/ee/tabby-webserver/src/service/background_job/web_crawler.rs
index 3bd3a626be8d..8d4450310309 100644
--- a/ee/tabby-webserver/src/service/background_job/web_crawler.rs
+++ b/ee/tabby-webserver/src/service/background_job/web_crawler.rs
@@ -4,7 +4,9 @@ use chrono::Utc;
use futures::StreamExt;
use serde::{Deserialize, Serialize};
use tabby_crawler::crawl_pipeline;
-use tabby_index::public::{DocIndexer, WebDocument};
+use tabby_index::public::{
+ StructuredDoc, StructuredDocFields, StructuredDocIndexer, StructuredDocWebFields,
+};
use tabby_inference::Embedding;
use super::helper::Job;
@@ -35,18 +37,19 @@ impl WebCrawlerJob {
logkit::info!("Starting doc index pipeline for {}", self.url);
let embedding = embedding.clone();
let mut num_docs = 0;
- let indexer = DocIndexer::new(embedding.clone());
+ let indexer = StructuredDocIndexer::new(embedding.clone());
let url_prefix = self.url_prefix.as_ref().unwrap_or(&self.url);
let mut pipeline = Box::pin(crawl_pipeline(&self.url, url_prefix).await?);
while let Some(doc) = pipeline.next().await {
logkit::info!("Fetching {}", doc.url);
- let source_doc = WebDocument {
+ let source_doc = StructuredDoc {
source_id: self.source_id.clone(),
- id: doc.url.clone(),
- title: doc.metadata.title.unwrap_or_default(),
- link: doc.url,
- body: doc.markdown,
+ fields: StructuredDocFields::Web(StructuredDocWebFields {
+ title: doc.metadata.title.unwrap_or_default(),
+ link: doc.url,
+ body: doc.markdown,
+ }),
};
num_docs += 1;
diff --git a/ee/tabby-webserver/src/service/thread.rs b/ee/tabby-webserver/src/service/thread.rs
index 27fcd53c6acf..55f762842c5a 100644
--- a/ee/tabby-webserver/src/service/thread.rs
+++ b/ee/tabby-webserver/src/service/thread.rs
@@ -289,7 +289,7 @@ mod tests {
use tabby_common::{
api::{
code::{CodeSearch, CodeSearchParams},
- doc::DocSearch,
+ structured_doc::DocSearch,
},
config::AnswerConfig,
};
diff --git a/ee/tabby-webserver/src/webserver.rs b/ee/tabby-webserver/src/webserver.rs
index b5b5eeec77a3..a7e360955908 100644
--- a/ee/tabby-webserver/src/webserver.rs
+++ b/ee/tabby-webserver/src/webserver.rs
@@ -4,8 +4,8 @@ use axum::Router;
use tabby_common::{
api::{
code::CodeSearch,
- doc::DocSearch,
event::{ComposedLogger, EventLogger},
+ structured_doc::DocSearch,
},
config::Config,
};