Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(index): add commit to CodeSearchDocument #3577

Merged
merged 10 commits into from
Dec 23, 2024
5 changes: 5 additions & 0 deletions crates/tabby-common/src/api/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ pub struct CodeSearchDocument {
pub body: String,
pub filepath: String,
pub git_url: String,

// FIXME(kweizh): This should be a required field after 0.25.0.
// `commit` is the repository revision (the HEAD commit id at indexing time) recorded when the file was last indexed.
pub commit: Option<String>,

pub language: String,
pub start_line: usize,
}
Expand Down
2 changes: 2 additions & 0 deletions crates/tabby-common/src/index/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use super::{corpus, IndexSchema};
use crate::api::code::CodeSearchQuery;

pub mod fields {
pub const ATTRIBUTE_COMMIT: &str = "commit";

pub const CHUNK_GIT_URL: &str = "chunk_git_url";
pub const CHUNK_FILEPATH: &str = "chunk_filepath";
pub const CHUNK_LANGUAGE: &str = "chunk_language";
Expand Down
35 changes: 20 additions & 15 deletions crates/tabby-common/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,21 @@ impl IndexSchema {
])
}

/// Builds a query that matches every index entry carrying the given `doc_id`
/// inside `corpus`. No clause filters on the chunk id, so the chunks stored
/// under the same `doc_id` are matched along with the document itself.
pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
    // Exact-term match on the id field.
    let id_term = Term::from_field_text(self.field_id, doc_id);
    let by_doc_id: Box<dyn Query> = Box::new(TermQuery::new(
        id_term,
        tantivy::schema::IndexRecordOption::Basic,
    ));
    let by_corpus = self.corpus_query(corpus);

    // Both clauses are mandatory: the entry must belong to the corpus and
    // must carry the requested doc id.
    BooleanQuery::new(vec![(Occur::Must, by_corpus), (Occur::Must, by_doc_id)])
}

pub fn doc_indexed_after(
&self,
corpus: &str,
Expand Down Expand Up @@ -261,21 +276,11 @@ impl IndexSchema {
FIELD_ATTRIBUTES, field
))),
),
])
}

/// Build a query to find the document with the given `doc_id`, include chunks.
pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
let doc_id_query = TermQuery::new(
Term::from_field_text(self.field_id, doc_id),
tantivy::schema::IndexRecordOption::Basic,
);

BooleanQuery::new(vec![
// Must match the corpus
(Occur::Must, self.corpus_query(corpus)),
// Must match the doc id
(Occur::Must, Box::new(doc_id_query)),
// Exclude chunk documents
(
Occur::MustNot,
Box::new(ExistsQuery::new_exists_query(FIELD_CHUNK_ID.into())),
),
])
}

Expand Down
59 changes: 52 additions & 7 deletions crates/tabby-index/src/code/index.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use std::{pin::pin, sync::Arc};
use std::{path::Path, pin::pin, sync::Arc};

use anyhow::Result;
use async_stream::stream;
use futures::StreamExt;
use ignore::{DirEntry, Walk};
use tabby_common::index::corpus;
use tabby_common::index::{code, corpus};
use tabby_inference::Embedding;
use tracing::warn;

Expand All @@ -12,7 +13,7 @@
intelligence::{CodeIntelligence, SourceCode},
CodeRepository,
};
use crate::indexer::Indexer;
use crate::indexer::{Indexer, TantivyDocBuilder};

// Magic numbers
static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
Expand All @@ -21,7 +22,11 @@
static MAX_NUMBER_OF_LINES: usize = 100000;
static MAX_NUMBER_FRACTION: f32 = 0.5f32;

pub async fn index_repository(embedding: Arc<dyn Embedding>, repository: &CodeRepository) {
pub async fn index_repository(
embedding: Arc<dyn Embedding>,
repository: &CodeRepository,
commit: &str,
) {

Check warning on line 29 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L25-L29

Added lines #L25 - L29 were not covered by tests
let total_files = Walk::new(repository.dir()).count();
let file_stream = stream! {
for file in Walk::new(repository.dir()) {
Expand All @@ -45,7 +50,7 @@
let mut count_chunks = 0;
while let Some(files) = file_stream.next().await {
count_files += files.len();
count_chunks += add_changed_documents(repository, embedding.clone(), files).await;
count_chunks += add_changed_documents(repository, commit, embedding.clone(), files).await;

Check warning on line 53 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L53

Added line #L53 was not covered by tests
logkit::info!("Processed {count_files}/{total_files} files, updated {count_chunks} chunks",);
}
}
Expand Down Expand Up @@ -79,6 +84,7 @@

async fn add_changed_documents(
repository: &CodeRepository,
commit: &str,

Check warning on line 87 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L87

Added line #L87 was not covered by tests
embedding: Arc<dyn Embedding>,
files: Vec<DirEntry>,
) -> usize {
Expand All @@ -96,12 +102,23 @@

let id = SourceCode::to_index_id(&repository.source_id, &key).id;

// Skip if already indexed and has no failed chunks
// Skip if already indexed and has no failed chunks.
// When skipping, check whether the document still needs its commit backfilled.

Check warning on line 106 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L105-L106

Added lines #L105 - L106 were not covered by tests
if !require_updates(cloned_index.clone(), &id) {
backfill_commit_in_doc_if_needed(
builder.clone(),
cloned_index.clone(),
&id,
repository,
commit,
file.path()).await.unwrap_or_else(|e| {
warn!("Failed to backfill commit for {id}: {e}");
}
);

Check warning on line 117 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L108-L117

Added lines #L108 - L117 were not covered by tests
continue;
}

let Some(code) = CodeIntelligence::compute_source_file(repository, file.path()) else {
let Some(code) = CodeIntelligence::compute_source_file(repository, commit, file.path()) else {

Check warning on line 121 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L121

Added line #L121 was not covered by tests
continue;
};

Expand Down Expand Up @@ -143,6 +160,34 @@
true
}

// The commit field was added to the code document in v0.23.0; documents indexed before then lack it and are backfilled here.
async fn backfill_commit_in_doc_if_needed(
builder: Arc<TantivyDocBuilder<SourceCode>>,
indexer: Arc<Indexer>,
id: &str,
repository: &CodeRepository,
commit: &str,
path: &Path,
) -> Result<()> {
if indexer.has_attribute_field(id, code::fields::ATTRIBUTE_COMMIT) {
return Ok(());
}

Check warning on line 174 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L164-L174

Added lines #L164 - L174 were not covered by tests

let code = CodeIntelligence::compute_source_file(repository, commit, path)
.ok_or_else(|| anyhow::anyhow!("Failed to compute source file"))?;
if !is_valid_file(&code) {
anyhow::bail!("Invalid file");
}

Check warning on line 180 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L176-L180

Added lines #L176 - L180 were not covered by tests

let origin = indexer.get_doc(id).await?;
indexer.delete_doc(id);
indexer
.add(builder.backfill_doc_attributes(&origin, &code).await)
.await;

Check warning on line 186 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L182-L186

Added lines #L182 - L186 were not covered by tests

Ok(())
}

Check warning on line 189 in crates/tabby-index/src/code/index.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/index.rs#L188-L189

Added lines #L188 - L189 were not covered by tests

fn is_valid_file(file: &SourceCode) -> bool {
file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD
&& file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD
Expand Down
13 changes: 10 additions & 3 deletions crates/tabby-index/src/code/intelligence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ impl CodeIntelligence {
file_key.to_string() == item_key
}

pub fn compute_source_file(config: &CodeRepository, path: &Path) -> Option<SourceCode> {
pub fn compute_source_file(
config: &CodeRepository,
commit: &str,
path: &Path,
) -> Option<SourceCode> {
let source_file_id = Self::compute_source_file_id(path)?;

if path.is_dir() || !path.exists() {
Expand Down Expand Up @@ -114,6 +118,7 @@ impl CodeIntelligence {
source_file_id,
source_id: config.source_id.clone(),
git_url: config.canonical_git_url(),
commit: commit.to_owned(),
basedir: config.dir().display().to_string(),
filepath: relative_path.display().to_string(),
max_line_length,
Expand Down Expand Up @@ -260,12 +265,14 @@ mod tests {
fn test_create_source_file() {
set_tabby_root(get_tabby_root());
let config = get_repository_config();
let source_file = CodeIntelligence::compute_source_file(&config, &get_rust_source_file())
.expect("Failed to create source file");
let source_file =
CodeIntelligence::compute_source_file(&config, "commit", &get_rust_source_file())
.expect("Failed to create source file");

// check source_file properties
assert_eq!(source_file.language, "rust");
assert_eq!(source_file.tags.len(), 3);
assert_eq!(source_file.filepath, "rust.rs");
assert_eq!(source_file.commit, "commit");
}
}
10 changes: 6 additions & 4 deletions crates/tabby-index/src/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@
"Building source code index: {}",
repository.canonical_git_url()
);
repository::sync_repository(repository)?;
let commit = repository::sync_repository(repository)?;

Check warning on line 41 in crates/tabby-index/src/code/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/mod.rs#L41

Added line #L41 was not covered by tests

index::index_repository(embedding, repository).await;
index::index_repository(embedding, repository, &commit).await;

Check warning on line 43 in crates/tabby-index/src/code/mod.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/mod.rs#L43

Added line #L43 was not covered by tests
index::garbage_collection().await;

Ok(())
Expand All @@ -62,8 +62,10 @@

#[async_trait]
impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
async fn build_attributes(&self, _source_code: &SourceCode) -> serde_json::Value {
json!({})
async fn build_attributes(&self, source_code: &SourceCode) -> serde_json::Value {
json!({
code::fields::ATTRIBUTE_COMMIT: source_code.commit,
})
}

async fn build_chunk_attributes<'a>(
Expand Down
24 changes: 16 additions & 8 deletions crates/tabby-index/src/code/repository.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@
use super::CodeRepository;

trait RepositoryExt {
fn sync(&self) -> anyhow::Result<()>;
fn sync(&self) -> anyhow::Result<String>;
}

impl RepositoryExt for CodeRepository {
fn sync(&self) -> anyhow::Result<()> {
// sync clones the repository if it doesn't exist, otherwise it pulls the remote,
// and returns the id (SHA) of the HEAD commit as a string.
fn sync(&self) -> anyhow::Result<String> {

Check warning on line 21 in crates/tabby-index/src/code/repository.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/repository.rs#L21

Added line #L21 was not covered by tests
let dir = self.dir();
let mut finished = false;
if dir.exists() {
Expand Down Expand Up @@ -47,10 +49,17 @@
}
}

Ok(())
get_commit_sha(self)

Check warning on line 52 in crates/tabby-index/src/code/repository.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/repository.rs#L52

Added line #L52 was not covered by tests
}
}

/// Opens the git repository located at `repository.dir()` and returns the id
/// of the commit that `HEAD` resolves to, formatted as a string.
///
/// Returns an error if the repository cannot be opened, `HEAD` cannot be
/// read, or `HEAD` cannot be peeled to a commit.
fn get_commit_sha(repository: &CodeRepository) -> anyhow::Result<String> {
    let git_repo = git2::Repository::open(repository.dir())?;
    let head_commit = git_repo.head()?.peel_to_commit()?;
    let sha = head_commit.id().to_string();
    Ok(sha)
}

Check warning on line 61 in crates/tabby-index/src/code/repository.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/repository.rs#L56-L61

Added lines #L56 - L61 were not covered by tests

fn pull_remote(path: &Path) -> bool {
let status = Command::new("git")
.current_dir(path)
Expand All @@ -71,16 +80,15 @@
true
}

pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<()> {
pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<String> {

Check warning on line 83 in crates/tabby-index/src/code/repository.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/repository.rs#L83

Added line #L83 was not covered by tests
if repository.is_local_dir() {
if !repository.dir().exists() {
panic!("Directory {} does not exist", repository.dir().display());
bail!("Directory {} does not exist", repository.dir().display());

Check warning on line 86 in crates/tabby-index/src/code/repository.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/repository.rs#L86

Added line #L86 was not covered by tests
}
get_commit_sha(repository)

Check warning on line 88 in crates/tabby-index/src/code/repository.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/repository.rs#L88

Added line #L88 was not covered by tests
} else {
repository.sync()?;
repository.sync()

Check warning on line 90 in crates/tabby-index/src/code/repository.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/code/repository.rs#L90

Added line #L90 was not covered by tests
}

Ok(())
}

pub fn garbage_collection(repositories: &[CodeRepository]) {
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-index/src/code/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub struct SourceCode {
pub source_file_id: String,
pub source_id: String,
pub git_url: String,
pub commit: String,
pub basedir: String,
pub filepath: String,
pub language: String,
Expand Down
54 changes: 54 additions & 0 deletions crates/tabby-index/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,26 @@
}
}
}

pub async fn backfill_doc_attributes(
&self,
origin: &TantivyDocument,
doc: &T,
) -> TantivyDocument {
let schema = IndexSchema::instance();
let mut doc = doc! {
schema.field_id => get_text(origin, schema.field_id),
schema.field_source_id => get_text(origin, schema.field_source_id).to_string(),
schema.field_corpus => get_text(origin, schema.field_corpus).to_string(),
schema.field_attributes => self.builder.build_attributes(doc).await,
schema.field_updated_at => get_date(origin, schema.field_updated_at),

Check warning on line 182 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L171-L182

Added lines #L171 - L182 were not covered by tests
};
if let Some(failed_chunks) = get_number_optional(origin, schema.field_failed_chunks_count) {
doc.add_u64(schema.field_failed_chunks_count, failed_chunks as u64);
}

Check warning on line 186 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L184-L186

Added lines #L184 - L186 were not covered by tests

doc
}

Check warning on line 189 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L188-L189

Added lines #L188 - L189 were not covered by tests
}

pub struct Indexer {
Expand Down Expand Up @@ -197,13 +217,39 @@
.expect("Failed to add document");
}

pub async fn get_doc(&self, id: &str) -> Result<TantivyDocument> {
let schema = IndexSchema::instance();
let query = schema.doc_query(&self.corpus, id);
let docs = match self.searcher.search(&query, &TopDocs::with_limit(1)) {
Ok(docs) => docs,
Err(e) => {
debug!("query tantivy error: {}", e);
return Err(e.into());

Check warning on line 227 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L220-L227

Added lines #L220 - L227 were not covered by tests
}
};
if docs.is_empty() {
bail!("Document not found: {}", id);
}

self.searcher
.doc(docs.first().unwrap().1)
.map_err(|e| e.into())
}

Check warning on line 237 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L230-L237

Added lines #L230 - L237 were not covered by tests

/// Queues deletion of everything in this indexer's corpus that matches
/// `IndexSchema::doc_query_with_chunks` for `id`.
///
/// The result of `delete_query` is intentionally discarded.
pub fn delete(&self, id: &str) {
    let schema = IndexSchema::instance();
    let query = schema.doc_query_with_chunks(&self.corpus, id);
    let _ = self.writer.delete_query(Box::new(query));
}

/// Queues deletion of everything in this indexer's corpus that matches
/// `IndexSchema::doc_query` for `id`.
///
/// NOTE(review): presumably this removes only the document entry and leaves
/// its chunks in place (unlike `delete`) -- confirm against `doc_query`.
///
/// The result of `delete_query` is intentionally discarded.
pub fn delete_doc(&self, id: &str) {
    let schema = IndexSchema::instance();
    let query = schema.doc_query(&self.corpus, id);
    let _ = self.writer.delete_query(Box::new(query));
}

Check warning on line 251 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L246-L251

Added lines #L246 - L251 were not covered by tests

pub fn commit(mut self) {
self.writer.commit().expect("Failed to commit changes");
self.writer
Expand Down Expand Up @@ -369,3 +415,11 @@
/// Reads the first value stored under `field` in `doc` as a string slice.
///
/// Panics if `doc` has no value for `field`, or if `as_str` returns `None`
/// for that value.
fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
    let first = doc.get_first(field).unwrap();
    first.as_str().unwrap()
}

/// Reads the first value stored under `field` in `doc` as a date-time.
///
/// Panics if `doc` has no value for `field`, or if `as_datetime` returns
/// `None` for that value.
fn get_date(doc: &TantivyDocument, field: schema::Field) -> tantivy::DateTime {
    let first = doc.get_first(field).unwrap();
    first.as_datetime().unwrap()
}

Check warning on line 421 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L419-L421

Added lines #L419 - L421 were not covered by tests

/// Reads the first value stored under `field` in `doc` as an `i64`.
///
/// Returns `None` when `doc` has no value for `field` or when `as_i64`
/// yields `None` for that value; never panics.
fn get_number_optional(doc: &TantivyDocument, field: schema::Field) -> Option<i64> {
    match doc.get_first(field) {
        Some(value) => value.as_i64(),
        None => None,
    }
}

Check warning on line 425 in crates/tabby-index/src/indexer.rs

View check run for this annotation

Codecov / codecov/patch

crates/tabby-index/src/indexer.rs#L423-L425

Added lines #L423 - L425 were not covered by tests
3 changes: 2 additions & 1 deletion crates/tabby-index/src/indexer_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,8 @@ mod builder_tests {
let builder = Arc::new(create_code_builder(Some(Arc::new(embedding))));

let repo = get_repository_config();
let code = CodeIntelligence::compute_source_file(&repo, &get_rust_source_file()).unwrap();
let code = CodeIntelligence::compute_source_file(&repo, "commit", &get_rust_source_file())
.unwrap();
let index_id = code.to_index_id();

let (id, s) = tokio::runtime::Runtime::new()
Expand Down
Loading
Loading