Skip to content

Commit

Permalink
feat(index): add commit to CodeSearchDocument (#3577)
Browse files Browse the repository at this point in the history
* WIP: add commit in code search

Signed-off-by: Wei Zhang <[email protected]>

* chore: fix tests

Signed-off-by: Wei Zhang <[email protected]>

* chore: commit fields as optional currently

Signed-off-by: Wei Zhang <[email protected]>

* chore: fix tests

Signed-off-by: Wei Zhang <[email protected]>

* [autofix.ci] apply automated fixes

* chore: commit should be in doc.attribute

Signed-off-by: Wei Zhang <[email protected]>

* [autofix.ci] apply automated fixes

* chore: add commit when create_hit

Signed-off-by: Wei Zhang <[email protected]>

* chore: add comment to note commit is the last updated commit

Signed-off-by: Wei Zhang <[email protected]>

* backfill commit in source code without redo calculate (#3587)

---------

Signed-off-by: Wei Zhang <[email protected]>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
  • Loading branch information
zwpaper and autofix-ci[bot] authored Dec 23, 2024
1 parent 69eb044 commit cbf34f6
Show file tree
Hide file tree
Showing 17 changed files with 242 additions and 50 deletions.
5 changes: 5 additions & 0 deletions crates/tabby-common/src/api/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ pub struct CodeSearchDocument {
pub body: String,
pub filepath: String,
pub git_url: String,

// FIXME(kweizh): This should be a required field after 0.25.0.
// commit represents the specific revision at which the file was last edited.
pub commit: Option<String>,

pub language: String,
pub start_line: usize,
}
Expand Down
2 changes: 2 additions & 0 deletions crates/tabby-common/src/index/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use super::{corpus, IndexSchema};
use crate::api::code::CodeSearchQuery;

pub mod fields {
pub const ATTRIBUTE_COMMIT: &str = "commit";

pub const CHUNK_GIT_URL: &str = "chunk_git_url";
pub const CHUNK_FILEPATH: &str = "chunk_filepath";
pub const CHUNK_LANGUAGE: &str = "chunk_language";
Expand Down
35 changes: 20 additions & 15 deletions crates/tabby-common/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,21 @@ impl IndexSchema {
])
}

/// Build a query to find the document with the given `doc_id`, include chunks.
pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
let doc_id_query = TermQuery::new(
Term::from_field_text(self.field_id, doc_id),
tantivy::schema::IndexRecordOption::Basic,
);

BooleanQuery::new(vec![
// Must match the corpus
(Occur::Must, self.corpus_query(corpus)),
// Must match the doc id
(Occur::Must, Box::new(doc_id_query)),
])
}

pub fn doc_indexed_after(
&self,
corpus: &str,
Expand Down Expand Up @@ -261,21 +276,11 @@ impl IndexSchema {
FIELD_ATTRIBUTES, field
))),
),
])
}

/// Build a query to find the document with the given `doc_id`, include chunks.
pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
let doc_id_query = TermQuery::new(
Term::from_field_text(self.field_id, doc_id),
tantivy::schema::IndexRecordOption::Basic,
);

BooleanQuery::new(vec![
// Must match the corpus
(Occur::Must, self.corpus_query(corpus)),
// Must match the doc id
(Occur::Must, Box::new(doc_id_query)),
// Exclude chunk documents
(
Occur::MustNot,
Box::new(ExistsQuery::new_exists_query(FIELD_CHUNK_ID.into())),
),
])
}

Expand Down
59 changes: 52 additions & 7 deletions crates/tabby-index/src/code/index.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use std::{pin::pin, sync::Arc};
use std::{path::Path, pin::pin, sync::Arc};

use anyhow::Result;
use async_stream::stream;
use futures::StreamExt;
use ignore::{DirEntry, Walk};
use tabby_common::index::corpus;
use tabby_common::index::{code, corpus};
use tabby_inference::Embedding;
use tracing::warn;

Expand All @@ -12,7 +13,7 @@ use super::{
intelligence::{CodeIntelligence, SourceCode},
CodeRepository,
};
use crate::indexer::Indexer;
use crate::indexer::{Indexer, TantivyDocBuilder};

// Magic numbers
static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
Expand All @@ -21,7 +22,11 @@ static MIN_ALPHA_NUM_FRACTION: f32 = 0.25f32;
static MAX_NUMBER_OF_LINES: usize = 100000;
static MAX_NUMBER_FRACTION: f32 = 0.5f32;

pub async fn index_repository(embedding: Arc<dyn Embedding>, repository: &CodeRepository) {
pub async fn index_repository(
embedding: Arc<dyn Embedding>,
repository: &CodeRepository,
commit: &str,
) {
let total_files = Walk::new(repository.dir()).count();
let file_stream = stream! {
for file in Walk::new(repository.dir()) {
Expand All @@ -45,7 +50,7 @@ pub async fn index_repository(embedding: Arc<dyn Embedding>, repository: &CodeRe
let mut count_chunks = 0;
while let Some(files) = file_stream.next().await {
count_files += files.len();
count_chunks += add_changed_documents(repository, embedding.clone(), files).await;
count_chunks += add_changed_documents(repository, commit, embedding.clone(), files).await;
logkit::info!("Processed {count_files}/{total_files} files, updated {count_chunks} chunks",);
}
}
Expand Down Expand Up @@ -79,6 +84,7 @@ pub async fn garbage_collection() {

async fn add_changed_documents(
repository: &CodeRepository,
commit: &str,
embedding: Arc<dyn Embedding>,
files: Vec<DirEntry>,
) -> usize {
Expand All @@ -96,12 +102,23 @@ async fn add_changed_documents(

let id = SourceCode::to_index_id(&repository.source_id, &key).id;

// Skip if already indexed and has no failed chunks
// Skip if already indexed and has no failed chunks,
// when skip, we should check if the document needs to be backfilled.
if !require_updates(cloned_index.clone(), &id) {
backfill_commit_in_doc_if_needed(
builder.clone(),
cloned_index.clone(),
&id,
repository,
commit,
file.path()).await.unwrap_or_else(|e| {
warn!("Failed to backfill commit for {id}: {e}");
}
);
continue;
}

let Some(code) = CodeIntelligence::compute_source_file(repository, file.path()) else {
let Some(code) = CodeIntelligence::compute_source_file(repository, commit, file.path()) else {
continue;
};

Expand Down Expand Up @@ -143,6 +160,34 @@ fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool {
true
}

// v0.23.0 add the commit field to the code document.
async fn backfill_commit_in_doc_if_needed(
builder: Arc<TantivyDocBuilder<SourceCode>>,
indexer: Arc<Indexer>,
id: &str,
repository: &CodeRepository,
commit: &str,
path: &Path,
) -> Result<()> {
if indexer.has_attribute_field(id, code::fields::ATTRIBUTE_COMMIT) {
return Ok(());
}

let code = CodeIntelligence::compute_source_file(repository, commit, path)
.ok_or_else(|| anyhow::anyhow!("Failed to compute source file"))?;
if !is_valid_file(&code) {
anyhow::bail!("Invalid file");
}

let origin = indexer.get_doc(id).await?;
indexer.delete_doc(id);
indexer
.add(builder.backfill_doc_attributes(&origin, &code).await)
.await;

Ok(())
}

fn is_valid_file(file: &SourceCode) -> bool {
file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD
&& file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD
Expand Down
13 changes: 10 additions & 3 deletions crates/tabby-index/src/code/intelligence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ impl CodeIntelligence {
file_key.to_string() == item_key
}

pub fn compute_source_file(config: &CodeRepository, path: &Path) -> Option<SourceCode> {
pub fn compute_source_file(
config: &CodeRepository,
commit: &str,
path: &Path,
) -> Option<SourceCode> {
let source_file_id = Self::compute_source_file_id(path)?;

if path.is_dir() || !path.exists() {
Expand Down Expand Up @@ -114,6 +118,7 @@ impl CodeIntelligence {
source_file_id,
source_id: config.source_id.clone(),
git_url: config.canonical_git_url(),
commit: commit.to_owned(),
basedir: config.dir().display().to_string(),
filepath: relative_path.display().to_string(),
max_line_length,
Expand Down Expand Up @@ -260,12 +265,14 @@ mod tests {
fn test_create_source_file() {
set_tabby_root(get_tabby_root());
let config = get_repository_config();
let source_file = CodeIntelligence::compute_source_file(&config, &get_rust_source_file())
.expect("Failed to create source file");
let source_file =
CodeIntelligence::compute_source_file(&config, "commit", &get_rust_source_file())
.expect("Failed to create source file");

// check source_file properties
assert_eq!(source_file.language, "rust");
assert_eq!(source_file.tags.len(), 3);
assert_eq!(source_file.filepath, "rust.rs");
assert_eq!(source_file.commit, "commit");
}
}
10 changes: 6 additions & 4 deletions crates/tabby-index/src/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ impl CodeIndexer {
"Building source code index: {}",
repository.canonical_git_url()
);
repository::sync_repository(repository)?;
let commit = repository::sync_repository(repository)?;

index::index_repository(embedding, repository).await;
index::index_repository(embedding, repository, &commit).await;
index::garbage_collection().await;

Ok(())
Expand All @@ -62,8 +62,10 @@ impl CodeBuilder {

#[async_trait]
impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
async fn build_attributes(&self, _source_code: &SourceCode) -> serde_json::Value {
json!({})
async fn build_attributes(&self, source_code: &SourceCode) -> serde_json::Value {
json!({
code::fields::ATTRIBUTE_COMMIT: source_code.commit,
})
}

async fn build_chunk_attributes<'a>(
Expand Down
24 changes: 16 additions & 8 deletions crates/tabby-index/src/code/repository.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ use tracing::warn;
use super::CodeRepository;

trait RepositoryExt {
fn sync(&self) -> anyhow::Result<()>;
fn sync(&self) -> anyhow::Result<String>;
}

impl RepositoryExt for CodeRepository {
fn sync(&self) -> anyhow::Result<()> {
// sync clones the repository if it doesn't exist, otherwise it pulls the remote,
// and returns the id (hex string) of the commit that HEAD points to.
fn sync(&self) -> anyhow::Result<String> {
let dir = self.dir();
let mut finished = false;
if dir.exists() {
Expand Down Expand Up @@ -47,10 +49,17 @@ impl RepositoryExt for CodeRepository {
}
}

Ok(())
get_commit_sha(self)
}
}

/// Returns the id of the commit that `HEAD` points to in the repository's directory,
/// formatted as a string.
///
/// # Errors
///
/// Propagates any `git2` error raised while opening the repository, resolving `HEAD`,
/// or peeling it to a commit.
fn get_commit_sha(repository: &CodeRepository) -> anyhow::Result<String> {
    let git_repo = git2::Repository::open(repository.dir())?;
    let head_commit = git_repo.head()?.peel_to_commit()?;
    Ok(head_commit.id().to_string())
}

fn pull_remote(path: &Path) -> bool {
let status = Command::new("git")
.current_dir(path)
Expand All @@ -71,16 +80,15 @@ fn pull_remote(path: &Path) -> bool {
true
}

pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<()> {
pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<String> {
if repository.is_local_dir() {
if !repository.dir().exists() {
panic!("Directory {} does not exist", repository.dir().display());
bail!("Directory {} does not exist", repository.dir().display());
}
get_commit_sha(repository)
} else {
repository.sync()?;
repository.sync()
}

Ok(())
}

pub fn garbage_collection(repositories: &[CodeRepository]) {
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-index/src/code/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub struct SourceCode {
pub source_file_id: String,
pub source_id: String,
pub git_url: String,
pub commit: String,
pub basedir: String,
pub filepath: String,
pub language: String,
Expand Down
54 changes: 54 additions & 0 deletions crates/tabby-index/src/indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,26 @@ impl<T: ToIndexId> TantivyDocBuilder<T> {
}
}
}

pub async fn backfill_doc_attributes(
&self,
origin: &TantivyDocument,
doc: &T,
) -> TantivyDocument {
let schema = IndexSchema::instance();
let mut doc = doc! {
schema.field_id => get_text(origin, schema.field_id),
schema.field_source_id => get_text(origin, schema.field_source_id).to_string(),
schema.field_corpus => get_text(origin, schema.field_corpus).to_string(),
schema.field_attributes => self.builder.build_attributes(doc).await,
schema.field_updated_at => get_date(origin, schema.field_updated_at),
};
if let Some(failed_chunks) = get_number_optional(origin, schema.field_failed_chunks_count) {
doc.add_u64(schema.field_failed_chunks_count, failed_chunks as u64);
}

doc
}
}

pub struct Indexer {
Expand Down Expand Up @@ -197,13 +217,39 @@ impl Indexer {
.expect("Failed to add document");
}

pub async fn get_doc(&self, id: &str) -> Result<TantivyDocument> {
let schema = IndexSchema::instance();
let query = schema.doc_query(&self.corpus, id);
let docs = match self.searcher.search(&query, &TopDocs::with_limit(1)) {
Ok(docs) => docs,
Err(e) => {
debug!("query tantivy error: {}", e);
return Err(e.into());
}
};
if docs.is_empty() {
bail!("Document not found: {}", id);
}

self.searcher
.doc(docs.first().unwrap().1)
.map_err(|e| e.into())
}

/// Asks the index writer to delete everything matched by `doc_query_with_chunks` for
/// `id` in this indexer's corpus.
///
/// The value returned by the writer is deliberately discarded.
pub fn delete(&self, id: &str) {
    let schema = IndexSchema::instance();
    let target = schema.doc_query_with_chunks(&self.corpus, id);
    let _ = self.writer.delete_query(Box::new(target));
}

/// Asks the index writer to delete what `doc_query` matches for `id` in this
/// indexer's corpus.
///
/// The value returned by the writer is deliberately discarded.
pub fn delete_doc(&self, id: &str) {
    let schema = IndexSchema::instance();
    let target = schema.doc_query(&self.corpus, id);
    let _ = self.writer.delete_query(Box::new(target));
}

pub fn commit(mut self) {
self.writer.commit().expect("Failed to commit changes");
self.writer
Expand Down Expand Up @@ -369,3 +415,11 @@ impl IndexGarbageCollector {
/// Returns the first value of `field` in `doc` as a string slice.
///
/// # Panics
///
/// Panics when `doc` has no value for `field` or when that value is not a string;
/// callers are expected to pass only fields that are always stored as text.
fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
    doc.get_first(field)
        .expect("text field is missing from the document")
        .as_str()
        .expect("field value is not a string")
}

/// Returns the first value of `field` in `doc` as a date-time.
///
/// # Panics
///
/// Panics when `doc` has no value for `field` or when that value is not a date-time;
/// callers are expected to pass only fields that are always stored as dates.
fn get_date(doc: &TantivyDocument, field: schema::Field) -> tantivy::DateTime {
    doc.get_first(field)
        .expect("date field is missing from the document")
        .as_datetime()
        .expect("field value is not a date-time")
}

fn get_number_optional(doc: &TantivyDocument, field: schema::Field) -> Option<i64> {
doc.get_first(field)?.as_i64()
}
3 changes: 2 additions & 1 deletion crates/tabby-index/src/indexer_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,8 @@ mod builder_tests {
let builder = Arc::new(create_code_builder(Some(Arc::new(embedding))));

let repo = get_repository_config();
let code = CodeIntelligence::compute_source_file(&repo, &get_rust_source_file()).unwrap();
let code = CodeIntelligence::compute_source_file(&repo, "commit", &get_rust_source_file())
.unwrap();
let index_id = code.to_index_id();

let (id, s) = tokio::runtime::Runtime::new()
Expand Down
Loading

0 comments on commit cbf34f6

Please sign in to comment.