Skip to content

Commit

Permalink
WIP: add commit in code search
Browse files Browse the repository at this point in the history
Signed-off-by: Wei Zhang <[email protected]>
  • Loading branch information
zwpaper committed Dec 17, 2024
1 parent 81de67b commit 6eb0493
Show file tree
Hide file tree
Showing 13 changed files with 74 additions and 17 deletions.
1 change: 1 addition & 0 deletions crates/tabby-common/src/api/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ pub struct CodeSearchDocument {
pub body: String,
pub filepath: String,
pub git_url: String,
pub commit: String,
pub language: String,
pub start_line: usize,
}
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-common/src/index/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::api::code::CodeSearchQuery;

pub mod fields {
pub const CHUNK_GIT_URL: &str = "chunk_git_url";
pub const CHUNK_COMMIT: &str = "chunk_commit";
pub const CHUNK_FILEPATH: &str = "chunk_filepath";
pub const CHUNK_LANGUAGE: &str = "chunk_language";
pub const CHUNK_BODY: &str = "chunk_body";
Expand Down
24 changes: 19 additions & 5 deletions crates/tabby-index/src/code/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::{pin::pin, sync::Arc};
use async_stream::stream;
use futures::StreamExt;
use ignore::{DirEntry, Walk};
use tabby_common::index::corpus;
use tabby_common::index::{code, corpus};
use tabby_inference::Embedding;
use tracing::warn;

Expand All @@ -21,7 +21,11 @@ static MIN_ALPHA_NUM_FRACTION: f32 = 0.25f32;
static MAX_NUMBER_OF_LINES: usize = 100000;
static MAX_NUMBER_FRACTION: f32 = 0.5f32;

pub async fn index_repository(embedding: Arc<dyn Embedding>, repository: &CodeRepository) {
pub async fn index_repository(
embedding: Arc<dyn Embedding>,
repository: &CodeRepository,
commit: &str,
) {
let total_files = Walk::new(repository.dir()).count();
let file_stream = stream! {
for file in Walk::new(repository.dir()) {
Expand All @@ -45,7 +49,7 @@ pub async fn index_repository(embedding: Arc<dyn Embedding>, repository: &CodeRe
let mut count_chunks = 0;
while let Some(files) = file_stream.next().await {
count_files += files.len();
count_chunks += add_changed_documents(repository, embedding.clone(), files).await;
count_chunks += add_changed_documents(repository, commit, embedding.clone(), files).await;
logkit::info!("Processed {count_files}/{total_files} files, updated {count_chunks} chunks",);
}
}
Expand Down Expand Up @@ -79,6 +83,7 @@ pub async fn garbage_collection() {

async fn add_changed_documents(
repository: &CodeRepository,
commit: &str,
embedding: Arc<dyn Embedding>,
files: Vec<DirEntry>,
) -> usize {
Expand All @@ -96,12 +101,11 @@ async fn add_changed_documents(

let id = SourceCode::to_index_id(&repository.source_id, &key).id;

// Skip if already indexed and has no failed chunks
if !require_updates(cloned_index.clone(), &id) {
continue;
}

let Some(code) = CodeIntelligence::compute_source_file(repository, file.path()) else {
let Some(code) = CodeIntelligence::compute_source_file(repository, commit, file.path()) else {
continue;
};

Expand Down Expand Up @@ -135,14 +139,24 @@ async fn add_changed_documents(
count_docs
}

/// Decides whether the document identified by `id` has to be (re)indexed.
///
/// The checks run in this order and stop at the first one that holds:
/// 1. Backfill: the document is missing the commit field.
/// 2. The document is not indexed yet.
/// 3. The document is indexed but has failed chunks.
///
/// Returns `false` only for a document that is already indexed, has no
/// failed chunks, and does not need a backfill.
fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool {
    should_backfill(indexer.clone(), id)
        || !indexer.is_indexed(id)
        || indexer.has_failed_chunks(id)
}

/// Reports whether the document identified by `id` needs to be backfilled
/// with the commit field.
///
/// v0.23.0 added the commit field to the code document, so a document for
/// which the indexer reports no `CHUNK_COMMIT` attribute field is rebuilt.
fn should_backfill(indexer: Arc<Indexer>, id: &str) -> bool {
    let has_commit = indexer.has_attribute_field(id, code::fields::CHUNK_COMMIT);
    !has_commit
}

fn is_valid_file(file: &SourceCode) -> bool {
file.max_line_length <= MAX_LINE_LENGTH_THRESHOLD
&& file.avg_line_length <= AVG_LINE_LENGTH_THRESHOLD
Expand Down
7 changes: 6 additions & 1 deletion crates/tabby-index/src/code/intelligence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ impl CodeIntelligence {
file_key.to_string() == item_key
}

pub fn compute_source_file(config: &CodeRepository, path: &Path) -> Option<SourceCode> {
pub fn compute_source_file(
config: &CodeRepository,
commit: &str,
path: &Path,
) -> Option<SourceCode> {
let source_file_id = Self::compute_source_file_id(path)?;

if path.is_dir() || !path.exists() {
Expand Down Expand Up @@ -114,6 +118,7 @@ impl CodeIntelligence {
source_file_id,
source_id: config.source_id.clone(),
git_url: config.canonical_git_url(),
commit: commit.to_owned(),
basedir: config.dir().display().to_string(),
filepath: relative_path.display().to_string(),
max_line_length,
Expand Down
5 changes: 3 additions & 2 deletions crates/tabby-index/src/code/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ impl CodeIndexer {
"Building source code index: {}",
repository.canonical_git_url()
);
repository::sync_repository(repository)?;
let commit = repository::sync_repository(repository)?;

index::index_repository(embedding, repository).await;
index::index_repository(embedding, repository, &commit).await;
index::garbage_collection().await;

Ok(())
Expand Down Expand Up @@ -102,6 +102,7 @@ impl IndexAttributeBuilder<SourceCode> for CodeBuilder {
let attributes = json!({
code::fields::CHUNK_FILEPATH: source_code.filepath,
code::fields::CHUNK_GIT_URL: source_code.git_url,
code::fields::CHUNK_COMMIT: source_code.commit,
code::fields::CHUNK_LANGUAGE: source_code.language,
code::fields::CHUNK_BODY: body,
code::fields::CHUNK_START_LINE: start_line,
Expand Down
24 changes: 16 additions & 8 deletions crates/tabby-index/src/code/repository.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ use tracing::warn;
use super::CodeRepository;

trait RepositoryExt {
fn sync(&self) -> anyhow::Result<()>;
fn sync(&self) -> anyhow::Result<String>;
}

impl RepositoryExt for CodeRepository {
fn sync(&self) -> anyhow::Result<()> {
// sync clones the repository if it doesn't exist; otherwise it pulls the remote.
// It then returns the id (hash) of the commit that HEAD points to.
fn sync(&self) -> anyhow::Result<String> {
let dir = self.dir();
let mut finished = false;
if dir.exists() {
Expand Down Expand Up @@ -47,10 +49,17 @@ impl RepositoryExt for CodeRepository {
}
}

Ok(())
get_commit_sha(&self)
}
}

/// Returns the id of the commit that `HEAD` points to in the repository's
/// working directory, formatted as a string.
///
/// Fails if the directory cannot be opened as a git repository, if `HEAD`
/// cannot be resolved, or if it cannot be peeled to a commit.
fn get_commit_sha(repository: &CodeRepository) -> anyhow::Result<String> {
    // Keep the repository bound to a local: the commit borrows from it.
    let repo = git2::Repository::open(repository.dir())?;
    let head_commit = repo.head()?.peel_to_commit()?;
    let sha = head_commit.id().to_string();
    Ok(sha)
}

fn pull_remote(path: &Path) -> bool {
let status = Command::new("git")
.current_dir(path)
Expand All @@ -71,16 +80,15 @@ fn pull_remote(path: &Path) -> bool {
true
}

pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<()> {
pub fn sync_repository(repository: &CodeRepository) -> anyhow::Result<String> {
if repository.is_local_dir() {
if !repository.dir().exists() {
panic!("Directory {} does not exist", repository.dir().display());
bail!("Directory {} does not exist", repository.dir().display());
}
get_commit_sha(repository)
} else {
repository.sync()?;
repository.sync()
}

Ok(())
}

pub fn garbage_collection(repositories: &[CodeRepository]) {
Expand Down
1 change: 1 addition & 0 deletions crates/tabby-index/src/code/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub struct SourceCode {
pub source_file_id: String,
pub source_id: String,
pub git_url: String,
pub commit: String,
pub basedir: String,
pub filepath: String,
pub language: String,
Expand Down
20 changes: 20 additions & 0 deletions crates/tabby/src/services/code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,14 @@ fn create_hit(scores: CodeSearchScores, doc: TantivyDocument) -> CodeSearchHit {
code::fields::CHUNK_GIT_URL,
)
.to_owned(),
// `commit` was introduced in v0.23 and is a required field of the hit, but
// documents indexed before that do not have it, so fall back to a default.
commit: get_json_text_field_or_default(
&doc,
schema.field_chunk_attributes,
code::fields::CHUNK_COMMIT,
)
.to_owned(),
language: get_json_text_field(
&doc,
schema.field_chunk_attributes,
Expand Down Expand Up @@ -228,6 +236,18 @@ fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name:
.unwrap()
}

/// Looks up the text value stored under `name` inside the JSON object held by
/// `field` of `doc`.
///
/// Returns an empty string when the field is absent, is not an object, has no
/// entry called `name`, or that entry is not a string.
fn get_json_text_field_or_default<'a>(
    doc: &'a TantivyDocument,
    field: schema::Field,
    name: &str,
) -> &'a str {
    let Some(mut entries) = doc.get_first(field).and_then(|value| value.as_object()) else {
        return "";
    };

    // First entry whose key matches `name`, if any.
    let found = entries.find(|(key, _)| *key == name);
    found
        .and_then(|(_, value)| value.as_str())
        .unwrap_or("")
}

struct CodeSearchService {
imp: CodeSearchImpl,
provider: Arc<IndexReaderProvider>,
Expand Down
1 change: 1 addition & 0 deletions ee/tabby-db/src/threads.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ pub struct ThreadMessageAttachmentAuthor {
#[derive(Serialize, Deserialize)]
pub struct ThreadMessageAttachmentCode {
pub git_url: String,
pub commit: Option<String>,
pub language: String,
pub filepath: String,
pub content: String,
Expand Down
1 change: 1 addition & 0 deletions ee/tabby-schema/graphql/schema.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,7 @@ type MessageAttachmentClientCode {

type MessageAttachmentCode {
gitUrl: String!
commit: String!
filepath: String!
language: String!
content: String!
Expand Down
2 changes: 2 additions & 0 deletions ee/tabby-schema/src/dao.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ impl From<ThreadMessageAttachmentCode> for thread::MessageAttachmentCode {
fn from(value: ThreadMessageAttachmentCode) -> Self {
Self {
git_url: value.git_url,
commit: value.commit.unwrap_or_default(),
filepath: value.filepath,
language: value.language,
content: value.content,
Expand All @@ -214,6 +215,7 @@ impl From<&thread::MessageAttachmentCode> for ThreadMessageAttachmentCode {
fn from(val: &thread::MessageAttachmentCode) -> Self {
ThreadMessageAttachmentCode {
git_url: val.git_url.clone(),
commit: Some(val.commit.clone()),
filepath: val.filepath.clone(),
language: val.language.clone(),
content: val.content.clone(),
Expand Down
2 changes: 2 additions & 0 deletions ee/tabby-schema/src/schema/thread/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ pub struct MessageAttachmentClientCode {
#[derive(GraphQLObject, Clone)]
pub struct MessageAttachmentCode {
pub git_url: String,
pub commit: String,
pub filepath: String,
pub language: String,
pub content: String,
Expand All @@ -82,6 +83,7 @@ impl From<CodeSearchDocument> for MessageAttachmentCode {
fn from(doc: CodeSearchDocument) -> Self {
Self {
git_url: doc.git_url,
commit: doc.commit,
filepath: doc.filepath,
language: doc.language,
content: doc.body,
Expand Down
2 changes: 1 addition & 1 deletion ee/tabby-webserver/src/service/answer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ pub async fn merge_code_snippets(

if let Some(file_content) = file_content {
debug!(
"file {} less than 200, it will be included whole file content",
"file {} less than 300, it will be included whole file content",
file_hits[0].doc.filepath
);
let mut insert_hit = file_hits[0].clone();
Expand Down

0 comments on commit 6eb0493

Please sign in to comment.