Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add file filter to chunker #53

Merged
merged 5 commits into from
Jun 15, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions ragdaemon/annotators/call_graph.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import asyncio
from functools import partial
import json
from functools import partial
from pathlib import Path
from typing import Any, Optional

from tqdm.asyncio import tqdm
from spice import SpiceMessages
from spice.models import TextModel
from tqdm.asyncio import tqdm

from ragdaemon.annotators.base_annotator import Annotator
from ragdaemon.database import Database, remove_update_db_duplicates
from ragdaemon.graph import KnowledgeGraph
from ragdaemon.errors import RagdaemonError
from ragdaemon.graph import KnowledgeGraph
from ragdaemon.utils import (
DEFAULT_CODE_EXTENSIONS,
DEFAULT_COMPLETION_MODEL,
Expand Down
21 changes: 14 additions & 7 deletions ragdaemon/annotators/chunker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@
from copy import deepcopy
from functools import partial
from pathlib import Path
from typing import Optional, Set

from astroid.exceptions import AstroidSyntaxError
from tqdm.asyncio import tqdm

from ragdaemon.annotators.base_annotator import Annotator
from ragdaemon.annotators.chunker.chunk_astroid import chunk_document as chunk_astroid
from ragdaemon.annotators.chunker.chunk_line import chunk_document as chunk_line
from ragdaemon.annotators.chunker.chunk_llm import chunk_document as chunk_llm
from ragdaemon.annotators.chunker.utils import resolve_chunk_parent
from ragdaemon.database import (
Database,
remove_add_to_db_duplicates,
remove_update_db_duplicates,
)
from ragdaemon.annotators.chunker.utils import resolve_chunk_parent
from ragdaemon.annotators.chunker.chunk_astroid import chunk_document as chunk_astroid
from ragdaemon.annotators.chunker.chunk_llm import chunk_document as chunk_llm
from ragdaemon.annotators.chunker.chunk_line import chunk_document as chunk_line

from ragdaemon.errors import RagdaemonError
from ragdaemon.graph import KnowledgeGraph
from ragdaemon.utils import (
Expand All @@ -33,9 +33,13 @@ class Chunker(Annotator):
name = "chunker"
chunk_field_id = "chunks"

def __init__(self, *args, use_llm: bool = False, **kwargs):
def __init__(
self, *args, files: Optional[Set[str]] = None, use_llm: bool = False, **kwargs
PCSwingle marked this conversation as resolved.
Show resolved Hide resolved
):
super().__init__(*args, **kwargs)

self.files = files

# By default, use either the LLM chunker or a basic line chunker.
if use_llm and self.spice_client is not None:
default_chunk_fn = partial(
Expand Down Expand Up @@ -120,6 +124,9 @@ async def annotate(
tasks = []
files_just_chunked = set()
for node, data in files_with_chunks:
if self.files != None and node not in self.files:
PCSwingle marked this conversation as resolved.
Show resolved Hide resolved
continue

if (
match_refresh(refresh, node)
or data.get(self.chunk_field_id, None) is None
Expand All @@ -146,7 +153,7 @@ async def annotate(
# 1. Add all chunks to graph
checksums = dict[str, str]()
for file, data in files_with_chunks:
if len(data[self.chunk_field_id]) == 0:
if self.chunk_field_id not in data or len(data[self.chunk_field_id]) == 0:
PCSwingle marked this conversation as resolved.
Show resolved Hide resolved
continue
# Sort such that "parents" are added before "children"
base_id = f"{file}:BASE"
Expand Down
7 changes: 5 additions & 2 deletions ragdaemon/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import json
import time
from pathlib import Path
from typing import Any, Iterable, Optional
from typing import Any, Dict, Iterable, Optional

from networkx.readwrite import json_graph
from spice import Spice
Expand Down Expand Up @@ -73,6 +73,9 @@ def __init__(
if self.verbose > 1:
print("Initialized empty graph.")

self.set_annotators(annotators)

def set_annotators(self, annotators: Optional[Dict[str, Dict]] = None):
annotators = annotators if annotators is not None else default_annotators()
if self.verbose > 1:
print(f"Initializing annotators: {list(annotators.keys())}...")
Expand All @@ -81,7 +84,7 @@ def __init__(
self.pipeline[ann] = annotators_map[ann](
**kwargs,
verbose=self.verbose,
spice_client=spice_client,
spice_client=self.spice_client,
PCSwingle marked this conversation as resolved.
Show resolved Hide resolved
pipeline=self.pipeline,
)

Expand Down
Loading