From cfb3ac4372ed410713ee38c98ad86450f3f69831 Mon Sep 17 00:00:00 2001 From: Paul Swingle Date: Fri, 14 Jun 2024 16:48:57 -0700 Subject: [PATCH 1/5] add file filter to chunker --- ragdaemon/annotators/call_graph.py | 6 +++--- ragdaemon/annotators/chunker/__init__.py | 21 ++++++++++++++------- ragdaemon/daemon.py | 7 +++++-- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/ragdaemon/annotators/call_graph.py b/ragdaemon/annotators/call_graph.py index 4772092..aefb655 100644 --- a/ragdaemon/annotators/call_graph.py +++ b/ragdaemon/annotators/call_graph.py @@ -1,17 +1,17 @@ import asyncio -from functools import partial import json +from functools import partial from pathlib import Path from typing import Any, Optional -from tqdm.asyncio import tqdm from spice import SpiceMessages from spice.models import TextModel +from tqdm.asyncio import tqdm from ragdaemon.annotators.base_annotator import Annotator from ragdaemon.database import Database, remove_update_db_duplicates -from ragdaemon.graph import KnowledgeGraph from ragdaemon.errors import RagdaemonError +from ragdaemon.graph import KnowledgeGraph from ragdaemon.utils import ( DEFAULT_CODE_EXTENSIONS, DEFAULT_COMPLETION_MODEL, diff --git a/ragdaemon/annotators/chunker/__init__.py b/ragdaemon/annotators/chunker/__init__.py index e274535..552e47c 100644 --- a/ragdaemon/annotators/chunker/__init__.py +++ b/ragdaemon/annotators/chunker/__init__.py @@ -3,21 +3,21 @@ from copy import deepcopy from functools import partial from pathlib import Path +from typing import Optional, Set from astroid.exceptions import AstroidSyntaxError from tqdm.asyncio import tqdm from ragdaemon.annotators.base_annotator import Annotator +from ragdaemon.annotators.chunker.chunk_astroid import chunk_document as chunk_astroid +from ragdaemon.annotators.chunker.chunk_line import chunk_document as chunk_line +from ragdaemon.annotators.chunker.chunk_llm import chunk_document as chunk_llm +from ragdaemon.annotators.chunker.utils import resolve_chunk_parent from ragdaemon.database import ( Database, remove_add_to_db_duplicates, remove_update_db_duplicates, ) -from ragdaemon.annotators.chunker.utils import resolve_chunk_parent -from ragdaemon.annotators.chunker.chunk_astroid import chunk_document as chunk_astroid -from ragdaemon.annotators.chunker.chunk_llm import chunk_document as chunk_llm -from ragdaemon.annotators.chunker.chunk_line import chunk_document as chunk_line - from ragdaemon.errors import RagdaemonError from ragdaemon.graph import KnowledgeGraph from ragdaemon.utils import ( @@ -33,9 +33,13 @@ class Chunker(Annotator): name = "chunker" chunk_field_id = "chunks" - def __init__(self, *args, use_llm: bool = False, **kwargs): + def __init__( + self, *args, files: Optional[Set[str]] = None, use_llm: bool = False, **kwargs + ): super().__init__(*args, **kwargs) + self.files = files + # By default, use either the LLM chunker or a basic line chunker. if use_llm and self.spice_client is not None: default_chunk_fn = partial( @@ -120,6 +124,9 @@ async def annotate( tasks = [] files_just_chunked = set() for node, data in files_with_chunks: + if self.files != None and node not in self.files: + continue + if ( match_refresh(refresh, node) or data.get(self.chunk_field_id, None) is None @@ -146,7 +153,7 @@ async def annotate( # 1. Add all chunks to graph checksums = dict[str, str]() for file, data in files_with_chunks: - if len(data[self.chunk_field_id]) == 0: + if self.chunk_field_id not in data or len(data[self.chunk_field_id]) == 0: continue # Sort such that "parents" are added before "children" base_id = f"{file}:BASE" diff --git a/ragdaemon/daemon.py b/ragdaemon/daemon.py index 1de30ac..cc51d86 100644 --- a/ragdaemon/daemon.py +++ b/ragdaemon/daemon.py @@ -2,7 +2,7 @@ import json import time from pathlib import Path -from typing import Any, Iterable, Optional +from typing import Any, Dict, Iterable, Optional from networkx.readwrite import json_graph from spice import Spice @@ -73,6 +73,9 @@ def __init__( if self.verbose > 1: print("Initialized empty graph.") + self.set_annotators(annotators) + + def set_annotators(self, annotators: Optional[Dict[str, Dict]] = None): annotators = annotators if annotators is not None else default_annotators() if self.verbose > 1: print(f"Initializing annotators: {list(annotators.keys())}...") @@ -81,7 +84,7 @@ def __init__( self.pipeline[ann] = annotators_map[ann]( **kwargs, verbose=self.verbose, - spice_client=spice_client, + spice_client=self.spice_client, pipeline=self.pipeline, ) From 34d1be476f4ed2a20bfc4fa8cb0ade2669734695 Mon Sep 17 00:00:00 2001 From: Paul Swingle Date: Fri, 14 Jun 2024 16:50:14 -0700 Subject: [PATCH 2/5] is not none --- ragdaemon/annotators/chunker/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ragdaemon/annotators/chunker/__init__.py b/ragdaemon/annotators/chunker/__init__.py index 552e47c..33e8279 100644 --- a/ragdaemon/annotators/chunker/__init__.py +++ b/ragdaemon/annotators/chunker/__init__.py @@ -124,7 +124,7 @@ async def annotate( tasks = [] files_just_chunked = set() for node, data in files_with_chunks: - if self.files != None and node not in self.files: + if self.files is not None and node not in self.files: continue if ( From 291e298a240d380f68eaabf7caba07fc8b126330 Mon Sep 17 00:00:00 2001 From: Paul Swingle Date: Fri, 14 Jun 2024 16:57:30 -0700 Subject: [PATCH 3/5] grant suggested change --- ragdaemon/annotators/chunker/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ragdaemon/annotators/chunker/__init__.py b/ragdaemon/annotators/chunker/__init__.py index 33e8279..be76f2c 100644 --- a/ragdaemon/annotators/chunker/__init__.py +++ b/ragdaemon/annotators/chunker/__init__.py @@ -113,6 +113,8 @@ async def annotate( if data.get("type") == "chunk": graph.remove_node(node) elif data.get("type") == "file": + if self.files is not None and node not in self.files: + continue if self.chunk_extensions_map is None: files_with_chunks.append((node, data)) else: @@ -124,9 +126,6 @@ async def annotate( tasks = [] files_just_chunked = set() for node, data in files_with_chunks: - if self.files is not None and node not in self.files: - continue - if ( match_refresh(refresh, node) or data.get(self.chunk_field_id, None) is None @@ -153,7 +152,7 @@ async def annotate( # 1. Add all chunks to graph checksums = dict[str, str]() for file, data in files_with_chunks: - if self.chunk_field_id not in data or len(data[self.chunk_field_id]) == 0: + if len(data[self.chunk_field_id]) == 0: continue # Sort such that "parents" are added before "children" base_id = f"{file}:BASE" From f4078180743759375aa20d1f42690b9c8d9f492b Mon Sep 17 00:00:00 2001 From: Paul Swingle Date: Fri, 14 Jun 2024 16:59:30 -0700 Subject: [PATCH 4/5] pyright --- tests/sample/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sample/main.py b/tests/sample/main.py index fcabfbe..50a3a6b 100644 --- a/tests/sample/main.py +++ b/tests/sample/main.py @@ -1,5 +1,5 @@ -from src.interface import parse_arguments, render_response -from src.operations import add, divide, multiply, subtract +from src.interface import parse_arguments, render_response # pyright: ignore +from src.operations import add, divide, multiply, subtract # pyright: ignore def main(): From 09c517065bd7e384d50eca0c79016dad8053c25e Mon Sep 17 00:00:00 2001 From: Paul Swingle Date: Fri, 14 Jun 2024 17:02:43 -0700 Subject: [PATCH 5/5] ignore pyright --- pyproject.toml | 3 +++ tests/sample/main.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f35e1fd..13ace73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,3 +45,6 @@ dev = [ "pytest", "pytest-asyncio" ] + +[tool.pyright] +ignore = ["tests/sample"] \ No newline at end of file diff --git a/tests/sample/main.py b/tests/sample/main.py index 50a3a6b..fcabfbe 100644 --- a/tests/sample/main.py +++ b/tests/sample/main.py @@ -1,5 +1,5 @@ -from src.interface import parse_arguments, render_response # pyright: ignore -from src.operations import add, divide, multiply, subtract # pyright: ignore +from src.interface import parse_arguments, render_response +from src.operations import add, divide, multiply, subtract def main():