Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement summarizer annotator #15

Merged
merged 1 commit into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ragdaemon/annotators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ragdaemon.annotators.diff import Diff
from ragdaemon.annotators.hierarchy import Hierarchy
from ragdaemon.annotators.layout_hierarchy import LayoutHierarchy
from ragdaemon.annotators.summarizer import Summarizer

annotators_map = {
"hierarchy": Hierarchy,
Expand All @@ -13,4 +14,5 @@
"chunker_line": ChunkerLine,
"diff": Diff,
"layout_hierarchy": LayoutHierarchy,
"summarizer": Summarizer,
}
80 changes: 80 additions & 0 deletions ragdaemon/annotators/summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
Add a 1-sentence text summary to each file or chunk node
"""

import asyncio
from typing import Any, Coroutine

from tqdm.asyncio import tqdm

from ragdaemon.annotators.base_annotator import Annotator
from ragdaemon.database import Database
from ragdaemon.graph import KnowledgeGraph
from ragdaemon.errors import RagdaemonError
from spice import SpiceMessage

# System prompt for the summarizer LLM: asks for a single docstring-style
# sentence, optionally naming up to two repository-specific functions.
summarizer_prompt = """\
Generate a 1-sentence summary of the provided code. Follow conventions of docstrings:
write in the imperative voice and start with a verb. Do not include any preamble or
asides.

It may be useful to name specific functions from the target repository (not built-in
Python functions) which are integral to the functioning of the target code. Include a
maximum of two (2) such named functions, but err on the side of brevity.
"""


# Module-level cap on concurrent LLM requests (shared by all Summarizer instances).
semaphore = asyncio.Semaphore(50)


class Summarizer(Annotator):
    """Annotate each node that has a checksum (file, chunk, diff) with a
    1-sentence LLM-generated summary, stored both on the graph node and in
    the database record's metadata.
    """

    name = "summarizer"

    def is_complete(self, graph: KnowledgeGraph, db: Database) -> bool:
        """Return True when every checksummed node already carries a summary."""
        return all(
            data.get("summary") is not None
            for _, data in graph.nodes(data=True)
            if data is not None and data.get("checksum") is not None
        )

    async def get_llm_response(self, document: str) -> str:
        """Request a 1-sentence summary of `document` from the LLM.

        Raises:
            RagdaemonError: if no Spice client was provided at construction.
        """
        if self.spice_client is None:
            raise RagdaemonError("Spice client is not initialized.")
        # Read-only use of the module-level semaphore; no `global` declaration
        # is needed. It caps concurrent LLM requests across all instances.
        async with semaphore:
            messages: list[SpiceMessage] = [
                {"role": "system", "content": summarizer_prompt},
                {"role": "user", "content": document},
            ]
            response = await self.spice_client.get_response(
                messages=messages,
            )
        return response.text

    async def get_summary(self, data: dict[str, Any], db: Database):
        """Asynchronously generate summary and update graph and db"""
        record = db.get(data["checksum"])
        document = record["documents"][0]
        metadatas = record["metadatas"][0]
        summary = await self.get_llm_response(document)
        # Persist to the database record and mirror onto the graph node.
        metadatas["summary"] = summary
        db.update(data["checksum"], metadatas=metadatas)
        data["summary"] = summary

    async def annotate(
        self, graph: KnowledgeGraph, db: Database, refresh: bool = False
    ) -> KnowledgeGraph:
        """Generate/add summaries to nodes with checksums (file, chunk, diff).

        Existing summaries are kept unless `refresh` is True. Summaries are
        generated concurrently (bounded by the module-level semaphore).
        """
        tasks = []
        for _, data in graph.nodes(data=True):
            if data is None or data.get("checksum") is None:
                continue
            if data.get("summary") is not None and not refresh:
                continue
            tasks.append(self.get_summary(data, db))
        if len(tasks) > 0:
            if self.verbose:
                await tqdm.gather(*tasks, desc="Summarizing code...")
            else:
                await asyncio.gather(*tasks)
        return graph
1 change: 1 addition & 0 deletions ragdaemon/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class NodeMetadata(TypedDict):
chunks: Optional[
list[dict[str, str]]
] # For files, func/class/method. For diff, by file/hunk
summary: Optional[str] # Generated summary of the node


class EdgeMetadata(TypedDict):
Expand Down
12 changes: 10 additions & 2 deletions tests/annotators/test_chunker.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path
from unittest.mock import AsyncMock
from unittest.mock import AsyncMock, patch

import pytest

Expand All @@ -8,6 +8,15 @@
from ragdaemon.graph import KnowledgeGraph


@pytest.fixture
def mock_get_llm_response():
    # Stub out the chunker LLM call so tests never hit a real model.
    target = "ragdaemon.annotators.chunker_llm.ChunkerLLM.get_llm_response"
    with patch(target, return_value={"chunks": []}) as mock:
        yield mock


def test_chunker_is_complete(cwd, mock_db):
chunker = Chunker()

Expand Down Expand Up @@ -45,7 +54,6 @@ async def test_chunker_llm_annotate(cwd, mock_get_llm_response, mock_db):
daemon = Daemon(
cwd=cwd,
annotators={"hierarchy": {}},
graph_path=(Path.cwd() / "tests/data/hierarchy_graph.json"),
)
chunker = ChunkerLLM(spice_client=AsyncMock())
actual = await chunker.annotate(daemon.graph, mock_db)
Expand Down
29 changes: 29 additions & 0 deletions tests/annotators/test_summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from unittest.mock import AsyncMock, patch

import pytest

from ragdaemon.annotators import Summarizer
from ragdaemon.daemon import Daemon


@pytest.fixture
def mock_get_llm_response():
    # Stub out the summarizer LLM call with a fixed sentinel string.
    target = "ragdaemon.annotators.summarizer.Summarizer.get_llm_response"
    with patch(target, return_value="summary of") as mock:
        yield mock


@pytest.mark.asyncio
async def test_summarizer_annotate(cwd, mock_get_llm_response):
    # Build a hierarchy-only daemon, then run the summarizer over its graph.
    daemon = Daemon(
        cwd=cwd,
        annotators={"hierarchy": {}},
    )
    await daemon.update(refresh=True)
    summarizer = Summarizer(spice_client=AsyncMock())
    annotated = await summarizer.annotate(daemon.graph, daemon.db)
    # Every checksummed node should carry the mocked summary.
    checksummed = [
        data
        for _, data in annotated.nodes(data=True)
        if data.get("checksum") is not None
    ]
    assert all(data.get("summary") == "summary of" for data in checksummed)
11 changes: 1 addition & 10 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import subprocess
import tempfile
from pathlib import Path
from unittest.mock import AsyncMock, patch
from unittest.mock import AsyncMock

import pytest

Expand All @@ -22,15 +22,6 @@ def mock_db(cwd):
)


@pytest.fixture
def mock_get_llm_response():
with patch(
"ragdaemon.annotators.chunker_llm.ChunkerLLM.get_llm_response",
return_value={"chunks": []},
) as mock:
yield mock


@pytest.fixture(scope="function")
def git_history(cwd):
with tempfile.TemporaryDirectory() as tmpdir:
Expand Down
Loading