feat: DIA-977: Get streaming job status #97

Merged
merged 31 commits on Apr 24, 2024

Changes from 21 commits

Commits (31)
d31318c
feat: DIA-953: Stream results from adala inference server into LSE (A…
matt-bernstein Apr 10, 2024
da620e0
Merge remote-tracking branch 'origin/master' into fb-dia-953/stream-r…
matt-bernstein Apr 12, 2024
dc07ac9
initial solution using pickle/pydantic
matt-bernstein Apr 16, 2024
fd2e40e
polymorphism for resulthandlers like in rest of adala
matt-bernstein Apr 17, 2024
73887a9
enable pickling of agents and handlers
matt-bernstein Apr 17, 2024
1f5ec71
black
matt-bernstein Apr 17, 2024
c286862
add label studio sdk
matt-bernstein Apr 17, 2024
42c19ee
dedup settings object
matt-bernstein Apr 17, 2024
a956784
pass in job id
matt-bernstein Apr 17, 2024
e1c2748
make kafka env base class usable
matt-bernstein Apr 17, 2024
466e299
fix serialization
matt-bernstein Apr 19, 2024
fc6e6d8
fix sdk client headers
matt-bernstein Apr 19, 2024
58c686a
black
matt-bernstein Apr 19, 2024
825ce54
revert timeout
matt-bernstein Apr 19, 2024
4812cb1
update LSE api
matt-bernstein Apr 19, 2024
082af3f
feat: DIA-977: Get status of streaming job
Apr 22, 2024
8dc5bf7
Merge branch 'master' of github.com:HumanSignal/Adala into fb-dia-977
Apr 22, 2024
090d151
Merge branch 'fb-dia-953/stream-results' of github.com:HumanSignal/Ad…
Apr 22, 2024
0a403f5
rename task variables
Apr 22, 2024
b5a7afc
add comments
Apr 22, 2024
963d180
Update docker-compose.yml
pakelley Apr 22, 2024
e619a3d
bugfix for nonexistent job id
matt-bernstein Apr 23, 2024
d37523f
replace original get status implementation
Apr 24, 2024
e2fec3a
Merge branch 'fb-dia-977' of github.com:HumanSignal/Adala into fb-dia…
Apr 24, 2024
344d123
Merge branch 'master' of github.com:HumanSignal/Adala into fb-dia-977
Apr 24, 2024
7a68c42
remove merge conflict bs
Apr 24, 2024
8478b89
cleanup
Apr 24, 2024
7da1ff8
add comment + format
Apr 24, 2024
3fb7f76
use settings.kafka... for agent
Apr 24, 2024
1ea6c29
add retry logic around consumer creation race condition
Apr 24, 2024
012376c
break out of retry loop when successful
Apr 24, 2024
25 changes: 25 additions & 0 deletions adala/environments/kafka.py
@@ -32,13 +32,36 @@ class AsyncKafkaEnvironment(AsyncEnvironment):
kafka_input_topic: str
kafka_output_topic: str

async def initialize(self):
# claim kafka topic from shared pool here?
pass

async def finalize(self):
# release kafka topic to shared pool here?
pass

async def get_feedback(
self,
skills: SkillSet,
predictions: InternalDataFrame,
num_feedbacks: Optional[int] = None,
) -> EnvironmentFeedback:
raise NotImplementedError("Feedback is not supported in Kafka environment")

async def restore(self):
raise NotImplementedError("Restore is not supported in Kafka environment")

async def save(self):
raise NotImplementedError("Save is not supported in Kafka environment")

async def message_receiver(self, consumer: AIOKafkaConsumer, timeout: int = 3):
await consumer.start()
try:
while True:
try:
# Wait for the next message with a timeout
msg = await asyncio.wait_for(consumer.getone(), timeout=timeout)
print_text(f"Received message: {msg.value}")
yield msg.value
except asyncio.TimeoutError:
print_text(
@@ -55,8 +78,10 @@ async def message_sender(
try:
for record in data:
await producer.send_and_wait(topic, value=record)
print_text(f"Sent message: {record} to {topic=}")
finally:
await producer.stop()
print_text(f"No more messages for {topic=}")

async def get_next_batch(self, data_iterator, batch_size: int) -> List[Dict]:
batch = []
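
For context, a minimal sketch of how message_receiver might be driven from a worker coroutine; the broker address and topic names below are assumptions rather than values from this diff, and it assumes the collapsed timeout branch ends the loop:

import asyncio
from aiokafka import AIOKafkaConsumer
from adala.environments.kafka import AsyncKafkaEnvironment

async def consume_input_topic():
    # Topic names are illustrative; the server builds them via get_input_topic().
    env = AsyncKafkaEnvironment(
        kafka_input_topic="adala-input",
        kafka_output_topic="adala-output",
    )
    consumer = AIOKafkaConsumer(
        env.kafka_input_topic,
        bootstrap_servers="localhost:9093",  # assumed broker address
    )
    # message_receiver starts the consumer itself and yields raw message values
    # until no message arrives within the timeout window.
    async for value in env.message_receiver(consumer, timeout=3):
        print(value)

asyncio.run(consume_input_topic())
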
1 change: 1 addition & 0 deletions docker-compose.yml
@@ -49,6 +49,7 @@ services:
- AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN}
- MODULE_NAME=process_file.app
- KAFKA_BOOTSTRAP_SERVERS=kafka:9093
- C_FORCE_ROOT=true # needed when using pickle serializer in celery + running as root - remove when we don't run as root
command:
'sh -c "cd tasks && poetry run celery -A $$MODULE_NAME worker --loglevel=info"'
redis:
216 changes: 215 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -35,6 +35,7 @@ fastapi = "^0.104.1"
celery = {version = "^5.3.6", extras = ["redis"]}
uvicorn = "*"
pydantic-settings = "^2.2.1"
label-studio-sdk = "^0.0.32"

[tool.poetry.dev-dependencies]
pytest = "^7.4.3"
123 changes: 88 additions & 35 deletions server/app.py
@@ -10,7 +10,7 @@
from aiokafka import AIOKafkaProducer
from fastapi import HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from pydantic import BaseModel, SerializeAsAny, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated
@@ -23,26 +23,15 @@
process_file,
process_file_streaming,
process_streaming_output,
streaming_parent_task,
)
from utils import get_input_topic, ResultHandler, Settings
from utils import get_input_topic, Settings
from server.handlers.result_handlers import ResultHandler


logger = logging.getLogger(__name__)


class Settings(BaseSettings):
"""
Can hardcode settings here, read from env file, or pass as env vars
https://docs.pydantic.dev/latest/concepts/pydantic_settings/#field-value-priority
"""

kafka_bootstrap_servers: Union[str, List[str]]

model_config = SettingsConfigDict(
env_file=".env",
)


settings = Settings()

app = fastapi.FastAPI()
@@ -148,13 +137,25 @@ class SubmitRequest(BaseModel):
class SubmitStreamingRequest(BaseModel):
"""
Request model for submitting a streaming job.
Only difference from SubmitRequest is the task_name
"""

agent: Agent
result_handler: str
# SerializeAsAny is for allowing subclasses of ResultHandler
result_handler: SerializeAsAny[ResultHandler]
task_name: str = "process_file_streaming"

@field_validator("result_handler", mode="before")
def validate_result_handler(cls, value: Dict) -> ResultHandler:
"""
Allows polymorphism for ResultHandlers created from a dict; same implementation as the Skills, Environment, and Runtime within an Agent
"""
if "type" not in value:
raise HTTPException(
status_code=400, detail="Missing type in result_handler"
)
result_handler = ResultHandler.create_from_registry(value.pop("type"), **value)
return result_handler
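
For illustration, a request body shaped like the one below would pass this validator, assuming the registry key is the class name (mirroring how Skills, Environments, and Runtimes are specified) and using LSEHandler's fields from server/handlers/result_handlers.py:

payload = {
    "agent": {},  # Agent spec omitted in this sketch; same shape as in SubmitRequest
    "result_handler": {
        "type": "LSEHandler",            # registry key popped by validate_result_handler
        "api_key": "<label-studio-token>",
        "url": "http://localhost:8080",  # assumed Label Studio URL
        "modelrun_id": 42,
    },
    "task_name": "process_file_streaming",
}
# Sent as the JSON body of POST /jobs/submit-streaming.
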


class BatchData(BaseModel):
"""
@@ -184,10 +185,10 @@ async def submit(request: SubmitRequest):

# TODO: get task by name, e.g. request.task_name
task = process_file
serialized_agent = pickle.dumps(request.agent)
agent = request.agent

logger.debug(f"Submitting task {task.name} with agent {serialized_agent}")
result = task.delay(serialized_agent=serialized_agent)
logger.debug(f"Submitting task {task.name} with agent {agent}")
result = task.delay(agent=agent)
logger.debug(f"Task {task.name} submitted with job_id {result.id}")

return Response[JobCreated](data=JobCreated(job_id=result.id))
@@ -206,23 +207,13 @@ async def submit_streaming(request: SubmitStreamingRequest):
"""

# TODO: get task by name, e.g. request.task_name
task = process_file_streaming
serialized_agent = pickle.dumps(request.agent)

logger.info(f"Submitting task {task.name} with agent {serialized_agent}")
input_result = task.delay(serialized_agent=serialized_agent)
input_job_id = input_result.id
logger.info(f"Task {task.name} submitted with job_id {input_job_id}")

task = process_streaming_output
logger.info(f"Submitting task {task.name}")
output_result = task.delay(
job_id=input_job_id, result_handler=request.result_handler
task = streaming_parent_task
result = task.apply_async(
kwargs={"agent": request.agent, "result_handler": request.result_handler}
)
output_job_id = output_result.id
logger.info(f"Task {task.name} submitted with job_id {output_job_id}")
logger.info(f"Submitted {task.name} with ID {result.id}")

return Response[JobCreated](data=JobCreated(job_id=input_job_id))
return Response[JobCreated](data=JobCreated(job_id=result.id))
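
streaming_parent_task itself lives in the Celery tasks module and is not part of this diff; the status endpoint below only assumes it exposes the two child job IDs through its task meta, roughly along these lines (a sketch, not the actual implementation):

from celery import shared_task
from process_file import process_file_streaming, process_streaming_output  # assumed import path

@shared_task(bind=True)
def streaming_parent_task(self, agent, result_handler):
    # Spawn the child jobs: one streams records into Kafka, the other drains
    # the output topic into the result handler.
    input_result = process_file_streaming.delay(agent=agent)
    output_result = process_streaming_output.delay(
        job_id=input_result.id, result_handler=result_handler
    )
    meta = {"input_job_id": input_result.id, "output_job_id": output_result.id}
    # Expose the IDs while running; returning them keeps AsyncResult(...).info
    # populated after the parent finishes, which get_status_streaming relies on.
    self.update_state(state="STARTED", meta=meta)
    return meta
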


@app.post("/jobs/submit-batch", response_model=Response)
@@ -284,6 +275,68 @@ def get_status(job_id):
return Response[JobStatusResponse](data=JobStatusResponse(status=status))


def aggregate_statuses(input_job_id: str, output_job_id: str):
input_job_status = process_file_streaming.AsyncResult(input_job_id).status
output_job_status = process_streaming_output.AsyncResult(output_job_id).status

statuses = [input_job_status, output_job_status]

if "PENDING" in statuses:
return "PENDING"
if "FAILURE" in statuses:
return "FAILURE"
if "REVOKED" in statuses:
return "REVOKED"
if "STARTED" in statuses or "RETRY" in statuses:
return "STARTED"

return "SUCCESS"


@app.get("/streaming-jobs/{job_id}", response_model=Response[JobStatusResponse])
def get_status_streaming(job_id):
"""
Get the status of a job.

Args:
job_id (str)

Returns:
JobStatusResponse: The response model for getting the status of a job.
"""
celery_status_map = {
"PENDING": Status.PENDING,
"STARTED": Status.INPROGRESS,
"SUCCESS": Status.COMPLETED,
"FAILURE": Status.FAILED,
"REVOKED": Status.CANCELED,
"RETRY": Status.INPROGRESS,
}
job = streaming_parent_task.AsyncResult(job_id)
logger.info(f"\n\nParent task meta : {job.info}\n\n")

# If parent task meta does not contain input/output job IDs - return FAILED
if "input_job_id" not in job.info or "output_job_id" not in job.info:
logger.error(
"Parent task does not contain input job ID and/or output_job_id - unable to return proper status"
)
return Response[JobStatusResponse](data=JobStatusResponse(status=Status.FAILED))

input_job_id = job.info["input_job_id"]
output_job_id = job.info["output_job_id"]

try:
status: Status = celery_status_map[
aggregate_statuses(input_job_id, output_job_id)
]
except Exception as e:
logger.error(f"Error getting job status: {e}")
status = Status.FAILED
else:
logger.info(f"Job {job_id} status: {status}")
return Response[JobStatusResponse](data=JobStatusResponse(status=status))
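
A quick way to exercise the new endpoint once a streaming job has been submitted; the server address and the exact status strings depend on Settings and the Status enum, neither of which is shown here, so treat them as assumptions:

import time
import requests

BASE_URL = "http://localhost:30001"  # assumed Adala server address
job_id = "<id returned by POST /jobs/submit-streaming>"

while True:
    resp = requests.get(f"{BASE_URL}/streaming-jobs/{job_id}")
    resp.raise_for_status()
    status = resp.json()["data"]["status"]  # Response[JobStatusResponse] envelope
    print(f"streaming job {job_id}: {status}")
    if status.lower() in ("completed", "failed", "canceled"):  # assumed enum values
        break
    time.sleep(5)
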


@app.delete("/jobs/{job_id}", response_model=Response[JobStatusResponse])
def cancel_job(job_id):
"""
Empty file added server/handlers/__init__.py
Empty file.
77 changes: 77 additions & 0 deletions server/handlers/result_handlers.py
@@ -0,0 +1,77 @@
from typing import Optional
import logging
import json
from abc import abstractmethod
from pydantic import computed_field, ConfigDict, model_validator

from adala.utils.registry import BaseModelInRegistry
from label_studio_sdk import Client


logger = logging.getLogger(__name__)


class ResultHandler(BaseModelInRegistry):
@abstractmethod
def __call__(self, batch):
"""
Callable to do something with a batch of results.
"""


class DummyHandler(ResultHandler):
"""
Dummy handler to test streaming output flow
Can delete once we have a real handler
"""

def __call__(self, batch):
logger.info(f"\n\nHandler received batch: {batch}\n\n")


class LSEHandler(ResultHandler):
"""
Handler to use the Label Studio SDK to load a batch of results back into a Label Studio project
"""

model_config = ConfigDict(arbitrary_types_allowed=True) # for @computed_field

api_key: str
url: str
modelrun_id: int

@computed_field
def client(self) -> Client:
_client = Client(
api_key=self.api_key,
url=self.url,
)
# Need this to make POST requests using the SDK client
# TODO headers can only be set in this function, since client is a computed field. Need to rethink approach if we make non-POST requests, should probably just make a PR in label_studio_sdk to allow setting this in make_request()
_client.headers.update(
{
"accept": "application/json",
"Content-Type": "application/json",
}
)
return _client

@model_validator(mode="after")
def ready(self):
conn = self.client.check_connection()
assert conn["status"] == "UP", "Label Studio is not available"

return self

def __call__(self, batch):
logger.info(f"\n\nHandler received batch: {batch}\n\n")
self.client.make_request(
"POST",
"/api/model-run/batch-predictions",
data=json.dumps(
{
"modelrun_id": self.modelrun_id,
"results": batch,
}
),
)
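
New handlers plug in by subclassing ResultHandler, which makes them reachable through the same 'type' lookup the submit-streaming validator uses (assuming subclasses register themselves the way DummyHandler and LSEHandler do); a toy sketch:

import csv
from server.handlers.result_handlers import ResultHandler

class CSVHandler(ResultHandler):
    """Toy handler that appends each batch of results to a local CSV file."""

    path: str = "results.csv"

    def __call__(self, batch):
        with open(self.path, "a", newline="") as f:
            writer = csv.writer(f)
            for record in batch:
                writer.writerow([record])

# Selected via {"type": "CSVHandler", "path": "out.csv"} in a submit-streaming request.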