From 17adec3fe2f43a9ee8ccc6e1fe7490c2f8c70c5b Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <47313912+MitchellAV@users.noreply.github.com>
Date: Wed, 12 Jun 2024 22:22:06 -0700
Subject: [PATCH 01/20] Dev: Added prefect with dask to dockerize task

---
 dockerize-workflow/main.py                    |  69 ++++++++++++++++++
 dockerize-workflow/submission/Dockerfile      |  18 +++++
 .../submission/requirements.txt               |   1 +
 .../submission/sdt-submission.zip             | Bin 0 -> 2932 bytes
 dockerize-workflow/submission/submission.py   |  33 +++++++++
 dockerize-workflow/submission/submission.zip  | Bin 0 -> 703 bytes
 6 files changed, 121 insertions(+)
 create mode 100644 dockerize-workflow/main.py
 create mode 100644 dockerize-workflow/submission/Dockerfile
 create mode 100644 dockerize-workflow/submission/requirements.txt
 create mode 100644 dockerize-workflow/submission/sdt-submission.zip
 create mode 100644 dockerize-workflow/submission/submission.py
 create mode 100644 dockerize-workflow/submission/submission.zip

diff --git a/dockerize-workflow/main.py b/dockerize-workflow/main.py
new file mode 100644
index 00000000..2ef62bb4
--- /dev/null
+++ b/dockerize-workflow/main.py
@@ -0,0 +1,69 @@
+import os
+from prefect import flow, task
+from prefect_dask.task_runners import DaskTaskRunner
+import docker
+
+
+def docker_task(client: docker.DockerClient, index: int = -1):
+
+    # Define volumes to mount
+    results_dir = os.path.join(os.path.dirname(__file__), "results")
+
+    volumes = {
+        results_dir: {"bind": "/app/results/", "mode": "rw"},
+    }
+
+    # Execute docker image in a container
+    print("Docker container started")
+    container = client.containers.run(
+        "submission:latest",
+        command=["python", "submission.py", f"{index}"],
+        auto_remove=True,
+        volumes=volumes,
+    )
+    print(container)
+
+
+def create_docker_image():
+    client = docker.from_env()
+
+    file_path = os.path.join(os.path.dirname(__file__), "submission")
+
+    print(file_path)
+
+    # Create docker image from Dockerfile
+    image, index = client.images.build(path=file_path, tag="submission:latest")
+    print("Docker image created")
+
+    return client
+
+
+@task
+def main_task(index: int):
+
+    client = create_docker_image()
+    docker_task(client=client, index=index)
+    client.close()
+
+
+@flow(
+    task_runner=DaskTaskRunner(
+        cluster_kwargs={
+            "n_workers": 4,
+            "threads_per_worker": 1,
+            "memory_limit": "8GiB",
+        }
+    ),
+    log_prints=False,
+)
+def main_flow():
+    for i in range(30):
+        main_task.submit(i)
+
+
+def main():
+    main_flow()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dockerize-workflow/submission/Dockerfile b/dockerize-workflow/submission/Dockerfile
new file mode 100644
index 00000000..5c6bb77b
--- /dev/null
+++ b/dockerize-workflow/submission/Dockerfile
@@ -0,0 +1,18 @@
+# Use an official Python runtime as the base image
+FROM python:3.11-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the submission package into the container
+COPY submission.zip /app
+
+# Unzip the submission package
+RUN apt-get update && apt-get install -y unzip
+RUN unzip submission.zip
+
+# Install the Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# # Set the command to run the application
+# CMD [ "python", "submission.py" ]
\ No newline at end of file
diff --git a/dockerize-workflow/submission/requirements.txt b/dockerize-workflow/submission/requirements.txt
new file mode 100644
index 00000000..296d6545
--- /dev/null
+++ b/dockerize-workflow/submission/requirements.txt
@@ -0,0 +1 @@
+numpy
\ No newline at end of file
diff --git a/dockerize-workflow/submission/sdt-submission.zip b/dockerize-workflow/submission/sdt-submission.zip
new file mode 100644
index 0000000000000000000000000000000000000000..9af57e322af4454721dbffff582bba0cd4d90fde
GIT binary patch
literal 2932
zcmWIWW@Zs#00H&Mi4kB1ln`Q2U?@o~E{QKLP0GzIF3!x)Q!vpF4dG>Azn%OjRq6z5
znj8?9R&X;gvdCZD!o{*NXjW<f*gOsf4ixk50L_#Enin7M>*(ws9HFllk6|>5$%U9E
zCnty|rza#OeDL)N`@kR85zxRiL4sLbK$`KRmIB+y_XYy2GR{s-3w$Q@X*#@VT-G9}
zcy9ij`KzZ#1WcH`YWnOMbC^X^l7F1qHhcR1dzzI&tJim4{dbkWW3#v{14Do}JBJI$
z4YpXILqU#3ctZ;*SXd+&7^JZVi=Inxd~ivAQ7YIM4}jqV!k{oh3!k?$toaTZ2)Ly8
za(kQTY2Gbh$V+fN$+?4pHGIMMi;A}dH;DuVs5Vtj`Wn_%pDE+daO1;{_y!(@gT2Of
zlWvH$tTQ{O*dpN6QJcK3<W8BiQ-`e4eE!yTCB@S7*{#ZwjbdMTnQz}1zd(8J3m@@a
zm(HGbJ}qI_qb)7mW!M_A@OAilog?=P3ZDM_ZJ%iRpIfm`qWf{F%m+S}(|qg8UcYMl
z>h17he(UnQoQ2wV@9g5>*eL1`3Ysj2j@(VaU;zaH_MoW%%Bms9Fh=|W;~aY^krdY(
zr3IRQh&r5M2!5z=pn=(<t%b#Lrl>*e3rHMtcO0-V1v&-f9E3xa$`MIh94MoNH4ckX
z3rjPLQgc)DN{aPLDoVgnDPa64RR$Q1SfcW@_F3H(PqfaR_td%M>#5^;Mfc?S^P10?
zz=`Saf5~%_K&wF3VRwi<BLjmjp%BCCCQ<^<(7@mmQ>McFi}M;a9!LlYu_QK@-eG){
zVxoE`ppcP`ZRL|cF?)|RHO&dxJLkxwNtYg7nKI?bq>!f#2`g@yJG?n~;6Q%>I1d@#
zXYnX7Ht0}#!qxTZbkJv^&y#{BD|+RZHigvwezA;s(M^eI|1u*ttUc$NynYh@WdE5j
z-5>pF=97?=$hrBebla}0`E1}2x?LpawHz2apm0EhP*H771W*tR>VPupz!1VrN{AvU
zzPu>0pdhtKub>hfUzxzT$^v2#jh6LF19s;fHsHDQS5zuj;RCzK2QiVkR;A(*-F>Vc
zOT2<y|9%fmV*Yrc=+WbM1$$<2cU!L7nZtJ_@TJ(pH&cW?7?_j&UOnR5z3r%UfQLq`
zctIf}Z~A93<Eq%lNw?nHdb#W^dH3Ec;bPW-mvcGw^$hR&eXd`#TXy{f@2-;1-4DWD
zXKiQCtf<^@=+&~#r#H{OxMQM1vON!b#T2i7N1A*Ef2;nJ@%NDrn!P@MihtOgIK@oq
zR_#{ihqG^-3jKC_oq9&-3%%cG4l_I6GI?dO+2gd9QRNjjhQsf~LD}<{Sw`G7V32_V
z40}*2Gchn20fQ1=0Aq%!9#TP%D^y8Ladj-09PB0=F$W$nR8iqL#Pl+uguzH|v1Vdx
z7q@|dhwAg^uT;fEo-BFwR7Gt16tU&YmZ(fyqQ%bQJjIBOfs>z&t*&L)og+_VI_~f&
zOgwoYp-*Lc)Fh58^E0Ntef?<ij488(=SPPG#U%wznHU|DH!F8eMo3KL%($$av{}<Q
z3zd~8JV;9raGentFeQH4tm!kR&*7T%oKfhJ#;4Y$ovxjgsxvEB@;nRj$y5mbx9`kD
zri(c|-v77g+z5NVXk+-v_LIxcd{LHet_VK3o|}8`W3_$OYjRF+TRdZi!JjWzm%CrA
ze_Y<a^5n$Z`>(hA7WnDiFDd-1*n6YfJlwzX^Y^2-=RWmWCYyc!PlT|Q@5{oxO_SzN
zd+WAOIPhWjkGNdT#OFKa{i~Sv_L^~UnB>*e9agKh9y@g;yMDr#zLHg!=Y79ql_lz)
z?EcLDXNCTP_qouVqj$yJV1vOXGh<@|BOep9?R*|S{<_=JK(Wrw5&gW>tPPj~Kye-5
z&B!FejJslo<}il0jvy9hVnC^ku~zdC^I;V|a@`54=waY(V*ro|PjzHe`!Jt?Yy#EA
z$W;-jE{1_6jp9JRWBLSpor9$~N4E&MDgo7OFtDW22pGU{FJZBf_<9c6PEaL<Tx)<T
zDHvGNcpGRZrd4>W2LgTs*@s*PgX$s}SkjonOu&zLt;SMLAbS;5$Rn3oph6x7mNd4p
tVDl<jMLxQv$XNtb9Kyho#*3^3{7cw=te$3N1K9yeyzRgcz0MBe0RSI2<#_-A

literal 0
HcmV?d00001

diff --git a/dockerize-workflow/submission/submission.py b/dockerize-workflow/submission/submission.py
new file mode 100644
index 00000000..0f37c74d
--- /dev/null
+++ b/dockerize-workflow/submission/submission.py
@@ -0,0 +1,33 @@
+import sys
+import numpy as np
+from time import sleep
+from time import perf_counter
+import csv
+
+
+def submission_function(index: str = "0", *args, **kwargs):
+    print("Submission function called")
+    start = perf_counter()
+
+    random_number = np.random.rand()
+    sleep_time = random_number * 10
+    sleep(sleep_time)
+    end = perf_counter()
+    time_elapsed = end - start
+
+    print(f"{index} : {time_elapsed}")
+
+    # Write time elapsed to a csv file
+    with open(f"results/{index}.csv", mode="w") as file:
+        writer = csv.writer(file)
+        writer.writerow([time_elapsed])
+
+    print("Submission function completed successfully!")
+    return time_elapsed
+
+
+if __name__ == "__main__":
+
+    args = sys.argv[1:]
+
+    submission_function(*args)
diff --git a/dockerize-workflow/submission/submission.zip b/dockerize-workflow/submission/submission.zip
new file mode 100644
index 0000000000000000000000000000000000000000..771d4b536e5f6b9edaf94f14de99ade80d5f1779
GIT binary patch
literal 703
zcmWIWW@h1H00FhaGZ9T~8oyY9Y!DV;kYOlFEiBC}O3h8pD=F41sVE5z;bdSg=gUfK
z1me;PZU#n{XUhw8SvCgEO3f?HEvO6t8YcqO%)!vH>P*C$wR?VKF)}cGVq#$61sYde
znv|PaT%4Jor&mx3Hmji+XciEonbjKP>wnmQXYcdy83mc@5|5)+?r&ml=X70Is;Cs-
zC48#o$9olDc4>u+-Z8&_-mKeZbgz}8L}A&g%*h|wC$L=GtZBg<DA{7_A@z}e`*DNL
zgR0VoGL>vqsXe_Gj7!}YcTCxw<W<&jGh>SE$F<B|j8zY;>sX~F=V{o_p1w08_qam6
zorbh@-FH_;cP@z~*DOAj9SoeZGp+ub=jBbRVoPM_3RZ4#*Lk|;e%SR-CfbfOeRQ0^
z+8db_?EZbs<o=0Son?Pwcd&4|ESNTLg96vPU1q2J(q{kot#pjN<U`%8PY196(|>5U
zq9gR9a(szG_b~&_UhbT*kCJoTXKOMCU7DB|_9~Yxt+Q8p<8$Nu(^cMaS1UaAM7t~`
zcdoVfb=+bvkbUOt*?(KEw@>@KTQ7aju^G|JO?W&^RM+y0zy8N`OyteXl%G{sSezne
z)-wI7QFoG9xkg~m>Zhj`KAHFVYu@IcyY|18%X+YWax~wX)cFD4j7)OOxKe-wFe(`s
zfHBChq!GlzOb@J(^nezB5L0m_1Y}eFfu^D*3!sspWPxEMD;vlOOhC92!T^#0+YJL8

literal 0
HcmV?d00001


From d8bffd48d4a532d287c12143a10e6cc88ab927be Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <47313912+MitchellAV@users.noreply.github.com>
Date: Thu, 13 Jun 2024 15:56:48 -0700
Subject: [PATCH 02/20] Dev: working on integrating actual submission into test

---
 dockerize-workflow/main.py                    |  44 +++-
 dockerize-workflow/submission/Dockerfile      |  20 +-
 dockerize-workflow/submission/submission.py   |  11 +-
 dockerize-workflow/submission/submission.zip  | Bin 703 -> 650 bytes
 .../submission/submission_wrapper.py          | 211 ++++++++++++++++++
 5 files changed, 264 insertions(+), 22 deletions(-)
 create mode 100644 dockerize-workflow/submission/submission_wrapper.py

diff --git a/dockerize-workflow/main.py b/dockerize-workflow/main.py
index 2ef62bb4..341baaff 100644
--- a/dockerize-workflow/main.py
+++ b/dockerize-workflow/main.py
@@ -1,10 +1,19 @@
 import os
+from typing import Any
 from prefect import flow, task
 from prefect_dask.task_runners import DaskTaskRunner
 import docker
 
 
-def docker_task(client: docker.DockerClient, index: int = -1):
+def docker_task(
+    client: docker.DockerClient,
+    submission_file_name: str,
+    submission_function_name: str,
+    submission_args: list[Any] | None = None,
+):
+
+    if submission_args is None:
+        submission_args = []
 
     # Define volumes to mount
     results_dir = os.path.join(os.path.dirname(__file__), "results")
@@ -17,7 +26,13 @@ def docker_task(client: docker.DockerClient, index: int = -1):
     print("Docker container started")
     container = client.containers.run(
         "submission:latest",
-        command=["python", "submission.py", f"{index}"],
+        command=[
+            "python",
+            "submission_wrapper.py",
+            submission_file_name,
+            submission_function_name,
+            *submission_args,
+        ],
         auto_remove=True,
         volumes=volumes,
     )
@@ -40,10 +55,25 @@ def create_docker_image():
 
 @task
 def main_task(index: int):
-
-    client = create_docker_image()
-    docker_task(client=client, index=index)
-    client.close()
+    client = None
+    try:
+        client = create_docker_image()
+
+        submission_file_name = "submission"
+        submission_function_name = "submission_function"
+        submission_args = [str(index)]
+
+        docker_task(
+            client,
+            submission_file_name,
+            submission_function_name,
+            submission_args,
+        )
+    except Exception as e:
+        print(f"Error: {e}")
+    finally:
+        if client:
+            client.close()
 
 
 @flow(
@@ -57,7 +87,7 @@ def main_task(index: int):
     log_prints=False,
 )
 def main_flow():
-    for i in range(30):
+    for i in range(5):
         main_task.submit(i)
 
 
diff --git a/dockerize-workflow/submission/Dockerfile b/dockerize-workflow/submission/Dockerfile
index 5c6bb77b..5bb83143 100644
--- a/dockerize-workflow/submission/Dockerfile
+++ b/dockerize-workflow/submission/Dockerfile
@@ -4,15 +4,25 @@ FROM python:3.11-slim
 # Set the working directory in the container
 WORKDIR /app
 
+RUN apt-get update 
+
+COPY submission_wrapper.py /app
+COPY requirements.txt /app
+
+# Install the Python dependencies for the submission wrapper
+RUN pip install --no-cache-dir -r requirements.txt
+
+
 # Copy the submission package into the container
 COPY submission.zip /app
 
 # Unzip the submission package
-RUN apt-get update && apt-get install -y unzip
-RUN unzip submission.zip
+
+RUN apt-get install -y unzip
+
+RUN unzip submission.zip -d submission
+
+WORKDIR /app/submission
 
 # Install the Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
-
-# # Set the command to run the application
-# CMD [ "python", "submission.py" ]
\ No newline at end of file
diff --git a/dockerize-workflow/submission/submission.py b/dockerize-workflow/submission/submission.py
index 0f37c74d..1d838cc5 100644
--- a/dockerize-workflow/submission/submission.py
+++ b/dockerize-workflow/submission/submission.py
@@ -1,4 +1,3 @@
-import sys
 import numpy as np
 from time import sleep
 from time import perf_counter
@@ -6,11 +5,10 @@
 
 
 def submission_function(index: str = "0", *args, **kwargs):
-    print("Submission function called")
     start = perf_counter()
 
     random_number = np.random.rand()
-    sleep_time = random_number * 10
+    sleep_time = random_number * 15
     sleep(sleep_time)
     end = perf_counter()
     time_elapsed = end - start
@@ -24,10 +22,3 @@ def submission_function(index: str = "0", *args, **kwargs):
 
     print("Submission function completed successfully!")
     return time_elapsed
-
-
-if __name__ == "__main__":
-
-    args = sys.argv[1:]
-
-    submission_function(*args)
diff --git a/dockerize-workflow/submission/submission.zip b/dockerize-workflow/submission/submission.zip
index 771d4b536e5f6b9edaf94f14de99ade80d5f1779..e837e3d1336ba248c27145f0840528161ecad52e 100644
GIT binary patch
delta 433
zcmV;i0Z#tE1&RfbHv)blkvSlL>1xebcwCxaH30wsX#xNM4FDVfb9G{EX>)UFZ*DGd
zc~w*i00TBvYi2i9Yi4zL3jhHG=BIEi1hG<@Wo=Q*PQx$|y!$H_b4e;F<yt9U!GQxp
zQJSp5!Y_GkLRIzO@gt=zkc*w&nc4BI)Xw?<CTh2bfxtK>ywwn-Mv%3CNec0wvO_QW
z(MA){v!yoDCu3|v0Tg#y5=m=%5zQFVrIBXBuPspU&;j4@J6H|=jnc9DnDedOGDshV
zA!&726PlKlo*&F)b)O9HFv}RXuBCD$R1wII`(n2H#aO}P=8S31$jU@B{T-Ye8PcN~
z9AQRe+<i4#eK{{pXaqlhlt&(63x`X}kuUyl;n~Z8HOZ4+urTE8KuCoJHp}o1)}cv~
z9x19o4~z6V0aJ9^PT298w|Q-uSF<bRJRygZkn8%>WJLQ5vGnZRyj*&HZ7-k1pMEw8
zr%l3WwNn_9b&BJNL?SA+zu|R)#}K`_ii~{&P)h*<9s>#h0001!3j$XS>1xebcwCxa
bH30wsX_IdPCj&NBYm=D*AO_0;0000047a>d

delta 476
zcmV<20VDp31-}K5Hv)kUkvSlLh^ow3%&olmY5@QM@&W(=4FDVfb9G{EX>)UFZ*DGd
zc~w*i00V$?YG#3RYG!qK3jhHG=BIEi1hG<@Wrb2rPs1<_z2{e$aA_wP<65b|f&+&M
zMWJ;eB43IV%7pmuBuxh!Akj-)`}x^^wldy@2y`H(s%ew)2ap8Tvne=#15p`-HAouq
z54XoKiNPfsF&udYdS#4_I02<cqllEVVoG+1*=Up<@vQ@j0eav&egzvDo+v+?m$_JN
z#~^<`C>tAo_eB9m3K*o;IP!MEQIxr6f7VIUvLzOz9i0(*utzNAtlw7G@-!<!nM^E0
z^*=H;aJ&1)G~dXYiFW*dUAP1aM4*<QupkQVx;DyAW}Eo?BE|=F@P3-|!PovT;XbH{
zRq-QVbRdhyFe!}-Y*z6ZoJX4}15(nFZdU4T24;zk8?omzZ_9d1Ue$O>E)|M68M&=L
zO+mCj5NpiM&Hl94hnD@jE@!;Pm|Leb3`jI3tq&LL{sP7n?3rXL`Fhj@LRgu70{VO>
zLKvy65WK79%Aw?*^Yw1C`MSUD9%|sPlUoj~WuKGF0b&h^s?1o-t-SbZ0RRB<lR5$?
S1AudClVk!Q1^@y80002jZPCpD

diff --git a/dockerize-workflow/submission/submission_wrapper.py b/dockerize-workflow/submission/submission_wrapper.py
new file mode 100644
index 00000000..c5857aaf
--- /dev/null
+++ b/dockerize-workflow/submission/submission_wrapper.py
@@ -0,0 +1,211 @@
+from importlib import import_module
+import inspect
+import sys
+import os
+import zipfile
+import tarfile
+import shutil
+from typing import Callable, cast
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def extract_files(  # noqa: C901
+    ref: zipfile.ZipFile | tarfile.TarFile,
+    extract_path: str,
+    zip_path: str,
+    remove_unallowed_starting_characters: Callable[[str], str | None],
+):
+
+    logger.info("Extracting files from: " + zip_path)
+
+    if ref.__class__ == zipfile.ZipFile:
+        ref = cast(zipfile.ZipFile, ref)
+        file_names = ref.namelist()
+    elif ref.__class__ == tarfile.TarFile:
+        ref = cast(tarfile.TarFile, ref)
+        file_names = ref.getnames()
+    else:
+        raise Exception("File is not a zip or tar file.")
+
+    # recursively remove files and folders that start with certain characters
+    file_names = [
+        f for f in file_names if remove_unallowed_starting_characters(f)
+    ]
+    logger.info("File names:")
+    logger.info(file_names)
+    folders = [f for f in file_names if f.endswith("/")]
+    logger.info("Folders:")
+    logger.info(folders)
+
+    if len(folders) == 0:
+        logger.info("Extracting all files...")
+
+        for file in file_names:
+            if ref.__class__ == zipfile.ZipFile:
+                ref = cast(zipfile.ZipFile, ref)
+                ref.extract(file, path=extract_path)
+            elif ref.__class__ == tarfile.TarFile:
+                ref = cast(tarfile.TarFile, ref)
+                ref.extract(file, path=extract_path, filter="data")
+            else:
+                raise Exception("File is not a zip or tar file.")
+
+    else:
+        # if all files have the same root any folder can be used to check since all will have the same root if true
+        do_all_files_have_same_root = all(
+            [f.startswith(folders[0]) for f in file_names]
+        )
+        logger.info(
+            "Do all files have the same root? "
+            + str(do_all_files_have_same_root)
+        )
+
+        if do_all_files_have_same_root:
+            # extract all files within the folder with folder of the zipfile that has the same root
+            root_folder_name = folders[0]
+
+            logger.info("Extracting files...")
+            for file in file_names:
+                if file.endswith("/") and file != root_folder_name:
+                    os.makedirs(
+                        os.path.join(
+                            extract_path,
+                            file.removeprefix(root_folder_name),
+                        )
+                    )
+                if not file.endswith("/"):
+                    if ref.__class__ == zipfile.ZipFile:
+                        ref = cast(zipfile.ZipFile, ref)
+                        ref.extract(file, path=extract_path)
+                    elif ref.__class__ == tarfile.TarFile:
+                        ref = cast(tarfile.TarFile, ref)
+                        ref.extract(file, path=extract_path, filter="data")
+                    else:
+                        raise Exception(1, "File is not a zip or tar file.")
+
+                    os.rename(
+                        os.path.join(extract_path, file),
+                        os.path.join(
+                            extract_path,
+                            file.removeprefix(root_folder_name),
+                        ),
+                    )
+
+            # remove the root folder and all other folders
+            shutil.rmtree(os.path.join(extract_path, root_folder_name))
+
+        else:
+            logger.info("Extracting all files...")
+            for file in file_names:
+                if ref.__class__ == zipfile.ZipFile:
+                    ref = cast(zipfile.ZipFile, ref)
+                    ref.extract(file, path=extract_path)
+                elif ref.__class__ == tarfile.TarFile:
+                    ref = cast(tarfile.TarFile, ref)
+                    ref.extract(file, path=extract_path, filter="data")
+                else:
+                    raise Exception(1, "File is not a zip or tar file.")
+
+
+def extract_zip(zip_path: str, extract_path: str):
+    if not os.path.exists(extract_path):
+        os.makedirs(extract_path)
+
+    def remove_unallowed_starting_characters(file_name: str) -> str | None:
+        unallowed_starting_characters = ("_", ".")
+
+        parts = file_name.split("/")
+        for part in parts:
+            if part.startswith(unallowed_starting_characters):
+                return None
+        return file_name
+
+    if zipfile.is_zipfile(zip_path):
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            extract_files(
+                zip_ref,
+                extract_path,
+                zip_path,
+                remove_unallowed_starting_characters,
+            )
+    elif tarfile.is_tarfile(zip_path):
+        with tarfile.open(zip_path, "r") as tar_ref:
+            extract_files(
+                tar_ref,
+                extract_path,
+                zip_path,
+                remove_unallowed_starting_characters,
+            )
+    else:
+        raise Exception(1, "File is not a zip or tar file.")
+
+
+def format_args_for_submission(args):
+    return args
+
+
+def import_submission_function(submission_file_name: str, function_name: str):
+    # Dynamically import function from submission.py
+    try:
+        submission_module = import_module(submission_file_name)
+    except ModuleNotFoundError as e:
+        logger.info(f"ModuleNotFoundError: {submission_file_name} not found")
+        raise e
+
+    try:
+        submission_function: Callable = getattr(
+            submission_module, function_name
+        )
+        function_parameters = list(
+            inspect.signature(submission_function).parameters.keys()
+        )
+    except AttributeError as e:
+        logger.info(
+            f"AttributeError: {function_name} not found in submission module"
+        )
+        raise e
+
+    return submission_function, function_parameters
+
+
+def main():
+    args = sys.argv[1:]
+
+    if len(args) < 1:
+        logger.info("Function name not provided")
+        sys.exit(1)
+
+    submission_file_name = args[0]
+    function_name = args[1]
+
+    submission_zip_file_path = os.path.join(
+        os.path.dirname(__file__), f"{submission_file_name}.zip"
+    )
+
+    logger.info(f"Submission zip file path: {submission_zip_file_path}")
+    extract_zip(submission_zip_file_path, ".")
+
+    if not os.path.exists(submission_file_name):
+        logger.error(f"Submission file not found: {submission_file_name}")
+        sys.exit(1)
+
+    submission_function, function_parameters = import_submission_function(
+        submission_file_name, function_name
+    )
+
+    logger.info(f"Submission file name: {submission_file_name}")
+    logger.info(f"Function name: {function_name}")
+    logger.info(f"Function: {submission_function}")
+    logger.info(f"Function parameters: {function_parameters}")
+
+    submission_args = format_args_for_submission(args[2:])
+
+    results = submission_function(*submission_args)
+    logger.info(f"Results: {results}")
+
+
+if __name__ == "__main__":
+    main()

From 1ddb5abb9b2d680406a9c7a56b882c7c8b7cb399 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <47313912+MitchellAV@users.noreply.github.com>
Date: Thu, 13 Jun 2024 22:58:48 -0700
Subject: [PATCH 03/20] Dev: working first pass with sdt submission

---
 dockerize-workflow/environment/Dockerfile     |  33 +++
 .../environment/requirements.txt              |   0
 dockerize-workflow/environment/submission.zip | Bin 0 -> 654 bytes
 .../environment/submission_wrapper.py         |  95 +++++++++
 dockerize-workflow/environment/unzip.py       | 162 +++++++++++++++
 dockerize-workflow/main.py                    | 188 ++++++++++++++----
 dockerize-workflow/requirements.txt           |   3 +
 7 files changed, 441 insertions(+), 40 deletions(-)
 create mode 100644 dockerize-workflow/environment/Dockerfile
 create mode 100644 dockerize-workflow/environment/requirements.txt
 create mode 100644 dockerize-workflow/environment/submission.zip
 create mode 100644 dockerize-workflow/environment/submission_wrapper.py
 create mode 100644 dockerize-workflow/environment/unzip.py
 create mode 100644 dockerize-workflow/requirements.txt

diff --git a/dockerize-workflow/environment/Dockerfile b/dockerize-workflow/environment/Dockerfile
new file mode 100644
index 00000000..ccddc4ba
--- /dev/null
+++ b/dockerize-workflow/environment/Dockerfile
@@ -0,0 +1,33 @@
+# Use an official Python runtime as the base image
+FROM python:3.11-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+RUN apt-get update 
+
+COPY unzip.py .
+COPY requirements.txt .
+
+# Install the Python dependencies for the submission wrapper
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the submission package into the container
+COPY submission.zip .
+
+# Unzip the submission package
+
+RUN python -m unzip submission.zip submission
+
+WORKDIR /app/submission
+
+# Install the Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+
+# Set the working directory in the container
+WORKDIR /app
+
+COPY submission_wrapper.py .
+# Command to keep the container running without doing anything
+# CMD tail -f /dev/null
\ No newline at end of file
diff --git a/dockerize-workflow/environment/requirements.txt b/dockerize-workflow/environment/requirements.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/dockerize-workflow/environment/submission.zip b/dockerize-workflow/environment/submission.zip
new file mode 100644
index 0000000000000000000000000000000000000000..b3db13edb62879598490484c5a17959465ea229c
GIT binary patch
literal 654
zcmWIWW@Zs#-~d9;ZD%7Gpg;~t3os}!6r~oHW)`L9rskCt>y=cLgof}kuuI>{PLly*
z5M5fq&A`a=Y<Yn$%f_Htsi(Ei>aKXAb?&^U&Lv+@9nUMeC(obP+_u&9it#n$M@$R>
z-s~JUk`LA@0L=&45dgQ{4axR8Mg|5^pzXz_Nx7ND#hLke@#RH{1qG=^dIgnWCx`;A
z76W1sjpl^XfZchA4S4SS6_v_W_`ojmK}=+>RjGJHcOR?A60acFzu!ZXm_J@9di3~R
z!JZl1-Il9%=I~t!d@1(u%@kn|2IgeHSC9C1Z#ya-;Gq#KUQo!$oBmnMxGMH>(yjNl
zUM_n}-o5uqxR`a|<y=mEJ;S?xpX=A`mR&!=yQ}1L_k(cPS=-q&D=Ie}dbMox>CLk*
z?wF{MY|q19F~w`&ktSck->Uy){C(tuX0Okm;vY6APBBxuRlAk>;p`iyLciT!r=Ah|
zLhtvP!_1DiOkP=R_BgF&RC$Gs;qW_gP{92%%ZR%M3^Y*i1b8zti7+D~4mnUk5eEZH
s8bK^lqY>G3P{bhH42l>SSkhPrWRew40p6@^AY+(-@F0-h&IsZG01~|54*&oF

literal 0
HcmV?d00001

diff --git a/dockerize-workflow/environment/submission_wrapper.py b/dockerize-workflow/environment/submission_wrapper.py
new file mode 100644
index 00000000..6105d966
--- /dev/null
+++ b/dockerize-workflow/environment/submission_wrapper.py
@@ -0,0 +1,95 @@
+from importlib import import_module
+import inspect
+import sys
+import pandas as pd
+import numpy as np
+
+from typing import Callable, cast
+
+
+def format_args_for_submission(data_dir: str, args: list[str]):
+    filename = args[0]
+
+    file_path = f"{data_dir}/{filename}"
+
+    df = pd.read_csv(
+        file_path,
+        index_col=0,
+        parse_dates=True,
+    )
+
+    print(df.head(5))
+
+    series: pd.Series = df.asfreq("60min").squeeze()
+
+    submission_args = [series, *args[1:]]
+
+    return submission_args
+
+
+def import_submission_function(submission_file_name: str, function_name: str):
+    # Dynamically import function from submission.py
+    try:
+        submission_module = import_module(submission_file_name)
+    except ModuleNotFoundError as e:
+        print(f"ModuleNotFoundError: {submission_file_name} not found")
+        raise e
+
+    try:
+        submission_function: Callable = getattr(
+            submission_module, function_name
+        )
+        function_parameters = list(
+            inspect.signature(submission_function).parameters.keys()
+        )
+    except AttributeError as e:
+        print(
+            f"AttributeError: {function_name} not found in submission module"
+        )
+        raise e
+
+    return submission_function, function_parameters
+
+
+def main():
+    args = sys.argv[1:]
+
+    if len(args) < 1:
+        print("Function name not provided")
+        sys.exit(1)
+
+    submission_file_name = args[0]
+    function_name = args[1]
+    data_file_name = args[2]
+
+    print("Getting submission function...")
+
+    submission_function, function_parameters = import_submission_function(
+        submission_file_name, function_name
+    )
+    print("Got submission function")
+
+    print(f"Submission file name: {submission_file_name}")
+    print(f"Function name: {function_name}")
+    print(f"Function: {submission_function}")
+    print(f"Function parameters: {function_parameters}")
+
+    data_dir = "/app/data/"
+    results_dir = "/app/results/"
+
+    submission_args = format_args_for_submission(data_dir, args[2:])
+
+    print(f"Submission args: {submission_args}")
+
+    results: np.ndarray = submission_function(*submission_args)
+
+    print(f"Results: {results}")
+
+    # save results to csv file
+    results_df = pd.DataFrame(results)
+    results_file = f"{results_dir}/{data_file_name}"
+    results_df.to_csv(results_file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dockerize-workflow/environment/unzip.py b/dockerize-workflow/environment/unzip.py
new file mode 100644
index 00000000..eeec358c
--- /dev/null
+++ b/dockerize-workflow/environment/unzip.py
@@ -0,0 +1,162 @@
+import sys
+import os
+import zipfile
+import tarfile
+import shutil
+from typing import Callable, cast
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def extract_files(  # noqa: C901
+    ref: zipfile.ZipFile | tarfile.TarFile,
+    extract_path: str,
+    zip_path: str,
+    remove_unallowed_starting_characters: Callable[[str], str | None],
+):
+
+    logger.info("Extracting files from: " + zip_path)
+
+    if ref.__class__ == zipfile.ZipFile:
+        ref = cast(zipfile.ZipFile, ref)
+        file_names = ref.namelist()
+    elif ref.__class__ == tarfile.TarFile:
+        ref = cast(tarfile.TarFile, ref)
+        file_names = ref.getnames()
+    else:
+        raise Exception("File is not a zip or tar file.")
+
+    # recursively remove files and folders that start with certain characters
+    file_names = [
+        f for f in file_names if remove_unallowed_starting_characters(f)
+    ]
+    logger.info("File names:")
+    logger.info(file_names)
+    folders = [f for f in file_names if f.endswith("/")]
+    logger.info("Folders:")
+    logger.info(folders)
+
+    if len(folders) == 0:
+        logger.info("Extracting all files...")
+
+        for file in file_names:
+            if ref.__class__ == zipfile.ZipFile:
+                ref = cast(zipfile.ZipFile, ref)
+                ref.extract(file, path=extract_path)
+            elif ref.__class__ == tarfile.TarFile:
+                ref = cast(tarfile.TarFile, ref)
+                ref.extract(file, path=extract_path, filter="data")
+            else:
+                raise Exception("File is not a zip or tar file.")
+
+    else:
+        # if all files have the same root any folder can be used to check since all will have the same root if true
+        do_all_files_have_same_root = all(
+            [f.startswith(folders[0]) for f in file_names]
+        )
+        logger.info(
+            "Do all files have the same root? "
+            + str(do_all_files_have_same_root)
+        )
+
+        if do_all_files_have_same_root:
+            # extract all files within the folder with folder of the zipfile that has the same root
+            root_folder_name = folders[0]
+
+            logger.info("Extracting files...")
+            for file in file_names:
+                if file.endswith("/") and file != root_folder_name:
+                    os.makedirs(
+                        os.path.join(
+                            extract_path,
+                            file.removeprefix(root_folder_name),
+                        )
+                    )
+                if not file.endswith("/"):
+                    if ref.__class__ == zipfile.ZipFile:
+                        ref = cast(zipfile.ZipFile, ref)
+                        ref.extract(file, path=extract_path)
+                    elif ref.__class__ == tarfile.TarFile:
+                        ref = cast(tarfile.TarFile, ref)
+                        ref.extract(file, path=extract_path, filter="data")
+                    else:
+                        raise Exception(1, "File is not a zip or tar file.")
+
+                    os.rename(
+                        os.path.join(extract_path, file),
+                        os.path.join(
+                            extract_path,
+                            file.removeprefix(root_folder_name),
+                        ),
+                    )
+
+            # remove the root folder and all other folders
+            shutil.rmtree(os.path.join(extract_path, root_folder_name))
+
+        else:
+            logger.info("Extracting all files...")
+            for file in file_names:
+                if ref.__class__ == zipfile.ZipFile:
+                    ref = cast(zipfile.ZipFile, ref)
+                    ref.extract(file, path=extract_path)
+                elif ref.__class__ == tarfile.TarFile:
+                    ref = cast(tarfile.TarFile, ref)
+                    ref.extract(file, path=extract_path, filter="data")
+                else:
+                    raise Exception(1, "File is not a zip or tar file.")
+
+
+def extract_zip(zip_path: str, extract_path: str):
+    if not os.path.exists(extract_path):
+        os.makedirs(extract_path)
+
+    def remove_unallowed_starting_characters(file_name: str) -> str | None:
+        unallowed_starting_characters = ("_", ".")
+
+        parts = file_name.split("/")
+        for part in parts:
+            if part.startswith(unallowed_starting_characters):
+                return None
+        return file_name
+
+    if zipfile.is_zipfile(zip_path):
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            extract_files(
+                zip_ref,
+                extract_path,
+                zip_path,
+                remove_unallowed_starting_characters,
+            )
+    elif tarfile.is_tarfile(zip_path):
+        with tarfile.open(zip_path, "r") as tar_ref:
+            extract_files(
+                tar_ref,
+                extract_path,
+                zip_path,
+                remove_unallowed_starting_characters,
+            )
+    else:
+        raise Exception(1, "File is not a zip or tar file.")
+
+
+def main():
+    args = sys.argv[1:]
+
+    if len(args) < 1:
+        logger.info("Function name not provided")
+        sys.exit(1)
+
+    zip_file_path = args[0]
+    extract_path = args[1]
+
+    submission_zip_file_path = os.path.join(
+        os.path.dirname(__file__), f"{zip_file_path}"
+    )
+
+    logger.info(f"Submission zip file path: {submission_zip_file_path}")
+    extract_zip(submission_zip_file_path, extract_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dockerize-workflow/main.py b/dockerize-workflow/main.py
index 341baaff..ccfc936e 100644
--- a/dockerize-workflow/main.py
+++ b/dockerize-workflow/main.py
@@ -1,12 +1,20 @@
 import os
-from typing import Any
+from sys import stdout
+from typing import Any, cast
+import docker.models
+import docker.models.containers
 from prefect import flow, task
 from prefect_dask.task_runners import DaskTaskRunner
 import docker
+from docker.models.images import Image
+from docker.models.containers import Container
+from docker.errors import ImageNotFound
 
 
 def docker_task(
     client: docker.DockerClient,
+    image: str,
+    memory_limit: str,
     submission_file_name: str,
     submission_function_name: str,
     submission_args: list[Any] | None = None,
@@ -17,54 +25,132 @@ def docker_task(
 
     # Define volumes to mount
     results_dir = os.path.join(os.path.dirname(__file__), "results")
+    data_dir = os.path.join(os.path.dirname(__file__), "data")
 
     volumes = {
         results_dir: {"bind": "/app/results/", "mode": "rw"},
+        data_dir: {"bind": "/app/data/", "mode": "ro"},
     }
 
     # Execute docker image in a container
-    print("Docker container started")
-    container = client.containers.run(
-        "submission:latest",
-        command=[
-            "python",
-            "submission_wrapper.py",
-            submission_file_name,
-            submission_function_name,
-            *submission_args,
-        ],
-        auto_remove=True,
-        volumes=volumes,
-    )
-    print(container)
+    container = None
+    try:
+        print("Docker container starting...")
+        print(f"Image: {image}")
+        print(f"Submission file name: {submission_file_name}")
+        print(f"Submission function name: {submission_function_name}")
+        print(f"Submission args: {submission_args}")
+        container = cast(
+            Container,
+            client.containers.run(
+                image,
+                command=[
+                    "python",
+                    "submission_wrapper.py",
+                    submission_file_name,
+                    submission_function_name,
+                    *submission_args,
+                ],
+                volumes=volumes,
+                detach=True,
+                stdout=True,
+                stderr=True,
+                mem_limit=f"{memory_limit}g",
+            ),
+        )
+
+        print("Docker container started")
+        print(container.id)
 
+        # Wait for container to finish
+        for line in container.logs(stream=True):
+            line = cast(str, line)
+            print(line.strip())
 
-def create_docker_image():
+        container.wait()
+
+    except Exception as e:
+        print(f"Error: {e}")
+        raise e
+    finally:
+        if container:
+            if container.status == "running":
+                print("Docker container stopping...")
+                container.stop()
+                print("Docker container stopped")
+            print("Docker container removing...")
+            container.remove()
+            print("Docker container removed")
+
+
+def initialize_docker_client():
     client = docker.from_env()
+    return client
+
 
-    file_path = os.path.join(os.path.dirname(__file__), "submission")
+def create_docker_image(
+    tag: str,
+    client: docker.DockerClient,
+    overwrite: bool = False,
+):
+
+    file_path = os.path.join(os.path.dirname(__file__), "environment")
 
     print(file_path)
 
-    # Create docker image from Dockerfile
-    image, index = client.images.build(path=file_path, tag="submission:latest")
-    print("Docker image created")
+    # Check if Dockerfile exists
+    if not os.path.exists(os.path.join(file_path, "Dockerfile")):
+        raise FileNotFoundError("Dockerfile not found")
+
+    # Check if docker image already exists
+
+    image = None
+
+    if not overwrite:
+        try:
+            image = client.images.get(tag)
+        except ImageNotFound:
+            print("Docker image not found")
+        except Exception as e:
+            print(f"Error: {e}")
+            raise e
+
+    if image:
+        print("Docker image already exists")
+        print(image)
+        return image
+    else:
+        print("Docker image does not exist")
+
+        # Create docker image from Dockerfile
+        image, build_logs = client.images.build(
+            path=file_path, tag=tag, dockerfile="Dockerfile"
+        )
 
-    return client
+        for log in build_logs:
+            if "stream" in log:
+                print(log["stream"].strip())
+
+        print("Docker image created")
+
+        return image
 
 
 @task
-def main_task(index: int):
+def main_task(image: str, memory_limit: str, data_filepath: str):
     client = None
     try:
-        client = create_docker_image()
+        client = initialize_docker_client()
+        # image = create_docker_image(client, prefect_logger)
 
-        submission_file_name = "submission"
-        submission_function_name = "submission_function"
-        submission_args = [str(index)]
+        submission_file_name = "submission.submission_wrapper"
+        submission_function_name = "detect_time_shifts"
+        submission_args = [data_filepath]
 
         docker_task(
             client,
+            image,
+            memory_limit,
             submission_file_name,
             submission_function_name,
             submission_args,
@@ -76,23 +162,45 @@ def main_task(index: int):
             client.close()
 
 
-@flow(
-    task_runner=DaskTaskRunner(
-        cluster_kwargs={
-            "n_workers": 4,
-            "threads_per_worker": 1,
-            "memory_limit": "8GiB",
-        }
-    ),
-    log_prints=False,
-)
-def main_flow():
-    for i in range(5):
-        main_task.submit(i)
+def main_flow(memory_limit: str):
+    tag: str = "submission:latest"
+
+    client = None
+    try:
+        client = initialize_docker_client()
+        image = create_docker_image(tag, client, overwrite=True)
+    except Exception as e:
+        print(f"Error: {e}")
+    finally:
+        if client:
+            client.close()
+
+    data_files = os.listdir("data")
+    print(data_files)
+
+    if not data_files:
+        raise FileNotFoundError("No data files found")
+
+    files = data_files[:1]
+
+    for filepath in files:
+        main_task.submit(tag, memory_limit, filepath)
 
 
 def main():
-    main_flow()
+
+    memory_limit = "8"
+
+    flow(
+        task_runner=DaskTaskRunner(
+            cluster_kwargs={
+                "n_workers": 1,
+                "threads_per_worker": 1,
+                "memory_limit": f"{memory_limit}GiB",
+            }
+        ),
+        log_prints=True,
+    )(main_flow)(memory_limit)
 
 
 if __name__ == "__main__":
diff --git a/dockerize-workflow/requirements.txt b/dockerize-workflow/requirements.txt
new file mode 100644
index 00000000..b4345058
--- /dev/null
+++ b/dockerize-workflow/requirements.txt
@@ -0,0 +1,3 @@
+prefect
+docker
+bokeh
\ No newline at end of file

From c872deadbacfd2f3d3ed11f26024c678102cc71e Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <47313912+MitchellAV@users.noreply.github.com>
Date: Fri, 14 Jun 2024 16:16:59 -0700
Subject: [PATCH 04/20] Cleaned folder and made small changes

---
 .../environment/submission_wrapper.py         |  58 ++++-
 dockerize-workflow/main.py                    |   4 +-
 dockerize-workflow/submission/Dockerfile      |  28 ---
 .../submission/requirements.txt               |   1 -
 .../submission/sdt-submission.zip             | Bin 2932 -> 0 bytes
 dockerize-workflow/submission/submission.py   |  24 --
 dockerize-workflow/submission/submission.zip  | Bin 650 -> 0 bytes
 .../submission/submission_wrapper.py          | 211 ------------------
 8 files changed, 57 insertions(+), 269 deletions(-)
 delete mode 100644 dockerize-workflow/submission/Dockerfile
 delete mode 100644 dockerize-workflow/submission/requirements.txt
 delete mode 100644 dockerize-workflow/submission/sdt-submission.zip
 delete mode 100644 dockerize-workflow/submission/submission.py
 delete mode 100644 dockerize-workflow/submission/submission.zip
 delete mode 100644 dockerize-workflow/submission/submission_wrapper.py

diff --git a/dockerize-workflow/environment/submission_wrapper.py b/dockerize-workflow/environment/submission_wrapper.py
index 6105d966..42d7f6c4 100644
--- a/dockerize-workflow/environment/submission_wrapper.py
+++ b/dockerize-workflow/environment/submission_wrapper.py
@@ -3,8 +3,58 @@
 import sys
 import pandas as pd
 import numpy as np
+from time import perf_counter
+from functools import wraps
+from typing import Any, Union, Tuple, TypeVar, Callable, cast
+from logging import Logger
+import logging
 
-from typing import Callable, cast
+T = TypeVar("T")
+
+
+def logger_if_able(
+    message: str, logger: Logger | None = None, level: str = "INFO"
+):
+    """Log ``message`` on ``logger`` at ``level``; print it if no logger.
+
+    ``level`` is upper-cased and must then be one of DEBUG, INFO, WARNING,
+    ERROR or CRITICAL; anything else raises ``Exception``. When ``logger``
+    is None the message goes to ``print`` and ``level`` is neither
+    validated nor used.
+    """
+    if logger is None:
+        print(message)
+        return
+
+    level_name = level.upper()
+    known_levels = ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL")
+    if level_name not in known_levels:
+        raise Exception(f"Invalid log level: {level_name}")
+
+    # Every accepted name is also the logging module's constant for it.
+    numeric_level = getattr(logging, level_name)
+    logger.log(numeric_level, message)
+
+
+def timing(verbose: bool = True, logger: Union[Logger, None] = None):
+    """Return a decorator that makes a function return (result, seconds).
+
+    With ``verbose`` set, the duration is reported via ``logger_if_able``.
+    """
+    # Not ``@wraps(timing)``: that would relabel the decorator as ``timing``.
+    def decorator(func: Callable[..., T]) -> Callable[..., Tuple[T, float]]:
+        @wraps(func)
+        def wrapper(*args, **kwargs) -> Tuple[T, float]:
+            start_time = perf_counter()
+            result = func(*args, **kwargs)
+            elapsed = perf_counter() - start_time
+            if verbose:
+                msg = f"{func.__name__} took {elapsed:.3f} seconds to run"
+                logger_if_able(msg, logger)
+            return result, elapsed
+        return wrapper
+
+    return decorator
 
 
 def format_args_for_submission(data_dir: str, args: list[str]):
@@ -36,7 +86,7 @@ def import_submission_function(submission_file_name: str, function_name: str):
         raise e
 
     try:
-        submission_function: Callable = getattr(
+        submission_function: Callable[[pd.Series, Any], np.ndarray] = getattr(
             submission_module, function_name
         )
         function_parameters = list(
@@ -81,7 +131,9 @@ def main():
 
     print(f"Submission args: {submission_args}")
 
-    results: np.ndarray = submission_function(*submission_args)
+    results, execution_time = timing()(submission_function)(*submission_args)
+
+    print(f"Execution time: {execution_time}")
 
     print(f"Results: {results}")
 
diff --git a/dockerize-workflow/main.py b/dockerize-workflow/main.py
index ccfc936e..1e231c57 100644
--- a/dockerize-workflow/main.py
+++ b/dockerize-workflow/main.py
@@ -181,7 +181,7 @@ def main_flow(memory_limit: str):
     if not data_files:
         raise FileNotFoundError("No data files found")
 
-    files = data_files[:1]
+    files = data_files
 
     for filepath in files:
         main_task.submit(tag, memory_limit, filepath)
@@ -194,7 +194,7 @@ def main():
     flow(
         task_runner=DaskTaskRunner(
             cluster_kwargs={
-                "n_workers": 1,
+                "n_workers": 3,
                 "threads_per_worker": 1,
                 "memory_limit": f"{memory_limit}GiB",
             }
diff --git a/dockerize-workflow/submission/Dockerfile b/dockerize-workflow/submission/Dockerfile
deleted file mode 100644
index 5bb83143..00000000
--- a/dockerize-workflow/submission/Dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-# Use an official Python runtime as the base image
-FROM python:3.11-slim
-
-# Set the working directory in the container
-WORKDIR /app
-
-RUN apt-get update 
-
-COPY submission_wrapper.py /app
-COPY requirements.txt /app
-
-# Install the Python dependencies for the submission wrapper
-RUN pip install --no-cache-dir -r requirements.txt
-
-
-# Copy the submission package into the container
-COPY submission.zip /app
-
-# Unzip the submission package
-
-RUN apt-get install -y unzip
-
-RUN unzip submission.zip -d submission
-
-WORKDIR /app/submission
-
-# Install the Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
diff --git a/dockerize-workflow/submission/requirements.txt b/dockerize-workflow/submission/requirements.txt
deleted file mode 100644
index 296d6545..00000000
--- a/dockerize-workflow/submission/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-numpy
\ No newline at end of file
diff --git a/dockerize-workflow/submission/sdt-submission.zip b/dockerize-workflow/submission/sdt-submission.zip
deleted file mode 100644
index 9af57e322af4454721dbffff582bba0cd4d90fde..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2932
zcmWIWW@Zs#00H&Mi4kB1ln`Q2U?@o~E{QKLP0GzIF3!x)Q!vpF4dG>Azn%OjRq6z5
znj8?9R&X;gvdCZD!o{*NXjW<f*gOsf4ixk50L_#Enin7M>*(ws9HFllk6|>5$%U9E
zCnty|rza#OeDL)N`@kR85zxRiL4sLbK$`KRmIB+y_XYy2GR{s-3w$Q@X*#@VT-G9}
zcy9ij`KzZ#1WcH`YWnOMbC^X^l7F1qHhcR1dzzI&tJim4{dbkWW3#v{14Do}JBJI$
z4YpXILqU#3ctZ;*SXd+&7^JZVi=Inxd~ivAQ7YIM4}jqV!k{oh3!k?$toaTZ2)Ly8
za(kQTY2Gbh$V+fN$+?4pHGIMMi;A}dH;DuVs5Vtj`Wn_%pDE+daO1;{_y!(@gT2Of
zlWvH$tTQ{O*dpN6QJcK3<W8BiQ-`e4eE!yTCB@S7*{#ZwjbdMTnQz}1zd(8J3m@@a
zm(HGbJ}qI_qb)7mW!M_A@OAilog?=P3ZDM_ZJ%iRpIfm`qWf{F%m+S}(|qg8UcYMl
z>h17he(UnQoQ2wV@9g5>*eL1`3Ysj2j@(VaU;zaH_MoW%%Bms9Fh=|W;~aY^krdY(
zr3IRQh&r5M2!5z=pn=(<t%b#Lrl>*e3rHMtcO0-V1v&-f9E3xa$`MIh94MoNH4ckX
z3rjPLQgc)DN{aPLDoVgnDPa64RR$Q1SfcW@_F3H(PqfaR_td%M>#5^;Mfc?S^P10?
zz=`Saf5~%_K&wF3VRwi<BLjmjp%BCCCQ<^<(7@mmQ>McFi}M;a9!LlYu_QK@-eG){
zVxoE`ppcP`ZRL|cF?)|RHO&dxJLkxwNtYg7nKI?bq>!f#2`g@yJG?n~;6Q%>I1d@#
zXYnX7Ht0}#!qxTZbkJv^&y#{BD|+RZHigvwezA;s(M^eI|1u*ttUc$NynYh@WdE5j
z-5>pF=97?=$hrBebla}0`E1}2x?LpawHz2apm0EhP*H771W*tR>VPupz!1VrN{AvU
zzPu>0pdhtKub>hfUzxzT$^v2#jh6LF19s;fHsHDQS5zuj;RCzK2QiVkR;A(*-F>Vc
zOT2<y|9%fmV*Yrc=+WbM1$$<2cU!L7nZtJ_@TJ(pH&cW?7?_j&UOnR5z3r%UfQLq`
zctIf}Z~A93<Eq%lNw?nHdb#W^dH3Ec;bPW-mvcGw^$hR&eXd`#TXy{f@2-;1-4DWD
zXKiQCtf<^@=+&~#r#H{OxMQM1vON!b#T2i7N1A*Ef2;nJ@%NDrn!P@MihtOgIK@oq
zR_#{ihqG^-3jKC_oq9&-3%%cG4l_I6GI?dO+2gd9QRNjjhQsf~LD}<{Sw`G7V32_V
z40}*2Gchn20fQ1=0Aq%!9#TP%D^y8Ladj-09PB0=F$W$nR8iqL#Pl+uguzH|v1Vdx
z7q@|dhwAg^uT;fEo-BFwR7Gt16tU&YmZ(fyqQ%bQJjIBOfs>z&t*&L)og+_VI_~f&
zOgwoYp-*Lc)Fh58^E0Ntef?<ij488(=SPPG#U%wznHU|DH!F8eMo3KL%($$av{}<Q
z3zd~8JV;9raGentFeQH4tm!kR&*7T%oKfhJ#;4Y$ovxjgsxvEB@;nRj$y5mbx9`kD
zri(c|-v77g+z5NVXk+-v_LIxcd{LHet_VK3o|}8`W3_$OYjRF+TRdZi!JjWzm%CrA
ze_Y<a^5n$Z`>(hA7WnDiFDd-1*n6YfJlwzX^Y^2-=RWmWCYyc!PlT|Q@5{oxO_SzN
zd+WAOIPhWjkGNdT#OFKa{i~Sv_L^~UnB>*e9agKh9y@g;yMDr#zLHg!=Y79ql_lz)
z?EcLDXNCTP_qouVqj$yJV1vOXGh<@|BOep9?R*|S{<_=JK(Wrw5&gW>tPPj~Kye-5
z&B!FejJslo<}il0jvy9hVnC^ku~zdC^I;V|a@`54=waY(V*ro|PjzHe`!Jt?Yy#EA
z$W;-jE{1_6jp9JRWBLSpor9$~N4E&MDgo7OFtDW22pGU{FJZBf_<9c6PEaL<Tx)<T
zDHvGNcpGRZrd4>W2LgTs*@s*PgX$s}SkjonOu&zLt;SMLAbS;5$Rn3oph6x7mNd4p
tVDl<jMLxQv$XNtb9Kyho#*3^3{7cw=te$3N1K9yeyzRgcz0MBe0RSI2<#_-A

diff --git a/dockerize-workflow/submission/submission.py b/dockerize-workflow/submission/submission.py
deleted file mode 100644
index 1d838cc5..00000000
--- a/dockerize-workflow/submission/submission.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import numpy as np
-from time import sleep
-from time import perf_counter
-import csv
-
-
-def submission_function(index: str = "0", *args, **kwargs):
-    start = perf_counter()
-
-    random_number = np.random.rand()
-    sleep_time = random_number * 15
-    sleep(sleep_time)
-    end = perf_counter()
-    time_elapsed = end - start
-
-    print(f"{index} : {time_elapsed}")
-
-    # Write time elapsed to a csv file
-    with open(f"results/{index}.csv", mode="w") as file:
-        writer = csv.writer(file)
-        writer.writerow([time_elapsed])
-
-    print("Submission function completed successfully!")
-    return time_elapsed
diff --git a/dockerize-workflow/submission/submission.zip b/dockerize-workflow/submission/submission.zip
deleted file mode 100644
index e837e3d1336ba248c27145f0840528161ecad52e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 650
zcmWIWW@h1H00FhaGZ9T~8oyY9Y!DV;kYOlFEiBC}O3h8pD=F41sVE5z;bdSg=gUf~
zQ_4y!t>9*0WO=r{K$m4>(5%$F(%gc|0HARqK+PNsFSE`@RK(1RH)UjC$Yf$*-~}32
zT$+@dSzMf%pQl$)2{y|tG&{{4h|$c-4Lt34*g#;<Z*8$+Pc4n7QJV1w8=0K~XRSGS
zNWU`IDfH|8kIGARCv?xfJM-qpHCN94Vo=N4-rjh?LT%4gfhBGewt8}XoWIFg|JOw~
z;R~CWnq5*)&T(QCDUTBJ%&rnVD{*P*#I!@}bpt+JV7&jqI)e9K@1>2uW_(_I*T_S@
zRWY+PTzJ;Bsq^H|q?FF*d2eu9V*9$KMJgeJC;EOTZTo#R;^5<FJ(=fDoN_V#D?3M8
z^3q}{nMAjgJGG`!wU+w687%fw+~pFv+r3Xs)c=?N@aE|T)030qHW)m4<KWTCYIgbq
z-?fFFljJm4Iq-LXv0@A^x#f4`<IL@qxiiBzYd^73Y@a;g^{>k*F2A`qetC0ePfS&9
zuKxTZ^Xtqwm-`;d*cvP`skG~Hhl`Td*8LAln~n?Y-PzSs%N*d%$Rx*%D>5a3vBJOr
zj0}b)jUX0g)UraN7A-0urs9fIWK;ctrlQ3#&`3}WV;ITG266%u5UvE$r$Me_006()
B?NR^$

diff --git a/dockerize-workflow/submission/submission_wrapper.py b/dockerize-workflow/submission/submission_wrapper.py
deleted file mode 100644
index c5857aaf..00000000
--- a/dockerize-workflow/submission/submission_wrapper.py
+++ /dev/null
@@ -1,211 +0,0 @@
-from importlib import import_module
-import inspect
-import sys
-import os
-import zipfile
-import tarfile
-import shutil
-from typing import Callable, cast
-
-import logging
-
-logger = logging.getLogger(__name__)
-
-
-def extract_files(  # noqa: C901
-    ref: zipfile.ZipFile | tarfile.TarFile,
-    extract_path: str,
-    zip_path: str,
-    remove_unallowed_starting_characters: Callable[[str], str | None],
-):
-
-    logger.info("Extracting files from: " + zip_path)
-
-    if ref.__class__ == zipfile.ZipFile:
-        ref = cast(zipfile.ZipFile, ref)
-        file_names = ref.namelist()
-    elif ref.__class__ == tarfile.TarFile:
-        ref = cast(tarfile.TarFile, ref)
-        file_names = ref.getnames()
-    else:
-        raise Exception("File is not a zip or tar file.")
-
-    # recursively remove files and folders that start with certain characters
-    file_names = [
-        f for f in file_names if remove_unallowed_starting_characters(f)
-    ]
-    logger.info("File names:")
-    logger.info(file_names)
-    folders = [f for f in file_names if f.endswith("/")]
-    logger.info("Folders:")
-    logger.info(folders)
-
-    if len(folders) == 0:
-        logger.info("Extracting all files...")
-
-        for file in file_names:
-            if ref.__class__ == zipfile.ZipFile:
-                ref = cast(zipfile.ZipFile, ref)
-                ref.extract(file, path=extract_path)
-            elif ref.__class__ == tarfile.TarFile:
-                ref = cast(tarfile.TarFile, ref)
-                ref.extract(file, path=extract_path, filter="data")
-            else:
-                raise Exception("File is not a zip or tar file.")
-
-    else:
-        # if all files have the same root any folder can be used to check since all will have the same root if true
-        do_all_files_have_same_root = all(
-            [f.startswith(folders[0]) for f in file_names]
-        )
-        logger.info(
-            "Do all files have the same root? "
-            + str(do_all_files_have_same_root)
-        )
-
-        if do_all_files_have_same_root:
-            # extract all files within the folder with folder of the zipfile that has the same root
-            root_folder_name = folders[0]
-
-            logger.info("Extracting files...")
-            for file in file_names:
-                if file.endswith("/") and file != root_folder_name:
-                    os.makedirs(
-                        os.path.join(
-                            extract_path,
-                            file.removeprefix(root_folder_name),
-                        )
-                    )
-                if not file.endswith("/"):
-                    if ref.__class__ == zipfile.ZipFile:
-                        ref = cast(zipfile.ZipFile, ref)
-                        ref.extract(file, path=extract_path)
-                    elif ref.__class__ == tarfile.TarFile:
-                        ref = cast(tarfile.TarFile, ref)
-                        ref.extract(file, path=extract_path, filter="data")
-                    else:
-                        raise Exception(1, "File is not a zip or tar file.")
-
-                    os.rename(
-                        os.path.join(extract_path, file),
-                        os.path.join(
-                            extract_path,
-                            file.removeprefix(root_folder_name),
-                        ),
-                    )
-
-            # remove the root folder and all other folders
-            shutil.rmtree(os.path.join(extract_path, root_folder_name))
-
-        else:
-            logger.info("Extracting all files...")
-            for file in file_names:
-                if ref.__class__ == zipfile.ZipFile:
-                    ref = cast(zipfile.ZipFile, ref)
-                    ref.extract(file, path=extract_path)
-                elif ref.__class__ == tarfile.TarFile:
-                    ref = cast(tarfile.TarFile, ref)
-                    ref.extract(file, path=extract_path, filter="data")
-                else:
-                    raise Exception(1, "File is not a zip or tar file.")
-
-
-def extract_zip(zip_path: str, extract_path: str):
-    if not os.path.exists(extract_path):
-        os.makedirs(extract_path)
-
-    def remove_unallowed_starting_characters(file_name: str) -> str | None:
-        unallowed_starting_characters = ("_", ".")
-
-        parts = file_name.split("/")
-        for part in parts:
-            if part.startswith(unallowed_starting_characters):
-                return None
-        return file_name
-
-    if zipfile.is_zipfile(zip_path):
-        with zipfile.ZipFile(zip_path, "r") as zip_ref:
-            extract_files(
-                zip_ref,
-                extract_path,
-                zip_path,
-                remove_unallowed_starting_characters,
-            )
-    elif tarfile.is_tarfile(zip_path):
-        with tarfile.open(zip_path, "r") as tar_ref:
-            extract_files(
-                tar_ref,
-                extract_path,
-                zip_path,
-                remove_unallowed_starting_characters,
-            )
-    else:
-        raise Exception(1, "File is not a zip or tar file.")
-
-
-def format_args_for_submission(args):
-    return args
-
-
-def import_submission_function(submission_file_name: str, function_name: str):
-    # Dynamically import function from submission.py
-    try:
-        submission_module = import_module(submission_file_name)
-    except ModuleNotFoundError as e:
-        logger.info(f"ModuleNotFoundError: {submission_file_name} not found")
-        raise e
-
-    try:
-        submission_function: Callable = getattr(
-            submission_module, function_name
-        )
-        function_parameters = list(
-            inspect.signature(submission_function).parameters.keys()
-        )
-    except AttributeError as e:
-        logger.info(
-            f"AttributeError: {function_name} not found in submission module"
-        )
-        raise e
-
-    return submission_function, function_parameters
-
-
-def main():
-    args = sys.argv[1:]
-
-    if len(args) < 1:
-        logger.info("Function name not provided")
-        sys.exit(1)
-
-    submission_file_name = args[0]
-    function_name = args[1]
-
-    submission_zip_file_path = os.path.join(
-        os.path.dirname(__file__), f"{submission_file_name}.zip"
-    )
-
-    logger.info(f"Submission zip file path: {submission_zip_file_path}")
-    extract_zip(submission_zip_file_path, ".")
-
-    if not os.path.exists(submission_file_name):
-        logger.error(f"Submission file not found: {submission_file_name}")
-        sys.exit(1)
-
-    submission_function, function_parameters = import_submission_function(
-        submission_file_name, function_name
-    )
-
-    logger.info(f"Submission file name: {submission_file_name}")
-    logger.info(f"Function name: {function_name}")
-    logger.info(f"Function: {submission_function}")
-    logger.info(f"Function parameters: {function_parameters}")
-
-    submission_args = format_args_for_submission(args[2:])
-
-    results = submission_function(*submission_args)
-    logger.info(f"Results: {results}")
-
-
-if __name__ == "__main__":
-    main()

From d22b774faaf2936f3af1cc235c3cec98659501a3 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <47313912+MitchellAV@users.noreply.github.com>
Date: Tue, 18 Jun 2024 17:53:44 -0700
Subject: [PATCH 05/20] Added docker to dev folder and determined Prefect is
 not necessary

---
 dockerize-workflow/.dockerignore              |  34 ++++
 dockerize-workflow/Dockerfile                 |  20 ++
 dockerize-workflow/docker-compose.yaml        |  16 ++
 .../environment/requirements.txt              |   2 +
 dockerize-workflow/environment/submission.zip | Bin 654 -> 659 bytes
 .../environment/submission_wrapper.py         |   5 +
 dockerize-workflow/main.py                    | 171 ++++++++++++++++--
 dockerize-workflow/requirements.txt           |   4 +-
 8 files changed, 238 insertions(+), 14 deletions(-)
 create mode 100644 dockerize-workflow/.dockerignore
 create mode 100644 dockerize-workflow/Dockerfile
 create mode 100644 dockerize-workflow/docker-compose.yaml

diff --git a/dockerize-workflow/.dockerignore b/dockerize-workflow/.dockerignore
new file mode 100644
index 00000000..8429eef8
--- /dev/null
+++ b/dockerize-workflow/.dockerignore
@@ -0,0 +1,34 @@
+# Ignore files and directories generated by the operating system
+.DS_Store
+Thumbs.db
+
+# Ignore build artifacts
+node_modules
+dist
+build
+
+# Ignore development and debugging files
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.vscode
+
+# Ignore editor-specific files
+*.swp
+*.swo
+*.bak
+
+# Ignore project-specific files
+.env
+.env.local
+.env.*.local
+.env.development
+.env.test
+.env.production
+
+# Ignore any other files or directories as needed
+__pycache__
+.venv
+.dockerignore
+Dockerfile
+docker-compose.yaml
\ No newline at end of file
diff --git a/dockerize-workflow/Dockerfile b/dockerize-workflow/Dockerfile
new file mode 100644
index 00000000..9d6cf407
--- /dev/null
+++ b/dockerize-workflow/Dockerfile
@@ -0,0 +1,20 @@
+# Use an official Python runtime as the base image
+FROM python:3.12-alpine
+
+# Set the working directory to /app
+WORKDIR /app
+
+# Docker CLI plus a C build toolchain (presumably for compiled Python deps)
+RUN apk add --no-cache docker-cli gcc python3-dev musl-dev linux-headers
+
+# Install dependencies before copying the source so this layer stays cached
+COPY requirements.txt .
+RUN python3 -m venv .venv && \
+    .venv/bin/pip install --upgrade pip && \
+    .venv/bin/pip install --no-cache-dir -r requirements.txt
+
+# Copy the current directory contents into the container at /app
+COPY . .
+
+# run container without closing
+CMD ["tail", "-f", "/dev/null"]
\ No newline at end of file
diff --git a/dockerize-workflow/docker-compose.yaml b/dockerize-workflow/docker-compose.yaml
new file mode 100644
index 00000000..75012ec8
--- /dev/null
+++ b/dockerize-workflow/docker-compose.yaml
@@ -0,0 +1,16 @@
+services:
+  dockerize-workflow:
+    build:
+      context: ./
+    env_file:
+      - .env
+    ports:
+      - "8787:8787"
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - ./certs:/app/certs
+      - ./main.py:/app/main.py
+      - ./requirements.txt:/app/requirements.txt
+      - ./results:/app/results
+      - ./data:/app/data
+      - ./environment:/app/environment
diff --git a/dockerize-workflow/environment/requirements.txt b/dockerize-workflow/environment/requirements.txt
index e69de29b..87827093 100644
--- a/dockerize-workflow/environment/requirements.txt
+++ b/dockerize-workflow/environment/requirements.txt
@@ -0,0 +1,2 @@
+numpy
+solar-data-tools==1.3.2
diff --git a/dockerize-workflow/environment/submission.zip b/dockerize-workflow/environment/submission.zip
index b3db13edb62879598490484c5a17959465ea229c..af0ad14a92b64aafa0871004a19772a9ec24ef6e 100644
GIT binary patch
delta 285
zcmeBUoy=Mv;LXe;!oUH9oK2S^7@$B6NDDA1FchU0mSz^E=BDPA6zi2#l!S)xGO#;|
z7Nt1@F^Dd$;AUWCdA7Vjmt|wntW@oD=RI{U`FiSjUeP^y{=DY4t)^FuuNgl&rF~X+
z1xsPinJ+#&7y`W6Id1YyOi~3}A_%ex;f6PjmnL>=bMpXAhT@F}nHYINPJ`)Cn|y&$
z%MFhk0=yZSM3^Cd0-6``%Pb@A8qk(HMg|5^pr2r1Nn`V52PRo$%O+<qDX{GUs$*aP
E0L)QN+yDRo

delta 216
zcmbQt+Q%9i;LXe;!oUH9p4-kwFhGGEkQQK2U?@s0EX^!R%}vcKDb_2gC<zVWWnh=S
zm7OL7#2~t~f}4Sn<=OHAU6zeOvr<oMpVeLQL~EkIxI7a>fHymbjpT#13P8g^1|qC-
z+je&11Z{3npaEh)45T+6V`Ah5xd5(V@>NDHZX~NFb1}&<12s<8WRhmCV_=xLKpNSA
P$+=7lY}*-u)-V77T(Unp

diff --git a/dockerize-workflow/environment/submission_wrapper.py b/dockerize-workflow/environment/submission_wrapper.py
index 42d7f6c4..50aca83e 100644
--- a/dockerize-workflow/environment/submission_wrapper.py
+++ b/dockerize-workflow/environment/submission_wrapper.py
@@ -142,6 +142,11 @@ def main():
     results_file = f"{results_dir}/{data_file_name}"
     results_df.to_csv(results_file)
 
+    execution_tuple = (data_file_name, execution_time)
+    execution_file = f"{results_dir}/time.csv"
+    execution_df = pd.DataFrame([execution_tuple])
+    execution_df.to_csv(execution_file, mode="a", header=False)
+
 
 if __name__ == "__main__":
     main()
diff --git a/dockerize-workflow/main.py b/dockerize-workflow/main.py
index 1e231c57..9b9aab5a 100644
--- a/dockerize-workflow/main.py
+++ b/dockerize-workflow/main.py
@@ -1,9 +1,10 @@
 import os
-from sys import stdout
-from typing import Any, cast
+from typing import Any, TypeVar, cast
+from dask.distributed import Client
+from dask.delayed import delayed
 import docker.models
 import docker.models.containers
-from prefect import flow, task
+from prefect import Flow, flow, task
 from prefect_dask.task_runners import DaskTaskRunner
 import docker
 from docker.models.images import Image
@@ -24,8 +25,11 @@ def docker_task(
         submission_args = []
 
     # Define volumes to mount
-    results_dir = os.path.join(os.path.dirname(__file__), "results")
-    data_dir = os.path.join(os.path.dirname(__file__), "data")
+    # results_dir = os.path.join(os.path.dirname(__file__), "results")
+    # data_dir = os.path.join(os.path.dirname(__file__), "data")
+
+    data_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/dockerize-workflow/data"
+    results_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/dockerize-workflow/results"
 
     volumes = {
         results_dir: {"bind": "/app/results/", "mode": "rw"},
@@ -70,7 +74,6 @@ def docker_task(
         container.wait()
 
     except Exception as e:
-        print(f"Error: {e}")
         raise e
     finally:
         if container:
@@ -84,7 +87,31 @@ def docker_task(
 
 
 def initialize_docker_client():
-    client = docker.from_env()
+    base_url = os.environ.get("DOCKER_HOST")
+
+    if not base_url:
+
+        raise FileNotFoundError("DOCKER_HOST environment variable not set")
+
+    cert_path = os.environ.get("DOCKER_CERT_PATH")
+    if not cert_path:
+        raise FileNotFoundError(
+            "DOCKER_CERT_PATH environment variable not set"
+        )
+
+    ca_cert = cert_path + "/ca.pem"
+    client_cert = cert_path + "/cert.pem"
+    client_key = cert_path + "/key.pem"
+
+    client = docker.DockerClient(
+        base_url=base_url,
+        version="auto",
+        tls={
+            "ca_cert": ca_cert,
+            "client_cert": (client_cert, client_key),
+            "verify": True,
+        },
+    )
     return client
 
 
@@ -112,7 +139,6 @@ def create_docker_image(
         except ImageNotFound:
             print("Docker image not found")
         except Exception as e:
-            print(f"Error: {e}")
             raise e
 
     if image:
@@ -156,21 +182,34 @@ def main_task(image: str, memory_limit: str, data_filepath: str):
             submission_args,
         )
     except Exception as e:
-        print(f"Error: {e}")
+        raise e
     finally:
         if client:
             client.close()
 
 
+def check_if_docker_daemon_is_running():
+
+    client = initialize_docker_client()
+    try:
+        client.ping()
+    except Exception as e:
+        raise e
+    finally:
+        client.close()
+
+
 def main_flow(memory_limit: str):
     tag: str = "submission:latest"
 
+    check_if_docker_daemon_is_running()
+
     client = None
     try:
         client = initialize_docker_client()
         image = create_docker_image(tag, client, overwrite=True)
     except Exception as e:
-        print(f"Error: {e}")
+        raise e
     finally:
         if client:
             client.close()
@@ -181,7 +220,7 @@ def main_flow(memory_limit: str):
     if not data_files:
         raise FileNotFoundError("No data files found")
 
-    files = data_files
+    files = data_files[:5]
 
     for filepath in files:
         main_task.submit(tag, memory_limit, filepath)
@@ -194,7 +233,7 @@ def main():
     flow(
         task_runner=DaskTaskRunner(
             cluster_kwargs={
-                "n_workers": 3,
+                "n_workers": 2,
                 "threads_per_worker": 1,
                 "memory_limit": f"{memory_limit}GiB",
             }
@@ -203,5 +242,111 @@ def main():
     )(main_flow)(memory_limit)
 
 
+def sub_func(image: str, memory_limit: str, data_filepath: str):
+    client = None
+    try:
+        client = initialize_docker_client()
+        # image = create_docker_image(client, prefect_logger)
+
+        submission_file_name = "submission.submission_wrapper"
+        submission_function_name = "detect_time_shifts"
+        submission_args = [data_filepath]
+
+        docker_task(
+            client,
+            image,
+            memory_limit,
+            submission_file_name,
+            submission_function_name,
+            submission_args,
+        )
+    except Exception as e:
+        raise e
+    finally:
+        if client:
+            client.close()
+
+
+def main_func(memory_limit: str):
+    tag: str = "submission:latest"
+
+    check_if_docker_daemon_is_running()
+
+    client = None
+    try:
+        client = initialize_docker_client()
+        image = create_docker_image(tag, client, overwrite=True)
+    except Exception as e:
+        raise e
+    finally:
+        if client:
+            client.close()
+
+    data_files = os.listdir("data")
+    print(data_files)
+
+    if not data_files:
+        raise FileNotFoundError("No data files found")
+
+    files = data_files[:5]
+
+    for filepath in files:
+        sub_func.submit(tag, memory_limit, filepath)
+
+
+def dask_main():
+    results: list = []
+
+    total_workers = 2
+    total_threads = 1
+    memory_per_worker = 8
+
+    tag: str = "submission:latest"
+
+    check_if_docker_daemon_is_running()
+
+    client = None
+    try:
+        client = initialize_docker_client()
+        image = create_docker_image(tag, client, overwrite=True)
+    except Exception as e:
+        raise e
+    finally:
+        if client:
+            client.close()
+
+    data_files = os.listdir("data")
+    print(data_files)
+
+    if not data_files:
+        raise FileNotFoundError("No data files found")
+
+    files = data_files[:5]
+
+    with Client(
+        n_workers=total_workers,
+        threads_per_worker=total_threads,
+        memory_limit=f"{memory_per_worker}GiB",
+        # **kwargs,
+    ) as client:
+
+        # logger_if_able(f"client: {client}", logger, "INFO")
+
+        lazy_results = []
+        for args in files:
+            lazy_result = delayed(sub_func, pure=True)(
+                tag, memory_per_worker, args
+            )
+            lazy_results.append(lazy_result)
+
+        futures = client.compute(lazy_results)
+
+        results = client.gather(futures)  # type: ignore
+
+    return results
+
+
 if __name__ == "__main__":
-    main()
+    # main()
+    dask_main()
+    # check_if_docker_daemon_is_running()
diff --git a/dockerize-workflow/requirements.txt b/dockerize-workflow/requirements.txt
index b4345058..8de293d2 100644
--- a/dockerize-workflow/requirements.txt
+++ b/dockerize-workflow/requirements.txt
@@ -1,3 +1,5 @@
 prefect
+prefect-dask
 docker
-bokeh
\ No newline at end of file
+bokeh
+requests
\ No newline at end of file

From d839d36c6bb0564bccb2ec2f2ff74d5d87de944d Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <47313912+MitchellAV@users.noreply.github.com>
Date: Thu, 20 Jun 2024 18:26:47 -0700
Subject: [PATCH 06/20] Refactored docker flow and fixed typing

---
 .../environment/submission_wrapper.py         |  10 +-
 dockerize-workflow/main.py                    | 435 ++++++++++--------
 workers/src/pvinsight-validation-runner.py    |  44 +-
 workers/src/submission_worker.py              |  22 +-
 workers/src/utility.py                        | 349 +++++++++++++-
 5 files changed, 610 insertions(+), 250 deletions(-)

diff --git a/dockerize-workflow/environment/submission_wrapper.py b/dockerize-workflow/environment/submission_wrapper.py
index 50aca83e..866e33d4 100644
--- a/dockerize-workflow/environment/submission_wrapper.py
+++ b/dockerize-workflow/environment/submission_wrapper.py
@@ -5,12 +5,14 @@
 import numpy as np
 from time import perf_counter
 from functools import wraps
-from typing import Any, Union, Tuple, TypeVar, Callable, cast
+from typing import Any, ParamSpec, Union, Tuple, TypeVar, Callable, cast
 from logging import Logger
 import logging
 
 T = TypeVar("T")
 
+P = ParamSpec("P")
+
 
 def logger_if_able(
     message: str, logger: Logger | None = None, level: str = "INFO"
@@ -37,10 +39,8 @@ def logger_if_able(
 
 
 def timing(verbose: bool = True, logger: Union[Logger, None] = None):
-    @wraps(timing)
-    def decorator(func: Callable[..., T]):
-        @wraps(func)
-        def wrapper(*args, **kwargs) -> Tuple[T, float]:
+    def decorator(func: Callable[P, T]):
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> Tuple[T, float]:
             start_time = perf_counter()
             result = func(*args, **kwargs)
             end_time = perf_counter()
diff --git a/dockerize-workflow/main.py b/dockerize-workflow/main.py
index 9b9aab5a..301580e5 100644
--- a/dockerize-workflow/main.py
+++ b/dockerize-workflow/main.py
@@ -1,24 +1,161 @@
 import os
-from typing import Any, TypeVar, cast
+from typing import Any, Dict, Sequence, TypeVar, cast
 from dask.distributed import Client
 from dask.delayed import delayed
 import docker.models
 import docker.models.containers
-from prefect import Flow, flow, task
-from prefect_dask.task_runners import DaskTaskRunner
 import docker
 from docker.models.images import Image
 from docker.models.containers import Container
 from docker.errors import ImageNotFound
 
 
+# @task
+# def main_task(image: str, memory_limit: str, data_filepath: str):
+#     client = None
+#     try:
+#         client = initialize_docker_client()
+#         # image = create_docker_image(client, prefect_logger)
+
+#         submission_file_name = "submission.submission_wrapper"
+#         submission_function_name = "detect_time_shifts"
+#         submission_args = [data_filepath]
+
+#         docker_task(
+#             client,
+#             image,
+#             memory_limit,
+#             submission_file_name,
+#             submission_function_name,
+#             submission_args,
+#         )
+#     except Exception as e:
+#         raise e
+#     finally:
+#         if client:
+#             client.close()
+
+
+# def main_flow(memory_limit: str):
+#     tag: str = "submission:latest"
+
+#     check_if_docker_daemon_is_running()
+
+#     client = None
+#     try:
+#         client = initialize_docker_client()
+#         image = create_docker_image(tag, client, overwrite=True)
+#     except Exception as e:
+#         raise e
+#     finally:
+#         if client:
+#             client.close()
+
+#     data_files = os.listdir("data")
+#     print(data_files)
+
+#     if not data_files:
+#         raise FileNotFoundError("No data files found")
+
+#     files = data_files[:5]
+
+#     for filepath in files:
+#         main_task.submit(tag, memory_limit, filepath)
+
+
+# def main():
+
+#     memory_limit = "8"
+
+#     flow(
+#         task_runner=DaskTaskRunner(
+#             cluster_kwargs={
+#                 "n_workers": 2,
+#                 "threads_per_worker": 1,
+#                 "memory_limit": f"{memory_limit}GiB",
+#             }
+#         ),
+#         log_prints=True,
+#     )(main_flow)(memory_limit)
+
+
+# def main_func(memory_limit: str):
+#     tag: str = "submission:latest"
+
+#     check_if_docker_daemon_is_running()
+
+#     client = None
+#     try:
+#         client = initialize_docker_client()
+#         image = create_docker_image(tag, client, overwrite=True)
+#     except Exception as e:
+#         raise e
+#     finally:
+#         if client:
+#             client.close()
+
+#     data_files = os.listdir("data")
+#     print(data_files)
+
+#     if not data_files:
+#         raise FileNotFoundError("No data files found")
+
+#     files = data_files[:5]
+
+#     for filepath in files:
+#         sub_func.submit(tag, memory_limit, filepath)
+
+
+# Functions for main code
+
+
+class DockerContainerContextManager:
+
+    def __init__(
+        self,
+        client: docker.DockerClient,
+        image: Image | str,
+        command: str | list[str],
+        volumes: dict[str, dict[str, str]] | list[str],
+        mem_limit: str | None = None,
+    ) -> None:
+        self.client = client
+        self.container: Container | None = None
+        self.image = image
+        self.command = command
+        self.volumes = volumes
+        self.mem_limit = f"{mem_limit}g" if mem_limit else None
+
+    def __enter__(self):
+        container = self.client.containers.run(
+            image=self.image,
+            command=self.command,
+            volumes=self.volumes,
+            detach=True,
+            stdout=True,
+            stderr=True,
+            mem_limit=self.mem_limit,
+        )
+
+        self.container = cast(Container, container)
+        return self.container
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        if self.container:
+            if self.container.status == "running":
+                self.container.stop()
+            self.container.remove()
+
+
 def docker_task(
     client: docker.DockerClient,
     image: str,
     memory_limit: str,
     submission_file_name: str,
     submission_function_name: str,
-    submission_args: list[Any] | None = None,
+    submission_args: Sequence[Any],
+    data_dir: str,
+    results_dir: str,
 ):
 
     if submission_args is None:
@@ -28,43 +165,27 @@ def docker_task(
     # results_dir = os.path.join(os.path.dirname(__file__), "results")
     # data_dir = os.path.join(os.path.dirname(__file__), "data")
 
-    data_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/dockerize-workflow/data"
-    results_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/dockerize-workflow/results"
-
     volumes = {
         results_dir: {"bind": "/app/results/", "mode": "rw"},
         data_dir: {"bind": "/app/data/", "mode": "ro"},
     }
 
-    # Execute docker image in a container
-    container = None
-    try:
+    command: list[str] = [
+        "python",
+        "submission_wrapper.py",
+        submission_file_name,
+        submission_function_name,
+        *submission_args,
+    ]
+
+    with DockerContainerContextManager(
+        client, image, command, volumes, memory_limit
+    ) as container:
         print("Docker container starting...")
         print(f"Image: {image}")
         print(f"Submission file name: {submission_file_name}")
         print(f"Submission function name: {submission_function_name}")
         print(f"Submission args: {submission_args}")
-        container = cast(
-            Container,
-            client.containers.run(
-                image,
-                command=[
-                    "python",
-                    "submission_wrapper.py",
-                    submission_file_name,
-                    submission_function_name,
-                    *submission_args,
-                ],
-                volumes=volumes,
-                detach=True,
-                stdout=True,
-                stderr=True,
-                mem_limit=f"{memory_limit}g",
-            ),
-        )
-
-        print("Docker container started")
-        print(container.id)
 
         # Wait for container to finish
         for line in container.logs(stream=True):
@@ -73,47 +194,29 @@ def docker_task(
 
         container.wait()
 
-    except Exception as e:
-        raise e
-    finally:
-        if container:
-            if container.status == "running":
-                print("Docker container stopping...")
-                container.stop()
-                print("Docker container stopped")
-            print("Docker container removing...")
-            container.remove()
-            print("Docker container removed")
-
 
-def initialize_docker_client():
-    base_url = os.environ.get("DOCKER_HOST")
-
-    if not base_url:
-
-        raise FileNotFoundError("DOCKER_HOST environment variable not set")
+def submission_task(
+    image_tag: str,
+    memory_limit: str,
+    submission_file_name: str,
+    submission_function_name: str,
+    submission_args: Sequence[Any],
+    data_dir: str,
+    results_dir: str,
+):
 
-    cert_path = os.environ.get("DOCKER_CERT_PATH")
-    if not cert_path:
-        raise FileNotFoundError(
-            "DOCKER_CERT_PATH environment variable not set"
+    with DockerClientContextManager() as client:
+        docker_task(
+            client=client,
+            image=image_tag,
+            memory_limit=memory_limit,
+            submission_file_name=submission_file_name,
+            submission_function_name=submission_function_name,
+            submission_args=submission_args,
+            data_dir=data_dir,
+            results_dir=results_dir,
         )
 
-    ca_cert = cert_path + "/ca.pem"
-    client_cert = cert_path + "/cert.pem"
-    client_key = cert_path + "/key.pem"
-
-    client = docker.DockerClient(
-        base_url=base_url,
-        version="auto",
-        tls={
-            "ca_cert": ca_cert,
-            "client_cert": (client_cert, client_key),
-            "verify": True,
-        },
-    )
-    return client
-
 
 def create_docker_image(
     tag: str,
@@ -152,7 +255,6 @@ def create_docker_image(
         image, build_logs = client.images.build(
             path=file_path, tag=tag, dockerfile="Dockerfile"
         )
-
         for log in build_logs:
             if "stream" in log:
                 print(log["stream"].strip())
@@ -162,136 +264,76 @@ def create_docker_image(
         return image
 
 
-@task
-def main_task(image: str, memory_limit: str, data_filepath: str):
-    client = None
-    try:
-        client = initialize_docker_client()
-        # image = create_docker_image(client, prefect_logger)
-
-        submission_file_name = "submission.submission_wrapper"
-        submission_function_name = "detect_time_shifts"
-        submission_args = [data_filepath]
-
-        docker_task(
-            client,
-            image,
-            memory_limit,
-            submission_file_name,
-            submission_function_name,
-            submission_args,
-        )
-    except Exception as e:
-        raise e
-    finally:
-        if client:
-            client.close()
-
-
-def check_if_docker_daemon_is_running():
+class DockerClientContextManager:
+    def __init__(self):
+        self.client = None
 
-    client = initialize_docker_client()
-    try:
-        client.ping()
-    except Exception as e:
-        raise e
-    finally:
-        client.close()
+    def __enter__(self):
+        self.client = initialize_docker_client()
+        return self.client
 
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        if self.client:
+            self.client.close()
 
-def main_flow(memory_limit: str):
-    tag: str = "submission:latest"
 
-    check_if_docker_daemon_is_running()
+def initialize_docker_client():
+    base_url = os.environ.get("DOCKER_HOST")
 
-    client = None
-    try:
-        client = initialize_docker_client()
-        image = create_docker_image(tag, client, overwrite=True)
-    except Exception as e:
-        raise e
-    finally:
-        if client:
-            client.close()
+    if not base_url:
+        raise FileNotFoundError("DOCKER_HOST environment variable not set")
 
-    data_files = os.listdir("data")
-    print(data_files)
+    # cert_path = os.environ.get("DOCKER_CERT_PATH")
+    # if not cert_path:
+    #     raise FileNotFoundError(
+    #         "DOCKER_CERT_PATH environment variable not set"
+    #     )
 
-    if not data_files:
-        raise FileNotFoundError("No data files found")
+    # if not os.path.exists(cert_path):
+    #     raise FileNotFoundError(f"Cert path {cert_path} not found")
 
-    files = data_files[:5]
-
-    for filepath in files:
-        main_task.submit(tag, memory_limit, filepath)
+    # ca_cert = cert_path + "/ca.pem"
+    # client_cert = cert_path + "/ca-key.pem"
+    # client_key = cert_path + "/key.pem"
 
+    # if not os.path.exists(ca_cert):
+    #     raise FileNotFoundError(f"CA cert {ca_cert} not found")
+    # if not os.path.exists(client_cert):
+    #     raise FileNotFoundError(f"Client cert {client_cert} not found")
+    # if not os.path.exists(client_key):
+    #     raise FileNotFoundError(f"Client key {client_key} not found")
 
-def main():
+    client = docker.DockerClient(
+        base_url=base_url,
+        version="auto",
+        # tls={
+        #     "ca_cert": ca_cert,
+        #     "client_cert": (client_cert, client_key),
+        #     "verify": True,
+        # },
+    )
+    return client
 
-    memory_limit = "8"
 
-    flow(
-        task_runner=DaskTaskRunner(
-            cluster_kwargs={
-                "n_workers": 2,
-                "threads_per_worker": 1,
-                "memory_limit": f"{memory_limit}GiB",
-            }
-        ),
-        log_prints=True,
-    )(main_flow)(memory_limit)
+def is_docker_daemon_running():
+    is_running = False
 
+    with DockerClientContextManager() as client:
+        if client.ping():
+            is_running = True
 
-def sub_func(image: str, memory_limit: str, data_filepath: str):
-    client = None
-    try:
-        client = initialize_docker_client()
-        # image = create_docker_image(client, prefect_logger)
+    return is_running
 
-        submission_file_name = "submission.submission_wrapper"
-        submission_function_name = "detect_time_shifts"
-        submission_args = [data_filepath]
 
-        docker_task(
-            client,
-            image,
-            memory_limit,
-            submission_file_name,
-            submission_function_name,
-            submission_args,
-        )
-    except Exception as e:
-        raise e
-    finally:
-        if client:
-            client.close()
+def create_docker_image_for_submission():
+    tag = "submission:latest"
 
+    is_docker_daemon_running()
 
-def main_func(memory_limit: str):
-    tag: str = "submission:latest"
-
-    check_if_docker_daemon_is_running()
-
-    client = None
-    try:
-        client = initialize_docker_client()
+    with DockerClientContextManager() as client:
         image = create_docker_image(tag, client, overwrite=True)
-    except Exception as e:
-        raise e
-    finally:
-        if client:
-            client.close()
-
-    data_files = os.listdir("data")
-    print(data_files)
-
-    if not data_files:
-        raise FileNotFoundError("No data files found")
-
-    files = data_files[:5]
 
-    for filepath in files:
-        sub_func.submit(tag, memory_limit, filepath)
+    return image, tag
 
 
 def dask_main():
@@ -301,19 +343,7 @@ def dask_main():
     total_threads = 1
     memory_per_worker = 8
 
-    tag: str = "submission:latest"
-
-    check_if_docker_daemon_is_running()
-
-    client = None
-    try:
-        client = initialize_docker_client()
-        image = create_docker_image(tag, client, overwrite=True)
-    except Exception as e:
-        raise e
-    finally:
-        if client:
-            client.close()
+    image, tag = create_docker_image_for_submission()
 
     data_files = os.listdir("data")
     print(data_files)
@@ -323,6 +353,12 @@ def dask_main():
 
     files = data_files[:5]
 
+    submission_file_name = "submission.submission_wrapper"
+    submission_function_name = "detect_time_shifts"
+
+    data_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/dockerize-workflow/data"
+    results_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/dockerize-workflow/results"
+
     with Client(
         n_workers=total_workers,
         threads_per_worker=total_threads,
@@ -330,12 +366,17 @@ def dask_main():
         # **kwargs,
     ) as client:
 
-        # logger_if_able(f"client: {client}", logger, "INFO")
-
         lazy_results = []
-        for args in files:
-            lazy_result = delayed(sub_func, pure=True)(
-                tag, memory_per_worker, args
+        for file in files:
+            submission_args = (file,)
+            lazy_result = delayed(submission_task, pure=True)(
+                tag,
+                memory_per_worker,
+                submission_file_name,
+                submission_function_name,
+                submission_args,
+                data_dir,
+                results_dir,
             )
             lazy_results.append(lazy_result)
 
diff --git a/workers/src/pvinsight-validation-runner.py b/workers/src/pvinsight-validation-runner.py
index 3e88ef0f..8fab437b 100644
--- a/workers/src/pvinsight-validation-runner.py
+++ b/workers/src/pvinsight-validation-runner.py
@@ -16,7 +16,7 @@
       This section will be dependent on the type of analysis being run.
 """
 
-from typing import Any, Callable, cast
+from typing import Any, Callable, Tuple, TypeVar, cast, ParamSpec
 import pandas as pd
 import os
 from importlib import import_module
@@ -50,6 +50,8 @@
     is_local,
 )
 
+P = ParamSpec("P")
+
 FAILED = "failed"
 
 setup_logging()
@@ -102,7 +104,7 @@ def push_to_s3(
                 logger.info(f"update submission status to {FAILED}")
                 update_submission_status(submission_id, FAILED)
     else:
-        s3 = boto3.client("s3")
+        s3 = boto3.client("s3")  # type: ignore
         s3.upload_file(local_file_path, S3_BUCKET_NAME, s3_file_path)
 
 
@@ -286,9 +288,9 @@ def generate_scatter_plot(dataframe, x_axis, y_axis, title):
 
 @timing(verbose=True, logger=logger)
 def run_user_submission(
-    fn: Callable[..., pd.Series],
-    *args: Any,
-    **kwargs: Any,
+    fn: Callable[P, pd.Series],
+    *args,
+    **kwargs,
 ):
     return fn(*args, **kwargs)
 
@@ -614,7 +616,7 @@ def create_function_args_for_file(
     system_metadata_df: pd.DataFrame,
     data_dir: str,
     config_data: dict[str, Any],
-    submission_function: Callable[..., pd.Series],
+    submission_function: Callable[P, pd.Series],
     function_parameters: list[str],
     function_name: str,
     performance_metrics: list[str],
@@ -651,18 +653,28 @@ def create_function_args_for_file(
     return function_args
 
 
+T = TypeVar("T")
+
+
+def append_to_list(item: T, array: list[T] | None = None):
+    if array is None:
+        array = []
+    array.append(item)
+    return array
+
+
 def prepare_function_args_for_parallel_processing(
     file_metadata_df: pd.DataFrame,
     system_metadata_df: pd.DataFrame,
     data_dir: str,
     config_data: dict[str, Any],
-    submission_function: Callable[..., pd.Series],
+    submission_function: Callable[P, pd.Series],
     function_parameters: list[str],
     function_name: str,
     performance_metrics: list[str],
 ):
 
-    function_args_list: list[tuple] = []
+    function_args_list = None
 
     for file_number, (_, file_metadata_row) in enumerate(
         file_metadata_df.iterrows()
@@ -679,7 +691,13 @@ def prepare_function_args_for_parallel_processing(
             performance_metrics,
             file_number,
         )
-        function_args_list.append(function_args)
+        function_args_list = append_to_list(function_args, function_args_list)
+
+    if function_args_list is None:
+        # TODO: add error code
+        raise RunnerException(
+            *get_error_by_code(500, runner_error_codes, logger)
+        )
 
     return function_args_list
 
@@ -689,7 +707,7 @@ def run_submission(
     data_dir: str,
     associated_metadata: dict[str, Any],
     config_data: dict[str, Any],
-    submission_function: Callable[..., pd.Series],
+    submission_function: Callable[P, pd.Series],
     function_parameters: list[str],
     row: pd.Series,
 ):
@@ -709,7 +727,7 @@ def run_submission(
     )
 
     data_outputs, function_run_time = run_user_submission(
-        submission_function, time_series, **kwargs
+        submission_function, time_series, kwargs
     )
 
     return (
@@ -723,7 +741,7 @@ def loop_over_files_and_generate_results(
     system_metadata_df: pd.DataFrame,
     data_dir: str,
     config_data: dict[str, Any],
-    submission_function: Callable[..., pd.Series],
+    submission_function: Callable[P, pd.Series],
     function_parameters: list[str],
     function_name: str,
     performance_metrics: list[str],
@@ -891,7 +909,7 @@ def run_submission_and_generate_performance_metrics(
     data_dir: str,
     associated_system_metadata: dict[str, Any],
     config_data: dict[str, Any],
-    submission_function: Callable[..., pd.Series],
+    submission_function: Callable[P, pd.Series],
     function_parameters: list[str],
     file_metadata_row: pd.Series,
     function_name: str,
diff --git a/workers/src/submission_worker.py b/workers/src/submission_worker.py
index e84e8fda..e7255ca3 100644
--- a/workers/src/submission_worker.py
+++ b/workers/src/submission_worker.py
@@ -1,6 +1,8 @@
 from importlib import import_module
 from logging import exception
 from typing import Any, Callable, Optional
+from mypy_boto3_s3 import S3Client
+from mypy_boto3_sqs import SQSClient, SQSServiceResource
 import requests
 import sys
 import os
@@ -101,7 +103,7 @@ def push_to_s3(local_file_path, s3_file_path, analysis_id, submission_id):
                 )
             return {"status": "success"}
     else:
-        s3 = boto3.client("s3")
+        s3: S3Client = boto3.client("s3")  # type: ignore
         try:
             s3.upload_file(local_file_path, S3_BUCKET_NAME, s3_file_path)
         except botocore.exceptions.ClientError as e:
@@ -139,7 +141,7 @@ def list_s3_bucket(s3_dir: str):
             f"dir after removing pv-validation-hub-bucket/ returns {s3_dir}"
         )
 
-        s3 = boto3.client("s3")
+        s3: S3Client = boto3.client("s3")  # type: ignore
         paginator = s3.get_paginator("list_objects_v2")
         pages = paginator.paginate(Bucket=S3_BUCKET_NAME, Prefix=s3_dir)
         for page in pages:
@@ -474,8 +476,8 @@ def get_or_create_sqs_queue(queue_name):
     """
     # Use the Docker endpoint URL for local development
     if IS_LOCAL:
-        sqs = boto3.resource(
-            "sqs",
+        sqs: SQSServiceResource = boto3.resource(
+            "sqs",  # type: ignore
             endpoint_url="http://sqs:9324",
             region_name="elasticmq",
             aws_secret_access_key="x",
@@ -484,8 +486,8 @@ def get_or_create_sqs_queue(queue_name):
         )
     # Use the production AWS environment for other environments
     else:
-        sqs = boto3.resource(
-            "sqs",
+        sqs: SQSServiceResource = boto3.resource(
+            "sqs",  # type: ignore
             region_name=os.environ.get("AWS_DEFAULT_REGION", "us-west-2"),
         )
 
@@ -530,8 +532,8 @@ def get_analysis_pk():
 
 def get_aws_sqs_client():
     if IS_LOCAL:
-        sqs = boto3.client(
-            "sqs",
+        sqs: SQSClient = boto3.client(
+            "sqs",  # type: ignore
             endpoint_url="http://sqs:9324",
             region_name="elasticmq",
             aws_secret_access_key="x",
@@ -540,8 +542,8 @@ def get_aws_sqs_client():
         )
         logger.info(f"Using local SQS endpoint")
     else:
-        sqs = boto3.client(
-            "sqs",
+        sqs: SQSClient = boto3.client(
+            "sqs",  # type: ignore
             region_name=os.environ.get("AWS_DEFAULT_REGION", "us-west-2"),
         )
         logger.info(f"Using AWS SQS endpoint")
diff --git a/workers/src/utility.py b/workers/src/utility.py
index 974541ca..43bc7190 100644
--- a/workers/src/utility.py
+++ b/workers/src/utility.py
@@ -2,6 +2,10 @@
 from dask.delayed import delayed
 from dask.distributed import Client
 from dask import config
+import docker
+from docker.models.containers import Container
+from docker.errors import ImageNotFound
+from docker.models.images import Image
 
 from concurrent.futures import (
     ProcessPoolExecutor,
@@ -12,10 +16,20 @@
 from logging import Logger
 from time import perf_counter, sleep
 import os
-from typing import Any, Callable, Tuple, TypeVar, Union
+from typing import (
+    Any,
+    Callable,
+    ParamSpec,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+)
 import logging
 import boto3
 import botocore.exceptions
+from mypy_boto3_s3 import S3Client
 import psutil
 import requests
 import math
@@ -29,6 +43,7 @@
 
 
 T = TypeVar("T")
+P = ParamSpec("P")
 
 FILE_DIR = os.path.dirname(os.path.abspath(__file__))
 
@@ -97,10 +112,10 @@ def get_error_codes_dict(
 
 
 def timing(verbose: bool = True, logger: Union[Logger, None] = None):
-    @wraps(timing)
-    def decorator(func: Callable[..., T]):
-        @wraps(func)
-        def wrapper(*args, **kwargs) -> Tuple[T, float]:
+    # @wraps(timing)
+    def decorator(func: Callable[P, T]):
+        # @wraps(func)
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> Tuple[T, float]:
             start_time = perf_counter()
             result = func(*args, **kwargs)
             end_time = perf_counter()
@@ -117,25 +132,25 @@ def wrapper(*args, **kwargs) -> Tuple[T, float]:
     return decorator
 
 
-def multiprocess(
-    func: Callable[..., T], data: list, n_processes: int, logger: Logger | None
-) -> list[T]:
-    log = logger or print
-    with ProcessPoolExecutor(max_workers=n_processes) as executor:
-        futures = {executor.submit(func, d): d for d in data}
-        results: list[T] = []
-        for future in as_completed(futures):
-            try:
-                results.append(future.result())
-            except Exception as e:
-                log.error(f"Error: {e}")
-    return results
+# def multiprocess(
+#     func: Callable[P, T], data: list, n_processes: int, logger: Logger | None
+# ) -> list[T]:
+#     log = logger or print
+#     with ProcessPoolExecutor(max_workers=n_processes) as executor:
+#         futures = {executor.submit(func, d): d for d in data}
+#         results: list[T] = []
+#         for future in as_completed(futures):
+#             try:
+#                 results.append(future.result())
+#             except Exception as e:
+#                 log.error(f"Error: {e}")
+#     return results
 
 
 def timeout(seconds: int, logger: Union[Logger, None] = None):
-    def decorator(func: Callable[..., T]):
-        @wraps(func)
-        def wrapper(*args, **kwargs) -> T:
+    def decorator(func: Callable[P, T]):
+        # @wraps(func)
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
             with ThreadPoolExecutor(max_workers=1) as executor:
                 future = executor.submit(func, *args, **kwargs)
                 try:
@@ -282,9 +297,12 @@ def handle_exceeded_resources(
     return total_workers, total_threads
 
 
+U = TypeVar("U")
+
+
 def dask_multiprocess(
-    func: Callable[..., T],
-    func_arguments: list[tuple[Any, ...]],
+    func: Callable[P, T],
+    func_arguments: Sequence[Tuple[U, ...]],
     n_workers: int | None = None,
     threads_per_worker: int | None = None,
     memory_per_run: float | int | None = None,
@@ -417,7 +435,7 @@ def pull_from_s3(
         with open(target_file_path, "wb") as f:
             f.write(r.content)
     else:
-        s3 = boto3.client("s3")
+        s3: S3Client = boto3.client("s3")  # type: ignore
 
         # check s3_dir string to see if it contains "pv-validation-hub-bucket/"
         # if so, remove it
@@ -518,7 +536,7 @@ def with_credentials(logger: Logger | None = None):
     api_auth_token = None
     headers = {}
 
-    def decorator(func: Callable[..., T]):
+    def decorator(func: Callable[P, T]):
         # @wraps(func)
         def wrapper(*args, **kwargs):
             nonlocal api_auth_token
@@ -738,6 +756,287 @@ def generate_private_report_for_submission(
         raise e
 
 
+# Docker functions
+
+
+class DockerContainerContextManager:
+    """Start a detached container on enter; stop and remove it on exit."""
+    def __init__(
+        self,
+        client: docker.DockerClient,
+        image: Image | str,
+        command: str | list[str],
+        volumes: dict[str, dict[str, str]] | list[str],
+        mem_limit: str | None = None,
+    ) -> None:
+        self.client = client
+        self.container: Container | None = None
+        self.image = image
+        self.command = command
+        self.volumes = volumes
+        self.mem_limit = f"{mem_limit}g" if mem_limit else None  # e.g. "8g"
+
+    def __enter__(self):
+        container = self.client.containers.run(
+            image=self.image,
+            command=self.command,
+            volumes=self.volumes,
+            detach=True,
+            stdout=True,
+            stderr=True,
+            mem_limit=self.mem_limit,
+        )
+        self.container = cast(Container, container)
+        return self.container
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        if self.container:
+            self.container.reload()  # status is cached; refresh it first
+            if self.container.status == "running":
+                self.container.stop()
+            self.container.remove()
+
+
+def docker_task(
+    client: docker.DockerClient,
+    image: str,
+    memory_limit: str,
+    submission_file_name: str,
+    submission_function_name: str,
+    submission_args: Sequence[Any],
+    data_dir: str,
+    results_dir: str,
+):
+    """Run one submission in a container and echo its log output."""
+    if submission_args is None:
+        submission_args = []
+
+    # Define volumes to mount: results are writable, input data is
+    # read-only inside the container.
+    volumes = {
+        results_dir: {"bind": "/app/results/", "mode": "rw"},
+        data_dir: {"bind": "/app/data/", "mode": "ro"},
+    }
+
+    # Every element of the container command must be a string.
+    command: list[str] = [
+        "python",
+        "submission_wrapper.py",
+        submission_file_name,
+        submission_function_name,
+        *(str(arg) for arg in submission_args),
+    ]
+
+    with DockerContainerContextManager(
+        client, image, command, volumes, memory_limit
+    ) as container:
+        print("Docker container starting...")
+        print(f"Image: {image}")
+        print(f"Submission file name: {submission_file_name}")
+        print(f"Submission function name: {submission_function_name}")
+        print(f"Submission args: {submission_args}")
+
+        # Stream the log until the container exits; chunks arrive as bytes
+        for line in container.logs(stream=True):
+            if isinstance(line, bytes):
+                line = line.decode("utf-8", errors="replace")
+            print(line.strip())
+
+        container.wait()
+
+
+def submission_task(
+    image_tag: str,
+    memory_limit: str,
+    submission_file_name: str,
+    submission_function_name: str,
+    submission_args: Sequence[Any],
+    data_dir: str,
+    results_dir: str,
+):
+    # Open a Docker client just for this run and hand it to docker_task.
+    with DockerClientContextManager() as client:
+        docker_task(
+            client=client,
+            image=image_tag,
+            memory_limit=memory_limit,
+            submission_file_name=submission_file_name,
+            submission_function_name=submission_function_name,
+            submission_args=submission_args,
+            data_dir=data_dir,
+            results_dir=results_dir,
+        )
+
+
+def create_docker_image(
+    tag: str,
+    client: docker.DockerClient,
+    overwrite: bool = False,
+):
+    # Reuse the image tagged `tag` unless `overwrite`; otherwise build it.
+    file_path = os.path.join(os.path.dirname(__file__), "environment")
+
+    print(file_path)
+
+    # Check if Dockerfile exists in the build context directory
+    if not os.path.exists(os.path.join(file_path, "Dockerfile")):
+        raise FileNotFoundError("Dockerfile not found")
+
+    # Check if docker image already exists (skipped when overwrite is True)
+
+    image = None
+
+    if not overwrite:
+        try:
+            image = client.images.get(tag)
+        except ImageNotFound:
+            print("Docker image not found")
+        except Exception as e:
+            raise e  # any other lookup failure is re-raised unchanged
+
+    if image:
+        print("Docker image already exists")
+        print(image)
+        return image
+    else:
+        print("Docker image does not exist")
+
+        # Create docker image from Dockerfile, then echo the build log
+        image, build_logs = client.images.build(
+            path=file_path, tag=tag, dockerfile="Dockerfile"
+        )
+        for log in build_logs:
+            if "stream" in log:
+                print(log["stream"].strip())
+
+        print("Docker image created")
+
+        return image
+
+
+class DockerClientContextManager:  # with-block wrapper for a Docker client
+    def __init__(self):
+        self.client = None  # assigned in __enter__
+
+    def __enter__(self):
+        self.client = initialize_docker_client()
+        return self.client  # the with-statement target is the raw client
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        if self.client:
+            self.client.close()  # exceptions from the body still propagate
+
+
+def initialize_docker_client():
+    base_url = os.environ.get("DOCKER_HOST")
+    # The daemon address comes from DOCKER_HOST only; there is no default.
+    if not base_url:
+        raise FileNotFoundError("DOCKER_HOST environment variable not set")
+    # TLS setup is currently disabled; it is kept commented out below.
+    # cert_path = os.environ.get("DOCKER_CERT_PATH")
+    # if not cert_path:
+    #     raise FileNotFoundError(
+    #         "DOCKER_CERT_PATH environment variable not set"
+    #     )
+
+    # if not os.path.exists(cert_path):
+    #     raise FileNotFoundError(f"Cert path {cert_path} not found")
+
+    # ca_cert = cert_path + "/ca.pem"
+    # client_cert = cert_path + "/ca-key.pem"
+    # client_key = cert_path + "/key.pem"
+
+    # if not os.path.exists(ca_cert):
+    #     raise FileNotFoundError(f"CA cert {ca_cert} not found")
+    # if not os.path.exists(client_cert):
+    #     raise FileNotFoundError(f"Client cert {client_cert} not found")
+    # if not os.path.exists(client_key):
+    #     raise FileNotFoundError(f"Client key {client_key} not found")
+
+    client = docker.DockerClient(
+        base_url=base_url,
+        version="auto",
+        # tls={
+        #     "ca_cert": ca_cert,
+        #     "client_cert": (client_cert, client_key),
+        #     "verify": True,
+        # },
+    )
+    return client
+
+
+def is_docker_daemon_running():
+    # True when a freshly opened Docker client gets a truthy ping() reply.
+    # Exceptions raised while connecting or pinging are not caught here.
+    with DockerClientContextManager() as client:
+        daemon_replied = bool(client.ping())
+
+    # The client has been closed again by the time the result is returned.
+    return daemon_replied
+
+
+def create_docker_image_for_submission():
+    """Build the "submission:latest" image and return ``(image, tag)``."""
+    tag = "submission:latest"
+    if not is_docker_daemon_running():
+        raise RuntimeError("Docker daemon is not running")
+    with DockerClientContextManager() as client:
+        image = create_docker_image(tag, client, overwrite=True)
+
+    return image, tag
+
+
+def dask_main():
+    """Run up to five data files through submission containers via dask."""
+    results: list = []
+
+    total_workers = 2
+    total_threads = 1
+    memory_per_worker = 8
+
+    _image, tag = create_docker_image_for_submission()
+
+    # Docker bind mounts need absolute host paths. Resolve them from the
+    # working directory rather than from one developer's home directory.
+    data_dir = os.path.abspath("data")
+    results_dir = os.path.abspath("results")
+
+    data_files = os.listdir(data_dir)
+    print(data_files)
+    if not data_files:
+        raise FileNotFoundError("No data files found")
+
+    files = data_files[:5]
+
+    submission_file_name = "submission.submission_wrapper"
+    submission_function_name = "detect_time_shifts"
+
+    with Client(
+        n_workers=total_workers,
+        threads_per_worker=total_threads,
+        memory_limit=f"{memory_per_worker}GiB",
+    ) as client:
+        lazy_results = []
+        for file in files:
+            submission_args = (file,)
+            lazy_result = delayed(submission_task, pure=True)(
+                tag,
+                str(memory_per_worker),  # submission_task takes a str
+                submission_file_name,
+                submission_function_name,
+                submission_args,
+                data_dir,
+                results_dir,
+            )
+            lazy_results.append(lazy_result)
+
+        futures = client.compute(lazy_results)
+
+        results = client.gather(futures)  # type: ignore
+
+    return results
+
+
 if __name__ == "__main__":
 
     def expensive_function(x):

From c95a4030b9169632fa6ef333e99bce5d223746b4 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <47313912+MitchellAV@users.noreply.github.com>
Date: Tue, 25 Jun 2024 21:15:09 -0700
Subject: [PATCH 07/20] Create docker image from submission file

---
 docker-compose.yml                         |   3 +-
 pv-validation-hub-client                   |   2 +-
 requirements.txt                           |   9 +-
 workers/requirements.txt                   |   4 +-
 workers/src/docker/Dockerfile              |  35 +++++
 workers/src/docker/requirements.txt        |   2 +
 workers/src/docker/submission_wrapper.py   | 152 ++++++++++++++++++++
 workers/src/docker/unzip.py                | 160 +++++++++++++++++++++
 workers/src/pvinsight-validation-runner.py | 144 +++++++++++++------
 workers/src/submission_worker.py           |  27 +++-
 workers/src/utility.py                     | 103 ++++++++++---
 11 files changed, 566 insertions(+), 75 deletions(-)
 create mode 100644 workers/src/docker/Dockerfile
 create mode 100644 workers/src/docker/requirements.txt
 create mode 100644 workers/src/docker/submission_wrapper.py
 create mode 100644 workers/src/docker/unzip.py

diff --git a/docker-compose.yml b/docker-compose.yml
index f3e0c0c8..253a1ad1 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -123,9 +123,10 @@ services:
       - s3
       - sqs
     volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
       - ./workers/current_evaluation:/root/worker/current_evaluation
       - ./workers/logs:/root/worker/logs
-      # - ./workers/tmp:/tmp
+      - ./workers/tmp:/tmp
       - ./workers/requirements.txt:/root/worker/requirements.txt
       - ./workers/src:/root/worker/src
     restart: unless-stopped
diff --git a/pv-validation-hub-client b/pv-validation-hub-client
index acc0abfa..e0549ec2 160000
--- a/pv-validation-hub-client
+++ b/pv-validation-hub-client
@@ -1 +1 @@
-Subproject commit acc0abfacbd7acb48d6a3a1da73747020b6382b9
+Subproject commit e0549ec2ce95ca0bc0535c1ba98907f4636c8479
diff --git a/requirements.txt b/requirements.txt
index 3081f6c1..73d3fe8e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,3 @@
-Django==3.2.16
-mysqlclient
-boto3
-requests
-djangorestframework
-django-cors-headers
-psycopg2
 black==24.2.0
 pre-commit
-flake8=7.0.0
\ No newline at end of file
+flake8==7.0.0
\ No newline at end of file
diff --git a/workers/requirements.txt b/workers/requirements.txt
index cdeb4079..4a625cc5 100644
--- a/workers/requirements.txt
+++ b/workers/requirements.txt
@@ -1,5 +1,6 @@
 requests
 boto3
+boto3-stubs[s3,sqs]
 pandas
 numpy
 seaborn
@@ -7,4 +8,5 @@ matplotlib
 dask
 distributed
 bokeh
-marimo
\ No newline at end of file
+marimo
+docker
\ No newline at end of file
diff --git a/workers/src/docker/Dockerfile b/workers/src/docker/Dockerfile
new file mode 100644
index 00000000..2ffc153c
--- /dev/null
+++ b/workers/src/docker/Dockerfile
@@ -0,0 +1,35 @@
+# Use an official Python runtime as the base image
+FROM python:3.11-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+RUN apt-get update 
+
+COPY unzip.py .
+COPY requirements.txt .
+
+# Install the Python dependencies for the submission wrapper
+RUN pip install --no-cache-dir -r requirements.txt
+
+ARG zip_file
+
+# Copy the submission package into the container
+COPY $zip_file .
+
+# Unzip the submission package
+
+RUN python -m unzip $zip_file submission
+
+WORKDIR /app/submission
+
+# Install the submission's own Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+
+# Return to /app, the directory that holds the submission folder
+WORKDIR /app
+
+COPY submission_wrapper.py .
+# Command to keep the container running without doing anything
+# CMD tail -f /dev/null
\ No newline at end of file
diff --git a/workers/src/docker/requirements.txt b/workers/src/docker/requirements.txt
new file mode 100644
index 00000000..5da331cf
--- /dev/null
+++ b/workers/src/docker/requirements.txt
@@ -0,0 +1,2 @@
+numpy
+pandas
diff --git a/workers/src/docker/submission_wrapper.py b/workers/src/docker/submission_wrapper.py
new file mode 100644
index 00000000..c8217071
--- /dev/null
+++ b/workers/src/docker/submission_wrapper.py
@@ -0,0 +1,152 @@
+from importlib import import_module
+import inspect
+import sys
+import pandas as pd
+import numpy as np
+from time import perf_counter
+from functools import wraps
+from typing import Any, ParamSpec, Union, Tuple, TypeVar, Callable, cast
+from logging import Logger
+import logging
+
+T = TypeVar("T")
+
+P = ParamSpec("P")
+
+
+def logger_if_able(
+    message: object, logger: Logger | None = None, level: str = "INFO"
+):
+    # Without a logger the message is printed and `level` is not checked.
+    if logger is None:
+        print(message)
+        return
+
+    name_to_level = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+
+    # Level names are matched case-insensitively.
+    level_name = level.upper()
+    if level_name not in name_to_level:
+        raise Exception(f"Invalid log level: {level_name}")
+
+    logger.log(name_to_level[level_name], message)
+
+
+def timing(verbose: bool = True, logger: Union[Logger, None] = None):
+    # Decorator factory: the wrapped call returns (result, elapsed seconds).
+    def decorator(func: Callable[P, T]):
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> Tuple[T, float]:
+            began = perf_counter()
+            outcome = func(*args, **kwargs)
+            elapsed = perf_counter() - began
+            if verbose:
+                logger_if_able(
+                    f"{func.__name__} took {elapsed:.3f} seconds to run",
+                    logger,
+                )
+            return outcome, elapsed
+
+        return wrapper
+
+    return decorator
+
+
+def format_args_for_submission(data_dir: str, args: list[str]):
+    filename = args[0]
+    # args[0] names a CSV under data_dir; later args pass through unchanged.
+    file_path = f"{data_dir}/{filename}"
+
+    df = pd.read_csv(
+        file_path,
+        index_col=0,
+        parse_dates=True,
+    )
+
+    print(df.head(5))
+    # Put the data on a 60-minute frequency, then squeeze the frame.
+    series: pd.Series = df.asfreq("60min").squeeze()
+
+    submission_args = [series, *args[1:]]
+
+    return submission_args
+
+
+def import_submission_function(submission_file_name: str, function_name: str):
+    # Import the module by name; a missing module is reported and re-raised
+    try:
+        submission_module = import_module(submission_file_name)
+    except ModuleNotFoundError as e:
+        print(f"ModuleNotFoundError: {submission_file_name} not found")
+        raise e
+    # Look the function up on the module and record its parameter names.
+    try:
+        submission_function: Callable[[pd.Series, Any], np.ndarray] = getattr(
+            submission_module, function_name
+        )
+        function_parameters = list(
+            inspect.signature(submission_function).parameters.keys()
+        )
+    except AttributeError as e:
+        print(
+            f"AttributeError: {function_name} not found in submission module"
+        )
+        raise e
+
+    return submission_function, function_parameters
+
+
+def main():
+    """Run one submission function on one data file and save the results."""
+    args = sys.argv[1:]
+    if len(args) < 3:
+        print("Usage: submission_wrapper.py MODULE FUNCTION FILE [ARGS]")
+        sys.exit(1)
+
+    submission_file_name = args[0]
+    function_name = args[1]
+    data_file_name = args[2]
+
+    print("Getting submission function...")
+
+    submission_function, function_parameters = import_submission_function(
+        submission_file_name, function_name
+    )
+    print("Got submission function")
+
+    print(f"Submission file name: {submission_file_name}")
+    print(f"Function name: {function_name}")
+    print(f"Function: {submission_function}")
+    print(f"Function parameters: {function_parameters}")
+
+    data_dir = "/app/data/"
+    results_dir = "/app/results/"
+
+    submission_args = format_args_for_submission(data_dir, args[2:])
+
+    print(f"Submission args: {submission_args}")
+
+    results, execution_time = timing()(submission_function)(*submission_args)
+
+    print(f"Execution time: {execution_time}")
+
+    print(f"Results: {results}")
+
+    # save results to csv file
+    results_df = pd.DataFrame(results)
+    results_file = f"{results_dir}/{data_file_name}"
+    results_df.to_csv(results_file)
+
+    execution_tuple = (data_file_name, execution_time)
+    execution_file = f"{results_dir}/time.csv"
+    execution_df = pd.DataFrame([execution_tuple])
+    execution_df.to_csv(execution_file, mode="a", header=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/workers/src/docker/unzip.py b/workers/src/docker/unzip.py
new file mode 100644
index 00000000..b36bbe88
--- /dev/null
+++ b/workers/src/docker/unzip.py
@@ -0,0 +1,160 @@
+import sys
+import os
+import zipfile
+import tarfile
+import shutil
+from typing import Callable, cast
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def remove_unallowed_starting_characters(file_name: str) -> str | None:
+    # Return file_name unchanged, or None when any "/"-separated component
+    # of it begins with "_" or ".".
+    blocked_prefixes = ("_", ".")
+    components = file_name.split("/")
+    if any(part.startswith(blocked_prefixes) for part in components):
+        return None
+    return file_name
+
+
+def _extract_member(
+    ref: zipfile.ZipFile | tarfile.TarFile, member: str, extract_path: str
+):
+    """Extract one member; tar members go through the "data" filter."""
+    if isinstance(ref, zipfile.ZipFile):
+        ref.extract(member, path=extract_path)
+    elif isinstance(ref, tarfile.TarFile):
+        ref.extract(member, path=extract_path, filter="data")
+    else:
+        raise Exception(1, "File is not a zip or tar file.")
+
+
+def extract_files(  # noqa: C901
+    ref: zipfile.ZipFile | tarfile.TarFile,
+    extract_path: str,
+    zip_path: str,
+):
+    """Extract an open zip or tar archive into ``extract_path``.
+
+    Entries with a path component starting with "_" or "." are skipped.
+    When every remaining entry sits under the archive's first folder, that
+    folder is stripped so its contents land directly in ``extract_path``.
+
+    Folders are recognised by a trailing "/" in the entry name, so an
+    archive whose names carry no such entries is extracted unchanged.
+
+    Args:
+        ref: Open zip or tar archive.
+        extract_path: Directory to extract into.
+        zip_path: Path of the archive; used for logging only.
+
+    Raises:
+        Exception: If ``ref`` is neither a ZipFile nor a TarFile.
+    """
+    logger.info("Extracting files from: " + zip_path)
+
+    if isinstance(ref, zipfile.ZipFile):
+        file_names = ref.namelist()
+    elif isinstance(ref, tarfile.TarFile):
+        file_names = ref.getnames()
+    else:
+        raise Exception("File is not a zip or tar file.")
+
+    # recursively remove files and folders that start with certain characters
+    file_names = [
+        f for f in file_names if remove_unallowed_starting_characters(f)
+    ]
+    logger.info("File names:")
+    logger.info(file_names)
+    folders = [f for f in file_names if f.endswith("/")]
+    logger.info("Folders:")
+    logger.info(folders)
+
+    # Use the first folder entry as the candidate common root; with no
+    # folder entries at all there is nothing to strip.
+    root_folder_name = folders[0] if folders else None
+    do_all_files_have_same_root = root_folder_name is not None and all(
+        f.startswith(root_folder_name) for f in file_names
+    )
+    if folders:
+        logger.info(
+            "Do all files have the same root? "
+            + str(do_all_files_have_same_root)
+        )
+
+    # No single root folder: extract every entry exactly as stored.
+    if not do_all_files_have_same_root:
+        logger.info("Extracting all files...")
+        for file in file_names:
+            _extract_member(ref, file, extract_path)
+        return
+
+    # Single root folder: extract each file, then move it out of the root.
+    logger.info("Extracting files...")
+    for file in file_names:
+        relative_name = file.removeprefix(root_folder_name)
+        target = os.path.join(extract_path, relative_name)
+
+        if file.endswith("/"):
+            # exist_ok: the folder may already exist because a file inside
+            # it was listed (and moved) before the folder entry itself.
+            if file != root_folder_name:
+                os.makedirs(target, exist_ok=True)
+            continue
+
+        _extract_member(ref, file, extract_path)
+        # Archives need not list a folder before (or at all for) the files
+        # in it, so create the destination's parent before moving.
+        os.makedirs(os.path.dirname(target), exist_ok=True)
+        os.rename(os.path.join(extract_path, file), target)
+
+    # remove the root folder, which exists only if a file was extracted
+    root_path = os.path.join(extract_path, root_folder_name)
+    if os.path.isdir(root_path):
+        shutil.rmtree(root_path)
+
+
+def extract_zip(zip_path: str, extract_path: str):  # zip or tar archive
+    if not os.path.exists(extract_path):
+        os.makedirs(extract_path)
+    # Pick the reader with is_zipfile / is_tarfile; zip is tried first.
+    if zipfile.is_zipfile(zip_path):
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            extract_files(
+                zip_ref,
+                extract_path,
+                zip_path,
+            )
+    elif tarfile.is_tarfile(zip_path):
+        with tarfile.open(zip_path, "r") as tar_ref:
+            extract_files(
+                tar_ref,
+                extract_path,
+                zip_path,
+            )
+    else:
+        raise Exception(1, "File is not a zip or tar file.")
+
+
+def main():
+    """Extract the archive named by argv[1] into the directory argv[2]."""
+    args = sys.argv[1:]
+    if len(args) < 2:
+        # No logging handler is configured here, so report on stderr.
+        print("Usage: unzip.py ARCHIVE EXTRACT_DIR", file=sys.stderr)
+        sys.exit(1)
+
+    zip_file_path = args[0]
+    extract_path = args[1]
+
+    submission_zip_file_path = os.path.join(
+        os.path.dirname(__file__), f"{zip_file_path}"
+    )
+    logger.info(f"Submission zip file path: {submission_zip_file_path}")
+    extract_zip(submission_zip_file_path, extract_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/workers/src/pvinsight-validation-runner.py b/workers/src/pvinsight-validation-runner.py
index 8fab437b..a391cd9d 100644
--- a/workers/src/pvinsight-validation-runner.py
+++ b/workers/src/pvinsight-validation-runner.py
@@ -39,10 +39,12 @@
     RUNNER_ERROR_PREFIX,
     RunnerException,
     SubmissionException,
+    create_docker_image_for_submission,
     dask_multiprocess,
     generate_private_report_for_submission,
     get_error_by_code,
     get_error_codes_dict,
+    move_file_to_directory,
     pull_from_s3,
     request_to_API_w_credentials,
     timeout,
@@ -295,6 +297,13 @@ def run_user_submission(
     return fn(*args, **kwargs)
 
 
+def move_files_to_directory(files: list[str], src_dir: str, dest_dir: str):
+    # Move each named file from src_dir into dest_dir, in list order.
+    for file_name in files:
+        source_path = os.path.join(src_dir, file_name)
+        shutil.move(source_path, dest_dir)
+
+
 def run(  # noqa: C901
     s3_submission_zip_file_path: str,
     file_metadata_df: pd.DataFrame,
@@ -315,12 +324,18 @@ def run(  # noqa: C901
             if not current_evaluation_dir.endswith("/")
             else current_evaluation_dir + "data"
         )
+        docker_dir = (
+            current_evaluation_dir + "/docker"
+            if not current_evaluation_dir.endswith("/")
+            else current_evaluation_dir + "docker"
+        )
         sys.path.append(
             current_evaluation_dir
         )  # append current_evaluation_dir to sys.path
     else:
         results_dir = "./results"
         data_dir = "./data"
+        docker_dir = "./docker"
         current_evaluation_dir = os.getcwd()
 
     if tmp_dir is None:
@@ -332,7 +347,9 @@ def run(  # noqa: C901
     # Ensure results directory exists
     os.makedirs(data_dir, exist_ok=True)
 
-    # Load in the module that we're going to test on.
+    # # Load in the module that we're going to test on.
+    # target_module_path, new_dir, submission_file_name, module_name = install_module_dependencies(s3_submission_zip_file_path, update_submission_status, submission_id, tmp_dir)
+
     logger.info(f"module_to_import_s3_path: {s3_submission_zip_file_path}")
     target_module_compressed_file_path = pull_from_s3(
         IS_LOCAL, S3_BUCKET_NAME, s3_submission_zip_file_path, tmp_dir, logger
@@ -340,55 +357,36 @@ def run(  # noqa: C901
     logger.info(
         f"target_module_compressed_file_path: {target_module_compressed_file_path}"
     )
-    target_module_path = convert_compressed_file_path_to_directory(
-        target_module_compressed_file_path
-    )
-    logger.info(
-        f"decompressing file {target_module_compressed_file_path} to {target_module_path}"
-    )
 
-    extract_zip(target_module_compressed_file_path, target_module_path)
-    logger.info(
-        f"decompressed file {target_module_compressed_file_path} to {target_module_path}"
-    )
+    # Move the submission file to the docker directory
 
-    logger.info(f"target_module_path: {target_module_path}")
-    # get current directory, i.e. directory of runner.py file
-    new_dir = os.path.dirname(os.path.abspath(__file__))
-    logger.info(f"new_dir: {new_dir}")
+    submission_file_name = target_module_compressed_file_path.split("/")[-1]
 
-    submission_file_name = get_module_file_name(target_module_path)
-    logger.info(f"file_name: {submission_file_name}")
-    module_name = get_module_name(target_module_path)
-    logger.info(f"module_name: {module_name}")
+    # Move the submission file to the docker directory
+    move_file_to_directory(submission_file_name, tmp_dir, docker_dir)
 
-    # install submission dependency
-    try:
-        subprocess.check_call(
-            [
-                "python",
-                "-m",
-                "pip",
-                "install",
-                "-r",
-                os.path.join(target_module_path, "requirements.txt"),
-            ]
-        )
-        logger.info("submission dependencies installed successfully.")
-    except subprocess.CalledProcessError as e:
-        logger.error("error installing submission dependencies:", e)
-        logger.info(f"update submission status to {FAILED}")
-        update_submission_status(submission_id, FAILED)
-        error_code = 2
-        raise RunnerException(
-            *get_error_by_code(error_code, runner_error_codes, logger)
-        )
+    # raise RunnerException(*get_error_by_code(500, runner_error_codes, logger))
+
+    # Create docker image for the submission
+    image_tag = "submission:latest"
+
+    overwrite = True
+
+    logger.info(f"Creating docker image for submission...")
 
-    shutil.move(
-        os.path.join(target_module_path, submission_file_name),
-        os.path.join(new_dir, submission_file_name),
+    image, image_tag = create_docker_image_for_submission(
+        docker_dir, image_tag, submission_file_name, overwrite, logger
     )
 
+    logger.info(f"Created docker image for submission: {image_tag}")
+
+    raise RunnerException(*get_error_by_code(500, runner_error_codes, logger))
+
+    # shutil.move(
+    #     os.path.join(target_module_path, submission_file_name),
+    #     os.path.join(new_dir, submission_file_name),
+    # )
+
     # Generate list for us to store all of our results for the module
     results_list = list()
     # Load in data set that we're going to analyze.
@@ -611,6 +609,66 @@ def run(  # noqa: C901
     return public_metrics_dict
 
 
+def install_module_dependencies(
+    s3_submission_zip_file_path,
+    update_submission_status,
+    submission_id,
+    tmp_dir,
+):
+    """Fetch and unpack a submission, then pip-install its requirements."""
+    logger.info(f"module_to_import_s3_path: {s3_submission_zip_file_path}")
+    target_module_compressed_file_path = pull_from_s3(
+        IS_LOCAL, S3_BUCKET_NAME, s3_submission_zip_file_path, tmp_dir, logger
+    )
+    logger.info(
+        f"target_module_compressed_file_path: {target_module_compressed_file_path}"
+    )
+    target_module_path = convert_compressed_file_path_to_directory(
+        target_module_compressed_file_path
+    )
+    logger.info(
+        f"decompressing file {target_module_compressed_file_path} to {target_module_path}"
+    )
+    extract_zip(target_module_compressed_file_path, target_module_path)
+    logger.info(
+        f"decompressed file {target_module_compressed_file_path} to {target_module_path}"
+    )
+
+    logger.info(f"target_module_path: {target_module_path}")
+    # get current directory, i.e. directory of runner.py file
+    new_dir = os.path.dirname(os.path.abspath(__file__))
+    logger.info(f"new_dir: {new_dir}")
+
+    submission_file_name = get_module_file_name(target_module_path)
+    logger.info(f"file_name: {submission_file_name}")
+    module_name = get_module_name(target_module_path)
+    logger.info(f"module_name: {module_name}")
+
+    # install submission dependency
+    try:
+        subprocess.check_call(
+            [
+                "python",
+                "-m",
+                "pip",
+                "install",
+                "-r",
+                os.path.join(target_module_path, "requirements.txt"),
+            ]
+        )
+        logger.info("submission dependencies installed successfully.")
+    except subprocess.CalledProcessError as e:
+        logger.error("error installing submission dependencies: %s", e)
+        logger.info(f"update submission status to {FAILED}")
+        update_submission_status(submission_id, FAILED)
+        error_code = 2
+        raise RunnerException(
+            *get_error_by_code(error_code, runner_error_codes, logger)
+        ) from e
+
+    return target_module_path, new_dir, submission_file_name, module_name
+
+
 def create_function_args_for_file(
     file_metadata_row: pd.Series,
     system_metadata_df: pd.DataFrame,
diff --git a/workers/src/submission_worker.py b/workers/src/submission_worker.py
index e7255ca3..2c1d51e0 100644
--- a/workers/src/submission_worker.py
+++ b/workers/src/submission_worker.py
@@ -24,6 +24,7 @@
     RunnerException,
     SubmissionException,
     WorkerException,
+    copy_file_to_directory,
     get_error_by_code,
     get_error_codes_dict,
     pull_from_s3,
@@ -146,9 +147,10 @@ def list_s3_bucket(s3_dir: str):
         pages = paginator.paginate(Bucket=S3_BUCKET_NAME, Prefix=s3_dir)
         for page in pages:
             if page["KeyCount"] > 0:
-                for entry in page["Contents"]:
-                    if "Key" in entry:
-                        all_files.append(entry["Key"])
+                if "Contents" in page:
+                    for entry in page["Contents"]:
+                        if "Key" in entry:
+                            all_files.append(entry["Key"])
 
         # remove the first entry if it is the same as s3_dir
         if len(all_files) > 0 and all_files[0] == s3_dir:
@@ -175,6 +177,18 @@ def update_submission_result(submission_id: int, result_json: dict[str, Any]):
         )
 
 
+def prepare_docker_files_for_submission(src_dir: str, docker_dir: str):
+    """Copy the fixed set of Docker support files into docker_dir."""
+    required_files = (
+        "Dockerfile",
+        "submission_wrapper.py",
+        "requirements.txt",
+        "unzip.py",
+    )
+    for file_name in required_files:
+        copy_file_to_directory(file_name, src_dir, docker_dir)
+
+
 def extract_analysis_data(  # noqa: C901
     analysis_id: int, current_evaluation_dir: str
 ) -> pd.DataFrame:
@@ -188,9 +202,9 @@ def extract_analysis_data(  # noqa: C901
         raise FileNotFoundError(
             3, f"No files found in s3 bucket for analysis {analysis_id}"
         )
+    file_names = [file.split("/")[-1] for file in files]
 
     required_files = ["config.json", "file_test_link.csv", "template.py"]
-    file_names = [file.split("/")[-1] for file in files]
 
     for required_file in required_files:
         if required_file not in file_names:
@@ -214,9 +228,11 @@ def extract_analysis_data(  # noqa: C901
     data_dir = os.path.join(current_evaluation_dir, "data")
     file_data_dir = os.path.join(data_dir, "file_data")
     validation_data_dir = os.path.join(data_dir, "validation_data")
+    docker_dir = os.path.join(current_evaluation_dir, "docker")
     os.makedirs(data_dir, exist_ok=True)
     os.makedirs(file_data_dir, exist_ok=True)
     os.makedirs(validation_data_dir, exist_ok=True)
+    os.makedirs(docker_dir, exist_ok=True)
 
     # File category link: This file represents the file_category_link table,
     # which links specific files in the file_metadata table.
@@ -358,6 +374,9 @@ def load_analysis(
         os.path.join(current_evaluation_dir, "errorcodes.json"),
     )
 
+    docker_dir = os.path.join(current_evaluation_dir, "docker")
+
+    prepare_docker_files_for_submission("/root/worker/src/docker", docker_dir)
     # import analysis runner as a module
     sys.path.insert(0, current_evaluation_dir)
     runner_module_name = "pvinsight-validation-runner"
diff --git a/workers/src/utility.py b/workers/src/utility.py
index 43bc7190..e6a978ea 100644
--- a/workers/src/utility.py
+++ b/workers/src/utility.py
@@ -1,4 +1,5 @@
 import json
+import shutil
 from dask.delayed import delayed
 from dask.distributed import Client
 from dask import config
@@ -49,7 +50,7 @@
 
 
 def logger_if_able(
-    message: str, logger: Logger | None = None, level: str = "INFO"
+    message: object, logger: Logger | None = None, level: str = "INFO"
 ):
     if logger is not None:
         levels_dict = {
@@ -476,6 +477,48 @@ def get_error_by_code(
     return error_code, error_codes_dict[error_code_str]
 
 
+def copy_file_to_directory(
+    file: str, src_dir: str, dest_dir: str, logger: Logger | None = None
+):
+    """Copy `file` from `src_dir` into `dest_dir`, which must already exist."""
+    src_file_path = os.path.join(src_dir, file)
+    if not os.path.exists(src_file_path):
+        raise FileNotFoundError(f"File {src_file_path} not found.")
+    if not os.path.exists(dest_dir):
+        raise FileNotFoundError(f"Directory {dest_dir} not found.")
+    try:
+        shutil.copy(src_file_path, dest_dir)
+    except Exception as e:
+        logger_if_able(
+            f"Error copying file {src_file_path} to {dest_dir}",
+            logger,
+            "ERROR",
+        )
+        logger_if_able(e, logger, "ERROR")
+        raise
+
+
+def move_file_to_directory(
+    file: str, src_dir: str, dest_dir: str, logger: Logger | None = None
+):
+    """Move `file` from `src_dir` into `dest_dir`, which must already exist."""
+    source_path = os.path.join(src_dir, file)
+
+    if not os.path.exists(source_path):
+        raise FileNotFoundError(f"File {source_path} not found.")
+
+    if not os.path.exists(dest_dir):
+        raise FileNotFoundError(f"Directory {dest_dir} not found.")
+
+    try:
+        shutil.move(source_path, dest_dir)
+    except Exception as error:
+        failure_message = f"Error moving file {source_path} to {dest_dir}"
+        logger_if_able(failure_message, logger, "ERROR")
+        logger_if_able(error, logger, "ERROR")
+        raise error
+
+
 # API Utility Functions
 
 IS_LOCAL = is_local()
@@ -869,17 +912,20 @@ def submission_task(
 
 
 def create_docker_image(
+    dir_path: str,
     tag: str,
+    submission_file_name: str,
     client: docker.DockerClient,
     overwrite: bool = False,
+    logger: Logger | None = None,
 ):
 
-    file_path = os.path.join(os.path.dirname(__file__), "environment")
+    # file_path = os.path.join(os.path.dirname(__file__), "environment")
 
-    print(file_path)
+    logger_if_able(dir_path, logger)
 
     # Check if Dockerfile exists
-    if not os.path.exists(os.path.join(file_path, "Dockerfile")):
+    if not os.path.exists(os.path.join(dir_path, "Dockerfile")):
         raise FileNotFoundError("Dockerfile not found")
 
     # Check if docker image already exists
@@ -890,26 +936,29 @@ def create_docker_image(
         try:
             image = client.images.get(tag)
         except ImageNotFound:
-            print("Docker image not found")
+            logger_if_able("Docker image not found", logger)
         except Exception as e:
             raise e
 
     if image:
-        print("Docker image already exists")
-        print(image)
+        logger_if_able("Docker image already exists", logger)
+        logger_if_able(image, logger)
         return image
     else:
-        print("Docker image does not exist")
+        logger_if_able("Docker image does not exist")
 
         # Create docker image from Dockerfile
         image, build_logs = client.images.build(
-            path=file_path, tag=tag, dockerfile="Dockerfile"
+            path=dir_path,
+            tag=tag,
+            dockerfile="Dockerfile",
+            buildargs={"zip_file": f"{submission_file_name}"},
         )
         for log in build_logs:
             if "stream" in log:
-                print(log["stream"].strip())
+                logger_if_able(log["stream"].strip())
 
-        print("Docker image created")
+        logger_if_able("Docker image created")
 
         return image
 
@@ -975,15 +1024,27 @@ def is_docker_daemon_running():
     return is_running
 
 
-def create_docker_image_for_submission():
-    tag = "submission:latest"
+def create_docker_image_for_submission(
+    dir_path: str,
+    image_tag: str,
+    submission_file_name: str,
+    overwrite: bool = True,
+    logger: Logger | None = None,
+):
 
     is_docker_daemon_running()
 
     with DockerClientContextManager() as client:
-        image = create_docker_image(tag, client, overwrite=True)
+        image = create_docker_image(
+            dir_path,
+            image_tag,
+            submission_file_name,
+            client,
+            overwrite=overwrite,
+            logger=logger,
+        )
 
-    return image, tag
+    return image, image_tag
 
 
 def dask_main():
@@ -993,7 +1054,15 @@ def dask_main():
     total_threads = 1
     memory_per_worker = 8
 
-    image, tag = create_docker_image_for_submission()
+    dir_path = os.path.join(os.path.dirname(__file__), "environment")
+
+    image_tag = "submission:latest"
+
+    submission_file_name = "submission.zip"
+
+    image, _ = create_docker_image_for_submission(
+        dir_path, image_tag, submission_file_name
+    )
 
     data_files = os.listdir("data")
     print(data_files)
@@ -1020,7 +1089,7 @@ def dask_main():
         for file in files:
             submission_args = (file,)
             lazy_result = delayed(submission_task, pure=True)(
-                tag,
+                image_tag,
                 memory_per_worker,
                 submission_file_name,
                 submission_function_name,

From 372f5ab44e4a31f382c21702413810c98c0c8d46 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <47313912+MitchellAV@users.noreply.github.com>
Date: Thu, 27 Jun 2024 17:08:37 -0700
Subject: [PATCH 08/20] In Progress: implementing docker to runner

---
 workers/src/docker/submission_wrapper.py   |   9 +-
 workers/src/pvinsight-validation-runner.py | 208 +++++++++++----------
 workers/src/submission_worker.py           |   2 +
 workers/src/utility.py                     |   5 +-
 4 files changed, 121 insertions(+), 103 deletions(-)

diff --git a/workers/src/docker/submission_wrapper.py b/workers/src/docker/submission_wrapper.py
index c8217071..d1ab4f66 100644
--- a/workers/src/docker/submission_wrapper.py
+++ b/workers/src/docker/submission_wrapper.py
@@ -60,7 +60,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> Tuple[T, float]:
 def format_args_for_submission(data_dir: str, args: list[str]):
     filename = args[0]
 
-    file_path = f"{data_dir}/{filename}"
+    file_path = f"{data_dir}/file_data/{filename}"
 
     df = pd.read_csv(
         file_path,
@@ -124,8 +124,8 @@ def main():
     print(f"Function: {submission_function}")
     print(f"Function parameters: {function_parameters}")
 
-    data_dir = "/app/data/"
-    results_dir = "/app/results/"
+    data_dir = "/app/data"
+    results_dir = "/app/results"
 
     submission_args = format_args_for_submission(data_dir, args[2:])
 
@@ -138,6 +138,9 @@ def main():
     print(f"Results: {results}")
 
     # save results to csv file
+
+    print(f"Saving results to {results_dir}/{data_file_name}")
+
     results_df = pd.DataFrame(results)
     results_file = f"{results_dir}/{data_file_name}"
     results_df.to_csv(results_file)
diff --git a/workers/src/pvinsight-validation-runner.py b/workers/src/pvinsight-validation-runner.py
index a391cd9d..85bc078a 100644
--- a/workers/src/pvinsight-validation-runner.py
+++ b/workers/src/pvinsight-validation-runner.py
@@ -16,7 +16,7 @@
       This section will be dependent on the type of analysis being run.
 """
 
-from typing import Any, Callable, Tuple, TypeVar, cast, ParamSpec
+from typing import Any, Callable, Sequence, Tuple, TypeVar, cast, ParamSpec
 import pandas as pd
 import os
 from importlib import import_module
@@ -47,6 +47,7 @@
     move_file_to_directory,
     pull_from_s3,
     request_to_API_w_credentials,
+    submission_task,
     timeout,
     timing,
     is_local,
@@ -329,6 +330,7 @@ def run(  # noqa: C901
             if not current_evaluation_dir.endswith("/")
             else current_evaluation_dir + "docker"
         )
+
         sys.path.append(
             current_evaluation_dir
         )  # append current_evaluation_dir to sys.path
@@ -380,8 +382,6 @@ def run(  # noqa: C901
 
     logger.info(f"Created docker image for submission: {image_tag}")
 
-    raise RunnerException(*get_error_by_code(500, runner_error_codes, logger))
-
     # shutil.move(
     #     os.path.join(target_module_path, submission_file_name),
     #     os.path.join(new_dir, submission_file_name),
@@ -426,6 +426,25 @@ def run(  # noqa: C901
             *get_error_by_code(error_code, runner_error_codes, logger)
         )
 
+    # Save system metadata to a CSV file
+    system_metadata_file_name = "system_metadata.csv"
+
+    system_metadata_df.to_csv(
+        os.path.join(
+            os.path.join(data_dir, "metadata"), system_metadata_file_name
+        )
+    )
+
+    file_metadata_file_name = "file_metadata.csv"
+
+    file_metadata_df.to_csv(
+        os.path.join(
+            os.path.join(data_dir, "metadata"), file_metadata_file_name
+        )
+    )
+
+    # exit()
+
     # Read in the configuration JSON for the particular run
     with open(os.path.join(current_evaluation_dir, "config.json"), "r") as f:
         if not f:
@@ -444,39 +463,55 @@ def run(  # noqa: C901
 
     # Get the name of the function we want to import associated with this
     # test
-    function_name: str = config_data["function_name"]
-    # Import designated module via importlib
-    module = import_module(module_name)
-    try:
-        submission_function: Callable = getattr(module, function_name)
-        function_parameters = list(
-            inspect.signature(submission_function).parameters
-        )
-    except AttributeError:
-        logger.error(
-            f"function {function_name} not found in module {module_name}"
-        )
-        logger.info(f"update submission status to {FAILED}")
-        update_submission_status(submission_id, FAILED)
-        error_code = 6
-        raise RunnerException(
-            *get_error_by_code(error_code, runner_error_codes, logger)
-        )
+    # # Import designated module via importlib
+    # module = import_module(module_name)
+    # try:
+    #     submission_function: Callable = getattr(module, function_name)
+    #     function_parameters = list(
+    #         inspect.signature(submission_function).parameters
+    #     )
+    # except AttributeError:
+    #     logger.error(
+    #         f"function {function_name} not found in module {module_name}"
+    #     )
+    #     logger.info(f"update submission status to {FAILED}")
+    #     update_submission_status(submission_id, FAILED)
+    #     error_code = 6
+    #     raise RunnerException(
+    #         *get_error_by_code(error_code, runner_error_codes, logger)
+    #     )
 
     total_number_of_files = len(file_metadata_df)
     logger.info(f"total_number_of_files: {total_number_of_files}")
 
+    memory_limit: str = "8"
+    submission_module_name: str = "submission.submission_wrapper"
+    submission_function_name: str = config_data["function_name"]
+    data_dir: str = os.path.abspath(data_dir)
+    results_dir: str = os.path.abspath(results_dir)
+
+    volume_data_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/workers/current_evaluation/data"
+    volume_results_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/workers/current_evaluation/results"
+
+    func_arguments_list = prepare_function_args_for_parallel_processing(
+        image_tag=image_tag,
+        memory_limit=memory_limit,
+        submission_file_name=submission_module_name,
+        submission_function_name=submission_function_name,
+        data_dir=data_dir,
+        results_dir=results_dir,
+        volume_data_dir=volume_data_dir,
+        volume_results_dir=volume_results_dir,
+    )
+
     # Loop through each file and generate predictions
 
+    # print(func_arguments_list)
+
+    # raise Exception("Finished Successfully")
+
     results_list, number_of_errors = loop_over_files_and_generate_results(
-        file_metadata_df,
-        system_metadata_df,
-        data_dir,
-        config_data,
-        submission_function,
-        function_parameters,
-        function_name,
-        performance_metrics,
+        func_arguments_list
     )
 
     # Convert the results to a pandas dataframe and perform all of the
@@ -670,43 +705,12 @@ def install_module_dependencies(
 
 
 def create_function_args_for_file(
-    file_metadata_row: pd.Series,
-    system_metadata_df: pd.DataFrame,
-    data_dir: str,
-    config_data: dict[str, Any],
-    submission_function: Callable[P, pd.Series],
-    function_parameters: list[str],
-    function_name: str,
-    performance_metrics: list[str],
-    file_number: int,
+    file_metadata_row: pd.Series, *args, **kwargs
 ):
 
-    file_name: str = file_metadata_row["file_name"]
-
-    # Get associated system ID
-    system_id = file_metadata_row["system_id"]
-
-    # Get all of the associated metadata for the particular file based
-    # on its system ID. This metadata will be passed in via kwargs for
-    # any necessary arguments
-    associated_system_metadata: dict[str, Any] = dict(
-        system_metadata_df[system_metadata_df["system_id"] == system_id].iloc[
-            0
-        ]
-    )
-
-    function_args = (
-        file_name,
-        data_dir,
-        associated_system_metadata,
-        config_data,
-        submission_function,
-        function_parameters,
-        file_metadata_row,
-        function_name,
-        performance_metrics,
-        file_number,
-    )
+    submission_file_name: str = cast(str, file_metadata_row["file_name"])
+    # Submission Args for the function
+    function_args = (submission_file_name,)
 
     return function_args
 
@@ -722,33 +726,45 @@ def append_to_list(item: T, array: list[T] | None = None):
 
 
 def prepare_function_args_for_parallel_processing(
-    file_metadata_df: pd.DataFrame,
-    system_metadata_df: pd.DataFrame,
+    image_tag: str,
+    memory_limit: str,
+    submission_file_name: str,
+    submission_function_name: str,
     data_dir: str,
-    config_data: dict[str, Any],
-    submission_function: Callable[P, pd.Series],
-    function_parameters: list[str],
-    function_name: str,
-    performance_metrics: list[str],
+    results_dir: str,
+    volume_data_dir: str,
+    volume_results_dir: str,
 ):
 
+    file_metadata_df = pd.read_csv(
+        os.path.join(data_dir, "metadata", "file_metadata.csv")
+    )
+
+    system_metadata_df = pd.read_csv(
+        os.path.join(data_dir, "metadata", "system_metadata.csv")
+    )
+
     function_args_list = None
 
     for file_number, (_, file_metadata_row) in enumerate(
         file_metadata_df.iterrows()
     ):
 
-        function_args = create_function_args_for_file(
+        submission_args = create_function_args_for_file(
             file_metadata_row,
-            system_metadata_df,
-            data_dir,
-            config_data,
-            submission_function,
-            function_parameters,
-            function_name,
-            performance_metrics,
-            file_number,
+            submission_function_name=submission_function_name,
+        )
+
+        function_args = (
+            image_tag,
+            memory_limit,
+            submission_file_name,
+            submission_function_name,
+            submission_args,
+            volume_data_dir,
+            volume_results_dir,
         )
+
         function_args_list = append_to_list(function_args, function_args_list)
 
     if function_args_list is None:
@@ -795,26 +811,19 @@ def run_submission(
 
 
 def loop_over_files_and_generate_results(
-    file_metadata_df: pd.DataFrame,
-    system_metadata_df: pd.DataFrame,
-    data_dir: str,
-    config_data: dict[str, Any],
-    submission_function: Callable[P, pd.Series],
-    function_parameters: list[str],
-    function_name: str,
-    performance_metrics: list[str],
+    func_arguments_list: list[Tuple],
 ) -> tuple[list[dict[str, Any]], int]:
 
-    func_arguments_list = prepare_function_args_for_parallel_processing(
-        file_metadata_df,
-        system_metadata_df,
-        data_dir,
-        config_data,
-        submission_function,
-        function_parameters,
-        function_name,
-        performance_metrics,
-    )
+    # func_arguments_list = prepare_function_args_for_parallel_processing(
+    #     file_metadata_df,
+    #     system_metadata_df,
+    #     data_dir,
+    #     config_data,
+    #     submission_function,
+    #     function_parameters,
+    #     function_name,
+    #     performance_metrics,
+    # )
 
     NUM_FILES_TO_TEST = 3
 
@@ -829,13 +838,14 @@ def loop_over_files_and_generate_results(
     # Test the first two files
     logger.info(f"Testing the first {NUM_FILES_TO_TEST} files...")
     test_results = dask_multiprocess(
-        run_submission_and_generate_performance_metrics,
+        submission_task,
         test_func_argument_list,
         n_workers=NUM_FILES_TO_TEST,
         threads_per_worker=1,
         # memory_limit="16GiB",
         logger=logger,
     )
+
     errors = [error for _, error in test_results]
     number_of_errors += sum(errors)
 
diff --git a/workers/src/submission_worker.py b/workers/src/submission_worker.py
index 2c1d51e0..fb0b34c3 100644
--- a/workers/src/submission_worker.py
+++ b/workers/src/submission_worker.py
@@ -228,10 +228,12 @@ def extract_analysis_data(  # noqa: C901
     data_dir = os.path.join(current_evaluation_dir, "data")
     file_data_dir = os.path.join(data_dir, "file_data")
     validation_data_dir = os.path.join(data_dir, "validation_data")
+    metadata_dir = os.path.join(data_dir, "metadata")
     docker_dir = os.path.join(current_evaluation_dir, "docker")
     os.makedirs(data_dir, exist_ok=True)
     os.makedirs(file_data_dir, exist_ok=True)
     os.makedirs(validation_data_dir, exist_ok=True)
+    os.makedirs(metadata_dir, exist_ok=True)
     os.makedirs(docker_dir, exist_ok=True)
 
     # File category link: This file represents the file_category_link table,
diff --git a/workers/src/utility.py b/workers/src/utility.py
index e6a978ea..fb0843f5 100644
--- a/workers/src/utility.py
+++ b/workers/src/utility.py
@@ -311,7 +311,7 @@ def dask_multiprocess(
     **kwargs,
 ) -> list[T]:
 
-    MEMORY_PER_RUN = 7.0  # in GB
+    MEMORY_PER_RUN = 8.0  # in GB
 
     memory_per_run = memory_per_run or MEMORY_PER_RUN
 
@@ -355,6 +355,9 @@ def dask_multiprocess(
 
         lazy_results = []
         for args in func_arguments:
+
+            logger_if_able(f"args: {args}", logger, "INFO")
+
             lazy_result = delayed(func, pure=True)(*args)
             lazy_results.append(lazy_result)
 

From ffd477e57bd9e3750ce2cf5175bcc287f42fd412 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <47313912+MitchellAV@users.noreply.github.com>
Date: Wed, 3 Jul 2024 16:03:53 -0700
Subject: [PATCH 09/20] Misc changes for development

---
 NewTaskRequirements.md                    | 117 ++++++++++++++++++++++
 docker-compose.yml                        |   7 +-
 dockerize-workflow/.gitignore             |   3 +
 dockerize-workflow/environment/Dockerfile |   4 +-
 dockerize-workflow/environment/unzip.py   |  22 ++--
 dockerize-workflow/requirements.txt       |   2 -
 pv-validation-hub-client                  |   2 +-
 workers/src/utility.py                    |   8 ++
 8 files changed, 147 insertions(+), 18 deletions(-)
 create mode 100644 NewTaskRequirements.md
 create mode 100644 dockerize-workflow/.gitignore

diff --git a/NewTaskRequirements.md b/NewTaskRequirements.md
new file mode 100644
index 00000000..ee6b3922
--- /dev/null
+++ b/NewTaskRequirements.md
@@ -0,0 +1,117 @@
+# Required files for creating a new PV Validation Hub Task
+
+## config.json
+
+Example JSON:
+
+```json
+{
+    "category_name": "Time Shift Analysis",
+    "function_name": "detect_time_shifts",
+    "comparison_type": "time_series",
+    "performance_metrics": [
+        "runtime",
+        "mean_absolute_error"
+    ],
+    "allowable_kwargs": [
+        "latitude",
+        "longitude",
+        "data_sampling_frequency"
+    ],
+    "ground_truth_compare": [
+        "time_series"
+    ],
+    "public_results_table": "time-shift-public-metrics.json",
+    "private_results_columns": [
+        "system_id",
+        "file_name",
+        "run_time",
+        "data_requirements",
+        "mean_absolute_error_time_series",
+        "data_sampling_frequency",
+        "issue"
+    ],
+    "plots": [
+        {
+            "type": "histogram",
+            "x_val": "mean_absolute_error_time_series",
+            "color_code": "issue",
+            "title": "Time Series MAE Distribution by Issue",
+            "save_file_path": "mean_absolute_error_time_series_dist.png"
+        },
+        {
+            "type": "histogram",
+            "x_val": "mean_absolute_error_time_series",
+            "color_code": "data_sampling_frequency",
+            "title": "Time Series MAE Distribution by Sampling Frequency",
+            "save_file_path": "mean_absolute_error_time_series_dist.png"
+        },
+        {
+            "type": "histogram",
+            "x_val": "run_time",
+            "title": "Run Time Distribution",
+            "save_file_path": "run_time_dist.png"
+        }
+    ]
+}
+```
+
+## system_metadata.csv
+
+Required columns:
+
+```csv
+system_id,name,azimuth,tilt,elevation,latitude,longitude,tracking,climate_type,dc_capacity
+```
+
+## file_metadata.csv
+
+Required columns:
+
+```csv
+file_id,system_id,file_name,timezone,data_sampling_frequency,issue
+```
+
+## template.py (Marimo template with cli args input)
+
+The Marimo Python file must read its input data via the `mo.cli_args()` method.
+
+Example:
+
+```python
+def create_df_from_cli_args():
+        args = mo.cli_args().to_dict()
+        data = args.get("results_df")
+        rows = []
+        for row in data:
+            rows.append(json.loads(row))
+
+        df = pd.DataFrame.from_records(rows)
+        return df
+```
+
+## csv data files
+
+File names must match the values in the `file_name` column of `file_metadata.csv`.
+
+## ground truth csv data files
+
+File names must match the names of the files in the data files folder.
+
+## Markdown files for Task
+
+### description.md
+
+The markdown file used for the description tab in an analysis.
+
+### dataset.md
+
+The markdown file to describe the dataset in the data tab.
+
+### shortdesc.md
+
+The markdown file that is used on the card.
+
+### SubmissionInstructions.md
+
+The markdown file that is used on the Submission Instructions tab in the analysis.
diff --git a/docker-compose.yml b/docker-compose.yml
index 253a1ad1..ccb9a32e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -129,7 +129,12 @@ services:
       - ./workers/tmp:/tmp
       - ./workers/requirements.txt:/root/worker/requirements.txt
       - ./workers/src:/root/worker/src
-    restart: unless-stopped
+    restart: always
+    develop:
+      watch:
+        - action: sync
+          path: ./workers/src
+          target: /root/worker/src
 
 volumes:
   valhub:
diff --git a/dockerize-workflow/.gitignore b/dockerize-workflow/.gitignore
new file mode 100644
index 00000000..acec3807
--- /dev/null
+++ b/dockerize-workflow/.gitignore
@@ -0,0 +1,3 @@
+results/*
+data/*
+certs/*
diff --git a/dockerize-workflow/environment/Dockerfile b/dockerize-workflow/environment/Dockerfile
index ccddc4ba..4d4ec322 100644
--- a/dockerize-workflow/environment/Dockerfile
+++ b/dockerize-workflow/environment/Dockerfile
@@ -13,11 +13,11 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Copy the submission package into the container
-COPY submission.zip .
+COPY $zip_file .
 
 # Unzip the submission package
 
-RUN python -m unzip submission.zip submission
+RUN python -m unzip $zip_file submission
 
 WORKDIR /app/submission
 
diff --git a/dockerize-workflow/environment/unzip.py b/dockerize-workflow/environment/unzip.py
index eeec358c..b36bbe88 100644
--- a/dockerize-workflow/environment/unzip.py
+++ b/dockerize-workflow/environment/unzip.py
@@ -9,11 +9,20 @@
 logger = logging.getLogger(__name__)
 
 
+def remove_unallowed_starting_characters(file_name: str) -> str | None:
+    """Return file_name, or None if a path part starts with "_" or a dot."""
+    blocked_prefixes = ("_", ".")
+
+    path_parts = file_name.split("/")
+    if any(part.startswith(blocked_prefixes) for part in path_parts):
+        return None
+    return file_name
+
+
 def extract_files(  # noqa: C901
     ref: zipfile.ZipFile | tarfile.TarFile,
     extract_path: str,
     zip_path: str,
-    remove_unallowed_starting_characters: Callable[[str], str | None],
 ):
 
     logger.info("Extracting files from: " + zip_path)
@@ -111,22 +120,12 @@ def extract_zip(zip_path: str, extract_path: str):
     if not os.path.exists(extract_path):
         os.makedirs(extract_path)
 
-    def remove_unallowed_starting_characters(file_name: str) -> str | None:
-        unallowed_starting_characters = ("_", ".")
-
-        parts = file_name.split("/")
-        for part in parts:
-            if part.startswith(unallowed_starting_characters):
-                return None
-        return file_name
-
     if zipfile.is_zipfile(zip_path):
         with zipfile.ZipFile(zip_path, "r") as zip_ref:
             extract_files(
                 zip_ref,
                 extract_path,
                 zip_path,
-                remove_unallowed_starting_characters,
             )
     elif tarfile.is_tarfile(zip_path):
         with tarfile.open(zip_path, "r") as tar_ref:
@@ -134,7 +133,6 @@ def remove_unallowed_starting_characters(file_name: str) -> str | None:
                 tar_ref,
                 extract_path,
                 zip_path,
-                remove_unallowed_starting_characters,
             )
     else:
         raise Exception(1, "File is not a zip or tar file.")
diff --git a/dockerize-workflow/requirements.txt b/dockerize-workflow/requirements.txt
index 8de293d2..2d894416 100644
--- a/dockerize-workflow/requirements.txt
+++ b/dockerize-workflow/requirements.txt
@@ -1,5 +1,3 @@
-prefect
-prefect-dask
 docker
 bokeh
 requests
\ No newline at end of file
diff --git a/pv-validation-hub-client b/pv-validation-hub-client
index e0549ec2..77d2b941 160000
--- a/pv-validation-hub-client
+++ b/pv-validation-hub-client
@@ -1 +1 @@
-Subproject commit e0549ec2ce95ca0bc0535c1ba98907f4636c8479
+Subproject commit 77d2b941c98a8d23ad932167ca72d09370f28a63
diff --git a/workers/src/utility.py b/workers/src/utility.py
index fb0843f5..fb27ec25 100644
--- a/workers/src/utility.py
+++ b/workers/src/utility.py
@@ -817,6 +817,7 @@ def __init__(
     ) -> None:
         self.client = client
         self.container: Container | None = None
+        self.id: str | None = None
         self.image = image
         self.command = command
         self.volumes = volumes
@@ -834,6 +835,9 @@ def __enter__(self):
         )
 
         self.container = cast(Container, container)
+
+        self.id = self.container.id
+
         return self.container
 
     def __exit__(self, exc_type, exc_value, exc_traceback):
@@ -842,6 +846,10 @@ def __exit__(self, exc_type, exc_value, exc_traceback):
                 self.container.stop()
             self.container.remove()
 
+        self.client.containers.prune(
+            filters={"label": "status=exited", "label": "status=created"}
+        )
+
 
 def docker_task(
     client: docker.DockerClient,

From 7dc0a44e0b9247a7f94cccf5c404ecab1f42ea29 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Fri, 12 Jul 2024 14:13:55 -0700
Subject: [PATCH 10/20] Fixed issue with validation files not being uploaded
 correctly

---
 ec2/insert_analysis.py | 75 ++++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 28 deletions(-)

diff --git a/ec2/insert_analysis.py b/ec2/insert_analysis.py
index 9de97ba4..4053c875 100755
--- a/ec2/insert_analysis.py
+++ b/ec2/insert_analysis.py
@@ -49,7 +49,7 @@ def get_data_from_api_to_df(api_url: str, endpoint: str) -> pd.DataFrame:
 
 
 def post_data_to_api_to_df(
-    api_url: str, endpoint: str, data: dict
+    api_url: str, endpoint: str, data: dict[str, Any]
 ) -> pd.DataFrame:
 
     data = request_to_API_w_credentials(
@@ -392,17 +392,19 @@ def createSystemMetadata(self, sys_metadata_df: pd.DataFrame):
 
         for system in systems_json_list:
 
-            json_body = {
-                # "system_id": system["system_id"],
-                "name": system["name"],
-                "azimuth": system["azimuth"],
-                "tilt": system["tilt"],
-                "elevation": system["elevation"],
-                "latitude": system["latitude"],
-                "longitude": system["longitude"],
-                "tracking": system["tracking"],
-                "dc_capacity": system["dc_capacity"],
-            }
+            json_body: dict[str, Any] = {}
+
+            json_body["name"] = system["name"]
+            json_body["azimuth"] = system["azimuth"]
+            json_body["tilt"] = system["tilt"]
+            json_body["elevation"] = system["elevation"]
+            json_body["latitude"] = system["latitude"]
+            json_body["longitude"] = system["longitude"]
+            json_body["tracking"] = system["tracking"]
+            if "dc_capacity" in system:
+                print(system["dc_capacity"])
+                if system["dc_capacity"] is not None:
+                    json_body["dc_capacity"] = system["dc_capacity"]
 
             print(json_body)
 
@@ -455,11 +457,18 @@ def createFileMetadata(self, file_metadata_df: pd.DataFrame):
                 self.is_local,
             )
 
+    def uploadValidationData(self):
+
+        file_metadata_names = self.new_file_metadata_df["file_name"]
+
+        for file_name in file_metadata_names:
             # upload validation data to s3
             local_path = os.path.join(
-                self.validation_data_folder_path, metadata["file_name"]
+                self.validation_data_folder_path, file_name
+            )
+            upload_path = (
+                f"data_files/ground_truth/{str(self.analysis_id)}/{file_name}"
             )
-            upload_path = f'data_files/ground_truth/{str(self.analysis_id)}/{metadata["file_name"]}'
             upload_to_s3_bucket(
                 self.s3_url,
                 self.s3_bucket_name,
@@ -626,21 +635,30 @@ def buildSystemMetadata(self):
 
         df_new = df_new[~df_new["name"].isin(list(same_systems["name"]))]
 
-        # Return the system data ready for insertion
-        return df_new[
-            [
-                "system_id",
-                "name",
-                "azimuth",
-                "tilt",
-                "elevation",
-                "latitude",
-                "longitude",
-                "tracking",
-                "dc_capacity",
-            ]
+        system_metadata_columns = [
+            "system_id",
+            "name",
+            "azimuth",
+            "tilt",
+            "elevation",
+            "latitude",
+            "longitude",
+            "tracking",
+            "dc_capacity",
         ]
 
+        def addNAtoMissingColumns(df, columns):
+            new_df = df.copy()
+            for column in columns:
+                if column not in df.columns:
+                    new_df[column] = None
+            return new_df
+
+        # Return the system data ready for insertion
+        df_modified = addNAtoMissingColumns(df_new, system_metadata_columns)
+        print(df_modified.head(5))
+        return df_modified
+
     def buildFileMetadata(self):
         """
         Check for duplicates in the file metadata table. Build non duplicated
@@ -789,7 +807,7 @@ def prepareFileTestLinker(self):
         Prepare the file test linker and drop it into the new evaluation folder.
         """
 
-        file_test_link = self.db_file_metadata_df["file_id"]
+        file_test_link = self.new_file_metadata_df["file_id"]
 
         file_test_link.index.name = "test_id"
 
@@ -825,6 +843,7 @@ def insertData(self, force=False):
         new_file_metadata_df = self.buildFileMetadata()
         self.createFileMetadata(new_file_metadata_df)
         self.updateFileMetadataIDs()
+        self.uploadValidationData()
 
         self.prepareFileTestLinker()
         self.prepareConfig()

From 1d71f3f999cb6704db68418e525b0dc2356d6c0b Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Mon, 15 Jul 2024 17:24:51 -0700
Subject: [PATCH 11/20] Changes to fix az tilt submission within worker

---
 compressions/4/az-tilt-submission.zip      | Bin 0 -> 2275 bytes
 ec2/.gitignore                             |   1 +
 valhub/system_metadata/models.py           |   2 +-
 workers/requirements.txt                   |   2 +-
 workers/src/pvinsight-validation-runner.py |  25 ++++++++++++---------
 workers/src/submission_worker.py           |  15 +++++++++----
 6 files changed, 29 insertions(+), 16 deletions(-)
 create mode 100644 compressions/4/az-tilt-submission.zip
 create mode 100644 ec2/.gitignore

diff --git a/compressions/4/az-tilt-submission.zip b/compressions/4/az-tilt-submission.zip
new file mode 100644
index 0000000000000000000000000000000000000000..feeff0f9002f54acd56054a37625b248441cd23b
GIT binary patch
literal 2275
zcmZ`*c{CJ^7N3Q&j(udy79)GM64{24%txav$!8gv>`P<RNJ&|f!dNFh6A~(0jAJw*
zL<UVlhOuT#4L&jriJs0o=bd-n`QE+fp5M9m+<VWxfBk-F6qrW}008g<jD6NG*er3s
zmGA)o?a}~%<Uuv+>NN}=AFXSm8y*=D6Nc4`imi|djhsKe8JF;r4{`77gvaTxrW<SC
zQF^UYTN|YoJXF89WO1=3JC>c=In+641*u(@xo!G&CXN=K3^3@6y)xZ(G~M}ZgylZ{
zxk;eVWr8CvZ!@D1mo0QDiMX+HCJ9}sp{k$T{FoWwKlHrt50c8?9*-+EP&GJ!9}(-N
zEn%y5w9zDgd3SlEl8Eh$08ggFmP}pgI^~yj^FJCm`&!?13pRQkqucjzk(Xh=Fs|nK
z?T4Hh#Ssgsi@i?VRG;ffFZOgEH=?LEf-m4gTBIj49D1JDW9f4RL~U*~A?7+_Hk#h4
zG~*$a<{jNxg7ld%0fvjKogN|9JL-08`<zjoMZ8TVSgw^FIa_0ls8y&g%3KfStQ9_B
zQ5-MPKxLE12K|Ke*2e-l%VRb2+1d$LQ?^1y6K1?o3b~#{u98EkA{E{f@(x#@;Bq(N
z-jfBF4w#Gmtp;X}0KHY!Dj<ElMP)xa1|d;_*HQ}+Dl`@B-24Hj5aS$@$}bs5P<1Kn
zr=9v*EcS5Q&?%UJk4|A^*9&5=%}vrxfnL%{W%jl`W>d)aazOn0nAbbwS|f~EiL?-D
zvr;B=-u2m?C)n(}JZnLaF3h28`*!(`IU1?~6pj4)AEKlTjb)e;Pe-HN($j1OEXeSe
z8}USud!(20_A3w))hEfCvS*8zT6Zj_UO0LPy^-#f9+;?S8XqIZQQ&>{y%<@*7+zf6
z{iPrObM4dl{P>@RK5CJ-_JV(u{>hqovU8n-!M%F5&&5k{RIK=+wVLyHLSOB<hm^e0
zrmTRGds|_)wBUxLuR)DF`<n6Mk`M3ff`uTKy}N1VEu$9|PN58q8D07D;%KdQo+|Q(
zn+%_Kf~}Z~^+*VpHjE{!BqdFPL@Zy*a7kI<`?=w}^{InID|K4^LL66=4u#{}=5fgo
zckpy4ZbP!MY}WAbtUt3G*6Jshn1d*6F=~wF|K&gE9YCQ;M4cPpKJ0^CBUo-P&F_i_
z>oVS9UXA=5)-fYbBVnr;c9YA|s1QrS?Kz()kNP?VChZLqX+at+J|fb$zVLp0Y`JSU
zMJ4%zw*jZ96TJ)?g_rcsx?&n%dX*<E{;B>&F(>k^>DxFjN8gEuKQlRBgjXDO+~q>*
zvM!uB<@fA`H?QilEe`6??G{Z$P#lX7-EcByz)U>=#lk9yFAoo_q1VXN8<)aJr%Y>s
z>@skQ22UrtYGnIUZuRN*T{D;1Y@n6cdz#4Sg2VVekF2uev@Xm9u$p!6zTbkzm{$UO
zRL>J3Fl*-5w(18b0k}>#lmFh15HcBi6L&(*ss-u~6BuGgmIgE*rYiD(y&*E;+~<#q
z*yr{mJ0c9XwQ;LkLP#1y)I!Qw-Fd3kqt620@A6vKu9?>($h>YTr4+(96l0v>MB=ME
zkspjQ$hYU|9Nq&>j*Ni@*zdh|zC(j*aC#O^!MyW|-?Zr8?Y;_{L=t>?KXT_buU%}q
zsJm|C1D)ofY{^#Eb`6kX0Y8{IJ@G8?2UnefRclMQ`+zV!!^Rw=(AKqlSbXglh~qIm
zOdty>Elsvb?p(I-(ff)w%+D5<%WPMdxbExAe00c?Ol_tKCeYQ|u$3_9WA|oOn-?>)
z6h`O6E;ejg?CX`ytg2crvxASziub7{E9endb<R=iduvR#Cyz{L-9SRfam-)H!B_f%
zY3+Y)Sx*i`f8{H93Pxr{F+v{SHKIre#AqmPFk04VPY5%GLwS#;a-EqH$^M?w6Ojxk
z?{Pu<qJ;Jo{9<dgjB}a>uelT5?0FH|))vXqd^829lf;_0vSKSBOoO0DYZ*+}P7Df2
zQ4*4+^Ybu2W&xW}1ap}j`0%ZkN35~Gf$un1$)Ziq+}@tStXoeDFWB>Z0cGf)43{L`
zpYPj3yIa8LXin81`!grax}=LO2sJNb6B`5WPsmdFQ%QEru@pRP(E=_;&ua>eA~fF?
zxjqUvMI2+pjYqCv3^EC79%>zD$s#Lw-YJI$rs9y>s5fS+3NPsu#1EZ3qGV@n$t;(E
zo-9j5YZ$sym3zsTKEK~J?L5{Oa8}wZq%<YGTrOz3<MRS}??YV19#2e0p8OvTI_w`d
zm2MlUwFO~A(TBsI5O*+Bt{~Yw*wZ8T$VM2l4GUUs+2)2x9U9+J<#5}U@%~-cUXF&M
z2=eFLQky&CA(eaCzdS9ZGtx>D3P;bUWudBof-R_iw4p=KG1*AP>`x8+5f97-!zQ*9
z#a6EHQ$-uf*)nG8K_$KbevgQ$a?-6p?-T+8V;2GR_$(zc%Jv;@1M4u>JI`RHR(<eS
z@jmihR*N>RrLKsIQmNB_rGYe6H1B^Y*%lGILDi}#vd}skV-$$*_e_Tgdz&VyjF_wi
z-H9pRhAtt3QZ{NvXw)0g@mCB=5fN2x?|WVH(cFvnD+{YzhRy?U!8)5M1mCwx$Ym*I
zh)HU;epVK>aa^Vc{a%?n_(J|!YPcsW4+qVi1iL6z?q3DtGjIN^Bz_A#@mjHB>fCwG
z$GZ<FZmwj^n?8kJrVo_y>$lhXJec3uky?wpYFW~gb8S92)e8yvz7IhEht9(e=p5*@
zQtfg;<9-Jrb`Uu1m6%{0HXIv)kJiHz@cr6@x>ZdSlMy4sA)~oPEr$Wr86C)m6DQ;L
z<cn@vcQKtf5b&t*c1d;#&_PGTNTA!u7@(=?sM*J003GF5WWRU<b}D$)E0EQGM#3B&
zAQ}Y(Ndf-9o;bJx00b!ih6DQtb@AT|{^^6?WB*>jIFKp-`wtq$3;EpvI`CTu=*<5+
F`xlsnA};^{

literal 0
HcmV?d00001

diff --git a/ec2/.gitignore b/ec2/.gitignore
new file mode 100644
index 00000000..172a8848
--- /dev/null
+++ b/ec2/.gitignore
@@ -0,0 +1 @@
+az-tilt-submission/
\ No newline at end of file
diff --git a/valhub/system_metadata/models.py b/valhub/system_metadata/models.py
index 10e40b99..3163b920 100644
--- a/valhub/system_metadata/models.py
+++ b/valhub/system_metadata/models.py
@@ -10,7 +10,7 @@ class SystemMetadata(models.Model):
     latitude = models.FloatField()
     longitude = models.FloatField()
     tracking = models.BooleanField()
-    dc_capacity = models.FloatField()
+    dc_capacity = models.FloatField(default=0.0)
 
     def __str__(self):
         return self.name
diff --git a/workers/requirements.txt b/workers/requirements.txt
index 4a625cc5..e083c973 100644
--- a/workers/requirements.txt
+++ b/workers/requirements.txt
@@ -2,7 +2,7 @@ requests
 boto3
 boto3-stubs[s3,sqs]
 pandas
-numpy
+numpy==1.26.4
 seaborn
 matplotlib
 dask
diff --git a/workers/src/pvinsight-validation-runner.py b/workers/src/pvinsight-validation-runner.py
index a836cef9..f5591099 100644
--- a/workers/src/pvinsight-validation-runner.py
+++ b/workers/src/pvinsight-validation-runner.py
@@ -647,6 +647,7 @@ def run(  # noqa: C901
         f"{total_number_of_files - number_of_errors} out of {total_number_of_files} files processed successfully"
     )
 
+    # public_metrics_dict["success_rate"] = success_rate
     return public_metrics_dict
 
 
@@ -869,6 +870,9 @@ def generate_performance_metrics_for_submission(
     if config_data["comparison_type"] == "scalar":
         for val in config_data["ground_truth_compare"]:
             ground_truth_dict[val] = associated_metadata[val]
+            logger.debug(
+                f'ground_truth_dict["{val}"]: {ground_truth_dict[val]}'
+            )
     if config_data["comparison_type"] == "time_series":
         ground_truth_series: pd.Series = pd.read_csv(
             os.path.join(data_dir + "/validation_data/", file_name),
@@ -877,18 +881,18 @@ def generate_performance_metrics_for_submission(
         ).squeeze()
         ground_truth_dict["time_series"] = ground_truth_series
 
-    ground_truth_file_length = len(ground_truth_series)
+        ground_truth_file_length = len(ground_truth_series)
 
-    file_submission_result_length = len(data_outputs)
-    if file_submission_result_length != ground_truth_file_length:
-        logger.error(
-            f"{file_name} submission result length {file_submission_result_length} does not match ground truth file length {ground_truth_file_length}"
-        )
-        error_code = 8
+        file_submission_result_length = len(data_outputs)
+        if file_submission_result_length != ground_truth_file_length:
+            logger.error(
+                f"{file_name} submission result length {file_submission_result_length} does not match ground truth file length {ground_truth_file_length}"
+            )
+            error_code = 8
 
-        raise RunnerException(
-            *get_error_by_code(error_code, runner_error_codes, logger)
-        )
+            raise RunnerException(
+                *get_error_by_code(error_code, runner_error_codes, logger)
+            )
 
     # Convert the data outputs to a dictionary identical to the
     # ground truth dictionary
@@ -916,6 +920,7 @@ def generate_performance_metrics_for_submission(
             # and calculate the absolute error
             for val in config_data["ground_truth_compare"]:
                 error = np.abs(output_dictionary[val] - ground_truth_dict[val])
+                logger.debug(f"error for {val}: {error}")
                 results_dictionary[metric + "_" + val] = error
         elif metric == "mean_absolute_error":
             for val in config_data["ground_truth_compare"]:
diff --git a/workers/src/submission_worker.py b/workers/src/submission_worker.py
index b58392c4..f879e371 100644
--- a/workers/src/submission_worker.py
+++ b/workers/src/submission_worker.py
@@ -283,10 +283,17 @@ def extract_analysis_data(  # noqa: C901
         ground_truth.split("/")[-1] for ground_truth in ground_truths
     ]
 
-    if not all(file in ground_truth_files for file in files_for_analysis):
-        raise FileNotFoundError(
-            9, f"Ground truth data files not found for analysis {analysis_id}"
-        )
+    # if not all(file in ground_truth_files for file in files_for_analysis):
+    #     raise FileNotFoundError(
+    #         9, f"Ground truth data files not found for analysis {analysis_id}"
+    #     )
+
+    for analysis_file in files_for_analysis:
+        if analysis_file not in ground_truth_files:
+            raise FileNotFoundError(
+                9,
+                f"Ground truth data file {analysis_file} not found for analysis {analysis_id}",
+            )
 
     if not all(file in analytical_files for file in files_for_analysis):
         raise FileNotFoundError(

From 319fa0da54fe1a2268573f2d3a3bbe2ca3e0d72e Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Tue, 16 Jul 2024 18:02:52 -0700
Subject: [PATCH 12/20] Removed sdt version pin for test submission zip

---
 compressions/1/sdt-submission.zip              | Bin 2932 -> 489 bytes
 .../1/sdt-submission/submission_wrapper.py     |   1 -
 2 files changed, 1 deletion(-)

diff --git a/compressions/1/sdt-submission.zip b/compressions/1/sdt-submission.zip
index 9af57e322af4454721dbffff582bba0cd4d90fde..165217cc59228968ed32159cfa3f240d11527756 100644
GIT binary patch
literal 489
zcmWIWW@Zs#00Fb9uOcGS8FVCpY!DUz;-b{T(#)dN+|<01V!e`zlDyK~f=XVl;{2S%
zBHfh4l0@B-{QR6^UakPBF&qp$eIFtg{&jb|0o0zz$iN^9q>D?Fax;sIGxPJ}%Zm~V
z3Q~*o3MzdAHu@bk;JKsyRo-ZuP?dbc6<$R(wQna(-eoU2!nf_S%l~;(#4c7?{Vpmm
zJJWhp=4y-?`>PdS?kdfhTg1efkZ{H`)}}&#?MF5dA+P&yMGFcUdDB0O8CS(VPO|zk
zH~amI1uj?Aryp2!wc)b0u-S={{g3XR(ta{Y>F!^pAE6VUy)D00{`m7ki7@k6XCCqH
z*@eH^CEE-CB;Q{$Nq=KP@X^H1`{@T>I%gTzE~we`=nI#pw8%OUzqb4=?QqMq<qH;s
zE{y$NAIZ;>d3nY5nXb1Kl_tCAvTT_4hdscXkx7IBcbEdhgMpDj0Ym`-dI%$QV-HG@
d9*AytAPZ~226(fwfs`=;VIz<Z0>%OZ0|5CPrzHRY

literal 2932
zcmWIWW@Zs#00H&Mi4kB1ln`Q2U?@o~E{QKLP0GzIF3!x)Q!vpF4dG>Azn%OjRq6z5
znj8?9R&X;gvdCZD!o{*NXjW<f*gOsf4ixk50L_#Enin7M>*(ws9HFllk6|>5$%U9E
zCnty|rza#OeDL)N`@kR85zxRiL4sLbK$`KRmIB+y_XYy2GR{s-3w$Q@X*#@VT-G9}
zcy9ij`KzZ#1WcH`YWnOMbC^X^l7F1qHhcR1dzzI&tJim4{dbkWW3#v{14Do}JBJI$
z4YpXILqU#3ctZ;*SXd+&7^JZVi=Inxd~ivAQ7YIM4}jqV!k{oh3!k?$toaTZ2)Ly8
za(kQTY2Gbh$V+fN$+?4pHGIMMi;A}dH;DuVs5Vtj`Wn_%pDE+daO1;{_y!(@gT2Of
zlWvH$tTQ{O*dpN6QJcK3<W8BiQ-`e4eE!yTCB@S7*{#ZwjbdMTnQz}1zd(8J3m@@a
zm(HGbJ}qI_qb)7mW!M_A@OAilog?=P3ZDM_ZJ%iRpIfm`qWf{F%m+S}(|qg8UcYMl
z>h17he(UnQoQ2wV@9g5>*eL1`3Ysj2j@(VaU;zaH_MoW%%Bms9Fh=|W;~aY^krdY(
zr3IRQh&r5M2!5z=pn=(<t%b#Lrl>*e3rHMtcO0-V1v&-f9E3xa$`MIh94MoNH4ckX
z3rjPLQgc)DN{aPLDoVgnDPa64RR$Q1SfcW@_F3H(PqfaR_td%M>#5^;Mfc?S^P10?
zz=`Saf5~%_K&wF3VRwi<BLjmjp%BCCCQ<^<(7@mmQ>McFi}M;a9!LlYu_QK@-eG){
zVxoE`ppcP`ZRL|cF?)|RHO&dxJLkxwNtYg7nKI?bq>!f#2`g@yJG?n~;6Q%>I1d@#
zXYnX7Ht0}#!qxTZbkJv^&y#{BD|+RZHigvwezA;s(M^eI|1u*ttUc$NynYh@WdE5j
z-5>pF=97?=$hrBebla}0`E1}2x?LpawHz2apm0EhP*H771W*tR>VPupz!1VrN{AvU
zzPu>0pdhtKub>hfUzxzT$^v2#jh6LF19s;fHsHDQS5zuj;RCzK2QiVkR;A(*-F>Vc
zOT2<y|9%fmV*Yrc=+WbM1$$<2cU!L7nZtJ_@TJ(pH&cW?7?_j&UOnR5z3r%UfQLq`
zctIf}Z~A93<Eq%lNw?nHdb#W^dH3Ec;bPW-mvcGw^$hR&eXd`#TXy{f@2-;1-4DWD
zXKiQCtf<^@=+&~#r#H{OxMQM1vON!b#T2i7N1A*Ef2;nJ@%NDrn!P@MihtOgIK@oq
zR_#{ihqG^-3jKC_oq9&-3%%cG4l_I6GI?dO+2gd9QRNjjhQsf~LD}<{Sw`G7V32_V
z40}*2Gchn20fQ1=0Aq%!9#TP%D^y8Ladj-09PB0=F$W$nR8iqL#Pl+uguzH|v1Vdx
z7q@|dhwAg^uT;fEo-BFwR7Gt16tU&YmZ(fyqQ%bQJjIBOfs>z&t*&L)og+_VI_~f&
zOgwoYp-*Lc)Fh58^E0Ntef?<ij488(=SPPG#U%wznHU|DH!F8eMo3KL%($$av{}<Q
z3zd~8JV;9raGentFeQH4tm!kR&*7T%oKfhJ#;4Y$ovxjgsxvEB@;nRj$y5mbx9`kD
zri(c|-v77g+z5NVXk+-v_LIxcd{LHet_VK3o|}8`W3_$OYjRF+TRdZi!JjWzm%CrA
ze_Y<a^5n$Z`>(hA7WnDiFDd-1*n6YfJlwzX^Y^2-=RWmWCYyc!PlT|Q@5{oxO_SzN
zd+WAOIPhWjkGNdT#OFKa{i~Sv_L^~UnB>*e9agKh9y@g;yMDr#zLHg!=Y79ql_lz)
z?EcLDXNCTP_qouVqj$yJV1vOXGh<@|BOep9?R*|S{<_=JK(Wrw5&gW>tPPj~Kye-5
z&B!FejJslo<}il0jvy9hVnC^ku~zdC^I;V|a@`54=waY(V*ro|PjzHe`!Jt?Yy#EA
z$W;-jE{1_6jp9JRWBLSpor9$~N4E&MDgo7OFtDW22pGU{FJZBf_<9c6PEaL<Tx)<T
zDHvGNcpGRZrd4>W2LgTs*@s*PgX$s}SkjonOu&zLt;SMLAbS;5$Rn3oph6x7mNd4p
tVDl<jMLxQv$XNtb9Kyho#*3^3{7cw=te$3N1K9yeyzRgcz0MBe0RSI2<#_-A

diff --git a/compressions/1/sdt-submission/submission_wrapper.py b/compressions/1/sdt-submission/submission_wrapper.py
index 4c9b756a..78112f51 100644
--- a/compressions/1/sdt-submission/submission_wrapper.py
+++ b/compressions/1/sdt-submission/submission_wrapper.py
@@ -5,7 +5,6 @@
 def detect_time_shifts(
     time_series, latitude=None, longitude=None, data_sampling_frequency=None
 ):
-    raise NotImplementedError("This function is not implemented yet.")
     dh = DataHandler(time_series.to_frame())
     dh.run_pipeline(fix_shifts=True, verbose=False, round_shifts_to_hour=False)
     return dh.time_shift_analysis.correction_estimate

From c56ea8551e65e47d9efd3e0eef90b146745d0560 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Tue, 16 Jul 2024 21:58:25 -0700
Subject: [PATCH 13/20] Updated the worker to handle submission over data files
 using docker env

---
 compressions/4/az-tilt-submission.zip      | Bin 2275 -> 2279 bytes
 pv-validation-hub-client                   |   2 +-
 workers/Dockerfile                         |   8 +-
 workers/src/docker/submission_wrapper.py   |  10 ++-
 workers/src/pvinsight-validation-runner.py |  50 +++++++----
 workers/src/utility.py                     |  93 ++++++++++++++-------
 6 files changed, 107 insertions(+), 56 deletions(-)

diff --git a/compressions/4/az-tilt-submission.zip b/compressions/4/az-tilt-submission.zip
index feeff0f9002f54acd56054a37625b248441cd23b..bd0f422636be750cdcc506b3f88fe80b7aaf9313 100644
GIT binary patch
delta 129
zcmaDX_*`)EdG^MMOcG+E3=9m#rAfJ&#l@NVdGX~%i3J6zMH}@NvrnGLp&<g4WhhE5
zEX^!R%}vcKDb_2gD4FcXA!7^D2Lg-?3P7?Rh%rskE2s?cW@Q7)F)#sPBajZ@0Pz4A
Cdm=di

delta 124
zcmaDZ_*igay_ASB0|P@rS$SefNwKb}Zf<@`X-?|K^^4gjGSyF>z#*jyk_Q1s1_dyU
xsamg~a&iENmezT828N>4!qUv5)ZEm(l48A*ijn|tRyL3}CLpW_(taEu9sr%FAF%)c

diff --git a/pv-validation-hub-client b/pv-validation-hub-client
index 75e16e89..1ce46a5d 160000
--- a/pv-validation-hub-client
+++ b/pv-validation-hub-client
@@ -1 +1 @@
-Subproject commit 75e16e89589e7b113761caaf019dd637b8ead295
+Subproject commit 1ce46a5df02257066fdc2ab86025cc2b2edffabf
diff --git a/workers/Dockerfile b/workers/Dockerfile
index 6d866584..e6ab9a86 100644
--- a/workers/Dockerfile
+++ b/workers/Dockerfile
@@ -1,8 +1,5 @@
-FROM python:3.11-slim as base
+FROM python:3.11-slim
 
-# docker build --progress=plain -t "hmm:Dockerfile" -f valhub/Dockerfile .
-WORKDIR /root
-RUN mkdir worker
 WORKDIR /root/worker
 COPY . .
 
@@ -17,10 +14,7 @@ RUN apt-get install -y libpq-dev python3-psycopg2 curl nano
 # RUN apt-get --assume-yes install python3-pip
 RUN apt-get --assume-yes install mariadb-client supervisor postgresql-client libopenblas-dev
 RUN apt-get --assume-yes install python3-dev default-libmysqlclient-dev build-essential cmake git libhdf5-dev pkg-config
-WORKDIR /root
-WORKDIR /root/worker
 RUN python3 -m pip install --upgrade pip
-WORKDIR /root/worker
 RUN python3 -m pip install -r requirements.txt --timeout=1000
 
 CMD ["/bin/sh", "/root/worker/docker-entrypoint.sh"]
\ No newline at end of file
diff --git a/workers/src/docker/submission_wrapper.py b/workers/src/docker/submission_wrapper.py
index d1ab4f66..29bcdf21 100644
--- a/workers/src/docker/submission_wrapper.py
+++ b/workers/src/docker/submission_wrapper.py
@@ -114,9 +114,13 @@ def main():
 
     print("Getting submission function...")
 
-    submission_function, function_parameters = import_submission_function(
-        submission_file_name, function_name
-    )
+    try:
+        submission_function, function_parameters = import_submission_function(
+            submission_file_name, function_name
+        )
+    except AttributeError as e:
+        error_code = 500
+        exit(error_code)
     print("Got submission function")
 
     print(f"Submission file name: {submission_file_name}")
diff --git a/workers/src/pvinsight-validation-runner.py b/workers/src/pvinsight-validation-runner.py
index 1d4bc499..516d670f 100644
--- a/workers/src/pvinsight-validation-runner.py
+++ b/workers/src/pvinsight-validation-runner.py
@@ -490,8 +490,20 @@ def run(  # noqa: C901
     data_dir: str = os.path.abspath(data_dir)
     results_dir: str = os.path.abspath(results_dir)
 
-    volume_data_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/workers/current_evaluation/data"
-    volume_results_dir = "/Users/mvicto/Desktop/Projects/PVInsight/pv-validation-hub/pv-validation-hub/workers/current_evaluation/results"
+    volume_host_data_dir = os.environ.get("DOCKER_HOST_VOLUME_DATA_DIR")
+    volume_host_results_dir = os.environ.get("DOCKER_HOST_VOLUME_RESULTS_DIR")
+
+    if volume_host_data_dir is None:
+        # TODO: add error code
+        raise RunnerException(
+            *get_error_by_code(500, runner_error_codes, logger)
+        )
+
+    if volume_host_results_dir is None:
+        # TODO: add error code
+        raise RunnerException(
+            *get_error_by_code(500, runner_error_codes, logger)
+        )
 
     func_arguments_list = prepare_function_args_for_parallel_processing(
         image_tag=image_tag,
@@ -500,8 +512,8 @@ def run(  # noqa: C901
         submission_function_name=submission_function_name,
         data_dir=data_dir,
         results_dir=results_dir,
-        volume_data_dir=volume_data_dir,
-        volume_results_dir=volume_results_dir,
+        volume_data_dir=volume_host_data_dir,
+        volume_results_dir=volume_host_results_dir,
     )
 
     # Loop through each file and generate predictions
@@ -510,9 +522,12 @@ def run(  # noqa: C901
 
     # raise Exception("Finished Successfully")
 
-    results_list, number_of_errors = loop_over_files_and_generate_results(
+    number_of_errors = loop_over_files_and_generate_results(
         func_arguments_list
     )
+    logger.info(f"number_of_errors: {number_of_errors}")
+
+    raise Exception("Finished Successfully")
 
     # Convert the results to a pandas dataframe and perform all of the
     # post-processing in the script
@@ -805,6 +820,7 @@ def prepare_function_args_for_parallel_processing(
             submission_args,
             volume_data_dir,
             volume_results_dir,
+            logger,
         )
 
         function_args_list = append_to_list(function_args, function_args_list)
@@ -854,7 +870,7 @@ def run_submission(
 
 def loop_over_files_and_generate_results(
     func_arguments_list: list[Tuple],
-) -> tuple[list[dict[str, Any]], int]:
+) -> int:
 
     # func_arguments_list = prepare_function_args_for_parallel_processing(
     #     file_metadata_df,
@@ -879,7 +895,7 @@ def loop_over_files_and_generate_results(
 
     # Test the first two files
     logger.info(f"Testing the first {NUM_FILES_TO_TEST} files...")
-    test_results = dask_multiprocess(
+    test_errors = dask_multiprocess(
         submission_task,
         test_func_argument_list,
         n_workers=NUM_FILES_TO_TEST,
@@ -888,7 +904,7 @@ def loop_over_files_and_generate_results(
         logger=logger,
     )
 
-    errors = [error for _, error in test_results]
+    errors = [error for error, error_code in test_errors]
     number_of_errors += sum(errors)
 
     if number_of_errors == NUM_FILES_TO_TEST:
@@ -903,10 +919,10 @@ def loop_over_files_and_generate_results(
     # Test the rest of the files
 
     logger.info(f"Testing the rest of the files...")
-    rest_results = []
+    rest_errors = []
     try:
-        rest_results = dask_multiprocess(
-            run_submission_and_generate_performance_metrics,
+        rest_errors = dask_multiprocess(
+            submission_task,
             rest_func_argument_list,
             # n_workers=4,
             threads_per_worker=1,
@@ -929,16 +945,16 @@ def loop_over_files_and_generate_results(
         raise RunnerException(
             *get_error_by_code(500, runner_error_codes, logger)
         )
-    errors = [error for _, error in rest_results]
+    errors = [error for error, error_code in rest_errors]
     number_of_errors += sum(errors)
 
-    test_results = [result for result, _ in test_results if result is not None]
-    rest_results = [result for result, _ in rest_results if result is not None]
+    # test_errors = [result for result, _ in test_errors if result is not None]
+    # rest_errors = [result for result, _ in rest_errors if result is not None]
 
-    results.extend(test_results)
-    results.extend(rest_results)
+    # results.extend(test_errors)
+    # results.extend(rest_errors)
 
-    return results, number_of_errors
+    return number_of_errors
 
 
 def generate_performance_metrics_for_submission(
diff --git a/workers/src/utility.py b/workers/src/utility.py
index b77de5e7..9896d1e1 100644
--- a/workers/src/utility.py
+++ b/workers/src/utility.py
@@ -9,11 +9,8 @@
 from docker.models.images import Image
 
 from concurrent.futures import (
-    ProcessPoolExecutor,
     ThreadPoolExecutor,
-    as_completed,
 )
-from functools import wraps
 from logging import Logger
 from time import perf_counter, sleep
 import os
@@ -878,10 +875,6 @@ def __exit__(self, exc_type, exc_value, exc_traceback):
                 self.container.stop()
             self.container.remove()
 
-        self.client.containers.prune(
-            filters={"label": "status=exited", "label": "status=created"}
-        )
-
 
 def docker_task(
     client: docker.DockerClient,
@@ -892,7 +885,11 @@ def docker_task(
     submission_args: Sequence[Any],
     data_dir: str,
     results_dir: str,
-):
+    logger: Logger | None = None,
+) -> tuple[bool, int | None]:
+
+    error_raised = False
+    error_code: int | None = None
 
     if submission_args is None:
         submission_args = []
@@ -917,18 +914,40 @@ def docker_task(
     with DockerContainerContextManager(
         client, image, command, volumes, memory_limit
     ) as container:
-        print("Docker container starting...")
-        print(f"Image: {image}")
-        print(f"Submission file name: {submission_file_name}")
-        print(f"Submission function name: {submission_function_name}")
-        print(f"Submission args: {submission_args}")
+        logger_if_able("Docker container starting...", logger)
+        logger_if_able(f"Image: {image}", logger)
+        logger_if_able(f"Submission file name: {submission_file_name}", logger)
+        logger_if_able(
+            f"Submission function name: {submission_function_name}", logger
+        )
+        logger_if_able(f"Submission args: {submission_args}", logger)
 
         # Wait for container to finish
         for line in container.logs(stream=True):
             line = cast(str, line)
-            print(line.strip())
+            logger_if_able(line.strip(), logger)
+
+        try:
+            container_dict: dict[str, Any] = container.wait()
+        except Exception as e:
+            error_raised = True
+            error_code = 500
+            logger_if_able(f"Error: {e}", logger, "ERROR")
+            return error_raised, error_code
+
+        if "StatusCode" not in container_dict:
+            raise Exception(
+                "Error: Docker container did not return status code"
+            )
+
+        exit_code: int = cast(int, container_dict["StatusCode"])
+
+        if exit_code != 0:
+            error_raised = True
+            error_code = exit_code
+            logger_if_able("Error: Docker container exited with error", logger)
 
-        container.wait()
+    return error_raised, error_code
 
 
 def submission_task(
@@ -939,19 +958,35 @@ def submission_task(
     submission_args: Sequence[Any],
     data_dir: str,
     results_dir: str,
-):
+    logger: Logger | None = None,
+) -> tuple[bool, int | None]:
+
+    error = False
+    error_code: int | None = None
 
     with DockerClientContextManager() as client:
-        docker_task(
-            client=client,
-            image=image_tag,
-            memory_limit=memory_limit,
-            submission_file_name=submission_file_name,
-            submission_function_name=submission_function_name,
-            submission_args=submission_args,
-            data_dir=data_dir,
-            results_dir=results_dir,
-        )
+        try:
+            error_raised, error_code_raised = docker_task(
+                client=client,
+                image=image_tag,
+                memory_limit=memory_limit,
+                submission_file_name=submission_file_name,
+                submission_function_name=submission_function_name,
+                submission_args=submission_args,
+                data_dir=data_dir,
+                results_dir=results_dir,
+                logger=logger,
+            )
+            if error_raised:
+                error = True
+                error_code = error_code_raised
+                logger_if_able("Error: Docker task failed", logger, "ERROR")
+        except Exception as e:
+            error = True
+            error_code = 500
+            logger_if_able(f"Error: {e}", None, "ERROR")
+
+    return error, error_code
 
 
 def create_docker_image(
@@ -994,6 +1029,7 @@ def create_docker_image(
         image, build_logs = client.images.build(
             path=dir_path,
             tag=tag,
+            rm=True,
             dockerfile="Dockerfile",
             buildargs={"zip_file": f"{submission_file_name}"},
         )
@@ -1020,10 +1056,10 @@ def __exit__(self, exc_type, exc_value, exc_traceback):
 
 
 def initialize_docker_client():
-    base_url = os.environ.get("DOCKER_HOST")
+    base_url = os.environ.get("DOCKER_HOST", None)
 
     if not base_url:
-        raise FileNotFoundError("DOCKER_HOST environment variable not set")
+        logger_if_able("Docker host not set", None, "WARNING")
 
     # cert_path = os.environ.get("DOCKER_CERT_PATH")
     # if not cert_path:
@@ -1139,6 +1175,7 @@ def dask_main():
                 submission_args,
                 data_dir,
                 results_dir,
+                logger,
             )
             lazy_results.append(lazy_result)
 

From 5d492e0bb89924742e8e00c39c1386dd98603e82 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Wed, 17 Jul 2024 20:56:28 -0700
Subject: [PATCH 14/20] Adding support for allowable kwargs from config

---
 workers/src/docker/submission_wrapper.py   | 32 ++++++++++--
 workers/src/pvinsight-validation-runner.py | 58 ++++++++++++++++++++--
 2 files changed, 82 insertions(+), 8 deletions(-)

diff --git a/workers/src/docker/submission_wrapper.py b/workers/src/docker/submission_wrapper.py
index 29bcdf21..f46e99f2 100644
--- a/workers/src/docker/submission_wrapper.py
+++ b/workers/src/docker/submission_wrapper.py
@@ -1,5 +1,6 @@
 from importlib import import_module
 import inspect
+from re import sub
 import sys
 import pandas as pd
 import numpy as np
@@ -57,7 +58,9 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> Tuple[T, float]:
     return decorator
 
 
-def format_args_for_submission(data_dir: str, args: list[str]):
+def format_args_for_submission(
+    data_dir: str, function_params: list[str], args: list[str]
+):
     filename = args[0]
 
     file_path = f"{data_dir}/file_data/{filename}"
@@ -68,11 +71,28 @@ def format_args_for_submission(data_dir: str, args: list[str]):
         parse_dates=True,
     )
 
-    print(df.head(5))
+    series: pd.Series = df.squeeze()
+
+    rest_args = args[1:]
+    new_args = []
+
+    for arg in rest_args:
+        if arg.isdigit():
+            new_args.append(int(arg))
+        elif arg.isdecimal():
+            new_args.append(float(arg))
+        else:
+            new_args.append(arg)
+
+    submission_args: list = [series, *new_args]
 
-    series: pd.Series = df.asfreq("60min").squeeze()
+    if len(submission_args) != len(function_params):
+        print(
+            f"Function parameters do not match submission arguments: {submission_args}"
+        )
+        submission_args = submission_args[: len(function_params)]
 
-    submission_args = [series, *args[1:]]
+    print(f"Submission args: {submission_args}")
 
     return submission_args
 
@@ -131,7 +151,9 @@ def main():
     data_dir = "/app/data"
     results_dir = "/app/results"
 
-    submission_args = format_args_for_submission(data_dir, args[2:])
+    submission_args = format_args_for_submission(
+        data_dir, function_parameters, args[2:]
+    )
 
     print(f"Submission args: {submission_args}")
 
diff --git a/workers/src/pvinsight-validation-runner.py b/workers/src/pvinsight-validation-runner.py
index 516d670f..cf550fec 100644
--- a/workers/src/pvinsight-validation-runner.py
+++ b/workers/src/pvinsight-validation-runner.py
@@ -510,6 +510,7 @@ def run(  # noqa: C901
         memory_limit=memory_limit,
         submission_file_name=submission_module_name,
         submission_function_name=submission_function_name,
+        current_evaluation_dir=current_evaluation_dir,
         data_dir=data_dir,
         results_dir=results_dir,
         volume_data_dir=volume_host_data_dir,
@@ -762,12 +763,39 @@ def install_module_dependencies(
 
 
 def create_function_args_for_file(
-    file_metadata_row: pd.Series, *args, **kwargs
+    file_metadata_row: pd.Series,
+    system_metadata_row: pd.Series,
+    allowable_kwargs: list[str],
 ):
 
     submission_file_name: str = cast(str, file_metadata_row["file_name"])
+
+    # Join both the file and system metadata into a single dictionary
+    merged_row = pd.merge(
+        file_metadata_row.to_frame().T,
+        system_metadata_row.to_frame().T,
+        on="system_id",
+        how="inner",
+    ).squeeze()
+
+    args: list[str] = []
+
+    for argument in allowable_kwargs:
+        if argument not in merged_row:
+            logger.error(f"argument {argument} not found in merged_row")
+            # raise RunnerException(
+            #     *get_error_by_code(500, runner_error_codes, logger)
+            # )
+            args.append("")
+            continue
+        value = merged_row[argument]
+
+        args.append(str(value))
+
     # Submission Args for the function
-    function_args = (submission_file_name,)
+    function_args = (submission_file_name, *args)
+
+    logger.info(f"function_args: {function_args}")
 
     return function_args
 
@@ -787,6 +815,7 @@ def prepare_function_args_for_parallel_processing(
     memory_limit: str,
     submission_file_name: str,
     submission_function_name: str,
+    current_evaluation_dir: str,
     data_dir: str,
     results_dir: str,
     volume_data_dir: str,
@@ -801,17 +830,40 @@ def prepare_function_args_for_parallel_processing(
         os.path.join(data_dir, "metadata", "system_metadata.csv")
     )
 
+    config_data: dict[str, Any] = json.load(
+        open(os.path.join(current_evaluation_dir, "config.json"))
+    )
+
     function_args_list = None
 
+    allowable_kwargs: list[str] = config_data.get("allowable_kwargs", {})
+
+    logger.info(f"allowable_kwargs: {allowable_kwargs}")
+
     for file_number, (_, file_metadata_row) in enumerate(
         file_metadata_df.iterrows()
     ):
 
+        system_metadata_row: pd.Series = system_metadata_df[
+            system_metadata_df["system_id"] == file_metadata_row["system_id"]
+        ].iloc[0]
+
+        if system_metadata_row.empty:
+            logger.error(
+                f"system_metadata not found for system_id: {file_metadata_row['system_id']}"
+            )
+            raise RunnerException(
+                *get_error_by_code(500, runner_error_codes, logger)
+            )
+
         submission_args = create_function_args_for_file(
             file_metadata_row,
-            submission_function_name=submission_function_name,
+            system_metadata_row,
+            allowable_kwargs,
         )
 
+        logger.info(f"submission_args: {submission_args}")
+
         function_args = (
             image_tag,
             memory_limit,

From b48850820c8c6b1d2d43dfd304e7bda5d57189f1 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Wed, 17 Jul 2024 21:26:39 -0700
Subject: [PATCH 15/20] Convert string args back into float or int in docker
 python main entrypoint

---
 workers/src/docker/submission_wrapper.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/workers/src/docker/submission_wrapper.py b/workers/src/docker/submission_wrapper.py
index f46e99f2..de1a7c71 100644
--- a/workers/src/docker/submission_wrapper.py
+++ b/workers/src/docker/submission_wrapper.py
@@ -58,6 +58,14 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> Tuple[T, float]:
     return decorator
 
 
+def is_float(value: str) -> bool:
+    try:
+        float(value)
+        return True
+    except ValueError:
+        return False
+
+
 def format_args_for_submission(
     data_dir: str, function_params: list[str], args: list[str]
 ):
@@ -77,10 +85,10 @@ def format_args_for_submission(
     new_args = []
 
     for arg in rest_args:
-        if arg.isdigit():
-            new_args.append(int(arg))
-        elif arg.isdecimal():
+        if is_float(arg):
             new_args.append(float(arg))
+        elif arg.isdigit():
+            new_args.append(int(arg))
         else:
             new_args.append(arg)
 

From 03a74339ca902cd8f70f193bc8eb3a86cf9295a3 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Thu, 18 Jul 2024 20:45:59 -0700
Subject: [PATCH 16/20] Added performance metric generation from results back
 into pipeline

---
 workers/src/docker/submission_wrapper.py   |  31 ++-
 workers/src/logging_config.json            |   4 +-
 workers/src/pvinsight-validation-runner.py | 269 ++++++++++++++++-----
 3 files changed, 227 insertions(+), 77 deletions(-)

diff --git a/workers/src/docker/submission_wrapper.py b/workers/src/docker/submission_wrapper.py
index de1a7c71..3a6ce613 100644
--- a/workers/src/docker/submission_wrapper.py
+++ b/workers/src/docker/submission_wrapper.py
@@ -1,5 +1,6 @@
 from importlib import import_module
 import inspect
+import pathlib
 from re import sub
 import sys
 import pandas as pd
@@ -114,9 +115,9 @@ def import_submission_function(submission_file_name: str, function_name: str):
         raise e
 
     try:
-        submission_function: Callable[[pd.Series, Any], np.ndarray] = getattr(
-            submission_module, function_name
-        )
+        submission_function: Callable[
+            [pd.Series, Any], np.ndarray | tuple[float, float]
+        ] = getattr(submission_module, function_name)
         function_parameters = list(
             inspect.signature(submission_function).parameters.keys()
         )
@@ -169,20 +170,30 @@ def main():
 
     print(f"Execution time: {execution_time}")
 
-    print(f"Results: {results}")
-
     # save results to csv file
 
     print(f"Saving results to {results_dir}/{data_file_name}")
+    if isinstance(results, tuple):
 
-    results_df = pd.DataFrame(results)
+        results_df = pd.DataFrame([results])
+    else:
+        results_df = pd.DataFrame(results)
+    print(f"Results: {results_df}")
     results_file = f"{results_dir}/{data_file_name}"
-    results_df.to_csv(results_file)
+    results_df.to_csv(results_file, header=True)
+
+    columns = ["file_name", "execution_time"]
+
+    execution_file = f"{results_dir}/execution_time.csv"
+    time_file = pathlib.Path(execution_file)
+
+    if not time_file.exists():
+        time_df = pd.DataFrame(columns=columns)
+        time_df.to_csv(execution_file, index=False, header=True)
 
     execution_tuple = (data_file_name, execution_time)
-    execution_file = f"{results_dir}/time.csv"
-    execution_df = pd.DataFrame([execution_tuple])
-    execution_df.to_csv(execution_file, mode="a", header=False)
+    execution_df = pd.DataFrame([execution_tuple], columns=columns)
+    execution_df.to_csv(execution_file, mode="a", header=False, index=False)
 
 
 if __name__ == "__main__":
diff --git a/workers/src/logging_config.json b/workers/src/logging_config.json
index 09df840d..4a2c400a 100644
--- a/workers/src/logging_config.json
+++ b/workers/src/logging_config.json
@@ -33,13 +33,13 @@
         },
         "file": {
             "class": "logging.handlers.RotatingFileHandler",
-            "level": "INFO",
+            "level": "DEBUG",
             "formatter": "detailed",
             "filename": "logs/submission.log"
         },
         "json_file": {
             "class": "logging.handlers.RotatingFileHandler",
-            "level": "INFO",
+            "level": "DEBUG",
             "formatter": "json",
             "filename": "logs/submission.log.jsonl"
         }
diff --git a/workers/src/pvinsight-validation-runner.py b/workers/src/pvinsight-validation-runner.py
index cf550fec..3d4f9943 100644
--- a/workers/src/pvinsight-validation-runner.py
+++ b/workers/src/pvinsight-validation-runner.py
@@ -16,6 +16,7 @@
       This section will be dependent on the type of analysis being run.
 """
 
+from logging import config
 from typing import Any, Callable, Sequence, Tuple, TypeVar, cast, ParamSpec
 import pandas as pd
 import os
@@ -528,7 +529,15 @@ def run(  # noqa: C901
     )
     logger.info(f"number_of_errors: {number_of_errors}")
 
-    raise Exception("Finished Successfully")
+    # raise Exception("Finished Successfully")
+
+    results_list = loop_over_results_and_generate_metrics(
+        data_dir=data_dir,
+        results_dir=results_dir,
+        current_evaluation_dir=current_evaluation_dir,
+    )
+
+    # raise Exception("Finished Successfully")
 
     # Convert the results to a pandas dataframe and perform all of the
     # post-processing in the script
@@ -540,11 +549,17 @@ def run(  # noqa: C901
     # First get mean value for all the performance metrics and save (this will
     # be saved to a public metrics dictionary)
     public_metrics_dict: dict[str, Any] = dict()
+
+    module_name = "submission"
+
     public_metrics_dict["module"] = module_name
     # Get the mean and median run times
     public_metrics_dict["mean_run_time"] = results_df["run_time"].mean()
     public_metrics_dict["median_run_time"] = results_df["run_time"].median()
-    public_metrics_dict["function_parameters"] = function_parameters
+    public_metrics_dict["function_parameters"] = [
+        "time_series",
+        *config_data["allowable_kwargs"],
+    ]
     public_metrics_dict["data_requirements"] = results_df[
         "data_requirements"
     ].iloc[0]
@@ -1009,25 +1024,109 @@ def loop_over_files_and_generate_results(
     return number_of_errors
 
 
+def loop_over_results_and_generate_metrics(
+    data_dir: str,
+    results_dir: str,
+    current_evaluation_dir: str,
+) -> list[dict[str, Any]]:
+    all_results: list[dict[str, Any]] = []
+
+    file_metadata_df: pd.DataFrame = pd.read_csv(
+        os.path.join(data_dir, "metadata", "file_metadata.csv")
+    )
+
+    system_metadata_df = pd.read_csv(
+        os.path.join(data_dir, "metadata", "system_metadata.csv")
+    )
+
+    config_data: dict[str, Any] = json.load(
+        open(os.path.join(current_evaluation_dir, "config.json"))
+    )
+
+    submission_execution_times_df = pd.read_csv(
+        os.path.join(results_dir, "execution_time.csv")
+    )
+
+    for _, file_metadata_row in file_metadata_df.iterrows():
+
+        file_name = file_metadata_row["file_name"]
+
+        system_metadata_dict: dict[str, Any] = dict(
+            system_metadata_df[
+                system_metadata_df["system_id"]
+                == file_metadata_row["system_id"]
+            ].iloc[0]
+        )
+
+        try:
+            submission_runtime: float = cast(
+                float,
+                submission_execution_times_df[
+                    submission_execution_times_df["file_name"] == file_name
+                ]["execution_time"].iloc[0],
+            )
+        except IndexError:
+            logger.error(
+                f"submission_runtime not found for file {file_name}. Exiting."
+            )
+            continue
+
+        function_parameters = ["time_series", *config_data["allowable_kwargs"]]
+
+        result = generate_performance_metrics_for_submission(
+            file_name,
+            config_data,
+            system_metadata_dict,
+            results_dir,
+            data_dir,
+            submission_runtime,
+            function_parameters,
+        )
+
+        logger.info(f"{file_name}: {result}")
+        all_results.append(result)
+
+    return all_results
+
+
 def generate_performance_metrics_for_submission(
-    data_outputs: pd.Series,
-    function_run_time: float,
     file_name: str,
-    data_dir: str,
-    associated_metadata: dict[str, Any],
     config_data: dict[str, Any],
+    system_metadata_dict: dict[str, Any],
+    results_dir: str,
+    data_dir: str,
+    submission_runtime: float,
     function_parameters: list[str],
-    performance_metrics: list[str],
 ):
+
+    performance_metrics = config_data["performance_metrics"]
+
+    submission_output_row: pd.Series | None = None
+    submission_output_series: pd.Series | None = None
+
     # Get the ground truth scalars that we will compare to
-    ground_truth_dict = dict()
+    ground_truth_dict: dict[str, Any] = dict()
     if config_data["comparison_type"] == "scalar":
+        submission_output_row = cast(
+            pd.Series,
+            pd.read_csv(
+                os.path.join(results_dir, file_name),
+                index_col=0,
+            ).iloc[0],
+        )
         for val in config_data["ground_truth_compare"]:
-            ground_truth_dict[val] = associated_metadata[val]
-            logger.debug(
+            ground_truth_dict[val] = system_metadata_dict[val]
+            logger.info(
                 f'ground_truth_dict["{val}"]: {ground_truth_dict[val]}'
             )
     if config_data["comparison_type"] == "time_series":
+        submission_output_series = cast(
+            pd.Series,
+            pd.read_csv(
+                os.path.join(results_dir, file_name),
+                index_col=0,
+            ).squeeze(),
+        )
         ground_truth_series: pd.Series = pd.read_csv(
             os.path.join(data_dir + "/validation_data/", file_name),
             index_col=0,
@@ -1037,7 +1136,7 @@ def generate_performance_metrics_for_submission(
 
         ground_truth_file_length = len(ground_truth_series)
 
-        file_submission_result_length = len(data_outputs)
+        file_submission_result_length = len(submission_output_series)
         if file_submission_result_length != ground_truth_file_length:
             logger.error(
                 f"{file_name} submission result length {file_submission_result_length} does not match ground truth file length {ground_truth_file_length}"
@@ -1052,18 +1151,36 @@ def generate_performance_metrics_for_submission(
     # ground truth dictionary
     output_dictionary: dict[str, Any] = dict()
     if config_data["comparison_type"] == "scalar":
+        if submission_output_row is None:
+            logger.error(
+                f"submission_output_row is None for {file_name}. Exiting."
+            )
+            error_code = 9
+            raise RunnerException(
+                *get_error_by_code(error_code, runner_error_codes, logger)
+            )
+
         for idx in range(len(config_data["ground_truth_compare"])):
+
+            logger.info(f"submission_output_row: {submission_output_row}")
+            logger.info(
+                f"submission_output_row[{idx}]: {submission_output_row[idx]}"
+            )
+            logger.info(
+                f"config_data['ground_truth_compare'][{idx}]: {config_data['ground_truth_compare'][idx]}"
+            )
+
             output_dictionary[config_data["ground_truth_compare"][idx]] = (
-                data_outputs[idx]
+                submission_output_row[idx]
             )
     if config_data["comparison_type"] == "time_series":
-        output_dictionary["time_series"] = data_outputs
+        output_dictionary["time_series"] = submission_output_series
     # Run routine for all of the performance metrics and append
     # results to the dictionary
     results_dictionary: dict[str, Any] = dict()
     results_dictionary["file_name"] = file_name
     # Set the runtime in the results dictionary
-    results_dictionary["run_time"] = function_run_time
+    results_dictionary["run_time"] = submission_runtime
     # Set the data requirements in the dictionary, JSON required or bad juju happens in my DB and FE
     results_dictionary["data_requirements"] = json.dumps(function_parameters)
     # Loop through the rest of the performance metrics and calculate them
@@ -1073,66 +1190,88 @@ def generate_performance_metrics_for_submission(
             # Loop through the input and the output dictionaries,
             # and calculate the absolute error
             for val in config_data["ground_truth_compare"]:
-                error = np.abs(output_dictionary[val] - ground_truth_dict[val])
+
+                logger.debug(
+                    f"output_dictionary[val]: {output_dictionary[val]}"
+                )
+                logger.debug(
+                    f"ground_truth_dict[val]: {ground_truth_dict[val]}"
+                )
+                difference = output_dictionary[val] - ground_truth_dict[val]
+                logger.debug(f"difference: {difference}")
+
+                error = np.abs(difference)
                 logger.debug(f"error for {val}: {error}")
                 results_dictionary[metric + "_" + val] = error
         elif metric == "mean_absolute_error":
             for val in config_data["ground_truth_compare"]:
-                error = np.mean(
-                    np.abs(output_dictionary[val] - ground_truth_dict[val])
-                )
-                results_dictionary[metric + "_" + val] = error
-    return results_dictionary
 
+                output_series: pd.Series = output_dictionary[val]
+                logger.debug(f"output_series: {output_series}")
 
-@timeout(SUBMISSION_TIMEOUT)
-def run_submission_and_generate_performance_metrics(
-    file_name: str,
-    data_dir: str,
-    associated_system_metadata: dict[str, Any],
-    config_data: dict[str, Any],
-    submission_function: Callable[P, pd.Series],
-    function_parameters: list[str],
-    file_metadata_row: pd.Series,
-    function_name: str,
-    performance_metrics: list[str],
-    file_number: int,
-):
+                ground_truth_series: pd.Series = ground_truth_dict[val]
+                logger.debug(f"ground_truth_series: {ground_truth_series}")
 
-    error = False
-    try:
-        logger.info(f"{file_number} - running submission for file {file_name}")
-        # Get file_name, which will be pulled from database or S3 for
-        # each analysis
-        (
-            data_outputs,
-            function_run_time,
-        ) = run_submission(
-            file_name,
-            data_dir,
-            associated_system_metadata,
-            config_data,
-            submission_function,
-            function_parameters,
-            file_metadata_row,
-        )
+                # copy index from ground truth series
+                output_series.index = ground_truth_series.index
 
-        results_dictionary = generate_performance_metrics_for_submission(
-            data_outputs,
-            function_run_time,
-            file_name,
-            data_dir,
-            associated_system_metadata,
-            config_data,
-            function_parameters,
-            performance_metrics,
-        )
+                difference = output_series - ground_truth_series
+                logger.debug(f"difference: {difference}")
+                error = np.mean(difference)
 
-        return results_dictionary, error
-    except Exception as e:
-        logger.error(f"error running function {function_name}: {e}")
-        error = True
-        return None, error
+                logger.debug(f"mean_absolute_error for {val}: {error}")
+                results_dictionary[metric + "_" + val] = error
+    return results_dictionary
+
+
+# @timeout(SUBMISSION_TIMEOUT)
+# def run_submission_and_generate_performance_metrics(
+#     file_name: str,
+#     data_dir: str,
+#     associated_system_metadata: dict[str, Any],
+#     config_data: dict[str, Any],
+#     submission_function: Callable[P, pd.Series],
+#     function_parameters: list[str],
+#     file_metadata_row: pd.Series,
+#     function_name: str,
+#     performance_metrics: list[str],
+#     file_number: int,
+# ):
+
+#     error = False
+#     try:
+#         logger.info(f"{file_number} - running submission for file {file_name}")
+#         # Get file_name, which will be pulled from database or S3 for
+#         # each analysis
+#         (
+#             data_outputs,
+#             function_run_time,
+#         ) = run_submission(
+#             file_name,
+#             data_dir,
+#             associated_system_metadata,
+#             config_data,
+#             submission_function,
+#             function_parameters,
+#             file_metadata_row,
+#         )
+
+#         results_dictionary = generate_performance_metrics_for_submission(
+#             data_outputs,
+#             function_run_time,
+#             file_name,
+#             data_dir,
+#             associated_system_metadata,
+#             config_data,
+#             function_parameters,
+#             performance_metrics,
+#         )
+
+#         return results_dictionary, error
+#     except Exception as e:
+#         logger.error(f"error running function {function_name}: {e}")
+#         error = True
+#         return None, error
 
 
 def prepare_kwargs_for_submission_function(

From b99ca59a21fbb6b25903588785bf780b044d0a05 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Thu, 18 Jul 2024 20:46:49 -0700
Subject: [PATCH 17/20] Changed indexing by column to positional .iloc lookup

---
 workers/src/pvinsight-validation-runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workers/src/pvinsight-validation-runner.py b/workers/src/pvinsight-validation-runner.py
index 3d4f9943..9eacdec7 100644
--- a/workers/src/pvinsight-validation-runner.py
+++ b/workers/src/pvinsight-validation-runner.py
@@ -1164,7 +1164,7 @@ def generate_performance_metrics_for_submission(
 
             logger.info(f"submission_output_row: {submission_output_row}")
             logger.info(
-                f"submission_output_row[{idx}]: {submission_output_row[idx]}"
+                f"submission_output_row[{idx}]: {submission_output_row.iloc[idx]}"
             )
             logger.info(
                 f"config_data['ground_truth_compare'][{idx}]: {config_data['ground_truth_compare'][idx]}"

From 17166a39522ac379849b4a3c91958667d140634c Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Fri, 19 Jul 2024 12:20:25 -0700
Subject: [PATCH 18/20] Fixed issue with scalar type analysis metrics not being
 passed correctly

---
 pv-validation-hub-client                   |  2 +-
 valhub/submissions/views.py                | 16 ++--
 workers/src/pvinsight-validation-runner.py | 87 +++++++++++-----------
 workers/src/utility.py                     |  8 +-
 4 files changed, 62 insertions(+), 51 deletions(-)

diff --git a/pv-validation-hub-client b/pv-validation-hub-client
index 1ce46a5d..06ad0caf 160000
--- a/pv-validation-hub-client
+++ b/pv-validation-hub-client
@@ -1 +1 @@
-Subproject commit 1ce46a5df02257066fdc2ab86025cc2b2edffabf
+Subproject commit 06ad0cafefaf53ff5ff70e209ac1ef27e018d74f
diff --git a/valhub/submissions/views.py b/valhub/submissions/views.py
index 566ea468..ed4e8d42 100644
--- a/valhub/submissions/views.py
+++ b/valhub/submissions/views.py
@@ -262,19 +262,23 @@ def update_submission_result(request: Request, submission_id: str):
         return Response(response_data, status=status.HTTP_400_BAD_REQUEST)
 
     required_fields = [
-        "mean_mean_absolute_error",
         "mean_run_time",
         "function_parameters",
         "metrics",
     ]
 
-    if not all(field in results for field in required_fields):
-        response_data = {"error": "missing required fields"}
-        return Response(response_data, status=status.HTTP_400_BAD_REQUEST)
+    for field in required_fields:
+        if field not in results:
+            response_data = {"error": f"{field} is required"}
+            return Response(response_data, status=status.HTTP_400_BAD_REQUEST)
 
     logging.info(f"results = {results}")
-    submission.mae = float(results["mean_mean_absolute_error"])
-    submission.mrt = float(results["mean_run_time"])
+
+    if "mean_absolute_error" in results:
+        submission.mae = float(results["mean_absolute_error"])
+    elif "mean_mean_absolute_error" in results:
+        submission.mae = float(results["mean_mean_absolute_error"])
+    # submission.mrt = float(results["mean_run_time"])
     submission.data_requirements = results["function_parameters"]
     submission.result = results["metrics"]
     try:
diff --git a/workers/src/pvinsight-validation-runner.py b/workers/src/pvinsight-validation-runner.py
index 9eacdec7..e08ffc63 100644
--- a/workers/src/pvinsight-validation-runner.py
+++ b/workers/src/pvinsight-validation-runner.py
@@ -572,12 +572,15 @@ def run(  # noqa: C901
     # are valid keys, anything else breaks our results processing
     for metric in performance_metrics:
         if "absolute_error" in metric:
+            # QUESTION: Does this need to loop over all the ground truth compare values?
             for val in config_data["ground_truth_compare"]:
                 logger.info(
                     f"metric: {metric}, val: {val}, combined: {'mean_' + metric}"
                 )
 
-                mean_metric = results_df[metric + "_" + val].mean()
+                metric_name = metric + "_" + val
+
+                mean_metric = results_df[metric_name].mean()
 
                 public_metrics_dict["mean_" + metric] = mean_metric
 
@@ -587,7 +590,7 @@ def run(  # noqa: C901
                 )
                 metrics_list.append(metric_tuple)
 
-                median_metric = results_df[metric + "_" + val].median()
+                median_metric = results_df[metric_name].median()
                 public_metrics_dict["median_" + metric] = median_metric
 
                 metric_tuple = (
@@ -662,46 +665,46 @@ def run(  # noqa: C901
 
     # Loop through all of the plot dictionaries and generate plots and
     # associated tables for reporting
-    for plot in config_data["plots"]:
-        if plot["type"] == "histogram":
-            if "color_code" in plot:
-                color_code = plot["color_code"]
-            else:
-                color_code = None
-            gen_plot = generate_histogram(
-                results_df_private, plot["x_val"], plot["title"], color_code
-            )
-            # Save the plot
-            gen_plot.savefig(os.path.join(results_dir, plot["save_file_path"]))
-            plt.close()
-            plt.clf()
-            # Write the stratified results to a table for private reporting
-            # (if color_code param is not None)
-            if color_code:
-                stratified_results_tbl = pd.DataFrame(
-                    results_df_private.groupby(color_code)[
-                        plot["x_val"]
-                    ].mean()
-                )
-                stratified_results_tbl.to_csv(
-                    os.path.join(
-                        results_dir,
-                        module_name
-                        + "_"
-                        + str(color_code)
-                        + "_"
-                        + plot["x_val"]
-                        + ".csv",
-                    )
-                )
-        if plot["type"] == "scatter_plot":
-            gen_plot = generate_scatter_plot(
-                results_df_private, plot["x_val"], plot["y_val"], plot["title"]
-            )
-            # Save the plot
-            gen_plot.savefig(os.path.join(results_dir, plot["save_file_path"]))
-            plt.close()
-            plt.clf()
+    # for plot in config_data["plots"]:
+    #     if plot["type"] == "histogram":
+    #         if "color_code" in plot:
+    #             color_code = plot["color_code"]
+    #         else:
+    #             color_code = None
+    #         gen_plot = generate_histogram(
+    #             results_df_private, plot["x_val"], plot["title"], color_code
+    #         )
+    #         # Save the plot
+    #         gen_plot.savefig(os.path.join(results_dir, plot["save_file_path"]))
+    #         plt.close()
+    #         plt.clf()
+    #         # Write the stratified results to a table for private reporting
+    #         # (if color_code param is not None)
+    #         if color_code:
+    #             stratified_results_tbl = pd.DataFrame(
+    #                 results_df_private.groupby(color_code)[
+    #                     plot["x_val"]
+    #                 ].mean()
+    #             )
+    #             stratified_results_tbl.to_csv(
+    #                 os.path.join(
+    #                     results_dir,
+    #                     module_name
+    #                     + "_"
+    #                     + str(color_code)
+    #                     + "_"
+    #                     + plot["x_val"]
+    #                     + ".csv",
+    #                 )
+    #             )
+    #     if plot["type"] == "scatter_plot":
+    #         gen_plot = generate_scatter_plot(
+    #             results_df_private, plot["x_val"], plot["y_val"], plot["title"]
+    #         )
+    #         # Save the plot
+    #         gen_plot.savefig(os.path.join(results_dir, plot["save_file_path"]))
+    #         plt.close()
+    #         plt.clf()
 
     logger.info(f"number_of_errors: {number_of_errors}")
 
diff --git a/workers/src/utility.py b/workers/src/utility.py
index 9896d1e1..a9212c39 100644
--- a/workers/src/utility.py
+++ b/workers/src/utility.py
@@ -823,9 +823,13 @@ def generate_private_report_for_submission(
         subprocess.run(
             cli_commands[action],
             check=True,
-            # stdout=subprocess.PIPE,
-            # stderr=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
         )
+    except subprocess.CalledProcessError as e:
+        combined_output = e.stdout + "\n" + e.stderr
+        logger_if_able(f"Error: {combined_output}", logger, "ERROR")
     except Exception as e:
         logger_if_able(f"Error: {e}", logger, "ERROR")
         raise e

From ac952a7409fab93be953a86db217e83ae2acfb86 Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Tue, 23 Jul 2024 12:08:48 -0700
Subject: [PATCH 19/20] Added content type "text/html" for html file uploads

---
 workers/src/submission_worker.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/workers/src/submission_worker.py b/workers/src/submission_worker.py
index f879e371..d1363a31 100644
--- a/workers/src/submission_worker.py
+++ b/workers/src/submission_worker.py
@@ -103,7 +103,16 @@ def push_to_s3(local_file_path, s3_file_path, analysis_id, submission_id):
     else:
         s3 = boto3.client("s3")
         try:
-            s3.upload_file(local_file_path, S3_BUCKET_NAME, s3_file_path)
+            extra_args = {}
+            if s3_file_path.endswith(".html"):
+                extra_args = {"ContentType": "text/html"}
+            ExtraArgs = extra_args if extra_args else None
+            s3.upload_file(
+                local_file_path,
+                S3_BUCKET_NAME,
+                s3_file_path,
+                ExtraArgs=ExtraArgs,
+            )
         except botocore.exceptions.ClientError as e:
             logger.error(f"Error: {e}")
             logger.info(f"update submission status to {FAILED}")

From 84352807de1b7c6849be5bf26fae8ab5e738c3bd Mon Sep 17 00:00:00 2001
From: Mitchell Victoriano <mitchell.victoriano@gmail.com>
Date: Tue, 23 Jul 2024 14:19:38 -0700
Subject: [PATCH 20/20] Reverting changes to submission for new results display

---
 valhub/submissions/models.py | 2 +-
 valhub/submissions/views.py  | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/valhub/submissions/models.py b/valhub/submissions/models.py
index 1fb18bfc..134ed6cb 100644
--- a/valhub/submissions/models.py
+++ b/valhub/submissions/models.py
@@ -48,7 +48,7 @@ class Submission(models.Model):
     # mae - mean average error
     # mrt - mean run time
 
-    mae = models.FloatField(null=True, blank=True)
+    # mae = models.FloatField(null=True, blank=True)
     mrt = models.FloatField(null=True, blank=True)
     data_requirements = models.TextField(null=True, blank=True)
     archived = models.BooleanField(default=False)
diff --git a/valhub/submissions/views.py b/valhub/submissions/views.py
index ed4e8d42..8a67f421 100644
--- a/valhub/submissions/views.py
+++ b/valhub/submissions/views.py
@@ -274,10 +274,10 @@ def update_submission_result(request: Request, submission_id: str):
 
     logging.info(f"results = {results}")
 
-    if "mean_absolute_error" in results:
-        submission.mae = float(results["mean_absolute_error"])
-    elif "mean_mean_absolute_error" in results:
-        submission.mae = float(results["mean_mean_absolute_error"])
+    # if "mean_absolute_error" in results:
+    #     submission.mae = float(results["mean_absolute_error"])
+    # elif "mean_mean_absolute_error" in results:
+    #     submission.mae = float(results["mean_mean_absolute_error"])
     # submission.mrt = float(results["mean_run_time"])
     submission.data_requirements = results["function_parameters"]
     submission.result = results["metrics"]