From aee55ace3af53e180d20d4777f97cfbf18fae3c2 Mon Sep 17 00:00:00 2001 From: yan91083 Date: Tue, 28 Nov 2023 11:06:40 -0800 Subject: [PATCH] update assets --- python/tabby-eval/.gitignore | 2 + .../tabby_data_pipeline/__init__.py | 4 +- .../tabby-eval/tabby_data_pipeline/analyze.py | 15 +- .../tabby-eval/tabby_data_pipeline/assets.py | 130 +++++++++++++----- .../tabby_data_pipeline/create_csv.py | 23 +++- 5 files changed, 131 insertions(+), 43 deletions(-) create mode 100644 python/tabby-eval/.gitignore diff --git a/python/tabby-eval/.gitignore b/python/tabby-eval/.gitignore new file mode 100644 index 000000000000..e98198e48d50 --- /dev/null +++ b/python/tabby-eval/.gitignore @@ -0,0 +1,2 @@ +*tmp +tabby_data_pipeline.egg-info \ No newline at end of file diff --git a/python/tabby-eval/tabby_data_pipeline/__init__.py b/python/tabby-eval/tabby_data_pipeline/__init__.py index 503a23759246..11c46715f2df 100644 --- a/python/tabby-eval/tabby_data_pipeline/__init__.py +++ b/python/tabby-eval/tabby_data_pipeline/__init__.py @@ -4,9 +4,9 @@ from dagster import AssetIn, Field, Int, asset, file_relative_path -from . import assets +from . import assets, create_csv -all_assets = load_assets_from_modules([assets]) +all_assets = load_assets_from_modules([assets, create_csv]) defs = Definitions( assets=all_assets, diff --git a/python/tabby-eval/tabby_data_pipeline/analyze.py b/python/tabby-eval/tabby_data_pipeline/analyze.py index 3a46383c18fe..b8523d6249e7 100644 --- a/python/tabby-eval/tabby_data_pipeline/analyze.py +++ b/python/tabby-eval/tabby_data_pipeline/analyze.py @@ -1,8 +1,14 @@ +import pandas as pd import json import sys -#from eval_utils import postprocess_code_lines, remove_comments -#from tree_sitter import Language, Parser -import pandas as pd + +from dagster import ( + AssetExecutionContext, + MetadataValue, + asset, + StaticPartitionsDefinition, + MultiPartitionsDefinition, +) def get_bracket_lang_statement(completion): end_idx = None @@ -22,6 +28,7 @@ def postprocess_code_lines(prompt, target, language): except Exception as e: return target + def analyze(model, language, file): line_match = 0 @@ -78,5 +85,3 @@ def analyze(model, language, file): fout.write("\n") - - diff --git a/python/tabby-eval/tabby_data_pipeline/assets.py b/python/tabby-eval/tabby_data_pipeline/assets.py index 953ea99bbd2d..28413a27c9c3 100644 --- a/python/tabby-eval/tabby_data_pipeline/assets.py +++ b/python/tabby-eval/tabby_data_pipeline/assets.py @@ -1,32 +1,75 @@ +import modal import json import os, subprocess -import modal - -import requests import pandas as pd -import base64 -from io import BytesIO - -import matplotlib.pyplot as plt - -from typing import Dict, List - from dagster import ( AssetExecutionContext, MetadataValue, asset, - get_dagster_logger, - op, StaticPartitionsDefinition, MultiPartitionsDefinition, - AssetIn, - Field, - Int, - file_relative_path ) -from . import analyze, create_csv -from dagstermill import define_dagstermill_asset +from . 
import analyze + + +@asset +def baseline() -> str: + return "line_completion.jsonl" + +@asset +def bm25() -> str: + return "line_completion_rg1_bm25.jsonl" + +@asset +def oracle() -> str: + return "line_completion_oracle_bm25.jsonl" + +@asset( + partitions_def=MultiPartitionsDefinition( + { + "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), + "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), + + } + )) +def predict_baseline(context: AssetExecutionContext, baseline: str) -> None: + model_id = context.partition_key.keys_by_dimension["model_id"] + language = context.partition_key.keys_by_dimension["language"] + + my_env = os.environ.copy() + my_env["MODEL_ID"] = model_id + + context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)}) + + files = baseline + + p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env) + p.wait() + context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")}) + +@asset( + partitions_def=MultiPartitionsDefinition( + { + "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), + "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), + + } + )) +def predict_bm25(context: AssetExecutionContext, bm25: str) -> None: + model_id = context.partition_key.keys_by_dimension["model_id"] + language = context.partition_key.keys_by_dimension["language"] + + my_env = os.environ.copy() + my_env["MODEL_ID"] = model_id + + context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)}) + + files = bm25 + + p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env) + p.wait() + context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")}) @asset( @@ -37,7 +80,7 @@ } )) -def model_predict(context: AssetExecutionContext) -> None: +def predict_oracle(context: AssetExecutionContext, oracle: str) -> None: model_id = context.partition_key.keys_by_dimension["model_id"] language = context.partition_key.keys_by_dimension["language"] @@ -46,42 +89,59 @@ def model_predict(context: AssetExecutionContext) -> None: context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)}) - files = 'line_completion.jsonl, line_completion_rg1_bm25.jsonl, line_completion_oracle_bm25.jsonl' + files = oracle p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env) p.wait() context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")}) + @asset( partitions_def=MultiPartitionsDefinition( { "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), } - ), deps=[model_predict]) -def matching(context) -> None: + ), deps=[predict_baseline]) +def matching_baseline(context) -> None: model_id = context.partition_key.keys_by_dimension["model_id"] language = context.partition_key.keys_by_dimension["language"] model = 
model_id.split("/")[-1] - for file in ["line_completion.jsonl", "line_completion_rg1_bm25.jsonl", "line_completion_oracle_bm25.jsonl"]: - analyze.analyze(model, language, file) + analyze.analyze(model, language, 'line_completion.jsonl') -@asset -def tabby_eval_result(): - create_csv.create_csv() +@asset( + partitions_def=MultiPartitionsDefinition( + { + "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), + "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), + } + ), deps=[predict_bm25]) +def matching_bm25(context) -> None: + model_id = context.partition_key.keys_by_dimension["model_id"] + language = context.partition_key.keys_by_dimension["language"] + + + model = model_id.split("/")[-1] + analyze.analyze(model, language, 'line_completion_rg1_bm25.jsonl') + -@asset(deps=[tabby_eval_result]) -def tabby_dataset(): - return pd.read_csv(file_relative_path(__file__,'tabby.csv')) -tabby_jupyter_notebook = define_dagstermill_asset( - name = 'tabby_jupyter', - notebook_path = file_relative_path(__file__, "tabby_eval.ipynb"), - ins={"df": AssetIn("tabby_dataset")}, -) +@asset( + partitions_def=MultiPartitionsDefinition( + { + "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']), + "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]), + } + ), deps=[predict_oracle]) +def matching_oracle(context) -> None: + model_id = context.partition_key.keys_by_dimension["model_id"] + language = context.partition_key.keys_by_dimension["language"] + + model = model_id.split("/")[-1] + analyze.analyze(model, language, 'line_completion_oracle_bm25.jsonl') \ No newline at end of file diff --git a/python/tabby-eval/tabby_data_pipeline/create_csv.py b/python/tabby-eval/tabby_data_pipeline/create_csv.py index 72080f1323cb..70b04ae5ce4e 100644 --- a/python/tabby-eval/tabby_data_pipeline/create_csv.py +++ b/python/tabby-eval/tabby_data_pipeline/create_csv.py @@ -2,6 +2,15 @@ import json import pandas as pd +from dagster import ( + asset, + AssetIn, + file_relative_path + ) +from dagstermill import define_dagstermill_asset + + + models = ["StarCoder-1B", "StarCoder-3B", "StarCoder-7B", "CodeLlama-7B", "CodeLlama-13B", "WizardCoder-1B", "WizardCoder-3B", "DeepseekCoder-1.3B", "DeepseekCoder-6.7B"] languages = {"csharp": "C#", "java": "Java", "python": "Python", "typescript": "Typescript"} files = ["line_completion.jsonl", 'line_completion_rg1_bm25.jsonl', 'line_completion_oracle_bm25.jsonl'] @@ -20,6 +29,7 @@ def get_match(model, language, file): return count +@asset def create_csv(): for model in models: for language in languages.keys(): @@ -32,4 +42,15 @@ def create_csv(): df = pd.DataFrame(stat, columns=headers) print(df) - df.to_csv('./tabby_data_pipeline/tabby.csv', index=False) \ No newline at end of file + df.to_csv('./tabby_data_pipeline/tabby.csv', index=False) + + +@asset(deps=[create_csv]) +def tabby_dataset(): + return pd.read_csv(file_relative_path(__file__,'tabby.csv')) + +tabby_jupyter_notebook = define_dagstermill_asset( + name = 'tabby_jupyter', + notebook_path = file_relative_path(__file__, "tabby_eval.ipynb"), + ins={"df": AssetIn("tabby_dataset")}, +)