update assets
yan91083 committed Nov 28, 2023
1 parent c68811a commit aee55ac
Showing 5 changed files with 131 additions and 43 deletions.
2 changes: 2 additions & 0 deletions python/tabby-eval/.gitignore
@@ -0,0 +1,2 @@
+*tmp
+tabby_data_pipeline.egg-info
4 changes: 2 additions & 2 deletions python/tabby-eval/tabby_data_pipeline/__init__.py
@@ -4,9 +4,9 @@
 
 from dagster import AssetIn, Field, Int, asset, file_relative_path
 
-from . import assets
+from . import assets, create_csv
 
-all_assets = load_assets_from_modules([assets])
+all_assets = load_assets_from_modules([assets, create_csv])
 
 defs = Definitions(
     assets=all_assets,
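
Note: a quick way to confirm the create_csv assets are picked up — a sketch, not part of this commit, assuming only that load_assets_from_modules returns a list of single-key AssetsDefinition objects:

# Sanity-check sketch (not in this commit): list the asset keys Dagster
# collects from both modules; create_csv, tabby_dataset, and tabby_jupyter
# should now appear alongside the assets.py definitions.
from dagster import load_assets_from_modules

from tabby_data_pipeline import assets, create_csv

for assets_def in load_assets_from_modules([assets, create_csv]):
    print(assets_def.key)  # each definition here carries a single asset key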
15 changes: 10 additions & 5 deletions python/tabby-eval/tabby_data_pipeline/analyze.py
@@ -1,8 +1,14 @@
-import pandas as pd
 import json
 import sys
-#from eval_utils import postprocess_code_lines, remove_comments
-#from tree_sitter import Language, Parser
+import pandas as pd
+
+from dagster import (
+    AssetExecutionContext,
+    MetadataValue,
+    asset,
+    StaticPartitionsDefinition,
+    MultiPartitionsDefinition,
+)
 
 def get_bracket_lang_statement(completion):
     end_idx = None
@@ -22,6 +28,7 @@ def postprocess_code_lines(prompt, target, language):
     except Exception as e:
         return target
 
+
 def analyze(model, language, file):
 
     line_match = 0
@@ -78,5 +85,3 @@ def analyze(model, language, file):
         fout.write("\n")
 
 
-
-
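Note: this module is driven by the matching_* assets in assets.py; a minimal usage sketch follows (invocation shape taken from those assets, the model and language values are illustrative):

# Sketch (not part of the commit): how assets.py calls into this module.
from tabby_data_pipeline import analyze

model_id = "TabbyML/StarCoder-1B"    # one "model_id" partition key
model = model_id.split("/")[-1]      # "StarCoder-1B", as in the matching_* assets
analyze.analyze(model, "python", "line_completion.jsonl")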
130 changes: 95 additions & 35 deletions python/tabby-eval/tabby_data_pipeline/assets.py
@@ -1,32 +1,75 @@
-import modal
 import json
 import os, subprocess
+import modal
 
 import requests
 import pandas as pd
 
 import base64
 from io import BytesIO
 
 import matplotlib.pyplot as plt
 
 from typing import Dict, List
 
 from dagster import (
     AssetExecutionContext,
     MetadataValue,
     asset,
     get_dagster_logger,
     op,
     StaticPartitionsDefinition,
     MultiPartitionsDefinition,
     AssetIn,
     Field,
     Int,
     file_relative_path
 )
-from . import analyze, create_csv
 from dagstermill import define_dagstermill_asset
+from . import analyze
 
 
+@asset
+def baseline() -> str:
+    return "line_completion.jsonl"
+
+@asset
+def bm25() -> str:
+    return "line_completion_rg1_bm25.jsonl"
+
+@asset
+def oracle() -> str:
+    return "line_completion_oracle_bm25.jsonl"
+
+@asset(
+    partitions_def=MultiPartitionsDefinition(
+        {
+            "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
+            "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
+        }
+    ))
+def predict_baseline(context: AssetExecutionContext, baseline: str) -> None:
+    model_id = context.partition_key.keys_by_dimension["model_id"]
+    language = context.partition_key.keys_by_dimension["language"]
+
+    my_env = os.environ.copy()
+    my_env["MODEL_ID"] = model_id
+
+    context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)})
+
+    files = baseline
+
+    p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env)
+    p.wait()
+    context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")})
+
+@asset(
+    partitions_def=MultiPartitionsDefinition(
+        {
+            "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
+            "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
+        }
+    ))
+def predict_bm25(context: AssetExecutionContext, bm25: str) -> None:
+    model_id = context.partition_key.keys_by_dimension["model_id"]
+    language = context.partition_key.keys_by_dimension["language"]
+
+    my_env = os.environ.copy()
+    my_env["MODEL_ID"] = model_id
+
+    context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)})
+
+    files = bm25
+
+    p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env)
+    p.wait()
+    context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")})
 
 
 @asset(
@@ -37,7 +80,7 @@
         }
     ))
-def model_predict(context: AssetExecutionContext) -> None:
+def predict_oracle(context: AssetExecutionContext, oracle: str) -> None:
     model_id = context.partition_key.keys_by_dimension["model_id"]
     language = context.partition_key.keys_by_dimension["language"]
 
@@ -46,42 +89,59 @@ def model_predict(context: AssetExecutionContext) -> None:
 
     context.add_output_metadata(metadata={"model_id": MetadataValue.md(model_id)})
 
-    files = 'line_completion.jsonl, line_completion_rg1_bm25.jsonl, line_completion_oracle_bm25.jsonl'
+    files = oracle
 
     p = subprocess.Popen(["modal", "run", "./modal/predict.py","--language", language, "--files", files], env=my_env)
    p.wait()
     context.add_output_metadata(metadata={'modal run': MetadataValue.md("success!")})
 
 
 
 @asset(
     partitions_def=MultiPartitionsDefinition(
         {
             "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
             "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
         }
-    ), deps=[model_predict])
-def matching(context) -> None:
+    ), deps=[predict_baseline])
+def matching_baseline(context) -> None:
     model_id = context.partition_key.keys_by_dimension["model_id"]
     language = context.partition_key.keys_by_dimension["language"]
 
 
     model = model_id.split("/")[-1]
-    for file in ["line_completion.jsonl", "line_completion_rg1_bm25.jsonl", "line_completion_oracle_bm25.jsonl"]:
-        analyze.analyze(model, language, file)
+    analyze.analyze(model, language, 'line_completion.jsonl')
 
-@asset
-def tabby_eval_result():
-    create_csv.create_csv()
 
 
+@asset(
+    partitions_def=MultiPartitionsDefinition(
+        {
+            "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
+            "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
+        }
+    ), deps=[predict_bm25])
+def matching_bm25(context) -> None:
+    model_id = context.partition_key.keys_by_dimension["model_id"]
+    language = context.partition_key.keys_by_dimension["language"]
+
+
+    model = model_id.split("/")[-1]
+    analyze.analyze(model, language, 'line_completion_rg1_bm25.jsonl')
+
 
-@asset(deps=[tabby_eval_result])
-def tabby_dataset():
-    return pd.read_csv(file_relative_path(__file__,'tabby.csv'))
 
-tabby_jupyter_notebook = define_dagstermill_asset(
-    name = 'tabby_jupyter',
-    notebook_path = file_relative_path(__file__, "tabby_eval.ipynb"),
-    ins={"df": AssetIn("tabby_dataset")},
-)
+@asset(
+    partitions_def=MultiPartitionsDefinition(
+        {
+            "model_id" : StaticPartitionsDefinition(['TabbyML/StarCoder-1B', 'TabbyML/StarCoder-3B', 'TabbyML/StarCoder-7B', 'TabbyML/WizardCoder-1B', 'TabbyML/WizardCoder-3B', 'TabbyML/CodeLlama-7B', 'TabbyML/CodeLlama-13B']),
+            "language" : StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
+        }
+    ), deps=[predict_oracle])
+def matching_oracle(context) -> None:
+    model_id = context.partition_key.keys_by_dimension["model_id"]
+    language = context.partition_key.keys_by_dimension["language"]
+
+
+    model = model_id.split("/")[-1]
+    analyze.analyze(model, language, 'line_completion_oracle_bm25.jsonl')
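
Note: the same MultiPartitionsDefinition literal is repeated in all six partitioned assets above. A possible follow-up (hypothetical, not part of this commit) is to hoist it into one shared module-level constant; the name TABBY_PARTITIONS is invented here:

# Sketch of deduplicating the partition scheme repeated in each asset above.
from dagster import MultiPartitionsDefinition, StaticPartitionsDefinition

TABBY_PARTITIONS = MultiPartitionsDefinition(
    {
        "model_id": StaticPartitionsDefinition([
            "TabbyML/StarCoder-1B", "TabbyML/StarCoder-3B", "TabbyML/StarCoder-7B",
            "TabbyML/WizardCoder-1B", "TabbyML/WizardCoder-3B",
            "TabbyML/CodeLlama-7B", "TabbyML/CodeLlama-13B",
        ]),
        "language": StaticPartitionsDefinition(["python", "java", "csharp", "typescript"]),
    }
)

# each asset would then declare @asset(partitions_def=TABBY_PARTITIONS, ...)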
23 changes: 22 additions & 1 deletion python/tabby-eval/tabby_data_pipeline/create_csv.py
@@ -2,6 +2,15 @@
 import json
 import pandas as pd
 
+from dagster import (
+    asset,
+    AssetIn,
+    file_relative_path
+)
+from dagstermill import define_dagstermill_asset
+
+
+
 models = ["StarCoder-1B", "StarCoder-3B", "StarCoder-7B", "CodeLlama-7B", "CodeLlama-13B", "WizardCoder-1B", "WizardCoder-3B", "DeepseekCoder-1.3B", "DeepseekCoder-6.7B"]
 languages = {"csharp": "C#", "java": "Java", "python": "Python", "typescript": "Typescript"}
 files = ["line_completion.jsonl", 'line_completion_rg1_bm25.jsonl', 'line_completion_oracle_bm25.jsonl']
@@ -20,6 +29,7 @@ def get_match(model, language, file):
 
     return count
 
+@asset
 def create_csv():
     for model in models:
         for language in languages.keys():
@@ -32,4 +42,15 @@ def create_csv():
     df = pd.DataFrame(stat, columns=headers)
     print(df)
 
-    df.to_csv('./tabby_data_pipeline/tabby.csv', index=False)
+    df.to_csv('./tabby_data_pipeline/tabby.csv', index=False)
+
+
+@asset(deps=[create_csv])
+def tabby_dataset():
+    return pd.read_csv(file_relative_path(__file__,'tabby.csv'))
+
+tabby_jupyter_notebook = define_dagstermill_asset(
+    name = 'tabby_jupyter',
+    notebook_path = file_relative_path(__file__, "tabby_eval.ipynb"),
+    ins={"df": AssetIn("tabby_dataset")},
+)

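Note: define_dagstermill_asset executes the notebook via papermill, so the tabby_dataset input declared in ins should arrive inside the notebook as the parameter df. A hypothetical sketch of the receiving cell in tabby_eval.ipynb (the notebook itself is not in this diff, and the cell contents are assumptions):

# First cell of tabby_eval.ipynb, tagged "parameters" (hypothetical sketch);
# dagstermill overwrites `df` with the upstream tabby_dataset DataFrame.
import pandas as pd

df = pd.DataFrame()  # placeholder value, replaced at run time

# later cells can then summarize the evaluation table, e.g. df.head()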