Commit
Merge pull request #271 from RamiAwar/dummy-test-2
Add comparison to evaluation pipeline
RamiAwar authored Jul 20, 2024
2 parents c738a2c + d3bede0 commit 48776a9
Showing 2 changed files with 99 additions and 51 deletions.
56 changes: 23 additions & 33 deletions .github/workflows/evaluation_pipeline.yml
@@ -41,15 +41,21 @@ jobs:
        DATA_DIRECTORY: ./
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: PYTHONPATH=. poetry run pytest tests/evaluation/test_query_flow.py --run-expensive

    - name: Upload test results as artifact
      uses: actions/upload-artifact@v3
      with:
        name: test-results.csv
        path: ./backend/test_results.csv

    - name: Run score aggregation script
      run: poetry run python tests/evaluation/aggregate_scores.py

    - name: Save scores as artifact
      uses: actions/upload-artifact@v3
      with:
        name: scores.md
        path: scores.md
        path: ./backend/scores.md

    - name: Find associated pull request
      uses: jwalton/gh-find-current-pr@v1
@@ -64,55 +70,39 @@ jobs:
      if: github.ref != 'refs/heads/main'
      uses: dawidd6/action-download-artifact@v2
      with:
        workflow: evaluate-llm.yml
        branch: main
        name: scores.md
        path: main-scores
        workflow: evaluation_pipeline.yml
        branch: main
        name: scores.md
        path: main-scores

    - name: Verify working directory and file locations
      run: |
        echo "Current working directory:"
        pwd
        echo "Contents of current directory:"
        ls -la
        echo "Contents of parent directory:"
        ls -la ..
        echo "Contents of main-scores directory:"
        ls -la main-scores || echo "main-scores directory not found"
        echo "Content of scores.md (if exists):"
        cat scores.md || echo "scores.md not found"
        echo "Content of main-scores/scores.md (if exists):"
        cat main-scores/scores.md || echo "main-scores/scores.md not found"
        path: ./backend/main-scores

    - name: Compare scores and generate report
      if: github.ref != 'refs/heads/main'
      run: |
        poetry run python tests/evaluation/compare_scores.py \
          scores.md \
          main-scores/scores.md \
          comparison.md
          main-scores/scores.md
    - name: Generate markdown comment
      if: github.ref != 'refs/heads/main'
      id: comparison
      run: |
        comparison=$(cat comparison.md)
        echo "comparison<<EOF" >> $GITHUB_OUTPUT
        echo "$comparison" >> $GITHUB_OUTPUT
        echo "EOF" >> $GITHUB_OUTPUT
    - name: Post comparison on PR
      if: success() && steps.findPr.outputs.number
      uses: marocchino/sticky-pull-request-comment@v2
      with:
        number: ${{ steps.findPr.outputs.number }}
        path: comparison.md
        message: |
          ${{ steps.comparison.outputs.comparison }}
    - name: Upload comparison as artifact
      if: github.ref != 'refs/heads/main'
      uses: actions/upload-artifact@v3
      with:
        name: score-comparison
        path: comparison.md

    - name: Upload test results as artifact
      uses: actions/upload-artifact@v3
      with:
        name: test-results.csv
        path: ./backend/test_results.csv
        path: ./backend/comparison.md

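The "Generate markdown comment" step passes the multiline comparison.md through $GITHUB_OUTPUT using a heredoc delimiter. Below is a minimal Python sketch of that same mechanism, assuming it runs inside a GitHub Actions step (set_multiline_output is a hypothetical helper; the runner parses the $GITHUB_OUTPUT file once the step exits):

import os

def set_multiline_output(name: str, value: str) -> None:
    # $GITHUB_OUTPUT names a file the runner reads after the step finishes;
    # the name<<EOF ... EOF framing lets the value span multiple lines
    with open(os.environ["GITHUB_OUTPUT"], "a") as f:
        f.write(f"{name}<<EOF\n{value}\nEOF\n")

with open("comparison.md") as f:
    set_multiline_output("comparison", f.read())
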
94 changes: 76 additions & 18 deletions backend/tests/evaluation/compare_scores.py
@@ -1,42 +1,100 @@
import sys
from pathlib import Path

import pandas as pd


def compare_scores(current_scores_path, main_scores_path, output_path):
def read_markdown_table(file_path):
    with open(file_path, "r") as f:
        lines = f.readlines()

    # Remove the header and separator lines
    data_lines = [line.strip() for line in lines[2:] if line.strip()]

    # Split each line into columns
    data = [line.split("|") for line in data_lines]

    # Create DataFrame
    df = pd.DataFrame(data, columns=["", "Tag", "Score", ""])

    # Clean up the DataFrame
    df = df.iloc[:, 1:3]  # Keep only Tag and Score columns
    df.columns = ["Tag", "Score"]
    df["Tag"] = df["Tag"].str.strip()
    df["Score"] = df["Score"].str.strip()

    return df
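# Example of the scores.md table shape this parser assumes (values are
# hypothetical):
#
#   | Tag       | Score |
#   |-----------|-------|
#   | sql_gen   | 87.5  |
#
# lines[2:] drops the header and separator rows; each remaining row splits
# on "|" into ["", " sql_gen   ", " 87.5  ", ""], and columns 1:3 survive.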


def get_emoji(difference):
    if abs(difference) < 1:
        return "✅"  # Green checkmark for small differences
    elif difference > 0:
        if difference > 5:
            return "🚀"  # Rocket for significant improvements
        else:
            return "📈"  # Chart with upwards trend for improvements
    else:
        if difference < -5:
            return "⚠️"  # Warning sign for significant regressions
        else:
            return "📉"  # Chart with downwards trend for regressions


def compare_scores(current_scores_path, main_scores_path):
    # Read current scores
    current_df = pd.read_csv(current_scores_path, sep="|", skiprows=1)
    current_df.columns = [col.strip() for col in current_df.columns]
    current_df = current_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    try:
        current_df = read_markdown_table(current_scores_path)
    except Exception as e:
        print(f"Error reading current scores: {str(e)}")
        raise

    # Read main scores
    main_df = pd.read_csv(main_scores_path, sep="|", skiprows=1)
    main_df.columns = [col.strip() for col in main_df.columns]
    main_df = main_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    try:
        main_df = read_markdown_table(main_scores_path)
    except FileNotFoundError:
        print(f"File not found: {main_scores_path}")
        raise
    except Exception as e:
        print(f"Error reading main scores: {str(e)}")
        raise

    # Merge dataframes
    merged_df = current_df.merge(main_df, on="Tag", how="outer", suffixes=("_current", "_main"))
    try:
        merged_df = current_df.merge(main_df, on="Tag", how="outer", suffixes=("_current", "_main"))
    except Exception as e:
        print("Error during merge:", e)
        print(f"Current DataFrame columns: {current_df.columns.tolist()}")
        print(f"Main DataFrame columns: {main_df.columns.tolist()}")
        raise

    # Calculate difference
    merged_df["Score_current"] = pd.to_numeric(merged_df["Score_current"], errors="coerce")
    merged_df["Score_main"] = pd.to_numeric(merged_df["Score_main"], errors="coerce")
    merged_df["Difference"] = (
        (merged_df["Score_current"].astype(float) - merged_df["Score_main"].astype(float))
        / merged_df["Score_main"].astype(float)
        * 100
        (merged_df["Score_current"] - merged_df["Score_main"]) / merged_df["Score_main"] * 100
    ).round(2)

    # Add emoji column
    merged_df["Emoji"] = merged_df["Difference"].apply(get_emoji)

    # Prepare output dataframe
    output_df = merged_df[["Tag", "Score_current", "Score_main", "Difference"]]
    output_df.columns = ["Tag", "Current Score", "Main Score", "Difference (%)"]
    output_df = merged_df[["Tag", "Score_current", "Score_main", "Difference", "Emoji"]]
    output_df.columns = ["Tag", "PR Score", "Main Branch Score", "Difference (%)", "Status"]
    output_df = output_df.fillna("N/A")

    # Write to markdown file
    with open(output_path, "w") as f:
        f.write("## LLM Evaluation Score Comparison\n\n")
        f.write(output_df.to_markdown(index=False))
    with open("comparison.md", "w") as f:
        f.write("## DataLine Workflow Evaluation Score\n\n")
        f.write("| Skills | Pull-Request Score | Baseline Score (main) | Difference (%) | Status |\n")
        f.write("|-----|----------|-------------------|----------------|--------|\n")
        def fmt(value, spec):
            # fillna("N/A") puts strings in numeric columns; skip float formatting for those
            return format(value, spec) if isinstance(value, (int, float)) else str(value)

        for _, row in output_df.iterrows():
            f.write(
                f"| {row['Tag']} | {fmt(row['PR Score'], '.2f')} | {fmt(row['Main Branch Score'], '.2f')} | {fmt(row['Difference (%)'], '+.2f')}% | {row['Status']} |\n"
            )
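    # With hypothetical inputs (say sql_gen scoring 88.0 on the PR and 84.0 on
    # main), the generated comparison.md would contain:
    #
    #   ## DataLine Workflow Evaluation Score
    #
    #   | Skills | Pull-Request Score | Baseline Score (main) | Difference (%) | Status |
    #   |-----|----------|-------------------|----------------|--------|
    #   | sql_gen | 88.00 | 84.00 | +4.76% | 📈 |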


if __name__ == "__main__":
    current_scores_path = sys.argv[1]
    main_scores_path = sys.argv[2]
    output_path = sys.argv[3]
    compare_scores(current_scores_path, main_scores_path, output_path)
    compare_scores(current_scores_path, main_scores_path)
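
A quick local smoke test of the new entrypoint, with fabricated inputs (the tag and the numbers below are invented for illustration; in CI the real scores.md files come from aggregate_scores.py and the downloaded main-branch artifact):

from pathlib import Path

Path("scores.md").write_text("| Tag | Score |\n|-----|-------|\n| sql_gen | 88.0 |\n")
Path("main-scores").mkdir(exist_ok=True)
Path("main-scores/scores.md").write_text("| Tag | Score |\n|-----|-------|\n| sql_gen | 84.0 |\n")

# Equivalent to: poetry run python tests/evaluation/compare_scores.py scores.md main-scores/scores.md
compare_scores("scores.md", "main-scores/scores.md")
print(Path("comparison.md").read_text())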
