Commit
Merge pull request #271 from RamiAwar/dummy-test-2
Add comparison to evaluation pipeline
RamiAwar authored Jul 20, 2024
2 parents c738a2c + d3bede0 commit 48776a9
Showing 2 changed files with 99 additions and 51 deletions.
56 changes: 23 additions & 33 deletions .github/workflows/evaluation_pipeline.yml
@@ -41,15 +41,21 @@ jobs:
        DATA_DIRECTORY: ./
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: PYTHONPATH=. poetry run pytest tests/evaluation/test_query_flow.py --run-expensive

    - name: Upload test results as artifact
      uses: actions/upload-artifact@v3
      with:
        name: test-results.csv
        path: ./backend/test_results.csv

    - name: Run score aggregation script
      run: poetry run python tests/evaluation/aggregate_scores.py

    - name: Save scores as artifact
      uses: actions/upload-artifact@v3
      with:
        name: scores.md
        path: scores.md
        path: ./backend/scores.md

    - name: Find associated pull request
      uses: jwalton/gh-find-current-pr@v1
@@ -64,55 +70,39 @@ jobs:
      if: github.ref != 'refs/heads/main'
      uses: dawidd6/action-download-artifact@v2
      with:
        workflow: evaluate-llm.yml
        branch: main
        name: scores.md
        path: main-scores
        workflow: evaluation_pipeline.yml
        branch: main
        name: scores.md
        path: main-scores

    - name: Verify working directory and file locations
      run: |
        echo "Current working directory:"
        pwd
        echo "Contents of current directory:"
        ls -la
        echo "Contents of parent directory:"
        ls -la ..
        echo "Contents of main-scores directory:"
        ls -la main-scores || echo "main-scores directory not found"
        echo "Content of scores.md (if exists):"
        cat scores.md || echo "scores.md not found"
        echo "Content of main-scores/scores.md (if exists):"
        cat main-scores/scores.md || echo "main-scores/scores.md not found"
        path: ./backend/main-scores

    - name: Compare scores and generate report
      if: github.ref != 'refs/heads/main'
      run: |
        poetry run python tests/evaluation/compare_scores.py \
          scores.md \
          main-scores/scores.md \
          comparison.md
          main-scores/scores.md
    - name: Generate markdown comment
      if: github.ref != 'refs/heads/main'
      id: comparison
      run: |
        comparison=$(cat comparison.md)
        echo "comparison<<EOF" >> $GITHUB_OUTPUT
        echo "$comparison" >> $GITHUB_OUTPUT
        echo "EOF" >> $GITHUB_OUTPUT
    - name: Post comparison on PR
      if: success() && steps.findPr.outputs.number
      uses: marocchino/sticky-pull-request-comment@v2
      with:
        number: ${{ steps.findPr.outputs.number }}
        path: comparison.md
        message: |
          ${{ steps.comparison.outputs.comparison }}
    - name: Upload comparison as artifact
      if: github.ref != 'refs/heads/main'
      uses: actions/upload-artifact@v3
      with:
        name: score-comparison
        path: comparison.md

    - name: Upload test results as artifact
      uses: actions/upload-artifact@v3
      with:
        name: test-results.csv
        path: ./backend/test_results.csv
        path: ./backend/comparison.md

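The "Generate markdown comment" step passes the multiline comparison.md through $GITHUB_OUTPUT using a heredoc delimiter. Below is a minimal Python sketch of that same mechanism, assuming it runs inside a GitHub Actions step (set_multiline_output is a hypothetical helper; the runner parses the $GITHUB_OUTPUT file once the step exits):

import os

def set_multiline_output(name: str, value: str) -> None:
    # $GITHUB_OUTPUT names a file the runner reads after the step finishes;
    # the name<<EOF ... EOF framing lets the value span multiple lines
    with open(os.environ["GITHUB_OUTPUT"], "a") as f:
        f.write(f"{name}<<EOF\n{value}\nEOF\n")

with open("comparison.md") as f:
    set_multiline_output("comparison", f.read())
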
94 changes: 76 additions & 18 deletions backend/tests/evaluation/compare_scores.py
@@ -1,42 +1,100 @@
import sys
from pathlib import Path

import pandas as pd


def compare_scores(current_scores_path, main_scores_path, output_path):
def read_markdown_table(file_path):
    with open(file_path, "r") as f:
        lines = f.readlines()

    # Remove the header and separator lines
    data_lines = [line.strip() for line in lines[2:] if line.strip()]

    # Split each line into columns
    data = [line.split("|") for line in data_lines]

    # Create DataFrame
    df = pd.DataFrame(data, columns=["", "Tag", "Score", ""])

    # Clean up the DataFrame
    df = df.iloc[:, 1:3]  # Keep only Tag and Score columns
    df.columns = ["Tag", "Score"]
    df["Tag"] = df["Tag"].str.strip()
    df["Score"] = df["Score"].str.strip()

    return df
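# Example of the scores.md table shape this parser assumes (values are
# hypothetical):
#
#   | Tag       | Score |
#   |-----------|-------|
#   | sql_gen   | 87.5  |
#
# lines[2:] drops the header and separator rows; each remaining row splits
# on "|" into ["", " sql_gen   ", " 87.5  ", ""], and columns 1:3 survive.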


def get_emoji(difference):
    if abs(difference) < 1:
        return "✅"  # Green checkmark for small differences
    elif difference > 0:
        if difference > 5:
            return "🚀"  # Rocket for significant improvements
        else:
            return "📈"  # Chart with upwards trend for improvements
    else:
        if difference < -5:
            return "⚠️"  # Warning sign for significant regressions
        else:
            return "📉"  # Chart with downwards trend for regressions


def compare_scores(current_scores_path, main_scores_path):
    # Read current scores
    current_df = pd.read_csv(current_scores_path, sep="|", skiprows=1)
    current_df.columns = [col.strip() for col in current_df.columns]
    current_df = current_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    try:
        current_df = read_markdown_table(current_scores_path)
    except Exception as e:
        print(f"Error reading current scores: {str(e)}")
        raise

    # Read main scores
    main_df = pd.read_csv(main_scores_path, sep="|", skiprows=1)
    main_df.columns = [col.strip() for col in main_df.columns]
    main_df = main_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    try:
        main_df = read_markdown_table(main_scores_path)
    except FileNotFoundError:
        print(f"File not found: {main_scores_path}")
        raise
    except Exception as e:
        print(f"Error reading main scores: {str(e)}")
        raise

    # Merge dataframes
    merged_df = current_df.merge(main_df, on="Tag", how="outer", suffixes=("_current", "_main"))
    try:
        merged_df = current_df.merge(main_df, on="Tag", how="outer", suffixes=("_current", "_main"))
    except Exception as e:
        print("Error during merge:", e)
        print(f"Current DataFrame columns: {current_df.columns.tolist()}")
        print(f"Main DataFrame columns: {main_df.columns.tolist()}")
        raise

    # Calculate difference
    merged_df["Score_current"] = pd.to_numeric(merged_df["Score_current"], errors="coerce")
    merged_df["Score_main"] = pd.to_numeric(merged_df["Score_main"], errors="coerce")
    merged_df["Difference"] = (
        (merged_df["Score_current"].astype(float) - merged_df["Score_main"].astype(float))
        / merged_df["Score_main"].astype(float)
        * 100
        (merged_df["Score_current"] - merged_df["Score_main"]) / merged_df["Score_main"] * 100
    ).round(2)

    # Add emoji column
    merged_df["Emoji"] = merged_df["Difference"].apply(get_emoji)

    # Prepare output dataframe
    output_df = merged_df[["Tag", "Score_current", "Score_main", "Difference"]]
    output_df.columns = ["Tag", "Current Score", "Main Score", "Difference (%)"]
    output_df = merged_df[["Tag", "Score_current", "Score_main", "Difference", "Emoji"]]
    output_df.columns = ["Tag", "PR Score", "Main Branch Score", "Difference (%)", "Status"]
    output_df = output_df.fillna("N/A")

    # Write to markdown file
    with open(output_path, "w") as f:
        f.write("## LLM Evaluation Score Comparison\n\n")
        f.write(output_df.to_markdown(index=False))
    with open("comparison.md", "w") as f:
        f.write("## DataLine Workflow Evaluation Score\n\n")
        f.write("| Skills | Pull-Request Score | Baseline Score (main) | Difference (%) | Status |\n")
        f.write("|-----|----------|-------------------|----------------|--------|\n")
        def fmt(value, spec):
            # fillna("N/A") puts strings in numeric columns; skip float formatting for those
            return format(value, spec) if isinstance(value, (int, float)) else str(value)

        for _, row in output_df.iterrows():
            f.write(
                f"| {row['Tag']} | {fmt(row['PR Score'], '.2f')} | {fmt(row['Main Branch Score'], '.2f')} | {fmt(row['Difference (%)'], '+.2f')}% | {row['Status']} |\n"
            )
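    # With hypothetical inputs (say sql_gen scoring 88.0 on the PR and 84.0 on
    # main), the generated comparison.md would contain:
    #
    #   ## DataLine Workflow Evaluation Score
    #
    #   | Skills | Pull-Request Score | Baseline Score (main) | Difference (%) | Status |
    #   |-----|----------|-------------------|----------------|--------|
    #   | sql_gen | 88.00 | 84.00 | +4.76% | 📈 |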


if __name__ == "__main__":
    current_scores_path = sys.argv[1]
    main_scores_path = sys.argv[2]
    output_path = sys.argv[3]
    compare_scores(current_scores_path, main_scores_path, output_path)
    compare_scores(current_scores_path, main_scores_path)
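
A quick local smoke test of the new entrypoint, with fabricated inputs (the tag and the numbers below are invented for illustration; in CI the real scores.md files come from aggregate_scores.py and the downloaded main-branch artifact):

from pathlib import Path

Path("scores.md").write_text("| Tag | Score |\n|-----|-------|\n| sql_gen | 88.0 |\n")
Path("main-scores").mkdir(exist_ok=True)
Path("main-scores/scores.md").write_text("| Tag | Score |\n|-----|-------|\n| sql_gen | 84.0 |\n")

# Equivalent to: poetry run python tests/evaluation/compare_scores.py scores.md main-scores/scores.md
compare_scores("scores.md", "main-scores/scores.md")
print(Path("comparison.md").read_text())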
