From 9a94dfe1239ddfb8010a654aa1e677d56c01eee0 Mon Sep 17 00:00:00 2001
From: Luc Georges
Date: Wed, 18 Dec 2024 18:59:07 +0100
Subject: [PATCH] feat: add `benchmarks_entrypoint.py` (#34495)

* feat: add `benchmarks_entrypoint.py`

Adding `benchmarks_entrypoint.py` file, which will be run from the
benchmarks CI. This Python script lists all Python files in the
`benchmark/` folder and runs the `run_benchmark` function they include,
allowing people to add new benchmark scripts.

* feat: add `MetricsRecorder`
* feat: update dashboard
* fix: add missing arguments to `MetricsRecorder`
* feat: update dash & add datasource + `default.yml`
* fix: move responsibility to create `MetricsRecorder` in bench script
* fix: update incorrect datasource UID
* fix: incorrect variable values
* debug: benchmark entrypoint script
* refactor: update log level
* fix: update broken import
* feat: add debug log in `MetricsRecorder`
* debug: set log level to debug
* fix: set connection `autocommit` to `True`
---
 .github/workflows/benchmark.yml    |   2 +-
 benchmark/README.md                |  49 ++++++++++
 benchmark/benchmarks_entrypoint.py | 144 ++++++++++++++++++++++
 benchmark/default.yml              |  10 ++
 benchmark/grafana_dashboard.json   | 145 ++++++++++++++++-------------
 benchmark/grafana_datasource.yaml  |  17 ++++
 benchmark/init_db.sql              |   2 +-
 benchmark/llama.py                 | 134 +++++++------------------
 8 files changed, 334 insertions(+), 169 deletions(-)
 create mode 100644 benchmark/README.md
 create mode 100644 benchmark/benchmarks_entrypoint.py
 create mode 100644 benchmark/default.yml
 create mode 100644 benchmark/grafana_datasource.yaml

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index eaa4b3b2f82456..1bbd1c1e94d08c 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -63,7 +63,7 @@ jobs:
             commit_id=$GITHUB_SHA
           fi
           commit_msg=$(git show -s --format=%s | cut -c1-70)
-          python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
+          python3 benchmark/benchmarks_entrypoint.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
         env:
           HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
           # Enable this to see debug logs
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000000000..a827da444f0801
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,49 @@
+# Benchmarks
+
+You might want to add new benchmarks.
+
+To do so, define a Python function named `run_benchmark` in a Python file located in this `benchmark/` directory.
+
+The expected function signature is the following:
+
+```py
+def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
+```
+
+## Writing metrics to the database
+
+`MetricsRecorder` is thread-safe, in the sense of the Python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread): you can take device measurements from a background thread without blocking the main thread that runs the model measurements.
+
+See [`llama.py`](./llama.py) for an example of this in practice.
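+For instance, the loop below is a condensed sketch of how [`llama.py`](./llama.py) collects device measurements in the background. It assumes a `metrics_recorder` and a `benchmark_id` created as in the snippet further down, and polls `psutil` and `gpustat` the same way `llama.py` does:
+
+```py
+from threading import Event, Thread
+from time import sleep
+
+import gpustat
+import psutil
+
+def collect_device_metrics(metrics_recorder, benchmark_id, stop_event):
+    # Poll device stats until the main thread signals that the benchmark is done.
+    process = psutil.Process()
+    while not stop_event.is_set():
+        with process.oneshot():
+            cpu_util = process.cpu_percent()
+            mem_megabytes = process.memory_info().rss / 1024 / 1024
+        gpu_stats = gpustat.GPUStatCollection.new_query()
+        metrics_recorder.collect_device_measurements(
+            benchmark_id, cpu_util, mem_megabytes, gpu_stats[0]["utilization.gpu"], gpu_stats[0]["memory.used"]
+        )
+        sleep(0.01)
+
+# metrics_recorder and benchmark_id are assumed to exist, see the snippet below
+stop_event = Event()
+thread = Thread(target=collect_device_metrics, args=[metrics_recorder, benchmark_id, stop_event])
+thread.start()
+# ... run the model measurements on the main thread ...
+stop_event.set()
+thread.join()
+```
+
+The general pattern for writing metrics from `run_benchmark` looks like this: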
+
+```py
+from logging import Logger
+
+import psycopg2
+
+from benchmarks_entrypoint import MetricsRecorder
+
+def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
+    metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
+    benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
+    # To collect device measurements
+    metrics_recorder.collect_device_measurements(
+        benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
+    )
+    # To collect your model measurements
+    metrics_recorder.collect_model_measurements(
+        benchmark_id,
+        {
+            "model_load_time": model_load_time,
+            "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
+            "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
+            "first_eager_generate_time_secs": first_eager_generate_time,
+            "second_eager_generate_time_secs": second_eager_generate_time,
+            "time_to_first_token_secs": time_to_first_token,
+            "time_to_second_token_secs": time_to_second_token,
+            "time_to_third_token_secs": time_to_third_token,
+            "time_to_next_token_mean_secs": mean_time_to_next_token,
+            "first_compile_generate_time_secs": first_compile_generate_time,
+            "second_compile_generate_time_secs": second_compile_generate_time,
+            "third_compile_generate_time_secs": third_compile_generate_time,
+            "fourth_compile_generate_time_secs": fourth_compile_generate_time,
+        },
+    )
+```
diff --git a/benchmark/benchmarks_entrypoint.py b/benchmark/benchmarks_entrypoint.py
new file mode 100644
index 00000000000000..7925e2902834f7
--- /dev/null
+++ b/benchmark/benchmarks_entrypoint.py
@@ -0,0 +1,144 @@
+import argparse
+import importlib.util
+import logging
+import os
+import sys
+from typing import Dict
+
+import psycopg2
+from psycopg2.extras import Json
+from psycopg2.extensions import register_adapter
+
+
+register_adapter(dict, Json)
+
+
+class ImportModuleException(Exception):
+    pass
+
+
+class MetricsRecorder:
+    def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str):
+        self.conn = connection
+        self.conn.autocommit = True
+        self.logger = logger
+        self.branch = branch
+        self.commit_id = commit_id
+        self.commit_msg = commit_msg
+
+    def initialise_benchmark(self, metadata: Dict[str, str]) -> int:
+        """
+        Creates a new benchmark and returns its id.
+        """
+        # metadata is expected to carry keys such as gpu_name and model_id
+        with self.conn.cursor() as cur:
+            cur.execute(
+                "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
+                (self.branch, self.commit_id, self.commit_msg, metadata),
+            )
+            benchmark_id = cur.fetchone()[0]
+            self.logger.debug(f"initialised benchmark #{benchmark_id}")
+            return benchmark_id
+
+    def collect_device_measurements(self, benchmark_id: int, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
+        """
+        Collect device metrics, such as CPU & GPU usage. These are "static", in the sense that the set of recorded columns is fixed: you cannot pass arbitrary measurements to this function.
+        """
+ """ + with self.conn.cursor() as cur: + cur.execute( + "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)", + (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes), + ) + self.logger.debug( + f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]" + ) + + def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]): + with self.conn.cursor() as cur: + cur.execute( + """ + INSERT INTO model_measurements ( + benchmark_id, + measurements + ) VALUES (%s, %s) + """, + ( + benchmark_id, + measurements, + ), + ) + self.logger.debug(f"inserted model measurements for benchmark #{benchmark_id}: {measurements}") + + def close(self): + self.conn.close() + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.INFO) +formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def parse_arguments(): + """ + Parse command line arguments for the benchmarking CLI. + """ + parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.") + + parser.add_argument( + "branch", + type=str, + help="The branch name on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_id", + type=str, + help="The commit hash on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_msg", + type=str, + help="The commit message associated with the commit, truncated to 70 characters.", + ) + + args = parser.parse_args() + + return args.branch, args.commit_id, args.commit_msg + + +def import_from_path(module_name, file_path): + try: + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + except Exception as e: + raise ImportModuleException(f"failed to load python module: {e}") + + +if __name__ == "__main__": + benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__)) + + branch, commit_id, commit_msg = parse_arguments() + + for entry in os.scandir(benchmarks_folder_path): + try: + if not entry.name.endswith(".py"): + continue + if entry.path == __file__: + continue + logger.debug(f"loading: {entry.name}") + module = import_from_path(entry.name.split(".")[0], entry.path) + logger.info(f"runnning benchmarks in: {entry.name}") + module.run_benchmark(logger, branch, commit_id, commit_msg) + except ImportModuleException as e: + logger.error(e) + except Exception as e: + logger.error(f"error running benchmarks for {entry.name}: {e}") diff --git a/benchmark/default.yml b/benchmark/default.yml new file mode 100644 index 00000000000000..f3f02cab34d1bd --- /dev/null +++ b/benchmark/default.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +providers: + - name: 'Transformers Benchmarks' + orgId: 1 + type: file + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/dashboards diff --git a/benchmark/grafana_dashboard.json b/benchmark/grafana_dashboard.json index 3d579f7b368711..caaec78a522303 100644 --- a/benchmark/grafana_dashboard.json +++ b/benchmark/grafana_dashboard.json @@ -30,7 +30,7 @@ "title": "Go to data", "tooltip": "Go to data", "type": "link", - "url": 
"http://transformers-benchmarks.huggingface.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}" + "url": "http://transformers-benchmarks.hf.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}" } ], "liveNow": true, @@ -77,7 +77,7 @@ "properties": [ { "id": "custom.width", - "value": 196 + "value": 202 } ] }, @@ -101,7 +101,7 @@ "properties": [ { "id": "custom.width", - "value": 581 + "value": 524 } ] }, @@ -113,7 +113,19 @@ "properties": [ { "id": "custom.width", - "value": 379 + "value": 353 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "model_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 216 } ] } @@ -143,12 +155,14 @@ "targets": [ { "datasource": { - "type": "grafana-postgresql-datasource" + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name, created_at AS date FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT commit_id, commit_message, metadata->>'gpu_name' as gpu_name, metadata->>'model_id' as model_id, created_at AS date FROM benchmarks WHERE branch = '${branch}' AND metadata->>'gpu_name' = '${gpu_name}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -306,13 +320,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -431,13 +446,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -565,13 +581,14 @@ "targets": [ { "datasource": { + "default": true, "type": 
"grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -686,13 +703,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -807,13 +825,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -928,13 +947,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, 
left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1062,13 +1082,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1183,13 +1204,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1304,13 +1326,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1425,13 +1448,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT 
CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1480,11 +1504,7 @@ "id": 15, "panels": [ { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -1528,8 +1548,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1563,8 +1582,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -1665,11 +1685,7 @@ "type": "timeseries" }, { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -1713,8 +1729,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1748,8 +1763,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -1850,11 +1866,7 @@ "type": "timeseries" }, { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -1898,8 +1910,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1933,8 +1944,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -2035,11 +2047,7 @@ "type": "timeseries" }, { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -2083,8 +2091,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2118,8 +2125,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -2224,7 +2232,6 @@ "type": "row" } ], - "refresh": "", "schemaVersion": 39, "tags": [], "templating": { @@ -2236,6 +2243,7 @@ "value": "main" }, "datasource": { + "default": true, "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, @@ -2248,7 +2256,7 @@ "name": "branch", "options": [], "query": "SELECT DISTINCT branch FROM benchmarks;", - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2261,6 +2269,7 @@ "value": "1729701492845" }, "datasource": { + "default": true, "type": 
"grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, @@ -2281,10 +2290,11 @@ { "current": { "selected": false, - "text": "1730120430069", - "value": "1730120430069" + "text": "1730393397577", + "value": "1730393397577" }, "datasource": { + "default": true, "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, @@ -2312,15 +2322,16 @@ "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, - "definition": "SELECT DISTINCT gpu_name FROM benchmarks;", + "definition": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;", + "description": "", "hide": 0, "includeAll": false, "label": "GPU", "multi": false, "name": "gpu_name", "options": [], - "query": "SELECT DISTINCT gpu_name FROM benchmarks;", - "refresh": 2, + "query": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;", + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2328,7 +2339,7 @@ }, { "current": { - "selected": false, + "selected": true, "text": "10", "value": "10" }, @@ -2359,6 +2370,6 @@ "timezone": "browser", "title": "Transformers benchmarks", "uid": "fdz33iyzln9c0a", - "version": 4, + "version": 10, "weekStart": "" } diff --git a/benchmark/grafana_datasource.yaml b/benchmark/grafana_datasource.yaml new file mode 100644 index 00000000000000..25f36254104ab5 --- /dev/null +++ b/benchmark/grafana_datasource.yaml @@ -0,0 +1,17 @@ +apiVersion: 1 +datasources: + - name: grafana-postgresql-datasource + uid: be28nkzirtb0gd + type: postgres + url: $GRAFANA_POSTGRES_DATASOURCE_URL + user: $GRAFANA_POSTGRES_DATASOURCE_USER + secureJsonData: + password: $GRAFANA_POSTGRES_DATASOURCE_PWD + jsonData: + database: metrics + maxOpenConns: 100 + maxIdleConns: 100 + maxIdleConnsAuto: true + connMaxLifetime: 14400 + postgresVersion: 1000 + timescaledb: false diff --git a/benchmark/init_db.sql b/benchmark/init_db.sql index 573cc11518e857..a7864c4af183b6 100644 --- a/benchmark/init_db.sql +++ b/benchmark/init_db.sql @@ -3,7 +3,7 @@ CREATE TABLE IF NOT EXISTS benchmarks ( branch VARCHAR(255), commit_id VARCHAR(72), commit_message VARCHAR(70), - gpu_name VARCHAR(255), + metadata jsonb, created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') ); diff --git a/benchmark/llama.py b/benchmark/llama.py index 4a2c57422e6ffb..bbe1afefd5ef1b 100644 --- a/benchmark/llama.py +++ b/benchmark/llama.py @@ -1,71 +1,25 @@ -import argparse -import json -import logging +from logging import Logger import os -import sys -from statistics import mean from threading import Event, Thread from time import perf_counter, sleep from typing import Optional +from benchmarks_entrypoint import MetricsRecorder import gpustat import psutil import psycopg2 import torch from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache -from psycopg2.extras import Json -from psycopg2.extensions import register_adapter os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -handler = logging.StreamHandler(sys.stdout) -handler.setLevel(logging.INFO) -formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) - os.environ["TOKENIZERS_PARALLELISM"] = "1" torch.set_float32_matmul_precision("high") -register_adapter(dict, Json) - - -def parse_arguments(): - """ - Parse command line arguments for the benchmarking CLI. 
- """ - parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.") - - parser.add_argument( - "branch", - type=str, - help="The branch name on which the benchmarking is performed.", - ) - - parser.add_argument( - "commit_id", - type=str, - help="The commit hash on which the benchmarking is performed.", - ) - parser.add_argument( - "commit_msg", - type=str, - help="The commit message associated with the commit, truncated to 70 characters.", - ) - args = parser.parse_args() - - return args.branch, args.commit_id, args.commit_msg - - -def collect_metrics(benchmark_id, continue_metric_collection): +def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder): p = psutil.Process(os.getpid()) - conn = psycopg2.connect("dbname=metrics") - cur = conn.cursor() while not continue_metric_collection.is_set(): with p.oneshot(): cpu_util = p.cpu_percent() @@ -73,47 +27,41 @@ def collect_metrics(benchmark_id, continue_metric_collection): gpu_stats = gpustat.GPUStatCollection.new_query() gpu_util = gpu_stats[0]["utilization.gpu"] gpu_mem_megabytes = gpu_stats[0]["memory.used"] - cur.execute( - "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)", - (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes), + metrics_recorder.collect_device_measurements( + benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes ) sleep(0.01) - conn.commit() - conn.close() -def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): continue_metric_collection = Event() metrics_thread = None + model_id = "meta-llama/Llama-2-7b-hf" + metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg) try: gpu_stats = gpustat.GPUStatCollection.new_query() gpu_name = gpu_stats[0]["name"] - conn = psycopg2.connect("dbname=metrics") - cur = conn.cursor() - cur.execute( - "INSERT INTO benchmarks (branch, commit_id, commit_message, gpu_name) VALUES (%s, %s, %s, %s) RETURNING benchmark_id", - (branch, commit_id, commit_msg, gpu_name), + benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id}) + logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}") + metrics_thread = Thread( + target=collect_metrics, + args=[benchmark_id, continue_metric_collection, metrics_recorder], ) - conn.commit() - benchmark_id = cur.fetchone()[0] - logger.info(f"running benchmark #{benchmark_id} on {gpu_name}") - metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection]) metrics_thread.start() logger.info("started background thread to fetch device metrics") os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling device = "cuda" - ckpt = "meta-llama/Llama-2-7b-hf" logger.info("downloading weights") # This is to avoid counting download in model load time measurement - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16) gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1) logger.info("loading model") start = perf_counter() model = AutoModelForCausalLM.from_pretrained( - ckpt, torch_dtype=torch.float16, generation_config=gen_config + model_id, torch_dtype=torch.float16, 
generation_config=gen_config
+            model_id, torch_dtype=torch.float16, generation_config=gen_config
         ).eval()
         model.to(device)
         torch.cuda.synchronize()
@@ -121,7 +69,7 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge
         model_load_time = end - start
         logger.info(f"loaded model in: {model_load_time}s")
 
-        tokenizer = AutoTokenizer.from_pretrained(ckpt)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
 
         prompt = "Why dogs are so cute?"
         inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
@@ -368,41 +316,27 @@ def decode_one_token(model, cur_token, cache_position, past_key_values):
         logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s")
         logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
 
-        cur.execute(
-            """
-            INSERT INTO model_measurements (
-                benchmark_id,
-                measurements
-            ) VALUES (%s, %s)
-            """,
-            (
-                benchmark_id,
-                {
-                    "model_load_time": model_load_time,
-                    "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
-                    "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
-                    "first_eager_generate_time_secs": first_eager_generate_time,
-                    "second_eager_generate_time_secs": second_eager_generate_time,
-                    "time_to_first_token_secs": time_to_first_token,
-                    "time_to_second_token_secs": time_to_second_token,
-                    "time_to_third_token_secs": time_to_third_token,
-                    "time_to_next_token_mean_secs": mean_time_to_next_token,
-                    "first_compile_generate_time_secs": first_compile_generate_time,
-                    "second_compile_generate_time_secs": second_compile_generate_time,
-                    "third_compile_generate_time_secs": third_compile_generate_time,
-                    "fourth_compile_generate_time_secs": fourth_compile_generate_time,
-                },
-            ),
+        metrics_recorder.collect_model_measurements(
+            benchmark_id,
+            {
+                "model_load_time": model_load_time,
+                "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
+                "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
+                "first_eager_generate_time_secs": first_eager_generate_time,
+                "second_eager_generate_time_secs": second_eager_generate_time,
+                "time_to_first_token_secs": time_to_first_token,
+                "time_to_second_token_secs": time_to_second_token,
+                "time_to_third_token_secs": time_to_third_token,
+                "time_to_next_token_mean_secs": mean_time_to_next_token,
+                "first_compile_generate_time_secs": first_compile_generate_time,
+                "second_compile_generate_time_secs": second_compile_generate_time,
+                "third_compile_generate_time_secs": third_compile_generate_time,
+                "fourth_compile_generate_time_secs": fourth_compile_generate_time,
+            },
         )
-        conn.commit()
-        conn.close()
     except Exception as e:
         logger.error(f"Caught exception: {e}")
     continue_metric_collection.set()
     if metrics_thread is not None:
         metrics_thread.join()
-
-
-if __name__ == "__main__":
-    branch, commit_id, commit_msg = parse_arguments()
-    run_benchmark(branch, commit_id, commit_msg, num_tokens_to_generate=20)
+    metrics_recorder.close()