From fc699e4fcced91ee2a8ede91e5fd5fc82c881a82 Mon Sep 17 00:00:00 2001
From: Egor Krasheninnikov <krasheninnikovegor@gmail.com>
Date: Mon, 19 Feb 2024 16:54:12 +0000
Subject: [PATCH] rm unused: mnist and old utils for plotting

---
 mnist.ipynb                    | 303 -------------------------------
 utils/aggregation_utils_old.py | 314 ---------------------------------
 2 files changed, 617 deletions(-)
 delete mode 100644 mnist.ipynb
 delete mode 100644 utils/aggregation_utils_old.py

diff --git a/mnist.ipynb b/mnist.ipynb
deleted file mode 100644
index e0d7aa1..0000000
--- a/mnist.ipynb
+++ /dev/null
@@ -1,303 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 179,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torchvision.transforms as transforms\n",
-    "\n",
-    "transform = transforms.Compose([transforms.Normalize((0), (1,)), transforms.ToTensor()]) #transforms.Resize((784, 1))])\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 184,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torchvision.datasets as datasets\n",
-    "mnist_trainset = datasets.MNIST(root='./mnist_data', train=True, download=True, transform=None)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 188,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "AttributeError",
-     "evalue": "'Tensor' object has no attribute 'astype'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[188], line 4\u001b[0m\n\u001b[1;32m      1\u001b[0m imgs, labels \u001b[39m=\u001b[39m mnist_trainset\u001b[39m.\u001b[39mdata, mnist_trainset\u001b[39m.\u001b[39mtargets\n\u001b[1;32m      2\u001b[0m imgs \u001b[39m=\u001b[39m imgs\u001b[39m.\u001b[39mreshape(imgs\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m], imgs\u001b[39m.\u001b[39mshape[\u001b[39m1\u001b[39m] \u001b[39m*\u001b[39m imgs\u001b[39m.\u001b[39mshape[\u001b[39m2\u001b[39m])\n\u001b[0;32m----> 4\u001b[0m imgs \u001b[39m=\u001b[39m imgs\u001b[39m.\u001b[39;49mastype(\u001b[39m'\u001b[39m\u001b[39mfloat32\u001b[39m\u001b[39m'\u001b[39m) \u001b[39m/\u001b[39m \u001b[39m255\u001b[39m\n",
-      "\u001b[0;31mAttributeError\u001b[0m: 'Tensor' object has no attribute 'astype'"
-     ]
-    }
-   ],
-   "source": [
-    "imgs, labels = mnist_trainset.data, mnist_trainset.targets\n",
-    "imgs = imgs.reshape(imgs.shape[0], imgs.shape[1] * imgs.shape[2])\n",
-    "\n",
-    "imgs = imgs.ca('float32') / 255"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 186,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "imgs = imgs[:784]\n",
-    "labels = labels[:784]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 187,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "imgs = imgs.transpose(1, 0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 138,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "import torch.nn as nn\n",
-    "from torch.utils.data import DataLoader, TensorDataset\n",
-    "from transformers import Trainer\n",
-    "\n",
-    "class SimpleNet(nn.Module):\n",
-    "    def __init__(self):\n",
-    "        super().__init__()\n",
-    "        self.linear_0 = nn.Linear(784, 128)\n",
-    "        self.linear_1 = nn.Linear(128, 10)\n",
-    "        \n",
-    "    def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
-    "        x = x.squeeze(-1)\n",
-    "        x = torch.relu(self.linear_0(x))\n",
-    "        x = self.linear_1(x)\n",
-    "        \n",
-    "        return x"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 139,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = TensorDataset(imgs, labels)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 140,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data_loader = DataLoader(dataset,\n",
-    "                        batch_size=32,\n",
-    "                        shuffle=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 141,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from transformers import Trainer, TrainingArguments"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 159,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model = SimpleNet()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 170,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "loss_fn = torch.nn.CrossEntropyLoss()\n",
-    "optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 174,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def train_one_epoch(epoch_index, tb_writer):\n",
-    "    running_loss = 0.\n",
-    "    last_loss = 0.\n",
-    "\n",
-    "    # Here, we use enumerate(training_loader) instead of\n",
-    "    # iter(training_loader) so that we can track the batch\n",
-    "    # index and do some intra-epoch reporting\n",
-    "    for i, data in enumerate(data_loader):\n",
-    "        # Every data instance is an input + label pair\n",
-    "        inputs, labels = data\n",
-    "\n",
-    "        # Zero your gradients for every batch!\n",
-    "        optimizer.zero_grad()\n",
-    "        print(inputs.shape, labels.shape)\n",
-    "        # Make predictions for this batch\n",
-    "        outputs = model(inputs)\n",
-    "\n",
-    "        # Compute the loss and its gradients\n",
-    "        loss = loss_fn(outputs, labels)\n",
-    "        loss.backward()\n",
-    "\n",
-    "        # Adjust learning weights\n",
-    "        optimizer.step()\n",
-    "\n",
-    "        # Gather data and report\n",
-    "        running_loss += loss.item()\n",
-    "        if i % 1000 == 999:\n",
-    "            last_loss = running_loss / 1000 # loss per batch\n",
-    "            print('  batch {} loss: {}'.format(i + 1, last_loss))\n",
-    "            tb_x = epoch_index * len(data_loader) + i + 1\n",
-    "            tb_writer.add_scalar('Loss/train', last_loss, tb_x)\n",
-    "            running_loss = 0.\n",
-    "\n",
-    "    return last_loss"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 175,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from datetime import datetime\n",
-    "from torch.utils.tensorboard import SummaryWriter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 176,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "EPOCH 1:\n",
-      "torch.Size([32, 784]) torch.Size([32])\n"
-     ]
-    },
-    {
-     "ename": "RuntimeError",
-     "evalue": "expected scalar type Float but found Byte",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[176], line 15\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[39m# Make sure gradient tracking is on, and do a pass over the data\u001b[39;00m\n\u001b[1;32m     14\u001b[0m model\u001b[39m.\u001b[39mtrain(\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m---> 15\u001b[0m avg_loss \u001b[39m=\u001b[39m train_one_epoch(epoch_number, writer)\n\u001b[1;32m     17\u001b[0m \u001b[39m# We don't need gradients on to do reporting\u001b[39;00m\n\u001b[1;32m     18\u001b[0m model\u001b[39m.\u001b[39mtrain(\u001b[39mFalse\u001b[39;00m)\n",
-      "Cell \u001b[0;32mIn[174], line 16\u001b[0m, in \u001b[0;36mtrain_one_epoch\u001b[0;34m(epoch_index, tb_writer)\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[39mprint\u001b[39m(inputs\u001b[39m.\u001b[39mshape, labels\u001b[39m.\u001b[39mshape)\n\u001b[1;32m     15\u001b[0m \u001b[39m# Make predictions for this batch\u001b[39;00m\n\u001b[0;32m---> 16\u001b[0m outputs \u001b[39m=\u001b[39m model(inputs)\n\u001b[1;32m     18\u001b[0m \u001b[39m# Compute the loss and its gradients\u001b[39;00m\n\u001b[1;32m     19\u001b[0m loss \u001b[39m=\u001b[39m loss_fn(outputs, labels)\n",
-      "File \u001b[0;32m~/anaconda3/envs/gpt/lib/python3.10/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m   1126\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1127\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1128\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1129\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49m\u001b[39minput\u001b[39;49m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m   1131\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
-      "Cell \u001b[0;32mIn[138], line 14\u001b[0m, in \u001b[0;36mSimpleNet.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m     12\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, x: torch\u001b[39m.\u001b[39mTensor) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m torch\u001b[39m.\u001b[39mTensor:\n\u001b[1;32m     13\u001b[0m     x \u001b[39m=\u001b[39m x\u001b[39m.\u001b[39msqueeze(\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m)\n\u001b[0;32m---> 14\u001b[0m     x \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mrelu(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mlinear_0(x))\n\u001b[1;32m     15\u001b[0m     x \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlinear_1(x)\n\u001b[1;32m     17\u001b[0m     \u001b[39mreturn\u001b[39;00m x\n",
-      "File \u001b[0;32m~/anaconda3/envs/gpt/lib/python3.10/site-packages/torch/nn/modules/module.py:1130\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m   1126\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1127\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1128\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1129\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1130\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49m\u001b[39minput\u001b[39;49m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m   1131\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1132\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
-      "File \u001b[0;32m~/anaconda3/envs/gpt/lib/python3.10/site-packages/torch/nn/modules/linear.py:114\u001b[0m, in \u001b[0;36mLinear.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m    113\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m: Tensor) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Tensor:\n\u001b[0;32m--> 114\u001b[0m     \u001b[39mreturn\u001b[39;00m F\u001b[39m.\u001b[39;49mlinear(\u001b[39minput\u001b[39;49m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mweight, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbias)\n",
-      "\u001b[0;31mRuntimeError\u001b[0m: expected scalar type Float but found Byte"
-     ]
-    }
-   ],
-   "source": [
-    "# Initializing in a separate cell so we can easily add more epochs to the same run\n",
-    "timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')\n",
-    "writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))\n",
-    "epoch_number = 0\n",
-    "\n",
-    "EPOCHS = 5\n",
-    "\n",
-    "best_vloss = 1_000_000.\n",
-    "\n",
-    "for epoch in range(EPOCHS):\n",
-    "    print('EPOCH {}:'.format(epoch_number + 1))\n",
-    "\n",
-    "    # Make sure gradient tracking is on, and do a pass over the data\n",
-    "    model.train(True)\n",
-    "    avg_loss = train_one_epoch(epoch_number, writer)\n",
-    "\n",
-    "    # We don't need gradients on to do reporting\n",
-    "    model.train(False)\n",
-    "\n",
-    "    running_vloss = 0.0\n",
-    "    # for i, vdata in enumerate(validation_loader):\n",
-    "    #     vinputs, vlabels = vdata\n",
-    "    #     voutputs = model(vinputs)\n",
-    "    #     vloss = loss_fn(voutputs, vlabels)\n",
-    "    #     running_vloss += vloss\n",
-    "\n",
-    "    avg_vloss = running_vloss / (i + 1)\n",
-    "    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))\n",
-    "\n",
-    "    # Log the running loss averaged per batch\n",
-    "    # for both training and validation\n",
-    "    writer.add_scalars('Training vs. Validation Loss',\n",
-    "                    { 'Training' : avg_loss, 'Validation' : avg_vloss },\n",
-    "                    epoch_number + 1)\n",
-    "    writer.flush()\n",
-    "\n",
-    "    # Track best performance, and save the model's state\n",
-    "    if avg_vloss < best_vloss:\n",
-    "        best_vloss = avg_vloss\n",
-    "        model_path = 'model_{}_{}'.format(timestamp, epoch_number)\n",
-    "        torch.save(model.state_dict(), model_path)\n",
-    "\n",
-    "    epoch_number += 1"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "gpt",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "ec868635af931660b05f2ea3e465e69a5f21692fde26f4ee4d537a52dd704680"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/utils/aggregation_utils_old.py b/utils/aggregation_utils_old.py
deleted file mode 100644
index 72e21bd..0000000
--- a/utils/aggregation_utils_old.py
+++ /dev/null
@@ -1,314 +0,0 @@
-import json
-import os
-
-import matplotlib
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import seaborn as sns
-from pathlib import Path
-from scipy.stats import ttest_ind_from_stats
-from tbparse import SummaryReader
-from matplotlib import rc
-rc('text', usetex=True)
-plt.rcParams['text.usetex'] = True
-
-rc('text.latex', preamble=r'\usepackage{color, amsfonts, amsmath, amsthm}')
-
-
-def aggregate_mean_std_count(df):
-    # df is the output of utils.aggregation_utils.make_experiment_plot
-    agg_df = df.groupby(['tag', 'epoch']).agg({'value': ['mean', 'std', 'count']})
-    agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
-    agg_df.reset_index(inplace=True)
-    return agg_df
-
-
-def ttest_eval_df(agg_df, tag1, tag2):
-    # agg_df is the output of utils.aggregation_utils.aggregate_mean_std_count
-    # print(agg_df['tag'].unique())
-    df_tag1 = agg_df[agg_df['tag'] == tag1]
-    df_tag2 = agg_df[agg_df['tag'] == tag2]
-    return ttest_ind_from_stats(mean1=df_tag1['value_mean'], std1=df_tag1['value_std'], nobs1=np.array(df_tag1['value_count']),
-                                mean2=df_tag2['value_mean'], std2=df_tag2['value_std'], nobs2=np.array(df_tag2['value_count']),
-                                alternative='greater')
-
-
-def aggregate_results(run_generic_name, runs_directory='./', eval_files=None, run_name_exclude=None, os_list=None, metric='EM'):
-    """
-    @param run_generic_name: ex. gpt2-medium-seed
-    @return:
-    """
-    assert metric in ['EM', 'F1']
-    if os_list is None:
-        os_list = os.listdir(runs_directory)
-    extracted_runs_names = [name for name in os_list
-                            if name.startswith(run_generic_name)]
-    if run_name_exclude:
-        extracted_runs_names = [
-            name for name in extracted_runs_names if run_name_exclude not in name]
-    print(f'Aggregating from {len(extracted_runs_names)} runs')
-    # for i, name in enumerate(extracted_runs_names):
-    #     print(f'{i+1}) {name}')
-
-    if eval_files is None:
-        eval_files = ['eval_d1consis', 'eval_d2consis', 'eval_no_qd_baseline']
-
-    all_results = []
-    for name in extracted_runs_names:
-        # seed = int(name[name.find('B-s') + 3:])
-        run_results = []
-        for eval_file in eval_files:
-            try:
-                with open(os.path.join(runs_directory, name, eval_file + '_results.json')) as f:
-                    data = json.load(f)
-            except FileNotFoundError:
-                # print(f'File {eval_file} not found in {name}')
-                break
-            # except Exception:
-            #     print('Broken json', seed)
-            #     continue
-
-            run_results.append(data[f'{metric} ' + '{k}'])
-        if len(run_results) == len(eval_files):
-            all_results.append(run_results)
-    assert len(all_results) > 0
-    print(f'Successfully loaded full results from {len(all_results)} runs')
-
-    averaged = np.array(all_results).mean(axis=0)
-    # ddof=1 for unbiased std (bessel's correction)
-    stds = np.array(all_results).std(axis=0, ddof=1)
-    res_dict = dict(
-        zip(eval_files, zip(averaged, stds, [len(all_results)]*len(eval_files))))
-
-    for k in dict(res_dict):
-        if k.startswith('eval_'):
-            res_dict[k[5:]] = res_dict.pop(k)
-
-    df = pd.DataFrame.from_dict(res_dict, orient='index', columns=[
-                                f'{metric} avg', f'{metric} std', 'n_runs'])
-    df = df.drop(columns=['n_runs'])
-    return res_dict
-
-
-def ttest_res_dict(res_dict, var1, var2):
-    return ttest_ind_from_stats(mean1=res_dict[var1][0], std1=res_dict[var1][1], nobs1=res_dict[var1][2],
-                                mean2=res_dict[var2][0], std2=res_dict[var2][1], nobs2=res_dict[var2][2],
-                                alternative='greater')
-
-
-def prettify_labels(labels_list, labels_mapping=None, bs=None):
-    if labels_mapping is None:
-        labels_mapping = {
-            'defs_': 'Defs ',
-            'questions_': 'Questions ',
-            '_swapped': ' (assoc with defs)',
-            # 'ent_assoc_meaning_': 'Meaning of var? ',
-            # 'ent_assoc_who_': 'Who is var? ',
-            # 'ent_assoc_name_': 'Name of var? ',
-            # 'ent_assoc_standFor_': 'What does var stand for? ',
-            'ent_assoc_meaning_': '',
-            'ent_assoc_who_': '',
-            'ent_assoc_name_': '',
-            'ent_assoc_standFor_': '',
-            'qd1consis': r'$\dot{\mathtt{D}}_1^\text{cons}\mathtt{QA}_1$',
-            'qd1incons': r'$\dot{\mathtt{D}}_8^\text{incons}\mathtt{QA}_8$',
-            'qd2consis': r'$\overline{\mathtt{D}}_9^\text{cons}\mathtt{QA}_9$',
-            'qd2incons': r'$\overline{\mathtt{D}}_2^\text{incons}\mathtt{QA}_2$',
-            'q': r'$\mathtt{QA}_3$',
-            'q_no_replacement_baseline': r'$\hat{\mathtt{QA}}_4$',
-            'd1consis': r'$\dot{\mathtt{D}}_5^\text{cons}$',
-            'd2consis': r'$\overline{\mathtt{D}}_6^\text{cons}$',
-            'd3consis' : r'$\tilde{\mathtt{D}}_0^\text{cons}$',
-            'no_qd_baseline': r'$\mathtt{QA}_7$',
-            }
-    def prettify_label(label, bs=None):
-        # go from longest to shortest keys
-        for k in sorted(labels_mapping, key=lambda x: len(x), reverse=True):
-            if k in label:
-            #label = label.replace(k, labels_mapping[k])
-                label = labels_mapping[k]
-                if bs: label += f' ({bs})'
-                break
-        return label
-    return [prettify_label(label, bs) for label in labels_list]
-    # return [labels_mapping.get(label, label) for label in labels_list]
-    
-    
-def make_experiment_plot(exp_name, stage_paths, thruncate_stages_after_epoch=None, eval_each_epochs_per_stage=None,
-                         tags=['eval/d1consis_EM', 'eval/d2consis_EM'], os_list=None, ylabel='Value', title='',
-                         figsize=(5.7,4), legend_loc='best', colors=None, bs=None):
-    """
-    exp_name - name of the experiment (top level folder name)
-    stage_paths - list of strings that are the starts to paths to stages, 
-    e.g. ['first_stage', 'second_stage', 's']
-    thruncate_stages_after_epoch - list of ints, how many epochs to thruncate each stage after. Use -1 to not thruncate
-    eval_each_epochs_per_stage - list of ints, how many epochs to are skipped between evaluations
-    
-    colors - list of colors for each stage ('blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan')
-    """
-
-    # fixed order to use colors
-    color2order = {'blue': 0, 'orange': 1, 'green': 2, 'red': 3, 'purple': 4, 'brown': 5, 'pink': 6, 'gray': 7, 'olive': 8, 'cyan': 9}  
-    name2color = {'d1consis': 'blue', 'q': 'brown',  'qd2incons': 'pink',  'd2consis': 'red', 'qd1consis': 'purple',
-                  'no_qd_baseline': 'orange', 'q_no_replacement_baseline': 'green', 'qd1incons': 'cyan', 'qd2consis': 'olive', 'd3consis': 'gray',
-                  'd2incons': 'red'}
-    
-    palette = sns.color_palette()  # default palette, muted version of tab10
-    
-    if colors is None:
-        # tag -> name -> order -> color
-        names = []
-        for tag in tags:
-            for k in sorted(name2color.keys(), key=lambda x: len(x), reverse=True):
-                if k in tag:
-                    names.append(k)
-                    break
-        colors = [palette[color2order[name2color[name]]] for name in names]
-    else:
-        colors = [palette[color2order[color]] for color in colors]
-    
-    if eval_each_epochs_per_stage is None:
-        # TODO load eval_each_epochs_per_stage from config yaml file instead
-        eval_each_epochs_per_stage = [1] * len(stage_paths)
-    assert len(stage_paths) == len(thruncate_stages_after_epoch) == len(eval_each_epochs_per_stage)
-    exp_folder = f'experiments/{exp_name}'
-    if os_list is None:
-        os_list = os.listdir(exp_folder)
-        
-    dfs_all_stages = []
-    dfs_t_all_stages = []
-    
-    maxstep = 0
-    maxepoch = 0
-    for stage_path, thruncate_after_epoch, eval_each_epochs in zip(stage_paths, thruncate_stages_after_epoch, eval_each_epochs_per_stage):
-        curr_stage_exp_names = [x for x in os_list if x.startswith(stage_path)]
-        
-        # take only seed_stage2 = 0 experiments
-        # if 's2stage' in curr_stage_exp_names[0]:
-        #     curr_stage_exp_names = [x for x in curr_stage_exp_names if 's2stage0' in x]
-        
-        # remove experiments with ent_assoc and _q in stage2 (keep only d1consis, d2consis, d3consis)
-        tags_to_retrieve = tags.copy()
-        if len(dfs_all_stages)>0:
-            tags_to_retrieve = [t for t in tags_to_retrieve if not ('ent_assoc' in t and '_q' in t)]
-
-        print(f'Retrieving from {len(curr_stage_exp_names)} experiments')
-        dfs = []
-        dfs_t = []
-        unique_tags = set()
-        for experiment_name in curr_stage_exp_names:
-            logdir = os.path.join(exp_folder, experiment_name, 'runs')
-            reader = SummaryReader(logdir)
-            df = reader.scalars
-            tensors = reader.tensors
-            if not df.empty:
-                unique_tags = unique_tags | set(df.tag.unique())
-                # filter only relevant data
-                df = df[df.tag.isin(tags_to_retrieve)]
-                tensors = tensors[tensors.tag.isin(tags_to_retrieve)]
-                if thruncate_after_epoch != -1:
-                    # thruncate after epoch
-                    step_to_thruncate_after = sorted(df.step.unique())[thruncate_after_epoch//eval_each_epochs-1]
-                    df = df[df.step <= step_to_thruncate_after]
-                    tensors = tensors[tensors.step <= step_to_thruncate_after]
-
-                step_to_epoch = {step: (epoch + 1) * eval_each_epochs for epoch, step in enumerate(sorted(df.step.unique()))}
-                step_to_epoch_t = {step: (epoch + 1) * eval_each_epochs for epoch, step in enumerate(sorted(tensors.step.unique()))}
-                df['epoch'] = df['step'].map(step_to_epoch)
-                tensors['epoch'] = tensors['step'].map(step_to_epoch_t)
-                
-                dfs.append(df)
-                dfs_t.append(tensors)
-
-        print(f'Succesfully retrieved from {len(dfs)} experiments')
-        df_curr_stage = pd.concat(dfs, axis=0)
-        df_t_curr_stage = pd.concat(dfs_t, axis=0)
-        
-        df_curr_stage['epoch'] += maxepoch
-        df_curr_stage['step'] += maxstep
-        
-        df_t_curr_stage['epoch'] += maxepoch
-        df_t_curr_stage['step'] += maxstep
-        
-        maxstep = df_curr_stage.step.max()
-        maxepoch = df_curr_stage.epoch.max()
-        print(f'Epochs: {maxepoch}, steps: {maxstep}')
-        dfs_all_stages.append(df_curr_stage)
-        dfs_t_all_stages.append(df_t_curr_stage)
-                          
-    df = pd.concat(dfs_all_stages, axis=0)
-    df_t = pd.concat(dfs_t_all_stages, axis=0)
-
-    df['tag'] = df['tag'].apply(lambda x: x.replace('eval/', '').replace('train_', '').replace('_EM', '').replace('_loss', ''))
-    df_t['tag'] = df_t['tag'].apply(lambda x: x.replace('eval/', '').replace('train_', '').replace('_EM', '').replace('_loss', ''))
-    
-    tags = [x.replace('eval/', '').replace('train_', '').replace('_EM', '').replace('_loss', '') for x in tags]
-
-    matplotlib.rcParams['font.family'] = 'Times New Roman'
-    matplotlib.rcParams.update({'font.size': 12})
-    fig, ax = plt.subplots(figsize=figsize)
-    
-    ax1 = sns.pointplot(ax = ax,
-                        data=df,
-                        x = 'epoch',
-                        y = 'value', 
-                        hue='tag', 
-                        #hue_order=tags,
-                        palette=colors, markers=['d']*len(colors))#capsize=.1, errwidth=.9,)
-    
-    # ax1.set_ylim([0.45, 0.6])
-    n_epochs_per_stage = [len(df.epoch.unique()) for df in dfs_all_stages]
-    if len(n_epochs_per_stage) > 1:
-        curr_stage_end_epoch = 0
-        for i, n_epochs in enumerate(n_epochs_per_stage):
-            if i != len(n_epochs_per_stage) - 1: # no dashed line after last stage
-                ax1.axvline(x=ax1.get_xticks()[curr_stage_end_epoch + n_epochs - 1], color='black', linestyle='--')
-            
-            # add text indicating stage number if there is more than 1 stage
-            loc = curr_stage_end_epoch + n_epochs // 2 - 1
-            y_pos = ax1.get_ylim()[1] #+ (ax1.get_ylim()[1] - ax1.get_ylim()[0]) * .05
-            ax1.text(loc, y_pos, rf'Stage ${i+1}$', ha='center', va='bottom', fontsize=10)
-            
-            curr_stage_end_epoch += n_epochs
-    
-    # remove every second xticklabel
-    xticklabels = ax1.get_xticklabels()
-    for i in range(len(xticklabels)):
-        if i % 2 == 1:
-            xticklabels[i].set_text('')
-    ax1.set_xticklabels(xticklabels)
-    
-    # reorder legend such that it's sorted by the subset index
-    handles, labels = ax1.get_legend_handles_labels()
-    new_labels = prettify_labels(tags, bs=bs)
-    # sort by single-digit numbers that are part of the label
-    # sorted_pairs = sorted(zip(handles, new_labels), key=lambda zipped_pair: int([c for c in zipped_pair[1] if c.isdigit()][0]))
-    # handles, new_labels = zip(*sorted_pairs)
-    legend = ax1.legend(handles, new_labels, fontsize=12, loc=legend_loc)
-    legend.set_zorder(100)
-    
-    ax1.set_xlabel('Epoch', fontsize=14)
-    ax1.set_ylabel(ylabel, fontsize=14)
-    if title:
-        ax1.set_title(title, y=1.05)
-    
-    #plt.tight_layout()
-    #plt.show()
-    
-    # SAVING
-    # make sure the plots folder exists and create it if it doesn't
-    plt_name = (exp_name + ylabel).replace(' ', '').replace('.', '')
-    plt_format = 'pdf'
-    # plt_format = 'svg'
-    plt_path = f'plots/{exp_name}'
-    Path(plt_path).mkdir(parents=True, exist_ok=True)
-    n = 1
-    # Check if the file already exists and increment n if it does
-    while Path(f'{plt_path}/{plt_name}-{n}.{plt_format}').exists():
-        n += 1
-    # Save the plot to a file with the updated n value
-    fig.savefig(f'{plt_path}/{plt_name}-{n}.{plt_format}')
-    plt.close()
-    
-    return df, df_t
\ No newline at end of file