From 13606bd0883bbd48a43c784ded96c929224a165a Mon Sep 17 00:00:00 2001
From: Julius Schlensok <julius@schlensok.org>
Date: Sun, 7 Jul 2024 12:06:48 +0000
Subject: [PATCH] chore: delete erroneously included notebook

---
 spectrum_io/file/parquet.ipynb | 902 ---------------------------------
 1 file changed, 902 deletions(-)
 delete mode 100644 spectrum_io/file/parquet.ipynb

diff --git a/spectrum_io/file/parquet.ipynb b/spectrum_io/file/parquet.ipynb
deleted file mode 100644
index 5d63ee5..0000000
--- a/spectrum_io/file/parquet.ipynb
+++ /dev/null
@@ -1,902 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pathlib import Path\n",
-    "from typing import Union\n",
-    "\n",
-    "import datasets\n",
-    "import pandas as pd\n",
-    "import pyarrow as pa\n",
-    "import pyarrow.feather as feather\n",
-    "import pyarrow.parquet as pq\n",
-    "import scipy\n",
-    "\n",
-    "from parquet import *"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "root_path = Path(\"/cmnfs/proj/prosit_astral/datasets/parquet\")\n",
-    "train_path = root_path / \"train\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "385ec5beebab46e6933deda6d071ca39",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Resolving data files:   0%|          | 0/78 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a11b3f1e3a844189a474b17392ce41a6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c75fd8d80d9342baa53f7faff1372e3a",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data:   0%|          | 0/77 [00:00<?, ?files/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ac52b88968364c048c72edde6a415940",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data:   0%|          | 0/23 [00:00<?, ?files/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "695ef38f0bd840059aaa58256eca6e42",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating train split: 0 examples [00:00, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "ename": "DatasetGenerationError",
-     "evalue": "An error occurred while generating the dataset",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:1997\u001b[0m, in \u001b[0;36mArrowBasedBuilder._prepare_split_single\u001b[0;34m(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)\u001b[0m\n\u001b[1;32m   1996\u001b[0m \u001b[39mif\u001b[39;00m max_shard_size \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m writer\u001b[39m.\u001b[39m_num_bytes \u001b[39m>\u001b[39m max_shard_size:\n\u001b[0;32m-> 1997\u001b[0m     num_examples, num_bytes \u001b[39m=\u001b[39m writer\u001b[39m.\u001b[39;49mfinalize()\n\u001b[1;32m   1998\u001b[0m     writer\u001b[39m.\u001b[39mclose()\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/arrow_writer.py:607\u001b[0m, in \u001b[0;36mArrowWriter.finalize\u001b[0;34m(self, close_stream)\u001b[0m\n\u001b[1;32m    606\u001b[0m     \u001b[39mif\u001b[39;00m close_stream:\n\u001b[0;32m--> 607\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstream\u001b[39m.\u001b[39;49mclose()\n\u001b[1;32m    608\u001b[0m \u001b[39melse\u001b[39;00m:\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/fsspec/implementations/local.py:407\u001b[0m, in \u001b[0;36mLocalFileOpener.close\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    406\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclose\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m--> 407\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mf\u001b[39m.\u001b[39;49mclose()\n",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
-      "\nDuring handling of the above exception, another exception occurred:\n",
-      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:2027\u001b[0m, in \u001b[0;36mArrowBasedBuilder._prepare_split_single\u001b[0;34m(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)\u001b[0m\n\u001b[1;32m   2026\u001b[0m num_shards \u001b[39m=\u001b[39m shard_id \u001b[39m+\u001b[39m \u001b[39m1\u001b[39m\n\u001b[0;32m-> 2027\u001b[0m num_examples, num_bytes \u001b[39m=\u001b[39m writer\u001b[39m.\u001b[39;49mfinalize()\n\u001b[1;32m   2028\u001b[0m writer\u001b[39m.\u001b[39mclose()\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/arrow_writer.py:602\u001b[0m, in \u001b[0;36mArrowWriter.finalize\u001b[0;34m(self, close_stream)\u001b[0m\n\u001b[1;32m    601\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpa_writer \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mschema:\n\u001b[0;32m--> 602\u001b[0m     \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_build_writer(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mschema)\n\u001b[1;32m    603\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpa_writer \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/arrow_writer.py:404\u001b[0m, in \u001b[0;36mArrowWriter._build_writer\u001b[0;34m(self, inferred_schema)\u001b[0m\n\u001b[1;32m    403\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_schema \u001b[39m=\u001b[39m schema\n\u001b[0;32m--> 404\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpa_writer \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_WRITER_CLASS(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstream, schema)\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/ipc.py:85\u001b[0m, in \u001b[0;36mRecordBatchStreamWriter.__init__\u001b[0;34m(self, sink, schema, use_legacy_format, options)\u001b[0m\n\u001b[1;32m     84\u001b[0m options \u001b[39m=\u001b[39m _get_legacy_format_default(use_legacy_format, options)\n\u001b[0;32m---> 85\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_open(sink, schema, options\u001b[39m=\u001b[39;49moptions)\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/ipc.pxi:582\u001b[0m, in \u001b[0;36mpyarrow.lib._RecordBatchStreamWriter._open\u001b[0;34m()\u001b[0m\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/io.pxi:2097\u001b[0m, in \u001b[0;36mpyarrow.lib.get_writer\u001b[0;34m()\u001b[0m\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/io.pxi:232\u001b[0m, in \u001b[0;36mpyarrow.lib.NativeFile.get_output_stream\u001b[0;34m()\u001b[0m\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/io.pxi:246\u001b[0m, in \u001b[0;36mpyarrow.lib.NativeFile._assert_writable\u001b[0;34m()\u001b[0m\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/io.pxi:237\u001b[0m, in \u001b[0;36mpyarrow.lib.NativeFile._assert_open\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;31mValueError\u001b[0m: I/O operation on closed file",
-      "\nThe above exception was the direct cause of the following exception:\n",
-      "\u001b[0;31mDatasetGenerationError\u001b[0m                    Traceback (most recent call last)",
-      "\u001b[1;32m/cmnfs/home/students/j.schlensok/spectrum_io/spectrum_io/file/parquet.ipynb Cell 3\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2Bcompms-cpu-2/cmnfs/home/students/j.schlensok/spectrum_io/spectrum_io/file/parquet.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m ds \u001b[39m=\u001b[39m datasets\u001b[39m.\u001b[39;49mload_dataset(\u001b[39mstr\u001b[39;49m(root_path))\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/load.py:2609\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[1;32m   2606\u001b[0m     \u001b[39mreturn\u001b[39;00m builder_instance\u001b[39m.\u001b[39mas_streaming_dataset(split\u001b[39m=\u001b[39msplit)\n\u001b[1;32m   2608\u001b[0m \u001b[39m# Download and prepare data\u001b[39;00m\n\u001b[0;32m-> 2609\u001b[0m builder_instance\u001b[39m.\u001b[39;49mdownload_and_prepare(\n\u001b[1;32m   2610\u001b[0m     download_config\u001b[39m=\u001b[39;49mdownload_config,\n\u001b[1;32m   2611\u001b[0m     download_mode\u001b[39m=\u001b[39;49mdownload_mode,\n\u001b[1;32m   2612\u001b[0m     verification_mode\u001b[39m=\u001b[39;49mverification_mode,\n\u001b[1;32m   2613\u001b[0m     num_proc\u001b[39m=\u001b[39;49mnum_proc,\n\u001b[1;32m   2614\u001b[0m     storage_options\u001b[39m=\u001b[39;49mstorage_options,\n\u001b[1;32m   2615\u001b[0m )\n\u001b[1;32m   2617\u001b[0m \u001b[39m# Build dataset for splits\u001b[39;00m\n\u001b[1;32m   2618\u001b[0m keep_in_memory \u001b[39m=\u001b[39m (\n\u001b[1;32m   2619\u001b[0m     keep_in_memory \u001b[39mif\u001b[39;00m keep_in_memory \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m is_small_dataset(builder_instance\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size)\n\u001b[1;32m   2620\u001b[0m )\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:1027\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[0;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m   1025\u001b[0m     \u001b[39mif\u001b[39;00m num_proc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m   1026\u001b[0m         prepare_split_kwargs[\u001b[39m\"\u001b[39m\u001b[39mnum_proc\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m num_proc\n\u001b[0;32m-> 1027\u001b[0m     \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_download_and_prepare(\n\u001b[1;32m   1028\u001b[0m         dl_manager\u001b[39m=\u001b[39;49mdl_manager,\n\u001b[1;32m   1029\u001b[0m         verification_mode\u001b[39m=\u001b[39;49mverification_mode,\n\u001b[1;32m   1030\u001b[0m         \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mprepare_split_kwargs,\n\u001b[1;32m   1031\u001b[0m         \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mdownload_and_prepare_kwargs,\n\u001b[1;32m   1032\u001b[0m     )\n\u001b[1;32m   1033\u001b[0m \u001b[39m# Sync info\u001b[39;00m\n\u001b[1;32m   1034\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size \u001b[39m=\u001b[39m \u001b[39msum\u001b[39m(split\u001b[39m.\u001b[39mnum_bytes \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39msplits\u001b[39m.\u001b[39mvalues())\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:1122\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[0;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m   1118\u001b[0m split_dict\u001b[39m.\u001b[39madd(split_generator\u001b[39m.\u001b[39msplit_info)\n\u001b[1;32m   1120\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m   1121\u001b[0m     \u001b[39m# Prepare split will record examples associated to the split\u001b[39;00m\n\u001b[0;32m-> 1122\u001b[0m     \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_prepare_split(split_generator, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mprepare_split_kwargs)\n\u001b[1;32m   1123\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mOSError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m   1124\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mOSError\u001b[39;00m(\n\u001b[1;32m   1125\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mCannot find data file. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m   1126\u001b[0m         \u001b[39m+\u001b[39m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmanual_download_instructions \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m   1127\u001b[0m         \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mOriginal error:\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m   1128\u001b[0m         \u001b[39m+\u001b[39m \u001b[39mstr\u001b[39m(e)\n\u001b[1;32m   1129\u001b[0m     ) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:1882\u001b[0m, in \u001b[0;36mArrowBasedBuilder._prepare_split\u001b[0;34m(self, split_generator, file_format, num_proc, max_shard_size)\u001b[0m\n\u001b[1;32m   1880\u001b[0m job_id \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[1;32m   1881\u001b[0m \u001b[39mwith\u001b[39;00m pbar:\n\u001b[0;32m-> 1882\u001b[0m     \u001b[39mfor\u001b[39;00m job_id, done, content \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_split_single(\n\u001b[1;32m   1883\u001b[0m         gen_kwargs\u001b[39m=\u001b[39mgen_kwargs, job_id\u001b[39m=\u001b[39mjob_id, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m_prepare_split_args\n\u001b[1;32m   1884\u001b[0m     ):\n\u001b[1;32m   1885\u001b[0m         \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m   1886\u001b[0m             result \u001b[39m=\u001b[39m content\n",
-      "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:2038\u001b[0m, in \u001b[0;36mArrowBasedBuilder._prepare_split_single\u001b[0;34m(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)\u001b[0m\n\u001b[1;32m   2036\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(e, DatasetGenerationError):\n\u001b[1;32m   2037\u001b[0m         \u001b[39mraise\u001b[39;00m\n\u001b[0;32m-> 2038\u001b[0m     \u001b[39mraise\u001b[39;00m DatasetGenerationError(\u001b[39m\"\u001b[39m\u001b[39mAn error occurred while generating the dataset\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m   2040\u001b[0m \u001b[39myield\u001b[39;00m job_id, \u001b[39mTrue\u001b[39;00m, (total_num_examples, total_num_bytes, writer\u001b[39m.\u001b[39m_features, num_shards, shard_lengths)\n",
-      "\u001b[0;31mDatasetGenerationError\u001b[0m: An error occurred while generating the dataset"
-     ]
-    }
-   ],
-   "source": [
-    "ds = datasets.load_dataset(str(root_path))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "input_test_file = Path.cwd().parent.parent.parent / \"oktoberfest/data/intensity_data.parquet\"\n",
-    "raw_data = {\n",
-    "    \"intensities\": [\n",
-    "        [4e-5, 0., -1., 0., 0., -1., 0.03, 0., -1., 0.4],\n",
-    "        [.3, 0., -1., 1., 0., -1., 0.4, 0., -1., 0.05],\n",
-    "        [.04, 0., 0., 0., 0., 0., 2e-3, 0., 0., .13]\n",
-    "    ],\n",
-    "    \"sequence\": [\"SVFLTFLR\", \"KTSQIFLAK\", \"SPVGRVTPKEWR\"],\n",
-    "    \"precursor_charge_onehot\": [\n",
-    "        [0, 1, 0, 0, 0, 0],\n",
-    "        [0, 1, 0, 0, 0, 0],\n",
-    "        [0, 0, 1, 0, 0, 0],\n",
-    "    ],\n",
-    "    \"collision_energy_normed\": [.25, .28, .28]\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "output_path = Path.cwd() / \"temp\"\n",
-    "output_path.mkdir(exist_ok=True)\n",
-    "\n",
-    "df = pd.DataFrame(raw_data)\n",
-    "\n",
-    "#df2 = pd.concat([df, df], keys=['1', '2'], names=[\"dataset\", \"index\"])\n",
-    "df2 = pd.concat([df.assign(dataset='1'), df.assign(dataset='2')])\n",
-    "table = pa.Table.from_pandas(df2)\n",
-    "\n",
-    "pq.write_to_dataset(\n",
-    "    table,\n",
-    "    root_path=output_path,\n",
-    "    partition_cols=[\"dataset\"],\n",
-    "    existing_data_behavior=\"delete_matching\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "pyarrow.Table\n",
-       "intensities: list<item: double>\n",
-       "  child 0, item: double\n",
-       "sequence: string\n",
-       "precursor_charge_onehot: list<item: int64>\n",
-       "  child 0, item: int64\n",
-       "collision_energy_normed: double\n",
-       "dataset: string\n",
-       "__index_level_0__: int64\n",
-       "----\n",
-       "intensities: [[[0.00004,0,-1,0,0,-1,0.03,0,-1,0.4],[0.3,0,-1,1,0,-1,0.4,0,-1,0.05],...,[0.3,0,-1,1,0,-1,0.4,0,-1,0.05],[0.04,0,0,0,0,0,0.002,0,0,0.13]]]\n",
-       "sequence: [[\"SVFLTFLR\",\"KTSQIFLAK\",\"SPVGRVTPKEWR\",\"SVFLTFLR\",\"KTSQIFLAK\",\"SPVGRVTPKEWR\"]]\n",
-       "precursor_charge_onehot: [[[0,1,0,0,0,0],[0,1,0,0,0,0],...,[0,1,0,0,0,0],[0,0,1,0,0,0]]]\n",
-       "collision_energy_normed: [[0.25,0.28,0.28,0.25,0.28,0.28]]\n",
-       "dataset: [[\"1\",\"1\",\"1\",\"2\",\"2\",\"2\"]]\n",
-       "__index_level_0__: [[0,1,2,0,1,2]]"
-      ]
-     },
-     "execution_count": 43,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "table"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>intensities</th>\n",
-       "      <th>sequence</th>\n",
-       "      <th>precursor_charge_onehot</th>\n",
-       "      <th>collision_energy_normed</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>[4e-05, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, ...</td>\n",
-       "      <td>SVFLTFLR</td>\n",
-       "      <td>[0, 1, 0, 0, 0, 0]</td>\n",
-       "      <td>0.25</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>[0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1....</td>\n",
-       "      <td>KTSQIFLAK</td>\n",
-       "      <td>[0, 1, 0, 0, 0, 0]</td>\n",
-       "      <td>0.28</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>[0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0....</td>\n",
-       "      <td>SPVGRVTPKEWR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.28</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                         intensities      sequence  \\\n",
-       "0  [4e-05, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, ...      SVFLTFLR   \n",
-       "1  [0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1....     KTSQIFLAK   \n",
-       "2  [0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0....  SPVGRVTPKEWR   \n",
-       "\n",
-       "  precursor_charge_onehot  collision_energy_normed  \n",
-       "0      [0, 1, 0, 0, 0, 0]                     0.25  \n",
-       "1      [0, 1, 0, 0, 0, 0]                     0.28  \n",
-       "2      [0, 0, 1, 0, 0, 0]                     0.28  "
-      ]
-     },
-     "execution_count": 44,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset = pq.ParquetDataset(output_path, filters=[(\"dataset\", \"=\", '1')])\n",
-    "df = dataset.read().to_pandas().drop(\"dataset\", axis=1)\n",
-    "df#.to_pandas()\n",
-    "#read_df = read_partition(output_path, '1')\n",
-    "#pd.testing.assert_frame_equal(read_df, df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 64,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>intensities</th>\n",
-       "      <th>sequence</th>\n",
-       "      <th>precursor_charge_onehot</th>\n",
-       "      <th>collision_energy_aligned_normed</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>[0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....</td>\n",
-       "      <td>SVFLTFLR</td>\n",
-       "      <td>[0, 1, 0, 0, 0, 0]</td>\n",
-       "      <td>0.25</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>[0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....</td>\n",
-       "      <td>KTSQIFLAK</td>\n",
-       "      <td>[0, 1, 0, 0, 0, 0]</td>\n",
-       "      <td>0.28</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>[0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
-       "      <td>SPVGRVTPKEWR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.28</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>[0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
-       "      <td>SHIWPEYCSRALR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>[0.003340655605539741, 0.0, 0.0, 0.00303169307...</td>\n",
-       "      <td>ELESQISELQEDLESERASR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.20</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41587</th>\n",
-       "      <td>[0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
-       "      <td>LKFEEITGVINPALDKYFPSDSGVR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41588</th>\n",
-       "      <td>[0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...</td>\n",
-       "      <td>AYVGLERFLAGLRDY</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.35</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41589</th>\n",
-       "      <td>[0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...</td>\n",
-       "      <td>AACLLTKWTAGR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.23</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41590</th>\n",
-       "      <td>[0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...</td>\n",
-       "      <td>SLEKLEIIPASQ</td>\n",
-       "      <td>[1, 0, 0, 0, 0, 0]</td>\n",
-       "      <td>0.30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41591</th>\n",
-       "      <td>[0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936...</td>\n",
-       "      <td>LVSEIDTGTLAQL</td>\n",
-       "      <td>[1, 0, 0, 0, 0, 0]</td>\n",
-       "      <td>0.20</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>41592 rows × 4 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                             intensities  \\\n",
-       "0      [0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....   \n",
-       "1      [0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....   \n",
-       "2      [0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
-       "3      [0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
-       "4      [0.003340655605539741, 0.0, 0.0, 0.00303169307...   \n",
-       "...                                                  ...   \n",
-       "41587  [0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
-       "41588  [0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...   \n",
-       "41589  [0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...   \n",
-       "41590  [0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...   \n",
-       "41591  [0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936...   \n",
-       "\n",
-       "                        sequence precursor_charge_onehot  \\\n",
-       "0                       SVFLTFLR      [0, 1, 0, 0, 0, 0]   \n",
-       "1                      KTSQIFLAK      [0, 1, 0, 0, 0, 0]   \n",
-       "2                   SPVGRVTPKEWR      [0, 0, 1, 0, 0, 0]   \n",
-       "3                  SHIWPEYCSRALR      [0, 0, 1, 0, 0, 0]   \n",
-       "4           ELESQISELQEDLESERASR      [0, 0, 1, 0, 0, 0]   \n",
-       "...                          ...                     ...   \n",
-       "41587  LKFEEITGVINPALDKYFPSDSGVR      [0, 0, 1, 0, 0, 0]   \n",
-       "41588            AYVGLERFLAGLRDY      [0, 0, 1, 0, 0, 0]   \n",
-       "41589               AACLLTKWTAGR      [0, 0, 1, 0, 0, 0]   \n",
-       "41590               SLEKLEIIPASQ      [1, 0, 0, 0, 0, 0]   \n",
-       "41591              LVSEIDTGTLAQL      [1, 0, 0, 0, 0, 0]   \n",
-       "\n",
-       "       collision_energy_aligned_normed  \n",
-       "0                                 0.25  \n",
-       "1                                 0.28  \n",
-       "2                                 0.28  \n",
-       "3                                 0.30  \n",
-       "4                                 0.20  \n",
-       "...                                ...  \n",
-       "41587                             0.30  \n",
-       "41588                             0.35  \n",
-       "41589                             0.23  \n",
-       "41590                             0.30  \n",
-       "41591                             0.20  \n",
-       "\n",
-       "[41592 rows x 4 columns]"
-      ]
-     },
-     "execution_count": 64,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df = pd.read_parquet(input_test_file)\n",
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 65,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th>intensities</th>\n",
-       "      <th>sequence</th>\n",
-       "      <th>precursor_charge_onehot</th>\n",
-       "      <th>collision_energy_aligned_normed</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>dataset</th>\n",
-       "      <th>index</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"5\" valign=\"top\">1</th>\n",
-       "      <th>0</th>\n",
-       "      <td>[0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....</td>\n",
-       "      <td>SVFLTFLR</td>\n",
-       "      <td>[0, 1, 0, 0, 0, 0]</td>\n",
-       "      <td>0.25</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>[0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....</td>\n",
-       "      <td>KTSQIFLAK</td>\n",
-       "      <td>[0, 1, 0, 0, 0, 0]</td>\n",
-       "      <td>0.28</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>[0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
-       "      <td>SPVGRVTPKEWR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.28</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>[0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
-       "      <td>SHIWPEYCSRALR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>[0.003340655605539741, 0.0, 0.0, 0.00303169307...</td>\n",
-       "      <td>ELESQISELQEDLESERASR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.20</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"5\" valign=\"top\">2</th>\n",
-       "      <th>41587</th>\n",
-       "      <td>[0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
-       "      <td>LKFEEITGVINPALDKYFPSDSGVR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41588</th>\n",
-       "      <td>[0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...</td>\n",
-       "      <td>AYVGLERFLAGLRDY</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.35</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41589</th>\n",
-       "      <td>[0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...</td>\n",
-       "      <td>AACLLTKWTAGR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.23</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41590</th>\n",
-       "      <td>[0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...</td>\n",
-       "      <td>SLEKLEIIPASQ</td>\n",
-       "      <td>[1, 0, 0, 0, 0, 0]</td>\n",
-       "      <td>0.30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41591</th>\n",
-       "      <td>[0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936...</td>\n",
-       "      <td>LVSEIDTGTLAQL</td>\n",
-       "      <td>[1, 0, 0, 0, 0, 0]</td>\n",
-       "      <td>0.20</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>83184 rows × 4 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                     intensities  \\\n",
-       "dataset index                                                      \n",
-       "1       0      [0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....   \n",
-       "        1      [0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....   \n",
-       "        2      [0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
-       "        3      [0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
-       "        4      [0.003340655605539741, 0.0, 0.0, 0.00303169307...   \n",
-       "...                                                          ...   \n",
-       "2       41587  [0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
-       "        41588  [0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...   \n",
-       "        41589  [0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...   \n",
-       "        41590  [0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...   \n",
-       "        41591  [0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936...   \n",
-       "\n",
-       "                                sequence precursor_charge_onehot  \\\n",
-       "dataset index                                                      \n",
-       "1       0                       SVFLTFLR      [0, 1, 0, 0, 0, 0]   \n",
-       "        1                      KTSQIFLAK      [0, 1, 0, 0, 0, 0]   \n",
-       "        2                   SPVGRVTPKEWR      [0, 0, 1, 0, 0, 0]   \n",
-       "        3                  SHIWPEYCSRALR      [0, 0, 1, 0, 0, 0]   \n",
-       "        4           ELESQISELQEDLESERASR      [0, 0, 1, 0, 0, 0]   \n",
-       "...                                  ...                     ...   \n",
-       "2       41587  LKFEEITGVINPALDKYFPSDSGVR      [0, 0, 1, 0, 0, 0]   \n",
-       "        41588            AYVGLERFLAGLRDY      [0, 0, 1, 0, 0, 0]   \n",
-       "        41589               AACLLTKWTAGR      [0, 0, 1, 0, 0, 0]   \n",
-       "        41590               SLEKLEIIPASQ      [1, 0, 0, 0, 0, 0]   \n",
-       "        41591              LVSEIDTGTLAQL      [1, 0, 0, 0, 0, 0]   \n",
-       "\n",
-       "               collision_energy_aligned_normed  \n",
-       "dataset index                                   \n",
-       "1       0                                 0.25  \n",
-       "        1                                 0.28  \n",
-       "        2                                 0.28  \n",
-       "        3                                 0.30  \n",
-       "        4                                 0.20  \n",
-       "...                                        ...  \n",
-       "2       41587                             0.30  \n",
-       "        41588                             0.35  \n",
-       "        41589                             0.23  \n",
-       "        41590                             0.30  \n",
-       "        41591                             0.20  \n",
-       "\n",
-       "[83184 rows x 4 columns]"
-      ]
-     },
-     "execution_count": 65,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "table = pd.concat([df, df], keys=[\"1\", \"2\"], names=[\"dataset\", \"index\"])\n",
-    "table"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 66,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "out_dir = Path.cwd() / \"test\"\n",
-    "out_dir.mkdir(exist_ok=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 77,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pq.write_to_dataset(\n",
-    "    pa.Table.from_pandas(table), root_path=out_dir, partition_cols=[\"dataset\"], existing_data_behavior=\"delete_matching\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 78,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = pq.ParquetDataset(out_dir, filters=[(\"dataset\", \"=\", 1)])\n",
-    "read_df = dataset.read().to_pandas()\n",
-    "read_df = read_df.reset_index(level=0, drop=True).rename_axis(None)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 85,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>intensities</th>\n",
-       "      <th>sequence</th>\n",
-       "      <th>precursor_charge_onehot</th>\n",
-       "      <th>collision_energy_aligned_normed</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>[0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....</td>\n",
-       "      <td>SVFLTFLR</td>\n",
-       "      <td>[0, 1, 0, 0, 0, 0]</td>\n",
-       "      <td>0.25</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>[0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....</td>\n",
-       "      <td>KTSQIFLAK</td>\n",
-       "      <td>[0, 1, 0, 0, 0, 0]</td>\n",
-       "      <td>0.28</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>[0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
-       "      <td>SPVGRVTPKEWR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.28</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>[0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
-       "      <td>SHIWPEYCSRALR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>[0.003340655605539741, 0.0, 0.0, 0.00303169307...</td>\n",
-       "      <td>ELESQISELQEDLESERASR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.20</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41587</th>\n",
-       "      <td>[0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...</td>\n",
-       "      <td>LKFEEITGVINPALDKYFPSDSGVR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41588</th>\n",
-       "      <td>[0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...</td>\n",
-       "      <td>AYVGLERFLAGLRDY</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.35</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41589</th>\n",
-       "      <td>[0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...</td>\n",
-       "      <td>AACLLTKWTAGR</td>\n",
-       "      <td>[0, 0, 1, 0, 0, 0]</td>\n",
-       "      <td>0.23</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41590</th>\n",
-       "      <td>[0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...</td>\n",
-       "      <td>SLEKLEIIPASQ</td>\n",
-       "      <td>[1, 0, 0, 0, 0, 0]</td>\n",
-       "      <td>0.30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41591</th>\n",
-       "      <td>[0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936...</td>\n",
-       "      <td>LVSEIDTGTLAQL</td>\n",
-       "      <td>[1, 0, 0, 0, 0, 0]</td>\n",
-       "      <td>0.20</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>41592 rows × 4 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                             intensities  \\\n",
-       "0      [0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....   \n",
-       "1      [0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....   \n",
-       "2      [0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
-       "3      [0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
-       "4      [0.003340655605539741, 0.0, 0.0, 0.00303169307...   \n",
-       "...                                                  ...   \n",
-       "41587  [0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...   \n",
-       "41588  [0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...   \n",
-       "41589  [0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...   \n",
-       "41590  [0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...   \n",
-       "41591  [0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936...   \n",
-       "\n",
-       "                        sequence precursor_charge_onehot  \\\n",
-       "0                       SVFLTFLR      [0, 1, 0, 0, 0, 0]   \n",
-       "1                      KTSQIFLAK      [0, 1, 0, 0, 0, 0]   \n",
-       "2                   SPVGRVTPKEWR      [0, 0, 1, 0, 0, 0]   \n",
-       "3                  SHIWPEYCSRALR      [0, 0, 1, 0, 0, 0]   \n",
-       "4           ELESQISELQEDLESERASR      [0, 0, 1, 0, 0, 0]   \n",
-       "...                          ...                     ...   \n",
-       "41587  LKFEEITGVINPALDKYFPSDSGVR      [0, 0, 1, 0, 0, 0]   \n",
-       "41588            AYVGLERFLAGLRDY      [0, 0, 1, 0, 0, 0]   \n",
-       "41589               AACLLTKWTAGR      [0, 0, 1, 0, 0, 0]   \n",
-       "41590               SLEKLEIIPASQ      [1, 0, 0, 0, 0, 0]   \n",
-       "41591              LVSEIDTGTLAQL      [1, 0, 0, 0, 0, 0]   \n",
-       "\n",
-       "       collision_energy_aligned_normed  \n",
-       "0                                 0.25  \n",
-       "1                                 0.28  \n",
-       "2                                 0.28  \n",
-       "3                                 0.30  \n",
-       "4                                 0.20  \n",
-       "...                                ...  \n",
-       "41587                             0.30  \n",
-       "41588                             0.35  \n",
-       "41589                             0.23  \n",
-       "41590                             0.30  \n",
-       "41591                             0.20  \n",
-       "\n",
-       "[41592 rows x 4 columns]"
-      ]
-     },
-     "execution_count": 85,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "read_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 80,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 80,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.equals(read_df)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}