From 13606bd0883bbd48a43c784ded96c929224a165a Mon Sep 17 00:00:00 2001 From: Julius Schlensok Date: Sun, 7 Jul 2024 12:06:48 +0000 Subject: [PATCH] chore: delete erroneously included notebook --- spectrum_io/file/parquet.ipynb | 902 --------------------------------- 1 file changed, 902 deletions(-) delete mode 100644 spectrum_io/file/parquet.ipynb diff --git a/spectrum_io/file/parquet.ipynb b/spectrum_io/file/parquet.ipynb deleted file mode 100644 index 5d63ee5..0000000 --- a/spectrum_io/file/parquet.ipynb +++ /dev/null @@ -1,902 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from typing import Union\n", - "\n", - "import datasets\n", - "import pandas as pd\n", - "import pyarrow as pa\n", - "import pyarrow.feather as feather\n", - "import pyarrow.parquet as pq\n", - "import scipy\n", - "\n", - "from parquet import *" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "root_path = Path(\"/cmnfs/proj/prosit_astral/datasets/parquet\")\n", - "train_path = root_path / \"train\"" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "385ec5beebab46e6933deda6d071ca39", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Resolving data files: 0%| | 0/78 [00:00\u001b[39m max_shard_size:\n\u001b[0;32m-> 1997\u001b[0m num_examples, num_bytes \u001b[39m=\u001b[39m writer\u001b[39m.\u001b[39;49mfinalize()\n\u001b[1;32m 1998\u001b[0m writer\u001b[39m.\u001b[39mclose()\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/arrow_writer.py:607\u001b[0m, in \u001b[0;36mArrowWriter.finalize\u001b[0;34m(self, close_stream)\u001b[0m\n\u001b[1;32m 606\u001b[0m \u001b[39mif\u001b[39;00m close_stream:\n\u001b[0;32m--> 607\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstream\u001b[39m.\u001b[39;49mclose()\n\u001b[1;32m 608\u001b[0m \u001b[39melse\u001b[39;00m:\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/fsspec/implementations/local.py:407\u001b[0m, in \u001b[0;36mLocalFileOpener.close\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mclose\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m--> 407\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mf\u001b[39m.\u001b[39;49mclose()\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:2027\u001b[0m, in \u001b[0;36mArrowBasedBuilder._prepare_split_single\u001b[0;34m(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)\u001b[0m\n\u001b[1;32m 2026\u001b[0m num_shards \u001b[39m=\u001b[39m shard_id \u001b[39m+\u001b[39m \u001b[39m1\u001b[39m\n\u001b[0;32m-> 2027\u001b[0m num_examples, num_bytes \u001b[39m=\u001b[39m writer\u001b[39m.\u001b[39;49mfinalize()\n\u001b[1;32m 2028\u001b[0m writer\u001b[39m.\u001b[39mclose()\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/arrow_writer.py:602\u001b[0m, in \u001b[0;36mArrowWriter.finalize\u001b[0;34m(self, close_stream)\u001b[0m\n\u001b[1;32m 601\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpa_writer \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mschema:\n\u001b[0;32m--> 602\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_build_writer(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mschema)\n\u001b[1;32m 603\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpa_writer \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/arrow_writer.py:404\u001b[0m, in \u001b[0;36mArrowWriter._build_writer\u001b[0;34m(self, inferred_schema)\u001b[0m\n\u001b[1;32m 403\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_schema \u001b[39m=\u001b[39m schema\n\u001b[0;32m--> 404\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpa_writer \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_WRITER_CLASS(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstream, schema)\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/ipc.py:85\u001b[0m, in \u001b[0;36mRecordBatchStreamWriter.__init__\u001b[0;34m(self, sink, schema, use_legacy_format, options)\u001b[0m\n\u001b[1;32m 84\u001b[0m options \u001b[39m=\u001b[39m _get_legacy_format_default(use_legacy_format, options)\n\u001b[0;32m---> 85\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_open(sink, schema, options\u001b[39m=\u001b[39;49moptions)\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/ipc.pxi:582\u001b[0m, in \u001b[0;36mpyarrow.lib._RecordBatchStreamWriter._open\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/io.pxi:2097\u001b[0m, in \u001b[0;36mpyarrow.lib.get_writer\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/io.pxi:232\u001b[0m, in \u001b[0;36mpyarrow.lib.NativeFile.get_output_stream\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/io.pxi:246\u001b[0m, in \u001b[0;36mpyarrow.lib.NativeFile._assert_writable\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/pyarrow/io.pxi:237\u001b[0m, in \u001b[0;36mpyarrow.lib.NativeFile._assert_open\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: I/O operation on closed file", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mDatasetGenerationError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/cmnfs/home/students/j.schlensok/spectrum_io/spectrum_io/file/parquet.ipynb Cell 3\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m ds \u001b[39m=\u001b[39m datasets\u001b[39m.\u001b[39;49mload_dataset(\u001b[39mstr\u001b[39;49m(root_path))\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/load.py:2609\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[1;32m 2606\u001b[0m \u001b[39mreturn\u001b[39;00m builder_instance\u001b[39m.\u001b[39mas_streaming_dataset(split\u001b[39m=\u001b[39msplit)\n\u001b[1;32m 2608\u001b[0m \u001b[39m# Download and prepare data\u001b[39;00m\n\u001b[0;32m-> 2609\u001b[0m builder_instance\u001b[39m.\u001b[39;49mdownload_and_prepare(\n\u001b[1;32m 2610\u001b[0m download_config\u001b[39m=\u001b[39;49mdownload_config,\n\u001b[1;32m 2611\u001b[0m download_mode\u001b[39m=\u001b[39;49mdownload_mode,\n\u001b[1;32m 2612\u001b[0m verification_mode\u001b[39m=\u001b[39;49mverification_mode,\n\u001b[1;32m 2613\u001b[0m num_proc\u001b[39m=\u001b[39;49mnum_proc,\n\u001b[1;32m 2614\u001b[0m storage_options\u001b[39m=\u001b[39;49mstorage_options,\n\u001b[1;32m 2615\u001b[0m )\n\u001b[1;32m 2617\u001b[0m \u001b[39m# Build dataset for splits\u001b[39;00m\n\u001b[1;32m 2618\u001b[0m keep_in_memory \u001b[39m=\u001b[39m (\n\u001b[1;32m 2619\u001b[0m keep_in_memory \u001b[39mif\u001b[39;00m keep_in_memory \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m is_small_dataset(builder_instance\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size)\n\u001b[1;32m 2620\u001b[0m )\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:1027\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[0;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m 1025\u001b[0m \u001b[39mif\u001b[39;00m num_proc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 1026\u001b[0m prepare_split_kwargs[\u001b[39m\"\u001b[39m\u001b[39mnum_proc\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m num_proc\n\u001b[0;32m-> 1027\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_download_and_prepare(\n\u001b[1;32m 1028\u001b[0m dl_manager\u001b[39m=\u001b[39;49mdl_manager,\n\u001b[1;32m 1029\u001b[0m verification_mode\u001b[39m=\u001b[39;49mverification_mode,\n\u001b[1;32m 1030\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mprepare_split_kwargs,\n\u001b[1;32m 1031\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mdownload_and_prepare_kwargs,\n\u001b[1;32m 1032\u001b[0m )\n\u001b[1;32m 1033\u001b[0m \u001b[39m# Sync info\u001b[39;00m\n\u001b[1;32m 1034\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39mdataset_size \u001b[39m=\u001b[39m \u001b[39msum\u001b[39m(split\u001b[39m.\u001b[39mnum_bytes \u001b[39mfor\u001b[39;00m split \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfo\u001b[39m.\u001b[39msplits\u001b[39m.\u001b[39mvalues())\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:1122\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[0;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m 1118\u001b[0m split_dict\u001b[39m.\u001b[39madd(split_generator\u001b[39m.\u001b[39msplit_info)\n\u001b[1;32m 1120\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 1121\u001b[0m \u001b[39m# Prepare split will record examples associated to the split\u001b[39;00m\n\u001b[0;32m-> 1122\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_prepare_split(split_generator, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mprepare_split_kwargs)\n\u001b[1;32m 1123\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mOSError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 1124\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mOSError\u001b[39;00m(\n\u001b[1;32m 1125\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mCannot find data file. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 1126\u001b[0m \u001b[39m+\u001b[39m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmanual_download_instructions \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 1127\u001b[0m \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mOriginal error:\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 1128\u001b[0m \u001b[39m+\u001b[39m \u001b[39mstr\u001b[39m(e)\n\u001b[1;32m 1129\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39;00m\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:1882\u001b[0m, in \u001b[0;36mArrowBasedBuilder._prepare_split\u001b[0;34m(self, split_generator, file_format, num_proc, max_shard_size)\u001b[0m\n\u001b[1;32m 1880\u001b[0m job_id \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m\n\u001b[1;32m 1881\u001b[0m \u001b[39mwith\u001b[39;00m pbar:\n\u001b[0;32m-> 1882\u001b[0m \u001b[39mfor\u001b[39;00m job_id, done, content \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_prepare_split_single(\n\u001b[1;32m 1883\u001b[0m gen_kwargs\u001b[39m=\u001b[39mgen_kwargs, job_id\u001b[39m=\u001b[39mjob_id, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m_prepare_split_args\n\u001b[1;32m 1884\u001b[0m ):\n\u001b[1;32m 1885\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 1886\u001b[0m result \u001b[39m=\u001b[39m content\n", - "File \u001b[0;32m~/micromamba/envs/spectrum-io/lib/python3.10/site-packages/datasets/builder.py:2038\u001b[0m, in \u001b[0;36mArrowBasedBuilder._prepare_split_single\u001b[0;34m(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)\u001b[0m\n\u001b[1;32m 2036\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(e, DatasetGenerationError):\n\u001b[1;32m 2037\u001b[0m \u001b[39mraise\u001b[39;00m\n\u001b[0;32m-> 2038\u001b[0m \u001b[39mraise\u001b[39;00m DatasetGenerationError(\u001b[39m\"\u001b[39m\u001b[39mAn error occurred while generating the dataset\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 2040\u001b[0m \u001b[39myield\u001b[39;00m job_id, \u001b[39mTrue\u001b[39;00m, (total_num_examples, total_num_bytes, writer\u001b[39m.\u001b[39m_features, num_shards, shard_lengths)\n", - "\u001b[0;31mDatasetGenerationError\u001b[0m: An error occurred while generating the dataset" - ] - } - ], - "source": [ - "ds = datasets.load_dataset(str(root_path))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "input_test_file = Path.cwd().parent.parent.parent / \"oktoberfest/data/intensity_data.parquet\"\n", - "raw_data = {\n", - " \"intensities\": [\n", - " [4e-5, 0., -1., 0., 0., -1., 0.03, 0., -1., 0.4],\n", - " [.3, 0., -1., 1., 0., -1., 0.4, 0., -1., 0.05],\n", - " [.04, 0., 0., 0., 0., 0., 2e-3, 0., 0., .13]\n", - " ],\n", - " \"sequence\": [\"SVFLTFLR\", \"KTSQIFLAK\", \"SPVGRVTPKEWR\"],\n", - " \"precursor_charge_onehot\": [\n", - " [0, 1, 0, 0, 0, 0],\n", - " [0, 1, 0, 0, 0, 0],\n", - " [0, 0, 1, 0, 0, 0],\n", - " ],\n", - " \"collision_energy_normed\": [.25, .28, .28]\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "output_path = Path.cwd() / \"temp\"\n", - "output_path.mkdir(exist_ok=True)\n", - "\n", - "df = pd.DataFrame(raw_data)\n", - "\n", - "#df2 = pd.concat([df, df], keys=['1', '2'], names=[\"dataset\", \"index\"])\n", - "df2 = pd.concat([df.assign(dataset='1'), df.assign(dataset='2')])\n", - "table = pa.Table.from_pandas(df2)\n", - "\n", - "pq.write_to_dataset(\n", - " table,\n", - " root_path=output_path,\n", - " partition_cols=[\"dataset\"],\n", - " existing_data_behavior=\"delete_matching\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "intensities: list\n", - " child 0, item: double\n", - "sequence: string\n", - "precursor_charge_onehot: list\n", - " child 0, item: int64\n", - "collision_energy_normed: double\n", - "dataset: string\n", - "__index_level_0__: int64\n", - "----\n", - "intensities: [[[0.00004,0,-1,0,0,-1,0.03,0,-1,0.4],[0.3,0,-1,1,0,-1,0.4,0,-1,0.05],...,[0.3,0,-1,1,0,-1,0.4,0,-1,0.05],[0.04,0,0,0,0,0,0.002,0,0,0.13]]]\n", - "sequence: [[\"SVFLTFLR\",\"KTSQIFLAK\",\"SPVGRVTPKEWR\",\"SVFLTFLR\",\"KTSQIFLAK\",\"SPVGRVTPKEWR\"]]\n", - "precursor_charge_onehot: [[[0,1,0,0,0,0],[0,1,0,0,0,0],...,[0,1,0,0,0,0],[0,0,1,0,0,0]]]\n", - "collision_energy_normed: [[0.25,0.28,0.28,0.25,0.28,0.28]]\n", - "dataset: [[\"1\",\"1\",\"1\",\"2\",\"2\",\"2\"]]\n", - "__index_level_0__: [[0,1,2,0,1,2]]" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
intensitiessequenceprecursor_charge_onehotcollision_energy_normed
0[4e-05, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, ...SVFLTFLR[0, 1, 0, 0, 0, 0]0.25
1[0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1....KTSQIFLAK[0, 1, 0, 0, 0, 0]0.28
2[0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0....SPVGRVTPKEWR[0, 0, 1, 0, 0, 0]0.28
\n", - "
" - ], - "text/plain": [ - " intensities sequence \\\n", - "0 [4e-05, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, ... SVFLTFLR \n", - "1 [0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1.... KTSQIFLAK \n", - "2 [0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002, 0.0, 0.... SPVGRVTPKEWR \n", - "\n", - " precursor_charge_onehot collision_energy_normed \n", - "0 [0, 1, 0, 0, 0, 0] 0.25 \n", - "1 [0, 1, 0, 0, 0, 0] 0.28 \n", - "2 [0, 0, 1, 0, 0, 0] 0.28 " - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset = pq.ParquetDataset(output_path, filters=[(\"dataset\", \"=\", '1')])\n", - "df = dataset.read().to_pandas().drop(\"dataset\", axis=1)\n", - "df#.to_pandas()\n", - "#read_df = read_partition(output_path, '1')\n", - "#pd.testing.assert_frame_equal(read_df, df)" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
intensitiessequenceprecursor_charge_onehotcollision_energy_aligned_normed
0[0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....SVFLTFLR[0, 1, 0, 0, 0, 0]0.25
1[0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....KTSQIFLAK[0, 1, 0, 0, 0, 0]0.28
2[0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...SPVGRVTPKEWR[0, 0, 1, 0, 0, 0]0.28
3[0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...SHIWPEYCSRALR[0, 0, 1, 0, 0, 0]0.30
4[0.003340655605539741, 0.0, 0.0, 0.00303169307...ELESQISELQEDLESERASR[0, 0, 1, 0, 0, 0]0.20
...............
41587[0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...LKFEEITGVINPALDKYFPSDSGVR[0, 0, 1, 0, 0, 0]0.30
41588[0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...AYVGLERFLAGLRDY[0, 0, 1, 0, 0, 0]0.35
41589[0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...AACLLTKWTAGR[0, 0, 1, 0, 0, 0]0.23
41590[0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...SLEKLEIIPASQ[1, 0, 0, 0, 0, 0]0.30
41591[0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936...LVSEIDTGTLAQL[1, 0, 0, 0, 0, 0]0.20
\n", - "

41592 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " intensities \\\n", - "0 [0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1.... \n", - "1 [0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1.... \n", - "2 [0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,... \n", - "3 [0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,... \n", - "4 [0.003340655605539741, 0.0, 0.0, 0.00303169307... \n", - "... ... \n", - "41587 [0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,... \n", - "41588 [0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0... \n", - "41589 [0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0... \n", - "41590 [0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ... \n", - "41591 [0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936... \n", - "\n", - " sequence precursor_charge_onehot \\\n", - "0 SVFLTFLR [0, 1, 0, 0, 0, 0] \n", - "1 KTSQIFLAK [0, 1, 0, 0, 0, 0] \n", - "2 SPVGRVTPKEWR [0, 0, 1, 0, 0, 0] \n", - "3 SHIWPEYCSRALR [0, 0, 1, 0, 0, 0] \n", - "4 ELESQISELQEDLESERASR [0, 0, 1, 0, 0, 0] \n", - "... ... ... \n", - "41587 LKFEEITGVINPALDKYFPSDSGVR [0, 0, 1, 0, 0, 0] \n", - "41588 AYVGLERFLAGLRDY [0, 0, 1, 0, 0, 0] \n", - "41589 AACLLTKWTAGR [0, 0, 1, 0, 0, 0] \n", - "41590 SLEKLEIIPASQ [1, 0, 0, 0, 0, 0] \n", - "41591 LVSEIDTGTLAQL [1, 0, 0, 0, 0, 0] \n", - "\n", - " collision_energy_aligned_normed \n", - "0 0.25 \n", - "1 0.28 \n", - "2 0.28 \n", - "3 0.30 \n", - "4 0.20 \n", - "... ... \n", - "41587 0.30 \n", - "41588 0.35 \n", - "41589 0.23 \n", - "41590 0.30 \n", - "41591 0.20 \n", - "\n", - "[41592 rows x 4 columns]" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_parquet(input_test_file)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
intensitiessequenceprecursor_charge_onehotcollision_energy_aligned_normed
datasetindex
10[0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....SVFLTFLR[0, 1, 0, 0, 0, 0]0.25
1[0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....KTSQIFLAK[0, 1, 0, 0, 0, 0]0.28
2[0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...SPVGRVTPKEWR[0, 0, 1, 0, 0, 0]0.28
3[0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...SHIWPEYCSRALR[0, 0, 1, 0, 0, 0]0.30
4[0.003340655605539741, 0.0, 0.0, 0.00303169307...ELESQISELQEDLESERASR[0, 0, 1, 0, 0, 0]0.20
..................
241587[0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...LKFEEITGVINPALDKYFPSDSGVR[0, 0, 1, 0, 0, 0]0.30
41588[0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...AYVGLERFLAGLRDY[0, 0, 1, 0, 0, 0]0.35
41589[0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...AACLLTKWTAGR[0, 0, 1, 0, 0, 0]0.23
41590[0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...SLEKLEIIPASQ[1, 0, 0, 0, 0, 0]0.30
41591[0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936...LVSEIDTGTLAQL[1, 0, 0, 0, 0, 0]0.20
\n", - "

83184 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " intensities \\\n", - "dataset index \n", - "1 0 [0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1.... \n", - " 1 [0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1.... \n", - " 2 [0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,... \n", - " 3 [0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,... \n", - " 4 [0.003340655605539741, 0.0, 0.0, 0.00303169307... \n", - "... ... \n", - "2 41587 [0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,... \n", - " 41588 [0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0... \n", - " 41589 [0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0... \n", - " 41590 [0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ... \n", - " 41591 [0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936... \n", - "\n", - " sequence precursor_charge_onehot \\\n", - "dataset index \n", - "1 0 SVFLTFLR [0, 1, 0, 0, 0, 0] \n", - " 1 KTSQIFLAK [0, 1, 0, 0, 0, 0] \n", - " 2 SPVGRVTPKEWR [0, 0, 1, 0, 0, 0] \n", - " 3 SHIWPEYCSRALR [0, 0, 1, 0, 0, 0] \n", - " 4 ELESQISELQEDLESERASR [0, 0, 1, 0, 0, 0] \n", - "... ... ... \n", - "2 41587 LKFEEITGVINPALDKYFPSDSGVR [0, 0, 1, 0, 0, 0] \n", - " 41588 AYVGLERFLAGLRDY [0, 0, 1, 0, 0, 0] \n", - " 41589 AACLLTKWTAGR [0, 0, 1, 0, 0, 0] \n", - " 41590 SLEKLEIIPASQ [1, 0, 0, 0, 0, 0] \n", - " 41591 LVSEIDTGTLAQL [1, 0, 0, 0, 0, 0] \n", - "\n", - " collision_energy_aligned_normed \n", - "dataset index \n", - "1 0 0.25 \n", - " 1 0.28 \n", - " 2 0.28 \n", - " 3 0.30 \n", - " 4 0.20 \n", - "... ... \n", - "2 41587 0.30 \n", - " 41588 0.35 \n", - " 41589 0.23 \n", - " 41590 0.30 \n", - " 41591 0.20 \n", - "\n", - "[83184 rows x 4 columns]" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table = pd.concat([df, df], keys=[\"1\", \"2\"], names=[\"dataset\", \"index\"])\n", - "table" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "out_dir = Path.cwd() / \"test\"\n", - "out_dir.mkdir(exist_ok=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [], - "source": [ - "pq.write_to_dataset(\n", - " pa.Table.from_pandas(table), root_path=out_dir, partition_cols=[\"dataset\"], existing_data_behavior=\"delete_matching\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = pq.ParquetDataset(out_dir, filters=[(\"dataset\", \"=\", 1)])\n", - "read_df = dataset.read().to_pandas()\n", - "read_df = read_df.reset_index(level=0, drop=True).rename_axis(None)" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
intensitiessequenceprecursor_charge_onehotcollision_energy_aligned_normed
0[0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1....SVFLTFLR[0, 1, 0, 0, 0, 0]0.25
1[0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1....KTSQIFLAK[0, 1, 0, 0, 0, 0]0.28
2[0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,...SPVGRVTPKEWR[0, 0, 1, 0, 0, 0]0.28
3[0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,...SHIWPEYCSRALR[0, 0, 1, 0, 0, 0]0.30
4[0.003340655605539741, 0.0, 0.0, 0.00303169307...ELESQISELQEDLESERASR[0, 0, 1, 0, 0, 0]0.20
...............
41587[0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,...LKFEEITGVINPALDKYFPSDSGVR[0, 0, 1, 0, 0, 0]0.30
41588[0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0...AYVGLERFLAGLRDY[0, 0, 1, 0, 0, 0]0.35
41589[0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0...AACLLTKWTAGR[0, 0, 1, 0, 0, 0]0.23
41590[0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ...SLEKLEIIPASQ[1, 0, 0, 0, 0, 0]0.30
41591[0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936...LVSEIDTGTLAQL[1, 0, 0, 0, 0, 0]0.20
\n", - "

41592 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " intensities \\\n", - "0 [0.03713018032121684, 0.0, -1.0, 0.0, 0.0, -1.... \n", - "1 [0.32880081359926777, 0.0, -1.0, 1.0, 0.0, -1.... \n", - "2 [0.03919235848040409, 0.0, 0.0, 0.0, 0.0, 0.0,... \n", - "3 [0.11537762755556774, 0.0, 0.0, 0.0, 0.0, 0.0,... \n", - "4 [0.003340655605539741, 0.0, 0.0, 0.00303169307... \n", - "... ... \n", - "41587 [0.12310221158139793, 0.0, 0.0, 0.0, 0.0, 0.0,... \n", - "41588 [0.036119027089409034, 0.0, 0.0, 0.0, 0.0, 0.0... \n", - "41589 [0.036547268719584185, 0.0, 0.0, 0.0, 0.0, 0.0... \n", - "41590 [0.053176686541959346, -1.0, -1.0, 0.0, -1.0, ... \n", - "41591 [0.0, -1.0, -1.0, 0.0, -1.0, -1.0, 0.055506936... \n", - "\n", - " sequence precursor_charge_onehot \\\n", - "0 SVFLTFLR [0, 1, 0, 0, 0, 0] \n", - "1 KTSQIFLAK [0, 1, 0, 0, 0, 0] \n", - "2 SPVGRVTPKEWR [0, 0, 1, 0, 0, 0] \n", - "3 SHIWPEYCSRALR [0, 0, 1, 0, 0, 0] \n", - "4 ELESQISELQEDLESERASR [0, 0, 1, 0, 0, 0] \n", - "... ... ... \n", - "41587 LKFEEITGVINPALDKYFPSDSGVR [0, 0, 1, 0, 0, 0] \n", - "41588 AYVGLERFLAGLRDY [0, 0, 1, 0, 0, 0] \n", - "41589 AACLLTKWTAGR [0, 0, 1, 0, 0, 0] \n", - "41590 SLEKLEIIPASQ [1, 0, 0, 0, 0, 0] \n", - "41591 LVSEIDTGTLAQL [1, 0, 0, 0, 0, 0] \n", - "\n", - " collision_energy_aligned_normed \n", - "0 0.25 \n", - "1 0.28 \n", - "2 0.28 \n", - "3 0.30 \n", - "4 0.20 \n", - "... ... \n", - "41587 0.30 \n", - "41588 0.35 \n", - "41589 0.23 \n", - "41590 0.30 \n", - "41591 0.20 \n", - "\n", - "[41592 rows x 4 columns]" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "read_df" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.equals(read_df)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}