diff --git a/LICENSE b/LICENSE deleted file mode 100644 index ab2fb0a..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2019 ml-mipt - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/Natural Language Processing exam program MSAI 21f .pdf b/Natural Language Processing exam program MSAI 21f .pdf deleted file mode 100644 index ff5d6ac..0000000 Binary files a/Natural Language Processing exam program MSAI 21f .pdf and /dev/null differ diff --git a/homeworks/assignment01_three_headed_network/README.md b/homeworks/assignment01_three_headed_network/README.md deleted file mode 100644 index 937d25a..0000000 --- a/homeworks/assignment01_three_headed_network/README.md +++ /dev/null @@ -1,2 +0,0 @@ -Assignment on more complex network: -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/homeworks/assignment01_three_headed_network/assignment01_three_headed_network.ipynb) diff --git a/homeworks/assignment01_three_headed_network/assignment01_three_headed_network.ipynb b/homeworks/assignment01_three_headed_network/assignment01_three_headed_network.ipynb deleted file mode 100644 index 8d45b31..0000000 --- a/homeworks/assignment01_three_headed_network/assignment01_three_headed_network.ipynb +++ /dev/null @@ -1,906 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "13pL--6rycN3" - }, - "source": [ - "## Homework01: Three headed network in PyTorch\n", - "\n", - "This notebook accompanies the [week02](https://github.com/girafe-ai/natural-language-processing/tree/master/week02_cnn_for_texts) practice session. Refer to that notebook for more comments.\n", - "\n", - "All the preprocessing is the same as in the classwork. 
*Including the data leakage in the train test split (it's still for bonus points).*" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "P8zS7m-gycN5" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "\n", - "import nltk\n", - "import tqdm\n", - "from collections import Counter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you have already downloaded the data on the Seminar, simply run through the next cells. Otherwise uncomment the next cell (and comment the another one ;)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# uncomment and run this cell, if you don't have data locally yet.\n", - "\n", - "# !curl -L \"https://www.dropbox.com/s/5msc5ix7ndyba10/Train_rev1.csv.tar.gz?dl=1\" -o Train_rev1.csv.tar.gz\n", - "# !tar -xvzf ./Train_rev1.csv.tar.gz\n", - "\n", - "# data = pd.read_csv(\"./Train_rev1.csv\", index_col=None)\n", - "\n", - "# wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/advanced_f20/homeworks_advanced/assignment1_02_Three_headed_network/network.py" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 143 - }, - "colab_type": "code", - "id": "vwN72gd4ycOA", - "outputId": "7b9e8549-3128-4041-c4be-33fb6f326c78" - }, - "outputs": [], - "source": [ - "# run this cell if you have downloaded the dataset on the seminar\n", - "data = pd.read_csv(\"../../week02_CNN_n_Vanishing_gradient/Train_rev1.csv\", index_col=None)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 265 - }, - "colab_type": "code", - "id": "UuuKIKfrycOH", - "outputId": "e5de0f94-a4f6-4b51-db80-9d11ddc1db31" - }, - "outputs": [], 
- "source": [ - "data['Log1pSalary'] = np.log1p(data['SalaryNormalized']).astype('float32')\n", - "text_columns = [\"Title\", \"FullDescription\"]\n", - "categorical_columns = [\"Category\", \"Company\", \"LocationNormalized\", \"ContractType\", \"ContractTime\"]\n", - "target_column = \"Log1pSalary\"\n", - "\n", - "data[categorical_columns] = data[categorical_columns].fillna('NaN') # cast missing values to string \"NaN\"\n", - "\n", - "data.sample(3)\n", - "\n", - "\n", - "data_for_autotest = data[-5000:]\n", - "data = data[:-5000]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "RUWkpd7PycOQ" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "595it [00:00, 5946.62it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tokenized:\n", - "2 mathematical modeller / simulation analyst / o...\n", - "100002 a successful and high achieving specialist sch...\n", - "200002 web designer html , css , javascript , photosh...\n", - "Name: FullDescription, dtype: object\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "239768it [00:49, 4858.44it/s]\n" - ] - } - ], - "source": [ - "tokenizer = nltk.tokenize.WordPunctTokenizer()\n", - "# see task above\n", - "def normalize(text):\n", - " text = str(text).lower()\n", - " return ' '.join(tokenizer.tokenize(text))\n", - " \n", - "data[text_columns] = data[text_columns].applymap(normalize)\n", - "\n", - "print(\"Tokenized:\")\n", - "print(data[\"FullDescription\"][2::100000])\n", - "assert data[\"FullDescription\"][2][:50] == 'mathematical modeller / simulation analyst / opera'\n", - "assert data[\"Title\"][54321] == 'international digital account manager ( german )'\n", - "\n", - "# Count how many times does each token occur in both \"Title\" and \"FullDescription\" in total\n", - "# build a dictionary { token -> it's count }\n", - "from collections import Counter\n", - 
"from tqdm import tqdm as tqdm\n", - "\n", - "token_counts = Counter()# \n", - "for _, row in tqdm(data[text_columns].iterrows()):\n", - " for string in row:\n", - " token_counts.update(string.split())\n", - "\n", - "# hint: you may or may not want to use collections.Counter" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2598827" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "token_counts.most_common(1)[0][1]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 215 - }, - "colab_type": "code", - "id": "GiOWbc15ycOb", - "outputId": "1e807140-5513-4af0-d9a9-9f029059a553" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total unique tokens : 201127\n", - "('and', 2598827)\n", - "('.', 2471477)\n", - "(',', 2266256)\n", - "('the', 2036428)\n", - "('to', 1977039)\n", - "...\n", - "('dbms_stats', 1)\n", - "('dbms_output', 1)\n", - "('dbms_job', 1)\n", - "Correct!\n", - "Vocabulary size: 33795\n", - "Correct!\n", - "Correct!\n" - ] - } - ], - "source": [ - "print(\"Total unique tokens :\", len(token_counts))\n", - "print('\\n'.join(map(str, token_counts.most_common(n=5))))\n", - "print('...')\n", - "print('\\n'.join(map(str, token_counts.most_common()[-3:])))\n", - "\n", - "assert token_counts.most_common(1)[0][1] in range(2500000, 2700000)\n", - "assert len(token_counts) in range(200000, 210000)\n", - "print('Correct!')\n", - "\n", - "min_count = 10\n", - "\n", - "# tokens from token_counts keys that had at least min_count occurrences throughout the dataset\n", - "tokens = [token for token, count in token_counts.items() if count >= min_count]# \n", - "# Add a special tokens for unknown and empty words\n", - "UNK, PAD = \"UNK\", \"PAD\"\n", - "tokens = [UNK, PAD] + sorted(tokens)\n", - 
"print(\"Vocabulary size:\", len(tokens))\n", - "\n", - "assert type(tokens) == list\n", - "assert len(tokens) in range(32000, 35000)\n", - "assert 'me' in tokens\n", - "assert UNK in tokens\n", - "print(\"Correct!\")\n", - "\n", - "token_to_id = {token: idx for idx, token in enumerate(tokens)}\n", - "assert isinstance(token_to_id, dict)\n", - "assert len(token_to_id) == len(tokens)\n", - "for tok in tokens:\n", - " assert tokens[token_to_id[tok]] == tok\n", - "\n", - "print(\"Correct!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "JEsLeBjVycOw" - }, - "outputs": [], - "source": [ - "UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])\n", - "\n", - "def as_matrix(sequences, max_len=None):\n", - " \"\"\" Convert a list of tokens into a matrix with padding \"\"\"\n", - " if isinstance(sequences[0], str):\n", - " sequences = list(map(str.split, sequences))\n", - " \n", - " max_len = min(max(map(len, sequences)), max_len or float('inf'))\n", - " \n", - " matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))\n", - " for i,seq in enumerate(sequences):\n", - " row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]\n", - " matrix[i, :len(row_ix)] = row_ix\n", - " \n", - " return matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 179 - }, - "colab_type": "code", - "id": "JiBlPkdKycOy", - "outputId": "3866b444-1e2d-4d79-d429-fecc6d8e02a8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Lines:\n", - "engineering systems analyst\n", - "hr assistant\n", - "senior ec & i engineer\n", - "\n", - "Matrix:\n", - "[[10705 29830 2143 1 1]\n", - " [14875 2817 1 1 1]\n", - " [27345 10107 15 15069 10702]]\n" - ] - } - ], - "source": [ - "print(\"Lines:\")\n", - "print('\\n'.join(data[\"Title\"][::100000].values), end='\\n\\n')\n", - "print(\"Matrix:\")\n", 
- "print(as_matrix(data[\"Title\"][::100000]))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 53 - }, - "colab_type": "code", - "id": "DpOlBp7ZycO6", - "outputId": "30a911f2-7d35-4cb5-8991-60457b1e8bac" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "DictVectorizer(dtype=, separator='=', sort=True,\n", - " sparse=False)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.feature_extraction import DictVectorizer\n", - "\n", - "# we only consider top-1k most frequent companies to minimize memory usage\n", - "top_companies, top_counts = zip(*Counter(data['Company']).most_common(1000))\n", - "recognized_companies = set(top_companies)\n", - "data[\"Company\"] = data[\"Company\"].apply(lambda comp: comp if comp in recognized_companies else \"Other\")\n", - "\n", - "categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False)\n", - "categorical_vectorizer.fit(data[categorical_columns].apply(dict, axis=1))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "yk4jmtAYycO8" - }, - "source": [ - "### The deep learning part\n", - "\n", - "Once we've learned to tokenize the data, let's design a machine learning experiment.\n", - "\n", - "As before, we won't focus too much on validation, opting for a simple train-test split.\n", - "\n", - "__To be completely rigorous,__ we've comitted a small crime here: we used the whole data for tokenization and vocabulary building. A more strict way would be to do that part on training set only. You may want to do that and measure the magnitude of changes.\n", - "\n", - "\n", - "#### Here comes the simple one-headed network from the seminar. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 53 - }, - "colab_type": "code", - "id": "TngLcWA0ycO_", - "outputId": "6731b28c-07b1-41dc-9574-f76b01785bba" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train size = 191814\n", - "Validation size = 47954\n" - ] - } - ], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)\n", - "data_train.index = range(len(data_train))\n", - "data_val.index = range(len(data_val))\n", - "\n", - "print(\"Train size = \", len(data_train))\n", - "print(\"Validation size = \", len(data_val))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "2PXuKgOSycPB" - }, - "outputs": [], - "source": [ - "def make_batch(data, max_len=None, word_dropout=0):\n", - " \"\"\"\n", - " Creates a keras-friendly dict from the batch data.\n", - " :param word_dropout: replaces token index with UNK_IX with this probability\n", - " :returns: a dict with {'title' : int64[batch, title_max_len]\n", - " \"\"\"\n", - " batch = {}\n", - " batch[\"Title\"] = as_matrix(data[\"Title\"].values, max_len)\n", - " batch[\"FullDescription\"] = as_matrix(data[\"FullDescription\"].values, max_len)\n", - " batch['Categorical'] = categorical_vectorizer.transform(data[categorical_columns].apply(dict, axis=1))\n", - " \n", - " if word_dropout != 0:\n", - " batch[\"FullDescription\"] = apply_word_dropout(batch[\"FullDescription\"], 1. 
- word_dropout)\n", - " \n", - " if target_column in data.columns:\n", - " batch[target_column] = data[target_column].values\n", - " \n", - " return batch\n", - "\n", - "def apply_word_dropout(matrix, keep_prop, replace_with=UNK_IX, pad_ix=PAD_IX,):\n", - " dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1 - keep_prop])\n", - " dropout_mask &= matrix != pad_ix\n", - " return np.choose(dropout_mask, [matrix, np.full_like(matrix, replace_with)])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 251 - }, - "colab_type": "code", - "id": "I6LpEQf0ycPD", - "outputId": "e3520cae-fba1-46cc-a216-56287b6e4929" - }, - "outputs": [], - "source": [ - "a = make_batch(data_train[:3], max_len=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But to start with let's build the simple model using only the part of the data. Let's create the baseline solution using only the description part (so it should definetely fit into the Sequential model)." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "from torch import nn\n", - "import torch.nn.functional as F" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# You will need these to make it simple\n", - "\n", - "class Flatten(nn.Module):\n", - " def forward(self, input):\n", - " return input.view(input.size(0), -1)\n", - "\n", - "class Reorder(nn.Module):\n", - " def forward(self, input):\n", - " return input.permute((0, 2, 1))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To generate minibatches we will use simple pyton generator." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "def iterate_minibatches(data, batch_size=256, shuffle=True, cycle=False, **kwargs):\n", - " \"\"\" iterates minibatches of data in random order \"\"\"\n", - " while True:\n", - " indices = np.arange(len(data))\n", - " if shuffle:\n", - " indices = np.random.permutation(indices)\n", - "\n", - " for start in range(0, len(indices), batch_size):\n", - " batch = make_batch(data.iloc[indices[start : start + batch_size]], **kwargs)\n", - " target = batch.pop(target_column)\n", - " yield batch, target\n", - " \n", - " if not cycle: break" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "iterator = iterate_minibatches(data_train, 3)\n", - "batch, target = next(iterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "# Here is some startup code:\n", - "n_tokens=len(tokens)\n", - "n_cat_features=len(categorical_vectorizer.vocabulary_)\n", - "hid_size=64\n", - "simple_model = nn.Sequential()\n", - "\n", - "simple_model.add_module('emb', nn.Embedding(num_embeddings=n_tokens, embedding_dim=hid_size))\n", - "simple_model.add_module('reorder', Reorder())\n", - "simple_model.add_module('conv1', nn.Conv1d(\n", - " in_channels=hid_size,\n", - " out_channels=hid_size,\n", - " kernel_size=2)\n", - " )\n", - "simple_model.add_module('relu1', nn.ReLU())\n", - "simple_model.add_module('adapt_avg_pool', nn.AdaptiveAvgPool1d(output_size=1))\n", - "simple_model.add_module('flatten1', Flatten())\n", - "simple_model.add_module('linear1', nn.Linear(in_features=hid_size, out_features=1))\n", - "# " - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Title': array([[11439, 1467, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1],\n", - " [18664, 7252, 195, 24093, 18670, 12351, 13242, 195, 
12724,\n", - " 195, 10720],\n", - " [26688, 10702, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1]], dtype=int32),\n", - " 'FullDescription': array([[30411, 26324, 33079, ..., 1, 1, 1],\n", - " [18664, 7252, 195, ..., 195, 0, 80],\n", - " [26688, 10702, 10364, ..., 1, 1, 1]], dtype=int32),\n", - " 'Categorical': array([[1., 0., 0., ..., 0., 0., 0.],\n", - " [0., 0., 0., ..., 0., 0., 0.],\n", - " [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)}" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "batch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Remember!__ We are working with regression problem and predicting only one number." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[0.0493],\n", - " [0.1251],\n", - " [0.0742]], grad_fn=)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Try this to check your model. `torch.long` tensors are required for nn.Embedding layers.\n", - "simple_model(torch.tensor(batch['FullDescription'], dtype=torch.long))" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(3, 653)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "batch['FullDescription'].shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now simple training pipeline (it's commented because we've already done that in class. No need to do it again)." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# from IPython.display import clear_output\n", - "# from random import sample\n", - "\n", - "# epochs = 1\n", - "\n", - "# model = simple_model\n", - "# opt = torch.optim.Adam(model.parameters())\n", - "# loss_func = nn.MSELoss()\n", - "\n", - "# history = []\n", - "# for epoch_num in range(epochs):\n", - "# for idx, (batch, target) in enumerate(iterate_minibatches(data_train)):\n", - "# # Preprocessing the batch data and target\n", - "# batch = torch.tensor(batch['FullDescription'], dtype=torch.long)\n", - "\n", - "# target = torch.tensor(target)\n", - "\n", - "\n", - "# predictions = model(batch)\n", - "# predictions = predictions.view(predictions.size(0))\n", - "\n", - "# loss = loss_func(predictions, target)# \n", - "\n", - "# # train with backprop\n", - "# loss.backward()\n", - "# opt.step()\n", - "# opt.zero_grad()\n", - "# # \n", - "\n", - "# history.append(loss.data.numpy())\n", - "# if (idx+1)%10==0:\n", - "# clear_output(True)\n", - "# plt.plot(history,label='loss')\n", - "# plt.legend()\n", - "# plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Actual homework starts here\n", - "__Your ultimate task is to code the three headed network described on the picture below.__ \n", - "To make it closer to the real world, please store the network code in file `network.py` in this directory. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "0eI5h9UMycPF" - }, - "source": [ - "#### Architecture\n", - "\n", - "Our main model consists of three branches:\n", - "* Title encoder\n", - "* Description encoder\n", - "* Categorical features encoder\n", - "\n", - "We will then feed all 3 branches into one common network that predicts salary.\n", - "\n", - "\n", - "\n", - "This clearly doesn't fit into PyTorch __Sequential__ interface. 
To build such a network, one will have to use [__PyTorch nn.Module API__](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import network" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Re-run this cell if you updated the file with network source code\n", - "import imp\n", - "imp.reload(network)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = network.ThreeInputsNet(\n", - " n_tokens=len(tokens),\n", - " n_cat_features=len(categorical_vectorizer.vocabulary_),\n", - "\n", - " # this parameter defines the number of the inputs in the layer,\n", - " # which stands after the concatenation. In should be found out by you.\n", - " concat_number_of_features= \n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "testing_batch, _ = next(iterate_minibatches(data_train, 3))\n", - "testing_batch = [\n", - " torch.tensor(testing_batch['Title'], dtype=torch.long),\n", - " torch.tensor(testing_batch['FullDescription'], dtype=torch.long),\n", - " torch.tensor(testing_batch['Categorical'])\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert model(testing_batch).shape == torch.Size([3, 1])\n", - "assert model(testing_batch).dtype == torch.float32\n", - "print('Seems fine!')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now train the network for a while (100 batches would be fine)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Training pipeline comes here (almost the same as for the simple_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, to evaluate the model it can be switched to `eval` state." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.eval()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_submission(model, data, batch_size=256, name=\"\", three_inputs_mode=True, **kw):\n", - " squared_error = abs_error = num_samples = 0.0\n", - " output_list = []\n", - " for batch_x, batch_y in tqdm(iterate_minibatches(data, batch_size=batch_size, shuffle=False, **kw)):\n", - " if three_inputs_mode:\n", - " batch = [\n", - " torch.tensor(batch_x['Title'], dtype=torch.long),\n", - " torch.tensor(batch_x['FullDescription'], dtype=torch.long),\n", - " torch.tensor(batch_x['Categorical'])\n", - " ]\n", - " else:\n", - " batch = torch.tensor(batch_x['FullDescription'], dtype=torch.long)\n", - "\n", - " batch_pred = model(batch)[:, 0].detach().numpy()\n", - " \n", - " output_list.append((list(batch_pred), list(batch_y)))\n", - " \n", - " squared_error += np.sum(np.square(batch_pred - batch_y))\n", - " abs_error += np.sum(np.abs(batch_pred - batch_y))\n", - " num_samples += len(batch_y)\n", - " print(\"%s results:\" % (name or \"\"))\n", - " print(\"Mean square error: %.5f\" % (squared_error / num_samples))\n", - " print(\"Mean absolute error: %.5f\" % (abs_error / num_samples))\n", - " \n", - "\n", - " batch_pred = [c for x in output_list for c in x[0]]\n", - " batch_y = [c for x in output_list for c in x[1]]\n", - " output_df = pd.DataFrame(list(zip(batch_pred, batch_y)), columns=['batch_pred', 'batch_y'])\n", - " output_df.to_csv('submission.csv', index=False)\n" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generate_submission(model, data_for_autotest, name='Submission')\n", - "print('Submission file generated')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Both the notebook and the `.py` file are required to submit this homework.__" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "CNN_for_texts.ipynb", - "provenance": [], - "version": "0.3.2" - }, - "kernelspec": { - "display_name": "Py3 research env", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/homeworks/assignment01_three_headed_network/network.py b/homeworks/assignment01_three_headed_network/network.py deleted file mode 100644 index 07decef..0000000 --- a/homeworks/assignment01_three_headed_network/network.py +++ /dev/null @@ -1,50 +0,0 @@ - -import numpy as np -import pandas as pd - -import torch -from torch import nn -import torch.nn.functional as F - -import tqdm - - -class ThreeInputsNet(nn.Module): - def __init__(self, n_tokens, n_cat_features, concat_number_of_features, hid_size=64): - super(ThreeInputsNet, self).__init__() - self.title_emb = nn.Embedding(n_tokens, embedding_dim=hid_size) - # - - self.full_emb = nn.Embedding(num_embeddings=n_tokens, embedding_dim=hid_size) - # - - self.category_out = # - - - # Example for the final layers (after the concatenation) - self.inter_dense = nn.Linear(in_features=concat_number_of_features, out_features=hid_size*2) - self.final_dense = nn.Linear(in_features=hid_size*2, out_features=1) - - - - def forward(self, whole_input): - input1, input2, input3 = whole_input - title_beg = 
self.title_emb(input1).permute((0, 2, 1)) - title = # - - full_beg = self.full_emb(input2).permute((0, 2, 1)) - full = # - - category = # - - concatenated = torch.cat( - [ - title.view(title.size(0), -1), - full.view(full.size(0), -1), - category.view(category.size(0), -1) - ], - dim=1) - - out = # - - return out \ No newline at end of file diff --git a/homeworks/assignment02_attention_scores/README.md b/homeworks/assignment02_attention_scores/README.md deleted file mode 100644 index 3d9f15b..0000000 --- a/homeworks/assignment02_attention_scores/README.md +++ /dev/null @@ -1 +0,0 @@ -Please, refer to week04 attention notebook and finish the concat and general attention scores. diff --git a/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_par1_Embedding_based_MT-checkpoint.ipynb b/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_par1_Embedding_based_MT-checkpoint.ipynb deleted file mode 100644 index 0f42882..0000000 --- a/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_par1_Embedding_based_MT-checkpoint.ipynb +++ /dev/null @@ -1,753 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "eulvfJWl7ueY" - }, - "source": [ - "# Lab 1\n", - "\n", - "\n", - "## Part 1: Bilingual dictionary induction and unsupervised embedding-based MT (30%)\n", - "*Note: this homework is based on materials from yandexdataschool [NLP course](https://github.com/yandexdataschool/nlp_course/). 
Feel free to check this awesome course if you wish to dig deeper.*\n", - "\n", - "*Refined by [Nikolay Karpachev](https://www.linkedin.com/in/nikolay-karpachev-b0146a104/)*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "fV4rIjxa7uei" - }, - "source": [ - "**In this homework** **YOU** will make machine translation system without using parallel corpora, alignment, attention, 100500 depth super-cool recurrent neural network and all that kind superstuff.\n", - "\n", - "But even without parallel corpora this system can be good enough (hopefully), in particular for similar languages, e.g. Ukrainian and Russian. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "idSYq2GU7uew" - }, - "source": [ - "### Frament of the Swadesh list for some slavic languages\n", - "\n", - "The Swadesh list is a lexicostatistical stuff. It's named after American linguist Morris Swadesh and contains basic lexis. This list are used to define subgroupings of languages, its relatedness.\n", - "\n", - "So we can see some kind of word invariance for different Slavic languages.\n", - "\n", - "\n", - "| Russian | Belorussian | Ukrainian | Polish | Czech | Bulgarian |\n", - "|-----------------|--------------------------|-------------------------|--------------------|-------------------------------|-----------------------|\n", - "| женщина | жанчына, кабета, баба | жінка | kobieta | žena | жена |\n", - "| мужчина | мужчына | чоловік, мужчина | mężczyzna | muž | мъж |\n", - "| человек | чалавек | людина, чоловік | człowiek | člověk | човек |\n", - "| ребёнок, дитя | дзіця, дзіцёнак, немаўля | дитина, дитя | dziecko | dítě | дете |\n", - "| жена | жонка | дружина, жінка | żona | žena, manželka, choť | съпруга, жена |\n", - "| муж | муж, гаспадар | чоловiк, муж | mąż | muž, manžel, choť | съпруг, мъж |\n", - "| мать, мама | маці, матка | мати, матір, неня, мама | matka | matka, máma, 'стар.' 
mateř | майка |\n", - "| отец, тятя | бацька, тата | батько, тато, татусь | ojciec | otec | баща, татко |\n", - "| много | шмат, багата | багато | wiele | mnoho, hodně | много |\n", - "| несколько | некалькі, колькі | декілька, кілька | kilka | několik, pár, trocha | няколко |\n", - "| другой, иной | іншы | інший | inny | druhý, jiný | друг |\n", - "| зверь, животное | жывёла, звер, істота | тварина, звір | zwierzę | zvíře | животно |\n", - "| рыба | рыба | риба | ryba | ryba | риба |\n", - "| птица | птушка | птах, птиця | ptak | pták | птица |\n", - "| собака, пёс | сабака | собака, пес | pies | pes | куче, пес |\n", - "| вошь | вош | воша | wesz | veš | въшка |\n", - "| змея, гад | змяя | змія, гад | wąż | had | змия |\n", - "| червь, червяк | чарвяк | хробак, черв'як | robak | červ | червей |\n", - "| дерево | дрэва | дерево | drzewo | strom, dřevo | дърво |\n", - "| лес | лес | ліс | las | les | гора, лес |\n", - "| палка | кій, палка | палиця | patyk, pręt, pałka | hůl, klacek, prut, kůl, pálka | палка, пръчка, бастун |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "cNM3_fjr7ue2" - }, - "source": [ - "But the context distribution of these languages demonstrates even more invariance. And we can use this fact for our for our purposes." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "YLppwa527ue6" - }, - "source": [ - "## Data" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "lYBGKAUn7ue_" - }, - "outputs": [], - "source": [ - "import gensim\n", - "import numpy as np\n", - "from gensim.models import KeyedVectors" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MwGoVhRA7ufP" - }, - "source": [ - "In this notebook we're going to use pretrained word vectors - FastText (original paper - https://arxiv.org/abs/1607.04606).\n", - "\n", - "You can download them from the official [website](https://fasttext.cc/docs/en/crawl-vectors.html). We're going to need embeddings for Russian and Ukrainian languages. Please use word2vec-compatible format (.text)." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "u1JjQv_97ufT" - }, - "outputs": [], - "source": [ - "uk_emb = KeyedVectors.load_word2vec_format(\"cc.uk.300.vec\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ffzuept_7ufd" - }, - "outputs": [], - "source": [ - "ru_emb = KeyedVectors.load_word2vec_format(\"cc.ru.300.vec\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "nTkXfT0W7ufk" - }, - "outputs": [], - "source": [ - "ru_emb.most_similar([ru_emb[\"август\"]], topn=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "vdBA8lcg7ufs" - }, - "outputs": [], - "source": [ - "uk_emb.most_similar([uk_emb[\"серпень\"]])" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_yJvcKXO7uf0" - }, - "outputs": [], - "source": [ - 
"ru_emb.most_similar([uk_emb[\"серпень\"]])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "pNdYAR1q7uf6" - }, - "source": [ - "Load small dictionaries for correspoinding words pairs as trainset and testset." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "35d_DAK67uf8" - }, - "outputs": [], - "source": [ - "def load_word_pairs(filename):\n", - " uk_ru_pairs = []\n", - " uk_vectors = []\n", - " ru_vectors = []\n", - " with open(filename, \"r\") as inpf:\n", - " for line in inpf:\n", - " uk, ru = line.rstrip().split(\"\\t\")\n", - " if uk not in uk_emb or ru not in ru_emb:\n", - " continue\n", - " uk_ru_pairs.append((uk, ru))\n", - " uk_vectors.append(uk_emb[uk])\n", - " ru_vectors.append(ru_emb[ru])\n", - " return uk_ru_pairs, np.array(uk_vectors), np.array(ru_vectors)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "wkNL602WHJyO" - }, - "outputs": [], - "source": [ - "!wget -O ukr_rus.train.txt http://tiny.cc/jfgecz" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "uoclU6JcHCcn" - }, - "outputs": [], - "source": [ - "!wget -O ukr_rus.test.txt http://tiny.cc/6zoeez" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "05BqsdSK7ugD" - }, - "outputs": [], - "source": [ - "uk_ru_train, X_train, Y_train = load_word_pairs(\"ukr_rus.train.txt\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "zQOZw51r7ugL" - }, - "outputs": [], - "source": [ - "uk_ru_test, X_test, Y_test = load_word_pairs(\"ukr_rus.test.txt\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "-ZBBNvpz7ugQ" - }, - "source": [ - "## Embedding space mapping 
(0.3 pts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "x_Dhk5gL7ugS" - }, - "source": [ - "Let $x_i \\in \\mathrm{R}^d$ be the distributed representation of word $i$ in the source language, and let $y_i \\in \\mathrm{R}^d$ be the vector representation of its translation. Our purpose is to learn a linear transform $W$ that minimizes the Euclidean distance between $Wx_i$ and $y_i$ for some subset of word embeddings. Thus we can formulate the so-called Procrustes problem:\n", - "\n", - "$$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2^2$$\n", - "or\n", - "$$W^*= \\arg\\min_W ||WX - Y||_F$$\n", - "\n", - "where $||\\cdot||_F$ is the Frobenius norm." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "acOjDdtL7ugY" - }, - "source": [ - "$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2^2$ looks like simple multiple linear regression (without intercept fit). So let's code." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Lb-KN1be7uga" - }, - "outputs": [], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "\n", - "# YOUR CODE HERE\n", - "# mapping = ...\n", - "# -------" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "X7tqJwoY7ugf" - }, - "source": [ - "Let's take a look at the neighbours of the vector for the word _\"серпень\"_ (_\"август\"_ in Russian) after the linear transform." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "31SrFSbn7ugi" - }, - "outputs": [], - "source": [ - "august = mapping.predict(uk_emb[\"серпень\"].reshape(1, -1))\n", - "ru_emb.most_similar(august)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "okSkjk597ugo" - }, - "source": [ - "We can see that the neighbourhood of this embedding consists of different months, but the right variant is only in ninth place." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "o2uY6Y9B7ugt" - }, - "source": [ - "As a quality measure we will use precision top-1, top-5 and top-10 (for each transformed Ukrainian embedding we count how many right target pairs are found in the top N nearest neighbours in the Russian embedding space)." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "zptuho8LAfIE" - }, - "outputs": [], - "source": [ - "def precision(pairs, mapped_vectors, topn=1):\n", - " \"\"\"\n", - " :args:\n", - " pairs = list of right word pairs [(uk_word_0, ru_word_0), ...]\n", - " mapped_vectors = list of embeddings after mapping from source embedding space to destination embedding space\n", - " topn = the number of nearest neighbours in destination embedding space to choose from\n", - " :returns:\n", - " precision_val, float number, total number of words for those we can find right translation at top K.\n", - " \"\"\"\n", - " assert len(pairs) == len(mapped_vectors)\n", - " num_matches = 0\n", - " for i, (_, ru) in enumerate(pairs):\n", - " # YOUR CODE HERE\n", - " precision_val = num_matches / len(pairs)\n", - " return precision_val" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "duhj9hpv7ugy" - }, - "outputs": [], - "source": [ - "assert precision([(\"серпень\", \"август\")], august, topn=5) == 
0.0\n", - "assert precision([(\"серпень\", \"август\")], august, topn=9) == 1.0\n", - "assert precision([(\"серпень\", \"август\")], august, topn=10) == 1.0" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "0-iyd5gP7ug5" - }, - "outputs": [], - "source": [ - "assert precision(uk_ru_test, X_test) == 0.0\n", - "assert precision(uk_ru_test, Y_test) == 1.0" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "U-ssEJ3x7uhA" - }, - "outputs": [], - "source": [ - "precision_top1 = precision(uk_ru_test, mapping.predict(X_test), 1)\n", - "precision_top5 = precision(uk_ru_test, mapping.predict(X_test), 5)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "7K-hy7a6Ksn2" - }, - "outputs": [], - "source": [ - "print(precision_top1)\n", - "print(precision_top5)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hf6Ou8bx7uhH" - }, - "source": [ - "## Making it better (orthogonal Procrustean problem) (0.3 pts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "4oLs-drN7uhK" - }, - "source": [ - "It can be shown (see original paper) that a self-consistent linear mapping between semantic spaces should be orthogonal. \n", - "We can restrict transform $W$ to be orthogonal. Then we will solve next problem:\n", - "\n", - "$$W^*= \\arg\\min_W ||WX - Y||_F \\text{, where: } W^TW = I$$\n", - "\n", - "$$I \\text{- identity matrix}$$\n", - "\n", - "Instead of making yet another regression problem we can find optimal orthogonal transformation using singular value decomposition. 
It turns out that the optimal transformation $W^*$ can be expressed via SVD components:\n", - "$$X^TY=U\\Sigma V^T\\text{, singular value decomposition}$$\n", - "$$W^*=UV^T$$" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_KSaRJFGMFiJ" - }, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "DdFQ7qti7uhL" - }, - "outputs": [], - "source": [ - "def learn_transform(X_train, Y_train):\n", - " \"\"\" \n", - " :returns: W* : float matrix[emb_dim x emb_dim] as defined in formulae above\n", - " \"\"\"\n", - " # YOUR CODE GOES HERE\n", - " # compute orthogonal embedding space mapping\n", - " # mapping = ...\n", - "\n", - " return mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "7X7QfYDd7uhQ" - }, - "outputs": [], - "source": [ - "W = learn_transform(X_train, Y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "OVOFYYa37uhX" - }, - "outputs": [], - "source": [ - "ru_emb.most_similar([np.matmul(uk_emb[\"серпень\"], W)])" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "r297sYP37uhb" - }, - "outputs": [], - "source": [ - "print(precision(uk_ru_test, np.matmul(X_test, W)))\n", - "print(precision(uk_ru_test, np.matmul(X_test, W), 5))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hvUZ72U5AfJg" - }, - "source": [ - "## Unsupervised embedding-based MT (0.4 pts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "LLyuVfHBLrJn" - }, - "source": [ - "Now, let's build our word embeddings-based translator!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "tPAURW1CMuP7" - }, - "source": [ - "Firstly, download OPUS Tatoeba corpus." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "F80kUKzQMsDu" - }, - "outputs": [], - "source": [ - "!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/mono/uk.txt.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "0CGFZoxCUVf1" - }, - "outputs": [], - "source": [ - "!gzip -d ./uk.txt.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "2MV3VvoVUX5U" - }, - "outputs": [], - "source": [ - "with open('./uk.txt', 'r') as f:\n", - " uk_corpus = f.readlines()" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "tU7nPVf0UhbI" - }, - "outputs": [], - "source": [ - "# To save your time and CPU, feel free to use first 1000 sentences of the corpus\n", - "uk_corpus = uk_corpus[:1000]" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "FLN8dBOXAfJ1" - }, - "outputs": [], - "source": [ - "# Any necessary preprocessing if needed\n", - "# YOUR CODE HERE" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "FGksC7l_NMi9" - }, - "outputs": [], - "source": [ - "def translate(sentence):\n", - " \"\"\"\n", - " :args:\n", - " sentence - sentence in Ukrainian (str)\n", - " :returns:\n", - " translation - sentence in Russian (str)\n", - "\n", - " * find ukrainian embedding for each word in sentence\n", - " * transform ukrainian embedding vector\n", - " * find nearest russian word and replace\n", - " \"\"\"\n", - " # YOUR CODE GOES HERE\n", - "\n", - " return \" \".join(translated)" - 
] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "4hbbMy-tNxlf" - }, - "outputs": [], - "source": [ - "assert translate(\".\") == \".\"\n", - "assert translate(\"1 , 3\") == \"1 , 3\"\n", - "assert translate(\"кіт зловив мишу\") == \"кот поймал мышку\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ia6I2ce7O_HI" - }, - "source": [ - "Now you can play with your model and try to get as accurate translations as possible. **Note**: one big issue is out-of-vocabulary words. Try to think of various ways of handling it (you can start with translating each of them to a special **UNK** token and then move to more sophisticated approaches). Good luck!" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ap1W7ZCeOAVU" - }, - "outputs": [], - "source": [ - "for sent in uk_corpus[::10]:\n", - " print(translate(sent))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Great! \n", - "See second notebook for the Neural Machine Translation assignment." 
- ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "colab": { - "collapsed_sections": [], - "machine_shape": "hm", - "name": "homework.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Py3 research env", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_part2_NMT-checkpoint.ipynb b/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_part2_NMT-checkpoint.ipynb deleted file mode 100644 index cae6998..0000000 --- a/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_part2_NMT-checkpoint.ipynb +++ /dev/null @@ -1,941 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Lab 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Part 2: Neural Machine Translation in the wild\n", - "In the third homework you are supposed to get the best translation you can for the EN-RU translation task.\n", - "\n", - "Basic approach using RNNs as encoder and decoder is implemented for you. \n", - "\n", - "Your ultimate task is to use the techniques we've covered, e.g.\n", - "\n", - "* Optimization enhancements (e.g. learning rate decay)\n", - "\n", - "* CNN encoder (with or without positional encoding)\n", - "\n", - "* attention/self-attention mechanism\n", - "\n", - "* pretraining the language model\n", - "\n", - "* [Byte Pair Encoding](https://github.com/rsennrich/subword-nmt)\n", - "\n", - "* or just fine-tunning BERT ;)\n", - "\n", - "to improve the translation quality. 
\n", - "\n", - "__Please use at least three different approaches/models and compare them (translation quality/complexity/training and evaluation time).__\n", - "\n", - "Write down some summary on your experiments and illustrate it with convergence plots/metrics and your thoughts. Just like you would approach a real problem." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# You might need to install the libraries below. Do it in the desired environment\n", - "# if you are working locally.\n", - "\n", - "# ! pip install subword-nmt\n", - "# ! pip install nltk\n", - "# ! pip install torchtext" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Thanks to YSDA NLP course team for the data\n", - "# (who thanks tilda and deephack teams for the data in their turn)\n", - "\n", - "import os\n", - "path_do_data = '../../datasets/Machine_translation_EN_RU/data.txt'\n", - "if not os.path.exists(path_do_data):\n", - " print(\"Dataset not found locally. Downloading from github. Loading special files as well\")\n", - " !wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/advanced_f20/datasets/Machine_translation_EN_RU/data.txt -nc\n", - " path_do_data = './data.txt'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists('./utils.py'):\n", - " print(\"utils file not found locally. Downloading from github.\")\n", - " !wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/advanced_f20/homeworks_advanced/Lab1_NLP/utils.py -nc\n", - "\n", - "if not os.path.exists('./my_network.py'):\n", - " print(\"network file not found locally. 
Downloading from github.\")\n", - " !wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/advanced_f20/homeworks_advanced/Lab1_NLP/my_network.py -nc" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "\n", - "import torchtext\n", - "from torchtext.datasets import TranslationDataset, Multi30k\n", - "from torchtext.data import Field, BucketIterator\n", - "\n", - "import spacy\n", - "\n", - "import random\n", - "import math\n", - "import time\n", - "\n", - "import matplotlib\n", - "matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "from IPython.display import clear_output\n", - "\n", - "from nltk.tokenize import WordPunctTokenizer\n", - "from subword_nmt.learn_bpe import learn_bpe\n", - "from subword_nmt.apply_bpe import BPE\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Main part\n", - "__Here comes the preprocessing. 
Do not hesitate to use BPE or more complex preprocessing ;)__" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "tokenizer_W = WordPunctTokenizer()\n", - "def tokenize(x, tokenizer=tokenizer_W):\n", - " return tokenizer.tokenize(x.lower())" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "SRC = Field(tokenize=tokenize,\n", - " init_token = '', \n", - " eos_token = '', \n", - " lower = True)\n", - "\n", - "TRG = Field(tokenize=tokenize,\n", - " init_token = '', \n", - " eos_token = '', \n", - " lower = True)\n", - "\n", - "dataset = torchtext.data.TabularDataset(\n", - " path=path_do_data,\n", - " format='tsv',\n", - " fields=[('trg', TRG), ('src', SRC)]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "train_data, valid_data, test_data = dataset.split(split_ratio=[0.8, 0.15, 0.05])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of training examples: 40000\n", - "Number of validation examples: 2500\n", - "Number of testing examples: 7500\n" - ] - } - ], - "source": [ - "print(f\"Number of training examples: {len(train_data.examples)}\")\n", - "print(f\"Number of validation examples: {len(valid_data.examples)}\")\n", - "print(f\"Number of testing examples: {len(test_data.examples)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "SRC.build_vocab(train_data, min_freq = 3)\n", - "TRG.build_vocab(train_data, min_freq = 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Unique tokens in source (ru) vocabulary: 9267\n", - "Unique tokens in target (en) vocabulary: 6699\n" - ] - } - ], - 
"source": [ - "print(f\"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}\")\n", - "print(f\"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here are tokens from original (RU) corpus:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['',\n", - " '29',\n", - " 'соль',\n", - " 'комо',\n", - " '―',\n", - " 'электрическая',\n", - " 'ming',\n", - " 'утренний',\n", - " 'детском',\n", - " 'таунус']" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "SRC.vocab.itos[::1000]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And from target (EN) corpus:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['', 'king', 'buffets', 'catch', 'media', 'schedule', 'maraunenhof']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "TRG.vocab.itos[::1000]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And here is example from train dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'trg': ['laundry', 'service', 'is', 'provided', '.'], 'src': ['помимо', 'этого', ',', 'гостям', 'предоставляются', 'услуги', 'прачечной', '.']}\n" - ] - } - ], - "source": [ - "print(vars(train_data.examples[9]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's check the length distributions:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length distribution in Train data\n" - ] - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAfUAAAEICAYAAABGRG3WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAer0lEQVR4nO3df7ReVX3n8fdHIr+0kCApxQRNKhlbZC0rzUhcOB3HOPyybVhr1MFxDdGmTVdLW9vpTAvTrmFGZQbXOEVYKpURSrAWpFRLRqw0RRlXp8OPoA7yQ0rKryQFuZIAVqs19jt/nH3hId6be3Pvzb3PPff9WutZ95y99zlnn3Of/XzP2c9+zklVIUmS5r8XzHUFJEnSzDCoS5LUEwZ1SZJ6wqAuSVJPGNQlSeoJg7okST1hUNcBkWRFkkqyaA62/c4kfznb25XmQpKrkrxvGsv/XZIfnck6tfU+nORNM73eSWx3zj57hoFBXfPaQm/AGg5zFcD2V5Jbkvz8YFpVvbiqHpyrOk3XfDn2s8WgrudJctBc10HqG086NVsM6vNIkt9OsjPJN5Pcn2RtSz8kyQeT/G17fTDJIS3vB7qi25Xt8W36qiSXJflskm8B/yLJcUk+lWQkyZNJPjSw7M8luS/J7iQ3JXn5JOt+ZJIrkjzW9uF9oycQo3VM8oG23oeSnDGw7MokX2z7/RdJPpzkD1v2F9vfp1o34usGlhtzfdJMSvJx4GXA/2rvwd8a6EHakORR4POt7B8neTzJ0+09/aqB9VzV3ts3tvf6bUle0fKS5OIkTyR5JslXk5w4Rl2WJPlMa7u72/Tylnch8M+AD7V6fqilD34eHJnk6rb8I0l+N8kLWt4+2+kEx+gFSc5L8jftM+W6JEe1vNFjtT7Jo0m+keR3BpY9LMmmts372vHdMd6xH9jsO8ZaX+9Vla958AJeCWwHXtrmVwCvaNPvAW4FfhhYCvwV8N6W907gL/daVwHHt+mrgKeBU+hO8l4E/D/g4jZ9KPD6VnYdsA34cWAR8LvAX41T3xVtO4va/KeBj7Z1/jBwO/CLA3X8HvALwEHALwF/C6Tl/1/gA8DBwOuBZ4A/HGs7k1mfL18z/QIeBt40MD/6vry6vecPa+k/B/wQcAjwQeArA8tcBTwJvLa1r08A17a804A7gcVAWhs8dmC597XplwD/Cji8beePgT8d2MYtwM/vVffBz4OrgRvasiuAvwY2tLz9aleDxwR4N91n1PK27x8FrtnrWP1P4DDg1cB3gR9v+RcB/xtY0pa/C9gxiWM/5vr6/przCvia5D8KjgeeAN4EvHCvvL8BzhyYPw14uE2/k4mD+tUDea8DRhgIkgN5fzbawNv8C4BvAy8fo+xow1oEHNMa1WED+W8HvjBQx20DeYe3ZX+E7ix8D3D4QP4fMnFQH3N9c/1/9NXP1z4Cy4/uY5nFrcyRbf4q4GMD+WcCX2vTb6QLsGuAF+y1nqtoQX2MbfwEsHtg/hbGCep0gfofgBMG8n4RuKVN71e74vlB/T5g7UDesXQnCIsGjtXygfzbgbPb9IPAaQN5P8/kgvqY6+v7y+73eaKqtgG/Dvxn4Ikk1yZ5act+KfDIQPFHWtpkbR+YPg54pKr2jFHu5cAlSZ5K8hSwi+6qYdkE63858ELgsYFlP0p3xT7q8dGJqvp2m3xx249dA2l713c8461Pmk3PvleTHJTkotYF/QxdMAI4eqD84wPT36a9Z6vq88CHgA/Ttf/Lkxyx98aSHJ7ko63r/Bm6r6cWZ3JjZY6ma6d7f5YMtu+ptquXA58eaP/3Ad+nO+H/gXUzsO90nwGDbX4y7X9f6+s1g/o8UlV/VFWvp2sgBby/Zf1tSxv1spYG8C26M2oAkvzIWKsemN4OvCxjD+zZTtdlvnjgdVhV/dUEVd9Od6V+9MByR1TVqyZYDuAx4Kgkhw+kHTdO3aW5Mt77cDD939B9hfUm4Ei6K0r
oTown3kDVpVX1k8AJwD8B/sMYxX6T7qu6k6vqCOCn9trGvtrLN+iunvf+LNk5mfpNYDtwxl6fHYdW1WTW/Rhdt/uo4/bK9zNggEF9nkjyyiRvTDcA7jvA3wP/2LKvAX43ydIkRwP/ia6LGrrvx1+V5CeSHEp3pb8vt9M1oouSvCjJoUlOaXm/D5w/OrinDap560R1r6rHgD8H/keSI9qgmVck+eeTWPYRYCvwn5Mc3AbC/cxAkRG64zDjv7OV9sPXmfg9+EN0J7dP0p1o/9fJrjzJP01ycpIX0p2of4fn2v/e2/h7uoGjRwEXTLaeVfV94DrgwiQ/lG4Q7L/juc+S6fj9tt6Xt/1ZmmTdJJe9ju5zZ0mSZcCv7JU/mWO/YBjU549D6AaMfIOuW+mHgfNb3vvoAt9dwFeBL7U0quqv6QbS/QXwALDPm7K0hv0zdN+xPQrsAP51y/s0Xe/Ata1r725gsqPKz6Eb6HYvsBu4nu57tcl4B913/U+2/fok3YfjaBfghcD/aV17aya5Tmkm/Te6E+unkvz7ccpcTdedvZOuHdy6H+s/gm7g1+62jieB/z5GuQ/SDQ77Rlv/5/bKvwR4SxtJfukYy/8q3UnDg3SfFX8EXLkf9RzPJcBm4M+TfLPV7eRJLvseus+hh+g+x66ntf9mMsd+wRgdXSzNG0k+STeAaO+rEEk9l+SX6Aa9TdjTtxB5pa6h17oeX9G67U+n+17yT+e6XpIOvCTHJjmltf9X0o0b+PRc12tYeZcjzQc/AnyK7je4O4Bfqqovz22VJM2Sg+l+LbMSeAq4FvjInNZoiNn9LklST9j9LklST8zb7vejjz66VqxYMdfVkIbanXfe+Y2qWjrX9dgX27I0OZNpzxMG9SRXAj8NPFFVJ7a0o+h+VrSC7q5Ib6uq3UlC99OFM+nu4PPOqvpSW2Y93b3Cobul4aaW/pN0tzk8DPgs8O6axHcCK1asYOvWrRMVkxa0JI9MXGpu2ZalyZlMe55M9/tVwOl7pZ0H3FxVq4Cb2zx0v1le1V4bgctaRUZvgnAy3cMKLkiypC1zGd0DAkaX23tbkiRpEiYM6lX1Rbp7fA9aB2xq05uAswbSr67OrXT3HD6W7gEjW6pqV1XtBrYAp7e8I6rq1nZ1fvXAuiRJ0n6Y6kC5Y9qtP6G7u9noTfmX8fyb7e9oaftK3zFG+piSbEyyNcnWkZGRKVZdkqR+mvbo93aFPSu/i6uqy6tqdVWtXrp0qMf+SJI066Ya1L/eus5pf59o6Tt5/hN0lre0faUvHyNdkiTtp6kG9c3A+ja9HrhhIP2cdNYAT7du+puAU9tTdpYApwI3tbxnkqxpI+fPGViXJEnaD5P5Sds1wBuAo5PsoBvFfhFwXZINdE8Melsr/lm6n7Nto/tJ27sAqmpXkvcCd7Ry76mq0cF3v8xzP2n7s/aSJEn7acKgXlVvHydr7RhlCzh3nPVcyRiP8KuqrcCJE9VDkiTtm7eJlSSpJ+btbWJn04rzbpywzMMXvXkWaiJpumzP6jOv1CVJ6gmDuiRJPWFQlySpJwzqkiT1hEFdkqSeMKhLktQTBnVJknrCoC5JUk8Y1CVJ6gmDuiRJPWFQlySpJwzqkiT1hEFdkqSeMKhLktQTBnVJknrCoC5JUk8Y1KUFJMmVSZ5IcvdA2lFJtiR5oP1d0tKT5NIk25LcleSkgWXWt/IPJFk/kP6TSb7alrk0SWZ3D6WFzaAuLSxXAafvlXYecHNVrQJubvMAZwCr2msjcBl0JwHABcDJwGuBC0ZPBFqZXxhYbu9tSTqADOrSAlJVXwR27ZW8DtjUpjcBZw2kX12dW4HFSY4FTgO2VNWuqtoNbAFOb3lHVNWtVVXA1QPrkjQLDOqSjqmqx9r048AxbXoZsH2g3I6Wtq/0HWOk/4AkG5NsTbJ1ZGRk+nsgCTCoSxrQrrBrFrZzeVWtrqrVS5cuPdCbkxYMg7qkr7euc9rfJ1r6TuC4gXLLW9q
+0pePkS5plhjUJW0GRkewrwduGEg/p42CXwM83brpbwJOTbKkDZA7Fbip5T2TZE0b9X7OwLokzYJFc10BSbMnyTXAG4Cjk+ygG8V+EXBdkg3AI8DbWvHPAmcC24BvA+8CqKpdSd4L3NHKvaeqRgff/TLdCPvDgD9rL0mzxKAuLSBV9fZxstaOUbaAc8dZz5XAlWOkbwVOnE4dJU2d3e+SJPWEQV2SpJ4wqEuS1BMGdUmSesKgLklSTxjUJUnqCYO6JEk9Ma2gnuQ3ktyT5O4k1yQ5NMnKJLe15yl/MsnBrewhbX5by18xsJ7zW/r9SU6b3i5JkrQwTTmoJ1kG/BqwuqpOBA4CzgbeD1xcVccDu4ENbZENwO6WfnErR5IT2nKvonv28keSHDTVekmStFBNt/t9EXBYkkXA4cBjwBuB61v+3s9mHn1m8/XA2nZ/6HXAtVX13ap6iO6WlK+dZr0kSVpwphzUq2on8AHgUbpg/jRwJ/BUVe1pxQafp/zsM5hb/tPASxj/2cw/wGcwS5I0vul0vy+hu8peCbwUeBFd9/kB4zOYJUka33S6398EPFRVI1X1PeBTwCnA4tYdD89/nvKzz2Bu+UcCTzL+s5klSdJ+mE5QfxRYk+Tw9t34WuBe4AvAW1qZvZ/NPPrM5rcAn29PgdoMnN1Gx68EVgG3T6NekiQtSFN+9GpV3ZbkeuBLwB7gy8DlwI3AtUne19KuaItcAXw8yTZgF92Id6rqniTX0Z0Q7AHOrarvT7VekiQtVNN6nnpVXQBcsFfyg4wxer2qvgO8dZz1XAhcOJ26SJK00E0rqEtSH60478YJyzx80ZtnoSbS/jGoz5DJfAiAHwSSpAPHe79LktQTBnVJknrC7ndJvTHZr8GkvvJKXZKknjCoS5LUEwZ1SZJ6wqAuSVJPGNQlSeoJg7okST1hUJckqScM6pIASPIbSe5JcneSa5IcmmRlktuSbEvyySQHt7KHtPltLX/FwHrOb+n3JzltrvZHWogM6pJIsgz4NWB1VZ0IHET3eOT3AxdX1fHAbmBDW2QDsLulX9zKkeSEttyrgNOBjyQ5aDb3RVrIDOqSRi0CDkuyCDgceAx4I3B9y98EnNWm17V5Wv7aJGnp11bVd6vqIWAbYzyKWdKBYVCXRFXtBD4APEoXzJ8G7gSeqqo9rdgOYFmbXgZsb8vuaeVfMpg+xjLPSrIxydYkW0dGRmZ+h6QFyqAuiSRL6K6yVwIvBV5E131+QFTV5VW1uqpWL1269EBtRlpwDOqSAN4EPFRVI1X1PeBTwCnA4tYdD7Ac2NmmdwLHAbT8I4EnB9PHWEbSAWZQlwRdt/uaJIe378bXAvcCXwDe0sqsB25o05vbPC3/81VVLf3sNjp+JbAKuH2W9kFa8Hz0qiSq6rYk1wNfAvYAXwYuB24Erk3yvpZ2RVvkCuDjSbYBu+hGvFNV9yS5ju6EYA9wblV9f1Z3RlrADOqSAKiqC4AL9kp+kDFGr1fVd4C3jrOeC4ELZ7yCkiZk97skST1hUJckqScM6pIk9YRBXZKknjCoS5LUEwZ1SZJ6wqAuSVJPGNQlSeoJg7okST3hHeVm2YrzbpywzMMXvXkWaiJJ6huv1CVJ6gmDuiRJPTGtoJ5kcZLrk3wtyX1JXpfkqCRbkjzQ/i5pZZPk0iTbktyV5KSB9axv5R9Isn78LUqSpPFM90r9EuBzVfVjwKuB+4DzgJurahVwc5sHOIPu2cqrgI3AZQBJjqJ7MtTJdE+DumD0RECSJE3elIN6kiOBn6I9X7mq/qGqngLWAZtasU3AWW16HXB1dW4FFic5FjgN2FJVu6pqN7AFOH2q9ZIkaaGazuj3lcAI8AdJXg3cCbwbOKaqHmtlHgeOadPLgO0Dy+9oaeOlz4rJjEaXJGk+mE73+yLgJOCyqnoN8C2e62oHoKoKqGls43mSbEyyNcnWkZGRmVqtJEm9MJ2gvgPYUVW3tfnr6YL811u3Ou3vEy1/J3D
cwPLLW9p46T+gqi6vqtVVtXrp0qXTqLokSf0z5aBeVY8D25O8siWtBe4FNgOjI9jXAze06c3AOW0U/Brg6dZNfxNwapIlbYDcqS1NkiTth+neUe5XgU8kORh4EHgX3YnCdUk2AI8Ab2tlPwucCWwDvt3KUlW7krwXuKOVe09V7ZpmvSRJWnCmFdSr6ivA6jGy1o5RtoBzx1nPlcCV06mLJEkLnXeUkySpJwzqkiT1hEFdkqSeMKhLktQTBnVJknrCoC5JUk8Y1CVJ6gmDuiQAkixOcn2SryW5L8nrkhyVZEuSB9rfJa1sklyaZFuSu5KcNLCe9a38A0nWj79FSTPNoC5p1CXA56rqx4BXA/fRPaTp5qpaBdzMcw9tOgNY1V4bgcsAkhwFXACcDLwWuGD0REDSgWdQl0SSI4GfAq4AqKp/qKqngHXAplZsE3BWm14HXF2dW4HF7QFOpwFbqmpXVe0GtgCnz+KuSAuaQV0SwEpgBPiDJF9O8rEkLwKOaQ9eAngcOKZNLwO2Dyy/o6WNly5pFhjUJUH3HIiTgMuq6jXAt3iuqx149vkNNRMbS7IxydYkW0dGRmZilZIwqEvq7AB2VNVtbf56uiD/9datTvv7RMvfCRw3sPzyljZe+vNU1eVVtbqqVi9dunRGd0RayAzqkqiqx4HtSV7ZktYC9wKbgdER7OuBG9r0ZuCcNgp+DfB066a/CTg1yZI2QO7UliZpFkz3eeqS+uNXgU8kORh4EHgX3Yn/dUk2AI8Ab2tlPwucCWwDvt3KUlW7krwXuKOVe09V7Zq9XZAWNoO6JACq6ivA6jGy1o5RtoBzx1nPlcCVM1s7SZNh97skST1hUJckqSfsfpekKVhx3o0Tlnn4ojfPQk2k53ilLklSTxjUJUnqCYO6JEk9YVCXJKknDOqSJPWEQV2SpJ4wqEuS1BMGdUmSesKgLklSTxjUJUnqCYO6JEk9YVCXJKknDOqSJPWEQV2SpJ7w0atDyEc6SpKmYtpX6kkOSvLlJJ9p8yuT3JZkW5JPJjm4pR/S5re1/BUD6zi/pd+f5LTp1kmSpIVoJrrf3w3cNzD/fuDiqjoe2A1saOkbgN0t/eJWjiQnAGcDrwJOBz6S5KAZqJckSQvKtIJ6kuXAm4GPtfkAbwSub0U2AWe16XVtnpa/tpVfB1xbVd+tqoeAbcBrp1MvSZIWouleqX8Q+C3gH9v8S4CnqmpPm98BLGvTy4DtAC3/6Vb+2fQxlnmeJBuTbE2ydWRkZJpVlySpX6Yc1JP8NPBEVd05g/XZp6q6vKpWV9XqpUuXztZmJUmaF6Yz+v0U4GeTnAkcChwBXAIsTrKoXY0vB3a28juB44AdSRYBRwJPDqSPGlxGkiRN0pSv1Kvq/KpaXlUr6Aa6fb6q3gF8AXhLK7YeuKFNb27ztPzPV1W19LPb6PiVwCrg9qnWS5KkhepA/E79t4Frk7wP+DJwRUu/Avh4km3ALroTAarqniTXAfcCe4Bzq+r7B6BekiT12owE9aq6BbilTT/IGKPXq+o7wFvHWf5C4MKZqIskSQuVt4mVJKknDOqSJPWEQV3Ss7ztszS/GdQlDfK2z9I8ZlCXBHjbZ6kPDOqSRs3abZ+95bN0YBjUJc36bZ+95bN0YByIm89Imn+87bPUA16pS/K2z1JPeKUuaV+87bM0jxjUJT2Pt32W5i+73yVJ6gmDuiRJPWFQlySpJwzqkiT1hEFdkqSeMKhLktQTBnVJknrCoC5JUk8Y1CVJ6gmDuiRJPWFQlySpJwzqkiT1hEFdkqSeMKhLktQTPnp1nlpx3o0Tlnn4ojfPQk0kScPCoC5JB8hkTr7BE3DNHLvfJUnqCYO6JEk9YVCXJKknDOqSJPWEQV2SpJ4wqEuS1BMGdUmSemLKQT3JcUm+kOTeJPckeXdLPyrJliQPtL9LWnqSXJpkW5K7kpw0sK71rfwDSdZPf7ckSVp4pnOlvgf4zao6AVgDnJvkBOA84OaqWgX
c3OYBzgBWtddG4DLoTgKAC4CTgdcCF4yeCEiSpMmbclCvqseq6ktt+pvAfcAyYB2wqRXbBJzVptcBV1fnVmBxkmOB04AtVbWrqnYDW4DTp1ovSZIWqhn5Tj3JCuA1wG3AMVX1WMt6HDimTS8Dtg8stqOljZc+1nY2JtmaZOvIyMhMVF2SpN6YdlBP8mLgT4Bfr6pnBvOqqoCa7jYG1nd5Va2uqtVLly6dqdVKktQL0wrqSV5IF9A/UVWfaslfb93qtL9PtPSdwHEDiy9vaeOlS5olDnyV+mE6o98DXAHcV1W/N5C1GRhtyOuBGwbSz2kfBmuAp1s3/U3AqUmWtA+MU1uapNnjwFepB6bz6NVTgH8LfDXJV1rafwQuAq5LsgF4BHhby/sscCawDfg28C6AqtqV5L3AHa3ce6pq1zTqJWk/tRPsx9r0N5MMDnx9Qyu2CbgF+G0GBr4CtyYZHfj6BtrAV4AkowNfr5m1nZEWsCkH9ar6SyDjZK8do3wB546zriuBK6daF0kzZzYGvibZSHeFz8te9rKZq7y0wHlHOUnPmq2Brw56lQ4Mg7okwIGvUh8Y1CU58FXqiekMlJPUHw58lXrAoN5jK867cVLlHr7ozQe4Jhp2DnyV+sHud0mSesKgLklSTxjUJUnqCYO6JEk9YVCXJKknHP0uSXNsMr9U8Vcqmgyv1CVJ6gmDuiRJPWFQlySpJwzqkiT1hEFdkqSeMKhLktQTBnVJknrCoC5JUk948xl54wtJ6gmv1CVJ6gmDuiRJPWFQlySpJwzqkiT1hAPlJGkecECrJsMrdUmSesKgLklST9j9rkmx60+Shp9BXdLQm8xJpSS73yVJ6g2DuiRJPWFQlySpJ3r9nbrfw80uB9NJc8s2KK/UJUnqiaG5Uk9yOnAJcBDwsaq6aI6rpAPAK4n+sy1Lc2cognqSg4APA/8S2AHckWRzVd07tzWTtD9sy8Nvsl9LenI9Pw1FUAdeC2yrqgcBklwLrAP8IFiAZnIshB9Ms8623BMz1Q5tg7NrWIL6MmD7wPwO4OS9CyXZCGxss3+X5P4x1nU08I0Zr+HssO4zLO+fVLGhrPskTKbeL5+NigyYybYMw/u/sV6TlPcPX52a+VivCdvzsAT1Samqy4HL91UmydaqWj1LVZpR1n1uzNe6z9d6w+TaMgzvPlqvyRvGOkF/6zUso993AscNzC9vaZLmF9uyNIeGJajfAaxKsjLJwcDZwOY5rpOk/WdblubQUHS/V9WeJL8C3ET3M5grq+qeKa5uwi69IWbd58Z8rfvQ1XuG2zIM4T421mvyhrFO0NN6papmqiKSJGkODUv3uyRJmiaDuiRJPdGroJ7k9CT3J9mW5Ly5rs++JDkuyReS3JvkniTvbulHJdmS5IH2d8lc13UsSQ5K8uUkn2nzK5Pc1o79J9sgqaGTZHGS65N8Lcl9SV43j475b7T3yt1Jrkly6Hw57lMxDO152NvpMLbDYWxjw9J2klyZ5Ikkdw+kjXls0rm01e+uJCdNZhu9Cep57vaUZwAnAG9PcsLc1mqf9gC/WVUnAGuAc1t9zwNurqpVwM1tfhi9G7hvYP79wMVVdTywG9gwJ7Wa2CXA56rqx4BX0+3D0B/zJMuAXwNWV9WJdIPQzmb+HPf9MkTtedjb6TC2w6FqY0PWdq4CTt8rbbxjcwawqr02ApdNagtV1YsX8DrgpoH584Hz57pe+1H/G+jul30/cGxLOxa4f67rNkZdl7c33xuBzwChuwPSorH+F8PyAo4EHqINEB1Inw/HfPRObUfR/WrlM8Bp8+G4T3F/h7I9D1M7HcZ2OIxtbNjaDrACuHuiYwN8FHj7WOX29erNlTpj355y2RzVZb8kWQG8BrgNOKaqHmtZjwPHzFG19uWDwG8B/9jmXwI8VVV72vywHvuVwAjwB63L8mNJXsQ8OOZVtRP4APAo8BjwNHAn8+O4T8XQtechbKfD2A6Hro3Ng7Yz3rGZUhvoU1C
fl5K8GPgT4Ner6pnBvOpOz4bqN4dJfhp4oqrunOu6TMEi4CTgsqp6DfAt9uoGHMZjDtC+Z1tH96H5UuBF/GA3ng6QYWunQ9wOh66Nzae2MxPHpk9Bfd7dnjLJC+k+KD5RVZ9qyV9PcmzLPxZ4Yq7qN45TgJ9N8jBwLV3X3yXA4iSjNzMa1mO/A9hRVbe1+evpPoCG/ZgDvAl4qKpGqup7wKfo/hfz4bhPxdC05yFtp8PaDoexjQ172xnv2EypDfQpqM+r21MmCXAFcF9V/d5A1mZgfZteT/cd3tCoqvOranlVraA7xp+vqncAXwDe0ooNXb0BqupxYHuSV7aktXSPBB3qY948CqxJcnh774zWfeiP+xQNRXse1nY6rO1wSNvYsLed8Y7NZuCcNgp+DfD0QDf9+GZrsMIsDUA4E/hr4G+A35nr+kxQ19fTdbPcBXylvc6k+17sZuAB4C+Ao+a6rvvYhzcAn2nTPwrcDmwD/hg4ZK7rN06dfwLY2o77nwJL5ssxB/4L8DXgbuDjwCHz5bhPcX/nvD3Ph3Y6bO1wGNvYsLQd4Bq67/W/R9ersWG8Y0M38PHD7f3/VbrR+xNuw9vESpLUE33qfpckaUEzqEuS1BMGdUmSesKgLklSTxjUJUnqCYO6JEk9YVCXJKkn/j90WDhb6Ns32gAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "src_length = map(len, [vars(x)['src'] for x in train_data.examples])\n", - "trg_length = map(len, [vars(x)['trg'] for x in train_data.examples])\n", - "\n", - "print('Length distribution in Train data')\n", - "plt.figure(figsize=[8, 4])\n", - "plt.subplot(1, 2, 1)\n", - "plt.title(\"source length\")\n", - "plt.hist(list(src_length), bins=20);\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.title(\"translation length\")\n", - "plt.hist(list(trg_length), bins=20);" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length distribution in Test data\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe0AAAEICAYAAAByPazKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAfXElEQVR4nO3df7RdZX3n8fdHIij+4GdETMCkktqiq1YmBVzYjiVWAa241qDFsWPUdNJatLbaarBdpctKJ06dIi4tNRUKTK1AqdaMUjFFrdNpQYNaFFBJEUjSYCK/bKVq0e/8sZ8Lh8tN7k3uz33O+7XWWWfv53n23s++5z7nu/ezn7N3qgpJkrTwPWq+KyBJkqbGoC1JUk8YtCVJ6gmDtiRJPWHQliSpJwzakiT1hEFb05JkWZJKsmgetv3qJH8/19uV5kOSi5O8YxrL/1uSH5nJOrX13pbk+TO93ilsd96+e+aTQVu9MKoNVAvLfAWovZXkM0l+aTCtqh5fVbfOV52mqy9/+9lm0B5RSfab7zpIw8aDSs02g/YClOStSbYn+dckX0uyqqUfkOTdSf6lvd6d5ICW94iu4nZmekybvjjJBUmuSvId4GeTHJXkw0l2JbkryXsHln1tkpuT3JPk6iRPnWLdD0pyYZIdbR/eMXaAMFbHJO9q6/1GklMHll2e5LNtv/82yfuS/HnL/mx7v7d18z1nYLkJ1yfNpCT/Gzga+D/tf/AtAz1Aa5LcAXyqlf3LJHcmua/9Tz9jYD0Xt//tj7f/9euSPK3lJcl5SXYm+XaSLyd55gR1OSTJx1rbvadNL2155wI/Dby31fO9LX3w++CgJJe25W9P8jtJHtXy9thOJ/kbPSrJuiT/3L5TrkhyaMsb+1utTnJHkm8l+e2BZR+b5JK2zZvb33fb7v72A5t95UTrG1pV5WsBvYCnA1uBp7T5ZcDT2vTbgWuBJwGLgX8Afr/lvRr4+3HrKuCYNn0xcB9wEt3B2uOAfwLOa9OPAZ7byp4ObAF+HFgE/A7wD7up77K2nUVt/iPA+9s6nwR8DvjlgTr+B/Dfgf2A1wH/AqTl/yPwLmB/4LnAt4E/n2g7U1mfL18z/QJuA54/MD/2f3lp+59/bEt/LfAE4ADg3cCXBpa5GLgLOL61rw8Cl7W8
FwLXAwcDaW3wyIHl3tGmDwP+C3Bg285fAn89sI3PAL80ru6D3weXAh9tyy4Dvg6saXl71a4G/ybAG+m+o5a2fX8/8KFxf6s/BR4LPAv4HvDjLX898HfAIW35G4BtU/jbT7i+YX3NewV8jftA4BhgJ/B84NHj8v4ZOG1g/oXAbW361UwetC8dyHsOsIuBIDiQ9zdjDbjNPwq4H3jqBGXHGs4i4IjWaB47kP8K4NMDddwykHdgW/bJdEfRDwAHDuT/OZMH7QnXN9+fo6/hfO0hcPzIHpY5uJU5qM1fDHxgIP804Ktt+mS6AHoi8Khx67mYFrQn2MZPAvcMzH+G3QRtukD8feDYgbxfBj7TpveqXfHwoH0zsGog70i6A4BFA3+rpQP5nwPObNO3Ai8cyPslpha0J1zfsL7sHl9gqmoL8OvA7wE7k1yW5Ckt+ynA7QPFb29pU7V1YPoo4PaqemCCck8Fzk9yb5J7gbvpjvqXTLL+pwKPBnYMLPt+ujPuMXeOTVTV/W3y8W0/7h5IG1/f3dnd+qS59OD/apL9kqxvXcTfpgs2AIcPlL9zYPp+2v9sVX0KeC/wPrr2vyHJE8dvLMmBSd7fura/TXf56OBMbazK4XTtdPx3yWD73td29VTgIwPt/2bgB3QH9I9YNwP7TvcdMNjmp9L+97S+oWTQXoCq6i+q6rl0DaCAd7asf2lpY45uaQDfoTsiBiDJkyda9cD0VuDoTDxwZitdl/bBA6/HVtU/TFL1rXRn2ocPLPfEqnrGJMsB7AAOTXLgQNpRu6m7NF929384mP5f6S4xPR84iO6MELoD38k3UPWeqvpPwLHAjwK/NUGxN9NdSjuhqp4I/My4beypvXyL7ux3/HfJ9qnUbxJbgVPHfXc8pqqmsu4ddN3iY44al+93AAbtBSfJ05OcnG6A2XeBfwd+2LI/BPxOksVJDgd+l64LGbrr089I8pNJHkN3pr4nn6NrJOuTPC7JY5Kc1PL+BDh7bPBMG7TyssnqXlU7gE8C/yvJE9uglKcl+c9TWPZ2YDPwe0n2bwPNfn6gyC66v8OM/85U2gvfZPL/wSfQHbzeRXcg/QdTXXmSn0pyQpJH0x2If5eH2v/4bfw73cDMQ4FzplrPqvoBcAVwbpInpBtk+iYe+i6Zjj9p631q25/FSU6f4rJX0H3vHJJkCfD6cflT+dsPPYP2wnMA3YCMb9F1+zwJOLvlvYMusN0AfBn4Qkujqr5ON1Dtb4FbgD3edKQ13J+nu8Z1B7AN+IWW9xG6s/vLWtfbV4Cpjsp+Fd1AspuAe4Ar6a5rTcUr6a6139X263K6L7+xLrpzgf/Xut5OnOI6pZn0P+gOnO9N8pu7KXMpXXfzdrp2cO1erP+JdAOr7mnruAv4wwnKvZtu8NW32vo/MS7/fOCMNhL7PRMs/wa6g4Jb6b4r/gK4aC/quTvnAxuBTyb511a3E6a47Nvpvoe+Qfc9diWt/TdT+dsPvbFRu9KCk+RyugE6488iJA25JK+jG1Q2aU/dKPFMWwtG6xp8WutWP4XuuuBfz3e9JM2+JEcmOam1/6fTXbf/yHzXa6Hx7j1aSJ4MfJjuN6jbgNdV1Rfnt0qS5sj+dL82WQ7cC1wG/PG81mgBsntckqSesHtckqSeWNDd44cffngtW7ZsvqshLXjXX3/9t6pq8XzXY09sz9LU7Kk9L+igvWzZMjZv3jzf1ZAWvCS3T15qftmepanZU3u2e1ySpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JEk9YdCWJKknDNqSJPWEQVuSpJ5Y0HdEm0nL1n18RtZz2/oXzch6JM2uqbR527P6xjNtSZJ6wqAtSVJPGLQlSeoJg7YkST1h0JYkqScM2pIk9YRBW5KknjBoSyMkyUVJdib5yrj0NyT5apIbk/zPgfSzk2xJ8rUkLxxIP6WlbUmybi73QRplI3NzFUkAXAy8F7h0LCHJzwKnA8+qqu8leVJLPxY4E3gG
8BTgb5P8aFvsfcDPAduAzyfZWFU3zdleSCPKoC2NkKr6bJJl45JfB6yvqu+1Mjtb+unAZS39G0m2AMe3vC1VdStAkstaWYO2NMvsHpf0o8BPJ7kuyd8l+amWvgTYOlBuW0vbXfojJFmbZHOSzbt27ZqFqkujxaAtaRFwKHAi8FvAFUkyEyuuqg1VtbKqVi5evHgmVimNtEmD9kQDV5L8YRu0ckOSjyQ5eCDPgStSv2wDPlydzwE/BA4HtgNHDZRb2tJ2ly5plk3lTPti4JRxaZuAZ1bVTwBfB86GRwxcOQX44yT7JdmPbuDKqcCxwCtaWUnz76+BnwVoA832B74FbATOTHJAkuXACuBzwOeBFUmWJ9mfrs1vnJeaSyNm0oFoEw1cqapPDsxeC5zRph24Ii1gST4EPA84PMk24BzgIuCi1pv2fWB1VRVwY5Ir6NrpA8BZVfWDtp7XA1cD+wEXVdWNc74z0giaidHjrwUub9NL6IL4mMEBKuMHrpww0cqSrAXWAhx99NEzUD1JY6rqFbvJ+sXdlD8XOHeC9KuAq2awapKmYFoD0ZL8Nt0R+AdnpjoOXJEkaXf2+Uw7yauBFwOrWlca7HmAigNXJEmahn06005yCvAW4CVVdf9AlgNXJEmaJZOeae9m4MrZwAHApvZzzmur6leqyoErkiTNkqmMHp9o4MqFeyg/1ANXlq37+JTK3bb+RbNcE0nSqPGOaJIk9YRBW5KknjBoS5LUEwZtSZJ6wqAtSVJPGLQlSeoJg7YkST1h0JYkqScM2pIk9YRBW5KknjBoSyMkyUVJdib5ygR5b05SSQ5v80nyniRbktyQ5LiBsquT3NJeq+dyH6RRZtCWRsvFwCnjE5McBbwAuGMg+VS6J/WtANYCF7Syh9I9OOgE4HjgnCSHzGqtJQEGbWmkVNVngbsnyDqP7nG7NZB2OnBpda4FDk5yJPBCYFNV3V1V9wCbmOBAQNLMM2hLIy7J6cD2qvqncVlLgK0D89ta2u7SJ1r32iSbk2zetWvXDNZaGk0GbWmEJTkQeBvwu7Ox/qraUFUrq2rl4sWLZ2MT0kgxaEuj7WnAcuCfktwGLAW+kOTJwHbgqIGyS1va7tIlzTKDtjTCqurLVfWkqlpWVcvourqPq6o7gY3Aq9oo8hOB+6pqB3A18IIkh7QBaC9oaZJmmUFbGiFJPgT8I/D0JNuSrNlD8auAW4EtwJ8CvwpQVXcDvw98vr3e3tIkzbJF810BSXOnql4xSf6ygekCztpNuYuAi2a0cpIm5Zm2JEk9YdCWJKknDNqSJPWEQVuSpJ6YNGhP9ICBJIcm2dQeFrBp7L7DPmBAkqTZM5Uz7Yt55H2F1wHXVNUK4Jo2Dz5gQJKkWTNp0N7NAwZOBy5p05cALx1I9wEDkiTNgn29pn1EuzMSwJ3AEW3aBwxIkjRLpj0Qrd2AoSYtOPX1+YABSZImsK9B+5ut25v2vrOl+4ABSZJmyb7exnQjsBpY394/OpD++iSX0Q06u6+qdiS5GviDgcFnLwDO3vdqD4dl6z4+aZnb1r9oDmoiSeqDSYN2e8DA84DDk2yjGwW+HriiPWzgduDlrfhVwGl0Dxi4H3gNdA8YSDL2gAHwAQOS9tFUDnalYTVp0N7DAwZWTVDWBwxIkjRLvCOaJEk9YdCWJKknDNqSJPXEvo4e1yQcLKOFKMlFwIuBnVX1zJb2h8DPA98H/hl4TVXd2/LOBtYAPwB+raqubumnAOcD+wEfqKr1c70v0ijyTFsaLRfzyFsIbwKeWVU/AXyd9nPMJMcCZwLPaMv8cZL9kuwHvI/uWQPHAq9oZSXNMoO2NEImepZAVX2yqh5os9fS3fwIumcJXFZV36uqb9D9lPP49tpSVbdW1feBy1pZSbPMoC1p0GuBv2nTPktAWmC8pi0JgCS/DTwAfHCm1llVG4ANACtXrpyxZxTMJe9cqIXEoC2JJK+mG6C2qt0kCfb8zACfJSDNA7vHpRHXRoK/
BXhJVd0/kLURODPJAUmWAyuAz9HdjnhFkuVJ9qcbrLZxrustjSLPtKURsptnCZwNHABsSgJwbVX9SlXdmOQK4Ca6bvOzquoHbT2vB66m+8nXRVV145zvjDSCDNrSCNnNswQu3EP5c4FzJ0i/iu4BQZLmkN3jkiT1hEFbkqSeMGhLktQTBm1JknrCoC1JUk84elzSyPJpfOobz7QlSeoJg7YkST1h0JYkqScM2pIk9YRBW5KknphW0E7yG0luTPKVJB9K8pj25J/rkmxJcnl7ChDtSUGXt/TrkiybiR2QJGlU7HPQTrIE+DVgZVU9k+5pP2cC7wTOq6pjgHuANW2RNcA9Lf28Vk6SJE3RdLvHFwGPTbIIOBDYAZwMXNnyLwFe2qZPb/O0/FVpzwGUJEmT2+egXVXbgXcBd9AF6/uA64F7q+qBVmwbsKRNLwG2tmUfaOUPG7/eJGuTbE6yedeuXftaPUmShs50uscPoTt7Xg48BXgccMp0K1RVG6pqZVWtXLx48XRXJ2lAkouS7EzylYG0Q5NsSnJLez+kpSfJe9o4lBuSHDewzOpW/pYkq+djX6RRNJ3u8ecD36iqXVX1H8CHgZOAg1t3OcBSYHub3g4cBdDyDwLumsb2Je29i3nkwfU64JqqWgFc0+YBTgVWtNda4ALogjxwDnACcDxwzliglzS7phO07wBOTHJguza9CrgJ+DRwRiuzGvhom97Y5mn5n6qqmsb2Je2lqvoscPe45MHxJuPHoVxanWvpDsiPBF4IbKqqu6vqHmATM9DLJmly07mmfR3dgLIvAF9u69oAvBV4U5ItdNesL2yLXAgc1tLfxENH85Lm1xFVtaNN3wkc0aYfHIfSjI1R2V36IzhGRZpZ03rKV1WdQ9dNNuhWui6z8WW/C7xsOtuTNLuqqpLMWA9YVW2gO5hn5cqV9qxJ0+Qd0SR9s3V70953tvQHx6E0Y2NUdpcuaZYZtCUNjjcZPw7lVW0U+YnAfa0b/WrgBUkOaQPQXtDSJM2yaXWPS+qXJB8CngccnmQb3eWt9cAVSdYAtwMvb8WvAk4DtgD3A68BqKq7k/w+8PlW7u1VNX5wm6RZYNCWRkhVvWI3WasmKFvAWbtZz0XARTNYNUlTYPe4JEk9YdCWJKknDNqSJPWEQVuSpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JEk9YdCWJKknDNqSJPWEQVuSpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JACS/EaSG5N8JcmHkjwmyfIk1yXZkuTyJPu3sge0+S0tf9n81l4aDQZtSSRZAvwasLKqngnsB5wJvBM4r6qOAe4B1rRF1gD3tPTzWjlJs2xaQTvJwUmuTPLVJDcneU6SQ5NsSnJLez+klU2S97Qj8xuSHDczuyBphiwCHptkEXAgsAM4Gbiy5V8CvLRNn97mafmrkmQO6yqNpOmeaZ8PfKKqfgx4FnAzsA64pqpWANe0eYBTgRXttRa4YJrbljRDqmo78C7gDrpgfR9wPXBvVT3Qim0DlrTpJcDWtuwDrfxh49ebZG2SzUk279q1a3Z3QhoB+xy0kxwE/AxwIUBVfb+q7uXhR+Djj8wvrc61wMFJjtznmkuaMa1H7HRgOfAU4HHAKdNdb1VtqKqVVbVy8eLF012dNPKmc6a9HNgF/FmSLyb5QJLHAUdU1Y5W5k7giDb94JF5M3jULml+PR/4RlXtqqr/AD4MnER3cL2olVkKbG/T24GjAFr+QcBdc1tlafRMJ2gvAo4DLqiqZwPf4aGucACqqoDam5XanSbNizuAE5Mc2K5NrwJuAj4NnNHKrAY+2qY3tnla/qdae5c0i6YTtLcB26rqujZ/JV0Q/+ZYt3d739nyHzwybwaP2h9kd5o091o7vhL4AvBluu+GDcBbgTcl2UJ3zfrCtsiFwGEt/U2MO2CXNDsWTV5kYlV1Z5KtSZ5eVV/joSPzm+iOwNfzyCPz1ye5DDgBuG+g
G13SPKuqc4BzxiXfChw/QdnvAi+bi3pJesg+B+3mDcAH2w0XbgVeQ3eEfkWSNcDtwMtb2auA04AtwP2trCRJmqJpBe2q+hKwcoKsVROULeCs6WxPkqRR5h3RJEnqCYO2JEk9YdCWJKknDNqSJPWEQVuSpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JEk9YdCWJKknDNqSJPWEQVuSpJ4waEsCIMnBSa5M8tUkNyd5TpJDk2xKckt7P6SVTZL3JNmS5IYkx813/aVRMN3naWuWLVv38UnL3Lb+RXNQE42A84FPVNUZSfYHDgTeBlxTVeuTrAPWAW8FTgVWtNcJwAXtXdIs8kxbEkkOAn4GuBCgqr5fVfcCpwOXtGKXAC9t06cDl1bnWuDgJEfOcbWlkeOZtiSA5cAu4M+SPAu4HngjcERV7Whl7gSOaNNLgK0Dy29raTsG0kiyFlgLcPTRR89a5efbVHrEwF4xTZ9n2pKgO4A/Drigqp4NfIeuK/xBVVVA7c1Kq2pDVa2sqpWLFy+escpKo8qgLQm6M+VtVXVdm7+SLoh/c6zbu73vbPnbgaMGll/a0iTNIoO2JKrqTmBrkqe3pFXATcBGYHVLWw18tE1vBF7VRpGfCNw30I0uaZZ4TVvSmDcAH2wjx28FXkN3YH9FkjXA7cDLW9mrgNOALcD9raykWWbQlgRAVX0JWDlB1qoJyhZw1qxXStLD2D0uSVJPTDtoJ9kvyReTfKzNL09yXbtT0uWtq40kB7T5LS1/2XS3LUnSKJmJ7vE3AjcDT2zz7wTOq6rLkvwJsIbubklrgHuq6pgkZ7ZyvzAD25ekXvAOh5quaZ1pJ1kKvAj4QJsPcDLdz0XgkXdQGruz0pXAqlZekiRNwXS7x98NvAX4YZs/DLi3qh5o82N3SYKBOyi1/Pta+YdJsjbJ5iSbd+3aNc3qSZI0PPY5aCd5MbCzqq6fwfp4ByVJknZjOte0TwJekuQ04DF017TPp3twwKJ2Nj14l6SxOyhtS7IIOAi4axrblyRppOzzmXZVnV1VS6tqGXAm8KmqeiXwaeCMVmz8HZTG7qx0Riu/V/cxliRplM3G77TfCrwpyRa6a9YXtvQLgcNa+psY9zACSZK0ZzNyR7Sq+gzwmTZ9K3D8BGW+C7xsJrY33lQfizes/BmJhsWot2VpMt4RTZKknjBoS5LUEwZtSZJ6wqAtSVJPGLQlSeoJg7YkST1h0Jb0IB+1Ky1sBm1Jg8YetTtm7FG7xwD30D1iFwYetQuc18pJmmUGbUmAj9qV+sCgLWmMj9qVFjiDtiQftSv1xIzce1xS7/moXakHPNOW5KN2pZ4waEvaEx+1Ky0gdo9Lepj5ftSupN3zTFuSpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JEk9YdCWJKknDNqSJPXEPgftJEcl+XSSm5LcmOSNLf3QJJuS3NLeD2npSfKeJFuS3JDkuJnaCUmSRsF0zrQfAN5cVccCJwJnJTmW7naG11TVCuAaHrq94anAivZaC1wwjW1LkjRy9jloV9WOqvpCm/5X4Ga6Z+yeDlzSil0CvLRNnw5cWp1r6Z4edOQ+11ySpBEzI9e0kywDng1cBxxRVTta1p3AEW16CbB1YLFtLW38utYm2Zxk865du2aiepIkDYVpPzAkyeOBvwJ+vaq+neTBvKqqJHv1uL6q2gBsAFi5cqWP+pshy9Z9fErlblv/olmuiSRpX03rTDvJo+kC9ger6sMt+Ztj3d7tfWdL3w4cNbD40pYmSZKmYDqjx0P3TN2bq+qPBrI2Aqvb9GrgowPpr2qjyE8E7hvoRpckSZOYzpn2ScB/A05O8qX2Og1YD/xckluA57d5gKuAW4EtwJ8CvzqNbUuaQf6EU+qHfb6mXVV/D2Q32asmKF/AWfu6PUmzauwnnF9I8gTg+iSbgFfT/YRzfZJ1dD/hfCsP/wnnCXQ/4TxhXmo+ZKYy/sSxJ6PLO6JJ8iec
Uk8YtCU9jD/hlBYug7akB43/CedgXrvEtdc/4ayqlVW1cvHixTNYU2k0Tft32pKGw55+wllVO/wJ58Lhde/R5Zm2JH/CKfWEZ9raax7lD6Wxn3B+OcmXWtrb6H6yeUWSNcDtwMtb3lXAaXQ/4bwfeM3cVlcaTQZtPcxUb3eq4eJPOKV+sHtckqSeMGhLktQTBm1JknrCoC1JUk8YtCVJ6gmDtiRJPWHQliSpJwzakiT1hEFbkqSe8I5okjSEvN3wcDJoS9KImuptiw3uC4dBW7PCo3xJmnle05YkqScM2pIk9YTd45o3Xk+TpL1j0NaC5/VxaX7ZBheOOe8eT3JKkq8l2ZJk3VxvX9LMsC1Lc29Oz7ST7Ae8D/g5YBvw+SQbq+qmuayHho9nAnPLtqzxpnq5azK20z2b6+7x44EtVXUrQJLLgNMBG7pm3UIM7AuxTlNkW9ascKzLns110F4CbB2Y3wacMFggyVpgbZv9tyR3Ad+am+rNq8MZ/v1c8PuYd87IamZ0P6dYp6fO1PamaNK2DBO256/NQd3myoL/f94HvdmnvWyrvdmvZrftecENRKuqDcCGsfkkm6tq5TxWaU6Mwn6Owj7C6OznVIxvz8NkGD/nYdwnGK79muuBaNuBowbml7Y0Sf1iW5bmwVwH7c8DK5IsT7I/cCawcY7rIGn6bMvSPJjT7vGqeiDJ64Grgf2Ai6rqxkkWG8qutQmMwn6Owj7CCOznPrblYTOMn/Mw7hMM0X6lqua7DpIkaQq897gkST1h0JYkqScWdNAextskJjkqyaeT3JTkxiRvbOmHJtmU5Jb2fsh813UmJNkvyReTfKzNL09yXftML2+DmHorycFJrkzy1SQ3J3nOsH6Wo2rY2+ywtdFhb5MLNmgP3CbxVOBY4BVJjp3fWs2IB4A3V9WxwInAWW2/1gHXVNUK4Jo2PwzeCNw8MP9O4LyqOga4B1gzL7WaOecDn6iqHwOeRbevw/pZjqphb7PD1kaHu01W1YJ8Ac8Brh6YPxs4e77rNQv7+VG6+zd/DTiypR0JfG2+6zYD+7aUroGcDHwMCN1diRZN9Bn37QUcBHyDNqBzIH3oPktfD/t8h6bNDlsbHYU2uWDPtJn4NolL5qkusyLJMuDZwHXAEVW1o2XdCRwxT9WaSe8G3gL8sM0fBtxbVQ+0+b5/psuBXcCfte7FDyR5HMP5WYqhbLPD1kaHvk0u5KA91JI8Hvgr4Ner6tuDedUdDvb6t3hJXgzsrKrr57sus2gRcBxwQVU9G/gO47rdhuGzVGfY2uyQttGhb5MLOWgP7W0SkzyarvF/sKo+3JK/meTIln8ksHO+6jdDTgJekuQ24DK67rfzgYOTjN3Up++f6TZgW1Vd1+avpPvCGLbPcuQNaZsdxjY69G1yIQftobxNYpIAFwI3V9UfDWRtBFa36dV01816q6rOrqqlVbWM7rP7VFW9Evg0cEYr1uv9rKo7ga1Jnt6SVtE9mnKoPstRN6xtdhjb6Ci0yQV9R7Qkp9Fdcxm7TeK581ylaUvyXOD/Al/moetIb6O7RnYFcDRwO/Dyqrp7Xio5w5I8D/jNqnpxkh+hO6o/FPgi8ItV9b35rN90JPlJ4APA/sCtwGvoDoaH8rMcRaPQZoepjQ57m1zQQVuSJD1kIXePS5KkAQZtSZJ6wqAtSVJPGLQlSeoJg7YkST1h0JYkqScM2pIk9cT/B/odrf8G1COhAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "src_length = map(len, [vars(x)['src'] for x in test_data.examples])\n", - "trg_length = map(len, [vars(x)['trg'] for x in test_data.examples])\n", - "\n", - "print('Length distribution in Test data')\n", - "plt.figure(figsize=[8, 4])\n", - "plt.subplot(1, 2, 1)\n", - "plt.title(\"source length\")\n", - "plt.hist(list(src_length), bins=20);\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.title(\"translation length\")\n", - "plt.hist(list(trg_length), bins=20);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model side\n", - "__Here comes simple pipeline of NMT model learning. It almost copies the week03 practice__" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "device(type='cuda', index=1)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "device" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "def _len_sort_key(x):\n", - " return len(x.src)\n", - "\n", - "BATCH_SIZE = 128\n", - "\n", - "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n", - " (train_data, valid_data, test_data), \n", - " batch_size = BATCH_SIZE, \n", - " device = device,\n", - " sort_key=_len_sort_key\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[torchtext.data.batch.Batch of size 128]\n", - "\t[.trg]:[torch.cuda.LongTensor of size 55x128 (GPU 1)]\n", - "\t[.src]:[torch.cuda.LongTensor of size 59x128 (GPU 
1)]\n", - "torch.Size([59, 128]) torch.Size([55, 128])\n" - ] - } - ], - "source": [ - "for x in train_iterator:\n", - " break\n", - "print(x)\n", - "print(x.src.shape, x.trg.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "import my_network\n", - "Encoder = my_network.Encoder\n", - "Decoder = my_network.Decoder\n", - "Seq2Seq = my_network.Seq2Seq" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "INPUT_DIM = len(SRC.vocab)\n", - "OUTPUT_DIM = len(TRG.vocab)\n", - "ENC_EMB_DIM = 256\n", - "DEC_EMB_DIM = 256\n", - "HID_DIM = 512\n", - "N_LAYERS = 2\n", - "ENC_DROPOUT = 0.5\n", - "DEC_DROPOUT = 0.5\n", - "\n", - "enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)\n", - "dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)\n", - "\n", - "# dont forget to put the model to the right device\n", - "model = Seq2Seq(enc, dec, device).to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Seq2Seq(\n", - " (encoder): Encoder(\n", - " (embedding): Embedding(9267, 256)\n", - " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n", - " (dropout): Dropout(p=0.5, inplace=False)\n", - " )\n", - " (decoder): Decoder(\n", - " (embedding): Embedding(6699, 256)\n", - " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n", - " (out): Linear(in_features=512, out_features=6699, bias=True)\n", - " (dropout): Dropout(p=0.5, inplace=False)\n", - " )\n", - ")" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def init_weights(m):\n", - " # \n", - " for name, param in m.named_parameters():\n", - " nn.init.uniform_(param, -0.08, 0.08)\n", - " \n", - "model.apply(init_weights)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "The model has 14,880,299 trainable parameters\n" - ] - } - ], - "source": [ - "def count_parameters(model):\n", - " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", - "\n", - "print(f'The model has {count_parameters(model):,} trainable parameters')" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "PAD_IDX = TRG.vocab.stoi['']\n", - "optimizer = optim.Adam(model.parameters())\n", - "criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "def train(model, iterator, optimizer, criterion, clip, train_history=None, valid_history=None):\n", - " model.train()\n", - " \n", - " epoch_loss = 0\n", - " history = []\n", - " for i, batch in enumerate(iterator):\n", - " \n", - " src = batch.src\n", - " trg = batch.trg\n", - " \n", - " optimizer.zero_grad()\n", - " \n", - " output = model(src, trg)\n", - " \n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - " \n", - " output = output[1:].view(-1, output.shape[-1])\n", - " trg = trg[1:].view(-1)\n", - " \n", - " #trg = [(trg sent len - 1) * batch size]\n", - " #output = [(trg sent len - 1) * batch size, output dim]\n", - " \n", - " loss = criterion(output, trg)\n", - " \n", - " loss.backward()\n", - " \n", - " # Let's clip the gradient\n", - " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n", - " \n", - " optimizer.step()\n", - " \n", - " epoch_loss += loss.item()\n", - " \n", - " history.append(loss.cpu().data.numpy())\n", - " if (i+1)%10==0:\n", - " fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))\n", - "\n", - " clear_output(True)\n", - " ax[0].plot(history, label='train loss')\n", - " ax[0].set_xlabel('Batch')\n", - " ax[0].set_title('Train loss')\n", - " if train_history is not None:\n", - " 
ax[1].plot(train_history, label='general train history')\n", - " ax[1].set_xlabel('Epoch')\n", - " if valid_history is not None:\n", - " ax[1].plot(valid_history, label='general valid history')\n", - " plt.legend()\n", - " \n", - " plt.show()\n", - "\n", - " \n", - " return epoch_loss / len(iterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, iterator, criterion):\n", - " \n", - " model.eval()\n", - " \n", - " epoch_loss = 0\n", - " \n", - " history = []\n", - " \n", - " with torch.no_grad():\n", - " \n", - " for i, batch in enumerate(iterator):\n", - "\n", - " src = batch.src\n", - " trg = batch.trg\n", - "\n", - " output = model(src, trg, 0) #turn off teacher forcing\n", - "\n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - "\n", - " output = output[1:].view(-1, output.shape[-1])\n", - " trg = trg[1:].view(-1)\n", - "\n", - " #trg = [(trg sent len - 1) * batch size]\n", - " #output = [(trg sent len - 1) * batch size, output dim]\n", - "\n", - " loss = criterion(output, trg)\n", - " \n", - " epoch_loss += loss.item()\n", - " \n", - " return epoch_loss / len(iterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "def epoch_time(start_time, end_time):\n", - " elapsed_time = end_time - start_time\n", - " elapsed_mins = int(elapsed_time / 60)\n", - " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n", - " return elapsed_mins, elapsed_secs" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "train_history = []\n", - "valid_history = []\n", - "\n", - "N_EPOCHS = 10\n", - "CLIP = 1\n", - "\n", - "best_valid_loss = float('inf')" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch: 10 | Time: 1m 10s\n", - "\tTrain Loss: 2.998 | Train PPL: 20.040\n", - "\t Val. Loss: 4.710 | Val. PPL: 111.007\n" - ] - } - ], - "source": [ - "for epoch in range(N_EPOCHS):\n", - " \n", - " start_time = time.time()\n", - " \n", - " train_loss = train(model, train_iterator, optimizer, criterion, CLIP, train_history, valid_history)\n", - " valid_loss = evaluate(model, valid_iterator, criterion)\n", - " \n", - " end_time = time.time()\n", - " \n", - " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", - " \n", - " if valid_loss < best_valid_loss:\n", - " best_valid_loss = valid_loss\n", - " torch.save(model.state_dict(), 'tut1-model.pt')\n", - " \n", - " train_history.append(train_loss)\n", - " valid_history.append(valid_loss)\n", - " print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n", - " print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n", - " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. 
PPL: {math.exp(valid_loss):7.3f}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Let's take a look at our network quality__:" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [], - "source": [ - "del utils" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [], - "source": [ - "import utils\n", - "import imp\n", - "imp.reload(utils)\n", - "generate_translation = utils.generate_translation\n", - "remove_tech_tokens = utils.remove_tech_tokens\n", - "get_text = utils.get_text\n", - "flatten = utils.flatten" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "batch = next(iter(test_iterator))" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original: there is a 24 - hour front desk at the property .\n", - "Generated: the property offers a 24 - hour front desk . .\n", - "\n", - "Original: this property also features free wifi .\n", - "Generated: free wifi access . . . 
.\n", - "\n" - ] - } - ], - "source": [ - "for idx in [1,2]:\n", - " src = batch.src[:, idx:idx+1]\n", - " trg = batch.trg[:, idx:idx+1]\n", - " generate_translation(src, trg, model, TRG.vocab)" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [], - "source": [ - "from nltk.translate.bleu_score import corpus_bleu\n", - "\n", - "# \"\"\" Estimates corpora-level BLEU score of model's translations given inp and reference out \"\"\"\n", - "# translations, _ = model.translate_lines(inp_lines, **flags)\n", - "# # Note: if you experience out-of-memory error, split input lines into batches and translate separately\n", - "# return corpus_bleu([[ref] for ref in out_lines], translations) * 100" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [], - "source": [ - "import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "59it [00:03, 18.87it/s]\n" - ] - } - ], - "source": [ - "original_text = []\n", - "generated_text = []\n", - "model.eval()\n", - "with torch.no_grad():\n", - "\n", - " for i, batch in tqdm.tqdm(enumerate(test_iterator)):\n", - "\n", - " src = batch.src\n", - " trg = batch.trg\n", - "\n", - " output = model(src, trg, 0) #turn off teacher forcing\n", - "\n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - "\n", - " output = output.argmax(dim=-1)\n", - " \n", - " original_text.extend([get_text(x, TRG.vocab) for x in trg.cpu().numpy().T])\n", - " generated_text.extend([get_text(x, TRG.vocab) for x in output[1:].detach().cpu().numpy().T])\n", - "\n", - "# original_text = flatten(original_text)\n", - "# generated_text = flatten(generated_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "14.139920232081806" - ] - }, - 
"execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "corpus_bleu([[text] for text in original_text], generated_text) * 100" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Baseline solution BLEU score is quite low. Try to achieve at least __24__ BLEU on the test set. \n", - "The checkpoints are:\n", - "\n", - "* __22__ - minimal score to submit the homework, 30% of points\n", - "\n", - "* __27__ - good score, 70% of points\n", - "\n", - "* __29__ - excellent score, 100% of points" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "colab": { - "collapsed_sections": [], - "machine_shape": "hm", - "name": "homework.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Py3 research env", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_part2_NMT_old-checkpoint.ipynb b/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_part2_NMT_old-checkpoint.ipynb deleted file mode 100644 index 4e586b1..0000000 --- a/homeworks/lab01_nlp/.ipynb_checkpoints/Lab1_NLP_part2_NMT_old-checkpoint.ipynb +++ /dev/null @@ -1,900 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "eulvfJWl7ueY" - }, - "source": [ - "# Lab 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Part 2: Neural Machine Translation in the wild\n", - "In the second part of the homework you are supposed to get the best translation you can for the EN-RU translation task.\n", - "\n", - "Basic approach using RNNs as encoder and decoder is implemented for you. 
\n", - "\n", - "Your ultimate task is to use the techniques we've covered, e.g.\n", - "* [Byte Pair Encoding](https://github.com/rsennrich/subword-nmt)\n", - "\n", - "* CNN encoder (with or without positional encoding)\n", - "\n", - "* attention/self-attention mechanism\n", - "\n", - "* pretraining the language model\n", - "\n", - "* or just fine-tunning BERT)\n", - "\n", - "to improve the translation quality. \n", - "\n", - "__Please use at least three different approaches/models and compare them (translation quality/complexity/training and evaluation time).__\n", - "Write down some summary on your experiments and illustrate it with convergence plots/metrics and your thoughts. Just like you would approach a real problem." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# ! pip install subword-nmt\n", - "# ! pip install nltk\n", - "# ! pip install torchtext\n", - "# ! wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/advanced/homeworks/Lab1_NLP/data.txt\n", - "\n", - "# Thanks to YSDA NLP course team for the data\n", - "# (who thanks tilda and deephack teams for the data in their turn)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "\n", - "import torchtext\n", - "from torchtext.datasets import TranslationDataset, Multi30k\n", - "from torchtext.data import Field, BucketIterator\n", - "\n", - "import spacy\n", - "\n", - "import random\n", - "import math\n", - "import time\n", - "\n", - "import matplotlib\n", - "matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "from IPython.display import clear_output\n", - "\n", - "from nltk.tokenize import WordPunctTokenizer\n", - "from subword_nmt.learn_bpe import learn_bpe\n", - "from subword_nmt.apply_bpe import BPE\n" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Main part\n", - "__Here comes the preprocessing. Do not hesitate to use BPE or more complex preprocessing ;)__" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "tokenizer_W = WordPunctTokenizer()\n", - "def tokenize(x, tokenizer=tokenizer_W):\n", - " return tokenizer.tokenize(x.lower())" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "metadata": {}, - "outputs": [], - "source": [ - "SRC = Field(tokenize=tokenize,\n", - " init_token = '', \n", - " eos_token = '', \n", - " lower = True)\n", - "\n", - "TRG = Field(tokenize=tokenize,\n", - " init_token = '', \n", - " eos_token = '', \n", - " lower = True)\n", - "\n", - "dataset = torchtext.data.TabularDataset(\n", - " path='data.txt',\n", - " format='tsv',\n", - " fields=[('trg', TRG), ('src', SRC)]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [], - "source": [ - "train_data, valid_data, test_data = dataset.split(split_ratio=[0.8, 0.15, 0.05])" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of training examples: 40000\n", - "Number of validation examples: 2500\n", - "Number of testing examples: 7500\n" - ] - } - ], - "source": [ - "print(f\"Number of training examples: {len(train_data.examples)}\")\n", - "print(f\"Number of validation examples: {len(valid_data.examples)}\")\n", - "print(f\"Number of testing examples: {len(test_data.examples)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": {}, - "outputs": [], - "source": [ - "SRC.build_vocab(train_data, min_freq = 3)\n", - "TRG.build_vocab(train_data, min_freq = 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "Unique tokens in source (ru) vocabulary: 9285\n", - "Unique tokens in target (en) vocabulary: 6770\n" - ] - } - ], - "source": [ - "print(f\"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}\")\n", - "print(f\"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here are tokens from original (RU) corpus:" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['',\n", - " 'общими',\n", - " 'ferienwohnung',\n", - " 'закат',\n", - " 'campo',\n", - " 'шампанское',\n", - " 'louis',\n", - " 'уэверли',\n", - " 'диннер',\n", - " 'стеклянными']" - ] - }, - "execution_count": 97, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "SRC.vocab.itos[::1000]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And from target (EN) corpus:" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['', '46', 'cheeses', 'columbia', 'macerata', 'rouge', 'mactan']" - ] - }, - "execution_count": 98, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "TRG.vocab.itos[::1000]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And here is example from train dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'trg': ['you', 'can', 'find', 'a', 'restaurant', '1', 'km', 'from', 'the', 'grain', 'bauernhof', ',', 'while', 'a', 'supermarket', 'is', '5', 'km', 'away', '.'], 'src': ['расстояние', 'от', 'фермерского', 'дома', 'grain', 'bauernhof', 'до', 'ресторана', 'составляет', '1', 'км', ',', 'до', 'супермаркета', '—', '5', 'км', '.']}\n" - ] - } - ], - "source": [ - "print(vars(train_data.examples[9]))" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's check the length distributions:" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length distribution in Train data\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "src_length = map(len, [vars(x)['src'] for x in train_data.examples])\n", - "trg_length = map(len, [vars(x)['trg'] for x in train_data.examples])\n", - "\n", - "print('Length distribution in Train data')\n", - "plt.figure(figsize=[8, 4])\n", - "plt.subplot(1, 2, 1)\n", - "plt.title(\"source length\")\n", - "plt.hist(list(src_length), bins=20);\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.title(\"translation length\")\n", - "plt.hist(list(trg_length), bins=20);" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length distribution in Test data\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "src_length = map(len, [vars(x)['src'] for x in test_data.examples])\n", - "trg_length = map(len, [vars(x)['trg'] for x in test_data.examples])\n", - "\n", - "print('Length distribution in Test data')\n", - "plt.figure(figsize=[8, 4])\n", - "plt.subplot(1, 2, 1)\n", - "plt.title(\"source length\")\n", - "plt.hist(list(src_length), bins=20);\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.title(\"translation length\")\n", - "plt.hist(list(trg_length), bins=20);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model side\n", - "__Here comes simple pipeline of NMT model learning. It almost copies the week03 practice__" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [], - "source": [ - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "device(type='cuda')" - ] - }, - "execution_count": 103, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "device" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [], - "source": [ - "def _len_sort_key(x):\n", - " return len(x.src)\n", - "\n", - "BATCH_SIZE = 128\n", - "\n", - "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n", - " (train_data, valid_data, test_data), \n", - " batch_size = BATCH_SIZE, \n", - " device = device,\n", - " sort_key=_len_sort_key\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[torchtext.data.batch.Batch of size 128]\n", - "\t[.trg]:[torch.cuda.LongTensor of size 47x128 (GPU 0)]\n", - "\t[.src]:[torch.cuda.LongTensor of size 47x128 (GPU 0)]\n", - 
"torch.Size([47, 128]) torch.Size([47, 128])\n" - ] - } - ], - "source": [ - "for x in train_iterator:\n", - " break\n", - "print(x)\n", - "print(x.src.shape, x.trg.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "import my_network\n", - "Encoder = my_network.Encoder\n", - "Decoder = my_network.Decoder\n", - "Seq2Seq = my_network.Seq2Seq" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [], - "source": [ - "INPUT_DIM = len(SRC.vocab)\n", - "OUTPUT_DIM = len(TRG.vocab)\n", - "ENC_EMB_DIM = 256\n", - "DEC_EMB_DIM = 256\n", - "HID_DIM = 512\n", - "N_LAYERS = 2\n", - "ENC_DROPOUT = 0.5\n", - "DEC_DROPOUT = 0.5\n", - "\n", - "enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)\n", - "dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)\n", - "\n", - "# dont forget to put the model to the right device\n", - "model = Seq2Seq(enc, dec, device).to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Seq2Seq(\n", - " (encoder): Encoder(\n", - " (embedding): Embedding(9285, 256)\n", - " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n", - " (dropout): Dropout(p=0.5, inplace=False)\n", - " )\n", - " (decoder): Decoder(\n", - " (embedding): Embedding(6770, 256)\n", - " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n", - " (out): Linear(in_features=512, out_features=6770, bias=True)\n", - " (dropout): Dropout(p=0.5, inplace=False)\n", - " )\n", - ")" - ] - }, - "execution_count": 108, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def init_weights(m):\n", - " # \n", - " for name, param in m.named_parameters():\n", - " nn.init.uniform_(param, -0.08, 0.08)\n", - " \n", - "model.apply(init_weights)" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "The model has 14,939,506 trainable parameters\n" - ] - } - ], - "source": [ - "def count_parameters(model):\n", - " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", - "\n", - "print(f'The model has {count_parameters(model):,} trainable parameters')" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "PAD_IDX = TRG.vocab.stoi['']\n", - "optimizer = optim.Adam(model.parameters())\n", - "criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [], - "source": [ - "def train(model, iterator, optimizer, criterion, clip, train_history=None, valid_history=None):\n", - " model.train()\n", - " \n", - " epoch_loss = 0\n", - " history = []\n", - " for i, batch in enumerate(iterator):\n", - " \n", - " src = batch.src\n", - " trg = batch.trg\n", - " \n", - " optimizer.zero_grad()\n", - " \n", - " output = model(src, trg)\n", - " \n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - " \n", - " output = output[1:].view(-1, output.shape[-1])\n", - " trg = trg[1:].view(-1)\n", - " \n", - " #trg = [(trg sent len - 1) * batch size]\n", - " #output = [(trg sent len - 1) * batch size, output dim]\n", - " \n", - " loss = criterion(output, trg)\n", - " \n", - " loss.backward()\n", - " \n", - " # Let's clip the gradient\n", - " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n", - " \n", - " optimizer.step()\n", - " \n", - " epoch_loss += loss.item()\n", - " \n", - " history.append(loss.cpu().data.numpy())\n", - " if (i+1)%10==0:\n", - " fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))\n", - "\n", - " clear_output(True)\n", - " ax[0].plot(history, label='train loss')\n", - " ax[0].set_xlabel('Batch')\n", - " ax[0].set_title('Train loss')\n", - " if train_history is not None:\n", - " 
ax[1].plot(train_history, label='general train history')\n", - " ax[1].set_xlabel('Epoch')\n", - " if valid_history is not None:\n", - " ax[1].plot(valid_history, label='general valid history')\n", - " plt.legend()\n", - " \n", - " plt.show()\n", - "\n", - " \n", - " return epoch_loss / len(iterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, iterator, criterion):\n", - " \n", - " model.eval()\n", - " \n", - " epoch_loss = 0\n", - " \n", - " history = []\n", - " \n", - " with torch.no_grad():\n", - " \n", - " for i, batch in enumerate(iterator):\n", - "\n", - " src = batch.src\n", - " trg = batch.trg\n", - "\n", - " output = model(src, trg, 0) #turn off teacher forcing\n", - "\n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - "\n", - " output = output[1:].view(-1, output.shape[-1])\n", - " trg = trg[1:].view(-1)\n", - "\n", - " #trg = [(trg sent len - 1) * batch size]\n", - " #output = [(trg sent len - 1) * batch size, output dim]\n", - "\n", - " loss = criterion(output, trg)\n", - " \n", - " epoch_loss += loss.item()\n", - " \n", - " return epoch_loss / len(iterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [], - "source": [ - "def epoch_time(start_time, end_time):\n", - " elapsed_time = end_time - start_time\n", - " elapsed_mins = int(elapsed_time / 60)\n", - " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n", - " return elapsed_mins, elapsed_secs" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [], - "source": [ - "train_history = []\n", - "valid_history = []\n", - "\n", - "N_EPOCHS = 10\n", - "CLIP = 1\n", - "\n", - "best_valid_loss = float('inf')" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch: 10 | Time: 1m 15s\n", - "\tTrain Loss: 3.074 | Train PPL: 21.637\n", - "\t Val. Loss: 4.600 | Val. PPL: 99.469\n" - ] - } - ], - "source": [ - "for epoch in range(N_EPOCHS):\n", - " \n", - " start_time = time.time()\n", - " \n", - " train_loss = train(model, train_iterator, optimizer, criterion, CLIP, train_history, valid_history)\n", - " valid_loss = evaluate(model, valid_iterator, criterion)\n", - " \n", - " end_time = time.time()\n", - " \n", - " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", - " \n", - " if valid_loss < best_valid_loss:\n", - " best_valid_loss = valid_loss\n", - " torch.save(model.state_dict(), 'tut1-model.pt')\n", - " \n", - " train_history.append(train_loss)\n", - " valid_history.append(valid_loss)\n", - " print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n", - " print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n", - " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. 
PPL: {math.exp(valid_loss):7.3f}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Let's take a look at our network quality__:" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "metadata": {}, - "outputs": [], - "source": [ - "import utils\n", - "import imp\n", - "imp.reload(utils)\n", - "generate_translation = utils.generate_translation\n", - "remove_tech_tokens = utils.remove_tech_tokens\n", - "get_text = utils.get_text\n", - "flatten = utils.flatten" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "metadata": {}, - "outputs": [], - "source": [ - "batch = next(iter(test_iterator))" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original: each room has a tv .\n", - "Generated: each room is equipped with a tv . .\n", - "\n", - "Original: you will find a 24 - hour front desk at the property .\n", - "Generated: the hotel offers a 24 - hour front desk . 
property .\n", - "\n" - ] - } - ], - "source": [ - "for idx in [1,2]:\n", - " src = batch.src[:, idx:idx+1]\n", - " trg = batch.trg[:, idx:idx+1]\n", - " generate_translation(src, trg, model, TRG.vocab)" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "metadata": {}, - "outputs": [], - "source": [ - "from nltk.translate.bleu_score import corpus_bleu\n", - "\n", - "# \"\"\" Estimates corpora-level BLEU score of model's translations given inp and reference out \"\"\"\n", - "# translations, _ = model.translate_lines(inp_lines, **flags)\n", - "# # Note: if you experience out-of-memory error, split input lines into batches and translate separately\n", - "# return corpus_bleu([[ref] for ref in out_lines], translations) * 100" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "metadata": {}, - "outputs": [], - "source": [ - "import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "59it [00:03, 18.95it/s]\n" - ] - } - ], - "source": [ - "original_text = []\n", - "generated_text = []\n", - "model.eval()\n", - "with torch.no_grad():\n", - "\n", - " for i, batch in tqdm.tqdm(enumerate(test_iterator)):\n", - "\n", - " src = batch.src\n", - " trg = batch.trg\n", - "\n", - " output = model(src, trg, 0) #turn off teacher forcing\n", - "\n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - "\n", - " output = output.argmax(dim=-1)\n", - " \n", - " original_text.extend([get_text(x, TRG.vocab) for x in trg.cpu().numpy().T])\n", - " generated_text.extend([get_text(x, TRG.vocab) for x in output.detach().cpu().numpy().T])\n", - "\n", - "# original_text = flatten(original_text)\n", - "# generated_text = flatten(generated_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "14.449864542777785" - ] - }, - 
"execution_count": 150, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "corpus_bleu([[text] for text in original_text], generated_text) * 100" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Baseline solution BLEU score is quite low. Try to achieve at least __18__ BLEU on the test set. \n", - "The checkpoints are:\n", - "\n", - "* __18__ - minimal score to submit the homework, 30% of points\n", - "\n", - "* __20__ - good score, 70% of points\n", - "\n", - "* __25__ - excellent score, 100% of points" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "colab": { - "collapsed_sections": [], - "machine_shape": "hm", - "name": "homework.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Py3 research env", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/homeworks/lab01_nlp/.ipynb_checkpoints/lab1_01_nlp_part1_embedding_based_mt-checkpoint.ipynb b/homeworks/lab01_nlp/.ipynb_checkpoints/lab1_01_nlp_part1_embedding_based_mt-checkpoint.ipynb deleted file mode 100644 index 2bcd322..0000000 --- a/homeworks/lab01_nlp/.ipynb_checkpoints/lab1_01_nlp_part1_embedding_based_mt-checkpoint.ipynb +++ /dev/null @@ -1,753 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "eulvfJWl7ueY" - }, - "source": [ - "# Lab 1\n", - "\n", - "\n", - "## Part 1: Bilingual dictionary induction and unsupervised embedding-based MT (30%)\n", - "*Note: this homework is based on materials from yandexdataschool [NLP course](https://github.com/yandexdataschool/nlp_course/). 
Feel free to check this awesome course if you wish to dig deeper.*\n", - "\n", - "*Refined by [Nikolay Karpachev](https://www.linkedin.com/in/nikolay-karpachev-b0146a104/)*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "fV4rIjxa7uei" - }, - "source": [ - "**In this homework** **YOU** will make machine translation system without using parallel corpora, alignment, attention, 100500 depth super-cool recurrent neural network and all that kind superstuff.\n", - "\n", - "But even without parallel corpora this system can be good enough (hopefully), in particular for similar languages, e.g. Ukrainian and Russian. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "idSYq2GU7uew" - }, - "source": [ - "### Frament of the Swadesh list for some slavic languages\n", - "\n", - "The Swadesh list is a lexicostatistical stuff. It's named after American linguist Morris Swadesh and contains basic lexis. This list are used to define subgroupings of languages, its relatedness.\n", - "\n", - "So we can see some kind of word invariance for different Slavic languages.\n", - "\n", - "\n", - "| Russian | Belorussian | Ukrainian | Polish | Czech | Bulgarian |\n", - "|-----------------|--------------------------|-------------------------|--------------------|-------------------------------|-----------------------|\n", - "| женщина | жанчына, кабета, баба | жінка | kobieta | žena | жена |\n", - "| мужчина | мужчына | чоловік, мужчина | mężczyzna | muž | мъж |\n", - "| человек | чалавек | людина, чоловік | człowiek | člověk | човек |\n", - "| ребёнок, дитя | дзіця, дзіцёнак, немаўля | дитина, дитя | dziecko | dítě | дете |\n", - "| жена | жонка | дружина, жінка | żona | žena, manželka, choť | съпруга, жена |\n", - "| муж | муж, гаспадар | чоловiк, муж | mąż | muž, manžel, choť | съпруг, мъж |\n", - "| мать, мама | маці, матка | мати, матір, неня, мама | matka | matka, máma, 'стар.' 
mateř | майка |\n", - "| отец, тятя | бацька, тата | батько, тато, татусь | ojciec | otec | баща, татко |\n", - "| много | шмат, багата | багато | wiele | mnoho, hodně | много |\n", - "| несколько | некалькі, колькі | декілька, кілька | kilka | několik, pár, trocha | няколко |\n", - "| другой, иной | іншы | інший | inny | druhý, jiný | друг |\n", - "| зверь, животное | жывёла, звер, істота | тварина, звір | zwierzę | zvíře | животно |\n", - "| рыба | рыба | риба | ryba | ryba | риба |\n", - "| птица | птушка | птах, птиця | ptak | pták | птица |\n", - "| собака, пёс | сабака | собака, пес | pies | pes | куче, пес |\n", - "| вошь | вош | воша | wesz | veš | въшка |\n", - "| змея, гад | змяя | змія, гад | wąż | had | змия |\n", - "| червь, червяк | чарвяк | хробак, черв'як | robak | červ | червей |\n", - "| дерево | дрэва | дерево | drzewo | strom, dřevo | дърво |\n", - "| лес | лес | ліс | las | les | гора, лес |\n", - "| палка | кій, палка | палиця | patyk, pręt, pałka | hůl, klacek, prut, kůl, pálka | палка, пръчка, бастун |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "cNM3_fjr7ue2" - }, - "source": [ - "But the context distribution of these languages demonstrates even more invariance. And we can use this fact for our for our purposes." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "YLppwa527ue6" - }, - "source": [ - "## Data" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "lYBGKAUn7ue_" - }, - "outputs": [], - "source": [ - "import gensim\n", - "import numpy as np\n", - "from gensim.models import KeyedVectors" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MwGoVhRA7ufP" - }, - "source": [ - "In this notebook we're going to use pretrained word vectors - FastText (original paper - https://arxiv.org/abs/1607.04606).\n", - "\n", - "You can download them from the official [website](https://fasttext.cc/docs/en/crawl-vectors.html). We're going to need embeddings for Russian and Ukrainian languages. Please use word2vec-compatible format (.text)." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "u1JjQv_97ufT" - }, - "outputs": [], - "source": [ - "uk_emb = KeyedVectors.load_word2vec_format(\"cc.uk.300.vec\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ffzuept_7ufd" - }, - "outputs": [], - "source": [ - "ru_emb = KeyedVectors.load_word2vec_format(\"cc.ru.300.vec\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "nTkXfT0W7ufk" - }, - "outputs": [], - "source": [ - "ru_emb.most_similar([ru_emb[\"август\"]], topn=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "vdBA8lcg7ufs" - }, - "outputs": [], - "source": [ - "uk_emb.most_similar([uk_emb[\"серпень\"]])" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_yJvcKXO7uf0" - }, - "outputs": [], - "source": [ - 
"ru_emb.most_similar([uk_emb[\"серпень\"]])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "pNdYAR1q7uf6" - }, - "source": [ - "Load small dictionaries for correspoinding words pairs as trainset and testset." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "35d_DAK67uf8" - }, - "outputs": [], - "source": [ - "def load_word_pairs(filename):\n", - " uk_ru_pairs = []\n", - " uk_vectors = []\n", - " ru_vectors = []\n", - " with open(filename, \"r\") as inpf:\n", - " for line in inpf:\n", - " uk, ru = line.rstrip().split(\"\\t\")\n", - " if uk not in uk_emb or ru not in ru_emb:\n", - " continue\n", - " uk_ru_pairs.append((uk, ru))\n", - " uk_vectors.append(uk_emb[uk])\n", - " ru_vectors.append(ru_emb[ru])\n", - " return uk_ru_pairs, np.array(uk_vectors), np.array(ru_vectors)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "wkNL602WHJyO" - }, - "outputs": [], - "source": [ - "!wget -O ukr_rus.train.txt http://tiny.cc/jfgecz" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "uoclU6JcHCcn" - }, - "outputs": [], - "source": [ - "!wget -O ukr_rus.test.txt http://tiny.cc/6zoeez" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "05BqsdSK7ugD" - }, - "outputs": [], - "source": [ - "uk_ru_train, X_train, Y_train = load_word_pairs(\"ukr_rus.train.txt\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "zQOZw51r7ugL" - }, - "outputs": [], - "source": [ - "uk_ru_test, X_test, Y_test = load_word_pairs(\"ukr_rus.test.txt\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "-ZBBNvpz7ugQ" - }, - "source": [ - "## Embedding space mapping 
(0.3 pts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "x_Dhk5gL7ugS" - }, - "source": [ - "Let $x_i \\in \\mathrm{R}^d$ be the distributed representation of word $i$ in the source language, and $y_i \\in \\mathrm{R}^d$ is the vector representation of its translation. Our purpose is to learn such linear transform $W$ that minimizes euclidian distance between $Wx_i$ and $y_i$ for some subset of word embeddings. Thus we can formulate so-called Procrustes problem:\n", - "\n", - "$$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2$$\n", - "or\n", - "$$W^*= \\arg\\min_W ||WX - Y||_F$$\n", - "\n", - "where $||*||_F$ - Frobenius norm." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "acOjDdtL7ugY" - }, - "source": [ - "$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2$ looks like simple multiple linear regression (without intercept fit). So let's code." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Lb-KN1be7uga" - }, - "outputs": [], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "\n", - "# YOUR CODE HERE\n", - "# mapping = ...\n", - "# -------" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "X7tqJwoY7ugf" - }, - "source": [ - "Let's take a look at neigbours of the vector of word _\"серпень\"_ (_\"август\"_ in Russian) after linear transform." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "31SrFSbn7ugi" - }, - "outputs": [], - "source": [ - "august = mapping.predict(uk_emb[\"серпень\"].reshape(1, -1))\n", - "ru_emb.most_similar(august)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "okSkjk597ugo" - }, - "source": [ - "We can see that neighbourhood of this embedding cosists of different months, but right variant is on the ninth place." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "o2uY6Y9B7ugt" - }, - "source": [ - "As quality measure we will use precision top-1, top-5 and top-10 (for each transformed Ukrainian embedding we count how many right target pairs are found in top N nearest neighbours in Russian embedding space)." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "zptuho8LAfIE" - }, - "outputs": [], - "source": [ - "def precision(pairs, mapped_vectors, topn=1):\n", - " \"\"\"\n", - " :args:\n", - " pairs = list of right word pairs [(uk_word_0, ru_word_0), ...]\n", - " mapped_vectors = list of embeddings after mapping from source embedding space to destination embedding space\n", - " topn = the number of nearest neighbours in destination embedding space to choose from\n", - " :returns:\n", - " precision_val, float number, total number of words for those we can find right translation at top K.\n", - " \"\"\"\n", - " assert len(pairs) == len(mapped_vectors)\n", - " num_matches = 0\n", - " for i, (_, ru) in enumerate(pairs):\n", - " # YOUR CODE HERE\n", - " precision_val = num_matches / len(pairs)\n", - " return precision_val" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "duhj9hpv7ugy" - }, - "outputs": [], - "source": [ - "assert precision([(\"серпень\", \"август\")], august, topn=5) == 
0.0\n", - "assert precision([(\"серпень\", \"август\")], august, topn=9) == 1.0\n", - "assert precision([(\"серпень\", \"август\")], august, topn=10) == 1.0" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "0-iyd5gP7ug5" - }, - "outputs": [], - "source": [ - "assert precision(uk_ru_test, X_test) == 0.0\n", - "assert precision(uk_ru_test, Y_test) == 1.0" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "U-ssEJ3x7uhA" - }, - "outputs": [], - "source": [ - "precision_top1 = precision(uk_ru_test, mapping.predict(X_test), 1)\n", - "precision_top5 = precision(uk_ru_test, mapping.predict(X_test), 5)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "7K-hy7a6Ksn2" - }, - "outputs": [], - "source": [ - "print(precision_top1)\n", - "print(precision_top5)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hf6Ou8bx7uhH" - }, - "source": [ - "## Making it better (orthogonal Procrustean problem) (0.3 pts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "4oLs-drN7uhK" - }, - "source": [ - "It can be shown (see original paper) that a self-consistent linear mapping between semantic spaces should be orthogonal. \n", - "We can restrict transform $W$ to be orthogonal. Then we will solve next problem:\n", - "\n", - "$$W^*= \\arg\\min_W ||WX - Y||_F \\text{, where: } W^TW = I$$\n", - "\n", - "$$I \\text{- identity matrix}$$\n", - "\n", - "Instead of making yet another regression problem we can find optimal orthogonal transformation using singular value decomposition. 
It turns out that optimal transformation $W^*$ can be expressed via SVD components:\n", - "$$X^TY=U\\Sigma V^T\\text{, singular value decompostion}$$\n", - "$$W^*=UV^T$$" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_KSaRJFGMFiJ" - }, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "DdFQ7qti7uhL" - }, - "outputs": [], - "source": [ - "def learn_transform(X_train, Y_train):\n", - " \"\"\" \n", - " :returns: W* : float matrix[emb_dim x emb_dim] as defined in formulae above\n", - " \"\"\"\n", - " # YOUR CODE GOES HERE\n", - " # compute orthogonal embedding space mapping\n", - " # mapping = ...\n", - "\n", - " return mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "7X7QfYDd7uhQ" - }, - "outputs": [], - "source": [ - "W = learn_transform(X_train, Y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "OVOFYYa37uhX" - }, - "outputs": [], - "source": [ - "ru_emb.most_similar([np.matmul(uk_emb[\"серпень\"], W)])" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "r297sYP37uhb" - }, - "outputs": [], - "source": [ - "print(precision(uk_ru_test, np.matmul(X_test, W)))\n", - "print(precision(uk_ru_test, np.matmul(X_test, W), 5))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hvUZ72U5AfJg" - }, - "source": [ - "## Unsupervised embedding-based MT (0.4 pts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "LLyuVfHBLrJn" - }, - "source": [ - "Now, let's build our word embeddings-based translator!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "tPAURW1CMuP7" - }, - "source": [ - "Firstly, download OPUS Tatoeba corpus." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "F80kUKzQMsDu" - }, - "outputs": [], - "source": [ - "!wget https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/mono/uk.txt.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "0CGFZoxCUVf1" - }, - "outputs": [], - "source": [ - "!gzip -d ./uk.txt.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "2MV3VvoVUX5U" - }, - "outputs": [], - "source": [ - "with open('./uk.txt', 'r') as f:\n", - " uk_corpus = f.readlines()" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "tU7nPVf0UhbI" - }, - "outputs": [], - "source": [ - "# To save your time and CPU, feel free to use first 1000 sentences of the corpus\n", - "uk_corpus = uk_corpus[:1000]" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "FLN8dBOXAfJ1" - }, - "outputs": [], - "source": [ - "# Any necessary preprocessing if needed\n", - "# YOUR CODE HERE" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "FGksC7l_NMi9" - }, - "outputs": [], - "source": [ - "def translate(sentence):\n", - " \"\"\"\n", - " :args:\n", - " sentence - sentence in Ukrainian (str)\n", - " :returns:\n", - " translation - sentence in Russian (str)\n", - "\n", - " * find ukrainian embedding for each word in sentence\n", - " * transform ukrainian embedding vector\n", - " * find nearest russian word and replace\n", - " \"\"\"\n", - " # YOUR CODE GOES HERE\n", - "\n", - " return \" \".join(translated)" - 
] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "4hbbMy-tNxlf" - }, - "outputs": [], - "source": [ - "assert translate(\".\") == \".\"\n", - "assert translate(\"1 , 3\") == \"1 , 3\"\n", - "assert translate(\"кіт зловив мишу\") == \"кот поймал мышку\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ia6I2ce7O_HI" - }, - "source": [ - "Now you can play with your model and try to get as accurate translations as possible. **Note**: one big issue is out-of-vocabulary words. Try to think of various ways of handling it (you can start with translating each of them to a special **UNK** token and then move to more sophisticated approaches). Good luck!" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ap1W7ZCeOAVU" - }, - "outputs": [], - "source": [ - "for sent in uk_corpus[::10]:\n", - " print(translate(sent))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Great! \n", - "See second notebook for the Neural Machine Translation assignment." 
- ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "colab": { - "collapsed_sections": [], - "machine_shape": "hm", - "name": "homework.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Py3 Research", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/homeworks/lab01_nlp/.ipynb_checkpoints/lab1_02_nlp_part2_nmt-checkpoint.ipynb b/homeworks/lab01_nlp/.ipynb_checkpoints/lab1_02_nlp_part2_nmt-checkpoint.ipynb deleted file mode 100644 index eb346fa..0000000 --- a/homeworks/lab01_nlp/.ipynb_checkpoints/lab1_02_nlp_part2_nmt-checkpoint.ipynb +++ /dev/null @@ -1,941 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Lab 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Part 2: Neural Machine Translation in the wild\n", - "In the third homework you are supposed to get the best translation you can for the EN-RU translation task.\n", - "\n", - "Basic approach using RNNs as encoder and decoder is implemented for you. \n", - "\n", - "Your ultimate task is to use the techniques we've covered, e.g.\n", - "\n", - "* Optimization enhancements (e.g. learning rate decay)\n", - "\n", - "* CNN encoder (with or without positional encoding)\n", - "\n", - "* attention/self-attention mechanism\n", - "\n", - "* pretraining the language model\n", - "\n", - "* [Byte Pair Encoding](https://github.com/rsennrich/subword-nmt)\n", - "\n", - "* or just fine-tunning BERT ;)\n", - "\n", - "to improve the translation quality. 
\n", - "\n", - "__Please use at least three different approaches/models and compare them (translation quality/complexity/training and evaluation time).__\n", - "\n", - "Write down some summary on your experiments and illustrate it with convergence plots/metrics and your thoughts. Just like you would approach a real problem." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# You might need to install the libraries below. Do it in the desired environment\n", - "# if you are working locally.\n", - "\n", - "# ! pip install subword-nmt\n", - "# ! pip install nltk\n", - "# ! pip install torchtext" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Thanks to YSDA NLP course team for the data\n", - "# (who thanks tilda and deephack teams for the data in their turn)\n", - "\n", - "import os\n", - "path_do_data = '../../datasets/Machine_translation_EN_RU/data.txt'\n", - "if not os.path.exists(path_do_data):\n", - " print(\"Dataset not found locally. Downloading from github. Loading special files as well\")\n", - " !wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/master/datasets/Machine_translation_EN_RU/data.txt -nc\n", - " path_do_data = './data.txt'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists('./utils.py'):\n", - " print(\"utils file not found locally. Downloading from github.\")\n", - " !wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/master/homeworks_advanced/Lab1_NLP/utils.py -nc\n", - "\n", - "if not os.path.exists('./my_network.py'):\n", - " print(\"network file not found locally. 
Downloading from github.\")\n", - " !wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/master/homeworks_advanced/Lab1_NLP/my_network.py -nc" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "\n", - "import torchtext\n", - "from torchtext.legacy.datasets import TranslationDataset, Multi30k\n", - "from torchtext.legacy.data import Field, BucketIterator, TabularDataset\n", - "\n", - "import spacy\n", - "\n", - "import random\n", - "import math\n", - "import time\n", - "\n", - "import matplotlib\n", - "matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "from IPython.display import clear_output\n", - "\n", - "from nltk.tokenize import WordPunctTokenizer\n", - "from subword_nmt.learn_bpe import learn_bpe\n", - "from subword_nmt.apply_bpe import BPE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Main part\n", - "__Here comes the preprocessing. 
Do not hesitate to use BPE or more complex preprocessing ;)__" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "tokenizer_W = WordPunctTokenizer()\n", - "def tokenize(x, tokenizer=tokenizer_W):\n", - " return tokenizer.tokenize(x.lower())" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "SRC = Field(tokenize=tokenize,\n", - " init_token = '', \n", - " eos_token = '', \n", - " lower = True)\n", - "\n", - "TRG = Field(tokenize=tokenize,\n", - " init_token = '', \n", - " eos_token = '', \n", - " lower = True)\n", - "\n", - "dataset = TabularDataset(\n", - " path=path_do_data,\n", - " format='tsv',\n", - " fields=[('trg', TRG), ('src', SRC)]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "train_data, valid_data, test_data = dataset.split(split_ratio=[0.8, 0.15, 0.05])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of training examples: 40000\n", - "Number of validation examples: 2500\n", - "Number of testing examples: 7500\n" - ] - } - ], - "source": [ - "print(f\"Number of training examples: {len(train_data.examples)}\")\n", - "print(f\"Number of validation examples: {len(valid_data.examples)}\")\n", - "print(f\"Number of testing examples: {len(test_data.examples)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "SRC.build_vocab(train_data, min_freq = 3)\n", - "TRG.build_vocab(train_data, min_freq = 3)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Unique tokens in source (ru) vocabulary: 9267\n", - "Unique tokens in target (en) vocabulary: 6699\n" - ] - } - ], - "source": [ - 
"print(f\"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}\")\n", - "print(f\"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here are tokens from original (RU) corpus:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['',\n", - " '29',\n", - " 'соль',\n", - " 'комо',\n", - " '―',\n", - " 'электрическая',\n", - " 'ming',\n", - " 'утренний',\n", - " 'детском',\n", - " 'таунус']" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "SRC.vocab.itos[::1000]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And from target (EN) corpus:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['', 'king', 'buffets', 'catch', 'media', 'schedule', 'maraunenhof']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "TRG.vocab.itos[::1000]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And here is example from train dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'trg': ['laundry', 'service', 'is', 'provided', '.'], 'src': ['помимо', 'этого', ',', 'гостям', 'предоставляются', 'услуги', 'прачечной', '.']}\n" - ] - } - ], - "source": [ - "print(vars(train_data.examples[9]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's check the length distributions:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length distribution in Train data\n" - ] - }, - { - "data": { - "image/png": "\n", - 
"text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "src_length = map(len, [vars(x)['src'] for x in train_data.examples])\n", - "trg_length = map(len, [vars(x)['trg'] for x in train_data.examples])\n", - "\n", - "print('Length distribution in Train data')\n", - "plt.figure(figsize=[8, 4])\n", - "plt.subplot(1, 2, 1)\n", - "plt.title(\"source length\")\n", - "plt.hist(list(src_length), bins=20);\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.title(\"translation length\")\n", - "plt.hist(list(trg_length), bins=20);" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length distribution in Test data\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "src_length = map(len, [vars(x)['src'] for x in test_data.examples])\n", - "trg_length = map(len, [vars(x)['trg'] for x in test_data.examples])\n", - "\n", - "print('Length distribution in Test data')\n", - "plt.figure(figsize=[8, 4])\n", - "plt.subplot(1, 2, 1)\n", - "plt.title(\"source length\")\n", - "plt.hist(list(src_length), bins=20);\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.title(\"translation length\")\n", - "plt.hist(list(trg_length), bins=20);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model side\n", - "__Here comes simple pipeline of NMT model learning. It almost copies the week03 practice__" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "device(type='cuda', index=1)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "device" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "def _len_sort_key(x):\n", - " return len(x.src)\n", - "\n", - "BATCH_SIZE = 128\n", - "\n", - "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n", - " (train_data, valid_data, test_data), \n", - " batch_size = BATCH_SIZE, \n", - " device = device,\n", - " sort_key=_len_sort_key\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[torchtext.data.batch.Batch of size 128]\n", - "\t[.trg]:[torch.cuda.LongTensor of size 55x128 (GPU 1)]\n", - "\t[.src]:[torch.cuda.LongTensor of size 59x128 (GPU 
1)]\n", - "torch.Size([59, 128]) torch.Size([55, 128])\n" - ] - } - ], - "source": [ - "for x in train_iterator:\n", - " break\n", - "print(x)\n", - "print(x.src.shape, x.trg.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "import my_network\n", - "Encoder = my_network.Encoder\n", - "Decoder = my_network.Decoder\n", - "Seq2Seq = my_network.Seq2Seq" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "INPUT_DIM = len(SRC.vocab)\n", - "OUTPUT_DIM = len(TRG.vocab)\n", - "ENC_EMB_DIM = 256\n", - "DEC_EMB_DIM = 256\n", - "HID_DIM = 512\n", - "N_LAYERS = 2\n", - "ENC_DROPOUT = 0.5\n", - "DEC_DROPOUT = 0.5\n", - "\n", - "enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)\n", - "dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)\n", - "\n", - "# dont forget to put the model to the right device\n", - "model = Seq2Seq(enc, dec, device).to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Seq2Seq(\n", - " (encoder): Encoder(\n", - " (embedding): Embedding(9267, 256)\n", - " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n", - " (dropout): Dropout(p=0.5, inplace=False)\n", - " )\n", - " (decoder): Decoder(\n", - " (embedding): Embedding(6699, 256)\n", - " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n", - " (out): Linear(in_features=512, out_features=6699, bias=True)\n", - " (dropout): Dropout(p=0.5, inplace=False)\n", - " )\n", - ")" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def init_weights(m):\n", - " # \n", - " for name, param in m.named_parameters():\n", - " nn.init.uniform_(param, -0.08, 0.08)\n", - " \n", - "model.apply(init_weights)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "The model has 14,880,299 trainable parameters\n" - ] - } - ], - "source": [ - "def count_parameters(model):\n", - " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", - "\n", - "print(f'The model has {count_parameters(model):,} trainable parameters')" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "PAD_IDX = TRG.vocab.stoi['']\n", - "optimizer = optim.Adam(model.parameters())\n", - "criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "def train(model, iterator, optimizer, criterion, clip, train_history=None, valid_history=None):\n", - " model.train()\n", - " \n", - " epoch_loss = 0\n", - " history = []\n", - " for i, batch in enumerate(iterator):\n", - " \n", - " src = batch.src\n", - " trg = batch.trg\n", - " \n", - " optimizer.zero_grad()\n", - " \n", - " output = model(src, trg)\n", - " \n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - " \n", - " output = output[1:].view(-1, output.shape[-1])\n", - " trg = trg[1:].view(-1)\n", - " \n", - " #trg = [(trg sent len - 1) * batch size]\n", - " #output = [(trg sent len - 1) * batch size, output dim]\n", - " \n", - " loss = criterion(output, trg)\n", - " \n", - " loss.backward()\n", - " \n", - " # Let's clip the gradient\n", - " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n", - " \n", - " optimizer.step()\n", - " \n", - " epoch_loss += loss.item()\n", - " \n", - " history.append(loss.cpu().data.numpy())\n", - " if (i+1)%10==0:\n", - " fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))\n", - "\n", - " clear_output(True)\n", - " ax[0].plot(history, label='train loss')\n", - " ax[0].set_xlabel('Batch')\n", - " ax[0].set_title('Train loss')\n", - " if train_history is not None:\n", - " 
ax[1].plot(train_history, label='general train history')\n", - " ax[1].set_xlabel('Epoch')\n", - " if valid_history is not None:\n", - " ax[1].plot(valid_history, label='general valid history')\n", - " plt.legend()\n", - " \n", - " plt.show()\n", - "\n", - " \n", - " return epoch_loss / len(iterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, iterator, criterion):\n", - " \n", - " model.eval()\n", - " \n", - " epoch_loss = 0\n", - " \n", - " history = []\n", - " \n", - " with torch.no_grad():\n", - " \n", - " for i, batch in enumerate(iterator):\n", - "\n", - " src = batch.src\n", - " trg = batch.trg\n", - "\n", - " output = model(src, trg, 0) #turn off teacher forcing\n", - "\n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - "\n", - " output = output[1:].view(-1, output.shape[-1])\n", - " trg = trg[1:].view(-1)\n", - "\n", - " #trg = [(trg sent len - 1) * batch size]\n", - " #output = [(trg sent len - 1) * batch size, output dim]\n", - "\n", - " loss = criterion(output, trg)\n", - " \n", - " epoch_loss += loss.item()\n", - " \n", - " return epoch_loss / len(iterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "def epoch_time(start_time, end_time):\n", - " elapsed_time = end_time - start_time\n", - " elapsed_mins = int(elapsed_time / 60)\n", - " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n", - " return elapsed_mins, elapsed_secs" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "train_history = []\n", - "valid_history = []\n", - "\n", - "N_EPOCHS = 10\n", - "CLIP = 1\n", - "\n", - "best_valid_loss = float('inf')" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch: 10 | Time: 1m 10s\n", - "\tTrain Loss: 2.998 | Train PPL: 20.040\n", - "\t Val. Loss: 4.710 | Val. PPL: 111.007\n" - ] - } - ], - "source": [ - "for epoch in range(N_EPOCHS):\n", - " \n", - " start_time = time.time()\n", - " \n", - " train_loss = train(model, train_iterator, optimizer, criterion, CLIP, train_history, valid_history)\n", - " valid_loss = evaluate(model, valid_iterator, criterion)\n", - " \n", - " end_time = time.time()\n", - " \n", - " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", - " \n", - " if valid_loss < best_valid_loss:\n", - " best_valid_loss = valid_loss\n", - " torch.save(model.state_dict(), 'tut1-model.pt')\n", - " \n", - " train_history.append(train_loss)\n", - " valid_history.append(valid_loss)\n", - " print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n", - " print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n", - " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. 
PPL: {math.exp(valid_loss):7.3f}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Let's take a look at our network quality__:" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [], - "source": [ - "del utils" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [], - "source": [ - "import utils\n", - "import imp\n", - "imp.reload(utils)\n", - "generate_translation = utils.generate_translation\n", - "remove_tech_tokens = utils.remove_tech_tokens\n", - "get_text = utils.get_text\n", - "flatten = utils.flatten" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "batch = next(iter(test_iterator))" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original: there is a 24 - hour front desk at the property .\n", - "Generated: the property offers a 24 - hour front desk . .\n", - "\n", - "Original: this property also features free wifi .\n", - "Generated: free wifi access . . . 
.\n", - "\n" - ] - } - ], - "source": [ - "for idx in [1,2]:\n", - " src = batch.src[:, idx:idx+1]\n", - " trg = batch.trg[:, idx:idx+1]\n", - " generate_translation(src, trg, model, TRG.vocab)" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [], - "source": [ - "from nltk.translate.bleu_score import corpus_bleu\n", - "\n", - "# \"\"\" Estimates corpora-level BLEU score of model's translations given inp and reference out \"\"\"\n", - "# translations, _ = model.translate_lines(inp_lines, **flags)\n", - "# # Note: if you experience out-of-memory error, split input lines into batches and translate separately\n", - "# return corpus_bleu([[ref] for ref in out_lines], translations) * 100" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [], - "source": [ - "import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "59it [00:03, 18.87it/s]\n" - ] - } - ], - "source": [ - "original_text = []\n", - "generated_text = []\n", - "model.eval()\n", - "with torch.no_grad():\n", - "\n", - " for i, batch in tqdm.tqdm(enumerate(test_iterator)):\n", - "\n", - " src = batch.src\n", - " trg = batch.trg\n", - "\n", - " output = model(src, trg, 0) #turn off teacher forcing\n", - "\n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - "\n", - " output = output.argmax(dim=-1)\n", - " \n", - " original_text.extend([get_text(x, TRG.vocab) for x in trg.cpu().numpy().T])\n", - " generated_text.extend([get_text(x, TRG.vocab) for x in output[1:].detach().cpu().numpy().T])\n", - "\n", - "# original_text = flatten(original_text)\n", - "# generated_text = flatten(generated_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "14.139920232081806" - ] - }, - 
"execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "corpus_bleu([[text] for text in original_text], generated_text) * 100" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Baseline solution BLEU score is quite low. Try to achieve at least __24__ BLEU on the test set. \n", - "The checkpoints are:\n", - "\n", - "* __22__ - minimal score to submit the homework, 30% of points\n", - "\n", - "* __27__ - good score, 70% of points\n", - "\n", - "* __29__ - excellent score, 100% of points" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "colab": { - "collapsed_sections": [], - "machine_shape": "hm", - "name": "homework.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Py3 Research", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/homeworks/lab01_nlp/README.md b/homeworks/lab01_nlp/README.md deleted file mode 100644 index 3b94c95..0000000 --- a/homeworks/lab01_nlp/README.md +++ /dev/null @@ -1,6 +0,0 @@ -Lab assignment #1 - -* Part 1: Embedding-based Machine Translation: -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/homeworks/lab01_nlp/lab1_01_nlp_part1_embedding_based_mt.ipynb) - -* Part 2: NMT: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/homeworks/lab01_nlp/lab1_02_nlp_part2_nmt.ipynb) diff --git a/homeworks/lab01_nlp/lab1_01_nlp_part1_embedding_based_mt.ipynb 
b/homeworks/lab01_nlp/lab1_01_nlp_part1_embedding_based_mt.ipynb deleted file mode 100644 index 2bcd322..0000000 --- a/homeworks/lab01_nlp/lab1_01_nlp_part1_embedding_based_mt.ipynb +++ /dev/null @@ -1,753 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "eulvfJWl7ueY" - }, - "source": [ - "# Lab 1\n", - "\n", - "\n", - "## Part 1: Bilingual dictionary induction and unsupervised embedding-based MT (30%)\n", - "*Note: this homework is based on materials from yandexdataschool [NLP course](https://github.com/yandexdataschool/nlp_course/). Feel free to check this awesome course if you wish to dig deeper.*\n", - "\n", - "*Refined by [Nikolay Karpachev](https://www.linkedin.com/in/nikolay-karpachev-b0146a104/)*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "fV4rIjxa7uei" - }, - "source": [ - "**In this homework** **YOU** will make machine translation system without using parallel corpora, alignment, attention, 100500 depth super-cool recurrent neural network and all that kind superstuff.\n", - "\n", - "But even without parallel corpora this system can be good enough (hopefully), in particular for similar languages, e.g. Ukrainian and Russian. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "idSYq2GU7uew" - }, - "source": [ - "### Frament of the Swadesh list for some slavic languages\n", - "\n", - "The Swadesh list is a lexicostatistical stuff. It's named after American linguist Morris Swadesh and contains basic lexis. 
This list are used to define subgroupings of languages, its relatedness.\n", - "\n", - "So we can see some kind of word invariance for different Slavic languages.\n", - "\n", - "\n", - "| Russian | Belorussian | Ukrainian | Polish | Czech | Bulgarian |\n", - "|-----------------|--------------------------|-------------------------|--------------------|-------------------------------|-----------------------|\n", - "| женщина | жанчына, кабета, баба | жінка | kobieta | žena | жена |\n", - "| мужчина | мужчына | чоловік, мужчина | mężczyzna | muž | мъж |\n", - "| человек | чалавек | людина, чоловік | człowiek | člověk | човек |\n", - "| ребёнок, дитя | дзіця, дзіцёнак, немаўля | дитина, дитя | dziecko | dítě | дете |\n", - "| жена | жонка | дружина, жінка | żona | žena, manželka, choť | съпруга, жена |\n", - "| муж | муж, гаспадар | чоловiк, муж | mąż | muž, manžel, choť | съпруг, мъж |\n", - "| мать, мама | маці, матка | мати, матір, неня, мама | matka | matka, máma, 'стар.' mateř | майка |\n", - "| отец, тятя | бацька, тата | батько, тато, татусь | ojciec | otec | баща, татко |\n", - "| много | шмат, багата | багато | wiele | mnoho, hodně | много |\n", - "| несколько | некалькі, колькі | декілька, кілька | kilka | několik, pár, trocha | няколко |\n", - "| другой, иной | іншы | інший | inny | druhý, jiný | друг |\n", - "| зверь, животное | жывёла, звер, істота | тварина, звір | zwierzę | zvíře | животно |\n", - "| рыба | рыба | риба | ryba | ryba | риба |\n", - "| птица | птушка | птах, птиця | ptak | pták | птица |\n", - "| собака, пёс | сабака | собака, пес | pies | pes | куче, пес |\n", - "| вошь | вош | воша | wesz | veš | въшка |\n", - "| змея, гад | змяя | змія, гад | wąż | had | змия |\n", - "| червь, червяк | чарвяк | хробак, черв'як | robak | červ | червей |\n", - "| дерево | дрэва | дерево | drzewo | strom, dřevo | дърво |\n", - "| лес | лес | ліс | las | les | гора, лес |\n", - "| палка | кій, палка | палиця | patyk, pręt, pałka | hůl, klacek, prut, kůl, 
pálka | палка, пръчка, бастун |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "cNM3_fjr7ue2" - }, - "source": [ - "But the context distribution of these languages demonstrates even more invariance. And we can use this fact for our for our purposes." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "YLppwa527ue6" - }, - "source": [ - "## Data" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "lYBGKAUn7ue_" - }, - "outputs": [], - "source": [ - "import gensim\n", - "import numpy as np\n", - "from gensim.models import KeyedVectors" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MwGoVhRA7ufP" - }, - "source": [ - "In this notebook we're going to use pretrained word vectors - FastText (original paper - https://arxiv.org/abs/1607.04606).\n", - "\n", - "You can download them from the official [website](https://fasttext.cc/docs/en/crawl-vectors.html). We're going to need embeddings for Russian and Ukrainian languages. Please use word2vec-compatible format (.text)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "u1JjQv_97ufT" - }, - "outputs": [], - "source": [ - "uk_emb = KeyedVectors.load_word2vec_format(\"cc.uk.300.vec\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ffzuept_7ufd" - }, - "outputs": [], - "source": [ - "ru_emb = KeyedVectors.load_word2vec_format(\"cc.ru.300.vec\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "nTkXfT0W7ufk" - }, - "outputs": [], - "source": [ - "ru_emb.most_similar([ru_emb[\"август\"]], topn=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "vdBA8lcg7ufs" - }, - "outputs": [], - "source": [ - "uk_emb.most_similar([uk_emb[\"серпень\"]])" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_yJvcKXO7uf0" - }, - "outputs": [], - "source": [ - "ru_emb.most_similar([uk_emb[\"серпень\"]])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "pNdYAR1q7uf6" - }, - "source": [ - "Load small dictionaries for correspoinding words pairs as trainset and testset." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "35d_DAK67uf8" - }, - "outputs": [], - "source": [ - "def load_word_pairs(filename):\n", - " uk_ru_pairs = []\n", - " uk_vectors = []\n", - " ru_vectors = []\n", - " with open(filename, \"r\") as inpf:\n", - " for line in inpf:\n", - " uk, ru = line.rstrip().split(\"\\t\")\n", - " if uk not in uk_emb or ru not in ru_emb:\n", - " continue\n", - " uk_ru_pairs.append((uk, ru))\n", - " uk_vectors.append(uk_emb[uk])\n", - " ru_vectors.append(ru_emb[ru])\n", - " return uk_ru_pairs, np.array(uk_vectors), np.array(ru_vectors)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "wkNL602WHJyO" - }, - "outputs": [], - "source": [ - "!wget -O ukr_rus.train.txt http://tiny.cc/jfgecz" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "uoclU6JcHCcn" - }, - "outputs": [], - "source": [ - "!wget -O ukr_rus.test.txt http://tiny.cc/6zoeez" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "05BqsdSK7ugD" - }, - "outputs": [], - "source": [ - "uk_ru_train, X_train, Y_train = load_word_pairs(\"ukr_rus.train.txt\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "zQOZw51r7ugL" - }, - "outputs": [], - "source": [ - "uk_ru_test, X_test, Y_test = load_word_pairs(\"ukr_rus.test.txt\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "-ZBBNvpz7ugQ" - }, - "source": [ - "## Embedding space mapping (0.3 pts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "x_Dhk5gL7ugS" - }, - "source": [ - "Let $x_i \\in \\mathrm{R}^d$ be the distributed representation of word $i$ in the source language, and $y_i \\in 
\\mathrm{R}^d$ is the vector representation of its translation. Our purpose is to learn such linear transform $W$ that minimizes euclidian distance between $Wx_i$ and $y_i$ for some subset of word embeddings. Thus we can formulate so-called Procrustes problem:\n", - "\n", - "$$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2$$\n", - "or\n", - "$$W^*= \\arg\\min_W ||WX - Y||_F$$\n", - "\n", - "where $||*||_F$ - Frobenius norm." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "acOjDdtL7ugY" - }, - "source": [ - "$W^*= \\arg\\min_W \\sum_{i=1}^n||Wx_i - y_i||_2$ looks like simple multiple linear regression (without intercept fit). So let's code." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Lb-KN1be7uga" - }, - "outputs": [], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "\n", - "# YOUR CODE HERE\n", - "# mapping = ...\n", - "# -------" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "X7tqJwoY7ugf" - }, - "source": [ - "Let's take a look at neigbours of the vector of word _\"серпень\"_ (_\"август\"_ in Russian) after linear transform." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "31SrFSbn7ugi" - }, - "outputs": [], - "source": [ - "august = mapping.predict(uk_emb[\"серпень\"].reshape(1, -1))\n", - "ru_emb.most_similar(august)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "okSkjk597ugo" - }, - "source": [ - "We can see that neighbourhood of this embedding cosists of different months, but right variant is on the ninth place." 
def precision(pairs, mapped_vectors, topn=1):
    """
    Precision@topn of a word-translation mapping (exercise stub).

    :args:
        pairs = list of right word pairs [(uk_word_0, ru_word_0), ...]
        mapped_vectors = list of embeddings after mapping from source embedding space to destination embedding space
            (must have the same length as `pairs`; entry i belongs to pairs[i])
        topn = the number of nearest neighbours in destination embedding space to choose from
    :returns:
        precision_val, float number: num_matches / len(pairs), i.e. the fraction
        (not the total count) of pairs counted as matches by the loop below.
    """
    assert len(pairs) == len(mapped_vectors)
    num_matches = 0
    for i, (_, ru) in enumerate(pairs):
        # YOUR CODE HERE
        # (exercise placeholder: the loop body is intentionally left to the
        # student, so this cell does not run until it is filled in; it is
        # expected to increment num_matches for every matched pair)
    precision_val = num_matches / len(pairs)
    return precision_val
- "colab_type": "code", - "id": "U-ssEJ3x7uhA" - }, - "outputs": [], - "source": [ - "precision_top1 = precision(uk_ru_test, mapping.predict(X_test), 1)\n", - "precision_top5 = precision(uk_ru_test, mapping.predict(X_test), 5)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "7K-hy7a6Ksn2" - }, - "outputs": [], - "source": [ - "print(precision_top1)\n", - "print(precision_top5)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hf6Ou8bx7uhH" - }, - "source": [ - "## Making it better (orthogonal Procrustean problem) (0.3 pts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "4oLs-drN7uhK" - }, - "source": [ - "It can be shown (see original paper) that a self-consistent linear mapping between semantic spaces should be orthogonal. \n", - "We can restrict transform $W$ to be orthogonal. Then we will solve next problem:\n", - "\n", - "$$W^*= \\arg\\min_W ||WX - Y||_F \\text{, where: } W^TW = I$$\n", - "\n", - "$$I \\text{- identity matrix}$$\n", - "\n", - "Instead of making yet another regression problem we can find optimal orthogonal transformation using singular value decomposition. 
def learn_transform(X_train, Y_train):
    """
    Learn an orthogonal mapping between two embedding spaces (exercise stub).

    :args:
        X_train, Y_train - presumably the source / target embedding matrices
            produced by load_word_pairs, one row per word pair -- TODO confirm
    :returns: W* : float matrix[emb_dim x emb_dim] as defined in formulae above
    """
    # YOUR CODE GOES HERE
    # compute orthogonal embedding space mapping
    # mapping = ...
    # NOTE: until the assignment above is written, `mapping` is undefined and
    # the return below raises NameError.

    return mapping
# The corpus is Ukrainian (Cyrillic) text: decode it as UTF-8 explicitly so
# the cell does not depend on the platform's default encoding.
with open('./uk.txt', 'r', encoding='utf-8') as f:
    # One list entry per line; trailing newlines are kept by readlines().
    uk_corpus = f.readlines()
] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "4hbbMy-tNxlf" - }, - "outputs": [], - "source": [ - "assert translate(\".\") == \".\"\n", - "assert translate(\"1 , 3\") == \"1 , 3\"\n", - "assert translate(\"кіт зловив мишу\") == \"кот поймал мышку\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ia6I2ce7O_HI" - }, - "source": [ - "Now you can play with your model and try to get as accurate translations as possible. **Note**: one big issue is out-of-vocabulary words. Try to think of various ways of handling it (you can start with translating each of them to a special **UNK** token and then move to more sophisticated approaches). Good luck!" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ap1W7ZCeOAVU" - }, - "outputs": [], - "source": [ - "for sent in uk_corpus[::10]:\n", - " print(translate(sent))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Great! \n", - "See second notebook for the Neural Machine Translation assignment." 
- ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "colab": { - "collapsed_sections": [], - "machine_shape": "hm", - "name": "homework.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Py3 Research", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/homeworks/lab01_nlp/lab1_02_nlp_part2_nmt.ipynb b/homeworks/lab01_nlp/lab1_02_nlp_part2_nmt.ipynb deleted file mode 100644 index eb346fa..0000000 --- a/homeworks/lab01_nlp/lab1_02_nlp_part2_nmt.ipynb +++ /dev/null @@ -1,941 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Lab 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Part 2: Neural Machine Translation in the wild\n", - "In the third homework you are supposed to get the best translation you can for the EN-RU translation task.\n", - "\n", - "Basic approach using RNNs as encoder and decoder is implemented for you. \n", - "\n", - "Your ultimate task is to use the techniques we've covered, e.g.\n", - "\n", - "* Optimization enhancements (e.g. learning rate decay)\n", - "\n", - "* CNN encoder (with or without positional encoding)\n", - "\n", - "* attention/self-attention mechanism\n", - "\n", - "* pretraining the language model\n", - "\n", - "* [Byte Pair Encoding](https://github.com/rsennrich/subword-nmt)\n", - "\n", - "* or just fine-tuning BERT ;)\n", - "\n", - "to improve the translation quality. 
\n", - "\n", - "__Please use at least three different approaches/models and compare them (translation quality/complexity/training and evaluation time).__\n", - "\n", - "Write down some summary on your experiments and illustrate it with convergence plots/metrics and your thoughts. Just like you would approach a real problem." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# You might need to install the libraries below. Do it in the desired environment\n", - "# if you are working locally.\n", - "\n", - "# ! pip install subword-nmt\n", - "# ! pip install nltk\n", - "# ! pip install torchtext" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Thanks to YSDA NLP course team for the data\n", - "# (who thanks tilda and deephack teams for the data in their turn)\n", - "\n", - "import os\n", - "path_do_data = '../../datasets/Machine_translation_EN_RU/data.txt'\n", - "if not os.path.exists(path_do_data):\n", - " print(\"Dataset not found locally. Downloading from github. Loading special files as well\")\n", - " !wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/master/datasets/Machine_translation_EN_RU/data.txt -nc\n", - " path_do_data = './data.txt'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists('./utils.py'):\n", - " print(\"utils file not found locally. Downloading from github.\")\n", - " !wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/master/homeworks_advanced/Lab1_NLP/utils.py -nc\n", - "\n", - "if not os.path.exists('./my_network.py'):\n", - " print(\"network file not found locally. 
Downloading from github.\")\n", - " !wget https://raw.githubusercontent.com/girafe-ai/ml-mipt/master/homeworks_advanced/Lab1_NLP/my_network.py -nc" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "\n", - "import torchtext\n", - "from torchtext.legacy.datasets import TranslationDataset, Multi30k\n", - "from torchtext.legacy.data import Field, BucketIterator, TabularDataset\n", - "\n", - "import spacy\n", - "\n", - "import random\n", - "import math\n", - "import time\n", - "\n", - "import matplotlib\n", - "matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "from IPython.display import clear_output\n", - "\n", - "from nltk.tokenize import WordPunctTokenizer\n", - "from subword_nmt.learn_bpe import learn_bpe\n", - "from subword_nmt.apply_bpe import BPE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Main part\n", - "__Here comes the preprocessing. 
# Both fields wrap every sentence in explicit start/end markers. The marker
# strings must be non-empty and distinct; '<sos>' / '<eos>' follow the usual
# torchtext convention.
SRC = Field(tokenize=tokenize,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

TRG = Field(tokenize=tokenize,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

# Fields are matched to the TSV columns in order: the first column is read
# as the target sentence, the second as the source sentence.
dataset = TabularDataset(
    path=path_do_data,
    format='tsv',
    fields=[('trg', TRG), ('src', SRC)]
)
"print(f\"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}\")\n", - "print(f\"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here are tokens from original (RU) corpus:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['',\n", - " '29',\n", - " 'соль',\n", - " 'комо',\n", - " '―',\n", - " 'электрическая',\n", - " 'ming',\n", - " 'утренний',\n", - " 'детском',\n", - " 'таунус']" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "SRC.vocab.itos[::1000]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And from target (EN) corpus:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['', 'king', 'buffets', 'catch', 'media', 'schedule', 'maraunenhof']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "TRG.vocab.itos[::1000]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And here is example from train dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'trg': ['laundry', 'service', 'is', 'provided', '.'], 'src': ['помимо', 'этого', ',', 'гостям', 'предоставляются', 'услуги', 'прачечной', '.']}\n" - ] - } - ], - "source": [ - "print(vars(train_data.examples[9]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's check the length distributions:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length distribution in Train data\n" - ] - }, - { - "data": { - "image/png": "\n", - 
"text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "src_length = map(len, [vars(x)['src'] for x in train_data.examples])\n", - "trg_length = map(len, [vars(x)['trg'] for x in train_data.examples])\n", - "\n", - "print('Length distribution in Train data')\n", - "plt.figure(figsize=[8, 4])\n", - "plt.subplot(1, 2, 1)\n", - "plt.title(\"source length\")\n", - "plt.hist(list(src_length), bins=20);\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.title(\"translation length\")\n", - "plt.hist(list(trg_length), bins=20);" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length distribution in Test data\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe0AAAEICAYAAAByPazKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8GearUAAAfXElEQVR4nO3df7RdZX3n8fdHIij+4GdETMCkktqiq1YmBVzYjiVWAa241qDFsWPUdNJatLbaarBdpctKJ06dIi4tNRUKTK1AqdaMUjFFrdNpQYNaFFBJEUjSYCK/bKVq0e/8sZ8Lh8tN7k3uz33O+7XWWWfv53n23s++5z7nu/ezn7N3qgpJkrTwPWq+KyBJkqbGoC1JUk8YtCVJ6gmDtiRJPWHQliSpJwzakiT1hEFb05JkWZJKsmgetv3qJH8/19uV5kOSi5O8YxrL/1uSH5nJOrX13pbk+TO93ilsd96+e+aTQVu9MKoNVAvLfAWovZXkM0l+aTCtqh5fVbfOV52mqy9/+9lm0B5RSfab7zpIw8aDSs02g/YClOStSbYn+dckX0uyqqUfkOTdSf6lvd6d5ICW94iu4nZmekybvjjJBUmuSvId4GeTHJXkw0l2JbkryXsHln1tkpuT3JPk6iRPnWLdD0pyYZIdbR/eMXaAMFbHJO9q6/1GklMHll2e5LNtv/82yfuS/HnL/mx7v7d18z1nYLkJ1yfNpCT/Gzga+D/tf/AtAz1Aa5LcAXyqlf3LJHcmua/9Tz9jYD0Xt//tj7f/9euSPK3lJcl5SXYm+XaSLyd55gR1OSTJx1rbvadNL2155wI/Dby31fO9LX3w++CgJJe25W9P8jtJHtXy9thOJ/kbPSrJuiT/3L5TrkhyaMsb+1utTnJHkm8l+e2BZR+b5JK2zZvb33fb7v72A5t95UTrG1pV5WsBvYCnA1uBp7T5ZcDT2vTbgWuBJwGLgX8Afr/lvRr4+3HrKuCYNn0xcB9wEt3B2uOAfwLOa9OPAZ7byp4ObAF+HFgE/A7wD7up77K2nUVt/iPA+9s6nwR8DvjlgTr+B/Dfgf2A1wH/AqTl/yPwLmB/4LnAt4E/n2g7U1mfL18z/QJuA54/MD/2f3lp+59/bEt/LfAE4ADg3cCXBpa5GLgLOL61rw8Cl7W8
FwLXAwcDaW3wyIHl3tGmDwP+C3Bg285fAn89sI3PAL80ru6D3weXAh9tyy4Dvg6saXl71a4G/ybAG+m+o5a2fX8/8KFxf6s/BR4LPAv4HvDjLX898HfAIW35G4BtU/jbT7i+YX3NewV8jftA4BhgJ/B84NHj8v4ZOG1g/oXAbW361UwetC8dyHsOsIuBIDiQ9zdjDbjNPwq4H3jqBGXHGs4i4IjWaB47kP8K4NMDddwykHdgW/bJdEfRDwAHDuT/OZMH7QnXN9+fo6/hfO0hcPzIHpY5uJU5qM1fDHxgIP804Ktt+mS6AHoi8Khx67mYFrQn2MZPAvcMzH+G3QRtukD8feDYgbxfBj7TpveqXfHwoH0zsGog70i6A4BFA3+rpQP5nwPObNO3Ai8cyPslpha0J1zfsL7sHl9gqmoL8OvA7wE7k1yW5Ckt+ynA7QPFb29pU7V1YPoo4PaqemCCck8Fzk9yb5J7gbvpjvqXTLL+pwKPBnYMLPt+ujPuMXeOTVTV/W3y8W0/7h5IG1/f3dnd+qS59OD/apL9kqxvXcTfpgs2AIcPlL9zYPp+2v9sVX0KeC/wPrr2vyHJE8dvLMmBSd7fura/TXf56OBMbazK4XTtdPx3yWD73td29VTgIwPt/2bgB3QH9I9YNwP7TvcdMNjmp9L+97S+oWTQXoCq6i+q6rl0DaCAd7asf2lpY45uaQDfoTsiBiDJkyda9cD0VuDoTDxwZitdl/bBA6/HVtU/TFL1rXRn2ocPLPfEqnrGJMsB7AAOTXLgQNpRu6m7NF929384mP5f6S4xPR84iO6MELoD38k3UPWeqvpPwLHAjwK/NUGxN9NdSjuhqp4I/My4beypvXyL7ux3/HfJ9qnUbxJbgVPHfXc8pqqmsu4ddN3iY44al+93AAbtBSfJ05OcnG6A2XeBfwd+2LI/BPxOksVJDgd+l64LGbrr089I8pNJHkN3pr4nn6NrJOuTPC7JY5Kc1PL+BDh7bPBMG7TyssnqXlU7gE8C/yvJE9uglKcl+c9TWPZ2YDPwe0n2bwPNfn6gyC66v8OM/85U2gvfZPL/wSfQHbzeRXcg/QdTXXmSn0pyQpJH0x2If5eH2v/4bfw73cDMQ4FzplrPqvoBcAVwbpInpBtk+iYe+i6Zjj9p631q25/FSU6f4rJX0H3vHJJkCfD6cflT+dsPPYP2wnMA3YCMb9F1+zwJOLvlvYMusN0AfBn4Qkujqr5ON1Dtb4FbgD3edKQ13J+nu8Z1B7AN+IWW9xG6s/vLWtfbV4Cpjsp+Fd1AspuAe4Ar6a5rTcUr6a6139X263K6L7+xLrpzgf/Xut5OnOI6pZn0P+gOnO9N8pu7KXMpXXfzdrp2cO1erP+JdAOr7mnruAv4wwnKvZtu8NW32vo/MS7/fOCMNhL7PRMs/wa6g4Jb6b4r/gK4aC/quTvnAxuBTyb511a3E6a47Nvpvoe+Qfc9diWt/TdT+dsPvbFRu9KCk+RyugE6488iJA25JK+jG1Q2aU/dKPFMWwtG6xp8WutWP4XuuuBfz3e9JM2+JEcmOam1/6fTXbf/yHzXa6Hx7j1aSJ4MfJjuN6jbgNdV1Rfnt0qS5sj+dL82WQ7cC1wG/PG81mgBsntckqSesHtckqSeWNDd44cffngtW7ZsvqshLXjXX3/9t6pq8XzXY09sz9LU7Kk9L+igvWzZMjZv3jzf1ZAWvCS3T15qftmepanZU3u2e1ySpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JEk9YdCWJKknDNqSJPWEQVuSpJ5Y0HdEm0nL1n18RtZz2/oXzch6JM2uqbR527P6xjNtSZJ6wqAtSVJPGLQlSeoJg7YkST1h0JYkqScM2pIk9YRBW5KknjBoSyMkyUVJdib5yrj0NyT5apIbk/zPgfSzk2xJ8rUkLxxIP6WlbUmybi73QRplI3NzFUkAXAy8F7h0LCHJzwKnA8+qqu8leVJLPxY4E3gG
8BTgb5P8aFvsfcDPAduAzyfZWFU3zdleSCPKoC2NkKr6bJJl45JfB6yvqu+1Mjtb+unAZS39G0m2AMe3vC1VdStAkstaWYO2NMvsHpf0o8BPJ7kuyd8l+amWvgTYOlBuW0vbXfojJFmbZHOSzbt27ZqFqkujxaAtaRFwKHAi8FvAFUkyEyuuqg1VtbKqVi5evHgmVimNtEmD9kQDV5L8YRu0ckOSjyQ5eCDPgStSv2wDPlydzwE/BA4HtgNHDZRb2tJ2ly5plk3lTPti4JRxaZuAZ1bVTwBfB86GRwxcOQX44yT7JdmPbuDKqcCxwCtaWUnz76+BnwVoA832B74FbATOTHJAkuXACuBzwOeBFUmWJ9mfrs1vnJeaSyNm0oFoEw1cqapPDsxeC5zRph24Ii1gST4EPA84PMk24BzgIuCi1pv2fWB1VRVwY5Ir6NrpA8BZVfWDtp7XA1cD+wEXVdWNc74z0giaidHjrwUub9NL6IL4mMEBKuMHrpww0cqSrAXWAhx99NEzUD1JY6rqFbvJ+sXdlD8XOHeC9KuAq2awapKmYFoD0ZL8Nt0R+AdnpjoOXJEkaXf2+Uw7yauBFwOrWlca7HmAigNXJEmahn06005yCvAW4CVVdf9AlgNXJEmaJZOeae9m4MrZwAHApvZzzmur6leqyoErkiTNkqmMHp9o4MqFeyg/1ANXlq37+JTK3bb+RbNcE0nSqPGOaJIk9YRBW5KknjBoS5LUEwZtSZJ6wqAtSVJPGLQlSeoJg7YkST1h0JYkqScM2pIk9YRBW5KknjBoSyMkyUVJdib5ygR5b05SSQ5v80nyniRbktyQ5LiBsquT3NJeq+dyH6RRZtCWRsvFwCnjE5McBbwAuGMg+VS6J/WtANYCF7Syh9I9OOgE4HjgnCSHzGqtJQEGbWmkVNVngbsnyDqP7nG7NZB2OnBpda4FDk5yJPBCYFNV3V1V9wCbmOBAQNLMM2hLIy7J6cD2qvqncVlLgK0D89ta2u7SJ1r32iSbk2zetWvXDNZaGk0GbWmEJTkQeBvwu7Ox/qraUFUrq2rl4sWLZ2MT0kgxaEuj7WnAcuCfktwGLAW+kOTJwHbgqIGyS1va7tIlzTKDtjTCqurLVfWkqlpWVcvourqPq6o7gY3Aq9oo8hOB+6pqB3A18IIkh7QBaC9oaZJmmUFbGiFJPgT8I/D0JNuSrNlD8auAW4EtwJ8CvwpQVXcDvw98vr3e3tIkzbJF810BSXOnql4xSf6ygekCztpNuYuAi2a0cpIm5Zm2JEk9YdCWJKknDNqSJPWEQVuSpJ6YNGhP9ICBJIcm2dQeFrBp7L7DPmBAkqTZM5Uz7Yt55H2F1wHXVNUK4Jo2Dz5gQJKkWTNp0N7NAwZOBy5p05cALx1I9wEDkiTNgn29pn1EuzMSwJ3AEW3aBwxIkjRLpj0Qrd2AoSYtOPX1+YABSZImsK9B+5ut25v2vrOl+4ABSZJmyb7exnQjsBpY394/OpD++iSX0Q06u6+qdiS5GviDgcFnLwDO3vdqD4dl6z4+aZnb1r9oDmoiSeqDSYN2e8DA84DDk2yjGwW+HriiPWzgduDlrfhVwGl0Dxi4H3gNdA8YSDL2gAHwAQOS9tFUDnalYTVp0N7DAwZWTVDWBwxIkjRLvCOaJEk9YdCWJKknDNqSJPXEvo4e1yQcLKOFKMlFwIuBnVX1zJb2h8DPA98H/hl4TVXd2/LOBtYAPwB+raqubumnAOcD+wEfqKr1c70v0ijyTFsaLRfzyFsIbwKeWVU/AXyd9nPMJMcCZwLPaMv8cZL9kuwHvI/uWQPHAq9oZSXNMoO2NEImepZAVX2yqh5os9fS3fwIumcJXFZV36uqb9D9lPP49tpSVbdW1feBy1pZSbPMoC1p0GuBv2nTPktAWmC8pi0JgCS/DTwAfHCm1llVG4ANACtXrpyxZxTMJe9cqIXEoC2JJK+mG6C2qt0kCfb8zACfJSDNA7vHpRHXRoK/
BXhJVd0/kLURODPJAUmWAyuAz9HdjnhFkuVJ9qcbrLZxrustjSLPtKURsptnCZwNHABsSgJwbVX9SlXdmOQK4Ca6bvOzquoHbT2vB66m+8nXRVV145zvjDSCDNrSCNnNswQu3EP5c4FzJ0i/iu4BQZLmkN3jkiT1hEFbkqSeMGhLktQTBm1JknrCoC1JUk84elzSyPJpfOobz7QlSeoJg7YkST1h0JYkqScM2pIk9YRBW5KknphW0E7yG0luTPKVJB9K8pj25J/rkmxJcnl7ChDtSUGXt/TrkiybiR2QJGlU7HPQTrIE+DVgZVU9k+5pP2cC7wTOq6pjgHuANW2RNcA9Lf28Vk6SJE3RdLvHFwGPTbIIOBDYAZwMXNnyLwFe2qZPb/O0/FVpzwGUJEmT2+egXVXbgXcBd9AF6/uA64F7q+qBVmwbsKRNLwG2tmUfaOUPG7/eJGuTbE6yedeuXftaPUmShs50uscPoTt7Xg48BXgccMp0K1RVG6pqZVWtXLx48XRXJ2lAkouS7EzylYG0Q5NsSnJLez+kpSfJe9o4lBuSHDewzOpW/pYkq+djX6RRNJ3u8ecD36iqXVX1H8CHgZOAg1t3OcBSYHub3g4cBdDyDwLumsb2Je29i3nkwfU64JqqWgFc0+YBTgVWtNda4ALogjxwDnACcDxwzliglzS7phO07wBOTHJguza9CrgJ+DRwRiuzGvhom97Y5mn5n6qqmsb2Je2lqvoscPe45MHxJuPHoVxanWvpDsiPBF4IbKqqu6vqHmATM9DLJmly07mmfR3dgLIvAF9u69oAvBV4U5ItdNesL2yLXAgc1tLfxENH85Lm1xFVtaNN3wkc0aYfHIfSjI1R2V36IzhGRZpZ03rKV1WdQ9dNNuhWui6z8WW/C7xsOtuTNLuqqpLMWA9YVW2gO5hn5cqV9qxJ0+Qd0SR9s3V70953tvQHx6E0Y2NUdpcuaZYZtCUNjjcZPw7lVW0U+YnAfa0b/WrgBUkOaQPQXtDSJM2yaXWPS+qXJB8CngccnmQb3eWt9cAVSdYAtwMvb8WvAk4DtgD3A68BqKq7k/w+8PlW7u1VNX5wm6RZYNCWRkhVvWI3WasmKFvAWbtZz0XARTNYNUlTYPe4JEk9YdCWJKknDNqSJPWEQVuSpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JEk9YdCWJKknDNqSJPWEQVuSpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JACS/EaSG5N8JcmHkjwmyfIk1yXZkuTyJPu3sge0+S0tf9n81l4aDQZtSSRZAvwasLKqngnsB5wJvBM4r6qOAe4B1rRF1gD3tPTzWjlJs2xaQTvJwUmuTPLVJDcneU6SQ5NsSnJLez+klU2S97Qj8xuSHDczuyBphiwCHptkEXAgsAM4Gbiy5V8CvLRNn97mafmrkmQO6yqNpOmeaZ8PfKKqfgx4FnAzsA64pqpWANe0eYBTgRXttRa4YJrbljRDqmo78C7gDrpgfR9wPXBvVT3Qim0DlrTpJcDWtuwDrfxh49ebZG2SzUk279q1a3Z3QhoB+xy0kxwE/AxwIUBVfb+q7uXhR+Djj8wvrc61wMFJjtznmkuaMa1H7HRgOfAU4HHAKdNdb1VtqKqVVbVy8eLF012dNPKmc6a9HNgF/FmSLyb5QJLHAUdU1Y5W5k7giDb94JF5M3jULml+PR/4RlXtqqr/AD4MnER3cL2olVkKbG/T24GjAFr+QcBdc1tlafRMJ2gvAo4DLqiqZwPf4aGucACqqoDam5XanSbNizuAE5Mc2K5NrwJuAj4NnNHKrAY+2qY3tnla/qdae5c0i6YTtLcB26rqujZ/JV0Q/+ZYt3d739nyHzwybwaP2h9kd5o091o7vhL4AvBluu+GDcBbgTcl2UJ3zfrCtsiFwGEt/U2MO2CXNDsWTV5kYlV1Z5KtSZ5eVV/joSPzm+iOwNfzyCPz1ye5DDgBuG+g
G13SPKuqc4BzxiXfChw/QdnvAi+bi3pJesg+B+3mDcAH2w0XbgVeQ3eEfkWSNcDtwMtb2auA04AtwP2trCRJmqJpBe2q+hKwcoKsVROULeCs6WxPkqRR5h3RJEnqCYO2JEk9YdCWJKknDNqSJPWEQVuSpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JEk9YdCWJKknDNqSJPWEQVuSpJ4waEsCIMnBSa5M8tUkNyd5TpJDk2xKckt7P6SVTZL3JNmS5IYkx813/aVRMN3naWuWLVv38UnL3Lb+RXNQE42A84FPVNUZSfYHDgTeBlxTVeuTrAPWAW8FTgVWtNcJwAXtXdIs8kxbEkkOAn4GuBCgqr5fVfcCpwOXtGKXAC9t06cDl1bnWuDgJEfOcbWlkeOZtiSA5cAu4M+SPAu4HngjcERV7Whl7gSOaNNLgK0Dy29raTsG0kiyFlgLcPTRR89a5efbVHrEwF4xTZ9n2pKgO4A/Drigqp4NfIeuK/xBVVVA7c1Kq2pDVa2sqpWLFy+escpKo8qgLQm6M+VtVXVdm7+SLoh/c6zbu73vbPnbgaMGll/a0iTNIoO2JKrqTmBrkqe3pFXATcBGYHVLWw18tE1vBF7VRpGfCNw30I0uaZZ4TVvSmDcAH2wjx28FXkN3YH9FkjXA7cDLW9mrgNOALcD9raykWWbQlgRAVX0JWDlB1qoJyhZw1qxXStLD2D0uSVJPTDtoJ9kvyReTfKzNL09yXbtT0uWtq40kB7T5LS1/2XS3LUnSKJmJ7vE3AjcDT2zz7wTOq6rLkvwJsIbubklrgHuq6pgkZ7ZyvzAD25ekXvAOh5quaZ1pJ1kKvAj4QJsPcDLdz0XgkXdQGruz0pXAqlZekiRNwXS7x98NvAX4YZs/DLi3qh5o82N3SYKBOyi1/Pta+YdJsjbJ5iSbd+3aNc3qSZI0PPY5aCd5MbCzqq6fwfp4ByVJknZjOte0TwJekuQ04DF017TPp3twwKJ2Nj14l6SxOyhtS7IIOAi4axrblyRppOzzmXZVnV1VS6tqGXAm8KmqeiXwaeCMVmz8HZTG7qx0Riu/V/cxliRplM3G77TfCrwpyRa6a9YXtvQLgcNa+psY9zACSZK0ZzNyR7Sq+gzwmTZ9K3D8BGW+C7xsJrY33lQfizes/BmJhsWot2VpMt4RTZKknjBoS5LUEwZtSZJ6wqAtSVJPGLQlSeoJg7YkST1h0Jb0IB+1Ky1sBm1Jg8YetTtm7FG7xwD30D1iFwYetQuc18pJmmUGbUmAj9qV+sCgLWmMj9qVFjiDtiQftSv1xIzce1xS7/moXakHPNOW5KN2pZ4waEvaEx+1Ky0gdo9Lepj5ftSupN3zTFuSpJ4waEuS1BMGbUmSesKgLUlSTxi0JUnqCYO2JEk9YdCWJKknDNqSJPXEPgftJEcl+XSSm5LcmOSNLf3QJJuS3NLeD2npSfKeJFuS3JDkuJnaCUmSRsF0zrQfAN5cVccCJwJnJTmW7naG11TVCuAaHrq94anAivZaC1wwjW1LkjRy9jloV9WOqvpCm/5X4Ga6Z+yeDlzSil0CvLRNnw5cWp1r6Z4edOQ+11ySpBEzI9e0kywDng1cBxxRVTta1p3AEW16CbB1YLFtLW38utYm2Zxk865du2aiepIkDYVpPzAkyeOBvwJ+vaq+neTBvKqqJHv1uL6q2gBsAFi5cqWP+pshy9Z9fErlblv/olmuiSRpX03rTDvJo+kC9ger6sMt+Ztj3d7tfWdL3w4cNbD40pYmSZKmYDqjx0P3TN2bq+qPBrI2Aqvb9GrgowPpr2qjyE8E7hvoRpckSZOYzpn2ScB/A05O8qX2Og1YD/xckluA57d5gKuAW4EtwJ8CvzqNbUuaQf6EU+qHfb6mXVV/D2Q32asmKF/AWfu6PUmzauwnnF9I8gTg+iSbgFfT/YRzfZJ1dD/hfCsP/wnnCXQ/4TxhXmo+ZKYy/sSxJ6PLO6JJ8iec
Uk8YtCU9jD/hlBYug7akB43/CedgXrvEtdc/4ayqlVW1cvHixTNYU2k0Tft32pKGw55+wllVO/wJ58Lhde/R5Zm2JH/CKfWEZ9raax7lD6Wxn3B+OcmXWtrb6H6yeUWSNcDtwMtb3lXAaXQ/4bwfeM3cVlcaTQZtPcxUb3eq4eJPOKV+sHtckqSeMGhLktQTBm1JknrCoC1JUk8YtCVJ6gmDtiRJPWHQliSpJwzakiT1hEFbkqSe8I5okjSEvN3wcDJoS9KImuptiw3uC4dBW7PCo3xJmnle05YkqScM2pIk9YTd45o3Xk+TpL1j0NaC5/VxaX7ZBheOOe8eT3JKkq8l2ZJk3VxvX9LMsC1Lc29Oz7ST7Ae8D/g5YBvw+SQbq+qmuayHho9nAnPLtqzxpnq5azK20z2b6+7x44EtVXUrQJLLgNMBG7pm3UIM7AuxTlNkW9ascKzLns110F4CbB2Y3wacMFggyVpgbZv9tyR3Ad+am+rNq8MZ/v1c8PuYd87IamZ0P6dYp6fO1PamaNK2DBO256/NQd3myoL/f94HvdmnvWyrvdmvZrftecENRKuqDcCGsfkkm6tq5TxWaU6Mwn6Owj7C6OznVIxvz8NkGD/nYdwnGK79muuBaNuBowbml7Y0Sf1iW5bmwVwH7c8DK5IsT7I/cCawcY7rIGn6bMvSPJjT7vGqeiDJ64Grgf2Ai6rqxkkWG8qutQmMwn6Owj7CCOznPrblYTOMn/Mw7hMM0X6lqua7DpIkaQq897gkST1h0JYkqScWdNAextskJjkqyaeT3JTkxiRvbOmHJtmU5Jb2fsh813UmJNkvyReTfKzNL09yXftML2+DmHorycFJrkzy1SQ3J3nOsH6Wo2rY2+ywtdFhb5MLNmgP3CbxVOBY4BVJjp3fWs2IB4A3V9WxwInAWW2/1gHXVNUK4Jo2PwzeCNw8MP9O4LyqOga4B1gzL7WaOecDn6iqHwOeRbevw/pZjqphb7PD1kaHu01W1YJ8Ac8Brh6YPxs4e77rNQv7+VG6+zd/DTiypR0JfG2+6zYD+7aUroGcDHwMCN1diRZN9Bn37QUcBHyDNqBzIH3oPktfD/t8h6bNDlsbHYU2uWDPtJn4NolL5qkusyLJMuDZwHXAEVW1o2XdCRwxT9WaSe8G3gL8sM0fBtxbVQ+0+b5/psuBXcCfte7FDyR5HMP5WYqhbLPD1kaHvk0u5KA91JI8Hvgr4Ner6tuDedUdDvb6t3hJXgzsrKrr57sus2gRcBxwQVU9G/gO47rdhuGzVGfY2uyQttGhb5MLOWgP7W0SkzyarvF/sKo+3JK/meTIln8ksHO+6jdDTgJekuQ24DK67rfzgYOTjN3Up++f6TZgW1Vd1+avpPvCGLbPcuQNaZsdxjY69G1yIQftobxNYpIAFwI3V9UfDWRtBFa36dV01816q6rOrqqlVbWM7rP7VFW9Evg0cEYr1uv9rKo7ga1Jnt6SVtE9mnKoPstRN6xtdhjb6Ci0yQV9R7Qkp9Fdcxm7TeK581ylaUvyXOD/Al/moetIb6O7RnYFcDRwO/Dyqrp7Xio5w5I8D/jNqnpxkh+hO6o/FPgi8ItV9b35rN90JPlJ4APA/sCtwGvoDoaH8rMcRaPQZoepjQ57m1zQQVuSJD1kIXePS5KkAQZtSZJ6wqAtSVJPGLQlSeoJg7YkST1h0JYkqScM2pIk9cT/B/odrf8G1COhAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "src_length = map(len, [vars(x)['src'] for x in test_data.examples])\n", - "trg_length = map(len, [vars(x)['trg'] for x in test_data.examples])\n", - "\n", - "print('Length distribution in Test data')\n", - "plt.figure(figsize=[8, 4])\n", - "plt.subplot(1, 2, 1)\n", - "plt.title(\"source length\")\n", - "plt.hist(list(src_length), bins=20);\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.title(\"translation length\")\n", - "plt.hist(list(trg_length), bins=20);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model side\n", - "__Here comes simple pipeline of NMT model learning. It almost copies the week03 practice__" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "device(type='cuda', index=1)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "device" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "def _len_sort_key(x):\n", - " return len(x.src)\n", - "\n", - "BATCH_SIZE = 128\n", - "\n", - "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n", - " (train_data, valid_data, test_data), \n", - " batch_size = BATCH_SIZE, \n", - " device = device,\n", - " sort_key=_len_sort_key\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "[torchtext.data.batch.Batch of size 128]\n", - "\t[.trg]:[torch.cuda.LongTensor of size 55x128 (GPU 1)]\n", - "\t[.src]:[torch.cuda.LongTensor of size 59x128 (GPU 
def init_weights(m):
    """Overwrite, in place, every parameter reachable from module `m`.

    Each parameter tensor (weights and biases alike) is refilled with
    samples drawn uniformly from the interval [-0.08, 0.08).
    """
    for weight in m.parameters():
        nn.init.uniform_(weight, a=-0.08, b=0.08)
"name": "stdout", - "output_type": "stream", - "text": [ - "The model has 14,880,299 trainable parameters\n" - ] - } - ], - "source": [ - "def count_parameters(model):\n", - " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", - "\n", - "print(f'The model has {count_parameters(model):,} trainable parameters')" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "PAD_IDX = TRG.vocab.stoi['']\n", - "optimizer = optim.Adam(model.parameters())\n", - "criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "def train(model, iterator, optimizer, criterion, clip, train_history=None, valid_history=None):\n", - " model.train()\n", - " \n", - " epoch_loss = 0\n", - " history = []\n", - " for i, batch in enumerate(iterator):\n", - " \n", - " src = batch.src\n", - " trg = batch.trg\n", - " \n", - " optimizer.zero_grad()\n", - " \n", - " output = model(src, trg)\n", - " \n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - " \n", - " output = output[1:].view(-1, output.shape[-1])\n", - " trg = trg[1:].view(-1)\n", - " \n", - " #trg = [(trg sent len - 1) * batch size]\n", - " #output = [(trg sent len - 1) * batch size, output dim]\n", - " \n", - " loss = criterion(output, trg)\n", - " \n", - " loss.backward()\n", - " \n", - " # Let's clip the gradient\n", - " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n", - " \n", - " optimizer.step()\n", - " \n", - " epoch_loss += loss.item()\n", - " \n", - " history.append(loss.cpu().data.numpy())\n", - " if (i+1)%10==0:\n", - " fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))\n", - "\n", - " clear_output(True)\n", - " ax[0].plot(history, label='train loss')\n", - " ax[0].set_xlabel('Batch')\n", - " ax[0].set_title('Train loss')\n", - " if train_history is not None:\n", - " 
ax[1].plot(train_history, label='general train history')\n", - " ax[1].set_xlabel('Epoch')\n", - " if valid_history is not None:\n", - " ax[1].plot(valid_history, label='general valid history')\n", - " plt.legend()\n", - " \n", - " plt.show()\n", - "\n", - " \n", - " return epoch_loss / len(iterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, iterator, criterion):\n", - " \n", - " model.eval()\n", - " \n", - " epoch_loss = 0\n", - " \n", - " history = []\n", - " \n", - " with torch.no_grad():\n", - " \n", - " for i, batch in enumerate(iterator):\n", - "\n", - " src = batch.src\n", - " trg = batch.trg\n", - "\n", - " output = model(src, trg, 0) #turn off teacher forcing\n", - "\n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - "\n", - " output = output[1:].view(-1, output.shape[-1])\n", - " trg = trg[1:].view(-1)\n", - "\n", - " #trg = [(trg sent len - 1) * batch size]\n", - " #output = [(trg sent len - 1) * batch size, output dim]\n", - "\n", - " loss = criterion(output, trg)\n", - " \n", - " epoch_loss += loss.item()\n", - " \n", - " return epoch_loss / len(iterator)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "def epoch_time(start_time, end_time):\n", - " elapsed_time = end_time - start_time\n", - " elapsed_mins = int(elapsed_time / 60)\n", - " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n", - " return elapsed_mins, elapsed_secs" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "train_history = []\n", - "valid_history = []\n", - "\n", - "N_EPOCHS = 10\n", - "CLIP = 1\n", - "\n", - "best_valid_loss = float('inf')" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch: 10 | Time: 1m 10s\n", - "\tTrain Loss: 2.998 | Train PPL: 20.040\n", - "\t Val. Loss: 4.710 | Val. PPL: 111.007\n" - ] - } - ], - "source": [ - "for epoch in range(N_EPOCHS):\n", - " \n", - " start_time = time.time()\n", - " \n", - " train_loss = train(model, train_iterator, optimizer, criterion, CLIP, train_history, valid_history)\n", - " valid_loss = evaluate(model, valid_iterator, criterion)\n", - " \n", - " end_time = time.time()\n", - " \n", - " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", - " \n", - " if valid_loss < best_valid_loss:\n", - " best_valid_loss = valid_loss\n", - " torch.save(model.state_dict(), 'tut1-model.pt')\n", - " \n", - " train_history.append(train_loss)\n", - " valid_history.append(valid_loss)\n", - " print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n", - " print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n", - " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. 
PPL: {math.exp(valid_loss):7.3f}')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Let's take a look at our network quality__:" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [], - "source": [ - "del utils" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [], - "source": [ - "import utils\n", - "import imp\n", - "imp.reload(utils)\n", - "generate_translation = utils.generate_translation\n", - "remove_tech_tokens = utils.remove_tech_tokens\n", - "get_text = utils.get_text\n", - "flatten = utils.flatten" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "batch = next(iter(test_iterator))" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original: there is a 24 - hour front desk at the property .\n", - "Generated: the property offers a 24 - hour front desk . .\n", - "\n", - "Original: this property also features free wifi .\n", - "Generated: free wifi access . . . 
.\n", - "\n" - ] - } - ], - "source": [ - "for idx in [1,2]:\n", - " src = batch.src[:, idx:idx+1]\n", - " trg = batch.trg[:, idx:idx+1]\n", - " generate_translation(src, trg, model, TRG.vocab)" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [], - "source": [ - "from nltk.translate.bleu_score import corpus_bleu\n", - "\n", - "# \"\"\" Estimates corpora-level BLEU score of model's translations given inp and reference out \"\"\"\n", - "# translations, _ = model.translate_lines(inp_lines, **flags)\n", - "# # Note: if you experience out-of-memory error, split input lines into batches and translate separately\n", - "# return corpus_bleu([[ref] for ref in out_lines], translations) * 100" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [], - "source": [ - "import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "59it [00:03, 18.87it/s]\n" - ] - } - ], - "source": [ - "original_text = []\n", - "generated_text = []\n", - "model.eval()\n", - "with torch.no_grad():\n", - "\n", - " for i, batch in tqdm.tqdm(enumerate(test_iterator)):\n", - "\n", - " src = batch.src\n", - " trg = batch.trg\n", - "\n", - " output = model(src, trg, 0) #turn off teacher forcing\n", - "\n", - " #trg = [trg sent len, batch size]\n", - " #output = [trg sent len, batch size, output dim]\n", - "\n", - " output = output.argmax(dim=-1)\n", - " \n", - " original_text.extend([get_text(x, TRG.vocab) for x in trg.cpu().numpy().T])\n", - " generated_text.extend([get_text(x, TRG.vocab) for x in output[1:].detach().cpu().numpy().T])\n", - "\n", - "# original_text = flatten(original_text)\n", - "# generated_text = flatten(generated_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "14.139920232081806" - ] - }, - 
"execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "corpus_bleu([[text] for text in original_text], generated_text) * 100" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Baseline solution BLEU score is quite low. Try to achieve at least __24__ BLEU on the test set. \n", - "The checkpoints are:\n", - "\n", - "* __22__ - minimal score to submit the homework, 30% of points\n", - "\n", - "* __27__ - good score, 70% of points\n", - "\n", - "* __29__ - excellent score, 100% of points" - ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "colab": { - "collapsed_sections": [], - "machine_shape": "hm", - "name": "homework.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Py3 Research", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/homeworks/lab01_nlp/my_network.py b/homeworks/lab01_nlp/my_network.py deleted file mode 100644 index 966416d..0000000 --- a/homeworks/lab01_nlp/my_network.py +++ /dev/null @@ -1,182 +0,0 @@ -import torch -import torch.nn as nn -import torch.optim as optim - -import torchtext -from torchtext.datasets import TranslationDataset, Multi30k -from torchtext.data import Field, BucketIterator - -import random -import math -import time - - -class Encoder(nn.Module): - def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout): - super().__init__() - - self.input_dim = input_dim - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.n_layers = n_layers -# self.dropout = dropout - - self.embedding = nn.Embedding( - num_embeddings=input_dim, - embedding_dim=emb_dim - ) - # - - self.rnn = nn.LSTM( - input_size=emb_dim, - 
hidden_size=hid_dim, - num_layers=n_layers, - dropout=dropout - ) - # - - self.dropout = nn.Dropout(p=dropout)# - - def forward(self, src): - - #src = [src sent len, batch size] - - # Compute an embedding from the src data and apply dropout to it - embedded = self.embedding(src)# - - embedded = self.dropout(embedded) - - output, (hidden, cell) = self.rnn(embedded) - #embedded = [src sent len, batch size, emb dim] - - # Compute the RNN output values of the encoder RNN. - # outputs, hidden and cell should be initialized here. Refer to nn.LSTM docs ;) - - # - - #outputs = [src sent len, batch size, hid dim * n directions] - #hidden = [n layers * n directions, batch size, hid dim] - #cell = [n layers * n directions, batch size, hid dim] - - #outputs are always from the top hidden layer - - return hidden, cell - - -class Decoder(nn.Module): - def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout): - super().__init__() - - self.emb_dim = emb_dim - self.hid_dim = hid_dim - self.output_dim = output_dim - self.n_layers = n_layers - self.dropout = dropout - - self.embedding = nn.Embedding( - num_embeddings=output_dim, - embedding_dim=emb_dim - ) - # - - self.rnn = nn.LSTM( - input_size=emb_dim, - hidden_size=hid_dim, - num_layers=n_layers, - dropout=dropout - ) - # - - self.out = nn.Linear( - in_features=hid_dim, - out_features=output_dim - ) - # - - self.dropout = nn.Dropout(p=dropout)# - - def forward(self, input, hidden, cell): - - #input = [batch size] - #hidden = [n layers * n directions, batch size, hid dim] - #cell = [n layers * n directions, batch size, hid dim] - - #n directions in the decoder will both always be 1, therefore: - #hidden = [n layers, batch size, hid dim] - #context = [n layers, batch size, hid dim] - - input = input.unsqueeze(0) - - #input = [1, batch size] - - # Compute an embedding from the input data and apply dropout to it - embedded = self.dropout(self.embedding(input))# - - #embedded = [1, batch size, emb dim] - - # Compute the RNN 
output values of the encoder RNN. - # outputs, hidden and cell should be initialized here. Refer to nn.LSTM docs ;) - # - - - #output = [sent len, batch size, hid dim * n directions] - #hidden = [n layers * n directions, batch size, hid dim] - #cell = [n layers * n directions, batch size, hid dim] - - #sent len and n directions will always be 1 in the decoder, therefore: - #output = [1, batch size, hid dim] - #hidden = [n layers, batch size, hid dim] - #cell = [n layers, batch size, hid dim] - - - output, (hidden, cell) = self.rnn(embedded, (hidden, cell)) - prediction = self.out(output.squeeze(0)) - - #prediction = [batch size, output dim] - - return prediction, hidden, cell - - -class Seq2Seq(nn.Module): - def __init__(self, encoder, decoder, device): - super().__init__() - - self.encoder = encoder - self.decoder = decoder - self.device = device - - assert encoder.hid_dim == decoder.hid_dim, \ - "Hidden dimensions of encoder and decoder must be equal!" - assert encoder.n_layers == decoder.n_layers, \ - "Encoder and decoder must have equal number of layers!" - - def forward(self, src, trg, teacher_forcing_ratio = 0.5): - - #src = [src sent len, batch size] - #trg = [trg sent len, batch size] - #teacher_forcing_ratio is probability to use teacher forcing - #e.g. 
if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time - - # Again, now batch is the first dimention instead of zero - batch_size = trg.shape[1] - max_len = trg.shape[0] - trg_vocab_size = self.decoder.output_dim - - #tensor to store decoder outputs - outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device) - - #last hidden state of the encoder is used as the initial hidden state of the decoder - hidden, cell = self.encoder(src) - - #first input to the decoder is the tokens - input = trg[0,:] - - for t in range(1, max_len): - - output, hidden, cell = self.decoder(input, hidden, cell) - outputs[t] = output - teacher_force = random.random() < teacher_forcing_ratio - top1 = output.max(1)[1] - input = (trg[t] if teacher_force else top1) - - return outputs diff --git a/homeworks/lab01_nlp/utils.py b/homeworks/lab01_nlp/utils.py deleted file mode 100644 index f3691d2..0000000 --- a/homeworks/lab01_nlp/utils.py +++ /dev/null @@ -1,33 +0,0 @@ - -def flatten(l): - return [item for sublist in l for item in sublist] - -def remove_tech_tokens(mystr, tokens_to_remove=['', '', '', '']): - return [x for x in mystr if x not in tokens_to_remove] - - -def get_text(x, TRG_vocab): - text = [TRG_vocab.itos[token] for token in x] - try: - end_idx = text.index('') - text = text[:end_idx] - except ValueError: - pass - text = remove_tech_tokens(text) - if len(text) < 1: - text = [] - return text - - -def generate_translation(src, trg, model, TRG_vocab): - model.eval() - - output = model(src, trg, 0) #turn off teacher forcing - output = output.argmax(dim=-1).cpu().numpy() - - original = get_text(list(trg[:,0].cpu().numpy()), TRG_vocab) - generated = get_text(list(output[1:, 0]), TRG_vocab) - - print('Original: {}'.format(' '.join(original))) - print('Generated: {}'.format(' '.join(generated))) - print() diff --git a/homeworks/lab02_qa/LICENSE b/homeworks/lab02_qa/LICENSE deleted file mode 100644 index e1b9ab0..0000000 --- a/homeworks/lab02_qa/LICENSE 
+++ /dev/null @@ -1,21 +0,0 @@ -The MIT License - -Copyright (c) 2019 Christopher Chute http://chrischute.com - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/homeworks/lab02_qa/README.md b/homeworks/lab02_qa/README.md deleted file mode 100644 index de29ad3..0000000 --- a/homeworks/lab02_qa/README.md +++ /dev/null @@ -1,40 +0,0 @@ -#### Lab02: QA system - -In this homework your goal is to build the QA system for specific language. The default code is available for English and Russian languages. Russian example using the [SberQuAD dataset](https://arxiv.org/pdf/1912.09723.pdf). The preprocessing code and baseline solution (BiDAF) are the slightly adapted version of the [Stanford CS224n Starter code](https://github.com/chrischute/squad) for the SQuAD dataset. 
- -**To use any other language, please, refer to [this post](https://medium.com/deepset-ai/going-beyond-squad-part-1-question-answering-in-different-languages-8eac6cf56f21) or to the Table 2 in the paper [Deep learning based question answering systemin Bengali](https://www.researchgate.net/publication/346129818_Deep_learning_based_question_answering_system_in_Bengali), where the authors provide an overview of available datasets.** - -The available languages are (but not limited to): Korean, Arabic, French, Spanish, Italian, Russian, English, Hindi and Chinese. - -The starting point of this assighnment is the `SberQuAD_preprocessing_and_problem_statement.ipynb` notebook. -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github//natural-language-processing/tree/master/homeworks/lab02_qa/SberQuAD_preprocessing_and_problem_statement.ipynb) - - -You may choose either this assignment or the `homework05` on the Image Captioning. Or do both ;) - -Next comes the original instructions from the https://github.com/chrischute/squad repository. - -P.s. Downgrading PyTorch is not required, starter code works fine on PyTorch 1.4 -P.p.s. If you are running in Colab, mount your Google Drive and store the checkpoints/word vectors there. [Official instruction (en)](https://colab.research.google.com/notebooks/io.ipynb), [Habr post (ru)](https://habr.com/ru/post/348058/). Restarting the kernel after you finished the preprocessing (and saved the data to your disk) might be a good idea to release the memory. - -#### Setup - -1. Make sure you have [Miniconda](https://docs.conda.io/en/latest/miniconda.html) installed - 1. Conda is a package manager that sandboxes your project’s dependencies in a virtual environment - 2. Miniconda contains Conda and its dependencies with no extra packages by default (as opposed to Anaconda, which installs some extra packages) - -2. cd into src, run `conda env create -f environment.yml` - 1. 
This creates a Conda environment called `squad` - -3. Run `source activate squad` - 1. This activates the `squad` environment - 2. Do this each time you want to write/test your code - -4. Run `python setup.py` - 1. This downloads SQuAD 2.0 training and dev sets, as well as the GloVe 300-dimensional word vectors (840B) - 2. This also pre-processes the dataset for efficient data loading - 3. For a MacBook Pro on the Stanford network, `setup.py` takes around 30 minutes total - -5. Browse the code in `train.py` - 1. The `train.py` script is the entry point for training a model. It reads command-line arguments, loads the SQuAD dataset, and trains a model. - 2. You may find it helpful to browse the arguments provided by the starter code. Either look directly at the `parser.add_argument` lines in the source code, or run `python train.py -h`. diff --git a/homeworks/lab02_qa/SberQuAD_preprocessing_and_problem_statement.ipynb b/homeworks/lab02_qa/SberQuAD_preprocessing_and_problem_statement.ipynb deleted file mode 100644 index 7dafe1f..0000000 --- a/homeworks/lab02_qa/SberQuAD_preprocessing_and_problem_statement.ipynb +++ /dev/null @@ -1,360 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Credits: the provided initial code is an adaptation of the [Starter code for Stanford CS224n default final project on SQuAD 2.0](https://github.com/chrischute/squad) which is shared under MIT License. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook does initial preprocessing for the SberQuAD dataset and will give you the starting point in this assignment. If it looks too complex and/or time/resourse-expensive, you may stick to homework05 as well." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. Preprocessing\n", - "This code is a bit changed version of the code from `setup.py`. 
If you want to work with the SQuAD dataset, stick to the original instructions from the https://github.com/chrischute/squad repository." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If running on Colab, uncomment the following lines \n", - "\n", - "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/args.py -nc\n", - "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/layers.py -nc\n", - "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/models.py -nc\n", - "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/setup.py -nc\n", - "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/test.py -nc\n", - "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/train.py -nc\n", - "# !wget https://raw.githubusercontent.com/girafe-ai/natural-language-processing/master/homeworks/lab02_qa/util.py -nc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If running on Colab, uncomment the following lines \n", - "\n", - "# !pip install ujson\n", - "# !pip install tensorboardX\n", - "# !pip install pymorphy2==0.8" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"Train a model on SQuAD.\n", - "\n", - "Author:\n", - " Chris Chute (chute@stanford.edu)\n", - "\"\"\"\n", - "\n", - "import numpy as np\n", - "import random\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torch.optim as optim\n", - "import torch.optim.lr_scheduler as sched\n", - "import torch.utils.data as data\n", - "import util\n", - "\n", - 
"from args import get_train_args\n", - "from collections import OrderedDict\n", - "from json import dumps\n", - "from models import BiDAF\n", - "from tensorboardX import SummaryWriter\n", - "from tqdm import tqdm\n", - "from ujson import load as json_load\n", - "from util import collate_fn, SQuAD" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "Path(\"./data\").mkdir(parents=True, exist_ok=True)\n", - "Path(\"./save\").mkdir(parents=True, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Downloading the SberQuAD data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget http://files.deeppavlov.ai/datasets/sber_squad_clean-v1.1.tar.gz -nc -O ./data/sber_squad_clean-v1.1.tar.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! tar -xzvf ./data/sber_squad_clean-v1.1.tar.gz\n", - "! mv train-v1.1.json data\n", - "! mv dev-v1.1.json data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Downloading the word vectors (this may take a while)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! 
wget http://files.deeppavlov.ai/embeddings/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec -nc -O ./data/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And finally the preprocessing for the SberQuAD dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_file = './data/train-v1.1.json'\n", - "dev_file = './data/dev-v1.1.json'\n", - "glove_file = './data/ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from setup import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Uncomment this cell if needed\n", - "# !pip install pymorphy2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "nlp = spacy.blank(\"ru\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following cell may take a while (usually 10 minutes or less)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Process training set and use it to decide on the word/character vocabularies\n", - "word_counter, char_counter = Counter(), Counter()\n", - "train_examples, train_eval = process_file(train_file, \"train\", word_counter, char_counter, nlp)\n", - "word_emb_mat, word2idx_dict = get_embedding(\n", - " word_counter, 'word', emb_file=glove_file, vec_size=300, num_vectors=1560132)\n", - "char_emb_mat, char2idx_dict = get_embedding(\n", - " char_counter, 'char', emb_file=None, vec_size=64)\n", - "\n", - "\n", - "dev_examples, dev_eval = process_file(dev_file, \"dev\", word_counter, char_counter, nlp)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we have the preprocessed data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_record_file = './data/train.npz'\n", - "dev_record_file = './data/dev.npz'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from args import add_common_args, get_setup_args" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Retreiving the default arguments for the preprocessing script\n", - "_args = get_setup_args(bypass=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_args" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "build_features(_args, train_examples, \"train\", train_record_file, word2idx_dict, char2idx_dict)\n", - "dev_meta = build_features(_args, dev_examples, \"dev\", dev_record_file, word2idx_dict, char2idx_dict)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"save(_args.word_emb_file, word_emb_mat, message=\"word embedding\")\n", - "save(_args.char_emb_file, char_emb_mat, message=\"char embedding\")\n", - "save(_args.train_eval_file, train_eval, message=\"train eval\")\n", - "save(_args.dev_eval_file, dev_eval, message=\"dev eval\")\n", - "save(_args.word2idx_file, word2idx_dict, message=\"word dictionary\")\n", - "save(_args.char2idx_file, char2idx_dict, message=\"char dictionary\")\n", - "save(_args.dev_meta_file, dev_meta, message=\"dev meta\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2. The experiment\n", - "\n", - "Now you are almost ready to go. You may follow these steps to begin (or just start your experiments here).\n", - "\n", - "1. Try running the `train.py` script from the console (or via `!`) (default command-line arguments are ok for the start). If will run the BiDAF model on the preprocessed data. Set `--use_squad_v2` flag to False (SberQuAD is similar to SQuAD v1.1).\n", - "\n", - "Example code (be careful with the path and the names of the variables):\n", - "```\n", - "python train.py --name first_run_on_sberquad --use_squad_v2 False\n", - "```\n", - "\n", - "2. 
After if finishes (might take an 1-2-3 hours depending on the hardware), evaluate your model on the `dev` set and measure the quality.\n", - "Example code (be careful with the path and the names of the variables):\n", - "```\n", - " python test.py --split dev --load_path ./save/train/first_run_on_sberquad-02/best.pth.tar --name best_evaluation_experiment\n", - "```\n", - "The result should be similar to the following:\n", - "```\n", - ">>> Dev NLL: 02.47, F1: 75.62, EM: 55.73, AvNA: 99.42\n", - "```\n", - "\n", - "The [DeepPavlov's RuBERT](http://docs.deeppavlov.ai/en/master/features/models/squad.html) achieves $F1 = 84.60\\pm0.11$ and $EM = 66.30\\pm0.24$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Here comes your quest: try to improve the quality of this QA system. \n", - "\n", - "This is a very creative assignment. It is all about experimenting, trying different approaches (and a lot of computations). But if you wish to stick to some numbers, try to increase F1 at least by $5$ points.\n", - "\n", - "Here are some ideas that might help you on your way:\n", - "* Try adapting the optimization hyperparameters/network structure to Russian language (the baseline is designed for English SQuAD dataset).\n", - "* Incorporating the additional information about the data (like PoS tags) might be a good idea.\n", - "* __Distilling the knowledge from a pre-trained RuBERT__ (e.g. try to use the predictions of the model we've discussed on `week10` as soft targets).\n", - "* Or anything else.\n", - "\n", - "\n", - "And, first of all, read the initial code carefully.\n", - "\n", - "\n", - "Good luck! 
Feel free to share your results :)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Py3 Research", - "language": "python", - "name": "py3_research_kernel" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/homeworks/lab02_qa/args.py b/homeworks/lab02_qa/args.py deleted file mode 100644 index 47a59d1..0000000 --- a/homeworks/lab02_qa/args.py +++ /dev/null @@ -1,247 +0,0 @@ -"""Command-line arguments for setup.py, train.py, test.py. - -Author: - Chris Chute (chute@stanford.edu) -""" - -import argparse - - -def get_setup_args(bypass=False): - """Get arguments needed in setup.py.""" - parser = argparse.ArgumentParser('Download and pre-process SQuAD') - - add_common_args(parser) - - parser.add_argument('--train_url', - type=str, - default='https://github.com/chrischute/squad/data/train-v2.0.json') - parser.add_argument('--dev_url', - type=str, - default='https://github.com/chrischute/squad/data/dev-v2.0.json') - parser.add_argument('--test_url', - type=str, - default='https://github.com/chrischute/squad/data/test-v2.0.json') - parser.add_argument('--glove_url', - type=str, - default='http://nlp.stanford.edu/data/glove.840B.300d.zip') - parser.add_argument('--dev_meta_file', - type=str, - default='./data/dev_meta.json') - parser.add_argument('--test_meta_file', - type=str, - default='./data/test_meta.json') - parser.add_argument('--word2idx_file', - type=str, - default='./data/word2idx.json') - parser.add_argument('--char2idx_file', - type=str, - default='./data/char2idx.json') - parser.add_argument('--answer_file', - type=str, - default='./data/answer.json') - parser.add_argument('--para_limit', - type=int, - default=400, - help='Max number of words in a paragraph') - 
parser.add_argument('--ques_limit', - type=int, - default=50, - help='Max number of words to keep from a question') - parser.add_argument('--test_para_limit', - type=int, - default=1000, - help='Max number of words in a paragraph at test time') - parser.add_argument('--test_ques_limit', - type=int, - default=100, - help='Max number of words in a question at test time') - parser.add_argument('--char_dim', - type=int, - default=64, - help='Size of char vectors (char-level embeddings)') - parser.add_argument('--glove_dim', - type=int, - default=300, - help='Size of GloVe word vectors to use') - parser.add_argument('--glove_num_vecs', - type=int, - default=2196017, - help='Number of GloVe vectors') - parser.add_argument('--ans_limit', - type=int, - default=30, - help='Max number of words in a training example answer') - parser.add_argument('--char_limit', - type=int, - default=16, - help='Max number of chars to keep from a word') - parser.add_argument('--include_test_examples', - type=lambda s: s.lower().startswith('t'), - default=True, - help='Process examples from the test set') - - if bypass: - args = parser.parse_args('') - else: - args = parser.parse_args() - - return args - - -def get_train_args(): - """Get arguments needed in train.py.""" - parser = argparse.ArgumentParser('Train a model on SQuAD') - - add_common_args(parser) - add_train_test_args(parser) - - parser.add_argument('--eval_steps', - type=int, - default=50000, - help='Number of steps between successive evaluations.') - parser.add_argument('--lr', - type=float, - default=0.5, - help='Learning rate.') - parser.add_argument('--l2_wd', - type=float, - default=0, - help='L2 weight decay.') - parser.add_argument('--num_epochs', - type=int, - default=30, - help='Number of epochs for which to train. 
Negative means forever.') - parser.add_argument('--drop_prob', - type=float, - default=0.2, - help='Probability of zeroing an activation in dropout layers.') - parser.add_argument('--metric_name', - type=str, - default='F1', - choices=('NLL', 'EM', 'F1'), - help='Name of dev metric to determine best checkpoint.') - parser.add_argument('--max_checkpoints', - type=int, - default=5, - help='Maximum number of checkpoints to keep on disk.') - parser.add_argument('--max_grad_norm', - type=float, - default=5.0, - help='Maximum gradient norm for gradient clipping.') - parser.add_argument('--seed', - type=int, - default=224, - help='Random seed for reproducibility.') - parser.add_argument('--ema_decay', - type=float, - default=0.999, - help='Decay rate for exponential moving average of parameters.') - - args = parser.parse_args() - - if args.metric_name == 'NLL': - # Best checkpoint is the one that minimizes negative log-likelihood - args.maximize_metric = False - elif args.metric_name in ('EM', 'F1'): - # Best checkpoint is the one that maximizes EM or F1 - args.maximize_metric = True - else: - raise ValueError(f'Unrecognized metric name: "{args.metric_name}"') - - return args - - -def get_test_args(): - """Get arguments needed in test.py.""" - parser = argparse.ArgumentParser('Test a trained model on SQuAD') - - add_common_args(parser) - add_train_test_args(parser) - - parser.add_argument('--split', - type=str, - default='dev', - choices=('train', 'dev', 'test'), - help='Split to use for testing.') - parser.add_argument('--sub_file', - type=str, - default='submission.csv', - help='Name for submission file.') - - # Require load_path for test.py - args = parser.parse_args() - if not args.load_path: - raise argparse.ArgumentError('Missing required argument --load_path') - - return args - - -def add_common_args(parser): - """Add arguments common to all 3 scripts: setup.py, train.py, test.py""" - parser.add_argument('--train_record_file', - type=str, - 
default='./data/train.npz') - parser.add_argument('--dev_record_file', - type=str, - default='./data/dev.npz') - parser.add_argument('--test_record_file', - type=str, - default='./data/test.npz') - parser.add_argument('--word_emb_file', - type=str, - default='./data/word_emb.json') - parser.add_argument('--char_emb_file', - type=str, - default='./data/char_emb.json') - parser.add_argument('--train_eval_file', - type=str, - default='./data/train_eval.json') - parser.add_argument('--dev_eval_file', - type=str, - default='./data/dev_eval.json') - parser.add_argument('--test_eval_file', - type=str, - default='./data/test_eval.json') - - -def add_train_test_args(parser): - """Add arguments common to train.py and test.py""" - parser.add_argument('--name', - '-n', - type=str, - required=True, - help='Name to identify training or test run.') - parser.add_argument('--max_ans_len', - type=int, - default=15, - help='Maximum length of a predicted answer.') - parser.add_argument('--num_workers', - type=int, - default=4, - help='Number of sub-processes to use per data loader.') - parser.add_argument('--save_dir', - type=str, - default='./save/', - help='Base directory for saving information.') - parser.add_argument('--batch_size', - type=int, - default=64, - help='Batch size per GPU. 
Scales automatically when \ - multiple GPUs are available.') - parser.add_argument('--use_squad_v2', - type=lambda s: s.lower().startswith('t'), - default=True, - help='Whether to use SQuAD 2.0 (unanswerable) questions.') - parser.add_argument('--hidden_size', - type=int, - default=100, - help='Number of features in encoder hidden layers.') - parser.add_argument('--num_visuals', - type=int, - default=10, - help='Number of examples to visualize in TensorBoard.') - parser.add_argument('--load_path', - type=str, - default=None, - help='Path to load as a model checkpoint.') diff --git a/homeworks/lab02_qa/layers.py b/homeworks/lab02_qa/layers.py deleted file mode 100644 index 6859e4d..0000000 --- a/homeworks/lab02_qa/layers.py +++ /dev/null @@ -1,222 +0,0 @@ -"""Assortment of layers for use in models.py. - -Author: - Chris Chute (chute@stanford.edu) -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence -from util import masked_softmax - - -class Embedding(nn.Module): - """Embedding layer used by BiDAF, without the character-level component. - - Word-level embeddings are further refined using a 2-layer Highway Encoder - (see `HighwayEncoder` class for details). - - Args: - word_vectors (torch.Tensor): Pre-trained word vectors. - hidden_size (int): Size of hidden activations. 
- drop_prob (float): Probability of zero-ing out activations - """ - def __init__(self, word_vectors, hidden_size, drop_prob): - super(Embedding, self).__init__() - self.drop_prob = drop_prob - self.embed = nn.Embedding.from_pretrained(word_vectors) - self.proj = nn.Linear(word_vectors.size(1), hidden_size, bias=False) - self.hwy = HighwayEncoder(2, hidden_size) - - def forward(self, x): - emb = self.embed(x) # (batch_size, seq_len, embed_size) - emb = F.dropout(emb, self.drop_prob, self.training) - emb = self.proj(emb) # (batch_size, seq_len, hidden_size) - emb = self.hwy(emb) # (batch_size, seq_len, hidden_size) - - return emb - - -class HighwayEncoder(nn.Module): - """Encode an input sequence using a highway network. - - Based on the paper: - "Highway Networks" - by Rupesh Kumar Srivastava, Klaus Greff, Jürgen Schmidhuber - (https://arxiv.org/abs/1505.00387). - - Args: - num_layers (int): Number of layers in the highway encoder. - hidden_size (int): Size of hidden activations. - """ - def __init__(self, num_layers, hidden_size): - super(HighwayEncoder, self).__init__() - self.transforms = nn.ModuleList([nn.Linear(hidden_size, hidden_size) - for _ in range(num_layers)]) - self.gates = nn.ModuleList([nn.Linear(hidden_size, hidden_size) - for _ in range(num_layers)]) - - def forward(self, x): - for gate, transform in zip(self.gates, self.transforms): - # Shapes of g, t, and x are all (batch_size, seq_len, hidden_size) - g = torch.sigmoid(gate(x)) - t = F.relu(transform(x)) - x = g * t + (1 - g) * x - - return x - - -class RNNEncoder(nn.Module): - """General-purpose layer for encoding a sequence using a bidirectional RNN. - - Encoded output is the RNN's hidden state at each position, which - has shape `(batch_size, seq_len, hidden_size * 2)`. - - Args: - input_size (int): Size of a single timestep in the input. - hidden_size (int): Size of the RNN hidden state. - num_layers (int): Number of layers of RNN cells to use. 
- drop_prob (float): Probability of zero-ing out activations. - """ - def __init__(self, - input_size, - hidden_size, - num_layers, - drop_prob=0.): - super(RNNEncoder, self).__init__() - self.drop_prob = drop_prob - self.rnn = nn.LSTM(input_size, hidden_size, num_layers, - batch_first=True, - bidirectional=True, - dropout=drop_prob if num_layers > 1 else 0.) - - def forward(self, x, lengths): - # Save original padded length for use by pad_packed_sequence - orig_len = x.size(1) - - # Sort by length and pack sequence for RNN - lengths, sort_idx = lengths.sort(0, descending=True) - x = x[sort_idx] # (batch_size, seq_len, input_size) - x = pack_padded_sequence(x, lengths, batch_first=True) - - # Apply RNN - x, _ = self.rnn(x) # (batch_size, seq_len, 2 * hidden_size) - - # Unpack and reverse sort - x, _ = pad_packed_sequence(x, batch_first=True, total_length=orig_len) - _, unsort_idx = sort_idx.sort(0) - x = x[unsort_idx] # (batch_size, seq_len, 2 * hidden_size) - - # Apply dropout (RNN applies dropout after all but the last layer) - x = F.dropout(x, self.drop_prob, self.training) - - return x - - -class BiDAFAttention(nn.Module): - """Bidirectional attention originally used by BiDAF. - - Bidirectional attention computes attention in two directions: - The context attends to the query and the query attends to the context. - The output of this layer is the concatenation of [context, c2q_attention, - context * c2q_attention, context * q2c_attention]. This concatenation allows - the attention vector at each timestep, along with the embeddings from - previous layers, to flow through the attention layer to the modeling layer. - The output has shape (batch_size, context_len, 8 * hidden_size). - - Args: - hidden_size (int): Size of hidden activations. - drop_prob (float): Probability of zero-ing out activations. 
- """ - def __init__(self, hidden_size, drop_prob=0.1): - super(BiDAFAttention, self).__init__() - self.drop_prob = drop_prob - self.c_weight = nn.Parameter(torch.zeros(hidden_size, 1)) - self.q_weight = nn.Parameter(torch.zeros(hidden_size, 1)) - self.cq_weight = nn.Parameter(torch.zeros(1, 1, hidden_size)) - for weight in (self.c_weight, self.q_weight, self.cq_weight): - nn.init.xavier_uniform_(weight) - self.bias = nn.Parameter(torch.zeros(1)) - - def forward(self, c, q, c_mask, q_mask): - batch_size, c_len, _ = c.size() - q_len = q.size(1) - s = self.get_similarity_matrix(c, q) # (batch_size, c_len, q_len) - c_mask = c_mask.view(batch_size, c_len, 1) # (batch_size, c_len, 1) - q_mask = q_mask.view(batch_size, 1, q_len) # (batch_size, 1, q_len) - s1 = masked_softmax(s, q_mask, dim=2) # (batch_size, c_len, q_len) - s2 = masked_softmax(s, c_mask, dim=1) # (batch_size, c_len, q_len) - - # (bs, c_len, q_len) x (bs, q_len, hid_size) => (bs, c_len, hid_size) - a = torch.bmm(s1, q) - # (bs, c_len, c_len) x (bs, c_len, hid_size) => (bs, c_len, hid_size) - b = torch.bmm(torch.bmm(s1, s2.transpose(1, 2)), c) - - x = torch.cat([c, a, c * a, c * b], dim=2) # (bs, c_len, 4 * hid_size) - - return x - - def get_similarity_matrix(self, c, q): - """Get the "similarity matrix" between context and query (using the - terminology of the BiDAF paper). - - A naive implementation as described in BiDAF would concatenate the - three vectors then project the result with a single weight matrix. This - method is a more memory-efficient implementation of the same operation. 
- - See Also: - Equation 1 in https://arxiv.org/abs/1611.01603 - """ - c_len, q_len = c.size(1), q.size(1) - c = F.dropout(c, self.drop_prob, self.training) # (bs, c_len, hid_size) - q = F.dropout(q, self.drop_prob, self.training) # (bs, q_len, hid_size) - - # Shapes: (batch_size, c_len, q_len) - s0 = torch.matmul(c, self.c_weight).expand([-1, -1, q_len]) - s1 = torch.matmul(q, self.q_weight).transpose(1, 2)\ - .expand([-1, c_len, -1]) - s2 = torch.matmul(c * self.cq_weight, q.transpose(1, 2)) - s = s0 + s1 + s2 + self.bias - - return s - - -class BiDAFOutput(nn.Module): - """Output layer used by BiDAF for question answering. - - Computes a linear transformation of the attention and modeling - outputs, then takes the softmax of the result to get the start pointer. - A bidirectional LSTM is then applied the modeling output to produce `mod_2`. - A second linear+softmax of the attention output and `mod_2` is used - to get the end pointer. - - Args: - hidden_size (int): Hidden size used in the BiDAF model. - drop_prob (float): Probability of zero-ing out activations. 
- """ - def __init__(self, hidden_size, drop_prob): - super(BiDAFOutput, self).__init__() - self.att_linear_1 = nn.Linear(8 * hidden_size, 1) - self.mod_linear_1 = nn.Linear(2 * hidden_size, 1) - - self.rnn = RNNEncoder(input_size=2 * hidden_size, - hidden_size=hidden_size, - num_layers=1, - drop_prob=drop_prob) - - self.att_linear_2 = nn.Linear(8 * hidden_size, 1) - self.mod_linear_2 = nn.Linear(2 * hidden_size, 1) - - def forward(self, att, mod, mask): - # Shapes: (batch_size, seq_len, 1) - logits_1 = self.att_linear_1(att) + self.mod_linear_1(mod) - mod_2 = self.rnn(mod, mask.sum(-1)) - logits_2 = self.att_linear_2(att) + self.mod_linear_2(mod_2) - - # Shapes: (batch_size, seq_len) - log_p1 = masked_softmax(logits_1.squeeze(), mask, log_softmax=True) - log_p2 = masked_softmax(logits_2.squeeze(), mask, log_softmax=True) - - return log_p1, log_p2 diff --git a/homeworks/lab02_qa/models.py b/homeworks/lab02_qa/models.py deleted file mode 100644 index 3487ea2..0000000 --- a/homeworks/lab02_qa/models.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Top-level model classes. - -Author: - Chris Chute (chute@stanford.edu) -""" - -import layers -import torch -import torch.nn as nn - - -class BiDAF(nn.Module): - """Baseline BiDAF model for SQuAD. - - Based on the paper: - "Bidirectional Attention Flow for Machine Comprehension" - by Minjoon Seo, Aniruddha Kembhavi, Ali Farhadi, Hannaneh Hajishirzi - (https://arxiv.org/abs/1611.01603). - - Follows a high-level structure commonly found in SQuAD models: - - Embedding layer: Embed word indices to get word vectors. - - Encoder layer: Encode the embedded sequence. - - Attention layer: Apply an attention mechanism to the encoded sequence. - - Model encoder layer: Encode the sequence again. - - Output layer: Simple layer (e.g., fc + softmax) to get final outputs. - - Args: - word_vectors (torch.Tensor): Pre-trained word vectors. - hidden_size (int): Number of features in the hidden state at each layer. 
- drop_prob (float): Dropout probability. - """ - def __init__(self, word_vectors, hidden_size, drop_prob=0.): - super(BiDAF, self).__init__() - self.emb = layers.Embedding(word_vectors=word_vectors, - hidden_size=hidden_size, - drop_prob=drop_prob) - - self.enc = layers.RNNEncoder(input_size=hidden_size, - hidden_size=hidden_size, - num_layers=1, - drop_prob=drop_prob) - - self.att = layers.BiDAFAttention(hidden_size=2 * hidden_size, - drop_prob=drop_prob) - - self.mod = layers.RNNEncoder(input_size=8 * hidden_size, - hidden_size=hidden_size, - num_layers=2, - drop_prob=drop_prob) - - self.out = layers.BiDAFOutput(hidden_size=hidden_size, - drop_prob=drop_prob) - - def forward(self, cw_idxs, qw_idxs): - c_mask = torch.zeros_like(cw_idxs) != cw_idxs - q_mask = torch.zeros_like(qw_idxs) != qw_idxs - c_len, q_len = c_mask.sum(-1), q_mask.sum(-1) - - c_emb = self.emb(cw_idxs) # (batch_size, c_len, hidden_size) - q_emb = self.emb(qw_idxs) # (batch_size, q_len, hidden_size) - - c_enc = self.enc(c_emb, c_len) # (batch_size, c_len, 2 * hidden_size) - q_enc = self.enc(q_emb, q_len) # (batch_size, q_len, 2 * hidden_size) - - att = self.att(c_enc, q_enc, - c_mask, q_mask) # (batch_size, c_len, 8 * hidden_size) - - mod = self.mod(att, c_len) # (batch_size, c_len, 2 * hidden_size) - - out = self.out(att, mod, c_mask) # 2 tensors, each (batch_size, c_len) - - return out diff --git a/homeworks/lab02_qa/setup.py b/homeworks/lab02_qa/setup.py deleted file mode 100644 index c270cdf..0000000 --- a/homeworks/lab02_qa/setup.py +++ /dev/null @@ -1,396 +0,0 @@ -"""Download and pre-process SQuAD and GloVe. 
- -Usage: - > source activate squad - > python setup.py - -Pre-processing code adapted from: - > https://github.com/HKUST-KnowComp/R-Net/blob/master/prepro.py - -Author: - Chris Chute (chute@stanford.edu) -""" - -import numpy as np -import os -import spacy -import ujson as json -import urllib.request - -from args import get_setup_args -from codecs import open -from collections import Counter -from subprocess import run -from tqdm import tqdm -from zipfile import ZipFile - - -def download_url(url, output_path, show_progress=True): - class DownloadProgressBar(tqdm): - def update_to(self, b=1, bsize=1, tsize=None): - if tsize is not None: - self.total = tsize - self.update(b * bsize - self.n) - - if show_progress: - # Download with a progress bar - with DownloadProgressBar(unit='B', unit_scale=True, - miniters=1, desc=url.split('/')[-1]) as t: - urllib.request.urlretrieve(url, - filename=output_path, - reporthook=t.update_to) - else: - # Simple download with no progress bar - urllib.request.urlretrieve(url, output_path) - - -def url_to_data_path(url): - return os.path.join('./data/', url.split('/')[-1]) - - -def download(args): - downloads = [ - # Can add other downloads here (e.g., other word vectors) - ('GloVe word vectors', args.glove_url), - ] - - for name, url in downloads: - output_path = url_to_data_path(url) - if not os.path.exists(output_path): - print(f'Downloading {name}...') - download_url(url, output_path) - - if os.path.exists(output_path) and output_path.endswith('.zip'): - extracted_path = output_path.replace('.zip', '') - if not os.path.exists(extracted_path): - print(f'Unzipping {name}...') - with ZipFile(output_path, 'r') as zip_fh: - zip_fh.extractall(extracted_path) - - print('Downloading spacy language model...') - run(['python', '-m', 'spacy', 'download', 'en']) - -def word_tokenize(sent, nlp): - doc = nlp(sent) - return [token.text for token in doc] - - -def convert_idx(text, tokens): - current = 0 - spans = [] - for token in tokens: - current 
= text.find(token, current) - if current < 0: - print(f"Token {token} cannot be found") - raise Exception() - spans.append((current, current + len(token))) - current += len(token) - return spans - - -def process_file(filename, data_type, word_counter, char_counter, nlp): - print(f"Pre-processing {data_type} examples...") - examples = [] - eval_examples = {} - total = 0 - with open(filename, "r") as fh: - source = json.load(fh) - for article in tqdm(source["data"]): - for para in article["paragraphs"]: - context = para["context"].replace( - "''", '" ').replace("``", '" ') - context_tokens = word_tokenize(context, nlp) - context_chars = [list(token) for token in context_tokens] - spans = convert_idx(context, context_tokens) - for token in context_tokens: - word_counter[token] += len(para["qas"]) - for char in token: - char_counter[char] += len(para["qas"]) - for qa in para["qas"]: - total += 1 - ques = qa["question"].replace( - "''", '" ').replace("``", '" ') - ques_tokens = word_tokenize(ques, nlp) - ques_chars = [list(token) for token in ques_tokens] - for token in ques_tokens: - word_counter[token] += 1 - for char in token: - char_counter[char] += 1 - y1s, y2s = [], [] - answer_texts = [] - for answer in qa["answers"]: - answer_text = answer["text"] - answer_start = answer['answer_start'] - answer_end = answer_start + len(answer_text) - answer_texts.append(answer_text) - answer_span = [] - for idx, span in enumerate(spans): - if not (answer_end <= span[0] or answer_start >= span[1]): - answer_span.append(idx) - y1, y2 = answer_span[0], answer_span[-1] - y1s.append(y1) - y2s.append(y2) - example = {"context_tokens": context_tokens, - "context_chars": context_chars, - "ques_tokens": ques_tokens, - "ques_chars": ques_chars, - "y1s": y1s, - "y2s": y2s, - "id": total} - examples.append(example) - eval_examples[str(total)] = {"context": context, - "question": ques, - "spans": spans, - "answers": answer_texts, - "uuid": qa["id"]} - print(f"{len(examples)} questions in 
total") - return examples, eval_examples - - -def get_embedding(counter, data_type, limit=-1, emb_file=None, vec_size=None, num_vectors=None): - print(f"Pre-processing {data_type} vectors...") - embedding_dict = {} - filtered_elements = [k for k, v in counter.items() if v > limit] - if emb_file is not None: - assert vec_size is not None - with open(emb_file, "r", encoding="utf-8") as fh: - for line in tqdm(fh, total=num_vectors): - array = line.split() - word = "".join(array[0:-vec_size]) - vector = list(map(float, array[-vec_size:])) - if word in counter and counter[word] > limit: - embedding_dict[word] = vector - print(f"{len(embedding_dict)} / {len(filtered_elements)} tokens have corresponding {data_type} embedding vector") - else: - assert vec_size is not None - for token in filtered_elements: - embedding_dict[token] = [np.random.normal( - scale=0.1) for _ in range(vec_size)] - print(f"{len(filtered_elements)} tokens have corresponding {data_type} embedding vector") - - NULL = "--NULL--" - OOV = "--OOV--" - token2idx_dict = {token: idx for idx, token in enumerate(embedding_dict.keys(), 2)} - token2idx_dict[NULL] = 0 - token2idx_dict[OOV] = 1 - embedding_dict[NULL] = [0. for _ in range(vec_size)] - embedding_dict[OOV] = [0. 
for _ in range(vec_size)] - idx2emb_dict = {idx: embedding_dict[token] - for token, idx in token2idx_dict.items()} - emb_mat = [idx2emb_dict[idx] for idx in range(len(idx2emb_dict))] - return emb_mat, token2idx_dict - - -def convert_to_features(args, data, word2idx_dict, char2idx_dict, is_test): - example = {} - context, question = data - context = context.replace("''", '" ').replace("``", '" ') - question = question.replace("''", '" ').replace("``", '" ') - example['context_tokens'] = word_tokenize(context) - example['ques_tokens'] = word_tokenize(question) - example['context_chars'] = [list(token) for token in example['context_tokens']] - example['ques_chars'] = [list(token) for token in example['ques_tokens']] - - para_limit = args.test_para_limit if is_test else args.para_limit - ques_limit = args.test_ques_limit if is_test else args.ques_limit - char_limit = args.char_limit - - def filter_func(example): - return len(example["context_tokens"]) > para_limit or \ - len(example["ques_tokens"]) > ques_limit - - if filter_func(example): - raise ValueError("Context/Questions lengths are over the limit") - - context_idxs = np.zeros([para_limit], dtype=np.int32) - context_char_idxs = np.zeros([para_limit, char_limit], dtype=np.int32) - ques_idxs = np.zeros([ques_limit], dtype=np.int32) - ques_char_idxs = np.zeros([ques_limit, char_limit], dtype=np.int32) - - def _get_word(word): - for each in (word, word.lower(), word.capitalize(), word.upper()): - if each in word2idx_dict: - return word2idx_dict[each] - return 1 - - def _get_char(char): - if char in char2idx_dict: - return char2idx_dict[char] - return 1 - - for i, token in enumerate(example["context_tokens"]): - context_idxs[i] = _get_word(token) - - for i, token in enumerate(example["ques_tokens"]): - ques_idxs[i] = _get_word(token) - - for i, token in enumerate(example["context_chars"]): - for j, char in enumerate(token): - if j == char_limit: - break - context_char_idxs[i, j] = _get_char(char) - - for i, token in 
enumerate(example["ques_chars"]): - for j, char in enumerate(token): - if j == char_limit: - break - ques_char_idxs[i, j] = _get_char(char) - - return context_idxs, context_char_idxs, ques_idxs, ques_char_idxs - - -def is_answerable(example): - return len(example['y2s']) > 0 and len(example['y1s']) > 0 - - -def build_features(args, examples, data_type, out_file, word2idx_dict, char2idx_dict, is_test=False): - para_limit = args.test_para_limit if is_test else args.para_limit - ques_limit = args.test_ques_limit if is_test else args.ques_limit - ans_limit = args.ans_limit - char_limit = args.char_limit - - def drop_example(ex, is_test_=False): - if is_test_: - drop = False - else: - drop = len(ex["context_tokens"]) > para_limit or \ - len(ex["ques_tokens"]) > ques_limit or \ - (is_answerable(ex) and - ex["y2s"][0] - ex["y1s"][0] > ans_limit) - - return drop - - print(f"Converting {data_type} examples to indices...") - total = 0 - total_ = 0 - meta = {} - context_idxs = [] - context_char_idxs = [] - ques_idxs = [] - ques_char_idxs = [] - y1s = [] - y2s = [] - ids = [] - for n, example in tqdm(enumerate(examples)): - total_ += 1 - - if drop_example(example, is_test): - continue - - total += 1 - - def _get_word(word): - for each in (word, word.lower(), word.capitalize(), word.upper()): - if each in word2idx_dict: - return word2idx_dict[each] - return 1 - - def _get_char(char): - if char in char2idx_dict: - return char2idx_dict[char] - return 1 - - context_idx = np.zeros([para_limit], dtype=np.int32) - context_char_idx = np.zeros([para_limit, char_limit], dtype=np.int32) - ques_idx = np.zeros([ques_limit], dtype=np.int32) - ques_char_idx = np.zeros([ques_limit, char_limit], dtype=np.int32) - - for i, token in enumerate(example["context_tokens"]): - context_idx[i] = _get_word(token) - context_idxs.append(context_idx) - - for i, token in enumerate(example["ques_tokens"]): - ques_idx[i] = _get_word(token) - ques_idxs.append(ques_idx) - - for i, token in 
enumerate(example["context_chars"]): - for j, char in enumerate(token): - if j == char_limit: - break - context_char_idx[i, j] = _get_char(char) - context_char_idxs.append(context_char_idx) - - for i, token in enumerate(example["ques_chars"]): - for j, char in enumerate(token): - if j == char_limit: - break - ques_char_idx[i, j] = _get_char(char) - ques_char_idxs.append(ques_char_idx) - - if is_answerable(example): - start, end = example["y1s"][-1], example["y2s"][-1] - else: - start, end = -1, -1 - - y1s.append(start) - y2s.append(end) - ids.append(example["id"]) - - np.savez(out_file, - context_idxs=np.array(context_idxs), - context_char_idxs=np.array(context_char_idxs), - ques_idxs=np.array(ques_idxs), - ques_char_idxs=np.array(ques_char_idxs), - y1s=np.array(y1s), - y2s=np.array(y2s), - ids=np.array(ids)) - print(f"Built {total} / {total_} instances of features in total") - meta["total"] = total - return meta - - -def save(filename, obj, message=None): - if message is not None: - print(f"Saving {message}...") - with open(filename, "w") as fh: - json.dump(obj, fh) - - -def pre_process(args): - # Process training set and use it to decide on the word/character vocabularies - word_counter, char_counter = Counter(), Counter() - train_examples, train_eval = process_file(args.train_file, "train", word_counter, char_counter) - word_emb_mat, word2idx_dict = get_embedding( - word_counter, 'word', emb_file=args.glove_file, vec_size=args.glove_dim, num_vectors=args.glove_num_vecs) - char_emb_mat, char2idx_dict = get_embedding( - char_counter, 'char', emb_file=None, vec_size=args.char_dim) - - # Process dev and test sets - dev_examples, dev_eval = process_file(args.dev_file, "dev", word_counter, char_counter) - build_features(args, train_examples, "train", args.train_record_file, word2idx_dict, char2idx_dict) - dev_meta = build_features(args, dev_examples, "dev", args.dev_record_file, word2idx_dict, char2idx_dict) - if args.include_test_examples: - test_examples, test_eval 
= process_file(args.test_file, "test", word_counter, char_counter) - save(args.test_eval_file, test_eval, message="test eval") - test_meta = build_features(args, test_examples, "test", - args.test_record_file, word2idx_dict, char2idx_dict, is_test=True) - save(args.test_meta_file, test_meta, message="test meta") - - save(args.word_emb_file, word_emb_mat, message="word embedding") - save(args.char_emb_file, char_emb_mat, message="char embedding") - save(args.train_eval_file, train_eval, message="train eval") - save(args.dev_eval_file, dev_eval, message="dev eval") - save(args.word2idx_file, word2idx_dict, message="word dictionary") - save(args.char2idx_file, char2idx_dict, message="char dictionary") - save(args.dev_meta_file, dev_meta, message="dev meta") - - -if __name__ == '__main__': - # Get command-line args - args_ = get_setup_args() - - # Download resources - download(args_) - - # Import spacy language model - nlp = spacy.blank("en") - - # Preprocess dataset - args_.train_file = url_to_data_path(args_.train_url) - args_.dev_file = url_to_data_path(args_.dev_url) - if args_.include_test_examples: - args_.test_file = url_to_data_path(args_.test_url) - glove_dir = url_to_data_path(args_.glove_url.replace('.zip', '')) - glove_ext = f'.txt' if glove_dir.endswith('d') else f'.{args_.glove_dim}d.txt' - args_.glove_file = os.path.join(glove_dir, os.path.basename(glove_dir) + glove_ext) - pre_process(args_) diff --git a/homeworks/lab02_qa/test.py b/homeworks/lab02_qa/test.py deleted file mode 100644 index 745fa36..0000000 --- a/homeworks/lab02_qa/test.py +++ /dev/null @@ -1,138 +0,0 @@ -"""Test a model and generate submission CSV. 
- -Usage: - > python test.py --split SPLIT --load_path PATH --name NAME - where - > SPLIT is either "dev" or "test" - > PATH is a path to a checkpoint (e.g., save/train/model-01/best.pth.tar) - > NAME is a name to identify the test run - -Author: - Chris Chute (chute@stanford.edu) -""" - -import csv -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.data as data -import util - -from args import get_test_args -from collections import OrderedDict -from json import dumps -from models import BiDAF -from os.path import join -from tensorboardX import SummaryWriter -from tqdm import tqdm -from ujson import load as json_load -from util import collate_fn, SQuAD - - -def main(args): - # Set up logging - args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) - log = util.get_logger(args.save_dir, args.name) - log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') - device, gpu_ids = util.get_available_devices() - args.batch_size *= max(1, len(gpu_ids)) - - # Get embeddings - log.info('Loading embeddings...') - word_vectors = util.torch_from_json(args.word_emb_file) - - # Get model - log.info('Building model...') - model = BiDAF(word_vectors=word_vectors, - hidden_size=args.hidden_size) - model = nn.DataParallel(model, gpu_ids) - log.info(f'Loading checkpoint from {args.load_path}...') - model = util.load_model(model, args.load_path, gpu_ids, return_step=False) - model = model.to(device) - model.eval() - - # Get data loader - log.info('Building dataset...') - record_file = vars(args)[f'{args.split}_record_file'] - dataset = SQuAD(record_file, args.use_squad_v2) - data_loader = data.DataLoader(dataset, - batch_size=args.batch_size, - shuffle=False, - num_workers=args.num_workers, - collate_fn=collate_fn) - - # Evaluate - log.info(f'Evaluating on {args.split} split...') - nll_meter = util.AverageMeter() - pred_dict = {} # Predictions for TensorBoard - sub_dict = {} # Predictions for submission - eval_file = 
vars(args)[f'{args.split}_eval_file'] - with open(eval_file, 'r') as fh: - gold_dict = json_load(fh) - with torch.no_grad(), \ - tqdm(total=len(dataset)) as progress_bar: - for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: - # Setup for forward - cw_idxs = cw_idxs.to(device) - qw_idxs = qw_idxs.to(device) - batch_size = cw_idxs.size(0) - - # Forward - log_p1, log_p2 = model(cw_idxs, qw_idxs) - y1, y2 = y1.to(device), y2.to(device) - loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) - nll_meter.update(loss.item(), batch_size) - - # Get F1 and EM scores - p1, p2 = log_p1.exp(), log_p2.exp() - starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) - - # Log info - progress_bar.update(batch_size) - if args.split != 'test': - # No labels for the test set, so NLL would be invalid - progress_bar.set_postfix(NLL=nll_meter.avg) - - idx2pred, uuid2pred = util.convert_tokens(gold_dict, - ids.tolist(), - starts.tolist(), - ends.tolist(), - args.use_squad_v2) - pred_dict.update(idx2pred) - sub_dict.update(uuid2pred) - - # Log results (except for test set, since it does not come with labels) - if args.split != 'test': - results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) - results_list = [('NLL', nll_meter.avg), - ('F1', results['F1']), - ('EM', results['EM'])] - if args.use_squad_v2: - results_list.append(('AvNA', results['AvNA'])) - results = OrderedDict(results_list) - - # Log to console - results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) - log.info(f'{args.split.title()} {results_str}') - - # Log to TensorBoard - tbx = SummaryWriter(args.save_dir) - util.visualize(tbx, - pred_dict=pred_dict, - eval_path=eval_file, - step=0, - split=args.split, - num_visuals=args.num_visuals) - - # Write submission file - sub_path = join(args.save_dir, args.split + '_' + args.sub_file) - log.info(f'Writing submission file to {sub_path}...') - with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: - 
csv_writer = csv.writer(csv_fh, delimiter=',') - csv_writer.writerow(['Id', 'Predicted']) - for uuid in sorted(sub_dict): - csv_writer.writerow([uuid, sub_dict[uuid]]) - - -if __name__ == '__main__': - main(get_test_args()) diff --git a/homeworks/lab02_qa/train.py b/homeworks/lab02_qa/train.py deleted file mode 100644 index 42e4265..0000000 --- a/homeworks/lab02_qa/train.py +++ /dev/null @@ -1,212 +0,0 @@ -"""Train a model on SQuAD. - -Author: - Chris Chute (chute@stanford.edu) -""" - -import numpy as np -import random -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torch.optim.lr_scheduler as sched -import torch.utils.data as data -import util - -from args import get_train_args -from collections import OrderedDict -from json import dumps -from models import BiDAF -from tensorboardX import SummaryWriter -from tqdm import tqdm -from ujson import load as json_load -from util import collate_fn, SQuAD - - -def main(args): - # Set up logging and devices - args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) - log = util.get_logger(args.save_dir, args.name) - tbx = SummaryWriter(args.save_dir) - - import warnings - warnings.filterwarnings('ignore') - - device, args.gpu_ids = util.get_available_devices() - log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') - args.batch_size *= max(1, len(args.gpu_ids)) - - # Set random seed - log.info(f'Using random seed {args.seed}...') - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.cuda.manual_seed_all(args.seed) - - # Get embeddings - log.info('Loading embeddings...') - word_vectors = util.torch_from_json(args.word_emb_file) - - # Get model - log.info('Building model...') - model = BiDAF(word_vectors=word_vectors, - hidden_size=args.hidden_size, - drop_prob=args.drop_prob) - model = nn.DataParallel(model, args.gpu_ids) - if args.load_path: - log.info(f'Loading checkpoint from {args.load_path}...') 
- model, step = util.load_model(model, args.load_path, args.gpu_ids) - else: - step = 0 - model = model.to(device) - model.train() - ema = util.EMA(model, args.ema_decay) - - # Get saver - saver = util.CheckpointSaver(args.save_dir, - max_checkpoints=args.max_checkpoints, - metric_name=args.metric_name, - maximize_metric=args.maximize_metric, - log=log) - - # Get optimizer and scheduler - optimizer = optim.Adadelta(model.parameters(), args.lr, - weight_decay=args.l2_wd) - scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR - - # Get data loader - log.info('Building dataset...') - train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) - train_loader = data.DataLoader(train_dataset, - batch_size=args.batch_size, - shuffle=True, - num_workers=args.num_workers, - collate_fn=collate_fn) - dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) - dev_loader = data.DataLoader(dev_dataset, - batch_size=args.batch_size, - shuffle=False, - num_workers=args.num_workers, - collate_fn=collate_fn) - - # Train - log.info('Training...') - steps_till_eval = args.eval_steps - epoch = step // len(train_dataset) - while epoch != args.num_epochs: - epoch += 1 - log.info(f'Starting epoch {epoch}...') - with torch.enable_grad(), \ - tqdm(total=len(train_loader.dataset)) as progress_bar: - for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: - # Setup for forward - cw_idxs = cw_idxs.to(device) - qw_idxs = qw_idxs.to(device) - batch_size = cw_idxs.size(0) - optimizer.zero_grad() - - # Forward - log_p1, log_p2 = model(cw_idxs, qw_idxs) - y1, y2 = y1.to(device), y2.to(device) - loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) - loss_val = loss.item() - - # Backward - loss.backward() - nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) - optimizer.step() - scheduler.step(step // batch_size) - ema(model, step // batch_size) - - # Log info - step += batch_size - progress_bar.update(batch_size) - 
progress_bar.set_postfix(epoch=epoch, - NLL=loss_val) - tbx.add_scalar('train/NLL', loss_val, step) - tbx.add_scalar('train/LR', - optimizer.param_groups[0]['lr'], - step) - - steps_till_eval -= batch_size - if steps_till_eval <= 0: - steps_till_eval = args.eval_steps - - # Evaluate and save checkpoint - log.info(f'Evaluating at step {step}...') - ema.assign(model) - results, pred_dict = evaluate(model, dev_loader, device, - args.dev_eval_file, - args.max_ans_len, - args.use_squad_v2) - saver.save(step, model, results[args.metric_name], device) - ema.resume(model) - - # Log to console - results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) - log.info(f'Dev {results_str}') - - # Log to TensorBoard - log.info('Visualizing in TensorBoard...') - for k, v in results.items(): - tbx.add_scalar(f'dev/{k}', v, step) - util.visualize(tbx, - pred_dict=pred_dict, - eval_path=args.dev_eval_file, - step=step, - split='dev', - num_visuals=args.num_visuals) - - -def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2): - nll_meter = util.AverageMeter() - - model.eval() - pred_dict = {} - with open(eval_file, 'r') as fh: - gold_dict = json_load(fh) - with torch.no_grad(), \ - tqdm(total=len(data_loader.dataset)) as progress_bar: - for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: - # Setup for forward - cw_idxs = cw_idxs.to(device) - qw_idxs = qw_idxs.to(device) - batch_size = cw_idxs.size(0) - - # Forward - log_p1, log_p2 = model(cw_idxs, qw_idxs) - y1, y2 = y1.to(device), y2.to(device) - loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) - nll_meter.update(loss.item(), batch_size) - - # Get F1 and EM scores - p1, p2 = log_p1.exp(), log_p2.exp() - starts, ends = util.discretize(p1, p2, max_len, use_squad_v2) - - # Log info - progress_bar.update(batch_size) - progress_bar.set_postfix(NLL=nll_meter.avg) - - preds, _ = util.convert_tokens(gold_dict, - ids.tolist(), - starts.tolist(), - ends.tolist(), - use_squad_v2) - 
pred_dict.update(preds) - - model.train() - - results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2) - results_list = [('NLL', nll_meter.avg), - ('F1', results['F1']), - ('EM', results['EM'])] - if use_squad_v2: - results_list.append(('AvNA', results['AvNA'])) - results = OrderedDict(results_list) - - return results, pred_dict - - -if __name__ == '__main__': - main(get_train_args()) diff --git a/homeworks/lab02_qa/util.py b/homeworks/lab02_qa/util.py deleted file mode 100644 index 1cad0bb..0000000 --- a/homeworks/lab02_qa/util.py +++ /dev/null @@ -1,725 +0,0 @@ -"""Utility classes and methods. - -Author: - Chris Chute (chute@stanford.edu) -""" -import logging -import os -import queue -import re -import shutil -import string -import torch -import torch.nn.functional as F -import torch.utils.data as data -import tqdm -import numpy as np -import ujson as json - -from collections import Counter - - -class SQuAD(data.Dataset): - """Stanford Question Answering Dataset (SQuAD). - - Each item in the dataset is a tuple with the following entries (in order): - - context_idxs: Indices of the words in the context. - Shape (context_len,). - - context_char_idxs: Indices of the characters in the context. - Shape (context_len, max_word_len). - - question_idxs: Indices of the words in the question. - Shape (question_len,). - - question_char_idxs: Indices of the characters in the question. - Shape (question_len, max_word_len). - - y1: Index of word in the context where the answer begins. - -1 if no answer. - - y2: Index of word in the context where the answer ends. - -1 if no answer. - - id: ID of the example. - - Args: - data_path (str): Path to .npz file containing pre-processed dataset. - use_v2 (bool): Whether to use SQuAD 2.0 questions. Otherwise only use SQuAD 1.1. 
- """ - def __init__(self, data_path, use_v2=True): - super(SQuAD, self).__init__() - - dataset = np.load(data_path) - self.context_idxs = torch.from_numpy(dataset['context_idxs']).long() - self.context_char_idxs = torch.from_numpy(dataset['context_char_idxs']).long() - self.question_idxs = torch.from_numpy(dataset['ques_idxs']).long() - self.question_char_idxs = torch.from_numpy(dataset['ques_char_idxs']).long() - self.y1s = torch.from_numpy(dataset['y1s']).long() - self.y2s = torch.from_numpy(dataset['y2s']).long() - - if use_v2: - # SQuAD 2.0: Use index 0 for no-answer token (token 1 = OOV) - batch_size, c_len, w_len = self.context_char_idxs.size() - ones = torch.ones((batch_size, 1), dtype=torch.int64) - self.context_idxs = torch.cat((ones, self.context_idxs), dim=1) - self.question_idxs = torch.cat((ones, self.question_idxs), dim=1) - - ones = torch.ones((batch_size, 1, w_len), dtype=torch.int64) - self.context_char_idxs = torch.cat((ones, self.context_char_idxs), dim=1) - self.question_char_idxs = torch.cat((ones, self.question_char_idxs), dim=1) - - self.y1s += 1 - self.y2s += 1 - - # SQuAD 1.1: Ignore no-answer examples - self.ids = torch.from_numpy(dataset['ids']).long() - self.valid_idxs = [idx for idx in range(len(self.ids)) - if use_v2 or self.y1s[idx].item() >= 0] - - def __getitem__(self, idx): - idx = self.valid_idxs[idx] - example = (self.context_idxs[idx], - self.context_char_idxs[idx], - self.question_idxs[idx], - self.question_char_idxs[idx], - self.y1s[idx], - self.y2s[idx], - self.ids[idx]) - - return example - - def __len__(self): - return len(self.valid_idxs) - - -def collate_fn(examples): - """Create batch tensors from a list of individual examples returned - by `SQuAD.__getitem__`. Merge examples of different length by padding - all examples to the maximum length in the batch. - - Args: - examples (list): List of tuples of the form (context_idxs, context_char_idxs, - question_idxs, question_char_idxs, y1s, y2s, ids). 
- - Returns: - examples (tuple): Tuple of tensors (context_idxs, context_char_idxs, question_idxs, - question_char_idxs, y1s, y2s, ids). All of shape (batch_size, ...), where - the remaining dimensions are the maximum length of examples in the input. - - Adapted from: - https://github.com/yunjey/seq2seq-dataloader - """ - def merge_0d(scalars, dtype=torch.int64): - return torch.tensor(scalars, dtype=dtype) - - def merge_1d(arrays, dtype=torch.int64, pad_value=0): - lengths = [(a != pad_value).sum() for a in arrays] - padded = torch.zeros(len(arrays), max(lengths), dtype=dtype) - for i, seq in enumerate(arrays): - end = lengths[i] - padded[i, :end] = seq[:end] - return padded - - def merge_2d(matrices, dtype=torch.int64, pad_value=0): - heights = [(m.sum(1) != pad_value).sum() for m in matrices] - widths = [(m.sum(0) != pad_value).sum() for m in matrices] - padded = torch.zeros(len(matrices), max(heights), max(widths), dtype=dtype) - for i, seq in enumerate(matrices): - height, width = heights[i], widths[i] - padded[i, :height, :width] = seq[:height, :width] - return padded - - # Group by tensor type - context_idxs, context_char_idxs, \ - question_idxs, question_char_idxs, \ - y1s, y2s, ids = zip(*examples) - - # Merge into batch tensors - context_idxs = merge_1d(context_idxs) - context_char_idxs = merge_2d(context_char_idxs) - question_idxs = merge_1d(question_idxs) - question_char_idxs = merge_2d(question_char_idxs) - y1s = merge_0d(y1s) - y2s = merge_0d(y2s) - ids = merge_0d(ids) - - return (context_idxs, context_char_idxs, - question_idxs, question_char_idxs, - y1s, y2s, ids) - - -class AverageMeter: - """Keep track of average values over time. 
- - Adapted from: - > https://github.com/pytorch/examples/blob/master/imagenet/main.py - """ - def __init__(self): - self.avg = 0 - self.sum = 0 - self.count = 0 - - def reset(self): - """Reset meter.""" - self.__init__() - - def update(self, val, num_samples=1): - """Update meter with new value `val`, the average of `num` samples. - - Args: - val (float): Average value to update the meter with. - num_samples (int): Number of samples that were averaged to - produce `val`. - """ - self.count += num_samples - self.sum += val * num_samples - self.avg = self.sum / self.count - - -class EMA: - """Exponential moving average of model parameters. - Args: - model (torch.nn.Module): Model with parameters whose EMA will be kept. - decay (float): Decay rate for exponential moving average. - """ - def __init__(self, model, decay): - self.decay = decay - self.shadow = {} - self.original = {} - - # Register model parameters - for name, param in model.named_parameters(): - if param.requires_grad: - self.shadow[name] = param.data.clone() - - def __call__(self, model, num_updates): - decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates)) - for name, param in model.named_parameters(): - if param.requires_grad: - assert name in self.shadow - new_average = \ - (1.0 - decay) * param.data + decay * self.shadow[name] - self.shadow[name] = new_average.clone() - - def assign(self, model): - """Assign exponential moving average of parameter values to the - respective parameters. - Args: - model (torch.nn.Module): Model to assign parameter values. - """ - for name, param in model.named_parameters(): - if param.requires_grad: - assert name in self.shadow - self.original[name] = param.data.clone() - param.data = self.shadow[name] - - def resume(self, model): - """Restore original parameters to a model. That is, put back - the values that were in each parameter at the last call to `assign`. - Args: - model (torch.nn.Module): Model to assign parameter values. 
- """ - for name, param in model.named_parameters(): - if param.requires_grad: - assert name in self.shadow - param.data = self.original[name] - - -class CheckpointSaver: - """Class to save and load model checkpoints. - - Save the best checkpoints as measured by a metric value passed into the - `save` method. Overwrite checkpoints with better checkpoints once - `max_checkpoints` have been saved. - - Args: - save_dir (str): Directory to save checkpoints. - max_checkpoints (int): Maximum number of checkpoints to keep before - overwriting old ones. - metric_name (str): Name of metric used to determine best model. - maximize_metric (bool): If true, best checkpoint is that which maximizes - the metric value passed in via `save`. Otherwise, best checkpoint - minimizes the metric. - log (logging.Logger): Optional logger for printing information. - """ - def __init__(self, save_dir, max_checkpoints, metric_name, - maximize_metric=False, log=None): - super(CheckpointSaver, self).__init__() - - self.save_dir = save_dir - self.max_checkpoints = max_checkpoints - self.metric_name = metric_name - self.maximize_metric = maximize_metric - self.best_val = None - self.ckpt_paths = queue.PriorityQueue() - self.log = log - self._print(f"Saver will {'max' if maximize_metric else 'min'}imize {metric_name}...") - - def is_best(self, metric_val): - """Check whether `metric_val` is the best seen so far. - - Args: - metric_val (float): Metric value to compare to prior checkpoints. - """ - if metric_val is None: - # No metric reported - return False - - if self.best_val is None: - # No checkpoint saved yet - return True - - return ((self.maximize_metric and self.best_val < metric_val) - or (not self.maximize_metric and self.best_val > metric_val)) - - def _print(self, message): - """Print a message if logging is enabled.""" - if self.log is not None: - self.log.info(message) - - def save(self, step, model, metric_val, device): - """Save model parameters to disk. 
- - Args: - step (int): Total number of examples seen during training so far. - model (torch.nn.DataParallel): Model to save. - metric_val (float): Determines whether checkpoint is best so far. - device (torch.device): Device where model resides. - """ - ckpt_dict = { - 'model_name': model.__class__.__name__, - 'model_state': model.cpu().state_dict(), - 'step': step - } - model.to(device) - - checkpoint_path = os.path.join(self.save_dir, - f'step_{step}.pth.tar') - torch.save(ckpt_dict, checkpoint_path) - self._print(f'Saved checkpoint: {checkpoint_path}') - - if self.is_best(metric_val): - # Save the best model - self.best_val = metric_val - best_path = os.path.join(self.save_dir, 'best.pth.tar') - shutil.copy(checkpoint_path, best_path) - self._print(f'New best checkpoint at step {step}...') - - # Add checkpoint path to priority queue (lowest priority removed first) - if self.maximize_metric: - priority_order = metric_val - else: - priority_order = -metric_val - - self.ckpt_paths.put((priority_order, checkpoint_path)) - - # Remove a checkpoint if more than max_checkpoints have been saved - if self.ckpt_paths.qsize() > self.max_checkpoints: - _, worst_ckpt = self.ckpt_paths.get() - try: - os.remove(worst_ckpt) - self._print(f'Removed checkpoint: {worst_ckpt}') - except OSError: - # Avoid crashing if checkpoint has been removed or protected - pass - - -def load_model(model, checkpoint_path, gpu_ids, return_step=True): - """Load model parameters from disk. - - Args: - model (torch.nn.DataParallel): Load parameters into this model. - checkpoint_path (str): Path to checkpoint to load. - gpu_ids (list): GPU IDs for DataParallel. - return_step (bool): Also return the step at which checkpoint was saved. - - Returns: - model (torch.nn.DataParallel): Model loaded from checkpoint. - step (int): Step at which checkpoint was saved. Only if `return_step`. 
- """ - device = f"cuda:{gpu_ids[0] if gpu_ids else 'cpu'}" - ckpt_dict = torch.load(checkpoint_path, map_location=device) - - # Build model, load parameters - model.load_state_dict(ckpt_dict['model_state']) - - if return_step: - step = ckpt_dict['step'] - return model, step - - return model - - -def get_available_devices(): - """Get IDs of all available GPUs. - - Returns: - device (torch.device): Main device (GPU 0 or CPU). - gpu_ids (list): List of IDs of all GPUs that are available. - """ - gpu_ids = [] - if torch.cuda.is_available(): - gpu_ids += [gpu_id for gpu_id in range(torch.cuda.device_count())] - device = torch.device(f'cuda:{gpu_ids[0]}') - torch.cuda.set_device(device) - else: - device = torch.device('cpu') - - return device, gpu_ids - - -def masked_softmax(logits, mask, dim=-1, log_softmax=False): - """Take the softmax of `logits` over given dimension, and set - entries to 0 wherever `mask` is 0. - - Args: - logits (torch.Tensor): Inputs to the softmax function. - mask (torch.Tensor): Same shape as `logits`, with 0 indicating - positions that should be assigned 0 probability in the output. - dim (int): Dimension over which to take softmax. - log_softmax (bool): Take log-softmax rather than regular softmax. - E.g., some PyTorch functions such as `F.nll_loss` expect log-softmax. - - Returns: - probs (torch.Tensor): Result of taking masked softmax over the logits. - """ - mask = mask.type(torch.float32) - masked_logits = mask * logits + (1 - mask) * -1e30 - softmax_fn = F.log_softmax if log_softmax else F.softmax - probs = softmax_fn(masked_logits, dim) - - return probs - - -def visualize(tbx, pred_dict, eval_path, step, split, num_visuals): - """Visualize text examples to TensorBoard. - - Args: - tbx (tensorboardX.SummaryWriter): Summary writer. - pred_dict (dict): dict of predictions of the form id -> pred. - eval_path (str): Path to eval JSON file. - step (int): Number of examples seen so far during training. 
- split (str): Name of data split being visualized. - num_visuals (int): Number of visuals to select at random from preds. - """ - if num_visuals <= 0: - return - if num_visuals > len(pred_dict): - num_visuals = len(pred_dict) - - visual_ids = np.random.choice(list(pred_dict), size=num_visuals, replace=False) - - with open(eval_path, 'r') as eval_file: - eval_dict = json.load(eval_file) - for i, id_ in enumerate(visual_ids): - pred = pred_dict[id_] or 'N/A' - example = eval_dict[str(id_)] - question = example['question'] - context = example['context'] - answers = example['answers'] - - gold = answers[0] if answers else 'N/A' - tbl_fmt = (f'- **Question:** {question}\n' - + f'- **Context:** {context}\n' - + f'- **Answer:** {gold}\n' - + f'- **Prediction:** {pred}') - tbx.add_text(tag=f'{split}/{i+1}_of_{num_visuals}', - text_string=tbl_fmt, - global_step=step) - - -def save_preds(preds, save_dir, file_name='predictions.csv'): - """Save predictions `preds` to a CSV file named `file_name` in `save_dir`. - - Args: - preds (list): List of predictions each of the form (id, start, end), - where id is an example ID, and start/end are indices in the context. - save_dir (str): Directory in which to save the predictions file. - file_name (str): File name for the CSV file. - - Returns: - save_path (str): Path where CSV file was saved. 
- """ - # Validate format - if (not isinstance(preds, list) - or any(not isinstance(p, tuple) or len(p) != 3 for p in preds)): - raise ValueError('preds must be a list of tuples (id, start, end)') - - # Make sure predictions are sorted by ID - preds = sorted(preds, key=lambda p: p[0]) - - # Save to a CSV file - save_path = os.path.join(save_dir, file_name) - np.savetxt(save_path, np.array(preds), delimiter=',', fmt='%d') - - return save_path - - -def get_save_dir(base_dir, name, training, id_max=100): - """Get a unique save directory by appending the smallest positive integer - `id < id_max` that is not already taken (i.e., no dir exists with that id). - - Args: - base_dir (str): Base directory in which to make save directories. - name (str): Name to identify this training run. Need not be unique. - training (bool): Save dir. is for training (determines subdirectory). - id_max (int): Maximum ID number before raising an exception. - - Returns: - save_dir (str): Path to a new directory with a unique name. - """ - for uid in range(1, id_max): - subdir = 'train' if training else 'test' - save_dir = os.path.join(base_dir, subdir, f'{name}-{uid:02d}') - if not os.path.exists(save_dir): - os.makedirs(save_dir) - return save_dir - - raise RuntimeError('Too many save directories created with the same name. \ - Delete old save directories or use another name.') - - -def get_logger(log_dir, name): - """Get a `logging.Logger` instance that prints to the console - and an auxiliary file. - - Args: - log_dir (str): Directory in which to create the log file. - name (str): Name to identify the logs. - - Returns: - logger (logging.Logger): Logger instance for logging events. - """ - class StreamHandlerWithTQDM(logging.Handler): - """Let `logging` print without breaking `tqdm` progress bars. 
- - See Also: - > https://stackoverflow.com/questions/38543506 - """ - def emit(self, record): - try: - msg = self.format(record) - tqdm.tqdm.write(msg) - self.flush() - except (KeyboardInterrupt, SystemExit): - raise - except: - self.handleError(record) - - # Create logger - logger = logging.getLogger(name) - logger.setLevel(logging.DEBUG) - - # Log everything (i.e., DEBUG level and above) to a file - log_path = os.path.join(log_dir, 'log.txt') - file_handler = logging.FileHandler(log_path) - file_handler.setLevel(logging.DEBUG) - - # Log everything except DEBUG level (i.e., INFO level and above) to console - console_handler = StreamHandlerWithTQDM() - console_handler.setLevel(logging.INFO) - - # Create format for the logs - file_formatter = logging.Formatter('[%(asctime)s] %(message)s', - datefmt='%m.%d.%y %H:%M:%S') - file_handler.setFormatter(file_formatter) - console_formatter = logging.Formatter('[%(asctime)s] %(message)s', - datefmt='%m.%d.%y %H:%M:%S') - console_handler.setFormatter(console_formatter) - - # add the handlers to the logger - logger.addHandler(file_handler) - logger.addHandler(console_handler) - - return logger - - -def torch_from_json(path, dtype=torch.float32): - """Load a PyTorch Tensor from a JSON file. - - Args: - path (str): Path to the JSON file to load. - dtype (torch.dtype): Data type of loaded array. - - Returns: - tensor (torch.Tensor): Tensor loaded from JSON file. - """ - with open(path, 'r') as fh: - array = np.array(json.load(fh)) - - tensor = torch.from_numpy(array).type(dtype) - - return tensor - - -def discretize(p_start, p_end, max_len=15, no_answer=False): - """Discretize soft predictions to get start and end indices. - - Choose the pair `(i, j)` of indices that maximizes `p1[i] * p2[j]` - subject to `i <= j` and `j - i + 1 <= max_len`. - - Args: - p_start (torch.Tensor): Soft predictions for start index. - Shape (batch_size, context_len). - p_end (torch.Tensor): Soft predictions for end index. 
- Shape (batch_size, context_len). - max_len (int): Maximum length of the discretized prediction. - I.e., enforce that `preds[i, 1] - preds[i, 0] + 1 <= max_len`. - no_answer (bool): Treat 0-index as the no-answer prediction. Consider - a prediction no-answer if `preds[0, 0] * preds[0, 1]` is greater - than the probability assigned to the max-probability span. - - Returns: - start_idxs (torch.Tensor): Hard predictions for start index. - Shape (batch_size,) - end_idxs (torch.Tensor): Hard predictions for end index. - Shape (batch_size,) - """ - if p_start.min() < 0 or p_start.max() > 1 \ - or p_end.min() < 0 or p_end.max() > 1: - raise ValueError('Expected p_start and p_end to have values in [0, 1]') - - # Compute pairwise probabilities - p_start = p_start.unsqueeze(dim=2) - p_end = p_end.unsqueeze(dim=1) - p_joint = torch.matmul(p_start, p_end) # (batch_size, c_len, c_len) - - # Restrict to pairs (i, j) such that i <= j <= i + max_len - 1 - c_len, device = p_start.size(1), p_start.device - is_legal_pair = torch.triu(torch.ones((c_len, c_len), device=device)) - is_legal_pair -= torch.triu(torch.ones((c_len, c_len), device=device), - diagonal=max_len) - if no_answer: - # Index 0 is no-answer - p_no_answer = p_joint[:, 0, 0].clone() - is_legal_pair[0, :] = 0 - is_legal_pair[:, 0] = 0 - else: - p_no_answer = None - p_joint *= is_legal_pair - - # Take pair (i, j) that maximizes p_joint - max_in_row, _ = torch.max(p_joint, dim=2) - max_in_col, _ = torch.max(p_joint, dim=1) - start_idxs = torch.argmax(max_in_row, dim=-1) - end_idxs = torch.argmax(max_in_col, dim=-1) - - if no_answer: - # Predict no-answer whenever p_no_answer > max_prob - max_prob, _ = torch.max(max_in_col, dim=-1) - start_idxs[p_no_answer > max_prob] = 0 - end_idxs[p_no_answer > max_prob] = 0 - - return start_idxs, end_idxs - - -def convert_tokens(eval_dict, qa_id, y_start_list, y_end_list, no_answer): - """Convert predictions to tokens from the context. 
- - Args: - eval_dict (dict): Dictionary with eval info for the dataset. This is - used to perform the mapping from IDs and indices to actual text. - qa_id (int): List of QA example IDs. - y_start_list (list): List of start predictions. - y_end_list (list): List of end predictions. - no_answer (bool): Questions can have no answer. E.g., SQuAD 2.0. - - Returns: - pred_dict (dict): Dictionary index IDs -> predicted answer text. - sub_dict (dict): Dictionary UUIDs -> predicted answer text (submission). - """ - pred_dict = {} - sub_dict = {} - for qid, y_start, y_end in zip(qa_id, y_start_list, y_end_list): - context = eval_dict[str(qid)]["context"] - spans = eval_dict[str(qid)]["spans"] - uuid = eval_dict[str(qid)]["uuid"] - if no_answer and (y_start == 0 or y_end == 0): - pred_dict[str(qid)] = '' - sub_dict[uuid] = '' - else: - if no_answer: - y_start, y_end = y_start - 1, y_end - 1 - start_idx = spans[y_start][0] - end_idx = spans[y_end][1] - pred_dict[str(qid)] = context[start_idx: end_idx] - sub_dict[uuid] = context[start_idx: end_idx] - return pred_dict, sub_dict - - -def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): - if not ground_truths: - return metric_fn(prediction, '') - scores_for_ground_truths = [] - for ground_truth in ground_truths: - score = metric_fn(prediction, ground_truth) - scores_for_ground_truths.append(score) - return max(scores_for_ground_truths) - - -def eval_dicts(gold_dict, pred_dict, no_answer): - avna = f1 = em = total = 0 - for key, value in pred_dict.items(): - total += 1 - ground_truths = gold_dict[key]['answers'] - prediction = value - em += metric_max_over_ground_truths(compute_em, prediction, ground_truths) - f1 += metric_max_over_ground_truths(compute_f1, prediction, ground_truths) - if no_answer: - avna += compute_avna(prediction, ground_truths) - - eval_dict = {'EM': 100. * em / total, - 'F1': 100. * f1 / total} - - if no_answer: - eval_dict['AvNA'] = 100. 
* avna / total - - return eval_dict - - -def compute_avna(prediction, ground_truths): - """Compute answer vs. no-answer accuracy.""" - return float(bool(prediction) == bool(ground_truths)) - - -# All methods below this line are from the official SQuAD 2.0 eval script -# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -def normalize_answer(s): - """Convert to lowercase and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) - return re.sub(regex, ' ', text) - - def white_space_fix(text): - return ' '.join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - -def get_tokens(s): - if not s: - return [] - return normalize_answer(s).split() - - -def compute_em(a_gold, a_pred): - return int(normalize_answer(a_gold) == normalize_answer(a_pred)) - - -def compute_f1(a_gold, a_pred): - gold_toks = get_tokens(a_gold) - pred_toks = get_tokens(a_pred) - common = Counter(gold_toks) & Counter(pred_toks) - num_same = sum(common.values()) - if len(gold_toks) == 0 or len(pred_toks) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(gold_toks == pred_toks) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_toks) - recall = 1.0 * num_same / len(gold_toks) - f1 = (2 * precision * recall) / (precision + recall) - return f1 diff --git a/poetry.lock b/poetry.lock deleted file mode 100644 index e86885d..0000000 --- a/poetry.lock +++ /dev/null @@ -1,3181 +0,0 @@ -[[package]] -name = "appnope" -version = "0.1.2" -description = "Disable App Nap on macOS >= 10.9" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "argon2-cffi" -version = "20.1.0" -description = "The secure Argon2 
password hashing algorithm." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -cffi = ">=1.0.0" -six = "*" - -[package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pytest", "sphinx", "wheel", "pre-commit"] -docs = ["sphinx"] -tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pytest"] - -[[package]] -name = "async-generator" -version = "1.10" -description = "Async generators and context managers for Python 3.5+" -category = "main" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "attrs" -version = "21.2.0" -description = "Classes Without Boilerplate" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] -docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] -tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] - -[[package]] -name = "backcall" -version = "0.2.0" -description = "Specifications for callback functions passed in to an API" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "backports.entry-points-selectable" -version = "1.1.0" -description = "Compatibility shim providing selectable entry points for older implementations" -category = "dev" -optional = false -python-versions = ">=2.7" - -[package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=4.6)", "pytest-flake8", "pytest-cov", "pytest-black (>=0.3.7)", "pytest-mypy", "pytest-checkdocs (>=2.4)", "pytest-enabler (>=1.0.1)"] - -[[package]] -name = "bleach" -version 
= "4.0.0" -description = "An easy safelist-based HTML-sanitizing tool." -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -packaging = "*" -six = ">=1.9.0" -webencodings = "*" - -[[package]] -name = "blis" -version = "0.7.4" -description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -numpy = ">=1.15.0" - -[[package]] -name = "bokeh" -version = "2.3.3" -description = "Interactive plots and applications in the browser from Python" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -Jinja2 = ">=2.9" -numpy = ">=1.11.3" -packaging = ">=16.8" -pillow = ">=7.1.0" -python-dateutil = ">=2.1" -PyYAML = ">=3.10" -tornado = ">=5.1" -typing_extensions = ">=3.7.4" - -[[package]] -name = "boto3" -version = "1.18.19" -description = "The AWS SDK for Python" -category = "main" -optional = false -python-versions = ">= 3.6" - -[package.dependencies] -botocore = ">=1.21.19,<1.22.0" -jmespath = ">=0.7.1,<1.0.0" -s3transfer = ">=0.5.0,<0.6.0" - -[package.extras] -crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] - -[[package]] -name = "botocore" -version = "1.21.19" -description = "Low-level, data-driven core of boto 3." -category = "main" -optional = false -python-versions = ">= 3.6" - -[package.dependencies] -jmespath = ">=0.7.1,<1.0.0" -python-dateutil = ">=2.1,<3.0.0" -urllib3 = ">=1.25.4,<1.27" - -[package.extras] -crt = ["awscrt (==0.11.24)"] - -[[package]] -name = "catalogue" -version = "2.0.4" -description = "Super lightweight function registries for your library" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "certifi" -version = "2021.5.30" -description = "Python package for providing Mozilla's CA Bundle." 
-category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "cffi" -version = "1.14.6" -description = "Foreign Function Interface for Python calling C code." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -pycparser = "*" - -[[package]] -name = "cfgv" -version = "3.3.0" -description = "Validate configuration and produce human readable error messages." -category = "dev" -optional = false -python-versions = ">=3.6.1" - -[[package]] -name = "charset-normalizer" -version = "2.0.4" -description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "main" -optional = false -python-versions = ">=3.5.0" - -[package.extras] -unicode_backport = ["unicodedata2"] - -[[package]] -name = "click" -version = "7.1.2" -description = "Composable command line interface toolkit" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[[package]] -name = "cloudpickle" -version = "1.6.0" -description = "Extended pickling support for Python objects" -category = "main" -optional = true -python-versions = ">=3.5" - -[[package]] -name = "colorama" -version = "0.4.4" -description = "Cross-platform colored terminal text." -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[[package]] -name = "cycler" -version = "0.10.0" -description = "Composable style cycles" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -six = "*" - -[[package]] -name = "cymem" -version = "2.0.5" -description = "Manage calls to calloc/free through Cython" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "cython" -version = "0.29.14" -description = "The Cython compiler for writing C extensions for the Python language." 
-category = "main" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "decorator" -version = "5.0.9" -description = "Decorators for Humans" -category = "main" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "defusedxml" -version = "0.7.1" -description = "XML bomb protection for Python stdlib modules" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[[package]] -name = "distlib" -version = "0.3.2" -description = "Distribution utilities" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "eli5" -version = "0.11.0" -description = "Debug machine learning classifiers and explain their predictions" -category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -attrs = ">16.0.0" -graphviz = "*" -jinja2 = "*" -numpy = ">=1.9.0" -scikit-learn = ">=0.20" -scipy = "*" -six = "*" -tabulate = ">=0.7.7" - -[[package]] -name = "entrypoints" -version = "0.3" -description = "Discover and load entry points from installed packages." -category = "main" -optional = false -python-versions = ">=2.7" - -[[package]] -name = "filelock" -version = "3.0.12" -description = "A platform independent file lock." 
-category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "gensim" -version = "3.8.3" -description = "Python framework for fast Vector Space Modelling" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -Cython = "0.29.14" -numpy = ">=1.11.3" -scipy = ">=0.18.1" -six = ">=1.5.0" -smart-open = ">=1.8.1" - -[package.extras] -distributed = ["Pyro4 (>=4.27)"] -docs = ["pytest", "pytest-rerunfailures", "mock", "cython", "nmslib", "pyemd", "testfixtures", "Morfessor (==2.0.2a4)", "python-Levenshtein (>=0.10.2)", "visdom (>0.1.8.7)", "scikit-learn", "Pyro4 (>=4.27)", "sphinx (<=2.4.4)", "sphinx-gallery", "sphinxcontrib.programoutput", "sphinxcontrib-napoleon", "matplotlib", "plotly", "memory-profiler", "annoy", "pyro4", "nltk", "statsmodels", "pandas"] -test = ["pytest", "pytest-rerunfailures", "mock", "cython", "nmslib", "pyemd", "testfixtures", "Morfessor (==2.0.2a4)", "python-Levenshtein (>=0.10.2)", "visdom (>0.1.8.7)", "scikit-learn"] -test-win = ["pytest", "pytest-rerunfailures", "mock", "cython", "nmslib", "pyemd", "testfixtures", "Morfessor (==2.0.2a4)", "python-Levenshtein (>=0.10.2)", "visdom (>0.1.8.7)", "scikit-learn"] - -[[package]] -name = "graphviz" -version = "0.16" -description = "Simple Python interface for Graphviz" -category = "main" -optional = false -python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*" - -[package.extras] -dev = ["tox (>=3)", "flake8", "pep8-naming", "wheel", "twine"] -docs = ["sphinx (>=1.8)", "sphinx-rtd-theme"] -test = ["mock (>=3)", "pytest (>=4)", "pytest-mock (>=2)", "pytest-cov"] - -[[package]] -name = "gym" -version = "0.18.3" -description = "The OpenAI Gym: A toolkit for developing and comparing your reinforcement learning agents." 
-category = "main" -optional = true -python-versions = ">=3.6" - -[package.dependencies] -cloudpickle = ">=1.2.0,<1.7.0" -numpy = ">=1.10.4" -Pillow = "<=8.2.0" -pyglet = ">=1.4.0,<=1.5.15" -scipy = "*" - -[package.extras] -all = ["box2d-py (>=2.3.5,<2.4.0)", "opencv-python (>=3)", "imageio", "atari_py (>=0.2.0,<0.3.0)", "mujoco_py (>=1.50,<2.0)"] -atari = ["atari_py (>=0.2.0,<0.3.0)", "opencv-python (>=3)"] -box2d = ["box2d-py (>=2.3.5,<2.4.0)"] -mujoco = ["mujoco_py (>=1.50,<2.0)", "imageio"] -nomujoco = ["box2d-py (>=2.3.5,<2.4.0)", "opencv-python (>=3)", "atari_py (>=0.2.0,<0.3.0)"] -robotics = ["mujoco_py (>=1.50,<2.0)", "imageio"] - -[[package]] -name = "h5py" -version = "3.3.0" -description = "Read and write HDF5 files from Python" -category = "main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -numpy = [ - {version = ">=1.17.5", markers = "python_version == \"3.8\""}, - {version = ">=1.19.3", markers = "python_version >= \"3.9\""}, -] - -[[package]] -name = "identify" -version = "2.2.13" -description = "File identification library for Python" -category = "dev" -optional = false -python-versions = ">=3.6.1" - -[package.extras] -license = ["editdistance-s"] - -[[package]] -name = "idna" -version = "3.2" -description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "imageio" -version = "2.9.0" -description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats." 
-category = "main" -optional = true -python-versions = ">=3.5" - -[package.dependencies] -numpy = "*" -pillow = "*" - -[package.extras] -ffmpeg = ["imageio-ffmpeg"] -fits = ["astropy"] -full = ["astropy", "gdal", "imageio-ffmpeg", "itk"] -gdal = ["gdal"] -itk = ["itk"] - -[[package]] -name = "ipykernel" -version = "5.5.5" -description = "IPython Kernel for Jupyter" -category = "main" -optional = false -python-versions = ">=3.5" - -[package.dependencies] -appnope = {version = "*", markers = "platform_system == \"Darwin\""} -ipython = ">=5.0.0" -jupyter-client = "*" -tornado = ">=4.2" -traitlets = ">=4.1.0" - -[package.extras] -test = ["pytest (!=5.3.4)", "pytest-cov", "flaky", "nose", "jedi (<=0.17.2)"] - -[[package]] -name = "ipython" -version = "7.26.0" -description = "IPython: Productive Interactive Computing" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -appnope = {version = "*", markers = "sys_platform == \"darwin\""} -backcall = "*" -colorama = {version = "*", markers = "sys_platform == \"win32\""} -decorator = "*" -jedi = ">=0.16" -matplotlib-inline = "*" -pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""} -pickleshare = "*" -prompt-toolkit = ">=2.0.0,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.1.0" -pygments = "*" -traitlets = ">=4.2" - -[package.extras] -all = ["Sphinx (>=1.3)", "ipykernel", "ipyparallel", "ipywidgets", "nbconvert", "nbformat", "nose (>=0.10.1)", "notebook", "numpy (>=1.17)", "pygments", "qtconsole", "requests", "testpath"] -doc = ["Sphinx (>=1.3)"] -kernel = ["ipykernel"] -nbconvert = ["nbconvert"] -nbformat = ["nbformat"] -notebook = ["notebook", "ipywidgets"] -parallel = ["ipyparallel"] -qtconsole = ["qtconsole"] -test = ["nose (>=0.10.1)", "requests", "testpath", "pygments", "nbformat", "ipykernel", "numpy (>=1.17)"] - -[[package]] -name = "ipython-genutils" -version = "0.2.0" -description = "Vestigial utilities from IPython" -category = "main" -optional = false -python-versions = 
"*" - -[[package]] -name = "ipywidgets" -version = "7.6.3" -description = "IPython HTML widgets for Jupyter" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -ipykernel = ">=4.5.1" -ipython = {version = ">=4.0.0", markers = "python_version >= \"3.3\""} -jupyterlab-widgets = {version = ">=1.0.0", markers = "python_version >= \"3.6\""} -nbformat = ">=4.2.0" -traitlets = ">=4.3.1" -widgetsnbextension = ">=3.5.0,<3.6.0" - -[package.extras] -test = ["pytest (>=3.6.0)", "pytest-cov", "mock"] - -[[package]] -name = "jedi" -version = "0.18.0" -description = "An autocompletion tool for Python that can be used for text editors." -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -parso = ">=0.8.0,<0.9.0" - -[package.extras] -qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] -testing = ["Django (<3.1)", "colorama", "docopt", "pytest (<6.0.0)"] - -[[package]] -name = "jinja2" -version = "3.0.1" -description = "A very fast and expressive template engine." 
-category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -MarkupSafe = ">=2.0" - -[package.extras] -i18n = ["Babel (>=2.7)"] - -[[package]] -name = "jmespath" -version = "0.10.0" -description = "JSON Matching Expressions" -category = "main" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "joblib" -version = "1.0.1" -description = "Lightweight pipelining with Python functions" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "jsonschema" -version = "3.2.0" -description = "An implementation of JSON Schema validation for Python" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -attrs = ">=17.4.0" -pyrsistent = ">=0.14.0" -six = ">=1.11.0" - -[package.extras] -format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format_nongpl = ["idna", "jsonpointer (>1.13)", "webcolors", "rfc3986-validator (>0.1.0)", "rfc3339-validator"] - -[[package]] -name = "jupyter-client" -version = "6.2.0" -description = "Jupyter protocol implementation and client libraries" -category = "main" -optional = false -python-versions = ">=3.6.1" - -[package.dependencies] -jupyter-core = ">=4.6.0" -nest-asyncio = ">=1.5" -python-dateutil = ">=2.1" -pyzmq = ">=13" -tornado = ">=4.1" -traitlets = "*" - -[package.extras] -doc = ["sphinx (>=1.3.6)", "sphinx-rtd-theme", "sphinxcontrib-github-alt"] -test = ["async-generator", "ipykernel", "ipython", "mock", "pytest-asyncio", "pytest-timeout", "pytest", "mypy", "pre-commit", "jedi (<0.18)"] - -[[package]] -name = "jupyter-core" -version = "4.7.1" -description = "Jupyter core package. A base package on which Jupyter projects rely." 
-category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -pywin32 = {version = ">=1.0", markers = "sys_platform == \"win32\""} -traitlets = "*" - -[[package]] -name = "jupyterlab-pygments" -version = "0.1.2" -description = "Pygments theme using JupyterLab CSS variables" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -pygments = ">=2.4.1,<3" - -[[package]] -name = "jupyterlab-widgets" -version = "1.0.0" -description = "A JupyterLab extension." -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "kiwisolver" -version = "1.3.1" -description = "A fast implementation of the Cassowary constraint solver" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "llvmlite" -version = "0.34.0" -description = "lightweight wrapper around basic LLVM functionality" -category = "main" -optional = true -python-versions = ">=3.6" - -[[package]] -name = "markupsafe" -version = "2.0.1" -description = "Safely add untrusted strings to HTML/XML markup." 
-category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "matplotlib" -version = "3.4.2" -description = "Python plotting package" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -cycler = ">=0.10" -kiwisolver = ">=1.0.1" -numpy = ">=1.16" -pillow = ">=6.2.0" -pyparsing = ">=2.2.1" -python-dateutil = ">=2.7" - -[[package]] -name = "matplotlib-inline" -version = "0.1.2" -description = "Inline Matplotlib backend for Jupyter" -category = "main" -optional = false -python-versions = ">=3.5" - -[package.dependencies] -traitlets = "*" - -[[package]] -name = "mistune" -version = "0.8.4" -description = "The fastest markdown parser in pure Python" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "murmurhash" -version = "1.0.5" -description = "Cython bindings for MurmurHash" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "nbclient" -version = "0.5.3" -description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor." 
-category = "main" -optional = false -python-versions = ">=3.6.1" - -[package.dependencies] -async-generator = "*" -jupyter-client = ">=6.1.5" -nbformat = ">=5.0" -nest-asyncio = "*" -traitlets = ">=4.2" - -[package.extras] -dev = ["codecov", "coverage", "ipython", "ipykernel", "ipywidgets", "pytest (>=4.1)", "pytest-cov (>=2.6.1)", "check-manifest", "flake8", "mypy", "tox", "bumpversion", "xmltodict", "pip (>=18.1)", "wheel (>=0.31.0)", "setuptools (>=38.6.0)", "twine (>=1.11.0)", "black"] -sphinx = ["Sphinx (>=1.7)", "sphinx-book-theme", "mock", "moto", "myst-parser"] -test = ["codecov", "coverage", "ipython", "ipykernel", "ipywidgets", "pytest (>=4.1)", "pytest-cov (>=2.6.1)", "check-manifest", "flake8", "mypy", "tox", "bumpversion", "xmltodict", "pip (>=18.1)", "wheel (>=0.31.0)", "setuptools (>=38.6.0)", "twine (>=1.11.0)", "black"] - -[[package]] -name = "nbconvert" -version = "6.1.0" -description = "Converting Jupyter Notebooks" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -bleach = "*" -defusedxml = "*" -entrypoints = ">=0.2.2" -jinja2 = ">=2.4" -jupyter-core = "*" -jupyterlab-pygments = "*" -mistune = ">=0.8.1,<2" -nbclient = ">=0.5.0,<0.6.0" -nbformat = ">=4.4" -pandocfilters = ">=1.4.1" -pygments = ">=2.4.1" -testpath = "*" -traitlets = ">=5.0" - -[package.extras] -all = ["pytest", "pytest-cov", "pytest-dependency", "ipykernel", "ipywidgets (>=7)", "pyppeteer (==0.2.2)", "tornado (>=4.0)", "sphinx (>=1.5.1)", "sphinx-rtd-theme", "nbsphinx (>=0.2.12)", "ipython"] -docs = ["sphinx (>=1.5.1)", "sphinx-rtd-theme", "nbsphinx (>=0.2.12)", "ipython"] -serve = ["tornado (>=4.0)"] -test = ["pytest", "pytest-cov", "pytest-dependency", "ipykernel", "ipywidgets (>=7)", "pyppeteer (==0.2.2)"] -webpdf = ["pyppeteer (==0.2.2)"] - -[[package]] -name = "nbformat" -version = "5.1.3" -description = "The Jupyter Notebook format" -category = "main" -optional = false -python-versions = ">=3.5" - -[package.dependencies] 
-ipython-genutils = "*" -jsonschema = ">=2.4,<2.5.0 || >2.5.0" -jupyter-core = "*" -traitlets = ">=4.1" - -[package.extras] -fast = ["fastjsonschema"] -test = ["check-manifest", "fastjsonschema", "testpath", "pytest", "pytest-cov"] - -[[package]] -name = "nest-asyncio" -version = "1.5.1" -description = "Patch asyncio to allow nested event loops" -category = "main" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "networkx" -version = "2.6.2" -description = "Python package for creating and manipulating graphs and networks" -category = "main" -optional = true -python-versions = ">=3.7" - -[package.extras] -default = ["numpy (>=1.19)", "scipy (>=1.5,!=1.6.1)", "matplotlib (>=3.3)", "pandas (>=1.1)"] -developer = ["black (==21.5b1)", "pre-commit (>=2.12)"] -doc = ["sphinx (>=4.0,<5.0)", "pydata-sphinx-theme (>=0.6,<1.0)", "sphinx-gallery (>=0.9,<1.0)", "numpydoc (>=1.1)", "pillow (>=8.2)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] -extra = ["lxml (>=4.5)", "pygraphviz (>=1.7)", "pydot (>=1.4.1)"] -test = ["pytest (>=6.2)", "pytest-cov (>=2.12)", "codecov (>=2.1)"] - -[[package]] -name = "nltk" -version = "3.6.2" -description = "Natural Language Toolkit" -category = "main" -optional = false -python-versions = ">=3.5.*" - -[package.dependencies] -click = "*" -joblib = "*" -regex = "*" -tqdm = "*" - -[package.extras] -all = ["matplotlib", "twython", "scipy", "numpy", "gensim (<4.0.0)", "python-crfsuite", "pyparsing", "scikit-learn", "requests"] -corenlp = ["requests"] -machine_learning = ["gensim (<4.0.0)", "numpy", "python-crfsuite", "scikit-learn", "scipy"] -plot = ["matplotlib"] -tgrep = ["pyparsing"] -twitter = ["twython"] - -[[package]] -name = "nodeenv" -version = "1.6.0" -description = "Node.js virtual environment builder" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "notebook" -version = "6.4.3" -description = "A web-based notebook environment for interactive computing" -category = "main" -optional = false 
-python-versions = ">=3.6" - -[package.dependencies] -argon2-cffi = "*" -ipykernel = "*" -ipython-genutils = "*" -jinja2 = "*" -jupyter-client = ">=5.3.4" -jupyter-core = ">=4.6.1" -nbconvert = "*" -nbformat = "*" -prometheus-client = "*" -pyzmq = ">=17" -Send2Trash = ">=1.5.0" -terminado = ">=0.8.3" -tornado = ">=6.1" -traitlets = ">=4.2.1" - -[package.extras] -docs = ["sphinx", "nbsphinx", "sphinxcontrib-github-alt", "sphinx-rtd-theme", "myst-parser"] -json-logging = ["json-logging"] -test = ["pytest", "coverage", "requests", "nbval", "selenium", "pytest-cov", "requests-unixsocket"] - -[[package]] -name = "numba" -version = "0.51.2" -description = "compiling Python code using LLVM" -category = "main" -optional = true -python-versions = ">=3.6" - -[package.dependencies] -llvmlite = ">=0.34.0.dev0,<0.35" -numpy = ">=1.15" - -[[package]] -name = "numpy" -version = "1.21.1" -description = "NumPy is the fundamental package for array computing with Python." -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "opencv-python" -version = "4.5.3.56" -description = "Wrapper package for OpenCV python bindings." 
-category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -numpy = ">=1.21.0" - -[[package]] -name = "packaging" -version = "21.0" -description = "Core utilities for Python packages" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -pyparsing = ">=2.0.2" - -[[package]] -name = "pandas" -version = "1.3.1" -description = "Powerful data structures for data analysis, time series, and statistics" -category = "main" -optional = false -python-versions = ">=3.7.1" - -[package.dependencies] -numpy = ">=1.17.3" -python-dateutil = ">=2.7.3" -pytz = ">=2017.3" - -[package.extras] -test = ["hypothesis (>=3.58)", "pytest (>=6.0)", "pytest-xdist"] - -[[package]] -name = "pandocfilters" -version = "1.4.3" -description = "Utilities for writing pandoc filters in python" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[[package]] -name = "parso" -version = "0.8.2" -description = "A Python Parser" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.extras] -qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] -testing = ["docopt", "pytest (<6.0.0)"] - -[[package]] -name = "pathy" -version = "0.6.0" -description = "pathlib.Path subclasses for local and cloud bucket storage" -category = "main" -optional = false -python-versions = ">= 3.6" - -[package.dependencies] -smart-open = ">=5.0.0,<6.0.0" -typer = ">=0.3.0,<1.0.0" - -[package.extras] -all = ["google-cloud-storage (>=1.26.0,<2.0.0)", "boto3", "pytest", "pytest-coverage", "mock", "typer-cli"] -gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] -s3 = ["boto3"] -test = ["pytest", "pytest-coverage", "mock", "typer-cli"] - -[[package]] -name = "patsy" -version = "0.5.1" -description = "A Python package for describing statistical models and for building design matrices." 
-category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -numpy = ">=1.4" -six = "*" - -[[package]] -name = "pdpbox" -version = "0.2.0" -description = "python partial dependence plot toolbox" -category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -joblib = "*" -matplotlib = ">=2.1.2" -numpy = "*" -pandas = "*" -psutil = "*" -scikit-learn = "*" -scipy = "*" - -[[package]] -name = "pexpect" -version = "4.8.0" -description = "Pexpect allows easy control of interactive console applications." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -ptyprocess = ">=0.5" - -[[package]] -name = "pickleshare" -version = "0.7.5" -description = "Tiny 'shelve'-like database with concurrency support" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "pillow" -version = "7.2.0" -description = "Python Imaging Library (Fork)" -category = "main" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "platformdirs" -version = "2.2.0" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "dev" -optional = false -python-versions = ">=3.6" - -[package.extras] -docs = ["Sphinx (>=4)", "furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx-autodoc-typehints (>=1.12)"] -test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] - -[[package]] -name = "pre-commit" -version = "2.14.0" -description = "A framework for managing and maintaining multi-language pre-commit hooks." 
-category = "dev" -optional = false -python-versions = ">=3.6.1" - -[package.dependencies] -cfgv = ">=2.0.0" -identify = ">=1.0.0" -nodeenv = ">=0.11.1" -pyyaml = ">=5.1" -toml = "*" -virtualenv = ">=20.0.8" - -[[package]] -name = "preshed" -version = "3.0.5" -description = "Cython hash table that trusts the keys are pre-hashed" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -cymem = ">=2.0.2,<2.1.0" -murmurhash = ">=0.28.0,<1.1.0" - -[[package]] -name = "prometheus-client" -version = "0.11.0" -description = "Python client for the Prometheus monitoring system." -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[package.extras] -twisted = ["twisted"] - -[[package]] -name = "prompt-toolkit" -version = "3.0.19" -description = "Library for building powerful interactive command lines in Python" -category = "main" -optional = false -python-versions = ">=3.6.1" - -[package.dependencies] -wcwidth = "*" - -[[package]] -name = "psutil" -version = "5.8.0" -description = "Cross-platform lib for process and system monitoring in Python." 
-category = "main" -optional = true -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[package.extras] -test = ["ipaddress", "mock", "unittest2", "enum34", "pywin32", "wmi"] - -[[package]] -name = "ptyprocess" -version = "0.7.0" -description = "Run a subprocess in a pseudo terminal" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "py" -version = "1.10.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[[package]] -name = "pycparser" -version = "2.20" -description = "C parser in Python" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[[package]] -name = "pydantic" -version = "1.8.2" -description = "Data validation and settings management using python 3.6 type hinting" -category = "main" -optional = false -python-versions = ">=3.6.1" - -[package.dependencies] -typing-extensions = ">=3.7.4.3" - -[package.extras] -dotenv = ["python-dotenv (>=0.10.4)"] -email = ["email-validator (>=1.0.3)"] - -[[package]] -name = "pydotplus" -version = "2.0.2" -description = "Python interface to Graphviz's Dot language" -category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -pyparsing = ">=2.0.1" - -[[package]] -name = "pyglet" -version = "1.5.15" -description = "Cross-platform windowing and multimedia library" -category = "main" -optional = true -python-versions = "*" - -[[package]] -name = "pygments" -version = "2.9.0" -description = "Pygments is a syntax highlighting package written in Python." 
-category = "main" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "pyparsing" -version = "2.4.7" -description = "Python parsing module" -category = "main" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "pyrsistent" -version = "0.18.0" -description = "Persistent/Functional/Immutable data structures" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "python-dateutil" -version = "2.8.2" -description = "Extensions to the standard Python datetime module" -category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" - -[package.dependencies] -six = ">=1.5" - -[[package]] -name = "pytorch-transformers" -version = "1.2.0" -description = "Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -boto3 = "*" -numpy = "*" -regex = "*" -requests = "*" -sacremoses = "*" -sentencepiece = "*" -torch = ">=1.0.0" -tqdm = "*" - -[[package]] -name = "pytz" -version = "2021.1" -description = "World timezone definitions, modern and historical" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "pywavelets" -version = "1.1.1" -description = "PyWavelets, wavelet transform module" -category = "main" -optional = true -python-versions = ">=3.5" - -[package.dependencies] -numpy = ">=1.13.3" - -[[package]] -name = "pywin32" -version = "301" -description = "Python for Window Extensions" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "pywinpty" -version = "1.1.3" -description = "Pseudo terminal support for Windows from Python." 
-category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "pyyaml" -version = "5.4.1" -description = "YAML parser and emitter for Python" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" - -[[package]] -name = "pyzmq" -version = "22.2.1" -description = "Python bindings for 0MQ" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -cffi = {version = "*", markers = "implementation_name == \"pypy\""} -py = {version = "*", markers = "implementation_name == \"pypy\""} - -[[package]] -name = "regex" -version = "2021.8.3" -description = "Alternative regular expression module, to replace re." -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "requests" -version = "2.26.0" -description = "Python HTTP for Humans." -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" - -[package.dependencies] -certifi = ">=2017.4.17" -charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} -idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} -urllib3 = ">=1.21.1,<1.27" - -[package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] - -[[package]] -name = "s3transfer" -version = "0.5.0" -description = "An Amazon S3 Transfer Manager" -category = "main" -optional = false -python-versions = ">= 3.6" - -[package.dependencies] -botocore = ">=1.12.36,<2.0a.0" - -[package.extras] -crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] - -[[package]] -name = "sacremoses" -version = "0.0.45" -description = "SacreMoses" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -click = "*" -joblib = "*" -regex = "*" -six = "*" -tqdm = "*" - -[[package]] -name = "scikit-image" -version = "0.18.2" -description = "Image processing in Python" -category = 
"main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -imageio = ">=2.3.0" -matplotlib = ">=2.0.0,<3.0.0 || >3.0.0" -networkx = ">=2.0" -numpy = ">=1.16.5" -pillow = ">=4.3.0,<7.1.0 || >7.1.0,<7.1.1 || >7.1.1" -PyWavelets = ">=1.1.1" -scipy = ">=1.0.1" -tifffile = ">=2019.7.26" - -[package.extras] -data = ["pooch (>=1.3.0)"] -docs = ["sphinx (>=1.8,<=2.4.4)", "sphinx-gallery (>=0.7.0,!=0.8.0)", "numpydoc (>=1.0)", "sphinx-copybutton", "pytest-runner", "scikit-learn", "matplotlib (>=3.0.1)", "dask[array] (>=0.15.0,!=2.17.0)", "cloudpickle (>=0.2.1)", "pandas (>=0.23.0)", "seaborn (>=0.7.1)", "pooch (>=1.3.0)", "tifffile (>=2020.5.30)", "myst-parser", "ipywidgets", "plotly (>=4.10.0)"] -optional = ["simpleitk", "astropy (>=3.1.2)", "qtpy", "pyamg", "dask[array] (>=1.0.0,!=2.17.0)", "cloudpickle (>=0.2.1)", "pooch (>=1.3.0)"] -test = ["pytest (>=5.2.0)", "pytest-cov (>=2.7.0)", "pytest-localserver", "pytest-faulthandler", "flake8", "codecov", "pooch (>=1.3.0)"] - -[[package]] -name = "scikit-learn" -version = "0.24.2" -description = "A set of python modules for machine learning and data mining" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -joblib = ">=0.11" -numpy = ">=1.13.3" -scipy = ">=0.19.1" -threadpoolctl = ">=2.0.0" - -[package.extras] -benchmark = ["matplotlib (>=2.1.1)", "pandas (>=0.25.0)", "memory-profiler (>=0.57.0)"] -docs = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)", "memory-profiler (>=0.57.0)", "sphinx (>=3.2.0)", "sphinx-gallery (>=0.7.0)", "numpydoc (>=1.0.0)", "Pillow (>=7.1.2)", "sphinx-prompt (>=1.3.0)"] -examples = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)"] -tests = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "flake8 (>=3.8.2)", "mypy (>=0.770)", "pyamg (>=4.0.0)"] - -[[package]] -name = "scipy" -version = "1.6.1" 
-description = "SciPy: Scientific Library for Python" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -numpy = ">=1.16.5" - -[[package]] -name = "seaborn" -version = "0.11.1" -description = "seaborn: statistical data visualization" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -matplotlib = ">=2.2" -numpy = ">=1.15" -pandas = ">=0.23" -scipy = ">=1.0" - -[[package]] -name = "send2trash" -version = "1.8.0" -description = "Send file to trash natively under Mac OS X, Windows and Linux." -category = "main" -optional = false -python-versions = "*" - -[package.extras] -nativelib = ["pyobjc-framework-cocoa", "pywin32"] -objc = ["pyobjc-framework-cocoa"] -win32 = ["pywin32"] - -[[package]] -name = "sentencepiece" -version = "0.1.96" -description = "SentencePiece python wrapper" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "shap" -version = "0.38.1" -description = "A unified approach to explain the output of any machine learning model." 
-category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -cloudpickle = "*" -numba = "*" -numpy = "*" -pandas = "*" -scikit-learn = "*" -scipy = "*" -slicer = "0.0.7" -tqdm = ">4.25.0" - -[package.extras] -all = ["xgboost", "lightgbm", "transformers", "lime", "pyspark", "torch", "pytest", "pytest-mpl", "pytest-cov", "nbsphinx", "matplotlib", "sphinx-rtd-theme", "sphinx", "catboost", "pyod", "sentencepiece", "opencv-python", "ipython", "numpydoc"] -docs = ["matplotlib", "ipython", "numpydoc", "sphinx-rtd-theme", "sphinx", "nbsphinx"] -others = ["lime"] -plots = ["matplotlib", "ipython"] -test = ["pytest", "pytest-mpl", "pytest-cov", "xgboost", "lightgbm", "catboost", "pyspark", "pyod", "transformers", "torch", "sentencepiece", "opencv-python"] - -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "slicer" -version = "0.0.7" -description = "A small package for big slicing." 
-category = "main" -optional = true -python-versions = ">=3.6" - -[[package]] -name = "smart-open" -version = "5.1.0" -description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" -category = "main" -optional = false -python-versions = ">=3.6.*" - -[package.extras] -all = ["boto3", "google-cloud-storage", "azure-storage-blob", "azure-common", "azure-core", "requests"] -azure = ["azure-storage-blob", "azure-common", "azure-core"] -gcs = ["google-cloud-storage"] -http = ["requests"] -s3 = ["boto3"] -test = ["boto3", "google-cloud-storage", "azure-storage-blob", "azure-common", "azure-core", "requests", "moto[server] (==1.3.14)", "pathlib2", "responses", "paramiko", "parameterizedtestcase", "pytest", "pytest-rerunfailures"] -webhdfs = ["requests"] - -[[package]] -name = "spacy" -version = "3.1.1" -description = "Industrial-strength Natural Language Processing (NLP) in Python" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -blis = ">=0.4.0,<0.8.0" -catalogue = ">=2.0.4,<2.1.0" -cymem = ">=2.0.2,<2.1.0" -jinja2 = "*" -murmurhash = ">=0.28.0,<1.1.0" -numpy = ">=1.15.0" -packaging = ">=20.0" -pathy = ">=0.3.5" -preshed = ">=3.0.2,<3.1.0" -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<1.9.0" -requests = ">=2.13.0,<3.0.0" -spacy-legacy = ">=3.0.7,<3.1.0" -srsly = ">=2.4.1,<3.0.0" -thinc = ">=8.0.8,<8.1.0" -tqdm = ">=4.38.0,<5.0.0" -typer = ">=0.3.0,<0.4.0" -wasabi = ">=0.8.1,<1.1.0" - -[package.extras] -cuda = ["cupy (>=5.0.0b4,<10.0.0)"] -cuda100 = ["cupy-cuda100 (>=5.0.0b4,<10.0.0)"] -cuda101 = ["cupy-cuda101 (>=5.0.0b4,<10.0.0)"] -cuda102 = ["cupy-cuda102 (>=5.0.0b4,<10.0.0)"] -cuda110 = ["cupy-cuda110 (>=5.0.0b4,<10.0.0)"] -cuda111 = ["cupy-cuda111 (>=5.0.0b4,<10.0.0)"] -cuda112 = ["cupy-cuda112 (>=5.0.0b4,<10.0.0)"] -cuda80 = ["cupy-cuda80 (>=5.0.0b4,<10.0.0)"] -cuda90 = ["cupy-cuda90 (>=5.0.0b4,<10.0.0)"] -cuda91 = ["cupy-cuda91 (>=5.0.0b4,<10.0.0)"] -cuda92 = ["cupy-cuda92 
(>=5.0.0b4,<10.0.0)"] -ja = ["sudachipy (>=0.4.9)", "sudachidict-core (>=20200330)"] -ko = ["natto-py (==0.9.0)"] -lookups = ["spacy-lookups-data (>=1.0.2,<1.1.0)"] -ray = ["spacy-ray (>=0.1.0,<1.0.0)"] -th = ["pythainlp (>=2.0)"] -transformers = ["spacy-transformers (>=1.0.1,<1.1.0)"] - -[[package]] -name = "spacy-legacy" -version = "3.0.8" -description = "Legacy registered functions for spaCy backwards compatibility" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "srsly" -version = "2.4.1" -description = "Modern high-performance serialization utilities for Python" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -catalogue = ">=2.0.1,<2.1.0" - -[[package]] -name = "statsmodels" -version = "0.12.2" -description = "Statistical computations and models for Python" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -numpy = ">=1.15" -pandas = ">=0.21" -patsy = ">=0.5" -scipy = ">=1.1" - -[package.extras] -build = ["cython (>=0.29)"] -develop = ["cython (>=0.29)"] -docs = ["sphinx", "nbconvert", "jupyter-client", "ipykernel", "matplotlib", "nbformat", "numpydoc", "pandas-datareader"] - -[[package]] -name = "subword-nmt" -version = "0.3.7" -description = "Unsupervised Word Segmentation for Neural Machine Translation and Text Generation" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "tabulate" -version = "0.8.9" -description = "Pretty-print tabular data" -category = "main" -optional = true -python-versions = "*" - -[package.extras] -widechars = ["wcwidth"] - -[[package]] -name = "terminado" -version = "0.11.0" -description = "Tornado websocket backend for the Xterm.js Javascript terminal emulator library." 
-category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -ptyprocess = {version = "*", markers = "os_name != \"nt\""} -pywinpty = {version = ">=1.1.0", markers = "os_name == \"nt\""} -tornado = ">=4" - -[package.extras] -test = ["pytest"] - -[[package]] -name = "testpath" -version = "0.5.0" -description = "Test utilities for code working with files and commands" -category = "main" -optional = false -python-versions = ">= 3.5" - -[package.extras] -test = ["pytest", "pathlib2"] - -[[package]] -name = "thinc" -version = "8.0.8" -description = "A refreshing functional take on deep learning, compatible with your favorite libraries" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -blis = ">=0.4.0,<0.8.0" -catalogue = ">=2.0.4,<2.1.0" -cymem = ">=2.0.2,<2.1.0" -murmurhash = ">=0.28.0,<1.1.0" -numpy = ">=1.15.0" -preshed = ">=3.0.2,<3.1.0" -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<1.9.0" -srsly = ">=2.4.0,<3.0.0" -wasabi = ">=0.8.1,<1.1.0" - -[package.extras] -cuda = ["cupy (>=5.0.0b4)"] -cuda100 = ["cupy-cuda100 (>=5.0.0b4)"] -cuda101 = ["cupy-cuda101 (>=5.0.0b4)"] -cuda102 = ["cupy-cuda102 (>=5.0.0b4)"] -cuda110 = ["cupy-cuda110 (>=5.0.0b4)"] -cuda111 = ["cupy-cuda111 (>=5.0.0b4)"] -cuda80 = ["cupy-cuda80 (>=5.0.0b4)"] -cuda90 = ["cupy-cuda90 (>=5.0.0b4)"] -cuda91 = ["cupy-cuda91 (>=5.0.0b4)"] -cuda92 = ["cupy-cuda92 (>=5.0.0b4)"] -datasets = ["ml-datasets (>=0.2.0,<0.3.0)"] -mxnet = ["mxnet (>=1.5.1,<1.6.0)"] -tensorflow = ["tensorflow (>=2.0.0,<2.3.0)"] -torch = ["torch (>=1.5.0)"] - -[[package]] -name = "threadpoolctl" -version = "2.2.0" -description = "threadpoolctl" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "tifffile" -version = "2021.8.8" -description = "Read and write TIFF files" -category = "main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -numpy = ">=1.15.1" - -[package.extras] -all = ["imagecodecs 
(>=2021.7.30)", "matplotlib (>=3.2)", "lxml"] - -[[package]] -name = "toml" -version = "0.10.2" -description = "Python Library for Tom's Obvious, Minimal Language" -category = "dev" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "torch" -version = "1.7.1" -description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" -category = "main" -optional = false -python-versions = ">=3.6.2" - -[package.dependencies] -numpy = "*" -typing-extensions = "*" - -[[package]] -name = "torchsummary" -version = "1.5.1" -description = "Model summary in PyTorch similar to `model.summary()` in Keras" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "torchtext" -version = "0.8.1" -description = "Text utilities and datasets for PyTorch" -category = "main" -optional = false -python-versions = ">=3.5" - -[package.dependencies] -numpy = "*" -requests = "*" -torch = "1.7.1" -tqdm = "*" - -[[package]] -name = "torchvision" -version = "0.8.2" -description = "image and video datasets and models for torch deep learning" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -numpy = "*" -pillow = ">=4.1.1" -torch = "1.7.1" - -[package.extras] -scipy = ["scipy"] - -[[package]] -name = "tornado" -version = "6.1" -description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
-category = "main" -optional = false -python-versions = ">= 3.5" - -[[package]] -name = "tqdm" -version = "4.62.0" -description = "Fast, Extensible Progress Meter" -category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[package.extras] -dev = ["py-make (>=0.1.0)", "twine", "wheel"] -notebook = ["ipywidgets (>=6)"] -telegram = ["requests"] - -[[package]] -name = "traitlets" -version = "5.0.5" -description = "Traitlets Python configuration system" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -ipython-genutils = "*" - -[package.extras] -test = ["pytest"] - -[[package]] -name = "typer" -version = "0.3.2" -description = "Typer, build great CLIs. Easy to code. Based on Python type hints." -category = "main" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -click = ">=7.1.1,<7.2.0" - -[package.extras] -test = ["pytest-xdist (>=1.32.0,<2.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "mypy (==0.782)", "black (>=19.10b0,<20.0b0)", "isort (>=5.0.6,<6.0.0)", "shellingham (>=1.3.0,<2.0.0)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "coverage (>=5.2,<6.0)"] -all = ["colorama (>=0.4.3,<0.5.0)", "shellingham (>=1.3.0,<2.0.0)"] -dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)"] -doc = ["mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=5.4.0,<6.0.0)", "markdown-include (>=0.5.1,<0.6.0)"] - -[[package]] -name = "typing-extensions" -version = "3.10.0.0" -description = "Backported and Experimental Type Hints for Python 3.5+" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "urllib3" -version = "1.26.6" -description = "HTTP library with thread-safe connection pooling, file post, and more." 
-category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" - -[package.extras] -brotli = ["brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] -socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] - -[[package]] -name = "virtualenv" -version = "20.7.2" -description = "Virtual Python Environment builder" -category = "dev" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" - -[package.dependencies] -"backports.entry-points-selectable" = ">=1.0.4" -distlib = ">=0.3.1,<1" -filelock = ">=3.0.0,<4" -platformdirs = ">=2,<3" -six = ">=1.9.0,<2" - -[package.extras] -docs = ["proselint (>=0.10.2)", "sphinx (>=3)", "sphinx-argparse (>=0.2.5)", "sphinx-rtd-theme (>=0.4.3)", "towncrier (>=19.9.0rc1)"] -testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", "pytest (>=4)", "pytest-env (>=0.6.2)", "pytest-freezegun (>=0.4.1)", "pytest-mock (>=2)", "pytest-randomly (>=1)", "pytest-timeout (>=1)", "packaging (>=20.0)"] - -[[package]] -name = "wasabi" -version = "0.8.2" -description = "A lightweight console printing and formatting toolkit" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "wcwidth" -version = "0.2.5" -description = "Measures the displayed width of unicode strings in a terminal" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "webencodings" -version = "0.5.1" -description = "Character encoding aliases for legacy web content" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "widgetsnbextension" -version = "3.5.1" -description = "IPython HTML widgets for Jupyter" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -notebook = ">=4.4.1" - -[[package]] -name = "xgboost" -version = "1.4.2" -description = "XGBoost Python Package" -category = "main" -optional = false 
-python-versions = ">=3.6" - -[package.dependencies] -numpy = "*" -scipy = "*" - -[package.extras] -dask = ["dask", "pandas", "distributed"] -datatable = ["datatable"] -pandas = ["pandas"] -plotting = ["graphviz", "matplotlib"] -scikit-learn = ["scikit-learn"] - -[extras] -basic = ["Pillow", "tqdm", "scikit-image", "h5py", "pydotplus", "eli5", "PDPbox", "shap"] -nlp = ["nltk", "gensim", "spacy", "torchtext", "bokeh"] -rl = ["gym", "graphviz"] - -[metadata] -lock-version = "1.1" -python-versions = "^3.8" -content-hash = "ff0500ffaf9219e3d7ea9dedcfc79ac15c1347545c910a921a43ad75c60c651f" - -[metadata.files] -appnope = [ - {file = "appnope-0.1.2-py2.py3-none-any.whl", hash = "sha256:93aa393e9d6c54c5cd570ccadd8edad61ea0c4b9ea7a01409020c9aa019eb442"}, - {file = "appnope-0.1.2.tar.gz", hash = "sha256:dd83cd4b5b460958838f6eb3000c660b1f9caf2a5b1de4264e941512f603258a"}, -] -argon2-cffi = [ - {file = "argon2-cffi-20.1.0.tar.gz", hash = "sha256:d8029b2d3e4b4cea770e9e5a0104dd8fa185c1724a0f01528ae4826a6d25f97d"}, - {file = "argon2_cffi-20.1.0-cp27-cp27m-macosx_10_6_intel.whl", hash = "sha256:6ea92c980586931a816d61e4faf6c192b4abce89aa767ff6581e6ddc985ed003"}, - {file = "argon2_cffi-20.1.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:05a8ac07c7026542377e38389638a8a1e9b78f1cd8439cd7493b39f08dd75fbf"}, - {file = "argon2_cffi-20.1.0-cp27-cp27m-win32.whl", hash = "sha256:0bf066bc049332489bb2d75f69216416329d9dc65deee127152caeb16e5ce7d5"}, - {file = "argon2_cffi-20.1.0-cp27-cp27m-win_amd64.whl", hash = "sha256:57358570592c46c420300ec94f2ff3b32cbccd10d38bdc12dc6979c4a8484fbc"}, - {file = "argon2_cffi-20.1.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:7d455c802727710e9dfa69b74ccaab04568386ca17b0ad36350b622cd34606fe"}, - {file = "argon2_cffi-20.1.0-cp35-abi3-manylinux1_x86_64.whl", hash = "sha256:b160416adc0f012fb1f12588a5e6954889510f82f698e23ed4f4fa57f12a0647"}, - {file = "argon2_cffi-20.1.0-cp35-cp35m-win32.whl", hash = 
"sha256:9bee3212ba4f560af397b6d7146848c32a800652301843df06b9e8f68f0f7361"}, - {file = "argon2_cffi-20.1.0-cp35-cp35m-win_amd64.whl", hash = "sha256:392c3c2ef91d12da510cfb6f9bae52512a4552573a9e27600bdb800e05905d2b"}, - {file = "argon2_cffi-20.1.0-cp36-cp36m-win32.whl", hash = "sha256:ba7209b608945b889457f949cc04c8e762bed4fe3fec88ae9a6b7765ae82e496"}, - {file = "argon2_cffi-20.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:da7f0445b71db6d3a72462e04f36544b0de871289b0bc8a7cc87c0f5ec7079fa"}, - {file = "argon2_cffi-20.1.0-cp37-abi3-macosx_10_6_intel.whl", hash = "sha256:cc0e028b209a5483b6846053d5fd7165f460a1f14774d79e632e75e7ae64b82b"}, - {file = "argon2_cffi-20.1.0-cp37-cp37m-win32.whl", hash = "sha256:18dee20e25e4be86680b178b35ccfc5d495ebd5792cd00781548d50880fee5c5"}, - {file = "argon2_cffi-20.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:6678bb047373f52bcff02db8afab0d2a77d83bde61cfecea7c5c62e2335cb203"}, - {file = "argon2_cffi-20.1.0-cp38-cp38-win32.whl", hash = "sha256:77e909cc756ef81d6abb60524d259d959bab384832f0c651ed7dcb6e5ccdbb78"}, - {file = "argon2_cffi-20.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:9dfd5197852530294ecb5795c97a823839258dfd5eb9420233c7cfedec2058f2"}, - {file = "argon2_cffi-20.1.0-cp39-cp39-win32.whl", hash = "sha256:e2db6e85c057c16d0bd3b4d2b04f270a7467c147381e8fd73cbbe5bc719832be"}, - {file = "argon2_cffi-20.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:8a84934bd818e14a17943de8099d41160da4a336bcc699bb4c394bbb9b94bd32"}, - {file = "argon2_cffi-20.1.0-pp36-pypy36_pp73-macosx_10_7_x86_64.whl", hash = "sha256:b94042e5dcaa5d08cf104a54bfae614be502c6f44c9c89ad1535b2ebdaacbd4c"}, - {file = "argon2_cffi-20.1.0-pp36-pypy36_pp73-win32.whl", hash = "sha256:8282b84ceb46b5b75c3a882b28856b8cd7e647ac71995e71b6705ec06fc232c3"}, - {file = "argon2_cffi-20.1.0-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:3aa804c0e52f208973845e8b10c70d8957c9e5a666f702793256242e9167c4e0"}, - {file = "argon2_cffi-20.1.0-pp37-pypy37_pp73-win_amd64.whl", hash = 
"sha256:36320372133a003374ef4275fbfce78b7ab581440dfca9f9471be3dd9a522428"}, -] -async-generator = [ - {file = "async_generator-1.10-py3-none-any.whl", hash = "sha256:01c7bf666359b4967d2cda0000cc2e4af16a0ae098cbffcb8472fb9e8ad6585b"}, - {file = "async_generator-1.10.tar.gz", hash = "sha256:6ebb3d106c12920aaae42ccb6f787ef5eefdcdd166ea3d628fa8476abe712144"}, -] -attrs = [ - {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, - {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, -] -backcall = [ - {file = "backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"}, - {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, -] -"backports.entry-points-selectable" = [ - {file = "backports.entry_points_selectable-1.1.0-py2.py3-none-any.whl", hash = "sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"}, - {file = "backports.entry_points_selectable-1.1.0.tar.gz", hash = "sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a"}, -] -bleach = [ - {file = "bleach-4.0.0-py2.py3-none-any.whl", hash = "sha256:c1685a132e6a9a38bf93752e5faab33a9517a6c0bb2f37b785e47bf253bdb51d"}, - {file = "bleach-4.0.0.tar.gz", hash = "sha256:ffa9221c6ac29399cc50fcc33473366edd0cf8d5e2cbbbb63296dc327fb67cc8"}, -] -blis = [ - {file = "blis-0.7.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:5b403deb2ad5515e1edb3c0867bccb5b974b461f24283d9219a3a761fd6dacc6"}, - {file = "blis-0.7.4-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:9f9b829480c12fc834549306821e5c51cb28b216ca5f88c5b2cfedbeb9daf67d"}, - {file = "blis-0.7.4-cp36-cp36m-win_amd64.whl", hash = "sha256:c2d8064217c326dd9a0dcbae294ffe8557263e2a00d76101ffa222b9c9d9c62d"}, - {file = "blis-0.7.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = 
"sha256:d717b5dea407aac89a646908e7d9849105abab9c88a539c120518c200f899f4e"}, - {file = "blis-0.7.4-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:5ecddc4c6daf80558154b091db0a9839bb15dbe65d2906a543a73b93fbce4f73"}, - {file = "blis-0.7.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6814991b3e3193db4f9b2417174c6f24b9c0197409d864fa7628583bd2df1f0f"}, - {file = "blis-0.7.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4222bbc7b9c47bc3cf6f36f2241862c1512ca7ebac3828267a2e05ef6c47fc54"}, - {file = "blis-0.7.4-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:445e4838b809e99677f5c0982fb9af320f0d91328fb28c8097e5f1173c4df9d6"}, - {file = "blis-0.7.4-cp38-cp38-win_amd64.whl", hash = "sha256:94890b2296f1449baa56aede46627ea7fc8de11c788f9c261ee38c2eb4a2cc7d"}, - {file = "blis-0.7.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:168fd7bd763ebe529aa25a066d3a6b89f4c3f492f6297f881df6942741b95787"}, - {file = "blis-0.7.4-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:5c1a2023f7d8431daa8d87d32f539bb23e1a009500c37f9eba0ac7b3f20f73eb"}, - {file = "blis-0.7.4-cp39-cp39-win_amd64.whl", hash = "sha256:78a8e0ee72a42c3b2f5b9114500a781119995f76fa6c21d4b02c6fb9c21df2cc"}, - {file = "blis-0.7.4.tar.gz", hash = "sha256:7daa615a97d4f28db0f332b710bfe1900b15d0c25841c6d727965e4fd91e09cf"}, -] -bokeh = [ - {file = "bokeh-2.3.3.tar.gz", hash = "sha256:a5fdcc181835561447fcc5a371300973fce4114692d5853addec284d1cdeb677"}, -] -boto3 = [ - {file = "boto3-1.18.19-py3-none-any.whl", hash = "sha256:72b1f70a5a42dff0a9c26a71486d3dcb3e098fac5b36126bc8fdcdec8c4d3cf4"}, - {file = "boto3-1.18.19.tar.gz", hash = "sha256:096f771c259484dc7140af2b7a9078e9c3efba28e2a298d1e8e40fed404fa38e"}, -] -botocore = [ - {file = "botocore-1.21.19-py3-none-any.whl", hash = "sha256:2fa40a39b338888c9492dc1e36734d8807f9e1c6f5dd3514247338e97f4da0f6"}, - {file = "botocore-1.21.19.tar.gz", hash = "sha256:7dce88db827e9b5c88701c978df00742c854d2b751fbda8db7656fb9a571afc5"}, -] -catalogue = [ - {file = 
"catalogue-2.0.4-py3-none-any.whl", hash = "sha256:62572ad1a641face0eb1436921ee4e03169162879bdc25ab8d535219b5f65b48"}, - {file = "catalogue-2.0.4.tar.gz", hash = "sha256:9ed345d12855af315f1715583612b26b8621a2b0a2e3bef974dc5d712f7983aa"}, -] -certifi = [ - {file = "certifi-2021.5.30-py2.py3-none-any.whl", hash = "sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"}, - {file = "certifi-2021.5.30.tar.gz", hash = "sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee"}, -] -cffi = [ - {file = "cffi-1.14.6-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c"}, - {file = "cffi-1.14.6-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99"}, - {file = "cffi-1.14.6-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819"}, - {file = "cffi-1.14.6-cp27-cp27m-win32.whl", hash = "sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20"}, - {file = "cffi-1.14.6-cp27-cp27m-win_amd64.whl", hash = "sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224"}, - {file = "cffi-1.14.6-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7"}, - {file = "cffi-1.14.6-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33"}, - {file = "cffi-1.14.6-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534"}, - {file = "cffi-1.14.6-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a"}, - {file = "cffi-1.14.6-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5"}, - {file = "cffi-1.14.6-cp35-cp35m-win32.whl", hash = 
"sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca"}, - {file = "cffi-1.14.6-cp35-cp35m-win_amd64.whl", hash = "sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218"}, - {file = "cffi-1.14.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f"}, - {file = "cffi-1.14.6-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872"}, - {file = "cffi-1.14.6-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195"}, - {file = "cffi-1.14.6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d"}, - {file = "cffi-1.14.6-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b"}, - {file = "cffi-1.14.6-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb"}, - {file = "cffi-1.14.6-cp36-cp36m-win32.whl", hash = "sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a"}, - {file = "cffi-1.14.6-cp36-cp36m-win_amd64.whl", hash = "sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e"}, - {file = "cffi-1.14.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5"}, - {file = "cffi-1.14.6-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf"}, - {file = "cffi-1.14.6-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69"}, - {file = "cffi-1.14.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56"}, - {file = 
"cffi-1.14.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c"}, - {file = "cffi-1.14.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762"}, - {file = "cffi-1.14.6-cp37-cp37m-win32.whl", hash = "sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771"}, - {file = "cffi-1.14.6-cp37-cp37m-win_amd64.whl", hash = "sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a"}, - {file = "cffi-1.14.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0"}, - {file = "cffi-1.14.6-cp38-cp38-manylinux1_i686.whl", hash = "sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e"}, - {file = "cffi-1.14.6-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346"}, - {file = "cffi-1.14.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc"}, - {file = "cffi-1.14.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd"}, - {file = "cffi-1.14.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc"}, - {file = "cffi-1.14.6-cp38-cp38-win32.whl", hash = "sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548"}, - {file = "cffi-1.14.6-cp38-cp38-win_amd64.whl", hash = "sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156"}, - {file = "cffi-1.14.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d"}, - {file = "cffi-1.14.6-cp39-cp39-manylinux1_i686.whl", hash = 
"sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e"}, - {file = "cffi-1.14.6-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c"}, - {file = "cffi-1.14.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202"}, - {file = "cffi-1.14.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f"}, - {file = "cffi-1.14.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87"}, - {file = "cffi-1.14.6-cp39-cp39-win32.whl", hash = "sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728"}, - {file = "cffi-1.14.6-cp39-cp39-win_amd64.whl", hash = "sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2"}, - {file = "cffi-1.14.6.tar.gz", hash = "sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd"}, -] -cfgv = [ - {file = "cfgv-3.3.0-py2.py3-none-any.whl", hash = "sha256:b449c9c6118fe8cca7fa5e00b9ec60ba08145d281d52164230a69211c5d597a1"}, - {file = "cfgv-3.3.0.tar.gz", hash = "sha256:9e600479b3b99e8af981ecdfc80a0296104ee610cab48a5ae4ffd0b668650eb1"}, -] -charset-normalizer = [ - {file = "charset-normalizer-2.0.4.tar.gz", hash = "sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"}, - {file = "charset_normalizer-2.0.4-py3-none-any.whl", hash = "sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b"}, -] -click = [ - {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"}, - {file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"}, -] -cloudpickle = [ - {file = "cloudpickle-1.6.0-py3-none-any.whl", hash = 
"sha256:3a32d0eb0bc6f4d0c57fbc4f3e3780f7a81e6fee0fa935072884d58ae8e1cc7c"}, - {file = "cloudpickle-1.6.0.tar.gz", hash = "sha256:9bc994f9e9447593bd0a45371f0e7ac7333710fcf64a4eb9834bf149f4ef2f32"}, -] -colorama = [ - {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, - {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, -] -cycler = [ - {file = "cycler-0.10.0-py2.py3-none-any.whl", hash = "sha256:1d8a5ae1ff6c5cf9b93e8811e581232ad8920aeec647c37316ceac982b08cb2d"}, - {file = "cycler-0.10.0.tar.gz", hash = "sha256:cd7b2d1018258d7247a71425e9f26463dfb444d411c39569972f4ce586b0c9d8"}, -] -cymem = [ - {file = "cymem-2.0.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9d72d69f7a62a280199c3aa7bc550685c47d6d0689b2d299e6492253b86d2437"}, - {file = "cymem-2.0.5-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:8ea57e6923f40eb51012352161bb5707c14a5a5ce901ff72021e59df06221655"}, - {file = "cymem-2.0.5-cp36-cp36m-win_amd64.whl", hash = "sha256:4bd023c2477198b39b660c2a6b0242880649765ecee8461688a57fd4afd2bfc0"}, - {file = "cymem-2.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f0eb9b3d03623dcfc746cf8bff0663b0e347f4aea759965c8932087a0307ee9"}, - {file = "cymem-2.0.5-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:a440d63577fcdc9c528c9cc026b7b4f8648193bac462bc0596c9eac10f9fba62"}, - {file = "cymem-2.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:3d48902d7441645835fefc7832df49feb5362c7300d182475b63a01d25ae44ef"}, - {file = "cymem-2.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f2167c9959fcd639b95d51fa5efaa7c61eef8d686cb75a25412a914f428ce980"}, - {file = "cymem-2.0.5-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:734d82d0d03c2ceb929bc1744c04dbe0a105e68a4947c8406056a36f86c41830"}, - {file = "cymem-2.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:01d3ea159f7a3f3192b1e800ed8207dac7586794d903a153198b9ea317f144bc"}, - 
{file = "cymem-2.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d307f7f6230d861a938837cae4b855226b6845a21c010242a15e9ce6853856cd"}, - {file = "cymem-2.0.5-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:ce1e81c1d031f56b67bac2136e73b4512cbc794706cd570178972d54ba6115d8"}, - {file = "cymem-2.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d19f68b90411e02ab33b1654118337f96f41c13a3cd00c4f44f7abed2bc712e7"}, - {file = "cymem-2.0.5.tar.gz", hash = "sha256:190e15d9cf2c3bde60ae37bddbae6568a36044dc4a326d84081a5fa08818eee0"}, -] -cython = [ - {file = "Cython-0.29.14-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47e5e1502d52ef03387cf9d3b3241007961a84a466e58a3b74028e1dd4957f8c"}, - {file = "Cython-0.29.14-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:1dcdaa319558eb924294a554dcf6c12383ec947acc7e779e8d3622409a7f7d28"}, - {file = "Cython-0.29.14-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:7bc18fc5a170f2c1cef5387a3d997c28942918bbee0f700e73fd2178ee8d474d"}, - {file = "Cython-0.29.14-cp27-cp27m-win32.whl", hash = "sha256:89458b49976b1dee5d89ab4ac943da3717b4292bf624367e862e4ee172fcce99"}, - {file = "Cython-0.29.14-cp27-cp27m-win_amd64.whl", hash = "sha256:c0b24bfe3431b3cb7ced323bca813dbd13aca973a1475b512d3331fd0de8ec60"}, - {file = "Cython-0.29.14-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:7f89eff20e4a7a64b55210dac17aea711ed8a3f2e78f2ff784c0e984302583dd"}, - {file = "Cython-0.29.14-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:6c53338c1811f8c6d7f8cb7abd874810b15045e719e8207f957035c9177b4213"}, - {file = "Cython-0.29.14-cp34-cp34m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:521340844cf388d109ceb61397f3fd5250ccb622a1a8e93559e8de76c80940a9"}, - {file = "Cython-0.29.14-cp34-cp34m-manylinux1_i686.whl", hash = "sha256:75c2dda47dcc3c77449712b1417bb6b89ec3b7b02e18c64262494dceffdf455e"}, - {file = 
"Cython-0.29.14-cp34-cp34m-manylinux1_x86_64.whl", hash = "sha256:05eb79efc8029d487251c8a2702a909a8ba33c332e06d2f3980866541bd81253"}, - {file = "Cython-0.29.14-cp34-cp34m-win32.whl", hash = "sha256:1fc5bdda28f25fec44e4721677458aa509d743cd350862270309d61aa148d6ff"}, - {file = "Cython-0.29.14-cp34-cp34m-win_amd64.whl", hash = "sha256:0c70e842e52e2f50cc43bad43b5e5bc515f30821a374e544abb0e0746f2350ff"}, - {file = "Cython-0.29.14-cp35-cp35m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:094d28a34c3fa992ae02aea1edbe6ff89b3cc5870b6ee38b5baeb805dc57b013"}, - {file = "Cython-0.29.14-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:280573a01d9348d44a42d6a9c651d9f7eb1fe9217df72555b2a118f902996a10"}, - {file = "Cython-0.29.14-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:773c5a98e463b52f7e8197254b39b703a5ea1972aef3a94b3b921515d77dd041"}, - {file = "Cython-0.29.14-cp35-cp35m-win32.whl", hash = "sha256:986f871c0fa649b293061236b93782d25c293a8dd8117c7ba05f8a61bdc261ae"}, - {file = "Cython-0.29.14-cp35-cp35m-win_amd64.whl", hash = "sha256:78c3068dcba300d473fef57cdf523e34b37de522f5a494ef9ee1ac9b4b8bbe3f"}, - {file = "Cython-0.29.14-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:f3818e578e687cdb21dc4aa4a3bc6278c656c9c393e9eda14dd04943f478863d"}, - {file = "Cython-0.29.14-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:bb487881608ebd293592553c618f0c83316f4f13a64cb18605b1d2fb9fd3da3e"}, - {file = "Cython-0.29.14-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:03f6bbb380ad0acb744fb06e42996ea217e9d00016ca0ff6f2e7d60f580d0360"}, - {file = "Cython-0.29.14-cp36-cp36m-win32.whl", hash = "sha256:b8ab3ab38afc47d8f4fe629b836243544351cef681b6bdb1dc869028d6fdcbfb"}, - {file = "Cython-0.29.14-cp36-cp36m-win_amd64.whl", hash = "sha256:298ceca7b0f0da4205fcb0b7c9ac9e120e2dafffd5019ba1618e84ef89434b5a"}, - {file = 
"Cython-0.29.14-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:fe666645493d72712c46e4fbe8bec094b06aec3c337400479e9704439c9d9586"}, - {file = "Cython-0.29.14-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:4074a8bff0040035673cc6dd365a762476d6bff4d03d8ce6904e3e53f9a25dc8"}, - {file = "Cython-0.29.14-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:a14aa436586c41633339415de82a41164691d02d3e661038da533be5d40794a5"}, - {file = "Cython-0.29.14-cp37-cp37m-win32.whl", hash = "sha256:41e7068e95fbf9ec94b41437f989caf9674135e770a39cdb9c00de459bafd1bc"}, - {file = "Cython-0.29.14-cp37-cp37m-win_amd64.whl", hash = "sha256:05e8cfd3a3a6087aec49a1ae08a89171db991956209406d1e5576f9db70ece52"}, - {file = "Cython-0.29.14-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e8fab9911fd2fa8e5af407057cb8bdf87762f983cba483fa3234be20a9a0af77"}, - {file = "Cython-0.29.14-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:d4039bb7f234ad32267c55e72fd49fb56078ea102f9d9d8559f6ec34d4887630"}, - {file = "Cython-0.29.14-cp38-cp38-win32.whl", hash = "sha256:c7894c06205166d360ab2915ae306d1f7403e9ce3d3aaeff4095eaf98e42ce66"}, - {file = "Cython-0.29.14-cp38-cp38-win_amd64.whl", hash = "sha256:a0f495a4fe5278aab278feee35e6102efecde5176a8a74dd28c28e3fc5c8d7c7"}, - {file = "Cython-0.29.14.tar.gz", hash = "sha256:e4d6bb8703d0319eb04b7319b12ea41580df44fd84d83ccda13ea463c6801414"}, -] -decorator = [ - {file = "decorator-5.0.9-py3-none-any.whl", hash = "sha256:6e5c199c16f7a9f0e3a61a4a54b3d27e7dad0dbdde92b944426cb20914376323"}, - {file = "decorator-5.0.9.tar.gz", hash = "sha256:72ecfba4320a893c53f9706bebb2d55c270c1e51a28789361aa93e4a21319ed5"}, -] -defusedxml = [ - {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, - {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, -] -distlib = [ - {file = 
"distlib-0.3.2-py2.py3-none-any.whl", hash = "sha256:23e223426b28491b1ced97dc3bbe183027419dfc7982b4fa2f05d5f3ff10711c"}, - {file = "distlib-0.3.2.zip", hash = "sha256:106fef6dc37dd8c0e2c0a60d3fca3e77460a48907f335fa28420463a6f799736"}, -] -eli5 = [ - {file = "eli5-0.11.0-py2.py3-none-any.whl", hash = "sha256:1ea45cd0722d20c8c9e9bb89c7c5909feeface4e5942e24b7a89809f0fe593a2"}, - {file = "eli5-0.11.0.tar.gz", hash = "sha256:aea7b51be9157ce615b319711467f358de03da12328e5639818b3cb3755aa056"}, -] -entrypoints = [ - {file = "entrypoints-0.3-py2.py3-none-any.whl", hash = "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19"}, - {file = "entrypoints-0.3.tar.gz", hash = "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"}, -] -filelock = [ - {file = "filelock-3.0.12-py3-none-any.whl", hash = "sha256:929b7d63ec5b7d6b71b0fa5ac14e030b3f70b75747cef1b10da9b879fef15836"}, - {file = "filelock-3.0.12.tar.gz", hash = "sha256:18d82244ee114f543149c66a6e0c14e9c4f8a1044b5cdaadd0f82159d6a6ff59"}, -] -gensim = [ - {file = "gensim-3.8.3-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:61eed1d6b5fbe6dda0586ea447ebc2dc7890a7f70c2ed953d5abc3fe3cfb94bb"}, - {file = "gensim-3.8.3-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:3af62709369331c85552fd26caa21504baa64accc426dc094172f5c688750013"}, - {file = "gensim-3.8.3-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:8ff471921b3b10ffb3ae6cbb598dd9c07d9dc030dee5aa167e7682b549c42f87"}, - {file = "gensim-3.8.3-cp27-cp27m-win32.whl", hash = "sha256:440700e29b494bc2e1d52e14b69a821f46ab09ecf85cf36c8988f18e1d6c7a8b"}, - {file = "gensim-3.8.3-cp27-cp27m-win_amd64.whl", hash = "sha256:f8ea67bf8c47ee55cb1b32c97fa1474b7d6d22959dd8097c019a5d9c9df34f5f"}, - {file = "gensim-3.8.3-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:7a90549dfc8ee3822fcad6da957de07d927e4e90ef42b3699543dee35ab2da13"}, - {file = "gensim-3.8.3-cp27-cp27mu-manylinux1_x86_64.whl", hash = 
"sha256:7629b33cf35f672efdd5269381f7e301958ee2638f27dfc63b80c5bfeaa827d3"}, - {file = "gensim-3.8.3-cp35-cp35m-macosx_10_9_intel.whl", hash = "sha256:6711b6d3a0007530ee7de7adc30a4c48a1d26ec6312ac50e1d1e0a1d54f9de5b"}, - {file = "gensim-3.8.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:ef2ddeceff482aee17c1e185f63bf027c8de8f595fdd9fd2d2503de96008f3b7"}, - {file = "gensim-3.8.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:41dcf6ecdc9acc657157967c791b8cbaba90ee6391f64efd28339b72f5e0c327"}, - {file = "gensim-3.8.3-cp35-cp35m-win32.whl", hash = "sha256:685a7657278161628821c8f873c5d7d2ffc0c28866648e39f76b450e4c7d5390"}, - {file = "gensim-3.8.3-cp35-cp35m-win_amd64.whl", hash = "sha256:b61a7c841a752c84b685674aa0d610289faad38795b325176481abe19b487e98"}, - {file = "gensim-3.8.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a61179df454a0d4b06a111c4ede0536f61c8121b4c0d0d02d23560a2fd4b3aff"}, - {file = "gensim-3.8.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:cc387d0d8bddbf3609ab95b3453296e4c9ff92c35e9799a17d86b1571d77a5fc"}, - {file = "gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:b36e6330471061cfd78aad751e24c6b4f56d575697af0fbab42655128927d296"}, - {file = "gensim-3.8.3-cp36-cp36m-win32.whl", hash = "sha256:1e3d66c2eec494376fc599701d9c2868549aed6e93e47177e39217f0188e2d88"}, - {file = "gensim-3.8.3-cp36-cp36m-win_amd64.whl", hash = "sha256:91fa62d61b21f1878f140b10520f9de4a26a52672fbe407edfc7e09ca2eff235"}, - {file = "gensim-3.8.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:637fc5969f3cef4b7c8fd3e78e31ef09565c5566d5ceabf076b4170eb6444a80"}, - {file = "gensim-3.8.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:22f45fd239cacd0e3715ac447a2c8a5eea02e730ec1f701c55b359e9298e63a8"}, - {file = "gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:d79370f78e9013b9d1e867c85ecc678d46a7ae0f01a8ca29e8f4291e5373b170"}, - {file = "gensim-3.8.3-cp37-cp37m-win32.whl", hash = 
"sha256:9c214b341f5304b906c79844e2787c13b46505df9dc70afca79a9a7dc0894478"}, - {file = "gensim-3.8.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fe98277a7b3b4987b40c928056bbaae1d0715022cf27bba89d05cd0d4fe51a84"}, - {file = "gensim-3.8.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a47903d104469a7a8b6f22ad5ef74681b19c4f4b71ff2c2893271b53161a43e4"}, - {file = "gensim-3.8.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:05bfc02e102a34c9c795095b688b1b4aaa2529c624821368c9c3ea6a16536f77"}, - {file = "gensim-3.8.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:a8807ebf324dd11e1298a91a92d6e57c7bdabb91d0d5240bf1efa0c0eacd86f0"}, - {file = "gensim-3.8.3-cp38-cp38-win32.whl", hash = "sha256:90115d12ee545c21cc75521ef1bb3dd66aae8a378e9c2eb029c9f22df173c125"}, - {file = "gensim-3.8.3-cp38-cp38-win_amd64.whl", hash = "sha256:4e34cf2e50f3eab3e303da46089ea4972567bf216e28f7535ada155770784ac8"}, - {file = "gensim-3.8.3.tar.gz", hash = "sha256:786adb0571f75114e9c5f7a31dd2e6eb39a9791f22c8757621545e2ded3ea367"}, -] -graphviz = [ - {file = "graphviz-0.16-py2.py3-none-any.whl", hash = "sha256:3cad5517c961090dfc679df6402a57de62d97703e2880a1a46147bb0dc1639eb"}, - {file = "graphviz-0.16.zip", hash = "sha256:d2d25af1c199cad567ce4806f0449cb74eb30cf451fd7597251e1da099ac6e57"}, -] -gym = [ - {file = "gym-0.18.3.tar.gz", hash = "sha256:81a3e3fbf7fcf57c8cf98f7e22d1bdd5815f3824d9c148a7eb42420d3d642967"}, -] -h5py = [ - {file = "h5py-3.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:f3bba8ffddd1fd2bf06127c5ff7b73f022cc1c8b7164355ddc760dc3f8570136"}, - {file = "h5py-3.3.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:baef1a2cdef287a83e7f95ce9e0f4d762a9852fe7117b471063442c78b973695"}, - {file = "h5py-3.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:8e09b682e4059c8cd259ddcc34bee35d639b9170105efeeae6ad195e7c1cea7a"}, - {file = "h5py-3.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:89d7e10409b62fed81c571e35798763cb8375442b98f8ebfc52ba41ac019e081"}, - {file = 
"h5py-3.3.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7ca7d23ebbdd59a4be9b4820de52fe67adc74e6a44d5084881305461765aac47"}, - {file = "h5py-3.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e0ea3330bf136f8213e43db67448994046ce501585dddc7ea4e8ceef0ef1600c"}, - {file = "h5py-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:13355234c004ff8bd819f7d3420188aa1936b17d7f8470d622974a373421b7a5"}, - {file = "h5py-3.3.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:09e78cefdef0b7566ab66366c5c7d9984c7b23142245bd51b82b744ad1eebf65"}, - {file = "h5py-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:5e2f22e66a3fb1815405cfe5711670450c973b8552507c535a546a23a468af3d"}, - {file = "h5py-3.3.0.tar.gz", hash = "sha256:e0dac887d779929778b3cfd13309a939359cc9e74756fc09af7c527a82797186"}, -] -identify = [ - {file = "identify-2.2.13-py2.py3-none-any.whl", hash = "sha256:7199679b5be13a6b40e6e19ea473e789b11b4e3b60986499b1f589ffb03c217c"}, - {file = "identify-2.2.13.tar.gz", hash = "sha256:7bc6e829392bd017236531963d2d937d66fc27cadc643ac0aba2ce9f26157c79"}, -] -idna = [ - {file = "idna-3.2-py3-none-any.whl", hash = "sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a"}, - {file = "idna-3.2.tar.gz", hash = "sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"}, -] -imageio = [ - {file = "imageio-2.9.0-py3-none-any.whl", hash = "sha256:3604d751f03002e8e0e7650aa71d8d9148144a87daf17cb1f3228e80747f2e6b"}, - {file = "imageio-2.9.0.tar.gz", hash = "sha256:52ddbaeca2dccf53ba2d6dec5676ca7bc3b2403ef8b37f7da78b7654bb3e10f0"}, -] -ipykernel = [ - {file = "ipykernel-5.5.5-py3-none-any.whl", hash = "sha256:29eee66548ee7c2edb7941de60c0ccf0a7a8dd957341db0a49c5e8e6a0fcb712"}, - {file = "ipykernel-5.5.5.tar.gz", hash = "sha256:e976751336b51082a89fc2099fb7f96ef20f535837c398df6eab1283c2070884"}, -] -ipython = [ - {file = "ipython-7.26.0-py3-none-any.whl", hash = 
"sha256:892743b65c21ed72b806a3a602cca408520b3200b89d1924f4b3d2cdb3692362"}, - {file = "ipython-7.26.0.tar.gz", hash = "sha256:0cff04bb042800129348701f7bd68a430a844e8fb193979c08f6c99f28bb735e"}, -] -ipython-genutils = [ - {file = "ipython_genutils-0.2.0-py2.py3-none-any.whl", hash = "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8"}, - {file = "ipython_genutils-0.2.0.tar.gz", hash = "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"}, -] -ipywidgets = [ - {file = "ipywidgets-7.6.3-py2.py3-none-any.whl", hash = "sha256:e6513cfdaf5878de30f32d57f6dc2474da395a2a2991b94d487406c0ab7f55ca"}, - {file = "ipywidgets-7.6.3.tar.gz", hash = "sha256:9f1a43e620530f9e570e4a493677d25f08310118d315b00e25a18f12913c41f0"}, -] -jedi = [ - {file = "jedi-0.18.0-py2.py3-none-any.whl", hash = "sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93"}, - {file = "jedi-0.18.0.tar.gz", hash = "sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707"}, -] -jinja2 = [ - {file = "Jinja2-3.0.1-py3-none-any.whl", hash = "sha256:1f06f2da51e7b56b8f238affdd6b4e2c61e39598a378cc49345bc1bd42a978a4"}, - {file = "Jinja2-3.0.1.tar.gz", hash = "sha256:703f484b47a6af502e743c9122595cc812b0271f661722403114f71a79d0f5a4"}, -] -jmespath = [ - {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, - {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, -] -joblib = [ - {file = "joblib-1.0.1-py3-none-any.whl", hash = "sha256:feeb1ec69c4d45129954f1b7034954241eedfd6ba39b5e9e4b6883be3332d5e5"}, - {file = "joblib-1.0.1.tar.gz", hash = "sha256:9c17567692206d2f3fb9ecf5e991084254fe631665c450b443761c4186a613f7"}, -] -jsonschema = [ - {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, - {file = "jsonschema-3.2.0.tar.gz", hash = 
"sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, -] -jupyter-client = [ - {file = "jupyter_client-6.2.0-py3-none-any.whl", hash = "sha256:9715152067e3f7ea3b56f341c9a0f9715c8c7cc316ee0eb13c3c84f5ca0065f5"}, - {file = "jupyter_client-6.2.0.tar.gz", hash = "sha256:e2ab61d79fbf8b56734a4c2499f19830fbd7f6fefb3e87868ef0545cb3c17eb9"}, -] -jupyter-core = [ - {file = "jupyter_core-4.7.1-py3-none-any.whl", hash = "sha256:8c6c0cac5c1b563622ad49321d5ec47017bd18b94facb381c6973a0486395f8e"}, - {file = "jupyter_core-4.7.1.tar.gz", hash = "sha256:79025cb3225efcd36847d0840f3fc672c0abd7afd0de83ba8a1d3837619122b4"}, -] -jupyterlab-pygments = [ - {file = "jupyterlab_pygments-0.1.2-py2.py3-none-any.whl", hash = "sha256:abfb880fd1561987efaefcb2d2ac75145d2a5d0139b1876d5be806e32f630008"}, - {file = "jupyterlab_pygments-0.1.2.tar.gz", hash = "sha256:cfcda0873626150932f438eccf0f8bf22bfa92345b814890ab360d666b254146"}, -] -jupyterlab-widgets = [ - {file = "jupyterlab_widgets-1.0.0-py3-none-any.whl", hash = "sha256:caeaf3e6103180e654e7d8d2b81b7d645e59e432487c1d35a41d6d3ee56b3fef"}, - {file = "jupyterlab_widgets-1.0.0.tar.gz", hash = "sha256:5c1a29a84d3069208cb506b10609175b249b6486d6b1cbae8fcde2a11584fb78"}, -] -kiwisolver = [ - {file = "kiwisolver-1.3.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:fd34fbbfbc40628200730bc1febe30631347103fc8d3d4fa012c21ab9c11eca9"}, - {file = "kiwisolver-1.3.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:d3155d828dec1d43283bd24d3d3e0d9c7c350cdfcc0bd06c0ad1209c1bbc36d0"}, - {file = "kiwisolver-1.3.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:5a7a7dbff17e66fac9142ae2ecafb719393aaee6a3768c9de2fd425c63b53e21"}, - {file = "kiwisolver-1.3.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:f8d6f8db88049a699817fd9178782867bf22283e3813064302ac59f61d95be05"}, - {file = "kiwisolver-1.3.1-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:5f6ccd3dd0b9739edcf407514016108e2280769c73a85b9e59aa390046dbf08b"}, - {file = 
"kiwisolver-1.3.1-cp36-cp36m-win32.whl", hash = "sha256:225e2e18f271e0ed8157d7f4518ffbf99b9450fca398d561eb5c4a87d0986dd9"}, - {file = "kiwisolver-1.3.1-cp36-cp36m-win_amd64.whl", hash = "sha256:cf8b574c7b9aa060c62116d4181f3a1a4e821b2ec5cbfe3775809474113748d4"}, - {file = "kiwisolver-1.3.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:232c9e11fd7ac3a470d65cd67e4359eee155ec57e822e5220322d7b2ac84fbf0"}, - {file = "kiwisolver-1.3.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:b38694dcdac990a743aa654037ff1188c7a9801ac3ccc548d3341014bc5ca278"}, - {file = "kiwisolver-1.3.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:ca3820eb7f7faf7f0aa88de0e54681bddcb46e485beb844fcecbcd1c8bd01689"}, - {file = "kiwisolver-1.3.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:c8fd0f1ae9d92b42854b2979024d7597685ce4ada367172ed7c09edf2cef9cb8"}, - {file = "kiwisolver-1.3.1-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:1e1bc12fb773a7b2ffdeb8380609f4f8064777877b2225dec3da711b421fda31"}, - {file = "kiwisolver-1.3.1-cp37-cp37m-win32.whl", hash = "sha256:72c99e39d005b793fb7d3d4e660aed6b6281b502e8c1eaf8ee8346023c8e03bc"}, - {file = "kiwisolver-1.3.1-cp37-cp37m-win_amd64.whl", hash = "sha256:8be8d84b7d4f2ba4ffff3665bcd0211318aa632395a1a41553250484a871d454"}, - {file = "kiwisolver-1.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:31dfd2ac56edc0ff9ac295193eeaea1c0c923c0355bf948fbd99ed6018010b72"}, - {file = "kiwisolver-1.3.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:563c649cfdef27d081c84e72a03b48ea9408c16657500c312575ae9d9f7bc1c3"}, - {file = "kiwisolver-1.3.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:78751b33595f7f9511952e7e60ce858c6d64db2e062afb325985ddbd34b5c131"}, - {file = "kiwisolver-1.3.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a357fd4f15ee49b4a98b44ec23a34a95f1e00292a139d6015c11f55774ef10de"}, - {file = "kiwisolver-1.3.1-cp38-cp38-manylinux2014_ppc64le.whl", hash = 
"sha256:5989db3b3b34b76c09253deeaf7fbc2707616f130e166996606c284395da3f18"}, - {file = "kiwisolver-1.3.1-cp38-cp38-win32.whl", hash = "sha256:c08e95114951dc2090c4a630c2385bef681cacf12636fb0241accdc6b303fd81"}, - {file = "kiwisolver-1.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:44a62e24d9b01ba94ae7a4a6c3fb215dc4af1dde817e7498d901e229aaf50e4e"}, - {file = "kiwisolver-1.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:50af681a36b2a1dee1d3c169ade9fdc59207d3c31e522519181e12f1b3ba7000"}, - {file = "kiwisolver-1.3.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:a53d27d0c2a0ebd07e395e56a1fbdf75ffedc4a05943daf472af163413ce9598"}, - {file = "kiwisolver-1.3.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:834ee27348c4aefc20b479335fd422a2c69db55f7d9ab61721ac8cd83eb78882"}, - {file = "kiwisolver-1.3.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:5c3e6455341008a054cccee8c5d24481bcfe1acdbc9add30aa95798e95c65621"}, - {file = "kiwisolver-1.3.1-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:acef3d59d47dd85ecf909c359d0fd2c81ed33bdff70216d3956b463e12c38a54"}, - {file = "kiwisolver-1.3.1-cp39-cp39-win32.whl", hash = "sha256:c5518d51a0735b1e6cee1fdce66359f8d2b59c3ca85dc2b0813a8aa86818a030"}, - {file = "kiwisolver-1.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:b9edd0110a77fc321ab090aaa1cfcaba1d8499850a12848b81be2222eab648f6"}, - {file = "kiwisolver-1.3.1-pp36-pypy36_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0cd53f403202159b44528498de18f9285b04482bab2a6fc3f5dd8dbb9352e30d"}, - {file = "kiwisolver-1.3.1-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:33449715e0101e4d34f64990352bce4095c8bf13bed1b390773fc0a7295967b3"}, - {file = "kiwisolver-1.3.1-pp36-pypy36_pp73-win32.whl", hash = "sha256:401a2e9afa8588589775fe34fc22d918ae839aaaf0c0e96441c0fdbce6d8ebe6"}, - {file = "kiwisolver-1.3.1.tar.gz", hash = "sha256:950a199911a8d94683a6b10321f9345d5a3a8433ec58b217ace979e18f16e248"}, -] -llvmlite = [ - {file = "llvmlite-0.34.0-cp36-cp36m-macosx_10_9_x86_64.whl", 
hash = "sha256:11342e5ac320c953590bdd9d0dec8c52f4b5252c4c6335ba25f1e7b9f91f9325"}, - {file = "llvmlite-0.34.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:5bdf0ce430adfaf938ced5844d12f80616eb8321b5b9edfc45ef84ada5c5242c"}, - {file = "llvmlite-0.34.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:e08d9d2dc5a31636bfc6b516d2d7daba95632afa3419eb8730dc76a7951e9558"}, - {file = "llvmlite-0.34.0-cp36-cp36m-win32.whl", hash = "sha256:9ff1dcdad03be0cf953aca5fc8cffdca25ccee2ec9e8ec7e95571722cdc02d55"}, - {file = "llvmlite-0.34.0-cp36-cp36m-win_amd64.whl", hash = "sha256:5acdc3c3c7ea0ef7a1a6b442272e05d695bc8492e5b07666135ed1cfbf4ab9d2"}, - {file = "llvmlite-0.34.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:bb96989bc57a1ccb131e7a0e061d07b68139b6f81a98912345d53d9239e231e1"}, - {file = "llvmlite-0.34.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:6d3f81992f52a94077e7b9b16497029daf5b5eebb2cce56f3c8345bbc9c6308e"}, - {file = "llvmlite-0.34.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:d841248d1c630426c93e3eb3f8c45bca0dab77c09faeb7553b1a500220e362ce"}, - {file = "llvmlite-0.34.0-cp37-cp37m-win32.whl", hash = "sha256:408b15ffec30696406e821c89da010f1bb1eb0aa572be4561c98eb2536d610ab"}, - {file = "llvmlite-0.34.0-cp37-cp37m-win_amd64.whl", hash = "sha256:5d1f370bf150db7239204f09cf6a0603292ea28bac984e69b167e16fe160d803"}, - {file = "llvmlite-0.34.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:132322bc084abf336c80dd106f9357978c8c085911fb656898d3be0d9ff057ea"}, - {file = "llvmlite-0.34.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:8f344102745fceba6eb5bf03c228bb290e9bc79157e9506a4a72878d636f9b3c"}, - {file = "llvmlite-0.34.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:05253f3f44fab0148276335b2c1b2c4a78143dfa78e6bafd7f937d6248f297cc"}, - {file = "llvmlite-0.34.0-cp38-cp38-win32.whl", hash = "sha256:28264f9e2b3df4135cbcfca5a91c5b0b31dd3fc02fa623b4bb13327f0cd4fc80"}, - {file = "llvmlite-0.34.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:964f8f7a2184963cb3617d057c2382575953e488b7bb061b632ee014cfef110a"}, - {file = "llvmlite-0.34.0.tar.gz", hash = "sha256:f03ee0d19bca8f2fe922bb424a909d05c28411983b0c2bc58b020032a0d11f63"}, -] -markupsafe = [ - {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-win32.whl", hash = "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94"}, - {file = 
"MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-win32.whl", hash = "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = 
"sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-win32.whl", hash = "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb"}, - {file = 
"MarkupSafe-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-win32.whl", hash = "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-win32.whl", hash = "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8"}, - {file = "MarkupSafe-2.0.1.tar.gz", hash = "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a"}, -] -matplotlib = [ - {file = "matplotlib-3.4.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c541ee5a3287efe066bbe358320853cf4916bc14c00c38f8f3d8d75275a405a9"}, - {file = "matplotlib-3.4.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:3a5c18dbd2c7c366da26a4ad1462fe3e03a577b39e3b503bbcf482b9cdac093c"}, - {file = "matplotlib-3.4.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:a9d8cb5329df13e0cdaa14b3b43f47b5e593ec637f13f14db75bb16e46178b05"}, - {file = "matplotlib-3.4.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:7ad19f3fb6145b9eb41c08e7cbb9f8e10b91291396bee21e9ce761bb78df63ec"}, - {file = "matplotlib-3.4.2-cp37-cp37m-win32.whl", hash = "sha256:7a58f3d8fe8fac3be522c79d921c9b86e090a59637cb88e3bc51298d7a2c862a"}, - {file = "matplotlib-3.4.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6382bc6e2d7e481bcd977eb131c31dee96e0fb4f9177d15ec6fb976d3b9ace1a"}, - {file = "matplotlib-3.4.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a6a44f27aabe720ec4fd485061e8a35784c2b9ffa6363ad546316dfc9cea04e"}, - {file = "matplotlib-3.4.2-cp38-cp38-manylinux1_i686.whl", hash = 
"sha256:1c1779f7ab7d8bdb7d4c605e6ffaa0614b3e80f1e3c8ccf7b9269a22dbc5986b"}, - {file = "matplotlib-3.4.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5826f56055b9b1c80fef82e326097e34dc4af8c7249226b7dd63095a686177d1"}, - {file = "matplotlib-3.4.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0bea5ec5c28d49020e5d7923c2725b837e60bc8be99d3164af410eb4b4c827da"}, - {file = "matplotlib-3.4.2-cp38-cp38-win32.whl", hash = "sha256:6475d0209024a77f869163ec3657c47fed35d9b6ed8bccba8aa0f0099fbbdaa8"}, - {file = "matplotlib-3.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:21b31057bbc5e75b08e70a43cefc4c0b2c2f1b1a850f4a0f7af044eb4163086c"}, - {file = "matplotlib-3.4.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b26535b9de85326e6958cdef720ecd10bcf74a3f4371bf9a7e5b2e659c17e153"}, - {file = "matplotlib-3.4.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:32fa638cc10886885d1ca3d409d4473d6a22f7ceecd11322150961a70fab66dd"}, - {file = "matplotlib-3.4.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:956c8849b134b4a343598305a3ca1bdd3094f01f5efc8afccdebeffe6b315247"}, - {file = "matplotlib-3.4.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:85f191bb03cb1a7b04b5c2cca4792bef94df06ef473bc49e2818105671766fee"}, - {file = "matplotlib-3.4.2-cp39-cp39-win32.whl", hash = "sha256:b1d5a2cedf5de05567c441b3a8c2651fbde56df08b82640e7f06c8cd91e201f6"}, - {file = "matplotlib-3.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:df815378a754a7edd4559f8c51fc7064f779a74013644a7f5ac7a0c31f875866"}, - {file = "matplotlib-3.4.2.tar.gz", hash = "sha256:d8d994cefdff9aaba45166eb3de4f5211adb4accac85cbf97137e98f26ea0219"}, -] -matplotlib-inline = [ - {file = "matplotlib-inline-0.1.2.tar.gz", hash = "sha256:f41d5ff73c9f5385775d5c0bc13b424535c8402fe70ea8210f93e11f3683993e"}, - {file = "matplotlib_inline-0.1.2-py3-none-any.whl", hash = "sha256:5cf1176f554abb4fa98cb362aa2b55c500147e4bdbb07e3fda359143e1da0811"}, -] -mistune = [ - {file = "mistune-0.8.4-py2.py3-none-any.whl", hash = 
"sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4"}, - {file = "mistune-0.8.4.tar.gz", hash = "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e"}, -] -murmurhash = [ - {file = "murmurhash-1.0.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ef8819d15973e0d6f69688bafc097a1fae081675c1de39807028869a1320b1a9"}, - {file = "murmurhash-1.0.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:76251513a2acad6c2e4b7aeffc5fcb807ee97a66cad5c2990557556555a6b7e9"}, - {file = "murmurhash-1.0.5-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:d58315961dc5a5e740f41f2ac5c3a0ebc61ef472f8afeb4db7eeb3b863243105"}, - {file = "murmurhash-1.0.5-cp36-cp36m-win_amd64.whl", hash = "sha256:23c56182822a1ed88e2a098ac56958dfec380696a9a943df203b9b41e4bcf5e4"}, - {file = "murmurhash-1.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:023391cfefe584ac544c1ea0936976c0119b17dd27bb8280652cef1704f76428"}, - {file = "murmurhash-1.0.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f00321998f0a6bad3fd068babf448a296d4b0b1f4dd424cab863ebe5ed54182f"}, - {file = "murmurhash-1.0.5-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:8381172e03c5f6f947005fb146a53c5e5a9e0d630be4a40cbf8838e9324bfe1c"}, - {file = "murmurhash-1.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:fed7578fbaa6c301f27ed80834c1f7494ea7d335e269e98b9aee477cf0b3b487"}, - {file = "murmurhash-1.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d4c3a0242014cf4c84e9ea0ba3f13b48f02a3992de3da7b1116d11b816451195"}, - {file = "murmurhash-1.0.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:99e55488476a5f70e8d305fd31258f140e52f724f788bcc50c31ec846a2b3766"}, - {file = "murmurhash-1.0.5-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:b9292c532538cf47846ca81056cfeab08b877c35fe7521d6524aa92ddcd833e2"}, - {file = "murmurhash-1.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:fd17973fd4554715efd8d86b3e9200358e49e437fdb92a897ca127aced48b61c"}, - {file = 
"murmurhash-1.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:81474a45c4074637a6dfc8fea4cdebf091ab5aa781c2cfcb94c43b16030badd7"}, - {file = "murmurhash-1.0.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:a9bd2312996e6e47605af305a1e5f091eba1bdd637cdd9986aec4885cb4c5530"}, - {file = "murmurhash-1.0.5-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:892749023da26420d194f37bfa30df1368aaac0149cfa3b2105db36b66549e37"}, - {file = "murmurhash-1.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:add366944eb8ec73013a4f36e166c5a4f0f7628ffe1746bc5fe031347489e5e8"}, - {file = "murmurhash-1.0.5.tar.gz", hash = "sha256:98ec9d727bd998a35385abd56b062cf0cca216725ea7ec5068604ab566f7e97f"}, -] -nbclient = [ - {file = "nbclient-0.5.3-py3-none-any.whl", hash = "sha256:e79437364a2376892b3f46bedbf9b444e5396cfb1bc366a472c37b48e9551500"}, - {file = "nbclient-0.5.3.tar.gz", hash = "sha256:db17271330c68c8c88d46d72349e24c147bb6f34ec82d8481a8f025c4d26589c"}, -] -nbconvert = [ - {file = "nbconvert-6.1.0-py3-none-any.whl", hash = "sha256:37cd92ff2ae6a268e62075ff8b16129e0be4939c4dfcee53dc77cc8a7e06c684"}, - {file = "nbconvert-6.1.0.tar.gz", hash = "sha256:d22a8ff202644d31db254d24d52c3a96c82156623fcd7c7f987bba2612303ec9"}, -] -nbformat = [ - {file = "nbformat-5.1.3-py3-none-any.whl", hash = "sha256:eb8447edd7127d043361bc17f2f5a807626bc8e878c7709a1c647abda28a9171"}, - {file = "nbformat-5.1.3.tar.gz", hash = "sha256:b516788ad70771c6250977c1374fcca6edebe6126fd2adb5a69aa5c2356fd1c8"}, -] -nest-asyncio = [ - {file = "nest_asyncio-1.5.1-py3-none-any.whl", hash = "sha256:76d6e972265063fe92a90b9cc4fb82616e07d586b346ed9d2c89a4187acea39c"}, - {file = "nest_asyncio-1.5.1.tar.gz", hash = "sha256:afc5a1c515210a23c461932765691ad39e8eba6551c055ac8d5546e69250d0aa"}, -] -networkx = [ - {file = "networkx-2.6.2-py3-none-any.whl", hash = "sha256:5fcb7004be69e8fbdf07dcb502efa5c77cadcaad6982164134eeb9721f826c2e"}, - {file = "networkx-2.6.2.tar.gz", hash = 
"sha256:2306f1950ce772c5a59a57f5486d59bb9cab98497c45fc49cbc45ac0dec119bb"}, -] -nltk = [ - {file = "nltk-3.6.2-py3-none-any.whl", hash = "sha256:240e23ab1ab159ef9940777d30c7c72d7e76d91877099218a7585370c11f6b9e"}, - {file = "nltk-3.6.2.zip", hash = "sha256:57d556abed621ab9be225cc6d2df1edce17572efb67a3d754630c9f8381503eb"}, -] -nodeenv = [ - {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"}, - {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"}, -] -notebook = [ - {file = "notebook-6.4.3-py3-none-any.whl", hash = "sha256:b50eafa8208d5db966efd1caa4076b4dfc51815e02a805b32ecd717e9e6cc071"}, - {file = "notebook-6.4.3.tar.gz", hash = "sha256:e6b6dfed36b00cf950f63c0d42e947c101d4258aec21624de62b9e0c11ed5c0d"}, -] -numba = [ - {file = "numba-0.51.2-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:af798310eeb318c56cdb83254abbe9a938cc0182d08671d7f9f032dc817e064d"}, - {file = "numba-0.51.2-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:93e18350f2094e7432321c1275730a3143b94af012fb609cc180fa376c44867f"}, - {file = "numba-0.51.2-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:9e2bb1f129bfadd757ad7a9c18ab79c3ab25ce6d6a68e58565d6c52ad07b3566"}, - {file = "numba-0.51.2-cp36-cp36m-win32.whl", hash = "sha256:31cdf6b6d1301d5fb6c4fcb8b4c711ba5c9f60ba2fca008b550da9b56185367c"}, - {file = "numba-0.51.2-cp36-cp36m-win_amd64.whl", hash = "sha256:df6edca13c04a31fdb5addf5205199478a7da372712829157ef491e8a6e7031f"}, - {file = "numba-0.51.2-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:a628122dacfcba9a3ea68a9e95578c6b6391016e34962c46550ea8e189e0412e"}, - {file = "numba-0.51.2-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:106736d5a8dab6bebce989d4ab1b3f169c264582598f172e6e5b736210d2e834"}, - {file = "numba-0.51.2-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:a12f16fdb4ca5edc94e2ef412e4e768c29217ef9b6fdfc237d064ebe30acfe14"}, - 
{file = "numba-0.51.2-cp37-cp37m-win32.whl", hash = "sha256:025b033fd31c44bba17802293c81270084b5454b5b055b8c10c394385c232f00"}, - {file = "numba-0.51.2-cp37-cp37m-win_amd64.whl", hash = "sha256:081788f584fa500339e9b74bf02e3c5029d408c114e555ada19cae0b92721416"}, - {file = "numba-0.51.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:5416b584183fd599afda11b947b64f89450fcf26a9c15b408167f412b98a3a94"}, - {file = "numba-0.51.2-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:05da65dca2ac28a192c9d8f20e9e477eb1237205cfc4d131c414f5f8092c6639"}, - {file = "numba-0.51.2-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:aee435e3b7e465dd49971f8ea76aa414532a87736916cb399534e017334d1138"}, - {file = "numba-0.51.2-cp38-cp38-win32.whl", hash = "sha256:bbbe2432433b11d3fadab0226a84c1a81918cb905ba1aeb022249e8d2ba8856c"}, - {file = "numba-0.51.2-cp38-cp38-win_amd64.whl", hash = "sha256:259e7c15b24feec4a99fb41eb8c47b5ad49b544d1a5ad40ad0252ef531ba06fd"}, - {file = "numba-0.51.2.tar.gz", hash = "sha256:16bd59572114adbf5f600ea383880d7b2071ae45477e84a24994e089ea390768"}, -] -numpy = [ - {file = "numpy-1.21.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38e8648f9449a549a7dfe8d8755a5979b45b3538520d1e735637ef28e8c2dc50"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fd7d7409fa643a91d0a05c7554dd68aa9c9bb16e186f6ccfe40d6e003156e33a"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a75b4498b1e93d8b700282dc8e655b8bd559c0904b3910b144646dbbbc03e062"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1412aa0aec3e00bc23fbb8664d76552b4efde98fb71f60737c83efbac24112f1"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e46ceaff65609b5399163de5893d8f2a82d3c77d5e56d976c8b5fb01faa6b671"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:c6a2324085dd52f96498419ba95b5777e40b6bcbc20088fddb9e8cbb58885e8e"}, - {file = "numpy-1.21.1-cp37-cp37m-win32.whl", hash = "sha256:73101b2a1fef16602696d133db402a7e7586654682244344b8329cdcbbb82172"}, - {file = "numpy-1.21.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7a708a79c9a9d26904d1cca8d383bf869edf6f8e7650d85dbc77b041e8c5a0f8"}, - {file = "numpy-1.21.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95b995d0c413f5d0428b3f880e8fe1660ff9396dcd1f9eedbc311f37b5652e16"}, - {file = "numpy-1.21.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:635e6bd31c9fb3d475c8f44a089569070d10a9ef18ed13738b03049280281267"}, - {file = "numpy-1.21.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4a3d5fb89bfe21be2ef47c0614b9c9c707b7362386c9a3ff1feae63e0267ccb6"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a326af80e86d0e9ce92bcc1e65c8ff88297de4fa14ee936cb2293d414c9ec63"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:791492091744b0fe390a6ce85cc1bf5149968ac7d5f0477288f78c89b385d9af"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0318c465786c1f63ac05d7c4dbcecd4d2d7e13f0959b01b534ea1e92202235c5"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a513bd9c1551894ee3d31369f9b07460ef223694098cf27d399513415855b68"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:91c6f5fc58df1e0a3cc0c3a717bb3308ff850abdaa6d2d802573ee2b11f674a8"}, - {file = "numpy-1.21.1-cp38-cp38-win32.whl", hash = "sha256:978010b68e17150db8765355d1ccdd450f9fc916824e8c4e35ee620590e234cd"}, - {file = "numpy-1.21.1-cp38-cp38-win_amd64.whl", hash = "sha256:9749a40a5b22333467f02fe11edc98f022133ee1bfa8ab99bda5e5437b831214"}, - {file = "numpy-1.21.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d7a4aeac3b94af92a9373d6e77b37691b86411f9745190d2c351f410ab3a791f"}, - 
{file = "numpy-1.21.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d9e7912a56108aba9b31df688a4c4f5cb0d9d3787386b87d504762b6754fbb1b"}, - {file = "numpy-1.21.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:25b40b98ebdd272bc3020935427a4530b7d60dfbe1ab9381a39147834e985eac"}, - {file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a92c5aea763d14ba9d6475803fc7904bda7decc2a0a68153f587ad82941fec1"}, - {file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:05a0f648eb28bae4bcb204e6fd14603de2908de982e761a2fc78efe0f19e96e1"}, - {file = "numpy-1.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f01f28075a92eede918b965e86e8f0ba7b7797a95aa8d35e1cc8821f5fc3ad6a"}, - {file = "numpy-1.21.1-cp39-cp39-win32.whl", hash = "sha256:88c0b89ad1cc24a5efbb99ff9ab5db0f9a86e9cc50240177a571fbe9c2860ac2"}, - {file = "numpy-1.21.1-cp39-cp39-win_amd64.whl", hash = "sha256:01721eefe70544d548425a07c80be8377096a54118070b8a62476866d5208e33"}, - {file = "numpy-1.21.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d4d1de6e6fb3d28781c73fbde702ac97f03d79e4ffd6598b880b2d95d62ead4"}, - {file = "numpy-1.21.1.zip", hash = "sha256:dff4af63638afcc57a3dfb9e4b26d434a7a602d225b42d746ea7fe2edf1342fd"}, -] -opencv-python = [ - {file = "opencv-python-4.5.3.56.tar.gz", hash = "sha256:3c001d3feec7f3140f1fb78dfc52ca28122db8240826882d175a208a89d2731b"}, - {file = "opencv_python-4.5.3.56-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:9a78558b5ae848386edbb843c761e5fed5a8480be9af16274a5a78838529edeb"}, - {file = "opencv_python-4.5.3.56-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:8d3282138f3a8646941089aae142684910ebe40776266448eab5f4bb609fc63f"}, - {file = "opencv_python-4.5.3.56-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:881f3d85269500e0c7d72b140a6ebb5c14a089f8140fb9da7ce01f12a245858e"}, - {file = "opencv_python-4.5.3.56-cp36-cp36m-win32.whl", hash 
= "sha256:f1bda4d144f5204e077ca4571453ebb2015e5748d5e0043386c92c2bbf7f52eb"}, - {file = "opencv_python-4.5.3.56-cp36-cp36m-win_amd64.whl", hash = "sha256:6763729fcfee2a08e069aa1982c9a8c1abf55b9cdf2fb9640eda1d85bdece19a"}, - {file = "opencv_python-4.5.3.56-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:68813b720b88e4951e84399b9a8a7b532d45a07a96ea8f539636242f862e32e0"}, - {file = "opencv_python-4.5.3.56-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:c360cb76ad1ddbd5d2d3e730b42f2ff6e4be08ea6f4a6eefacca175d27467e8f"}, - {file = "opencv_python-4.5.3.56-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:437f30e300725e1d1b3744dbfbc66a523a4744792b58f3dbe1e9140c8f4dfba5"}, - {file = "opencv_python-4.5.3.56-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:e42c644a70d5c54f53a4b114dbd88b4eb83f42a9ca998f07bd5682f3f404efcc"}, - {file = "opencv_python-4.5.3.56-cp37-cp37m-win32.whl", hash = "sha256:f3ac2355217114a683f3f72a9c40a5890914a59c4a2df62e4083c66ff65c9cf9"}, - {file = "opencv_python-4.5.3.56-cp37-cp37m-win_amd64.whl", hash = "sha256:7f41b97d84ac66bdf13cb4d9f4dad3e159525ba1e3f421e670c787ce536eb70a"}, - {file = "opencv_python-4.5.3.56-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:cdc3363c2911d7cfc6c9f55308c51c2841a7aecbf0bf5e791499d220ce89d880"}, - {file = "opencv_python-4.5.3.56-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:18a4a14015eee30d9cd514db8cdefbf594b1d5c234762d27abe512d62a333bc3"}, - {file = "opencv_python-4.5.3.56-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:05c5139d620e8d02f7ce0921796d55736fa19fa15e2ec00a388db2eb1ae1e9a1"}, - {file = "opencv_python-4.5.3.56-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:831b92fe63ce18dd628f71104da7e60596658b75e2fa16b83aefa3eb10c115e2"}, - {file = "opencv_python-4.5.3.56-cp38-cp38-win32.whl", hash = "sha256:e1f54736272830a1e895cedf7a4ee67737e31e966d380c82a81ef22515d043a3"}, - {file = "opencv_python-4.5.3.56-cp38-cp38-win_amd64.whl", hash = 
"sha256:b42bbba9f5421865377c7960bd4f3dd881003b322a6bf46ed2302b89224d102b"}, - {file = "opencv_python-4.5.3.56-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:5366fcd6eae4243add3c8c92142045850f1db8e464bcf0b75313e1596b2e3671"}, - {file = "opencv_python-4.5.3.56-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:54c64e86a087841869901fd34462bb6bec01cd4652800fdf5d92fe7b0596c82f"}, - {file = "opencv_python-4.5.3.56-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:8852be06c0749fef0d9c58f532bbcb0570968c59e41cf56b90f5c92593c6e108"}, - {file = "opencv_python-4.5.3.56-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:8b5bc61be7fc8565140b746288b370a4bfdb4edb9d680b66bb914e7690485db1"}, - {file = "opencv_python-4.5.3.56-cp39-cp39-win32.whl", hash = "sha256:085232718f28bddd265da480874c37db5c7354cb08f23f4a68a8639b16276a89"}, - {file = "opencv_python-4.5.3.56-cp39-cp39-win_amd64.whl", hash = "sha256:205a73adb29c37e42475645519e612e843a985475da993d10b4d5daa6afec36a"}, -] -packaging = [ - {file = "packaging-21.0-py3-none-any.whl", hash = "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"}, - {file = "packaging-21.0.tar.gz", hash = "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7"}, -] -pandas = [ - {file = "pandas-1.3.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1ee8418d0f936ff2216513aa03e199657eceb67690995d427a4a7ecd2e68f442"}, - {file = "pandas-1.3.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d9acfca191140a518779d1095036d842d5e5bc8e8ad8b5eaad1aff90fe1870d"}, - {file = "pandas-1.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e323028ab192fcfe1e8999c012a0fa96d066453bb354c7e7a4a267b25e73d3c8"}, - {file = "pandas-1.3.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9d06661c6eb741ae633ee1c57e8c432bb4203024e263fe1a077fa3fda7817fdb"}, - {file = "pandas-1.3.1-cp37-cp37m-win32.whl", hash = 
"sha256:23c7452771501254d2ae23e9e9dac88417de7e6eff3ce64ee494bb94dc88c300"}, - {file = "pandas-1.3.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7150039e78a81eddd9f5a05363a11cadf90a4968aac6f086fd83e66cf1c8d1d6"}, - {file = "pandas-1.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5c09a2538f0fddf3895070579082089ff4ae52b6cb176d8ec7a4dacf7e3676c1"}, - {file = "pandas-1.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:905fc3e0fcd86b0a9f1f97abee7d36894698d2592b22b859f08ea5a8fe3d3aab"}, - {file = "pandas-1.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ee927c70794e875a59796fab8047098aa59787b1be680717c141cd7873818ae"}, - {file = "pandas-1.3.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c976e023ed580e60a82ccebdca8e1cc24d8b1fbb28175eb6521025c127dab66"}, - {file = "pandas-1.3.1-cp38-cp38-win32.whl", hash = "sha256:22f3fcc129fb482ef44e7df2a594f0bd514ac45aabe50da1a10709de1b0f9d84"}, - {file = "pandas-1.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:45656cd59ae9745a1a21271a62001df58342b59c66d50754390066db500a8362"}, - {file = "pandas-1.3.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:114c6789d15862508900a25cb4cb51820bfdd8595ea306bab3b53cd19f990b65"}, - {file = "pandas-1.3.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:527c43311894aff131dea99cf418cd723bfd4f0bcf3c3da460f3b57e52a64da5"}, - {file = "pandas-1.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb3b33dde260b1766ea4d3c6b8fbf6799cee18d50a2a8bc534cf3550b7c819a"}, - {file = "pandas-1.3.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c28760932283d2c9f6fa5e53d2f77a514163b9e67fd0ee0879081be612567195"}, - {file = "pandas-1.3.1-cp39-cp39-win32.whl", hash = "sha256:be12d77f7e03c40a2466ed00ccd1a5f20a574d3c622fe1516037faa31aa448aa"}, - {file = "pandas-1.3.1-cp39-cp39-win_amd64.whl", 
hash = "sha256:9e1fe6722cbe27eb5891c1977bca62d456c19935352eea64d33956db46139364"}, - {file = "pandas-1.3.1.tar.gz", hash = "sha256:341935a594db24f3ff07d1b34d1d231786aa9adfa84b76eab10bf42907c8aed3"}, -] -pandocfilters = [ - {file = "pandocfilters-1.4.3.tar.gz", hash = "sha256:bc63fbb50534b4b1f8ebe1860889289e8af94a23bff7445259592df25a3906eb"}, -] -parso = [ - {file = "parso-0.8.2-py2.py3-none-any.whl", hash = "sha256:a8c4922db71e4fdb90e0d0bc6e50f9b273d3397925e5e60a717e719201778d22"}, - {file = "parso-0.8.2.tar.gz", hash = "sha256:12b83492c6239ce32ff5eed6d3639d6a536170723c6f3f1506869f1ace413398"}, -] -pathy = [ - {file = "pathy-0.6.0-py3-none-any.whl", hash = "sha256:bffa0bd74c66575cf51c96d3ab312f34d08d6bff54aabb8c7a2b9f8b701fe6ef"}, - {file = "pathy-0.6.0.tar.gz", hash = "sha256:f83f1eddf77dd86e824143eef8d9adbe0785c3cdd5ec0ed6c0edea3227385048"}, -] -patsy = [ - {file = "patsy-0.5.1-py2.py3-none-any.whl", hash = "sha256:5465be1c0e670c3a965355ec09e9a502bf2c4cbe4875e8528b0221190a8a5d40"}, - {file = "patsy-0.5.1.tar.gz", hash = "sha256:f115cec4201e1465cd58b9866b0b0e7b941caafec129869057405bfe5b5e3991"}, -] -pdpbox = [ - {file = "PDPbox-0.2.0-py2-none-any.whl", hash = "sha256:def6840f5a3ada5d4269aced1e0b1244c417a471cf3ed87bf4c4f60ee4f64d2b"}, - {file = "PDPbox-0.2.0.tar.gz", hash = "sha256:2eae5a20004657f48ddd5b00f2fb74dd54f9de891c25ec7935a8fd471f9186f9"}, -] -pexpect = [ - {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, - {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, -] -pickleshare = [ - {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, - {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, -] -pillow = [ - {file = "Pillow-7.2.0-cp35-cp35m-macosx_10_10_intel.whl", hash = 
"sha256:1ca594126d3c4def54babee699c055a913efb01e106c309fa6b04405d474d5ae"}, - {file = "Pillow-7.2.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:c92302a33138409e8f1ad16731568c55c9053eee71bb05b6b744067e1b62380f"}, - {file = "Pillow-7.2.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:8dad18b69f710bf3a001d2bf3afab7c432785d94fcf819c16b5207b1cfd17d38"}, - {file = "Pillow-7.2.0-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:431b15cffbf949e89df2f7b48528be18b78bfa5177cb3036284a5508159492b5"}, - {file = "Pillow-7.2.0-cp35-cp35m-win32.whl", hash = "sha256:09d7f9e64289cb40c2c8d7ad674b2ed6105f55dc3b09aa8e4918e20a0311e7ad"}, - {file = "Pillow-7.2.0-cp35-cp35m-win_amd64.whl", hash = "sha256:0295442429645fa16d05bd567ef5cff178482439c9aad0411d3f0ce9b88b3a6f"}, - {file = "Pillow-7.2.0-cp36-cp36m-macosx_10_10_x86_64.whl", hash = "sha256:ec29604081f10f16a7aea809ad42e27764188fc258b02259a03a8ff7ded3808d"}, - {file = "Pillow-7.2.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:612cfda94e9c8346f239bf1a4b082fdd5c8143cf82d685ba2dba76e7adeeb233"}, - {file = "Pillow-7.2.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0a80dd307a5d8440b0a08bd7b81617e04d870e40a3e46a32d9c246e54705e86f"}, - {file = "Pillow-7.2.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:06aba4169e78c439d528fdeb34762c3b61a70813527a2c57f0540541e9f433a8"}, - {file = "Pillow-7.2.0-cp36-cp36m-win32.whl", hash = "sha256:f7e30c27477dffc3e85c2463b3e649f751789e0f6c8456099eea7ddd53be4a8a"}, - {file = "Pillow-7.2.0-cp36-cp36m-win_amd64.whl", hash = "sha256:ffe538682dc19cc542ae7c3e504fdf54ca7f86fb8a135e59dd6bc8627eae6cce"}, - {file = "Pillow-7.2.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:94cf49723928eb6070a892cb39d6c156f7b5a2db4e8971cb958f7b6b104fb4c4"}, - {file = "Pillow-7.2.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6edb5446f44d901e8683ffb25ebdfc26988ee813da3bf91e12252b57ac163727"}, - {file = "Pillow-7.2.0-cp37-cp37m-manylinux1_x86_64.whl", hash = 
"sha256:52125833b070791fcb5710fabc640fc1df07d087fc0c0f02d3661f76c23c5b8b"}, - {file = "Pillow-7.2.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:9ad7f865eebde135d526bb3163d0b23ffff365cf87e767c649550964ad72785d"}, - {file = "Pillow-7.2.0-cp37-cp37m-win32.whl", hash = "sha256:c79f9c5fb846285f943aafeafda3358992d64f0ef58566e23484132ecd8d7d63"}, - {file = "Pillow-7.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d350f0f2c2421e65fbc62690f26b59b0bcda1b614beb318c81e38647e0f673a1"}, - {file = "Pillow-7.2.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:6d7741e65835716ceea0fd13a7d0192961212fd59e741a46bbed7a473c634ed6"}, - {file = "Pillow-7.2.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:edf31f1150778abd4322444c393ab9c7bd2af271dd4dafb4208fb613b1f3cdc9"}, - {file = "Pillow-7.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:d08b23fdb388c0715990cbc06866db554e1822c4bdcf6d4166cf30ac82df8c41"}, - {file = "Pillow-7.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:5e51ee2b8114def244384eda1c82b10e307ad9778dac5c83fb0943775a653cd8"}, - {file = "Pillow-7.2.0-cp38-cp38-win32.whl", hash = "sha256:725aa6cfc66ce2857d585f06e9519a1cc0ef6d13f186ff3447ab6dff0a09bc7f"}, - {file = "Pillow-7.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:a060cf8aa332052df2158e5a119303965be92c3da6f2d93b6878f0ebca80b2f6"}, - {file = "Pillow-7.2.0-pp36-pypy36_pp73-macosx_10_10_x86_64.whl", hash = "sha256:9c87ef410a58dd54b92424ffd7e28fd2ec65d2f7fc02b76f5e9b2067e355ebf6"}, - {file = "Pillow-7.2.0-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:e901964262a56d9ea3c2693df68bc9860b8bdda2b04768821e4c44ae797de117"}, - {file = "Pillow-7.2.0-pp36-pypy36_pp73-win32.whl", hash = "sha256:25930fadde8019f374400f7986e8404c8b781ce519da27792cbe46eabec00c4d"}, - {file = "Pillow-7.2.0.tar.gz", hash = "sha256:97f9e7953a77d5a70f49b9a48da7776dc51e9b738151b22dacf101641594a626"}, -] -platformdirs = [ - {file = "platformdirs-2.2.0-py3-none-any.whl", hash = 
"sha256:4666d822218db6a262bdfdc9c39d21f23b4cfdb08af331a81e92751daf6c866c"}, - {file = "platformdirs-2.2.0.tar.gz", hash = "sha256:632daad3ab546bd8e6af0537d09805cec458dce201bccfe23012df73332e181e"}, -] -pre-commit = [ - {file = "pre_commit-2.14.0-py2.py3-none-any.whl", hash = "sha256:ec3045ae62e1aa2eecfb8e86fa3025c2e3698f77394ef8d2011ce0aedd85b2d4"}, - {file = "pre_commit-2.14.0.tar.gz", hash = "sha256:2386eeb4cf6633712c7cc9ede83684d53c8cafca6b59f79c738098b51c6d206c"}, -] -preshed = [ - {file = "preshed-3.0.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:572899224578d30f6a67fadecb3d62b824866b4d2b6bad73f71abf7585db1389"}, - {file = "preshed-3.0.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:67c11e384ce4c008bc487ba3a29bafdfe038b9a2546ccfe0fe2160480b356fed"}, - {file = "preshed-3.0.5-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:6e833f1632a1d0232bdc6df6c3542fb130ef044d8656b24576d9fd19e5f1e0d1"}, - {file = "preshed-3.0.5-cp36-cp36m-win_amd64.whl", hash = "sha256:1ce0846cb7ebb2ea913d44ec2e296098c285443ecdea80ddf02656bbef4deacb"}, - {file = "preshed-3.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8a560850b8c53c1487ba51c2b0f5769535512b36d3b129ad5796b64653abe2f9"}, - {file = "preshed-3.0.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:6f126bcc414a0304b54956f9dac2628a0f9bef1657d1b3a3837fc82b791aa2a1"}, - {file = "preshed-3.0.5-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:1bdededa7fd81f26a42bc9d11d542657c74746b7ea7fc2b2ca6d0ddbf1f93792"}, - {file = "preshed-3.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:9ebf444f8487782c84d7b5acb1d7195e603155882fafc4697344199eeeafbe5f"}, - {file = "preshed-3.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8a3adffde3126c2a0ab7d57cab1d605cb5f63da1ba88088ad3cf8debfd9aa4dc"}, - {file = "preshed-3.0.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:56b9603517bb2a364418163236d6a147a1d722ff7546cbe085e76e25ae118e89"}, - {file = "preshed-3.0.5-cp38-cp38-manylinux2014_x86_64.whl", hash = 
"sha256:5e06a49477bd257eea02bf823b5d3e201d00a19d6976523a58da8606b2358481"}, - {file = "preshed-3.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:ca4a7681b643b8356e7dfdab9cf668b2b34bd07ef4b09ebed44c8aeb3b1626ee"}, - {file = "preshed-3.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:85074eebf90a858a6b68242f1ae265ca99e1af45bf9dafcb9a83d49b0815a2e1"}, - {file = "preshed-3.0.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:12cbe1e378b4f1c6b06f5e4130408befe916e55ea1616e6aa63c5cd0ccd9c927"}, - {file = "preshed-3.0.5-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:30f0c8ea85113d0565a1e3eb6222d00513ec39b56f3f9a2615e304575e65422e"}, - {file = "preshed-3.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:fb4d2e82add82d63b2c97802b759a58ff200d06b632e2edc48a9ced1e6472faf"}, - {file = "preshed-3.0.5.tar.gz", hash = "sha256:c6d3dba39ed5059aaf99767017b9568c75b2d0780c3481e204b1daecde00360e"}, -] -prometheus-client = [ - {file = "prometheus_client-0.11.0-py2.py3-none-any.whl", hash = "sha256:b014bc76815eb1399da8ce5fc84b7717a3e63652b0c0f8804092c9363acab1b2"}, - {file = "prometheus_client-0.11.0.tar.gz", hash = "sha256:3a8baade6cb80bcfe43297e33e7623f3118d660d41387593758e2fb1ea173a86"}, -] -prompt-toolkit = [ - {file = "prompt_toolkit-3.0.19-py3-none-any.whl", hash = "sha256:7089d8d2938043508aa9420ec18ce0922885304cddae87fb96eebca942299f88"}, - {file = "prompt_toolkit-3.0.19.tar.gz", hash = "sha256:08360ee3a3148bdb5163621709ee322ec34fc4375099afa4bbf751e9b7b7fa4f"}, -] -psutil = [ - {file = "psutil-5.8.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:0066a82f7b1b37d334e68697faba68e5ad5e858279fd6351c8ca6024e8d6ba64"}, - {file = "psutil-5.8.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:0ae6f386d8d297177fd288be6e8d1afc05966878704dad9847719650e44fc49c"}, - {file = "psutil-5.8.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:12d844996d6c2b1d3881cfa6fa201fd635971869a9da945cf6756105af73d2df"}, - {file = "psutil-5.8.0-cp27-cp27mu-manylinux2010_i686.whl", hash = 
"sha256:02b8292609b1f7fcb34173b25e48d0da8667bc85f81d7476584d889c6e0f2131"}, - {file = "psutil-5.8.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:6ffe81843131ee0ffa02c317186ed1e759a145267d54fdef1bc4ea5f5931ab60"}, - {file = "psutil-5.8.0-cp27-none-win32.whl", hash = "sha256:ea313bb02e5e25224e518e4352af4bf5e062755160f77e4b1767dd5ccb65f876"}, - {file = "psutil-5.8.0-cp27-none-win_amd64.whl", hash = "sha256:5da29e394bdedd9144c7331192e20c1f79283fb03b06e6abd3a8ae45ffecee65"}, - {file = "psutil-5.8.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:74fb2557d1430fff18ff0d72613c5ca30c45cdbfcddd6a5773e9fc1fe9364be8"}, - {file = "psutil-5.8.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:74f2d0be88db96ada78756cb3a3e1b107ce8ab79f65aa885f76d7664e56928f6"}, - {file = "psutil-5.8.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:99de3e8739258b3c3e8669cb9757c9a861b2a25ad0955f8e53ac662d66de61ac"}, - {file = "psutil-5.8.0-cp36-cp36m-win32.whl", hash = "sha256:36b3b6c9e2a34b7d7fbae330a85bf72c30b1c827a4366a07443fc4b6270449e2"}, - {file = "psutil-5.8.0-cp36-cp36m-win_amd64.whl", hash = "sha256:52de075468cd394ac98c66f9ca33b2f54ae1d9bff1ef6b67a212ee8f639ec06d"}, - {file = "psutil-5.8.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c6a5fd10ce6b6344e616cf01cc5b849fa8103fbb5ba507b6b2dee4c11e84c935"}, - {file = "psutil-5.8.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:61f05864b42fedc0771d6d8e49c35f07efd209ade09a5afe6a5059e7bb7bf83d"}, - {file = "psutil-5.8.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:0dd4465a039d343925cdc29023bb6960ccf4e74a65ad53e768403746a9207023"}, - {file = "psutil-5.8.0-cp37-cp37m-win32.whl", hash = "sha256:1bff0d07e76114ec24ee32e7f7f8d0c4b0514b3fae93e3d2aaafd65d22502394"}, - {file = "psutil-5.8.0-cp37-cp37m-win_amd64.whl", hash = "sha256:fcc01e900c1d7bee2a37e5d6e4f9194760a93597c97fee89c4ae51701de03563"}, - {file = "psutil-5.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:6223d07a1ae93f86451d0198a0c361032c4c93ebd4bf6d25e2fb3edfad9571ef"}, - {file = "psutil-5.8.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d225cd8319aa1d3c85bf195c4e07d17d3cd68636b8fc97e6cf198f782f99af28"}, - {file = "psutil-5.8.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:28ff7c95293ae74bf1ca1a79e8805fcde005c18a122ca983abf676ea3466362b"}, - {file = "psutil-5.8.0-cp38-cp38-win32.whl", hash = "sha256:ce8b867423291cb65cfc6d9c4955ee9bfc1e21fe03bb50e177f2b957f1c2469d"}, - {file = "psutil-5.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:90f31c34d25b1b3ed6c40cdd34ff122b1887a825297c017e4cbd6796dd8b672d"}, - {file = "psutil-5.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6323d5d845c2785efb20aded4726636546b26d3b577aded22492908f7c1bdda7"}, - {file = "psutil-5.8.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:245b5509968ac0bd179287d91210cd3f37add77dad385ef238b275bad35fa1c4"}, - {file = "psutil-5.8.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:90d4091c2d30ddd0a03e0b97e6a33a48628469b99585e2ad6bf21f17423b112b"}, - {file = "psutil-5.8.0-cp39-cp39-win32.whl", hash = "sha256:ea372bcc129394485824ae3e3ddabe67dc0b118d262c568b4d2602a7070afdb0"}, - {file = "psutil-5.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:f4634b033faf0d968bb9220dd1c793b897ab7f1189956e1aa9eae752527127d3"}, - {file = "psutil-5.8.0.tar.gz", hash = "sha256:0c9ccb99ab76025f2f0bbecf341d4656e9c1351db8cc8a03ccd62e318ab4b5c6"}, -] -ptyprocess = [ - {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, - {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, -] -py = [ - {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, - {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, -] -pycparser = [ - {file = 
"pycparser-2.20-py2.py3-none-any.whl", hash = "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"}, - {file = "pycparser-2.20.tar.gz", hash = "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0"}, -] -pydantic = [ - {file = "pydantic-1.8.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:05ddfd37c1720c392f4e0d43c484217b7521558302e7069ce8d318438d297739"}, - {file = "pydantic-1.8.2-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:a7c6002203fe2c5a1b5cbb141bb85060cbff88c2d78eccbc72d97eb7022c43e4"}, - {file = "pydantic-1.8.2-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:589eb6cd6361e8ac341db97602eb7f354551482368a37f4fd086c0733548308e"}, - {file = "pydantic-1.8.2-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:10e5622224245941efc193ad1d159887872776df7a8fd592ed746aa25d071840"}, - {file = "pydantic-1.8.2-cp36-cp36m-win_amd64.whl", hash = "sha256:99a9fc39470010c45c161a1dc584997f1feb13f689ecf645f59bb4ba623e586b"}, - {file = "pydantic-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a83db7205f60c6a86f2c44a61791d993dff4b73135df1973ecd9eed5ea0bda20"}, - {file = "pydantic-1.8.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:41b542c0b3c42dc17da70554bc6f38cbc30d7066d2c2815a94499b5684582ecb"}, - {file = "pydantic-1.8.2-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:ea5cb40a3b23b3265f6325727ddfc45141b08ed665458be8c6285e7b85bd73a1"}, - {file = "pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:18b5ea242dd3e62dbf89b2b0ec9ba6c7b5abaf6af85b95a97b00279f65845a23"}, - {file = "pydantic-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:234a6c19f1c14e25e362cb05c68afb7f183eb931dd3cd4605eafff055ebbf287"}, - {file = "pydantic-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:021ea0e4133e8c824775a0cfe098677acf6fa5a3cbf9206a376eed3fc09302cd"}, - {file = "pydantic-1.8.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e710876437bc07bd414ff453ac8ec63d219e7690128d925c6e82889d674bb505"}, - {file = 
"pydantic-1.8.2-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:ac8eed4ca3bd3aadc58a13c2aa93cd8a884bcf21cb019f8cfecaae3b6ce3746e"}, - {file = "pydantic-1.8.2-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:4a03cbbe743e9c7247ceae6f0d8898f7a64bb65800a45cbdc52d65e370570820"}, - {file = "pydantic-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:8621559dcf5afacf0069ed194278f35c255dc1a1385c28b32dd6c110fd6531b3"}, - {file = "pydantic-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8b223557f9510cf0bfd8b01316bf6dd281cf41826607eada99662f5e4963f316"}, - {file = "pydantic-1.8.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:244ad78eeb388a43b0c927e74d3af78008e944074b7d0f4f696ddd5b2af43c62"}, - {file = "pydantic-1.8.2-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:05ef5246a7ffd2ce12a619cbb29f3307b7c4509307b1b49f456657b43529dc6f"}, - {file = "pydantic-1.8.2-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:54cd5121383f4a461ff7644c7ca20c0419d58052db70d8791eacbbe31528916b"}, - {file = "pydantic-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:4be75bebf676a5f0f87937c6ddb061fa39cbea067240d98e298508c1bda6f3f3"}, - {file = "pydantic-1.8.2-py3-none-any.whl", hash = "sha256:fec866a0b59f372b7e776f2d7308511784dace622e0992a0b59ea3ccee0ae833"}, - {file = "pydantic-1.8.2.tar.gz", hash = "sha256:26464e57ccaafe72b7ad156fdaa4e9b9ef051f69e175dbbb463283000c05ab7b"}, -] -pydotplus = [ - {file = "pydotplus-2.0.2.tar.gz", hash = "sha256:91e85e9ee9b85d2391ead7d635e3d9c7f5f44fd60a60e59b13e2403fa66505c4"}, -] -pyglet = [ - {file = "pyglet-1.5.15-py3-none-any.whl", hash = "sha256:4401cc176580e4e17e2df8bbf7536f27e691327dc3f38f209a12f1859c70aed2"}, - {file = "pyglet-1.5.15.zip", hash = "sha256:da9d8337388cedabf1f1c5dc21a45bb2b0e5327fba47f996c8573818c3dfa478"}, -] -pygments = [ - {file = "Pygments-2.9.0-py3-none-any.whl", hash = "sha256:d66e804411278594d764fc69ec36ec13d9ae9147193a1740cd34d272ca383b8e"}, - {file = "Pygments-2.9.0.tar.gz", hash = 
"sha256:a18f47b506a429f6f4b9df81bb02beab9ca21d0a5fee38ed15aef65f0545519f"}, -] -pyparsing = [ - {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, - {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, -] -pyrsistent = [ - {file = "pyrsistent-0.18.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f4c8cabb46ff8e5d61f56a037974228e978f26bfefce4f61a4b1ac0ba7a2ab72"}, - {file = "pyrsistent-0.18.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:da6e5e818d18459fa46fac0a4a4e543507fe1110e808101277c5a2b5bab0cd2d"}, - {file = "pyrsistent-0.18.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:5e4395bbf841693eaebaa5bb5c8f5cdbb1d139e07c975c682ec4e4f8126e03d2"}, - {file = "pyrsistent-0.18.0-cp36-cp36m-win32.whl", hash = "sha256:527be2bfa8dc80f6f8ddd65242ba476a6c4fb4e3aedbf281dfbac1b1ed4165b1"}, - {file = "pyrsistent-0.18.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2aaf19dc8ce517a8653746d98e962ef480ff34b6bc563fc067be6401ffb457c7"}, - {file = "pyrsistent-0.18.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:58a70d93fb79dc585b21f9d72487b929a6fe58da0754fa4cb9f279bb92369396"}, - {file = "pyrsistent-0.18.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:4916c10896721e472ee12c95cdc2891ce5890898d2f9907b1b4ae0f53588b710"}, - {file = "pyrsistent-0.18.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:73ff61b1411e3fb0ba144b8f08d6749749775fe89688093e1efef9839d2dcc35"}, - {file = "pyrsistent-0.18.0-cp37-cp37m-win32.whl", hash = "sha256:b29b869cf58412ca5738d23691e96d8aff535e17390128a1a52717c9a109da4f"}, - {file = "pyrsistent-0.18.0-cp37-cp37m-win_amd64.whl", hash = "sha256:097b96f129dd36a8c9e33594e7ebb151b1515eb52cceb08474c10a5479e799f2"}, - {file = "pyrsistent-0.18.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:772e94c2c6864f2cd2ffbe58bb3bdefbe2a32afa0acb1a77e472aac831f83427"}, - {file = 
"pyrsistent-0.18.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:c1a9ff320fa699337e05edcaae79ef8c2880b52720bc031b219e5b5008ebbdef"}, - {file = "pyrsistent-0.18.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cd3caef37a415fd0dae6148a1b6957a8c5f275a62cca02e18474608cb263640c"}, - {file = "pyrsistent-0.18.0-cp38-cp38-win32.whl", hash = "sha256:e79d94ca58fcafef6395f6352383fa1a76922268fa02caa2272fff501c2fdc78"}, - {file = "pyrsistent-0.18.0-cp38-cp38-win_amd64.whl", hash = "sha256:a0c772d791c38bbc77be659af29bb14c38ced151433592e326361610250c605b"}, - {file = "pyrsistent-0.18.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d5ec194c9c573aafaceebf05fc400656722793dac57f254cd4741f3c27ae57b4"}, - {file = "pyrsistent-0.18.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:6b5eed00e597b5b5773b4ca30bd48a5774ef1e96f2a45d105db5b4ebb4bca680"}, - {file = "pyrsistent-0.18.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:48578680353f41dca1ca3dc48629fb77dfc745128b56fc01096b2530c13fd426"}, - {file = "pyrsistent-0.18.0-cp39-cp39-win32.whl", hash = "sha256:f3ef98d7b76da5eb19c37fda834d50262ff9167c65658d1d8f974d2e4d90676b"}, - {file = "pyrsistent-0.18.0-cp39-cp39-win_amd64.whl", hash = "sha256:404e1f1d254d314d55adb8d87f4f465c8693d6f902f67eb6ef5b4526dc58e6ea"}, - {file = "pyrsistent-0.18.0.tar.gz", hash = "sha256:773c781216f8c2900b42a7b638d5b517bb134ae1acbebe4d1e8f1f41ea60eb4b"}, -] -python-dateutil = [ - {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, - {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, -] -pytorch-transformers = [ - {file = "pytorch_transformers-1.2.0-py2-none-any.whl", hash = "sha256:15f12a04424c0f6d3a7c7b57d6c79628dc9c117a204fb7db8c1ea330c77a6898"}, - {file = "pytorch_transformers-1.2.0-py3-none-any.whl", hash = "sha256:bdb606fe1f2d27586710ed03cfa49dbbd80215c38bf965862daada0c137fd7ce"}, - {file = 
"pytorch_transformers-1.2.0.tar.gz", hash = "sha256:293e4a864ae9d9401f9fba13f16b8696e4a1cb38bcd0b56562d03af5489daeb9"}, -] -pytz = [ - {file = "pytz-2021.1-py2.py3-none-any.whl", hash = "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"}, - {file = "pytz-2021.1.tar.gz", hash = "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da"}, -] -pywavelets = [ - {file = "PyWavelets-1.1.1-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:35959c041ec014648575085a97b498eafbbaa824f86f6e4a59bfdef8a3fe6308"}, - {file = "PyWavelets-1.1.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:55e39ec848ceec13c9fa1598253ae9dd5c31d09dfd48059462860d2b908fb224"}, - {file = "PyWavelets-1.1.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:c06d2e340c7bf8b9ec71da2284beab8519a3908eab031f4ea126e8ccfc3fd567"}, - {file = "PyWavelets-1.1.1-cp35-cp35m-win32.whl", hash = "sha256:be105382961745f88d8196bba5a69ee2c4455d87ad2a2e5d1eed6bd7fda4d3fd"}, - {file = "PyWavelets-1.1.1-cp35-cp35m-win_amd64.whl", hash = "sha256:076ca8907001fdfe4205484f719d12b4a0262dfe6652fa1cfc3c5c362d14dc84"}, - {file = "PyWavelets-1.1.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:7947e51ca05489b85928af52a34fe67022ab5b81d4ae32a4109a99e883a0635e"}, - {file = "PyWavelets-1.1.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:9e2528823ccf5a0a1d23262dfefe5034dce89cd84e4e124dc553dfcdf63ebb92"}, - {file = "PyWavelets-1.1.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:80b924edbc012ded8aa8b91cb2fd6207fb1a9a3a377beb4049b8a07445cec6f0"}, - {file = "PyWavelets-1.1.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c2a799e79cee81a862216c47e5623c97b95f1abee8dd1f9eed736df23fb653fb"}, - {file = "PyWavelets-1.1.1-cp36-cp36m-win32.whl", hash = "sha256:d510aef84d9852653d079c84f2f81a82d5d09815e625f35c95714e7364570ad4"}, - {file = "PyWavelets-1.1.1-cp36-cp36m-win_amd64.whl", hash = "sha256:889d4c5c5205a9c90118c1980df526857929841df33e4cd1ff1eff77c6817a65"}, - {file = 
"PyWavelets-1.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:68b5c33741d26c827074b3d8f0251de1c3019bb9567b8d303eb093c822ce28f1"}, - {file = "PyWavelets-1.1.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:18a51b3f9416a2ae6e9a35c4af32cf520dd7895f2b69714f4aa2f4342fca47f9"}, - {file = "PyWavelets-1.1.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:cfe79844526dd92e3ecc9490b5031fca5f8ab607e1e858feba232b1b788ff0ea"}, - {file = "PyWavelets-1.1.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:2f7429eeb5bf9c7068002d0d7f094ed654c77a70ce5e6198737fd68ab85f8311"}, - {file = "PyWavelets-1.1.1-cp37-cp37m-win32.whl", hash = "sha256:720dbcdd3d91c6dfead79c80bf8b00a1d8aa4e5d551dc528c6d5151e4efc3403"}, - {file = "PyWavelets-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:bc5e87b72371da87c9bebc68e54882aada9c3114e640de180f62d5da95749cd3"}, - {file = "PyWavelets-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:98b2669c5af842a70cfab33a7043fcb5e7535a690a00cd251b44c9be0be418e5"}, - {file = "PyWavelets-1.1.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e02a0558e0c2ac8b8bbe6a6ac18c136767ec56b96a321e0dfde2173adfa5a504"}, - {file = "PyWavelets-1.1.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:6162dc0ae04669ea04b4b51420777b9ea2d30b0a9d02901b2a3b4d61d159c2e9"}, - {file = "PyWavelets-1.1.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:39c74740718e420d38c78ca4498568fa57976d78d5096277358e0fa9629a7aea"}, - {file = "PyWavelets-1.1.1-cp38-cp38-win32.whl", hash = "sha256:79f5b54f9dc353e5ee47f0c3f02bebd2c899d49780633aa771fed43fa20b3149"}, - {file = "PyWavelets-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:935ff247b8b78bdf77647fee962b1cc208c51a7b229db30b9ba5f6da3e675178"}, - {file = "PyWavelets-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6ebfefebb5c6494a3af41ad8c60248a95da267a24b79ed143723d4502b1fe4d7"}, - {file = "PyWavelets-1.1.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:6bc78fb9c42a716309b4ace56f51965d8b5662c3ba19d4591749f31773db1125"}, - {file 
= "PyWavelets-1.1.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:411e17ca6ed8cf5e18a7ca5ee06a91c25800cc6c58c77986202abf98d749273a"}, - {file = "PyWavelets-1.1.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:83c5e3eb78ce111c2f0b45f46106cc697c3cb6c4e5f51308e1f81b512c70c8fb"}, - {file = "PyWavelets-1.1.1-cp39-cp39-win32.whl", hash = "sha256:2b634a54241c190ee989a4af87669d377b37c91bcc9cf0efe33c10ff847f7841"}, - {file = "PyWavelets-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:732bab78435c48be5d6bc75486ef629d7c8f112e07b313bf1f1a2220ab437277"}, - {file = "PyWavelets-1.1.1.tar.gz", hash = "sha256:1a64b40f6acb4ffbaccce0545d7fc641744f95351f62e4c6aaa40549326008c9"}, -] -pywin32 = [ - {file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"}, - {file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"}, - {file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"}, - {file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"}, - {file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"}, - {file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"}, - {file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"}, - {file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"}, - {file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"}, - {file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"}, 
-] -pywinpty = [ - {file = "pywinpty-1.1.3-cp36-none-win_amd64.whl", hash = "sha256:81dc6f16d917b756e06fc58943e9750d59dbefc0ffd2086871d3fa5f33824446"}, - {file = "pywinpty-1.1.3-cp37-none-win_amd64.whl", hash = "sha256:54557887e712ea3215ab0d9f089ed55a6cc8d826cd5d1e340d75300654c9663f"}, - {file = "pywinpty-1.1.3-cp38-none-win_amd64.whl", hash = "sha256:f5e25197397f1fef0362caf3eb89f25441827a1e48bf15827c27021592fd2160"}, - {file = "pywinpty-1.1.3-cp39-none-win_amd64.whl", hash = "sha256:b767276224f86b7560eb9173ba7956758cafcdfab97bb33837d42d2a0f1dbf67"}, - {file = "pywinpty-1.1.3.tar.gz", hash = "sha256:3a1d57b338390333812a5eed31c93c7d8ba82b131078063703e731946d90c9f2"}, -] -pyyaml = [ - {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"}, - {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"}, - {file = "PyYAML-5.4.1-cp27-cp27m-win_amd64.whl", hash = "sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8"}, - {file = "PyYAML-5.4.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185"}, - {file = "PyYAML-5.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253"}, - {file = "PyYAML-5.4.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc"}, - {file = "PyYAML-5.4.1-cp36-cp36m-win32.whl", hash = "sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5"}, - {file = "PyYAML-5.4.1-cp36-cp36m-win_amd64.whl", hash = "sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df"}, - {file = "PyYAML-5.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018"}, - {file = "PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl", hash = 
"sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63"}, - {file = "PyYAML-5.4.1-cp37-cp37m-win32.whl", hash = "sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b"}, - {file = "PyYAML-5.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf"}, - {file = "PyYAML-5.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46"}, - {file = "PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb"}, - {file = "PyYAML-5.4.1-cp38-cp38-win32.whl", hash = "sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc"}, - {file = "PyYAML-5.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696"}, - {file = "PyYAML-5.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77"}, - {file = "PyYAML-5.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183"}, - {file = "PyYAML-5.4.1-cp39-cp39-win32.whl", hash = "sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10"}, - {file = "PyYAML-5.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db"}, - {file = "PyYAML-5.4.1.tar.gz", hash = "sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e"}, -] -pyzmq = [ - {file = "pyzmq-22.2.1-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:d60a407663b7c2af781ab7f49d94a3d379dd148bb69ea8d9dd5bc69adf18097c"}, - {file = "pyzmq-22.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:631f932fb1fa4b76f31adf976f8056519bc6208a3c24c184581c3dd5be15066e"}, - {file = "pyzmq-22.2.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:0471d634c7fe48ff7d3849798da6c16afc71676dd890b5ae08eb1efe735c6fec"}, - {file = "pyzmq-22.2.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f520e9fee5d7a2e09b051d924f85b977c6b4e224e56c0551c3c241bbeeb0ad8d"}, - {file = "pyzmq-22.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1b6619ceb33a8907f1cb82ff8afc8a133e7a5f16df29528e919734718600426"}, - {file = "pyzmq-22.2.1-cp310-cp310-win32.whl", hash = "sha256:31c5dfb6df5148789835128768c01bf6402eb753d06f524f12f6786caf96fb44"}, - {file = "pyzmq-22.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:4842a8263cbaba6fce401bbe4e2b125321c401a01714e42624dabc554bfc2629"}, - {file = "pyzmq-22.2.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b921758f8b5098faa85f341bbdd5e36d5339de5e9032ca2b07d8c8e7bec5069b"}, - {file = "pyzmq-22.2.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:240b83b3a8175b2f616f80092cbb019fcd5c18598f78ffc6aa0ae9034b300f14"}, - {file = "pyzmq-22.2.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:da7f7f3bb08bcf59a6b60b4e53dd8f08bb00c9e61045319d825a906dbb3c8fb7"}, - {file = "pyzmq-22.2.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:e66025b64c4724ba683d6d4a4e5ee23de12fe9ae683908f0c7f0f91b4a2fd94e"}, - {file = "pyzmq-22.2.1-cp36-cp36m-win32.whl", hash = "sha256:50d007d5702171bc810c1e74498fa2c7bc5b50f9750697f7fd2a3e71a25aad91"}, - {file = "pyzmq-22.2.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b4a51c7d906dc263a0cc5590761e53e0a68f2c2fefe549cbef21c9ee5d2d98a4"}, - {file = "pyzmq-22.2.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:93705cb90baa9d6f75e8448861a1efd3329006f79095ab18846bd1eaa342f7c3"}, - {file = "pyzmq-22.2.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:620b0abb813958cb3ecb5144c177e26cde92fee6f43c4b9de6b329515532bf27"}, - {file = "pyzmq-22.2.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:2dd3896b3c952cf6c8013deda53c1df16bf962f355b5503d23521e0f6403ae3d"}, - {file = "pyzmq-22.2.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6e9c030222893afa86881d7485d3e841969760a16004bd23e9a83cca28b42778"}, - {file = "pyzmq-22.2.1-cp37-cp37m-win32.whl", hash = "sha256:262f470e7acde18b7217aac78d19d2e29ced91a5afbeb7d98521ebf26461aa7e"}, - {file = "pyzmq-22.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:246f27b88722cfa729bb04881e94484e40b085720d728c1b05133b3f331b0b7b"}, - {file = "pyzmq-22.2.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0d17bac19e934e9f547a8811b7c2a32651a7840f38086b924e2e3dcb2fae5c3a"}, - {file = "pyzmq-22.2.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5933d1f4087de6e52906f72d92e1e4dcc630d371860b92c55d7f7a4b815a664c"}, - {file = "pyzmq-22.2.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ac4497e4b7d134ee53ce5532d9cc3b640d6e71806a55062984e0c99a2f88f465"}, - {file = "pyzmq-22.2.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:66375a6094af72a6098ed4403b15b4db6bf00013c6febc1baa832e7abda827f4"}, - {file = "pyzmq-22.2.1-cp38-cp38-win32.whl", hash = "sha256:b2c16d20bd0aef8e57bc9505fdd80ea0d6008020c3740accd96acf1b3d1b5347"}, - {file = "pyzmq-22.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:ff345d48940c834168f81fa1d4724675099f148f1ab6369748c4d712ed71bf7c"}, - {file = "pyzmq-22.2.1-cp39-cp39-macosx_10_15_universal2.whl", hash = "sha256:f5c84c5de9a773bbf8b22c51e28380999ea72e5e85b4db8edf5e69a7a0d4d9f9"}, - {file = "pyzmq-22.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2534a036b777f957bd6b89b55fb2136775ca2659fb0f1c85036ba78d17d86fd5"}, - {file = "pyzmq-22.2.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a649065413ba4eab92a783a7caa4de8ce14cf46ba8a2a09951426143f1298adb"}, - {file = "pyzmq-22.2.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:c9cb0bd3a3cb7ccad3caa1d7b0d18ba71ed3a4a3610028e506a4084371d4d223"}, - {file = "pyzmq-22.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4428302c389fffc0c9c07a78cad5376636b9d096f332acfe66b321ae9ff2c63"}, - {file = "pyzmq-22.2.1-cp39-cp39-win32.whl", hash = "sha256:6a5b4566f66d953601d0d47d4071897f550a265bafd52ebcad5ac7aad3838cbb"}, - {file = "pyzmq-22.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:89200ab6ef9081c72a04ed84c52a50b60dcb0655375aeedb40689bc7c934715e"}, - {file = "pyzmq-22.2.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed67df4eaa99a20d162d76655bda23160abdf8abf82a17f41dfd3962e608dbcc"}, - {file = "pyzmq-22.2.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:021e22a8c58ab294bd4b96448a2ca4e716e1d76600192ff84c33d71edb1fbd37"}, - {file = "pyzmq-22.2.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:200ac096cee5499964c90687306a7244b79ef891f773ed4cf15019fd1f3df330"}, - {file = "pyzmq-22.2.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:b3f57bee62e36be5c97712de32237c5589caee0d1154c2ad01a888accfae20bc"}, - {file = "pyzmq-22.2.1.tar.gz", hash = "sha256:6d18c76676771fd891ca8e0e68da0bbfb88e30129835c0ade748016adb3b6242"}, -] -regex = [ - {file = "regex-2021.8.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:8764a78c5464ac6bde91a8c87dd718c27c1cabb7ed2b4beaf36d3e8e390567f9"}, - {file = "regex-2021.8.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4551728b767f35f86b8e5ec19a363df87450c7376d7419c3cac5b9ceb4bce576"}, - {file = "regex-2021.8.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:577737ec3d4c195c4aef01b757905779a9e9aee608fa1cf0aec16b5576c893d3"}, - {file = "regex-2021.8.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c856ec9b42e5af4fe2d8e75970fcc3a2c15925cbcc6e7a9bcb44583b10b95e80"}, - {file = 
"regex-2021.8.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3835de96524a7b6869a6c710b26c90e94558c31006e96ca3cf6af6751b27dca1"}, - {file = "regex-2021.8.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cea56288eeda8b7511d507bbe7790d89ae7049daa5f51ae31a35ae3c05408531"}, - {file = "regex-2021.8.3-cp36-cp36m-win32.whl", hash = "sha256:a4eddbe2a715b2dd3849afbdeacf1cc283160b24e09baf64fa5675f51940419d"}, - {file = "regex-2021.8.3-cp36-cp36m-win_amd64.whl", hash = "sha256:57fece29f7cc55d882fe282d9de52f2f522bb85290555b49394102f3621751ee"}, - {file = "regex-2021.8.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a5c6dbe09aff091adfa8c7cfc1a0e83fdb8021ddb2c183512775a14f1435fe16"}, - {file = "regex-2021.8.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ff4a8ad9638b7ca52313d8732f37ecd5fd3c8e3aff10a8ccb93176fd5b3812f6"}, - {file = "regex-2021.8.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b63e3571b24a7959017573b6455e05b675050bbbea69408f35f3cb984ec54363"}, - {file = "regex-2021.8.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fbc20975eee093efa2071de80df7f972b7b35e560b213aafabcec7c0bd00bd8c"}, - {file = "regex-2021.8.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:14caacd1853e40103f59571f169704367e79fb78fac3d6d09ac84d9197cadd16"}, - {file = "regex-2021.8.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:bb350eb1060591d8e89d6bac4713d41006cd4d479f5e11db334a48ff8999512f"}, - {file = "regex-2021.8.3-cp37-cp37m-win32.whl", hash = "sha256:18fdc51458abc0a974822333bd3a932d4e06ba2a3243e9a1da305668bd62ec6d"}, - {file = "regex-2021.8.3-cp37-cp37m-win_amd64.whl", hash = 
"sha256:026beb631097a4a3def7299aa5825e05e057de3c6d72b139c37813bfa351274b"}, - {file = "regex-2021.8.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:16d9eaa8c7e91537516c20da37db975f09ac2e7772a0694b245076c6d68f85da"}, - {file = "regex-2021.8.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3905c86cc4ab6d71635d6419a6f8d972cab7c634539bba6053c47354fd04452c"}, - {file = "regex-2021.8.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:937b20955806381e08e54bd9d71f83276d1f883264808521b70b33d98e4dec5d"}, - {file = "regex-2021.8.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:28e8af338240b6f39713a34e337c3813047896ace09d51593d6907c66c0708ba"}, - {file = "regex-2021.8.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c09d88a07483231119f5017904db8f60ad67906efac3f1baa31b9b7f7cca281"}, - {file = "regex-2021.8.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:85f568892422a0e96235eb8ea6c5a41c8ccbf55576a2260c0160800dbd7c4f20"}, - {file = "regex-2021.8.3-cp38-cp38-win32.whl", hash = "sha256:bf6d987edd4a44dd2fa2723fca2790f9442ae4de2c8438e53fcb1befdf5d823a"}, - {file = "regex-2021.8.3-cp38-cp38-win_amd64.whl", hash = "sha256:8fe58d9f6e3d1abf690174fd75800fda9bdc23d2a287e77758dc0e8567e38ce6"}, - {file = "regex-2021.8.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7976d410e42be9ae7458c1816a416218364e06e162b82e42f7060737e711d9ce"}, - {file = "regex-2021.8.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9569da9e78f0947b249370cb8fadf1015a193c359e7e442ac9ecc585d937f08d"}, - {file = "regex-2021.8.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:459bbe342c5b2dec5c5223e7c363f291558bc27982ef39ffd6569e8c082bdc83"}, - {file = 
"regex-2021.8.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4f421e3cdd3a273bace013751c345f4ebeef08f05e8c10757533ada360b51a39"}, - {file = "regex-2021.8.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea212df6e5d3f60341aef46401d32fcfded85593af1d82b8b4a7a68cd67fdd6b"}, - {file = "regex-2021.8.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a3b73390511edd2db2d34ff09aa0b2c08be974c71b4c0505b4a048d5dc128c2b"}, - {file = "regex-2021.8.3-cp39-cp39-win32.whl", hash = "sha256:f35567470ee6dbfb946f069ed5f5615b40edcbb5f1e6e1d3d2b114468d505fc6"}, - {file = "regex-2021.8.3-cp39-cp39-win_amd64.whl", hash = "sha256:bfa6a679410b394600eafd16336b2ce8de43e9b13f7fb9247d84ef5ad2b45e91"}, - {file = "regex-2021.8.3.tar.gz", hash = "sha256:8935937dad2c9b369c3d932b0edbc52a62647c2afb2fafc0c280f14a8bf56a6a"}, -] -requests = [ - {file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"}, - {file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"}, -] -s3transfer = [ - {file = "s3transfer-0.5.0-py3-none-any.whl", hash = "sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"}, - {file = "s3transfer-0.5.0.tar.gz", hash = "sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c"}, -] -sacremoses = [ - {file = "sacremoses-0.0.45-py3-none-any.whl", hash = "sha256:fa93db44bc04542553ba6090818b892f603d02aa0d681e6c5c3023baf17e8564"}, - {file = "sacremoses-0.0.45.tar.gz", hash = "sha256:58176cc28391830789b763641d0f458819bebe88681dac72b41a19c0aedc07e9"}, -] -scikit-image = [ - {file = "scikit-image-0.18.2.tar.gz", hash = "sha256:32ff472355fbf8ab40a8e9ed685906c6c51a863f1ea8737882d26be9221631f3"}, - {file = "scikit_image-0.18.2-cp37-cp37m-macosx_10_9_x86_64.whl", 
hash = "sha256:6f6d0e79a91c62360708111951abb3a774cefac865902ea797c3b72d8ece6382"}, - {file = "scikit_image-0.18.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:49a9b48bc428d2d56aaefbc042fd79c67ebc908a1cbf542e9c863c49339ca496"}, - {file = "scikit_image-0.18.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:32620792e989beb2c3eb67eae38b59291be412be59ad3485ee0f67cb7b37c16f"}, - {file = "scikit_image-0.18.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:38efc3fd9023c7849175fc18b2cc96a08629da840b6100ec5038f487fba7d34e"}, - {file = "scikit_image-0.18.2-cp37-cp37m-win32.whl", hash = "sha256:5510b133999a45b2c8ed4c1b659fa0a1cf4ca0db949353d0f54fc6290dac4d5a"}, - {file = "scikit_image-0.18.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b3aa7230d84b12d8a4a9f0b65ee895603d27fe85366bf2b57929ba1cce2e8987"}, - {file = "scikit_image-0.18.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b21d65dcee453539fe70b5903edd8429ad9fe46233b049dd622368bad435f39e"}, - {file = "scikit_image-0.18.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:f61d65de826abe2f5f6c171c75d2bb93df56aa4a690d1bab5911412f49b9e768"}, - {file = "scikit_image-0.18.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:a4fdba1bdd883a8028ddff0b8fe8d43c8dd43360bdab6e1f40599fa210613f1d"}, - {file = "scikit_image-0.18.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8e62228a91b770fbe89d310e833f8797f14136b9635bb67d8b780f1b8cf237e6"}, - {file = "scikit_image-0.18.2-cp38-cp38-win32.whl", hash = "sha256:f80d16ce57e05af8e282620a23e90bb8886e5efa6eedcacb4da1c15293ba5e9a"}, - {file = "scikit_image-0.18.2-cp38-cp38-win_amd64.whl", hash = "sha256:6d576a8249114e6169ea1c2b05a22168745eedba90b06d5765368dbd59b27c7f"}, - {file = "scikit_image-0.18.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:278b0034c509d8b31a9b117837b3d45957dd3408e062ad0f2b24edeb1a460e91"}, - {file = "scikit_image-0.18.2-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:f9648093a0865150fc4ac9eaf02256afbf471a43216b0b6ee6585a4d57674563"}, - {file = 
"scikit_image-0.18.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:2c7e91fb3df5cc58cb13c39094a32bb2e990ced30b08ee34bf0976ff8a1ba579"}, - {file = "scikit_image-0.18.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:4f42a36e34a3e659dd5e0fe9c0b07f797f9b66680224a7f2545a564484574d78"}, - {file = "scikit_image-0.18.2-cp39-cp39-win32.whl", hash = "sha256:66ea3bc8f53efbaf751fdae472fe1cbc55ad5e4fadbf6d3a0a268dc7e34d83b6"}, - {file = "scikit_image-0.18.2-cp39-cp39-win_amd64.whl", hash = "sha256:74f7c5920c0b893608ef0d159a61a15e87aa9f31d2707d1ed6621a65233646cd"}, -] -scikit-learn = [ - {file = "scikit-learn-0.24.2.tar.gz", hash = "sha256:d14701a12417930392cd3898e9646cf5670c190b933625ebe7511b1f7d7b8736"}, - {file = "scikit_learn-0.24.2-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:d5bf9c863ba4717b3917b5227463ee06860fc43931dc9026747de416c0a10fee"}, - {file = "scikit_learn-0.24.2-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:5beaeb091071625e83f5905192d8aecde65ba2f26f8b6719845bbf586f7a04a1"}, - {file = "scikit_learn-0.24.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:06ffdcaaf81e2a3b1b50c3ac6842cfb13df2d8b737d61f64643ed61da7389cde"}, - {file = "scikit_learn-0.24.2-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:fec42690a2eb646b384eafb021c425fab48991587edb412d4db77acc358b27ce"}, - {file = "scikit_learn-0.24.2-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:5ff3e4e4cf7592d36541edec434e09fb8ab9ba6b47608c4ffe30c9038d301897"}, - {file = "scikit_learn-0.24.2-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:3cbd734e1aefc7c5080e6b6973fe062f97c26a1cdf1a991037ca196ce1c8f427"}, - {file = "scikit_learn-0.24.2-cp36-cp36m-win32.whl", hash = "sha256:f74429a07fedb36a03c159332b914e6de757176064f9fed94b5f79ebac07d913"}, - {file = "scikit_learn-0.24.2-cp36-cp36m-win_amd64.whl", hash = "sha256:dd968a174aa82f3341a615a033fa6a8169e9320cbb46130686562db132d7f1f0"}, - {file = "scikit_learn-0.24.2-cp37-cp37m-macosx_10_13_x86_64.whl", hash = 
"sha256:49ec0b1361da328da9bb7f1a162836028e72556356adeb53342f8fae6b450d47"}, - {file = "scikit_learn-0.24.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:f18c3ed484eeeaa43a0d45dc2efb4d00fc6542ccdcfa2c45d7b635096a2ae534"}, - {file = "scikit_learn-0.24.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:cdf24c1b9bbeb4936456b42ac5bd32c60bb194a344951acb6bfb0cddee5439a4"}, - {file = "scikit_learn-0.24.2-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:d177fe1ff47cc235942d628d41ee5b1c6930d8f009f1a451c39b5411e8d0d4cf"}, - {file = "scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:f3ec00f023d84526381ad0c0f2cff982852d035c921bbf8ceb994f4886c00c64"}, - {file = "scikit_learn-0.24.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:ae19ac105cf7ce8c205a46166992fdec88081d6e783ab6e38ecfbe45729f3c39"}, - {file = "scikit_learn-0.24.2-cp37-cp37m-win32.whl", hash = "sha256:f0ed4483c258fb23150e31b91ea7d25ff8495dba108aea0b0d4206a777705350"}, - {file = "scikit_learn-0.24.2-cp37-cp37m-win_amd64.whl", hash = "sha256:39b7e3b71bcb1fe46397185d6c1a5db1c441e71c23c91a31e7ad8cc3f7305f9a"}, - {file = "scikit_learn-0.24.2-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:90a297330f608adeb4d2e9786c6fda395d3150739deb3d42a86d9a4c2d15bc1d"}, - {file = "scikit_learn-0.24.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:f1d2108e770907540b5248977e4cff9ffaf0f73d0d13445ee938df06ca7579c6"}, - {file = "scikit_learn-0.24.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:1eec963fe9ffc827442c2e9333227c4d49749a44e592f305398c1db5c1563393"}, - {file = "scikit_learn-0.24.2-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:2db429090b98045d71218a9ba913cc9b3fe78e0ba0b6b647d8748bc6d5a44080"}, - {file = "scikit_learn-0.24.2-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:62214d2954377fcf3f31ec867dd4e436df80121e7a32947a0b3244f58f45e455"}, - {file = "scikit_learn-0.24.2-cp38-cp38-manylinux2014_aarch64.whl", hash = 
"sha256:8fac72b9688176922f9f54fda1ba5f7ffd28cbeb9aad282760186e8ceba9139a"}, - {file = "scikit_learn-0.24.2-cp38-cp38-win32.whl", hash = "sha256:ae426e3a52842c6b6d77d00f906b6031c8c2cfdfabd6af7511bb4bc9a68d720e"}, - {file = "scikit_learn-0.24.2-cp38-cp38-win_amd64.whl", hash = "sha256:038f4e9d6ef10e1f3fe82addc3a14735c299866eb10f2c77c090410904828312"}, - {file = "scikit_learn-0.24.2-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:48f273836e19901ba2beecd919f7b352f09310ce67c762f6e53bc6b81cacf1f0"}, - {file = "scikit_learn-0.24.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:a2a47449093dcf70babc930beba2ca0423cb7df2fa5fd76be5260703d67fa574"}, - {file = "scikit_learn-0.24.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:0e71ce9c7cbc20f6f8b860107ce15114da26e8675238b4b82b7e7cd37ca0c087"}, - {file = "scikit_learn-0.24.2-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2754c85b2287333f9719db7f23fb7e357f436deed512db3417a02bf6f2830aa5"}, - {file = "scikit_learn-0.24.2-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:7be1b88c23cfac46e06404582215a917017cd2edaa2e4d40abe6aaff5458f24b"}, - {file = "scikit_learn-0.24.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:4e6198675a6f9d333774671bd536668680eea78e2e81c0b19e57224f58d17f37"}, - {file = "scikit_learn-0.24.2-cp39-cp39-win32.whl", hash = "sha256:cbdb0b3db99dd1d5f69d31b4234367d55475add31df4d84a3bd690ef017b55e2"}, - {file = "scikit_learn-0.24.2-cp39-cp39-win_amd64.whl", hash = "sha256:40556bea1ef26ef54bc678d00cf138a63069144a0b5f3a436eecd8f3468b903e"}, -] -scipy = [ - {file = "scipy-1.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a15a1f3fc0abff33e792d6049161b7795909b40b97c6cc2934ed54384017ab76"}, - {file = "scipy-1.6.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e79570979ccdc3d165456dd62041d9556fb9733b86b4b6d818af7a0afc15f092"}, - {file = "scipy-1.6.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:a423533c55fec61456dedee7b6ee7dce0bb6bfa395424ea374d25afa262be261"}, - {file = 
"scipy-1.6.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:33d6b7df40d197bdd3049d64e8e680227151673465e5d85723b3b8f6b15a6ced"}, - {file = "scipy-1.6.1-cp37-cp37m-win32.whl", hash = "sha256:6725e3fbb47da428794f243864f2297462e9ee448297c93ed1dcbc44335feb78"}, - {file = "scipy-1.6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:5fa9c6530b1661f1370bcd332a1e62ca7881785cc0f80c0d559b636567fab63c"}, - {file = "scipy-1.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bd50daf727f7c195e26f27467c85ce653d41df4358a25b32434a50d8870fc519"}, - {file = "scipy-1.6.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:f46dd15335e8a320b0fb4685f58b7471702234cba8bb3442b69a3e1dc329c345"}, - {file = "scipy-1.6.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0e5b0ccf63155d90da576edd2768b66fb276446c371b73841e3503be1d63fb5d"}, - {file = "scipy-1.6.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:2481efbb3740977e3c831edfd0bd9867be26387cacf24eb5e366a6a374d3d00d"}, - {file = "scipy-1.6.1-cp38-cp38-win32.whl", hash = "sha256:68cb4c424112cd4be886b4d979c5497fba190714085f46b8ae67a5e4416c32b4"}, - {file = "scipy-1.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:5f331eeed0297232d2e6eea51b54e8278ed8bb10b099f69c44e2558c090d06bf"}, - {file = "scipy-1.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0c8a51d33556bf70367452d4d601d1742c0e806cd0194785914daf19775f0e67"}, - {file = "scipy-1.6.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:83bf7c16245c15bc58ee76c5418e46ea1811edcc2e2b03041b804e46084ab627"}, - {file = "scipy-1.6.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:794e768cc5f779736593046c9714e0f3a5940bc6dcc1dba885ad64cbfb28e9f0"}, - {file = "scipy-1.6.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:5da5471aed911fe7e52b86bf9ea32fb55ae93e2f0fac66c32e58897cfb02fa07"}, - {file = "scipy-1.6.1-cp39-cp39-win32.whl", hash = "sha256:8e403a337749ed40af60e537cc4d4c03febddcc56cd26e774c9b1b600a70d3e4"}, - {file = "scipy-1.6.1-cp39-cp39-win_amd64.whl", hash = 
"sha256:a5193a098ae9f29af283dcf0041f762601faf2e595c0db1da929875b7570353f"}, - {file = "scipy-1.6.1.tar.gz", hash = "sha256:c4fceb864890b6168e79b0e714c585dbe2fd4222768ee90bc1aa0f8218691b11"}, -] -seaborn = [ - {file = "seaborn-0.11.1-py3-none-any.whl", hash = "sha256:4e1cce9489449a1c6ff3c567f2113cdb41122f727e27a984950d004a88ef3c5c"}, - {file = "seaborn-0.11.1.tar.gz", hash = "sha256:44e78eaed937c5a87fc7a892c329a7cc091060b67ebd1d0d306b446a74ba01ad"}, -] -send2trash = [ - {file = "Send2Trash-1.8.0-py3-none-any.whl", hash = "sha256:f20eaadfdb517eaca5ce077640cb261c7d2698385a6a0f072a4a5447fd49fa08"}, - {file = "Send2Trash-1.8.0.tar.gz", hash = "sha256:d2c24762fd3759860a0aff155e45871447ea58d2be6bdd39b5c8f966a0c99c2d"}, -] -sentencepiece = [ - {file = "sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc969e6694fb27fba7cee2953f350804faf03913f25ae1ee713a7b8a1bc08018"}, - {file = "sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36e9ff61e7b67c5b7ee96733613622620b4802fc8cf188a4dbc1f355b03dde02"}, - {file = "sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e9e9fe8094ca57549d801e9a2017ac5c24108bbf485ea4f8994a72e8e96ee135"}, - {file = "sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b77d27f59d515c43b61745b8173fbe7c7b3014b14b3702a75bf1793471e7def6"}, - {file = "sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1dac8c2ad02b5ebc1179c0a14cbc7d7c6f4fd73d4dd51820626402d0aefc974e"}, - {file = "sentencepiece-0.1.96-cp35-cp35m-macosx_10_6_x86_64.whl", hash = "sha256:e8ec5bb6777e2060e1499750c50e1b69dca5a0f80f90f2c66656c5f3e5244593"}, - {file = "sentencepiece-0.1.96-cp36-cp36m-macosx_10_6_x86_64.whl", hash = "sha256:99ea2d9db19e63a2d17d5dc64f9ace83fb9308a735be05a1aaf98eb4b496fba7"}, - {file = 
"sentencepiece-0.1.96-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aeb090ad462833df03af1debce4ae607a2766ef861f992003ad0c56d074ab805"}, - {file = "sentencepiece-0.1.96-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f8c90df663cd9759b2cf8dd29998b63140ac39e51ada2e739dc13bdac0b4f001"}, - {file = "sentencepiece-0.1.96-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:26d20d713b3ba1b7a19205336afb1e93a4327c372b2f795e907b8dc2315ac92e"}, - {file = "sentencepiece-0.1.96-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5388882bb24d083f6cc8cffc5c435f3694a7772b018e06ea6fd84d1044009efb"}, - {file = "sentencepiece-0.1.96-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a92e1932ee8fd500680ccbe1bf53eb33228f4c9d6524ed6f300bcc80ac359f27"}, - {file = "sentencepiece-0.1.96-cp36-cp36m-win32.whl", hash = "sha256:bedf0355117fb4e9b1fc9fc92b4d5ee743a7d468be9f6196e3b94447710ea589"}, - {file = "sentencepiece-0.1.96-cp36-cp36m-win_amd64.whl", hash = "sha256:4997c7ccf2ae462320250314aa5709a88d8a09fa271d073458a07bebf33f8e7c"}, - {file = "sentencepiece-0.1.96-cp37-cp37m-macosx_10_6_x86_64.whl", hash = "sha256:a697257a2cd7581732d7741a8d32a06927f0311c3d277dbc47fa1043350c9d17"}, - {file = "sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ff7d752a7f82d87711ec1a95c2262cb74f98be5b457f0300d81a1aefe5be2a95"}, - {file = "sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3e61e0757e49c306fff78ea75d6b75773418fe22214b4a460959203be934e834"}, - {file = "sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ef59ba19340dc1d002ce5713b911c0ef23c577b08f8ed57998ee3c8e62c5bf6e"}, - {file = "sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:89c038da7f827a6e2ca4c73aeb4e4b25b99d981ce47dd61b04d446c8200cba1e"}, - {file 
= "sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d954d25a8705f972e8bfc1dea5464d7e697dd6f4ade092f1a487387e6d6c829a"}, - {file = "sentencepiece-0.1.96-cp37-cp37m-win32.whl", hash = "sha256:fd907a8f744e5337de7fc532dd800c4416b571ea47f8c3c66be10cd1bc67c925"}, - {file = "sentencepiece-0.1.96-cp37-cp37m-win_amd64.whl", hash = "sha256:335bf84d72112cc91f3c3b691d61802fc963503b7772fd8280d20368048b8f3e"}, - {file = "sentencepiece-0.1.96-cp38-cp38-macosx_10_6_x86_64.whl", hash = "sha256:e811984b0908c14c56de7d8226fdd494d87a7ccb75af8ac3a07423037aaafc35"}, - {file = "sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8179785883b556cd517416cdbda6244745414b00ec83132cfe1d26000971f3ae"}, - {file = "sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:466e381f0a812da8fda97a9707498cef3210ea8385a3421bcbadcb5384063969"}, - {file = "sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8cb24d8d0b2f8b7463815a59183eb81ec1d7a06e3217bed456063f3303eddfb"}, - {file = "sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e88354b61f59dfdeb41023f7be8ae31dc627c2dc2dacbc2de8b2d82a0997135c"}, - {file = "sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a336575463d75d3aac1f7e32470b8998643ccd9a73786bd726f6b0470520b6b4"}, - {file = "sentencepiece-0.1.96-cp38-cp38-win32.whl", hash = "sha256:81bb77ba3651114943b2f8f77829cf764137dff06e38f4bf7fa43efea12c7f84"}, - {file = "sentencepiece-0.1.96-cp38-cp38-win_amd64.whl", hash = "sha256:eba0471ab0bb2e07ed06d91ecf5185d402c83d194155a41d8e2aa547d187712e"}, - {file = "sentencepiece-0.1.96-cp39-cp39-macosx_10_6_x86_64.whl", hash = "sha256:78e18d9106c36dcca929e18fd2c412378deac661d47fa3ee25defc55eef8a215"}, - {file = "sentencepiece-0.1.96-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:b1c24c1d9405b2148184ff27c062493d5e3be5c144575f95b5a0d7c660a515af"}, - {file = "sentencepiece-0.1.96-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:940a6999c7d3f55e9d7b194fd5e1f41a7dbed26d3519fb95333216292a39599e"}, - {file = "sentencepiece-0.1.96-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:384148cead5cdab34a4d74fe1fb6a5a8abaafed25eaa4a7698b49dd9482e4c4e"}, - {file = "sentencepiece-0.1.96-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c703e68ea192e45b65c5d5836f6980849d828a18da4189899d7150fad82dc9e"}, - {file = "sentencepiece-0.1.96-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d501713a8396193883aa526f48dc609f5f031a5df1afbafa561cf9ab492ffc76"}, - {file = "sentencepiece-0.1.96-cp39-cp39-win32.whl", hash = "sha256:b8b1dd2712f8a7de5b4c8ec912e6c041d25750bf03e1ce325cdba43bae0944ae"}, - {file = "sentencepiece-0.1.96-cp39-cp39-win_amd64.whl", hash = "sha256:d45e3f78e746aa161bc9f5a31c6a2839c512101113a4065f4d2e7a3ab8198d8c"}, - {file = "sentencepiece-0.1.96-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5513298d62fe63dd0862d08a6eb52a9aa3537006f597f2386184e3f95bb88889"}, - {file = "sentencepiece-0.1.96-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dadccb2e49244b6e64b4527d13ec14d5e094a90b41cf9b963e457e64182f1941"}, - {file = "sentencepiece-0.1.96-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48c6d13b3bfff08060c138248e85df60f6fad11135ad7a8fc2ef6005aacca839"}, - {file = "sentencepiece-0.1.96.tar.gz", hash = "sha256:9bdf097d5bd1d8ce42dfee51f6ff05f5578b96e48c6f6006aa4eff69edfa3639"}, -] -shap = [ - {file = "shap-0.38.1-cp36-cp36m-win_amd64.whl", hash = "sha256:34913391184180f9359e2627131960a473d67143e94b7f649c75a2d0c7d4cd40"}, - {file = "shap-0.38.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4cc1e1ac2e1e30aa9857fcf3fcfa0a6b2bf5e6aa0670c16a36bc28cd9b11aae5"}, - {file = 
"shap-0.38.1-cp38-cp38-win_amd64.whl", hash = "sha256:0457e7fb80d2398454a16d16c7cd7934003e8c8bd9c1e002d965fce6a3815e54"}, - {file = "shap-0.38.1.tar.gz", hash = "sha256:8f23e2ee3c80774d8c0942ecbd71b4dc0c2beba6d3de41dfc3a86e55adb9d28a"}, -] -six = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] -slicer = [ - {file = "slicer-0.0.7-py3-none-any.whl", hash = "sha256:0b94faa5251c0f23782c03f7b7eedda91d80144059645f452c4bc80fab875976"}, - {file = "slicer-0.0.7.tar.gz", hash = "sha256:f5d5f7b45f98d155b9c0ba6554fa9770c6b26d5793a3e77a1030fb56910ebeec"}, -] -smart-open = [ - {file = "smart_open-5.1.0-py3-none-any.whl", hash = "sha256:2059b07f530c8c9e2158e4e1575309aacb74bd813da2325c1f348015d04f3bd6"}, - {file = "smart_open-5.1.0.tar.gz", hash = "sha256:e4dc1350b240ef0759e343e4e2f361bfd4e5477bb2619866e97f80240652e92e"}, -] -spacy = [ - {file = "spacy-3.1.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:667a7fd991b49a99403f47003d6b28fd9d1ad0e79d022823b0f608e55660ce06"}, - {file = "spacy-3.1.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4bc38a83d48d6e57f0db88ee8d0683540b97aabc1102797366a0f345e7dc4288"}, - {file = "spacy-3.1.1-cp36-cp36m-win_amd64.whl", hash = "sha256:05730587cd620148a9fc824aea097a3848e955f2d01b8181a6dfb795c351a061"}, - {file = "spacy-3.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fee107573987df50e1c692bd62dfc54a3e36dcb1498ae370b3a381e9ddb0b719"}, - {file = "spacy-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e15e12b3a9ce9f4c631e5047147eb92309f9b9cbcbf7f4a81e72c822f886af27"}, - {file = "spacy-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:0472162a8a46adcbde2390517657928b2c09cc506ca7835def4ff49ba3dc1bd4"}, - {file = "spacy-3.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:cb6e01bead3c99deb2d5f9cfc9cc7ac033bbfa16106bfe232681581e1772a4b6"}, - {file = "spacy-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e7114d3c35820ef4d0c972cb717788291fc7131da379d5c76773e79a92d9de8"}, - {file = "spacy-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:ee4051e81022168999de9faca85ec2c71a54a28c0d5a6a77edb4da950497b688"}, - {file = "spacy-3.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:62a98490cf7fd2ed161da185579c037df1f35cde46049d39bd222fe241595176"}, - {file = "spacy-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a50d9eba682c4f33daef34af84021a1b072aef7776793a712b602056ece724be"}, - {file = "spacy-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:04d6b4f29eb737e89c2f31d99cef976f7fd9288412e32a74b48a81349bc7a156"}, - {file = "spacy-3.1.1.tar.gz", hash = "sha256:77a0f78d7e65335e5fae13af7e55684770c76a0457ee1baac414ddb89b5df6e4"}, -] -spacy-legacy = [ - {file = "spacy-legacy-3.0.8.tar.gz", hash = "sha256:b4725c5c161f0685ab4fce3fc912bc68aefdb7e102ba9848e852bb5842256c2f"}, - {file = "spacy_legacy-3.0.8-py2.py3-none-any.whl", hash = "sha256:eb37a3540bb461b5fe9348d4976784f18a0e345982e41e2c5c7cd8229889e825"}, -] -srsly = [ - {file = "srsly-2.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f7c3374184bfb1aa852bcb8e45747b02f2dde0ebe62b4ddf4b0141affeab32e1"}, - {file = "srsly-2.4.1-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:9625a584b26e522b6afb7c24be8783228ff44d7ac624e500020b0b888e09c6b6"}, - {file = "srsly-2.4.1-cp36-cp36m-win_amd64.whl", hash = "sha256:129c85db752b5945c6398a1952294e03b7d20fa111eb7fd1083c4a6b1d02f7c7"}, - {file = "srsly-2.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:20f11d5d6ae29e3cc97e93c862d7bf8b75023668daf1ac5892598c512302e5d3"}, - {file = "srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:cefe06912f3944b5729d555ee110f434a0787843c6676b90f4987ff7a0a69500"}, - {file = "srsly-2.4.1-cp37-cp37m-win_amd64.whl", hash = 
"sha256:b1bd4a55bafbb8cf86be15bf18aa2ba2c953161ad71ce7d2dae0c141201a7d89"}, - {file = "srsly-2.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e896d516ca2e2e89cc01df8c9c8b1528701d6f49e9c814332582cc701af64a91"}, - {file = "srsly-2.4.1-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:ff36dc01df8890a239e5d15cffa3ae3b272c19e5ae279840f2d30085d361c20a"}, - {file = "srsly-2.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:867d1154ff7b60043584fe048de9b6d9a7d5a7fc61437850922ae4bd46d3be16"}, - {file = "srsly-2.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:76b11e0ec0056bda4ad009b6e0db37f3ad0005a0501d587080023d4312ad2ada"}, - {file = "srsly-2.4.1-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:178aa6d350c9cfedb8adadb5e1f96b7aadde203d088917063415fcd689eb6e42"}, - {file = "srsly-2.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:869fdcf664edf20cd374cf1add869d67960061276478025a5887e080d8f99e1c"}, - {file = "srsly-2.4.1.tar.gz", hash = "sha256:b0f2aec0a329e6e7e742a0a60e99a74968ca29be71f35c5c4de221e328176926"}, -] -statsmodels = [ - {file = "statsmodels-0.12.2-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:c1d98ce2072f5e772cbf91d05475490368da5d3ee4a3150062330c7b83221ceb"}, - {file = "statsmodels-0.12.2-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:4184487e9c281acad3d0bda19445c69db292f0dbb18f25ebf56a7966a0a28eef"}, - {file = "statsmodels-0.12.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:37e107fa11299090ed90f93c7172162b850c28fd09999937b971926813e887c5"}, - {file = "statsmodels-0.12.2-cp36-none-win32.whl", hash = "sha256:5d3e7333e1c5b234797ed57c3d1533371374c1e1e7e7ed54d27805611f96e2d5"}, - {file = "statsmodels-0.12.2-cp36-none-win_amd64.whl", hash = "sha256:aaf3c75fd22cb9dcf9c1b28f8ae87521310870f4dd8a6a4f1010f1e46d992377"}, - {file = "statsmodels-0.12.2-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:c48b7cbb37a651bb1cd23614abc10f447845ad3c3a713bf74e2aad20cfc94ae7"}, - {file = "statsmodels-0.12.2-cp37-cp37m-manylinux1_i686.whl", hash = 
"sha256:a3bd3922463dda8ad33e5e5075d2080e9e012aeb2032b5cdaeea9b79c2472000"}, - {file = "statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:43de84bc08c8b9f778502aed7a476d6e68674e6878718e533b07d569cf0927a9"}, - {file = "statsmodels-0.12.2-cp37-none-win32.whl", hash = "sha256:0197855aa1d40c42532d6a75b4ca72e30826a50d90ec3047a404f9702d8b814f"}, - {file = "statsmodels-0.12.2-cp37-none-win_amd64.whl", hash = "sha256:93273aa1c31caf59bcce9790ca4c3f54fdc45a37c61084d06f1ba4fbe56e7752"}, - {file = "statsmodels-0.12.2-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:3e94306d4c07e332532ea4911d1f1d1f661c79aa73f22c5bb22e6dd47b40d562"}, - {file = "statsmodels-0.12.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:f3a7622f3d0ce2fc204f43b74de4e03e42775609705bf94d656b730482ca935a"}, - {file = "statsmodels-0.12.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:587deb788e7f8f3f866d28e812cf5c082b4d4a2d3f5beea94d0e9699ea71ef22"}, - {file = "statsmodels-0.12.2-cp38-none-win32.whl", hash = "sha256:cbbdf6f708c9a1f1fad5cdea5e4342d6fdb37e42e92288c2cf906b99976ffe15"}, - {file = "statsmodels-0.12.2-cp38-none-win_amd64.whl", hash = "sha256:1fa720e895112a1b04b27002218b0ea7f10dd1d9cffd1c018c88bbfb82520f57"}, - {file = "statsmodels-0.12.2-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:c3782ce846a52862ac72f89d22b6b1ca13d877bc593872309228a6f05d934321"}, - {file = "statsmodels-0.12.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:8f93cb3f7d87c1fc7e51b3b239371c25a17a0a8e782467fdf4788cfef600724a"}, - {file = "statsmodels-0.12.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:f61f33f64760a22100b6b146217823f73cfedd251c9bdbd58453ca94e63326c7"}, - {file = "statsmodels-0.12.2-cp39-none-win32.whl", hash = "sha256:3aab85174444f1bcad1e9218a3d3db08f0f86eeb97985236ca8605a0a39ce305"}, - {file = "statsmodels-0.12.2-cp39-none-win_amd64.whl", hash = "sha256:94d3632d56c13eebebaefb52bd4b43144ad5a131337b57842f46db826fa7d2d3"}, - {file = "statsmodels-0.12.2.tar.gz", hash = 
"sha256:8ad7a7ae7cdd929095684118e3b05836c0ccb08b6a01fe984159475d174a1b10"}, -] -subword-nmt = [ - {file = "subword_nmt-0.3.7-py2.py3-none-any.whl", hash = "sha256:a2d92eed5dea55f2b1c9b21225a57b3ae7009ce8a1fa4d2e3f01ab11435c28c9"}, -] -tabulate = [ - {file = "tabulate-0.8.9-py3-none-any.whl", hash = "sha256:d7c013fe7abbc5e491394e10fa845f8f32fe54f8dc60c6622c6cf482d25d47e4"}, - {file = "tabulate-0.8.9.tar.gz", hash = "sha256:eb1d13f25760052e8931f2ef80aaf6045a6cceb47514db8beab24cded16f13a7"}, -] -terminado = [ - {file = "terminado-0.11.0-py3-none-any.whl", hash = "sha256:221eef83e6a504894842f7dccfa971ca2e98ec22a8a9118577e5257527674b42"}, - {file = "terminado-0.11.0.tar.gz", hash = "sha256:1e01183885f64c1bba3cf89a5a995ad4acfed4e5f00aebcce1bf7f089b0825a1"}, -] -testpath = [ - {file = "testpath-0.5.0-py3-none-any.whl", hash = "sha256:8044f9a0bab6567fc644a3593164e872543bb44225b0e24846e2c89237937589"}, - {file = "testpath-0.5.0.tar.gz", hash = "sha256:1acf7a0bcd3004ae8357409fc33751e16d37ccc650921da1094a86581ad1e417"}, -] -thinc = [ - {file = "thinc-8.0.8-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:68401890470062eaa3bcd0cd0bc5ad52a6fa77da87336a927df18c21dbf0ba30"}, - {file = "thinc-8.0.8-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d0303de94e12cd288fdffebef97a460fc95700a527d4e898548477be5406a25"}, - {file = "thinc-8.0.8-cp36-cp36m-win_amd64.whl", hash = "sha256:101047df534a4861ba6fab25a1849c673c83536e067bd917ae735aeb9090fb52"}, - {file = "thinc-8.0.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e35ff1cea8b1ec73fed5c04923ee88ec4799e7948496fe7eca1f754019da87e7"}, - {file = "thinc-8.0.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f7c5381e804d641fff39061d3b9e01feb25790282aa8ed6684c62b8e2731e7f"}, - {file = "thinc-8.0.8-cp37-cp37m-win_amd64.whl", hash = "sha256:d0f46905fdd737a8090609ddc54a48f70fa997e5b304d8c362db93b95365646d"}, - {file = "thinc-8.0.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:6901e6d7dbb5cf08d1877920e893fe6d721627c946004a495f77c151bf07eb72"}, - {file = "thinc-8.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03a0e2679363fafe0c7312dc9eb46697b6fa3e65ffa7a1702ea369e93389fbfd"}, - {file = "thinc-8.0.8-cp38-cp38-win_amd64.whl", hash = "sha256:793cb9113b36df6607089806d9d08b371748b201dda05150f7f531cd63df84b8"}, - {file = "thinc-8.0.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:37c31b83f46372283d5f394db9272d35ec6c26b8a0481f1b1995f9ed0cb72a47"}, - {file = "thinc-8.0.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c37c96335d74b34e8128569070c0e17e13a213c9564a3553e44c3769a948a35e"}, - {file = "thinc-8.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:998b87d6cd334b5bf080ef5594bc0d1afda36d088deffc1caf7e8fe0bae553c6"}, - {file = "thinc-8.0.8.tar.gz", hash = "sha256:cf2abbd99c56f21b8804f31f995460515d95a5c5988be39e0964469e0070987b"}, -] -threadpoolctl = [ - {file = "threadpoolctl-2.2.0-py3-none-any.whl", hash = "sha256:e5a995e3ffae202758fa8a90082e35783b9370699627ae2733cd1c3a73553616"}, - {file = "threadpoolctl-2.2.0.tar.gz", hash = "sha256:86d4b6801456d780e94681d155779058759eaef3c3564758b17b6c99db5f81cb"}, -] -tifffile = [ - {file = "tifffile-2021.8.8-py3-none-any.whl", hash = "sha256:1309d1f5cc2ee2e8274916dc609922cb2364f947a9d09b388069c63180710dfd"}, - {file = "tifffile-2021.8.8.tar.gz", hash = "sha256:8260f31c4700143e8374ff6cde5cef7fe54fc9b7313afe88329f407881901dc5"}, -] -toml = [ - {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, - {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, -] -torch = [ - {file = "torch-1.7.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:422e64e98d0e100c360993819d0307e5d56e9517b26135808ad68984d577d75a"}, - {file = "torch-1.7.1-cp36-cp36m-win_amd64.whl", hash = 
"sha256:f0aaf657145533824b15f2fd8fde8f8c67fe6c6281088ef588091f03fad90243"}, - {file = "torch-1.7.1-cp36-none-macosx_10_9_x86_64.whl", hash = "sha256:af464a6f4314a875035e0c4c2b07517599704b214634f4ed3ad2e748c5ef291f"}, - {file = "torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5d76c255a41484c1d41a9ff570b9c9f36cb85df9428aa15a58ae16ac7cfc2ea6"}, - {file = "torch-1.7.1-cp37-cp37m-win_amd64.whl", hash = "sha256:d241c3f1c4d563e4ba86f84769c23e12606db167ee6f674eedff6d02901462e3"}, - {file = "torch-1.7.1-cp37-none-macosx_10_9_x86_64.whl", hash = "sha256:de84b4166e3f7335eb868b51d3bbd909ec33828af27290b4171bce832a55be3c"}, - {file = "torch-1.7.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:dd2fc6880c95e836960d86efbbc7f63d3287f2e1893c51d31f96dbfe02f0d73e"}, - {file = "torch-1.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:e000b94be3aa58ad7f61e7d07cf379ea9366cf6c6874e68bd58ad0bdc537b3a7"}, - {file = "torch-1.7.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:2e49cac969976be63117004ee00d0a3e3dd4ea662ad77383f671b8992825de1a"}, - {file = "torch-1.7.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:a3793dcceb12b1e2281290cca1277c5ce86ddfd5bf044f654285a4d69057aea7"}, - {file = "torch-1.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:6652a767a0572ae0feb74ad128758e507afd3b8396b6e7f147e438ba8d4c6f63"}, - {file = "torch-1.7.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:38d67f4fb189a92a977b2c0a38e4f6dd413e0bf55aa6d40004696df7e40a71ff"}, -] -torchsummary = [ - {file = "torchsummary-1.5.1-py3-none-any.whl", hash = "sha256:10f41d1743fb918f83293f13183f532ab1bb8f6639a1b89e5f8592ec1919a976"}, - {file = "torchsummary-1.5.1.tar.gz", hash = "sha256:981bf689e22e0cf7f95c746002f20a24ad26aa6b9d861134a14bc6ce92230590"}, -] -torchtext = [ - {file = "torchtext-0.8.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a0c0b7221fdfdd124f98de854d922c111084a4defe11ea32ecc22b56d1f46fd9"}, - {file = "torchtext-0.8.1-cp36-cp36m-manylinux1_x86_64.whl", hash = 
"sha256:19c9976400e09ab1008c3fb0d1162dc80214b6ac45012d2e1692c25337119157"}, - {file = "torchtext-0.8.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:991d9d38fd1d47a8517e624223e3537123a48175b00b74c6508daa2906431176"}, - {file = "torchtext-0.8.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:46cae2155fa28ab9920e23e6fb8d445911183e88e7f9eeb74024ee0a20671961"}, - {file = "torchtext-0.8.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5b479e2c98525a77ab112e6dd624a1ccc783e927b25b618218793254fc09e2d2"}, - {file = "torchtext-0.8.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:90c4699d3f923cf937c89579e08f560094874ecdcd0a62603bef2bda961553ed"}, - {file = "torchtext-0.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e2e82629a682064e21c20c2d6b34a3a4212e0ec816de0e69db6ee43da48f3eb0"}, - {file = "torchtext-0.8.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:28036a61bf97d965775b32065ff31661637662124f6aabf4eccd2ef12d9f3d43"}, -] -torchvision = [ - {file = "torchvision-0.8.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:86fae370d222f76ad57c57c3bee03f78b8db727743bfb4c1559a3d395159cea8"}, - {file = "torchvision-0.8.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:951239b5fcb911dbf78c1385d677f5f48c7a1b12859e3d3ec287562821b17cf2"}, - {file = "torchvision-0.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:24db8f4c3d812a032273f68563ad5dbd724f5bfbed523d0c6dce8cede26bb153"}, - {file = "torchvision-0.8.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:b068f6bcbe91bdd34dda0a39e8a26392add45a3be82543f6dd523b76484fb56f"}, - {file = "torchvision-0.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:afb76a66b9b0693f758a881a2bf333ed97e3c0c3f15a413c4f49d8dd8bd21307"}, - {file = "torchvision-0.8.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cd8817e9197fc60ebae37162a445db90bbf35591314a5767ad3d1490b5d65b0f"}, - {file = "torchvision-0.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1bd58acc3366ec02266aae56a7a752d43ef07de4a6ba420c4f907d0c9168bb8c"}, - 
{file = "torchvision-0.8.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:976750a49db2e23dc5a1ed0b5c31f7af51ed2702eee410ee09ef985c3a3e48cf"}, -] -tornado = [ - {file = "tornado-6.1-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:d371e811d6b156d82aa5f9a4e08b58debf97c302a35714f6f45e35139c332e32"}, - {file = "tornado-6.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:0d321a39c36e5f2c4ff12b4ed58d41390460f798422c4504e09eb5678e09998c"}, - {file = "tornado-6.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9de9e5188a782be6b1ce866e8a51bc76a0fbaa0e16613823fc38e4fc2556ad05"}, - {file = "tornado-6.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:61b32d06ae8a036a6607805e6720ef00a3c98207038444ba7fd3d169cd998910"}, - {file = "tornado-6.1-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:3e63498f680547ed24d2c71e6497f24bca791aca2fe116dbc2bd0ac7f191691b"}, - {file = "tornado-6.1-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:6c77c9937962577a6a76917845d06af6ab9197702a42e1346d8ae2e76b5e3675"}, - {file = "tornado-6.1-cp35-cp35m-win32.whl", hash = "sha256:6286efab1ed6e74b7028327365cf7346b1d777d63ab30e21a0f4d5b275fc17d5"}, - {file = "tornado-6.1-cp35-cp35m-win_amd64.whl", hash = "sha256:fa2ba70284fa42c2a5ecb35e322e68823288a4251f9ba9cc77be04ae15eada68"}, - {file = "tornado-6.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0a00ff4561e2929a2c37ce706cb8233b7907e0cdc22eab98888aca5dd3775feb"}, - {file = "tornado-6.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:748290bf9112b581c525e6e6d3820621ff020ed95af6f17fedef416b27ed564c"}, - {file = "tornado-6.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:e385b637ac3acaae8022e7e47dfa7b83d3620e432e3ecb9a3f7f58f150e50921"}, - {file = "tornado-6.1-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:25ad220258349a12ae87ede08a7b04aca51237721f63b1808d39bdb4b2164558"}, - {file = "tornado-6.1-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:65d98939f1a2e74b58839f8c4dab3b6b3c1ce84972ae712be02845e65391ac7c"}, - {file = 
"tornado-6.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:e519d64089b0876c7b467274468709dadf11e41d65f63bba207e04217f47c085"}, - {file = "tornado-6.1-cp36-cp36m-win32.whl", hash = "sha256:b87936fd2c317b6ee08a5741ea06b9d11a6074ef4cc42e031bc6403f82a32575"}, - {file = "tornado-6.1-cp36-cp36m-win_amd64.whl", hash = "sha256:cc0ee35043162abbf717b7df924597ade8e5395e7b66d18270116f8745ceb795"}, - {file = "tornado-6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7250a3fa399f08ec9cb3f7b1b987955d17e044f1ade821b32e5f435130250d7f"}, - {file = "tornado-6.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:ed3ad863b1b40cd1d4bd21e7498329ccaece75db5a5bf58cd3c9f130843e7102"}, - {file = "tornado-6.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:dcef026f608f678c118779cd6591c8af6e9b4155c44e0d1bc0c87c036fb8c8c4"}, - {file = "tornado-6.1-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:70dec29e8ac485dbf57481baee40781c63e381bebea080991893cd297742b8fd"}, - {file = "tornado-6.1-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:d3f7594930c423fd9f5d1a76bee85a2c36fd8b4b16921cae7e965f22575e9c01"}, - {file = "tornado-6.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3447475585bae2e77ecb832fc0300c3695516a47d46cefa0528181a34c5b9d3d"}, - {file = "tornado-6.1-cp37-cp37m-win32.whl", hash = "sha256:e7229e60ac41a1202444497ddde70a48d33909e484f96eb0da9baf8dc68541df"}, - {file = "tornado-6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:cb5ec8eead331e3bb4ce8066cf06d2dfef1bfb1b2a73082dfe8a161301b76e37"}, - {file = "tornado-6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:20241b3cb4f425e971cb0a8e4ffc9b0a861530ae3c52f2b0434e6c1b57e9fd95"}, - {file = "tornado-6.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:c77da1263aa361938476f04c4b6c8916001b90b2c2fdd92d8d535e1af48fba5a"}, - {file = "tornado-6.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:fba85b6cd9c39be262fcd23865652920832b61583de2a2ca907dbd8e8a8c81e5"}, - {file = "tornado-6.1-cp38-cp38-manylinux2010_i686.whl", hash = 
"sha256:1e8225a1070cd8eec59a996c43229fe8f95689cb16e552d130b9793cb570a288"}, - {file = "tornado-6.1-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:d14d30e7f46a0476efb0deb5b61343b1526f73ebb5ed84f23dc794bdb88f9d9f"}, - {file = "tornado-6.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8f959b26f2634a091bb42241c3ed8d3cedb506e7c27b8dd5c7b9f745318ddbb6"}, - {file = "tornado-6.1-cp38-cp38-win32.whl", hash = "sha256:34ca2dac9e4d7afb0bed4677512e36a52f09caa6fded70b4e3e1c89dbd92c326"}, - {file = "tornado-6.1-cp38-cp38-win_amd64.whl", hash = "sha256:6196a5c39286cc37c024cd78834fb9345e464525d8991c21e908cc046d1cc02c"}, - {file = "tornado-6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f0ba29bafd8e7e22920567ce0d232c26d4d47c8b5cf4ed7b562b5db39fa199c5"}, - {file = "tornado-6.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:33892118b165401f291070100d6d09359ca74addda679b60390b09f8ef325ffe"}, - {file = "tornado-6.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7da13da6f985aab7f6f28debab00c67ff9cbacd588e8477034c0652ac141feea"}, - {file = "tornado-6.1-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:e0791ac58d91ac58f694d8d2957884df8e4e2f6687cdf367ef7eb7497f79eaa2"}, - {file = "tornado-6.1-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:66324e4e1beede9ac79e60f88de548da58b1f8ab4b2f1354d8375774f997e6c0"}, - {file = "tornado-6.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:a48900ecea1cbb71b8c71c620dee15b62f85f7c14189bdeee54966fbd9a0c5bd"}, - {file = "tornado-6.1-cp39-cp39-win32.whl", hash = "sha256:d3d20ea5782ba63ed13bc2b8c291a053c8d807a8fa927d941bd718468f7b950c"}, - {file = "tornado-6.1-cp39-cp39-win_amd64.whl", hash = "sha256:548430be2740e327b3fe0201abe471f314741efcb0067ec4f2d7dcfb4825f3e4"}, - {file = "tornado-6.1.tar.gz", hash = "sha256:33c6e81d7bd55b468d2e793517c909b139960b6c790a60b7991b9b6b76fb9791"}, -] -tqdm = [ - {file = "tqdm-4.62.0-py2.py3-none-any.whl", hash = "sha256:706dea48ee05ba16e936ee91cb3791cd2ea6da348a0e50b46863ff4363ff4340"}, - {file = 
"tqdm-4.62.0.tar.gz", hash = "sha256:3642d483b558eec80d3c831e23953582c34d7e4540db86d9e5ed9dad238dabc6"}, -] -traitlets = [ - {file = "traitlets-5.0.5-py3-none-any.whl", hash = "sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426"}, - {file = "traitlets-5.0.5.tar.gz", hash = "sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396"}, -] -typer = [ - {file = "typer-0.3.2-py3-none-any.whl", hash = "sha256:ba58b920ce851b12a2d790143009fa00ac1d05b3ff3257061ff69dbdfc3d161b"}, - {file = "typer-0.3.2.tar.gz", hash = "sha256:5455d750122cff96745b0dec87368f56d023725a7ebc9d2e54dd23dc86816303"}, -] -typing-extensions = [ - {file = "typing_extensions-3.10.0.0-py2-none-any.whl", hash = "sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497"}, - {file = "typing_extensions-3.10.0.0-py3-none-any.whl", hash = "sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"}, - {file = "typing_extensions-3.10.0.0.tar.gz", hash = "sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342"}, -] -urllib3 = [ - {file = "urllib3-1.26.6-py2.py3-none-any.whl", hash = "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4"}, - {file = "urllib3-1.26.6.tar.gz", hash = "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"}, -] -virtualenv = [ - {file = "virtualenv-20.7.2-py2.py3-none-any.whl", hash = "sha256:e4670891b3a03eb071748c569a87cceaefbf643c5bac46d996c5a45c34aa0f06"}, - {file = "virtualenv-20.7.2.tar.gz", hash = "sha256:9ef4e8ee4710826e98ff3075c9a4739e2cb1040de6a2a8d35db0055840dc96a0"}, -] -wasabi = [ - {file = "wasabi-0.8.2-py3-none-any.whl", hash = "sha256:a493e09d86109ec6d9e70d040472f9facc44634d4ae6327182f94091ca73a490"}, - {file = "wasabi-0.8.2.tar.gz", hash = "sha256:b4a36aaa9ca3a151f0c558f269d442afbb3526f0160fd541acd8a0d5e5712054"}, -] -wcwidth = [ - {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = 
"sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, - {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, -] -webencodings = [ - {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, - {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, -] -widgetsnbextension = [ - {file = "widgetsnbextension-3.5.1-py2.py3-none-any.whl", hash = "sha256:bd314f8ceb488571a5ffea6cc5b9fc6cba0adaf88a9d2386b93a489751938bcd"}, - {file = "widgetsnbextension-3.5.1.tar.gz", hash = "sha256:079f87d87270bce047512400efd70238820751a11d2d8cb137a5a5bdbaf255c7"}, -] -xgboost = [ - {file = "xgboost-1.4.2-py3-none-macosx_10_14_x86_64.macosx_10_15_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:e8f1a366a403784afd30a56eb99a429cefc45d906943cd362025ccf942208e13"}, - {file = "xgboost-1.4.2-py3-none-manylinux2010_x86_64.whl", hash = "sha256:ec3f60d53dcd23273a5c7a495ba0f8205656ce750eb2ce7798726a4b2ef4955a"}, - {file = "xgboost-1.4.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:15dd5987827030b3f68e741490a8b3a4ead7c6064bd911e36235b84e0a9d0765"}, - {file = "xgboost-1.4.2-py3-none-win_amd64.whl", hash = "sha256:7c8973204b2c2362012850605e81de5a180513fc08db36d0da9befb77c3d57c8"}, - {file = "xgboost-1.4.2.tar.gz", hash = "sha256:5a364c152095824445ac56a83fb7f7e75913b4bb128c2fcd99b85877c9f4f8fe"}, -] diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index b26eb4d..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,87 +0,0 @@ -[tool.poetry] -name = "ml-mipt" -version = "1.0.0" -description = "Machine learning course at MIPT" -authors = ["Vladislav Goncharenko , Radoslav Neychev "] -license = "MIT License" - -[tool.poetry.dependencies] -python = "^3.8" -scikit-learn = "^0.24.1" -matplotlib = "^3.3.4" -pandas = "^1.2.2" -numpy = "^1.20.1" -scipy = "^1.6.0" 
-statsmodels = "^0.12.2" -seaborn = "^0.11.1" -xgboost = "^1.3.3" -opencv-python = "^4.5.1" -torch = "^1.7.1" -torchvision = "^0.8.2" -torchsummary = "^1.5.1" - -# basic -Pillow = {version = "^7.2.0", optional = true} # TODO: remove -tqdm = {version = "^4.56.2", optional = true} # TODO: remove -scikit-image = {version = "^0.18.1", optional = true} # TODO: remove week0_12 imread and resize -h5py = {version = "^3.1.0", optional = true} # parse cats and dogs dataset, maybe remove? -pydotplus = {version = "^2.0.2", optional = true} # graph visualization -eli5 = {version = "^0.11.0", optional = true} # week0_07 feature importance -PDPbox = {version = "^0.2.0", optional = true} # week0_07 feature importance -shap = {version = "^0.38.1", optional = true} # week0_07 feature importance - -# advanced -ipywidgets = "^7.6.3" # week1_15 downloading mnist via torchvision - -# nlp -nltk = "^3.5" -gensim = "^3.8.3" -spacy = "^3.1.1" -subword-nmt = "^0.3.7" - -pytorch-transformers = "^1.2.0" -torchtext = "^0.8" - -bokeh = "^2.3.0" - -# rl -gym = {version = "^0.18.0", optional = true} -graphviz = "^0.16" - -[tool.poetry.extras] -basic = ["Pillow", "tqdm", "scikit-image", "h5py", "pydotplus", "eli5", "PDPbox", "shap"] -nlp = ["nltk", "gensim", "spacy", "subword-nmt", "pytorch-transformers", "torchtext", "bokeh"] -rl = ["gym", "graphviz"] - -[tool.poetry.dev-dependencies] -pre-commit = "^2.10.1" -ipykernel = "^5.4.3" - -[tool.black] -line-length = 100 -target-version = ["py38"] - -[tool.isort] -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -use_parentheses = true -ensure_newline_before_comments = true -line_length = 100 -lines_after_imports = 2 - -[tool.nbqa.config] -black = "pyproject.toml" -isort = "pyproject.toml" -flake8 = "setup.cfg" - -[tool.nbqa.addopts] -flake8 = ["--extend-ignore=E402"] - -[tool.nbqa.mutate] -black = 1 -isort = 1 - -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" diff --git a/setup.cfg 
b/setup.cfg deleted file mode 100644 index 8392765..0000000 --- a/setup.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[flake8] -max-line-length = 100 -ignore = E203, E501, W503, B950 -max-complexity = 12 -select = B, C, E, F, W, B9 diff --git a/week05_transformer_pos_tagging/README.md b/week05_transformer_pos_tagging/README.md deleted file mode 100644 index 1443d4f..0000000 --- a/week05_transformer_pos_tagging/README.md +++ /dev/null @@ -1,24 +0,0 @@ -PoS Tagging with BiLSTM: -* Self-practice version: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging.ipynb) -* Completed version: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging__completed.ipynb) - - -Understanding the positional encoding: -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week05_transformer_pos_tagging/week05_positional_encoding_carriers.ipynb) - -Full Transformer architecture and training pipeline by Harvard NLP: -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/harvardnlp/annotated-transformer/blob/master/The%20Annotated%20Transformer.ipynb) - - - -__Further readings__: -* [en] The Illustrated Transformer [blog post](https://jalammar.github.io/illustrated-transformer/) - -* [en] Harvard NLP [full implementation in PyTorch](http://nlp.seas.harvard.edu/2018/04/03/attention.html) - -* [en] OpenAI blog post [Better Language Models -and Their Implications (GPT-2)](https://openai.com/blog/better-language-models/) - -* [en] Paper describing positional encoding ["Convolutional 
Sequence to Sequence Learning"](https://arxiv.org/pdf/1705.03122) - -* [en] Paper presenting [Layer Normalization](https://arxiv.org/abs/1607.06450) diff --git a/week05_transformer_pos_tagging/assets/pos-bert.png b/week05_transformer_pos_tagging/assets/pos-bert.png deleted file mode 100644 index 139b8df..0000000 Binary files a/week05_transformer_pos_tagging/assets/pos-bert.png and /dev/null differ diff --git a/week05_transformer_pos_tagging/assets/pos-bert.xml b/week05_transformer_pos_tagging/assets/pos-bert.xml deleted file mode 100644 index 4317308..0000000 --- a/week05_transformer_pos_tagging/assets/pos-bert.xml +++ /dev/null @@ -1 +0,0 @@ -7Vxbc5s4GP01PCaDkAHzWDtpd3Z72dadSfLUUUAGNoBcLAe7v34FCANG2Dgxl0nwg40+ELqcc75PEsISnPvbTyFaOV+IhT1Jka2tBG8kRdH1CfuODbvUMAHcYIeulZpAbli4fzA3yty6cS28Ll1ICfGouyobTRIE2KQlGwpDEpUvWxKvXOoK2bhiWJjIq1rvXIs6qXWqaLn9L+zaTlYy0Iz0zCMyn+yQbAJenqTAZfJJT/souxcvYe0gi0SpKWk7vJXgPCSEpkf+do69uGuzbks76GPN2X29QxzQRhl4jmfkbXBW5aRidJd1BrZY3/BkQAL2M0taiON7yCxlEt81+bFDfY8dAna4b1ts/w9TuuNAow0lzERC6hCbBMj7TMiKZ1qSgH5EvuvFzJmTTejikFXnK474SX4PoLF0tbVZt7KMJq++mpriVhSu4d3xCRMf03DHLgixh6j7XGYA4kSy99fts/5LXFaqInPOA924hjDNlbFelct3oSi0MeUZc1TYQaEmuSnBSoybKoCNSWyy/cUPKihSvKUH+NCQPOE58UiYI7t0Pe/AhDzXDljSZL3MwICzZxxSl4nlAz/hu5YVFzOLHJfixQolXR8xz1ChSokTtofWawGHLsOBuJZ4exTy7KxagU6VOXRRLv8JNzkF5StyPU9K8B7BMnN6b1iDWjcaVKpAtqhBrVaDcNTguRpUYK8azCT3hjWowG5ECKtItijCrFUCFU5GFZ6rQgj6VaFxMdHhrUvvY/O1ylMP/CLWO+GucCpOPgxMqtNupGp0qtRprVLBqNRzlVqdbXQp1PvF4+LvyFC/L9Qoir7//EIfwyuwp7JYvrl4bnPrQV8LBSYQYjOls+ObbTGxa+ADkkSeLUntCiifP/jlk7VUSsc6EFw8RDcFVBNNKA+wLCMl0k3doKdONGU1S2wMqOkQaQeSZnYMLBXr58qs4gTaiZAQNhPepD3hyaPwBMLT5cbCU/oSnt5g8j8KTxJOEPsXHjg+YB2Q8F4moGzt/bSAQG8CarCCPQpIEq5ydikgPFve/fB//3rY/v6GjA3Q/0RPV9M3P2JspDNx38iXllT9JJB97T8HFNFbmxCa35TnuxlY/4O/zu4/rKynKTCvdNgnI0CRD9e62pASRUJk2U5y4gDX15Gk6KHFFyod+eOj1ezZSVvao6YKnPRyuVRMcwhOWvAQEfTso41eFTlwH33xecORh8sDctLVpbrRSZ9myWknDTty0mJUp70KvYSq0hBVUEZVOYWqhdZOUlvQF8RyRxAfreYYh8+fLPUfiKtYjYG45DnbD8TJDpNiIO43DqtjHH4BSU476UmvcVgf43D7EPc7Hxbt
WhnjcKNV//7j8LhoecJzth+Hk01mw4nD2uixX8uY4UyOj1Zz9Njnb5Do0mPXPJ7sV6BtP6cVNtpovu3o4k77VWgZ416klz4sGMCOCNHj+GRzpzp3EJX02U7Sb97dW0prVpwb2DNCKfFj49Fh1qvWqfZbinrbB5rtpjrBgff2lkxbHBDMkQbAgbr3MsoceG/vaLTFAcGoawAcEI2Zqxx4b7v/2+JAdTjQJQXEE2OjwoDZ7Y+fA4W6psMFsNTrUD+Y+ig9IyDYefPZDTAK3w8GcNIaBiyZ/1dDupqU/x8GvP0f \ No newline at end of file diff --git a/week05_transformer_pos_tagging/assets/pos-bidirectional-lstm.png b/week05_transformer_pos_tagging/assets/pos-bidirectional-lstm.png deleted file mode 100644 index 3f0e7ae..0000000 Binary files a/week05_transformer_pos_tagging/assets/pos-bidirectional-lstm.png and /dev/null differ diff --git a/week05_transformer_pos_tagging/assets/pos-bidirectional-lstm.xml b/week05_transformer_pos_tagging/assets/pos-bidirectional-lstm.xml deleted file mode 100644 index e873dc6..0000000 --- a/week05_transformer_pos_tagging/assets/pos-bidirectional-lstm.xml +++ /dev/null @@ -1 +0,0 @@ -7V3blto2FP0aHifL98tjIJN0dWXapqRN8pTlwQLcMWhiPAHy9ZWxDBgdewQIyWDzMrYs37TP3kc6Otb0zMFs9SEJnqcPOERxz9DCVc981zMM3XU08icrWecltmnlBZMkCmmlXcEw+oVoIT1v8hKFaFGqmGIcp9FzuXCE53M0SktlQZLgZbnaGMfluz4HE8QUDEdBzJZ+icJ0mpd6hrMr/w1Fk2lxZ93x8yOPwehpkuCXOb1fzzDHm19+eBYU16J3WEyDEC/zos27m/c9c5BgnOZbs9UAxVnbFs2WN9D7iqPb507QPOU54evwcfj70rc/De3lcvnp80P6mNx5FKyfQfyCivfYPG26Llpo844ou4rWM/vLaZSi4XMwyo4uiU2Qsmk6i8meTja3b5nVncTBYkG3R3gWjej2Ik3wExrgGCebW5ie8Wg6DjkyjuJ4r7yv3Zv3m3I8T98HsyjOrGyAX5IIJeQp/0BLepAalp5VDuJoMic7I9IypJrZZ5uKtt5PlKRotVdEm+4DwjOUJmtSpTiq+W9MMz+LWrqpURNe7uzGokXTPZMpygJqqZPtxXdokQ0K2DHgmR14fOB5DQTP6MDjAs9wGgie1oHHBZ5pKQXP1BlYUEjcP92d4zn50y8jtd/iewihVZR+3dv+llV5Y2d7c/JUX+kZm53dsRKu/6E0XdMWD15STIpwkk7xBM+D+CPGz/TyR6FWCdKCnDiiL02tMw2SCaK1KChZc9QCmaA4SKOf5d7SOZg0W/dCG3mh1QTq6DZLHVMidVhYWkkdk6VOUU0+d5rd4WsOdwxTKXd0/XWcTiVTQ2hhi6YAPfUvHJG77kTQZYC07AOEcmrSEw9A2j4JF242ABsZIFur73SDQZHYY3qAT4ktFNl9otAixuwz645GQfyWHphFYZjdBuRw2VS4aCzGBs7yXzZn188QwUGOLvqVc9CRw0GDBfKCHHQqOWh2HBTgB2Vy0ODwg11/Jeuc6Er7KzxjsivXSvHjXVgsTZZxFxRLcECQq6XVqaUAFspUy4LwbR9xF12A0pBbNHt5QbF84aBQIApYmtfwl9ZIn6GZ4V9OIqEO
ZdcHYdWPHWlL7YLI4NlO/HZa+K1h/RRPFQcv2U3xKrspetdNOZ+oMnspdoun1Eo9EknOkownHMfc/iyvLNFWdnj3c63y9XNpERIbheJpGXWnPfu+Zw+SzMw2CVnftY7Tx3L6TnvjuAfIyuM0mOOga3Yt03c8u9+VHrQ1yEWAs3zem2y/W+3vrDn8+mZnd9pmrziPyyPbBkt+sMF84XMi3JoMxW9gbrbN3y7I7aL55HNufbWgn+OBDUeiB67n5dUPSRXMKfKOP3mutR01iffBFTrtdDqd0QIIG7nKMjXsqnksVpKNTpLPl2RgtlmqJrudJgueYz5Zk4Frydfk+nBWazTZAzRZU6bJVWEoVpPblmtwEU0Gsg+kavKNT90omMo+WZOBa0nXZL1+frUtmlwwcF+THWWhC4c/rNi2jIaLaDKQ4yBTk2Fmbs30Zpl5Wn6nzTK1QtqUfc3jQDnTB1hKmF73HdcMgOl1pIc2co9lnqQUP13i/HoF8W7eJZ5EPBdwkRXEM1QRz4UcZ0c8rlGIeuLpVxMfOI1AwIR5RUMoi9AWz9gR6PjQqkwCof74y9+zH9+/rX78Gfgvuvtr+XTn3XyPkYtncNsA3kt8tsp58DXDeYXOo2MD3BuPx8Zo1ATuAZ8/6oqp55sd9Srt2gCo1yzmGR3zTvV66qnHYtVRrzBsYKEGZVGR2ifsqHf8iE099boOZ7VhW4DXUzauq33EjnvHhyllcg9ebe1qEtzCYDHd3FPnJVTt8nIXn+Fll9bbTgaJ/woITppWG4HeR3aL8yu6uhXSg+9VKnRVsE28GlIrevjylRcGWMlwsdTopyJXCdbrGEgisAWsG3Eow7wpGpbBXutwyvfCKRpefRblDQq9cK5WpESyC+BKF3q1HegrEnoghlS7/HNDhN7z2kZeSUvKGOzS49LJqzbwdEXkBaJQtQvvN4W8VzMfLQoo4e0Pk1dne1XSydu2XpUvCVsoQVrjw/ZtkgTrvWrPWYVFza2AkbqmHVhLflGxtgPFzfbSc2M07j4sFpidyy6Xb3CmWlwuO1e/mvDbSblKtd/ycyQwNcuR+10+7qmZFeqzAjW13WwFnlrSalfQ56UX8tRQqEWKp37l48adpzY6T32+fAD/lUi9p9auJjVLlH74cvQDWjLkUvoBRHuk6IfPqx/dt9EC9AP4l3QN0A8W1NvWD12TNFEHLHl7If2AAk4y9GPblK8LSPchrwgBAfyEegGBvqkBbaBbI/JoxKEJes7UrAsGh6oQJ2hPg7Tn9tc9911bxxx9nKZ4JpL17EDWVW4CBpcJtLTbKNwEgOxo9SZQ9X9IyibQUscv3ASAJF31JlA1U1Q2gZZOEwk3ATYgIdECPkToH3/RXw7/nYex+WT7g/HwDvie+n72iMKQvP6ioXhXtDqATXWvrBj3FNnyhmocWHf8MZqjIGkPBrrEtdNhDFh/2I/uwihBozTKgiPk2Mfh54cbhsQrQwKNVDwxkJDdBON0P5xBXmr6gEOU1fgf \ No newline at end of file diff --git a/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging.ipynb b/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging.ipynb deleted file mode 100644 index e79553d..0000000 --- a/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging.ipynb +++ /dev/null @@ -1,1533 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Practice: BiLSTM for PoS Tagging\n", - "*This notebook is based on 
[open-source implementation](https://github.com/bentrevett/pytorch-pos-tagging) of PoS Tagging in PyTorch.*\n", - "\n", - "### Introduction\n", - "\n", - "In this series we'll be building a machine learning model that produces an output for every element in an input sequence, using PyTorch and TorchText. Specifically, we will be inputting a sequence of text and the model will output a part-of-speech (PoS) tag for each token in the input text. This can also be used for named entity recognition (NER), where the output for each token will be what type of entity, if any, the token is.\n", - "\n", - "In this notebook, we'll be implementing a multi-layer bi-directional LSTM (BiLSTM) to predict PoS tags using the Universal Dependencies English Web Treebank (UDPOS) dataset.\n", - "\n", - "### Preparing Data\n", - "\n", - "First, let's import the necessary Python modules." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "\n", - "from torchtext.legacy import data\n", - "from torchtext.legacy import datasets\n", - "\n", - "import spacy\n", - "import numpy as np\n", - "\n", - "import time\n", - "import random" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we'll set the random seeds for reproducability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "SEED = 1234\n", - "\n", - "random.seed(SEED)\n", - "np.random.seed(SEED)\n", - "torch.manual_seed(SEED)\n", - "torch.backends.cudnn.deterministic = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One of the key parts of TorchText is the `Field`. The `Field` handles how your dataset is processed.\n", - "\n", - "Our `TEXT` field handles how the text that we need to tag is dealt with. 
All we do here is set `lower = True` which lowercases all of the text.\n", - "\n", - "Next we'll define the `Fields` for the tags. This dataset actually has two different sets of tags, [universal dependency (UD) tags](https://universaldependencies.org/u/pos/) and [Penn Treebank (PTB) tags](https://www.sketchengine.eu/penn-treebank-tagset/). We'll only train our model on the UD tags, but will load the PTB tags to show how they could be used instead.\n", - "\n", - "`UD_TAGS` handles how the UD tags should be handled. Our `TEXT` vocabulary - which we'll build later - will have *unknown* tokens in it, i.e. tokens that are not within our vocabulary. However, we won't have unknown tags as we are dealing with a finite set of possible tags. TorchText `Fields` initialize a default unknown token, ``, which we remove by setting `unk_token = None`.\n", - "\n", - "`PTB_TAGS` does the same as `UD_TAGS`, but handles the PTB tags instead." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TEXT = data.Field(lower = True)\n", - "UD_TAGS = data.Field(unk_token = None)\n", - "PTB_TAGS = data.Field(unk_token = None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We then define `fields`, which handles passing our fields to the dataset.\n", - "\n", - "Note that order matters, if you only wanted to load the PTB tags your field would be:\n", - "\n", - "```\n", - "fields = ((\"text\", TEXT), (None, None), (\"ptbtags\", PTB_TAGS))\n", - "```\n", - "\n", - "Where `None` tells TorchText to not load those tags." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fields = ((\"text\", TEXT), (\"udtags\", UD_TAGS), (\"ptbtags\", PTB_TAGS))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we load the UDPOS dataset using our defined fields." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_data, valid_data, test_data = datasets.UDPOS.splits(fields)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can check how many examples are in each section of the dataset by checking their length." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Number of training examples: {len(train_data)}\")\n", - "print(f\"Number of validation examples: {len(valid_data)}\")\n", - "print(f\"Number of testing examples: {len(test_data)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's print out an example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(vars(train_data.examples[0]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also view the text and tags separately:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(vars(train_data.examples[0])['text'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(vars(train_data.examples[0])['udtags'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(vars(train_data.examples[0])['ptbtags'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we'll build the vocabulary - a mapping of tokens to integers. 
\n", - "\n", - "We want some unknown tokens within our dataset in order to replicate how this model would be used in real life, so we set the `min_freq` to 2 which means only tokens that appear twice in the training set will be added to the vocabulary and the rest will be replaced by `` tokens.\n", - "\n", - "We also load the [GloVe](https://nlp.stanford.edu/projects/glove/) pre-trained token embeddings. Specifically, the 100-dimensional embeddings that have been trained on 6 billion tokens. Using pre-trained embeddings usually leads to improved performance - although admittedly the dataset used in this tutorial is too small to take advantage of the pre-trained embeddings. \n", - "\n", - "`unk_init` is used to initialize the token embeddings which are not in the pre-trained embedding vocabulary. By default this sets those embeddings to zeros, however it is better to not have them all initialized to the same value, so we initialize them from a Normal/Gaussian distribution.\n", - "\n", - "These pre-trained vectors are now loaded into our vocabulary and we will initialize our model with these values later." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MIN_FREQ = 2\n", - "\n", - "TEXT.build_vocab(train_data, \n", - " min_freq = MIN_FREQ,\n", - " vectors = \"glove.6B.100d\",\n", - " unk_init = torch.Tensor.normal_)\n", - "\n", - "\n", - "UD_TAGS.build_vocab(train_data)\n", - "PTB_TAGS.build_vocab(train_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can check how many tokens and tags are in our vocabulary by getting their length:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}\")\n", - "print(f\"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}\")\n", - "print(f\"Unique tokens in PTB_TAG vocabulary: {len(PTB_TAGS.vocab)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Exploring the vocabulary, we can check the most common tokens within our texts:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(TEXT.vocab.freqs.most_common(20))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see the vocabularies for both of our tags:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(UD_TAGS.vocab.itos)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(PTB_TAGS.vocab.itos)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also see how many of each tag are in our vocabulary:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(UD_TAGS.vocab.freqs.most_common())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ 
- "print(PTB_TAGS.vocab.freqs.most_common())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also view how common each of the tags are within the training set:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def tag_percentage(tag_counts):\n", - " \n", - " total_count = sum([count for tag, count in tag_counts])\n", - " \n", - " tag_counts_percentages = [(tag, count, count/total_count) for tag, count in tag_counts]\n", - " \n", - " return tag_counts_percentages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Tag\\t\\tCount\\t\\tPercentage\\n\")\n", - "\n", - "for tag, count, percent in tag_percentage(UD_TAGS.vocab.freqs.most_common()):\n", - " print(f\"{tag}\\t\\t{count}\\t\\t{percent*100:4.1f}%\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Tag\\t\\tCount\\t\\tPercentage\\n\")\n", - "\n", - "for tag, count, percent in tag_percentage(PTB_TAGS.vocab.freqs.most_common()):\n", - " print(f\"{tag}\\t\\t{count}\\t\\t{percent*100:4.1f}%\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The final part of data preparation is handling the iterator. \n", - "\n", - "This will be iterated over to return batches of data to process. Here, we set the batch size and the `device` - which is used to place the batches of tensors on our GPU, if we have one. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "BATCH_SIZE = 128\n", - "\n", - "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n", - " (train_data, valid_data, test_data), \n", - " batch_size = BATCH_SIZE,\n", - " device = device)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building the Model\n", - "\n", - "Next up, we define our model - a multi-layer bi-directional LSTM. The image below shows a simplified version of the model with only one LSTM layer and omitting the LSTM's cell state for clarity.\n", - "\n", - "![](assets/pos-bidirectional-lstm.png)\n", - "\n", - "The model takes in a sequence of tokens, $X = \\{x_1, x_2,...,x_T\\}$, passes them through an embedding layer, $e$, to get the token embeddings, $e(X) = \\{e(x_1), e(x_2), ..., e(x_T)\\}$.\n", - "\n", - "These embeddings are processed - one per time-step - by the forward and backward LSTMs. The forward LSTM processes the sequence from left-to-right, whilst the backward LSTM processes the sequence right-to-left, i.e. the first input to the forward LSTM is $x_1$ and the first input to the backward LSTM is $x_T$. 
\n", - "\n", - "The LSTMs also take in the the hidden, $h$, and cell, $c$, states from the previous time-step\n", - "\n", - "$$h^{\\rightarrow}_t = \\text{LSTM}^{\\rightarrow}(e(x^{\\rightarrow}_t), h^{\\rightarrow}_{t-1}, c^{\\rightarrow}_{t-1})$$\n", - "$$h^{\\leftarrow}_t=\\text{LSTM}^{\\leftarrow}(e(x^{\\leftarrow}_t), h^{\\leftarrow}_{t-1}, c^{\\leftarrow}_{t-1})$$\n", - "\n", - "After the whole sequence has been processed, the hidden and cell states are then passed to the next layer of the LSTM.\n", - "\n", - "The initial hidden and cell states, $h_0$ and $c_0$, for each direction and layer are initialized to a tensor full of zeros.\n", - "\n", - "We then concatenate both the forward and backward hidden states from the final layer of the LSTM, $H = \\{h_1, h_2, ... h_T\\}$, where $h_1 = [h^{\\rightarrow}_1;h^{\\leftarrow}_T]$, $h_2 = [h^{\\rightarrow}_2;h^{\\leftarrow}_{T-1}]$, etc. and pass them through a linear layer, $f$, which is used to make the prediction of which tag applies to this token, $\\hat{y}_t = f(h_t)$.\n", - "\n", - "When training the model, we will compare our predicted tags, $\\hat{Y}$ against the actual tags, $Y$, to calculate a loss, the gradients w.r.t. that loss, and then update our parameters.\n", - "\n", - "We implement the model detailed above in the `BiLSTMPOSTagger` class.\n", - "\n", - "`nn.Embedding` is an embedding layer and the input dimension should be the size of the input (text) vocabulary. We tell it what the index of the padding token is so it does not update the padding token's embedding entry.\n", - "\n", - "`nn.LSTM` is the LSTM. We apply dropout as regularization between the layers, if we are using more than one.\n", - "\n", - "`nn.Linear` defines the linear layer to make predictions using the LSTM outputs. We double the size of the input if we are using a bi-directional LSTM. 
The output dimensions should be the size of the tag vocabulary.\n", - "\n", - "We also define a dropout layer with `nn.Dropout`, which we use in the `forward` method to apply dropout to the embeddings and the outputs of the final layer of the LSTM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class BiLSTMPOSTagger(nn.Module):\n", - " def __init__(self, \n", - " input_dim, \n", - " embedding_dim, \n", - " hidden_dim, \n", - " output_dim, \n", - " n_layers, \n", - " bidirectional, \n", - " dropout, \n", - " pad_idx):\n", - " \n", - " super().__init__()\n", - " \n", - " self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)\n", - " \n", - " self.lstm = nn.LSTM(embedding_dim, \n", - " hidden_dim, \n", - " num_layers = n_layers, \n", - " bidirectional = bidirectional,\n", - " dropout = dropout if n_layers > 1 else 0)\n", - " \n", - " self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)\n", - " \n", - " self.dropout = nn.Dropout(dropout)\n", - " \n", - " def forward(self, text):\n", - "\n", - " #text = [sent len, batch size]\n", - " \n", - " #pass text through embedding layer\n", - " embedded = self.dropout(self.embedding(text))\n", - " \n", - " #embedded = [sent len, batch size, emb dim]\n", - " \n", - " #pass embeddings into LSTM\n", - " outputs, (hidden, cell) = self.lstm(embedded)\n", - " \n", - " #outputs holds the backward and forward hidden states in the final layer\n", - " #hidden and cell are the backward and forward hidden and cell states at the final time-step\n", - " \n", - " #output = [sent len, batch size, hid dim * n directions]\n", - " #hidden/cell = [n layers * n directions, batch size, hid dim]\n", - " \n", - " #we use our outputs to make a prediction of what the tag should be\n", - " predictions = self.fc(self.dropout(outputs))\n", - " \n", - " #predictions = [sent len, batch size, output dim]\n", - " \n", - " return predictions" - 
] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training the Model\n", - "\n", - "Next, we instantiate the model. We need to ensure the embedding dimensions matches that of the GloVe embeddings we loaded earlier.\n", - "\n", - "The rest of the hyperparmeters have been chosen as sensible defaults, though there may be a combination that performs better on this model and dataset.\n", - "\n", - "The input and output dimensions are taken directly from the lengths of the respective vocabularies. The padding index is obtained using the vocabulary and the `Field` of the text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "INPUT_DIM = len(TEXT.vocab)\n", - "EMBEDDING_DIM = 100\n", - "HIDDEN_DIM = 128\n", - "OUTPUT_DIM = len(UD_TAGS.vocab)\n", - "N_LAYERS = 2\n", - "BIDIRECTIONAL = True\n", - "DROPOUT = 0.25\n", - "PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]\n", - "\n", - "model = BiLSTMPOSTagger(INPUT_DIM, \n", - " EMBEDDING_DIM, \n", - " HIDDEN_DIM, \n", - " OUTPUT_DIM, \n", - " N_LAYERS, \n", - " BIDIRECTIONAL, \n", - " DROPOUT, \n", - " PAD_IDX)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We initialize the weights from a simple Normal distribution. Again, there may be a better initialization scheme for this model and dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def init_weights(m):\n", - " for name, param in m.named_parameters():\n", - " nn.init.normal_(param.data, mean = 0, std = 0.1)\n", - " \n", - "model.apply(init_weights)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, a small function to tell us how many parameters are in our model. Useful for comparing different models." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def count_parameters(model):\n", - " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", - "\n", - "print(f'The model has {count_parameters(model):,} trainable parameters')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll now initialize our model's embedding layer with the pre-trained embedding values we loaded earlier.\n", - "\n", - "This is done by getting them from the vocab's `.vectors` attribute and then performing a `.copy` to overwrite the embedding layer's current weights." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pretrained_embeddings = TEXT.vocab.vectors\n", - "\n", - "print(pretrained_embeddings.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.embedding.weight.data.copy_(pretrained_embeddings)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It's common to initialize the embedding of the pad token to all zeros. This, along with setting the `padding_idx` in the model's embedding layer, means that the embedding should always output a tensor full of zeros when a pad token is input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)\n", - "\n", - "print(model.embedding.weight.data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We then define our optimizer, used to update our parameters w.r.t. their gradients. We use Adam with the default learning rate." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = optim.Adam(model.parameters())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we define our loss function, cross-entropy loss.\n", - "\n", - "Even though we have no `` tokens within our tag vocab, we still have `` tokens. This is because all sentences within a batch need to be the same size. However, we don't want to calculate the loss when the target is a `` token as we aren't training our model to recognize padding tokens.\n", - "\n", - "We handle this by setting the `ignore_index` in our loss function to the index of the padding token in our tag vocabulary." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]\n", - "\n", - "criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We then place our model and loss function on our GPU, if we have one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = model.to(device)\n", - "criterion = criterion.to(device)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will be using the loss value between our predicted and actual tags to train the network, but ideally we'd like a more interpretable way to see how well our model is doing - accuracy.\n", - "\n", - "The issue is that we don't want to calculate accuracy over the `` tokens as we aren't interested in predicting them.\n", - "\n", - "The function below only calculates accuracy over non-padded tokens. `non_pad_elements` is a tensor containing the indices of the non-pad tokens within an input batch. We then compare the predictions of those elements with the labels to get a count of how many predictions were correct. 
We then divide this by the number of non-pad elements to get our accuracy value over the batch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def categorical_accuracy(preds, y, tag_pad_idx):\n", - " \"\"\"\n", - " Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8\n", - " \"\"\"\n", - " max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability\n", - " non_pad_elements = (y != tag_pad_idx).nonzero()\n", - " correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])\n", - " return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]]).to(device)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next is the function that handles training our model.\n", - "\n", - "We first set the model to `train` mode to turn on dropout/batch-norm/etc. (if used). Then we iterate over our iterator, which returns a batch of examples. \n", - "\n", - "For each batch: \n", - "- we zero the gradients over the parameters from the last gradient calculation\n", - "- insert the batch of text into the model to get predictions\n", - "- as PyTorch loss functions cannot handle 3-dimensional predictions we reshape our predictions\n", - "- calculate the loss and accuracy between the predicted tags and actual tags\n", - "- call `backward` to calculate the gradients of the parameters w.r.t. 
the loss\n", - "- take an optimizer `step` to update the parameters\n", - "- add to the running total of loss and accuracy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def train(model, iterator, optimizer, criterion, tag_pad_idx):\n", - " \n", - " epoch_loss = 0\n", - " epoch_acc = 0\n", - " \n", - " model.train()\n", - " \n", - " for batch in iterator:\n", - " \n", - " text = batch.text\n", - " tags = batch.udtags\n", - " \n", - " optimizer.zero_grad()\n", - " \n", - " #text = [sent len, batch size]\n", - " \n", - " predictions = model(text)\n", - " \n", - " #predictions = [sent len, batch size, output dim]\n", - " #tags = [sent len, batch size]\n", - " \n", - " predictions = predictions.view(-1, predictions.shape[-1])\n", - " tags = tags.view(-1)\n", - " \n", - " #predictions = [sent len * batch size, output dim]\n", - " #tags = [sent len * batch size]\n", - " \n", - " loss = criterion(predictions, tags)\n", - " \n", - " acc = categorical_accuracy(predictions, tags, tag_pad_idx)\n", - " \n", - " loss.backward()\n", - " \n", - " optimizer.step()\n", - " \n", - " epoch_loss += loss.item()\n", - " epoch_acc += acc.item()\n", - " \n", - " return epoch_loss / len(iterator), epoch_acc / len(iterator)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `evaluate` function is similar to the `train` function, except with changes made so we don't update the model's parameters.\n", - "\n", - "`model.eval()` is used to put the model in evaluation mode, so dropout/batch-norm/etc. are turned off. \n", - "\n", - "The iteration loop is also wrapped in `torch.no_grad` to ensure we don't calculate any gradients. We also don't need to call `optimizer.zero_grad()` and `optimizer.step()`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, iterator, criterion, tag_pad_idx):\n", - " \n", - " epoch_loss = 0\n", - " epoch_acc = 0\n", - " \n", - " model.eval()\n", - " \n", - " with torch.no_grad():\n", - " \n", - " for batch in iterator:\n", - "\n", - " text = batch.text\n", - " tags = batch.udtags\n", - " \n", - " predictions = model(text)\n", - " \n", - " predictions = predictions.view(-1, predictions.shape[-1])\n", - " tags = tags.view(-1)\n", - " \n", - " loss = criterion(predictions, tags)\n", - " \n", - " acc = categorical_accuracy(predictions, tags, tag_pad_idx)\n", - "\n", - " epoch_loss += loss.item()\n", - " epoch_acc += acc.item()\n", - " \n", - " return epoch_loss / len(iterator), epoch_acc / len(iterator)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we have a small function that tells us how long an epoch takes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def epoch_time(start_time, end_time):\n", - " elapsed_time = end_time - start_time\n", - " elapsed_mins = int(elapsed_time / 60)\n", - " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n", - " return elapsed_mins, elapsed_secs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we train our model!\n", - "\n", - "After each epoch we check if our model has achieved the best validation loss so far. If it has then we save the parameters of this model and we will use these \"best\" parameters to calculate performance over our test set." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "N_EPOCHS = 15\n", - "\n", - "best_valid_loss = float('inf')\n", - "\n", - "for epoch in range(N_EPOCHS):\n", - "\n", - " start_time = time.time()\n", - " \n", - " train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)\n", - " valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)\n", - " \n", - " end_time = time.time()\n", - "\n", - " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", - " \n", - " if valid_loss < best_valid_loss:\n", - " best_valid_loss = valid_loss\n", - " torch.save(model.state_dict(), 'tut1-model.pt')\n", - " \n", - " print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')\n", - " print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n", - " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We then load our \"best\" parameters and evaluate performance on the test set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.load_state_dict(torch.load('tut1-model.pt'))\n", - "\n", - "test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)\n", - "\n", - "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inference\n", - "\n", - "88% accuracy looks pretty good, but let's see our model tag some actual sentences.\n", - "\n", - "We define a `tag_sentence` function that will:\n", - "- put the model into evaluation mode\n", - "- tokenize the sentence with spaCy if it is not a list\n", - "- lowercase the tokens if the `Field` did\n", - "- numericalize the tokens using the vocabulary\n", - "- find out which tokens are not in the vocabulary, i.e. 
are `` tokens\n", - "- convert the numericalized tokens into a tensor and add a batch dimension\n", - "- feed the tensor into the model\n", - "- get the predictions over the sentence\n", - "- convert the predictions into readable tags\n", - "\n", - "As well as returning the tokens and tags, it also returns which tokens were `` tokens." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def tag_sentence(model, device, sentence, text_field, tag_field):\n", - " \n", - " model.eval()\n", - " \n", - " if isinstance(sentence, str):\n", - " nlp = spacy.load('en')\n", - " tokens = [token.text for token in nlp(sentence)]\n", - " else:\n", - " tokens = [token for token in sentence]\n", - "\n", - " if text_field.lower:\n", - " tokens = [t.lower() for t in tokens]\n", - " \n", - " numericalized_tokens = [text_field.vocab.stoi[t] for t in tokens]\n", - "\n", - " unk_idx = text_field.vocab.stoi[text_field.unk_token]\n", - " \n", - " unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]\n", - " \n", - " token_tensor = torch.LongTensor(numericalized_tokens)\n", - " \n", - " token_tensor = token_tensor.unsqueeze(-1).to(device)\n", - " \n", - " predictions = model(token_tensor)\n", - " \n", - " top_predictions = predictions.argmax(-1)\n", - " \n", - " predicted_tags = [tag_field.vocab.itos[t.item()] for t in top_predictions]\n", - " \n", - " return tokens, predicted_tags, unks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll get an already tokenized example from the training set and test our model's performance." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "example_index = 1\n", - "\n", - "sentence = vars(train_data.examples[example_index])['text']\n", - "actual_tags = vars(train_data.examples[example_index])['udtags']\n", - "\n", - "print(sentence)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can then use our `tag_sentence` function to get the tags. Notice how the tokens referring to subject of the sentence, the \"respected cleric\", are both `` tokens!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tokens, pred_tags, unks = tag_sentence(model, \n", - " device, \n", - " sentence, \n", - " TEXT, \n", - " UD_TAGS)\n", - "\n", - "print(unks)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can then check how well it did. Surprisingly, it got every token correct, including the two that were unknown tokens!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Pred. Tag\\tActual Tag\\tCorrect?\\tToken\\n\")\n", - "\n", - "for token, pred_tag, actual_tag in zip(tokens, pred_tags, actual_tags):\n", - " correct = '✔' if pred_tag == actual_tag else '✘'\n", - " print(f\"{pred_tag}\\t\\t{actual_tag}\\t\\t{correct}\\t\\t{token}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now make up our own sentence and see how well the model does.\n", - "\n", - "Our example sentence below has every token within the model's vocabulary." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = 'The Queen will deliver a speech about the conflict in North Korea at 1pm tomorrow.'\n", - "\n", - "tokens, tags, unks = tag_sentence(model, \n", - " device, \n", - " sentence, \n", - " TEXT, \n", - " UD_TAGS)\n", - "\n", - "print(unks)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looking at the sentence it seems like it gave sensible tags to every token!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Pred. Tag\\tToken\\n\")\n", - "\n", - "for token, tag in zip(tokens, tags):\n", - " print(f\"{tag}\\t\\t{token}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've now seen how to implement PoS tagging with PyTorch and TorchText! \n", - "\n", - "The BiLSTM isn't a state-of-the-art model, in terms of performance, but is a strong baseline for PoS tasks and is a good tool to have in your arsenal." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Going deeper\n", - "What if we could combine word-level and char-level approaches? \n", - "![title](https://i.postimg.cc/tT9hsBfj/ive-put-an-rnn-in-your-rnn-so-you-can-train-an-rnn-on-every-step-of-your-rnn-training-loop.jpg)\n", - "\n", - "\n", - "Actually, we can. 
Let's use LSTM or GRU to generate embedding for every word on char-level.\n", - "![title](https://guillaumegenthial.github.io/assets/char_representation.png)\n", - "*Image source: https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html*\n", - "\n", - "![title](https://guillaumegenthial.github.io/assets/bi-lstm.png)\n", - "*Image source: https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To do that we need to make few adjustments to the code above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Now lets try both word and character embeddings\n", - "WORD = data.Field(lower = True)\n", - "UD_TAG = data.Field(unk_token = None)\n", - "PTB_TAG = data.Field(unk_token = None)\n", - "\n", - "# We'll use NestedField to tokenize each word into list of chars\n", - "CHAR_NESTING = data.Field(tokenize=list, init_token=\"\", eos_token=\"\")\n", - "CHAR = data.NestedField(CHAR_NESTING)#, init_token=\"\", eos_token=\"\")\n", - "\n", - "fields = [(('word', 'char'), (WORD, CHAR)), ('udtag', UD_TAG), ('ptbtag', PTB_TAG)]\n", - "train_data, valid_data, test_data = datasets.UDPOS.splits(fields)\n", - "# train, val, test = datasets.UDPOS.splits(fields=fields)\n", - "\n", - "print(train_data.fields)\n", - "print(len(train_data))\n", - "print(vars(train_data[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "WORD.build_vocab(\n", - " train_data,\n", - " min_freq = MIN_FREQ,\n", - " vectors=\"glove.6B.100d\",\n", - " unk_init = torch.Tensor.normal_\n", - ")\n", - "\n", - "\n", - "CHAR.build_vocab(train_data)\n", - "UD_TAG.build_vocab(train_data)\n", - "PTB_TAG.build_vocab(train_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Unique tokens in WORD 
vocabulary: {len(WORD.vocab)}\")\n", - "print(f\"Unique tokens in CHAR vocabulary: {len(CHAR.vocab)}\")\n", - "print(f\"Unique tokens in UD_TAG vocabulary: {len(UD_TAG.vocab)}\")\n", - "print(f\"Unique tokens in PTB_TAG vocabulary: {len(PTB_TAG.vocab)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "BATCH_SIZE = 64\n", - "\n", - "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n", - " (train_data, valid_data, test_data), \n", - " batch_size = BATCH_SIZE,\n", - " device = device)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch = next(iter(train_iterator))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "text = batch.word\n", - "chars = batch.char\n", - "tags = batch.udtag\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class BiLSTMPOSTaggerWithChars(nn.Module):\n", - " def __init__(self, \n", - " word_input_dim, \n", - " word_embedding_dim,\n", - " char_input_dim,\n", - " char_embedding_dim,\n", - " char_hidden_dim,\n", - " hidden_dim,\n", - " output_dim, \n", - " n_layers, \n", - " bidirectional, \n", - " dropout, \n", - " pad_idx):\n", - " \n", - " super().__init__()\n", - " \n", - " self.char_embedding = # YOUR CODE HERE\n", - " self.char_gru = # YOUR CODE HERE\n", - " \n", - " self.word_embedding = nn.Embedding(word_input_dim, word_embedding_dim, padding_idx = pad_idx)\n", - " self.lstm = nn.LSTM(word_embedding_dim + # YOUR CODE HERE, \n", - " hidden_dim, \n", - " num_layers = n_layers, \n", - " bidirectional = bidirectional,\n", - " dropout = dropout if n_layers > 
1 else 0)\n", - " \n", - " self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)\n", - " \n", - " self.dropout = nn.Dropout(dropout)\n", - " \n", - " def forward(self, text, chars):\n", - "\n", - " #text = [sent len, batch size]\n", - " \n", - " #pass text through embedding layer\n", - " embedded = self.dropout(self.word_embedding(text))\n", - " #embedded = [sent len, batch size, emb dim]\n", - " \n", - " chars_embedded = # YOUR CODE HERE\n", - " hid_from_chars = # YOUR CODE HERE\n", - " \n", - " embedded_with_chars = torch.cat([embedded, hid_from_chars], dim=2)\n", - " \n", - " \n", - " #pass embeddings into LSTM\n", - " outputs, (hidden, cell) = self.lstm(embedded_with_chars)\n", - "# outputs, (hidden, cell) = self.lstm(hid)\n", - "\n", - " \n", - " #outputs holds the backward and forward hidden states in the final layer\n", - " #hidden and cell are the backward and forward hidden and cell states at the final time-step\n", - " \n", - " #output = [sent len, batch size, hid dim * n directions]\n", - " #hidden/cell = [n layers * n directions, batch size, hid dim]\n", - " \n", - " #we use our outputs to make a prediction of what the tag should be\n", - " predictions = self.fc(self.dropout(outputs))\n", - " \n", - " #predictions = [sent len, batch size, output dim]\n", - " \n", - " return predictions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "INPUT_DIM = len(WORD.vocab)\n", - "EMBEDDING_DIM = 100\n", - "HIDDEN_DIM = 160\n", - "CHAR_INPUT_DIM = 112\n", - "CHAR_EMBEDDING_DIM = 30\n", - "CHAR_HIDDEN_DIM = 30\n", - "OUTPUT_DIM = len(UD_TAGS.vocab)\n", - "N_LAYERS = 2\n", - "BIDIRECTIONAL = True\n", - "DROPOUT = 0.25\n", - "PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]\n", - "\n", - "model = BiLSTMPOSTaggerWithChars(\n", - " INPUT_DIM, \n", - " EMBEDDING_DIM,\n", - " CHAR_INPUT_DIM,\n", - " CHAR_EMBEDDING_DIM,\n", - " CHAR_HIDDEN_DIM,\n", - " HIDDEN_DIM, \n", - " OUTPUT_DIM, 
\n", - " N_LAYERS, \n", - " BIDIRECTIONAL, \n", - " DROPOUT, \n", - " PAD_IDX\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Congratulations, you've got LSTM which relies on GRU output on each step.**\n", - "\n", - "Now we need only to train it. Same actions, very small adjustments." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def init_weights(m):\n", - " for name, param in m.named_parameters():\n", - " nn.init.normal_(param.data, mean = 0, std = 0.1)\n", - " \n", - "model.apply(init_weights)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def count_parameters(model):\n", - " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", - "\n", - "print(f'The model has {count_parameters(model):,} trainable parameters')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pretrained_embeddings = TEXT.vocab.vectors\n", - "\n", - "print(pretrained_embeddings.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.word_embedding.weight.data.copy_(pretrained_embeddings)\n", - "model.word_embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)\n", - "\n", - "print(model.word_embedding.weight.data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = optim.Adam(model.parameters())\n", - "\n", - "TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]\n", - "\n", - "criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)\n", - "\n", - "model = model.to(device)\n", - "criterion = criterion.to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def train(model, iterator, optimizer, criterion, 
tag_pad_idx):\n", - " \n", - " epoch_loss = 0\n", - " epoch_acc = 0\n", - " \n", - " model.train()\n", - " \n", - " for batch in iterator:\n", - " \n", - " text = batch.word\n", - " chars = batch.char\n", - " tags = batch.udtag\n", - " \n", - " optimizer.zero_grad()\n", - " \n", - " #text = [sent len, batch size]\n", - " \n", - " predictions = model(text, chars)\n", - " \n", - " #predictions = [sent len, batch size, output dim]\n", - " #tags = [sent len, batch size]\n", - " \n", - " predictions = predictions.view(-1, predictions.shape[-1])\n", - " tags = tags.view(-1)\n", - " \n", - " #predictions = [sent len * batch size, output dim]\n", - " #tags = [sent len * batch size]\n", - " \n", - " loss = criterion(predictions, tags)\n", - " \n", - " acc = categorical_accuracy(predictions, tags, tag_pad_idx)\n", - " \n", - " loss.backward()\n", - " \n", - " optimizer.step()\n", - " \n", - " epoch_loss += loss.item()\n", - " epoch_acc += acc.item()\n", - " \n", - " return epoch_loss / len(iterator), epoch_acc / len(iterator)\n", - "\n", - "\n", - "def evaluate(model, iterator, criterion, tag_pad_idx):\n", - " \n", - " epoch_loss = 0\n", - " epoch_acc = 0\n", - " \n", - " model.eval()\n", - " \n", - " with torch.no_grad():\n", - " \n", - " for batch in iterator:\n", - "\n", - " text = batch.word\n", - " chars = batch.char\n", - " tags = batch.udtag\n", - " \n", - " predictions = model(text, chars)\n", - " \n", - " predictions = predictions.view(-1, predictions.shape[-1])\n", - " tags = tags.view(-1)\n", - " \n", - " loss = criterion(predictions, tags)\n", - " \n", - " acc = categorical_accuracy(predictions, tags, tag_pad_idx)\n", - "\n", - " epoch_loss += loss.item()\n", - " epoch_acc += acc.item()\n", - " \n", - " return epoch_loss / len(iterator), epoch_acc / len(iterator)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "N_EPOCHS = 15\n", - "\n", - "best_valid_loss = float('inf')\n", - "\n", - "for epoch 
in range(N_EPOCHS):\n", - "\n", - " start_time = time.time()\n", - " \n", - " train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)\n", - " valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)\n", - " \n", - " end_time = time.time()\n", - "\n", - " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", - " \n", - " if valid_loss < best_valid_loss:\n", - " best_valid_loss = valid_loss\n", - " torch.save(model.state_dict(), 'tut2-model.pt')\n", - " \n", - " print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')\n", - " print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n", - " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Let's take a look at the model from the last epoch\n", - "test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)\n", - "\n", - "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# And at the best checkpoint (based on validation score)\n", - "model.load_state_dict(torch.load('tut2-model.pt'))\n", - "\n", - "test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)\n", - "\n", - "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Py3 research env", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git 
a/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging__completed.ipynb b/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging__completed.ipynb deleted file mode 100644 index cd5e9a7..0000000 --- a/week05_transformer_pos_tagging/week05_bilstm_for_pos_tagging__completed.ipynb +++ /dev/null @@ -1,2358 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Practice: BiLSTM for PoS Tagging\n", - "*This notebook is based on [open-source implementation](https://github.com/bentrevett/pytorch-pos-tagging) of PoS Tagging in PyTorch.*\n", - "\n", - "### Introduction\n", - "\n", - "In this series we'll be building a machine learning model that produces an output for every element in an input sequence, using PyTorch and TorchText. Specifically, we will be inputting a sequence of text and the model will output a part-of-speech (PoS) tag for each token in the input text. This can also be used for named entity recognition (NER), where the output for each token will be what type of entity, if any, the token is.\n", - "\n", - "In this notebook, we'll be implementing a multi-layer bi-directional LSTM (BiLSTM) to predict PoS tags using the Universal Dependencies English Web Treebank (UDPOS) dataset.\n", - "\n", - "### Preparing Data\n", - "\n", - "First, let's import the necessary Python modules." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "\n", - "from torchtext.legacy import data\n", - "from torchtext.legacy import datasets\n", - "\n", - "import spacy\n", - "import numpy as np\n", - "\n", - "import time\n", - "import random" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we'll set the random seeds for reproducability." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "SEED = 1234\n", - "\n", - "random.seed(SEED)\n", - "np.random.seed(SEED)\n", - "torch.manual_seed(SEED)\n", - "torch.backends.cudnn.deterministic = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One of the key parts of TorchText is the `Field`. The `Field` handles how your dataset is processed.\n", - "\n", - "Our `TEXT` field handles how the text that we need to tag is dealt with. All we do here is set `lower = True` which lowercases all of the text.\n", - "\n", - "Next we'll define the `Fields` for the tags. This dataset actually has two different sets of tags, [universal dependency (UD) tags](https://universaldependencies.org/u/pos/) and [Penn Treebank (PTB) tags](https://www.sketchengine.eu/penn-treebank-tagset/). We'll only train our model on the UD tags, but will load the PTB tags to show how they could be used instead.\n", - "\n", - "`UD_TAGS` handles how the UD tags should be handled. Our `TEXT` vocabulary - which we'll build later - will have *unknown* tokens in it, i.e. tokens that are not within our vocabulary. However, we won't have unknown tags as we are dealing with a finite set of possible tags. TorchText `Fields` initialize a default unknown token, ``, which we remove by setting `unk_token = None`.\n", - "\n", - "`PTB_TAGS` does the same as `UD_TAGS`, but handles the PTB tags instead." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "TEXT = data.Field(lower = True)\n", - "UD_TAGS = data.Field(unk_token = None)\n", - "PTB_TAGS = data.Field(unk_token = None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We then define `fields`, which handles passing our fields to the dataset.\n", - "\n", - "Note that order matters, if you only wanted to load the PTB tags your field would be:\n", - "\n", - "```\n", - "fields = ((\"text\", TEXT), (None, None), (\"ptbtags\", PTB_TAGS))\n", - "```\n", - "\n", - "Where `None` tells TorchText to not load those tags." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "fields = ((\"text\", TEXT), (\"udtags\", UD_TAGS), (\"ptbtags\", PTB_TAGS))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we load the UDPOS dataset using our defined fields." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloading en-ud-v2.zip\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "en-ud-v2.zip: 100%|██████████| 688k/688k [00:00<00:00, 1.02MB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "extracting\n" - ] - } - ], - "source": [ - "train_data, valid_data, test_data = datasets.UDPOS.splits(fields)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can check how many examples are in each section of the dataset by checking their length." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of training examples: 12543\n", - "Number of validation examples: 2002\n", - "Number of testing examples: 2077\n" - ] - } - ], - "source": [ - "print(f\"Number of training examples: {len(train_data)}\")\n", - "print(f\"Number of validation examples: {len(valid_data)}\")\n", - "print(f\"Number of testing examples: {len(test_data)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's print out an example:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'text': ['i', 'will', 'never', 'return', 'there', 'again', '(', 'and', 'now', 'have', 'some', 'serious', 'doubts', 'about', 'the', 'quality', 'of', 'work', 'they', 'actually', 'performed', 'on', 'my', 'car', ')', '.'], 'udtags': ['PRON', 'AUX', 'ADV', 'VERB', 'ADV', 'ADV', 'PUNCT', 'CCONJ', 'ADV', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'PRON', 'ADV', 'VERB', 'ADP', 'PRON', 'NOUN', 'PUNCT', 'PUNCT'], 'ptbtags': ['PRP', 'MD', 'RB', 'VB', 'RB', 'RB', '-LRB-', 'CC', 'RB', 'VBP', 'DT', 'JJ', 'NNS', 'IN', 'DT', 'NN', 'IN', 'NN', 'PRP', 'RB', 'VBD', 'IN', 'PRP$', 'NN', '-RRB-', '.']}\n" - ] - } - ], - "source": [ - "print(vars(train_data.examples[-1]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also view the text and tags separately:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['i', 'will', 'never', 'return', 'there', 'again', '(', 'and', 'now', 'have', 'some', 'serious', 'doubts', 'about', 'the', 'quality', 'of', 'work', 'they', 
'actually', 'performed', 'on', 'my', 'car', ')', '.']\n" - ] - } - ], - "source": [ - "print(vars(train_data.examples[-1])['text'])" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['PRON', 'AUX', 'ADV', 'VERB', 'ADV', 'ADV', 'PUNCT', 'CCONJ', 'ADV', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'PRON', 'ADV', 'VERB', 'ADP', 'PRON', 'NOUN', 'PUNCT', 'PUNCT']\n" - ] - } - ], - "source": [ - "print(vars(train_data.examples[-1])['udtags'])" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['PRP', 'MD', 'RB', 'VB', 'RB', 'RB', '-LRB-', 'CC', 'RB', 'VBP', 'DT', 'JJ', 'NNS', 'IN', 'DT', 'NN', 'IN', 'NN', 'PRP', 'RB', 'VBD', 'IN', 'PRP$', 'NN', '-RRB-', '.']\n" - ] - } - ], - "source": [ - "print(vars(train_data.examples[-1])['ptbtags'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we'll build the vocabulary - a mapping of tokens to integers. \n", - "\n", - "We want some unknown tokens within our dataset in order to replicate how this model would be used in real life, so we set the `min_freq` to 2 which means only tokens that appear at least twice in the training set will be added to the vocabulary and the rest will be replaced by `<unk>` tokens.\n", - "\n", - "We also load the [GloVe](https://nlp.stanford.edu/projects/glove/) pre-trained token embeddings. Specifically, the 100-dimensional embeddings that have been trained on 6 billion tokens. Using pre-trained embeddings usually leads to improved performance - although admittedly the dataset used in this tutorial is too small to take advantage of the pre-trained embeddings. \n", - "\n", - "`unk_init` is used to initialize the token embeddings which are not in the pre-trained embedding vocabulary. 
By default this sets those embeddings to zeros, however it is better to not have them all initialized to the same value, so we initialize them from a Normal/Gaussian distribution.\n", - "\n", - "These pre-trained vectors are now loaded into our vocabulary and we will initialize our model with these values later." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████▉| 398413/400000 [00:16<00:00, 24234.96it/s]" - ] - } - ], - "source": [ - "MIN_FREQ = 2\n", - "\n", - "TEXT.build_vocab(train_data, \n", - " min_freq = MIN_FREQ,\n", - " vectors = \"glove.6B.100d\",\n", - " unk_init = torch.Tensor.normal_)\n", - "\n", - "\n", - "UD_TAGS.build_vocab(train_data)\n", - "PTB_TAGS.build_vocab(train_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can check how many tokens and tags are in our vocabulary by getting their length:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Unique tokens in TEXT vocabulary: 8866\n", - "Unique tokens in UD_TAG vocabulary: 18\n", - "Unique tokens in PTB_TAG vocabulary: 51\n" - ] - } - ], - "source": [ - "print(f\"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}\")\n", - "print(f\"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}\")\n", - "print(f\"Unique tokens in PTB_TAG vocabulary: {len(PTB_TAGS.vocab)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Exploring the vocabulary, we can check the most common tokens within our texts:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('the', 9076), ('.', 8640), (',', 7021), ('to', 5137), ('and', 5002), ('a', 3782), ('of', 3622), ('i', 3379), ('in', 3112), ('is', 2239), ('you', 
2156), ('that', 2036), ('it', 1850), ('for', 1842), ('-', 1426), ('have', 1359), ('\"', 1296), ('on', 1273), ('was', 1244), ('with', 1216)]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████▉| 398413/400000 [00:30<00:00, 24234.96it/s]" - ] - } - ], - "source": [ - "print(TEXT.vocab.freqs.most_common(20))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see the vocabularies for both of our tags:" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['', 'NOUN', 'PUNCT', 'VERB', 'PRON', 'ADP', 'DET', 'PROPN', 'ADJ', 'AUX', 'ADV', 'CCONJ', 'PART', 'NUM', 'SCONJ', 'X', 'INTJ', 'SYM']\n" - ] - } - ], - "source": [ - "print(UD_TAGS.vocab.itos)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['', 'NN', 'IN', 'DT', 'NNP', 'PRP', 'JJ', 'RB', '.', 'VB', 'NNS', ',', 'CC', 'VBD', 'VBP', 'VBZ', 'CD', 'VBN', 'VBG', 'MD', 'TO', 'PRP$', '-RRB-', '-LRB-', 'WDT', 'WRB', ':', '``', \"''\", 'WP', 'RP', 'UH', 'POS', 'HYPH', 'JJR', 'NNPS', 'JJS', 'EX', 'NFP', 'GW', 'ADD', 'RBR', '$', 'PDT', 'RBS', 'SYM', 'LS', 'FW', 'AFX', 'WP$', 'XX']\n" - ] - } - ], - "source": [ - "print(PTB_TAGS.vocab.itos)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also see how many of each tag are in our vocabulary:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('NOUN', 34781), ('PUNCT', 23679), ('VERB', 23081), ('PRON', 18577), ('ADP', 17638), ('DET', 16285), ('PROPN', 12946), ('ADJ', 12477), ('AUX', 12343), ('ADV', 10548), ('CCONJ', 6707), ('PART', 5567), ('NUM', 3999), ('SCONJ', 3843), ('X', 847), ('INTJ', 688), ('SYM', 599)]\n" - ] - } - ], - "source": [ - 
def tag_percentage(tag_counts):
    """
    Attach each tag's share of the total count to its (tag, count) pair.

    Args:
        tag_counts: iterable of (tag, count) pairs, e.g. the list returned by
            `Counter.most_common()`. One-shot iterables (generators, `zip`,
            `map`) are accepted as well.

    Returns:
        A list of (tag, count, fraction) tuples in input order, where
        fraction is count / sum of all counts (multiply by 100 for a
        percentage). An empty input gives an empty list.

    Raises:
        ZeroDivisionError: if the input is non-empty but the counts sum to zero.
    """
    #the pairs are walked twice (once to total, once to divide), so materialize
    #them first - otherwise a generator is exhausted by the sum and the result
    #silently comes back empty
    pairs = list(tag_counts)

    total_count = sum(count for _, count in pairs)

    return [(tag, count, count / total_count) for tag, count in pairs]
5.2%\n", - "CCONJ\t\t6707\t\t 3.3%\n", - "PART\t\t5567\t\t 2.7%\n", - "NUM\t\t3999\t\t 2.0%\n", - "SCONJ\t\t3843\t\t 1.9%\n", - "X\t\t847\t\t 0.4%\n", - "INTJ\t\t688\t\t 0.3%\n", - "SYM\t\t599\t\t 0.3%\n" - ] - } - ], - "source": [ - "print(\"Tag\\t\\tCount\\t\\tPercentage\\n\")\n", - "\n", - "for tag, count, percent in tag_percentage(UD_TAGS.vocab.freqs.most_common()):\n", - " print(f\"{tag}\\t\\t{count}\\t\\t{percent*100:4.1f}%\")" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tag\t\tCount\t\tPercentage\n", - "\n", - "NN\t\t26915\t\t13.2%\n", - "IN\t\t20724\t\t10.1%\n", - "DT\t\t16817\t\t 8.2%\n", - "NNP\t\t12449\t\t 6.1%\n", - "PRP\t\t12193\t\t 6.0%\n", - "JJ\t\t11591\t\t 5.7%\n", - "RB\t\t10831\t\t 5.3%\n", - ".\t\t10317\t\t 5.0%\n", - "VB\t\t9476\t\t 4.6%\n", - "NNS\t\t8438\t\t 4.1%\n", - ",\t\t8062\t\t 3.9%\n", - "CC\t\t6706\t\t 3.3%\n", - "VBD\t\t5402\t\t 2.6%\n", - "VBP\t\t5374\t\t 2.6%\n", - "VBZ\t\t4578\t\t 2.2%\n", - "CD\t\t3998\t\t 2.0%\n", - "VBN\t\t3967\t\t 1.9%\n", - "VBG\t\t3330\t\t 1.6%\n", - "MD\t\t3294\t\t 1.6%\n", - "TO\t\t3286\t\t 1.6%\n", - "PRP$\t\t3068\t\t 1.5%\n", - "-RRB-\t\t1008\t\t 0.5%\n", - "-LRB-\t\t973\t\t 0.5%\n", - "WDT\t\t948\t\t 0.5%\n", - "WRB\t\t869\t\t 0.4%\n", - ":\t\t866\t\t 0.4%\n", - "``\t\t813\t\t 0.4%\n", - "''\t\t785\t\t 0.4%\n", - "WP\t\t760\t\t 0.4%\n", - "RP\t\t755\t\t 0.4%\n", - "UH\t\t689\t\t 0.3%\n", - "POS\t\t684\t\t 0.3%\n", - "HYPH\t\t664\t\t 0.3%\n", - "JJR\t\t503\t\t 0.2%\n", - "NNPS\t\t498\t\t 0.2%\n", - "JJS\t\t383\t\t 0.2%\n", - "EX\t\t359\t\t 0.2%\n", - "NFP\t\t338\t\t 0.2%\n", - "GW\t\t294\t\t 0.1%\n", - "ADD\t\t292\t\t 0.1%\n", - "RBR\t\t276\t\t 0.1%\n", - "$\t\t258\t\t 0.1%\n", - "PDT\t\t175\t\t 0.1%\n", - "RBS\t\t169\t\t 0.1%\n", - "SYM\t\t156\t\t 0.1%\n", - "LS\t\t117\t\t 0.1%\n", - "FW\t\t93\t\t 0.0%\n", - "AFX\t\t48\t\t 0.0%\n", - "WP$\t\t15\t\t 0.0%\n", - "XX\t\t1\t\t 0.0%\n" - ] - } - 
], - "source": [ - "print(\"Tag\\t\\tCount\\t\\tPercentage\\n\")\n", - "\n", - "for tag, count, percent in tag_percentage(PTB_TAGS.vocab.freqs.most_common()):\n", - " print(f\"{tag}\\t\\t{count}\\t\\t{percent*100:4.1f}%\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The final part of data preparation is handling the iterator. \n", - "\n", - "This will be iterated over to return batches of data to process. Here, we set the batch size and the `device` - which is used to place the batches of tensors on our GPU, if we have one. " - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "BATCH_SIZE = 128\n", - "\n", - "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n", - " (train_data, valid_data, test_data), \n", - " batch_size = BATCH_SIZE,\n", - " device = device)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building the Model\n", - "\n", - "Next up, we define our model - a multi-layer bi-directional LSTM. The image below shows a simplified version of the model with only one LSTM layer and omitting the LSTM's cell state for clarity.\n", - "\n", - "![](assets/pos-bidirectional-lstm.png)\n", - "\n", - "The model takes in a sequence of tokens, $X = \\{x_1, x_2,...,x_T\\}$, passes them through an embedding layer, $e$, to get the token embeddings, $e(X) = \\{e(x_1), e(x_2), ..., e(x_T)\\}$.\n", - "\n", - "These embeddings are processed - one per time-step - by the forward and backward LSTMs. The forward LSTM processes the sequence from left-to-right, whilst the backward LSTM processes the sequence right-to-left, i.e. the first input to the forward LSTM is $x_1$ and the first input to the backward LSTM is $x_T$. 
 \n", - "\n", - "The LSTMs also take in the hidden, $h$, and cell, $c$, states from the previous time-step:\n", - "\n", - "$$h^{\\rightarrow}_t = \\text{LSTM}^{\\rightarrow}(e(x^{\\rightarrow}_t), h^{\\rightarrow}_{t-1}, c^{\\rightarrow}_{t-1})$$\n", - "$$h^{\\leftarrow}_t=\\text{LSTM}^{\\leftarrow}(e(x^{\\leftarrow}_t), h^{\\leftarrow}_{t-1}, c^{\\leftarrow}_{t-1})$$\n", - "\n", - "After the whole sequence has been processed, the hidden and cell states are then passed to the next layer of the LSTM.\n", - "\n", - "The initial hidden and cell states, $h_0$ and $c_0$, for each direction and layer are initialized to a tensor full of zeros.\n", - "\n", - "We then concatenate both the forward and backward hidden states from the final layer of the LSTM, $H = \\{h_1, h_2, ... h_T\\}$, where $h_1 = [h^{\\rightarrow}_1;h^{\\leftarrow}_T]$, $h_2 = [h^{\\rightarrow}_2;h^{\\leftarrow}_{T-1}]$, etc. and pass them through a linear layer, $f$, which is used to make the prediction of which tag applies to this token, $\\hat{y}_t = f(h_t)$.\n", - "\n", - "When training the model, we will compare our predicted tags, $\\hat{Y}$ against the actual tags, $Y$, to calculate a loss, the gradients w.r.t. that loss, and then update our parameters.\n", - "\n", - "We implement the model detailed above in the `BiLSTMPOSTagger` class.\n", - "\n", - "`nn.Embedding` is an embedding layer and the input dimension should be the size of the input (text) vocabulary. We tell it what the index of the padding token is so it does not update the padding token's embedding entry.\n", - "\n", - "`nn.LSTM` is the LSTM. We apply dropout as regularization between the layers, if we are using more than one.\n", - "\n", - "`nn.Linear` defines the linear layer to make predictions using the LSTM outputs. We double the size of the input if we are using a bi-directional LSTM. 
class BiLSTMPOSTagger(nn.Module):
    """
    Per-token tagger: embedding -> (optionally stacked / bidirectional) LSTM -> linear layer.

    Dropout is applied to the embeddings and to the LSTM outputs before the
    linear layer; the LSTM's own inter-layer dropout is only enabled when
    there is more than one layer.

    Args:
        input_dim: number of rows in the embedding table (text vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of scores produced per token (tag vocabulary size).
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability.
        pad_idx: index of the padding token in the text vocabulary.
    """
    def __init__(self,
                 input_dim,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout,
                 pad_idx):

        super().__init__()

        #dropout between LSTM layers only makes sense for a stacked LSTM
        lstm_dropout = dropout if n_layers > 1 else 0

        #a bidirectional LSTM emits forward and backward states side by side
        n_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(num_embeddings = input_dim,
                                      embedding_dim = embedding_dim,
                                      padding_idx = pad_idx)

        self.lstm = nn.LSTM(input_size = embedding_dim,
                            hidden_size = hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = lstm_dropout)

        self.fc = nn.Linear(hidden_dim * n_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """
        Score every tag for every token.

        text = [sent len, batch size]
        returns predictions = [sent len, batch size, output dim]
        """
        token_vectors = self.embedding(text)
        embedded = self.dropout(token_vectors)

        #embedded = [sent len, batch size, emb dim]

        #the final hidden/cell states are returned by the LSTM but not used here
        outputs, _ = self.lstm(embedded)

        #outputs = [sent len, batch size, hid dim * n directions]

        return self.fc(self.dropout(outputs))
def init_weights(m):
    """
    Overwrite every parameter reachable from `m` with draws from a Normal
    distribution (mean 0, std 0.1).

    Written to be handed to `nn.Module.apply`. The parameter walk here is
    recursive and `apply` also visits each submodule, so nested parameters
    are drawn more than once - the last draw is the one that sticks.
    """
    for param in m.parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)


def count_parameters(model):
    """
    Return the total number of scalar entries across the parameters of
    `model` that have `requires_grad` set.
    """
    trainable = 0
    for p in model.parameters():
        if p.requires_grad:
            trainable += p.numel()
    return trainable
- ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([8866, 100])\n" - ] - } - ], - "source": [ - "pretrained_embeddings = TEXT.vocab.vectors\n", - "\n", - "print(pretrained_embeddings.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [], - "source": [ - "a = next(model.embedding.parameters())" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([8866, 100])" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[-0.1117, -0.4966, 0.1631, ..., 1.2647, -0.2753, -0.1325],\n", - " [-0.8555, -0.7208, 1.3755, ..., 0.0825, -1.1314, 0.3997],\n", - " [-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],\n", - " ...,\n", - " [ 0.9261, 2.3049, 0.5502, ..., -0.3492, -0.5298, -0.1577],\n", - " [-0.5972, 0.0471, -0.2406, ..., -0.9446, -0.1126, -0.2260],\n", - " [-0.4809, 2.5629, 0.9530, ..., 0.5278, -0.4588, 0.7294]])" - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.embedding.weight.data.copy_(pretrained_embeddings)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It's common to initialize the embedding of the pad token to all zeros. This, along with setting the `padding_idx` in the model's embedding layer, means that the embedding should always output a tensor full of zeros when a pad token is input." 
- ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[-0.1117, -0.4966, 0.1631, ..., 1.2647, -0.2753, -0.1325],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],\n", - " ...,\n", - " [ 0.9261, 2.3049, 0.5502, ..., -0.3492, -0.5298, -0.1577],\n", - " [-0.5972, 0.0471, -0.2406, ..., -0.9446, -0.1126, -0.2260],\n", - " [-0.4809, 2.5629, 0.9530, ..., 0.5278, -0.4588, 0.7294]])\n" - ] - } - ], - "source": [ - "model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)\n", - "\n", - "print(model.embedding.weight.data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We then define our optimizer, used to update our parameters w.r.t. their gradients. We use Adam with the default learning rate." - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = optim.Adam(model.parameters())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we define our loss function, cross-entropy loss.\n", - "\n", - "Even though we have no `<unk>` tokens within our tag vocab, we still have `<pad>` tokens. This is because all sentences within a batch need to be the same size. However, we don't want to calculate the loss when the target is a `<pad>` token as we aren't training our model to recognize padding tokens.\n", - "\n", - "We handle this by setting the `ignore_index` in our loss function to the index of the padding token in our tag vocabulary." 
- ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [], - "source": [ - "TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]\n", - "\n", - "criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We then place our model and loss function on our GPU, if we have one." - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [], - "source": [ - "model = model.to(device)\n", - "criterion = criterion.to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [], - "source": [ - "model.embedding.weight.requires_grad = True" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.embedding.weight.requires_grad" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will be using the loss value between our predicted and actual tags to train the network, but ideally we'd like a more interpretable way to see how well our model is doing - accuracy.\n", - "\n", - "The issue is that we don't want to calculate accuracy over the `<pad>` tokens as we aren't interested in predicting them.\n", - "\n", - "The function below only calculates accuracy over non-padded tokens. `non_pad_elements` is a tensor containing the indices of the non-pad tokens within an input batch. We then compare the predictions of those elements with the labels to get a count of how many predictions were correct. We then divide this by the number of non-pad elements to get our accuracy value over the batch." 
def categorical_accuracy(preds, y, tag_pad_idx):
    """
    Returns accuracy per batch over non-padding positions only, i.e. if you
    get 8/10 of the non-pad tags right, this returns 0.8, NOT 8.

    Args:
        preds: scores per position, [sent len * batch size, output dim]
        y: target tag indices, [sent len * batch size]
        tag_pad_idx: index of the padding tag; positions where `y` equals it
            are left out of both the numerator and the denominator.

    Returns:
        A float tensor of shape [1] on the same device as the inputs (call
        `.item()` for the Python number). A batch with no non-pad positions
        yields 0.0 rather than NaN, so one degenerate batch cannot poison a
        running average.
    """
    keep = y != tag_pad_idx
    n_tokens = int(keep.sum())

    #nothing to score: avoid the 0/0 that would otherwise produce NaN
    if n_tokens == 0:
        return torch.zeros(1, device = y.device)

    max_preds = preds.argmax(dim = 1) # get the index of the max probability
    n_correct = max_preds[keep].eq(y[keep]).sum()

    #divide by a plain int so no tensor has to be created on a specific device
    return (n_correct.float() / n_tokens).reshape(1)
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    """
    Run one pass over `iterator`, updating `model` after every batch.

    Each batch must expose `.text` and `.udtags`. Returns a tuple
    (loss, accuracy), each being the sum of the per-batch values divided by
    the number of batches.
    """
    model.train()

    total_loss = 0
    total_acc = 0

    for batch in iterator:

        optimizer.zero_grad()

        #batch.text = [sent len, batch size]

        scores = model(batch.text)

        #scores = [sent len, batch size, output dim]
        #batch.udtags = [sent len, batch size]

        #merge the sentence and batch axes so the loss sees one row per token
        n_tags = scores.shape[-1]
        flat_scores = scores.view(-1, n_tags)
        flat_tags = batch.udtags.view(-1)

        #flat_scores = [sent len * batch size, output dim]
        #flat_tags = [sent len * batch size]

        loss = criterion(flat_scores, flat_tags)

        acc = categorical_accuracy(flat_scores, flat_tags, tag_pad_idx)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()
        total_acc += acc.item()

    n_batches = len(iterator)

    return total_loss / n_batches, total_acc / n_batches
def evaluate(model, iterator, criterion, tag_pad_idx):
    """
    Score `model` over `iterator` without updating it.

    The model is switched to eval mode and the whole loop runs under
    `torch.no_grad()`. Each batch must expose `.text` and `.udtags`.
    Returns a tuple (loss, accuracy), each being the sum of the per-batch
    values divided by the number of batches.
    """
    model.eval()

    total_loss = 0
    total_acc = 0

    with torch.no_grad():

        for batch in iterator:

            scores = model(batch.text)

            #merge the sentence and batch axes so the loss sees one row per token
            flat_scores = scores.view(-1, scores.shape[-1])
            flat_tags = batch.udtags.view(-1)

            loss = criterion(flat_scores, flat_tags)

            acc = categorical_accuracy(flat_scores, flat_tags, tag_pad_idx)

            total_loss += loss.item()
            total_acc += acc.item()

    n_batches = len(iterator)

    return total_loss / n_batches, total_acc / n_batches


def epoch_time(start_time, end_time):
    """
    Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds, returned as (minutes, seconds).
    """
    duration = end_time - start_time
    whole_mins = int(duration / 60)
    leftover = duration - (whole_mins * 60)
    return whole_mins, int(leftover)
Acc: 78.59%\n", - "Epoch: 02 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.477 | Train Acc: 85.02%\n", - "\t Val. Loss: 0.499 | Val. Acc: 83.89%\n", - "Epoch: 03 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.347 | Train Acc: 89.12%\n", - "\t Val. Loss: 0.446 | Val. Acc: 85.16%\n", - "Epoch: 04 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.287 | Train Acc: 90.97%\n", - "\t Val. Loss: 0.406 | Val. Acc: 86.60%\n", - "Epoch: 05 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.250 | Train Acc: 92.03%\n", - "\t Val. Loss: 0.397 | Val. Acc: 86.90%\n", - "Epoch: 06 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.223 | Train Acc: 92.95%\n", - "\t Val. Loss: 0.384 | Val. Acc: 87.23%\n", - "Epoch: 07 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.203 | Train Acc: 93.52%\n", - "\t Val. Loss: 0.366 | Val. Acc: 87.35%\n", - "Epoch: 08 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.189 | Train Acc: 93.96%\n", - "\t Val. Loss: 0.360 | Val. Acc: 87.65%\n", - "Epoch: 09 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.175 | Train Acc: 94.35%\n", - "\t Val. Loss: 0.356 | Val. Acc: 87.71%\n", - "Epoch: 10 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.165 | Train Acc: 94.64%\n", - "\t Val. Loss: 0.366 | Val. Acc: 87.55%\n", - "Epoch: 11 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.154 | Train Acc: 95.05%\n", - "\t Val. Loss: 0.367 | Val. Acc: 87.82%\n", - "Epoch: 12 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.146 | Train Acc: 95.27%\n", - "\t Val. Loss: 0.349 | Val. Acc: 88.14%\n", - "Epoch: 13 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.139 | Train Acc: 95.43%\n", - "\t Val. Loss: 0.340 | Val. Acc: 88.35%\n", - "Epoch: 14 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.130 | Train Acc: 95.76%\n", - "\t Val. Loss: 0.338 | Val. Acc: 88.56%\n", - "Epoch: 15 | Epoch Time: 0m 2s\n", - "\tTrain Loss: 0.124 | Train Acc: 95.91%\n", - "\t Val. Loss: 0.336 | Val. 
Acc: 88.58%\n" - ] - } - ], - "source": [ - "N_EPOCHS = 15\n", - "\n", - "best_valid_loss = float('inf')\n", - "\n", - "for epoch in range(N_EPOCHS):\n", - "\n", - " start_time = time.time()\n", - " \n", - " train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)\n", - " valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)\n", - " \n", - " end_time = time.time()\n", - "\n", - " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", - " \n", - " if valid_loss < best_valid_loss:\n", - " best_valid_loss = valid_loss\n", - " torch.save(model.state_dict(), 'tut1-model.pt')\n", - " \n", - " print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')\n", - " print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n", - " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We then load our \"best\" parameters and evaluate performance on the test set." 
- ] - }, - { - "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test Loss: 0.349 | Test Acc: 88.66%\n" - ] - } - ], - "source": [ - "model.load_state_dict(torch.load('tut1-model.pt'))\n", - "\n", - "test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)\n", - "\n", - "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inference\n", - "\n", - "88% accuracy looks pretty good, but let's see our model tag some actual sentences.\n", - "\n", - "We define a `tag_sentence` function that will:\n", - "- put the model into evaluation mode\n", - "- tokenize the sentence with spaCy if it is not a list\n", - "- lowercase the tokens if the `Field` did\n", - "- numericalize the tokens using the vocabulary\n", - "- find out which tokens are not in the vocabulary, i.e. are `` tokens\n", - "- convert the numericalized tokens into a tensor and add a batch dimension\n", - "- feed the tensor into the model\n", - "- get the predictions over the sentence\n", - "- convert the predictions into readable tags\n", - "\n", - "As well as returning the tokens and tags, it also returns which tokens were `` tokens." 
- ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [], - "source": [ - "def tag_sentence(model, device, sentence, text_field, tag_field):\n", - " \n", - " model.eval()\n", - " \n", - " if isinstance(sentence, str):\n", - " nlp = spacy.load('en')\n", - " tokens = [token.text for token in nlp(sentence)]\n", - " else:\n", - " tokens = [token for token in sentence]\n", - "\n", - " if text_field.lower:\n", - " tokens = [t.lower() for t in tokens]\n", - " \n", - " numericalized_tokens = [text_field.vocab.stoi[t] for t in tokens]\n", - "\n", - " unk_idx = text_field.vocab.stoi[text_field.unk_token]\n", - " \n", - " unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]\n", - " \n", - " token_tensor = torch.LongTensor(numericalized_tokens)\n", - " \n", - " token_tensor = token_tensor.unsqueeze(-1).to(device)\n", - " \n", - " predictions = model(token_tensor)\n", - " \n", - " top_predictions = predictions.argmax(-1)\n", - " \n", - " predicted_tags = [tag_field.vocab.itos[t.item()] for t in top_predictions]\n", - " \n", - " return tokens, predicted_tags, unks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll get an already tokenized example from the training set and test our model's performance." - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['[', 'this', 'killing', 'of', 'a', 'respected', 'cleric', 'will', 'be', 'causing', 'us', 'trouble', 'for', 'years', 'to', 'come', '.', ']']\n" - ] - } - ], - "source": [ - "example_index = 1\n", - "\n", - "sentence = vars(train_data.examples[example_index])['text']\n", - "actual_tags = vars(train_data.examples[example_index])['udtags']\n", - "\n", - "print(sentence)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can then use our `tag_sentence` function to get the tags. 
Notice how the tokens referring to subject of the sentence, the \"respected cleric\", are both `` tokens!" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['respected', 'cleric']\n" - ] - } - ], - "source": [ - "tokens, pred_tags, unks = tag_sentence(model, \n", - " device, \n", - " sentence, \n", - " TEXT, \n", - " UD_TAGS)\n", - "\n", - "print(unks)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can then check how well it did. Surprisingly, it got every token correct, including the two that were unknown tokens!" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pred. Tag\tActual Tag\tCorrect?\tToken\n", - "\n", - "PUNCT\t\tPUNCT\t\t✔\t\tthe\n", - "DET\t\tDET\t\t✔\t\tqueen\n", - "NOUN\t\tNOUN\t\t✔\t\twill\n", - "ADP\t\tADP\t\t✔\t\tdeliver\n", - "DET\t\tDET\t\t✔\t\ta\n", - "ADJ\t\tADJ\t\t✔\t\tspeech\n", - "NOUN\t\tNOUN\t\t✔\t\tabout\n", - "AUX\t\tAUX\t\t✔\t\tthe\n", - "AUX\t\tAUX\t\t✔\t\tconflict\n", - "VERB\t\tVERB\t\t✔\t\tin\n", - "PRON\t\tPRON\t\t✔\t\tnorth\n", - "NOUN\t\tNOUN\t\t✔\t\tkorea\n", - "ADP\t\tADP\t\t✔\t\tat\n", - "NOUN\t\tNOUN\t\t✔\t\t1\n", - "PART\t\tPART\t\t✔\t\tpm\n", - "VERB\t\tVERB\t\t✔\t\ttomorrow\n", - "PUNCT\t\tPUNCT\t\t✔\t\t.\n" - ] - } - ], - "source": [ - "print(\"Pred. Tag\\tActual Tag\\tCorrect?\\tToken\\n\")\n", - "\n", - "for token, pred_tag, actual_tag in zip(tokens, pred_tags, actual_tags):\n", - " correct = '✔' if pred_tag == actual_tag else '✘'\n", - " print(f\"{pred_tag}\\t\\t{actual_tag}\\t\\t{correct}\\t\\t{token}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now make up our own sentence and see how well the model does.\n", - "\n", - "Our example sentence below has every token within the model's vocabulary." 
- ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[]\n" - ] - } - ], - "source": [ - "sentence = 'The Queen will deliver a speech about the conflict in North Korea at 1pm tomorrow.'\n", - "\n", - "tokens, tags, unks = tag_sentence(model, \n", - " device, \n", - " sentence, \n", - " TEXT, \n", - " UD_TAGS)\n", - "\n", - "print(unks)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looking at the sentence it seems like it gave sensible tags to every token!" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pred. Tag\tToken\n", - "\n", - "DET\t\tthe\n", - "NOUN\t\tqueen\n", - "AUX\t\twill\n", - "VERB\t\tdeliver\n", - "DET\t\ta\n", - "NOUN\t\tspeech\n", - "ADP\t\tabout\n", - "DET\t\tthe\n", - "NOUN\t\tconflict\n", - "ADP\t\tin\n", - "PROPN\t\tnorth\n", - "PROPN\t\tkorea\n", - "ADP\t\tat\n", - "NUM\t\t1\n", - "NOUN\t\tpm\n", - "NOUN\t\ttomorrow\n", - "PUNCT\t\t.\n" - ] - } - ], - "source": [ - "print(\"Pred. Tag\\tToken\\n\")\n", - "\n", - "for token, tag in zip(tokens, tags):\n", - " print(f\"{tag}\\t\\t{token}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've now seen how to implement PoS tagging with PyTorch and TorchText! \n", - "\n", - "The BiLSTM isn't a state-of-the-art model, in terms of performance, but is a strong baseline for PoS tasks and is a good tool to have in your arsenal." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Going deeper\n", - "What if we could combine word-level and char-level approaches? \n", - "![title](https://i.postimg.cc/tT9hsBfj/ive-put-an-rnn-in-your-rnn-so-you-can-train-an-rnn-on-every-step-of-your-rnn-training-loop.jpg)\n", - "\n", - "\n", - "Actually, we can. Let's use LSTM or GRU to generate embedding for every word on char-level.\n", - "![title](https://guillaumegenthial.github.io/assets/char_representation.png)\n", - "*Image source: https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html*\n", - "\n", - "![title](https://guillaumegenthial.github.io/assets/bi-lstm.png)\n", - "*Image source: https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To do that we need to make few adjustments to the code above" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'udtag': , 'ptbtag': , 'word': , 'char': }\n", - "12543\n", - "{'word': ['i', 'will', 'never', 'return', 'there', 'again', '(', 'and', 'now', 'have', 'some', 'serious', 'doubts', 'about', 'the', 'quality', 'of', 'work', 'they', 'actually', 'performed', 'on', 'my', 'car', ')', '.'], 'char': [['I'], ['w', 'i', 'l', 'l'], ['n', 'e', 'v', 'e', 'r'], ['r', 'e', 't', 'u', 'r', 'n'], ['t', 'h', 'e', 'r', 'e'], ['a', 'g', 'a', 'i', 'n'], ['('], ['a', 'n', 'd'], ['n', 'o', 'w'], ['h', 'a', 'v', 'e'], ['s', 'o', 'm', 'e'], ['s', 'e', 'r', 'i', 'o', 'u', 's'], ['d', 'o', 'u', 'b', 't', 's'], ['a', 
'b', 'o', 'u', 't'], ['t', 'h', 'e'], ['q', 'u', 'a', 'l', 'i', 't', 'y'], ['o', 'f'], ['w', 'o', 'r', 'k'], ['t', 'h', 'e', 'y'], ['a', 'c', 't', 'u', 'a', 'l', 'l', 'y'], ['p', 'e', 'r', 'f', 'o', 'r', 'm', 'e', 'd'], ['o', 'n'], ['m', 'y'], ['c', 'a', 'r'], [')'], ['.']], 'udtag': ['PRON', 'AUX', 'ADV', 'VERB', 'ADV', 'ADV', 'PUNCT', 'CCONJ', 'ADV', 'VERB', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'PRON', 'ADV', 'VERB', 'ADP', 'PRON', 'NOUN', 'PUNCT', 'PUNCT'], 'ptbtag': ['PRP', 'MD', 'RB', 'VB', 'RB', 'RB', '-LRB-', 'CC', 'RB', 'VBP', 'DT', 'JJ', 'NNS', 'IN', 'DT', 'NN', 'IN', 'NN', 'PRP', 'RB', 'VBD', 'IN', 'PRP$', 'NN', '-RRB-', '.']}\n" - ] - } - ], - "source": [ - "# Now lets try both word and character embeddings\n", - "WORD = data.Field(lower = True)\n", - "UD_TAG = data.Field(unk_token = None)\n", - "PTB_TAG = data.Field(unk_token = None)\n", - "\n", - "# We'll use NestedField to tokenize each word into list of chars\n", - "CHAR_NESTING = data.Field(tokenize=list, init_token=\"\", eos_token=\"\")\n", - "CHAR = data.NestedField(CHAR_NESTING)#, init_token=\"\", eos_token=\"\")\n", - "\n", - "fields = [(('word', 'char'), (WORD, CHAR)), ('udtag', UD_TAG), ('ptbtag', PTB_TAG)]\n", - "train_data, valid_data, test_data = datasets.UDPOS.splits(fields)\n", - "# train, val, test = datasets.UDPOS.splits(fields=fields)\n", - "\n", - "print(train_data.fields)\n", - "print(len(train_data))\n", - "print(vars(train_data[-1]))" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "WORD.build_vocab(\n", - " train_data,\n", - " min_freq = MIN_FREQ,\n", - " vectors=\"glove.6B.100d\",\n", - " unk_init = torch.Tensor.normal_\n", - ")\n", - "\n", - "\n", - "CHAR.build_vocab(train_data)\n", - "UD_TAG.build_vocab(train_data)\n", - "PTB_TAG.build_vocab(train_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "Unique tokens in WORD vocabulary: 8866\n", - "Unique tokens in CHAR vocabulary: 112\n", - "Unique tokens in UD_TAG vocabulary: 18\n", - "Unique tokens in PTB_TAG vocabulary: 51\n" - ] - } - ], - "source": [ - "print(f\"Unique tokens in WORD vocabulary: {len(WORD.vocab)}\")\n", - "print(f\"Unique tokens in CHAR vocabulary: {len(CHAR.vocab)}\")\n", - "print(f\"Unique tokens in UD_TAG vocabulary: {len(UD_TAG.vocab)}\")\n", - "print(f\"Unique tokens in PTB_TAG vocabulary: {len(PTB_TAG.vocab)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "BATCH_SIZE = 64\n", - "\n", - "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n", - " (train_data, valid_data, test_data), \n", - " batch_size = BATCH_SIZE,\n", - " device = device)" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": {}, - "outputs": [], - "source": [ - "batch = next(iter(train_iterator))" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": {}, - "outputs": [], - "source": [ - "text = batch.word\n", - "chars = batch.char\n", - "tags = batch.udtag\n" - ] - }, - { - "cell_type": "code", - "execution_count": 128, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([46, 64])" - ] - }, - "execution_count": 128, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# seq len, batch_size\n", - "text.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([64, 46, 19])" - ] - }, - "execution_count": 129, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# not another_seq_len, batch_size\n", - "chars.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": 
[], - "source": [ - "# new seq_len, batch_size_1, batch_size_2\n", - "chars = chars.permute(2, 0, 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": {}, - "outputs": [], - "source": [ - "# new seq_len, new batch_size\n", - "\n", - "chars_new = chars.view(chars.shape[0], -1)" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "metadata": {}, - "outputs": [], - "source": [ - "emb_test = nn.Embedding(112, 64).to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([19, 2944, 64])" - ] - }, - "execution_count": 143, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a = emb_test(chars_new)\n", - "a.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [], - "source": [ - "lstm_test = nn.LSTM(64, 32).to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": {}, - "outputs": [], - "source": [ - "b = lstm_test(a)" - ] - }, - { - "cell_type": "code", - "execution_count": 156, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([46, 64, 32])" - ] - }, - "execution_count": 156, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "b[1][0].permute(1, 2, 0).reshape(32, -1).reshape(*text.shape, -1).shape" - ] - }, - { - "cell_type": "code", - "execution_count": 158, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([46, 64])" - ] - }, - "execution_count": 158, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "text.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([46, 64])" - ] - }, - "execution_count": 120, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tags.shape" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 195, - "metadata": {}, - "outputs": [], - "source": [ - "class BiLSTMPOSTaggerWithChars(nn.Module):\n", - " def __init__(self, \n", - " word_input_dim, \n", - " word_embedding_dim,\n", - " char_input_dim,\n", - " char_embedding_dim,\n", - " char_hidden_dim,\n", - " hidden_dim,\n", - " output_dim, \n", - " n_layers, \n", - " bidirectional, \n", - " dropout, \n", - " pad_idx):\n", - " \n", - " super().__init__()\n", - " \n", - " self.word_embedding = nn.Embedding(word_input_dim, word_embedding_dim, padding_idx = pad_idx)\n", - " self.char_embedding = nn.Embedding(char_input_dim, char_embedding_dim, padding_idx = pad_idx)\n", - " self.char_lstm = nn.LSTM(char_embedding_dim, char_hidden_dim, bidirectional=True)\n", - " \n", - " self.lstm = nn.LSTM(word_embedding_dim + char_hidden_dim*2, \n", - " hidden_dim, \n", - " num_layers = n_layers, \n", - " bidirectional = bidirectional,\n", - " dropout = dropout if n_layers > 1 else 0)\n", - " \n", - " self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)\n", - " \n", - " self.dropout = nn.Dropout(dropout)\n", - " \n", - " def forward(self, text, chars):\n", - "\n", - " #text = [sent len, batch size]\n", - " \n", - " #pass text through embedding layer\n", - " embedded = self.dropout(self.word_embedding(text))\n", - " #embedded = [sent len, batch size, emb dim]\n", - " \n", - " chars = chars.permute(2, 0, 1)\n", - " chars = chars.view(chars.shape[0], -1)\n", - "\n", - " chars_embedded = self.char_embedding(chars)\n", - " _, (hid, _) = self.char_lstm(chars_embedded)\n", - " hid = hid.permute(1, 2, 0)\n", - " hid = hid.reshape(hid.shape[0], -1)\n", - " hid = hid.reshape(*text.shape, -1)\n", - " \n", - " embedded_with_chars = torch.cat([embedded, hid], dim=2)\n", - " \n", - " \n", - " #pass embeddings into LSTM\n", - " outputs, (hidden, cell) = 
self.lstm(self.dropout(embedded_with_chars))\n", - " \n", - " #outputs holds the backward and forward hidden states in the final layer\n", - " #hidden and cell are the backward and forward hidden and cell states at the final time-step\n", - " \n", - " #output = [sent len, batch size, hid dim * n directions]\n", - " #hidden/cell = [n layers * n directions, batch size, hid dim]\n", - " \n", - " #we use our outputs to make a prediction of what the tag should be\n", - " predictions = self.fc(self.dropout(outputs))\n", - " \n", - " #predictions = [sent len, batch size, output dim]\n", - " \n", - " return predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 196, - "metadata": {}, - "outputs": [], - "source": [ - "INPUT_DIM = len(WORD.vocab)\n", - "EMBEDDING_DIM = 100\n", - "HIDDEN_DIM = 160\n", - "CHAR_INPUT_DIM = 112\n", - "CHAR_EMBEDDING_DIM = 30\n", - "CHAR_HIDDEN_DIM = 30\n", - "OUTPUT_DIM = len(UD_TAGS.vocab)\n", - "N_LAYERS = 2\n", - "BIDIRECTIONAL = True\n", - "DROPOUT = 0.25\n", - "PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]\n", - "\n", - "model = BiLSTMPOSTaggerWithChars(\n", - " INPUT_DIM, \n", - " EMBEDDING_DIM,\n", - " CHAR_INPUT_DIM,\n", - " CHAR_EMBEDDING_DIM,\n", - " CHAR_HIDDEN_DIM,\n", - " HIDDEN_DIM, \n", - " OUTPUT_DIM, \n", - " N_LAYERS, \n", - " BIDIRECTIONAL, \n", - " DROPOUT, \n", - " PAD_IDX\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Congratulations, you've got LSTM which relies on GRU output on each step.**\n", - "\n", - "Now we need only to train it. Same actions, very small adjustments." 
- ] - }, - { - "cell_type": "code", - "execution_count": 197, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "BiLSTMPOSTaggerWithChars(\n", - " (word_embedding): Embedding(8866, 100, padding_idx=1)\n", - " (char_embedding): Embedding(112, 30, padding_idx=1)\n", - " (char_lstm): LSTM(30, 30, bidirectional=True)\n", - " (lstm): LSTM(160, 160, num_layers=2, dropout=0.25, bidirectional=True)\n", - " (fc): Linear(in_features=320, out_features=18, bias=True)\n", - " (dropout): Dropout(p=0.25, inplace=False)\n", - ")" - ] - }, - "execution_count": 197, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def init_weights(m):\n", - " for name, param in m.named_parameters():\n", - " nn.init.normal_(param.data, mean = 0, std = 0.1)\n", - " \n", - "model.apply(init_weights)" - ] - }, - { - "cell_type": "code", - "execution_count": 198, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The model has 1,939,738 trainable parameters\n" - ] - } - ], - "source": [ - "def count_parameters(model):\n", - " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", - "\n", - "print(f'The model has {count_parameters(model):,} trainable parameters')" - ] - }, - { - "cell_type": "code", - "execution_count": 199, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([8866, 100])\n" - ] - } - ], - "source": [ - "pretrained_embeddings = TEXT.vocab.vectors\n", - "\n", - "print(pretrained_embeddings.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 200, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[-0.1117, -0.4966, 0.1631, ..., 1.2647, -0.2753, -0.1325],\n", - " [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", - " [-0.0382, -0.2449, 0.7281, ..., -0.1459, 0.8278, 0.2706],\n", - " ...,\n", - " [ 0.9261, 2.3049, 0.5502, ..., -0.3492, -0.5298, 
-0.1577],\n", - " [-0.5972, 0.0471, -0.2406, ..., -0.9446, -0.1126, -0.2260],\n", - " [-0.4809, 2.5629, 0.9530, ..., 0.5278, -0.4588, 0.7294]])\n" - ] - } - ], - "source": [ - "model.word_embedding.weight.data.copy_(pretrained_embeddings)\n", - "model.word_embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)\n", - "\n", - "print(model.word_embedding.weight.data)" - ] - }, - { - "cell_type": "code", - "execution_count": 201, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = optim.Adam(model.parameters())\n", - "\n", - "TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]\n", - "\n", - "criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)\n", - "\n", - "model = model.to(device)\n", - "criterion = criterion.to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 202, - "metadata": {}, - "outputs": [], - "source": [ - "def train(model, iterator, optimizer, criterion, tag_pad_idx):\n", - " \n", - " epoch_loss = 0\n", - " epoch_acc = 0\n", - " \n", - " model.train()\n", - " \n", - " for batch in iterator:\n", - " \n", - " text = batch.word\n", - " chars = batch.char\n", - " tags = batch.udtag\n", - " \n", - " optimizer.zero_grad()\n", - " \n", - " #text = [sent len, batch size]\n", - " \n", - " predictions = model(text, chars)\n", - " \n", - " #predictions = [sent len, batch size, output dim]\n", - " #tags = [sent len, batch size]\n", - " \n", - " predictions = predictions.view(-1, predictions.shape[-1])\n", - " tags = tags.view(-1)\n", - " \n", - " #predictions = [sent len * batch size, output dim]\n", - " #tags = [sent len * batch size]\n", - " \n", - " loss = criterion(predictions, tags)\n", - " \n", - " acc = categorical_accuracy(predictions, tags, tag_pad_idx)\n", - " \n", - " loss.backward()\n", - " \n", - " optimizer.step()\n", - " \n", - " epoch_loss += loss.item()\n", - " epoch_acc += acc.item()\n", - " \n", - " return epoch_loss / len(iterator), epoch_acc / len(iterator)\n", - "\n", - "\n", - "def evaluate(model, 
iterator, criterion, tag_pad_idx):\n", - " \n", - " epoch_loss = 0\n", - " epoch_acc = 0\n", - " \n", - " model.eval()\n", - " \n", - " with torch.no_grad():\n", - " \n", - " for batch in iterator:\n", - "\n", - " text = batch.word\n", - " chars = batch.char\n", - " tags = batch.udtag\n", - " \n", - " predictions = model(text, chars)\n", - " \n", - " predictions = predictions.view(-1, predictions.shape[-1])\n", - " tags = tags.view(-1)\n", - " \n", - " loss = criterion(predictions, tags)\n", - " \n", - " acc = categorical_accuracy(predictions, tags, tag_pad_idx)\n", - "\n", - " epoch_loss += loss.item()\n", - " epoch_acc += acc.item()\n", - " \n", - " return epoch_loss / len(iterator), epoch_acc / len(iterator)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 203, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch: 01 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 1.029 | Train Acc: 67.43%\n", - "\t Val. Loss: 0.560 | Val. Acc: 81.96%\n", - "Epoch: 02 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.453 | Train Acc: 85.30%\n", - "\t Val. Loss: 0.465 | Val. Acc: 84.58%\n", - "Epoch: 03 | Epoch Time: 0m 13s\n", - "\tTrain Loss: 0.345 | Train Acc: 88.82%\n", - "\t Val. Loss: 0.431 | Val. Acc: 85.22%\n", - "Epoch: 04 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.289 | Train Acc: 90.62%\n", - "\t Val. Loss: 0.402 | Val. Acc: 86.37%\n", - "Epoch: 05 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.253 | Train Acc: 91.79%\n", - "\t Val. Loss: 0.379 | Val. Acc: 87.08%\n", - "Epoch: 06 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.228 | Train Acc: 92.57%\n", - "\t Val. Loss: 0.368 | Val. Acc: 86.99%\n", - "Epoch: 07 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.207 | Train Acc: 93.28%\n", - "\t Val. Loss: 0.357 | Val. Acc: 87.62%\n", - "Epoch: 08 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.192 | Train Acc: 93.71%\n", - "\t Val. Loss: 0.353 | Val. 
Acc: 89.27%\n", - "Epoch: 09 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.179 | Train Acc: 94.13%\n", - "\t Val. Loss: 0.343 | Val. Acc: 89.94%\n", - "Epoch: 10 | Epoch Time: 0m 13s\n", - "\tTrain Loss: 0.168 | Train Acc: 94.47%\n", - "\t Val. Loss: 0.343 | Val. Acc: 89.78%\n", - "Epoch: 11 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.158 | Train Acc: 94.82%\n", - "\t Val. Loss: 0.336 | Val. Acc: 90.07%\n", - "Epoch: 12 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.150 | Train Acc: 95.05%\n", - "\t Val. Loss: 0.337 | Val. Acc: 90.19%\n", - "Epoch: 13 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.143 | Train Acc: 95.29%\n", - "\t Val. Loss: 0.329 | Val. Acc: 90.39%\n", - "Epoch: 14 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.137 | Train Acc: 95.46%\n", - "\t Val. Loss: 0.344 | Val. Acc: 89.93%\n", - "Epoch: 15 | Epoch Time: 0m 12s\n", - "\tTrain Loss: 0.129 | Train Acc: 95.74%\n", - "\t Val. Loss: 0.330 | Val. Acc: 90.45%\n" - ] - } - ], - "source": [ - "N_EPOCHS = 15\n", - "\n", - "best_valid_loss = float('inf')\n", - "\n", - "for epoch in range(N_EPOCHS):\n", - "\n", - " start_time = time.time()\n", - " \n", - " train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)\n", - " valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)\n", - " \n", - " end_time = time.time()\n", - "\n", - " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", - " \n", - " if valid_loss < best_valid_loss:\n", - " best_valid_loss = valid_loss\n", - " torch.save(model.state_dict(), 'tut2-model.pt')\n", - " \n", - " print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')\n", - " print(f'\\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')\n", - " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. 
Acc: {valid_acc*100:.2f}%')" - ] - }, - { - "cell_type": "code", - "execution_count": 206, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test Loss: 0.342 | Test Acc: 89.85%\n" - ] - } - ], - "source": [ - "# Let's take a look at the model from the last epoch\n", - "test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)\n", - "\n", - "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')" - ] - }, - { - "cell_type": "code", - "execution_count": 207, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test Loss: 0.342 | Test Acc: 89.85%\n" - ] - } - ], - "source": [ - "# And at the best checkpoint (based on validation score)\n", - "model.load_state_dict(torch.load('tut2-model.pt'))\n", - "\n", - "test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)\n", - "\n", - "print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py3_research env", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/week05_transformer_pos_tagging/week05_positional_encoding_carriers.ipynb b/week05_transformer_pos_tagging/week05_positional_encoding_carriers.ipynb deleted file mode 100644 index 160ef2c..0000000 --- a/week05_transformer_pos_tagging/week05_positional_encoding_carriers.ipynb +++ /dev/null @@ -1,2438 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## week04: understanding the positional 
encoding\n", - "\n", - "_This notebook is brought to you by [Vladislav Goncharenko](https://www.linkedin.com/in/vladislav-goncharenko/)_" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# If using Colab, uncomment this cell\n", - "#! pip install plotly --upgrade" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2019-09-26T22:51:09.014457Z", - "start_time": "2019-09-26T22:51:08.160758Z" - } - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "\n", - "import plotly.express as px" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2019-09-26T22:51:09.019768Z", - "start_time": "2019-09-26T22:51:09.016810Z" - } - }, - "outputs": [], - "source": [ - "plt.rcParams.update({'font.size': 14})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Positional Encoding matrix components proposed in the article\n", - "\n", - "$$\n", - "PE_{(pos,2i)} = \n", - "\\sin \\left( \\frac{pos}{10000^{2i/d_{\\text{model}}}} \\right) \\sim\n", - "$$\n", - "\n", - "$$\n", - "\\sim \\sin \\left( \\exp \\left( -\\frac{2i}{d_{\\text{model}}} \\right) \\cdot \\text{pos} \\right) =\n", - "\\sin(\\omega \\cdot t)\n", - "$$\n", - "\n", - "$$ \\\\ $$\n", - "\n", - "$$\n", - "PE_{(pos,2i+1)} =\n", - "\\cos(\\dots) \\sim \\cos (\\omega \\cdot t)\n", - "$$\n", - "\n", - "Let's treat $\\text{pos}$ as time and number of embedding component as carrier frequency of our signal.\n", - "\n", - "Note that carrier frequencies decrease exponentionally." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2019-09-26T22:51:10.035945Z", - "start_time": "2019-09-26T22:51:10.032742Z" - } - }, - "outputs": [], - "source": [ - "def make_carriers(d_mod, denom):\n", - " return 1 / np.power(denom, np.arange(d_mod) / d_mod)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2019-09-26T22:51:10.770645Z", - "start_time": "2019-09-26T22:51:10.473266Z" - } - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "carriers = {i: make_carriers(10, 10**i) for i in (8, 4, 2)}\n", - "\n", - "plt.figure(figsize=(15, 7))\n", - "for i, carrier in carriers.items():\n", - " plt.plot(carrier, label=f'denominator: 10^{i}')\n", - "plt.legend()\n", - "plt.xlabel('Number of embedding component')\n", - "plt.ylabel('Carrier frequency')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In fact different frequencies correspond to different \"clocks\" that we have to measure $\\text{pos}$ as time.\n", - "\n", - "Model doesn't know $\\text{pos}$ value directly but it sees all the \"times\" (in fact phases) of differend \"clocks\" ($\\sin$s and $\\cos$s of different frequencies)\n", - "\n", - "Having representative sutie of \"clocks\" we can definetly say _what time is it now ($\\text{pos}$ value)_ for every given \"moment\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2019-09-26T22:51:11.741823Z", - "start_time": "2019-09-26T22:51:11.736491Z" - } - }, - "outputs": [], - "source": [ - "def make_pa_matrix(n_pos, d_mod, denom):\n", - " res= np.empty((n_pos, d_mod))\n", - " carriers = make_carriers(d_mod, denom)\n", - "\n", - " for pos in range(n_pos):\n", - " if pos % 2:\n", - " funct = np.sin\n", - " else:\n", - " funct = np.cos\n", - "\n", - " for i in range(d_mod):\n", - " res[pos, i] = funct((pos // 2) * carriers[i])\n", - " return res" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2d case" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2019-09-26T22:51:13.233975Z", - "start_time": "2019-09-26T22:51:12.672375Z" - }, - "scrolled": false - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "pa2 = make_pa_matrix(1000, 2, 10**4)\n", - "\n", - "plt.figure(figsize=(15, 12))\n", - "plt.scatter(pa2[:, 0], pa2[:, 1], c=np.arange(len(pa2)))\n", - "plt.colorbar()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Nice and harmonic picture, isn't it?**\n", - "\n", - "That's because the curve plotted is [Lissajou's curve](https://en.wikipedia.org/wiki/Lissajous_curve):\n", - "\n", - "![](https://upload.wikimedia.org/wikipedia/commons/5/5d/Lissajous_animation.gif)\n", - "\n", - "Curve implicitly specified by harmonic coordinates\n", - "\n", - "$$\\left\\{ \\begin{align}\n", - " & x(t)=A\\sin (at+\\delta ) \\\\ \n", - " & y(t)=B\\sin (bt) \\\\ \n", - "\\end{align} \\right.\n", - "$$\n", - "\n", - "In our case $\\delta = \\pi / 2$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3d case" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2019-09-26T22:54:15.024781Z", - "start_time": "2019-09-26T22:54:15.019525Z" - } - }, - "outputs": [], - "source": [ - "pa3 = make_pa_matrix(250, 3, 2**3)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": "2019-09-26T22:54:15.422821Z", - "start_time": "2019-09-26T22:54:15.410521Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xyzc
01.0000001.0000001.0000000.0
10.0000000.0000000.0000001.0
20.5403020.8775830.9689122.0
30.8414710.4794260.2474043.0
4-0.4161470.5403020.8775834.0
\n", - "
" - ], - "text/plain": [ - " x y z c\n", - "0 1.000000 1.000000 1.000000 0.0\n", - "1 0.000000 0.000000 0.000000 1.0\n", - "2 0.540302 0.877583 0.968912 2.0\n", - "3 0.841471 0.479426 0.247404 3.0\n", - "4 -0.416147 0.540302 0.877583 4.0" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pa3_df = pd.DataFrame(\n", - " np.concatenate((pa3, np.arange(len(pa3))[:, None]), axis=1),\n", - " columns=['x', 'y', 'z', 'c'],\n", - ")\n", - "pa3_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2019-09-26T22:54:16.196313Z", - "start_time": "2019-09-26T22:54:15.882954Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - " \n", - " " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "x=%{x}
y=%{y}
z=%{z}
c=%{marker.color}", - "legendgroup": "", - "marker": { - "color": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99, - 100, - 101, - 102, - 103, - 104, - 105, - 106, - 107, - 108, - 109, - 110, - 111, - 112, - 113, - 114, - 115, - 116, - 117, - 118, - 119, - 120, - 121, - 122, - 123, - 124, - 125, - 126, - 127, - 128, - 129, - 130, - 131, - 132, - 133, - 134, - 135, - 136, - 137, - 138, - 139, - 140, - 141, - 142, - 143, - 144, - 145, - 146, - 147, - 148, - 149, - 150, - 151, - 152, - 153, - 154, - 155, - 156, - 157, - 158, - 159, - 160, - 161, - 162, - 163, - 164, - 165, - 166, - 167, - 168, - 169, - 170, - 171, - 172, - 173, - 174, - 175, - 176, - 177, - 178, - 179, - 180, - 181, - 182, - 183, - 184, - 185, - 186, - 187, - 188, - 189, - 190, - 191, - 192, - 193, - 194, - 195, - 196, - 197, - 198, - 199, - 200, - 201, - 202, - 203, - 204, - 205, - 206, - 207, - 208, - 209, - 210, - 211, - 212, - 213, - 214, - 215, - 216, - 217, - 218, - 219, - 220, - 221, - 222, - 223, - 224, - 225, - 226, - 227, - 228, - 229, - 230, - 231, - 232, - 233, - 234, - 235, - 236, - 237, - 238, - 239, - 240, - 241, - 242, - 243, - 244, - 245, - 246, - 247, - 248, - 249 - ], - "coloraxis": "coloraxis", - "symbol": "circle" - }, - "mode": "markers", - "name": "", - "scene": "scene", - "showlegend": false, - "type": "scatter3d", - "x": [ - 1, - 0, - 0.5403023058681398, - 0.8414709848078965, - -0.4161468365471424, - 0.9092974268256817, - -0.9899924966004454, - 
0.1411200080598672, - -0.6536436208636119, - -0.7568024953079282, - 0.2836621854632263, - -0.9589242746631385, - 0.9601702866503661, - -0.27941549819892586, - 0.7539022543433046, - 0.6569865987187891, - -0.14550003380861354, - 0.9893582466233818, - -0.9111302618846769, - 0.4121184852417566, - -0.8390715290764524, - -0.5440211108893699, - 0.004425697988050786, - -0.9999902065507035, - 0.8438539587324921, - -0.5365729180004349, - 0.9074467814501962, - 0.4201670368266409, - 0.1367372182078336, - 0.9906073556948704, - -0.7596879128588212, - 0.6502878401571169, - -0.9576594803233847, - -0.2879033166650653, - -0.27516333805159693, - -0.9613974918795568, - 0.6603167082440802, - -0.750987246771676, - 0.9887046181866692, - 0.14987720966295234, - 0.40808206181339196, - 0.9129452507276277, - -0.5477292602242685, - 0.836655638536056, - -0.9999608263946371, - -0.008851309290403876, - -0.5328330203333975, - -0.8462204041751706, - 0.4241790073369969, - -0.9055783620066239, - 0.9912028118634736, - -0.13235175009777303, - 0.6469193223286404, - 0.7625584504796028, - -0.2921388087338362, - 0.956375928404503, - -0.9626058663135666, - 0.27090578830786904, - -0.7480575296890004, - -0.6636338842129675, - 0.15425144988758405, - -0.9880316240928618, - 0.9147423578045313, - -0.404037645323065, - 0.8342233605065102, - 0.5514266812416906, - -0.013276747223059479, - 0.9999118601072672, - -0.8485702747846052, - 0.5290826861200238, - -0.9036922050915067, - -0.428182669496151, - -0.12796368962740468, - -0.9917788534431158, - 0.7654140519453434, - -0.6435381333569994, - 0.9550736440472949, - 0.2963685787093853, - 0.2666429323599373, - 0.9637953862840878, - -0.6669380616522619, - 0.7451131604793488, - -0.9873392775238264, - -0.158622668804709, - -0.3999853149883513, - -0.9165215479156338, - 0.5551133015206257, - -0.8317747426285983, - 0.9998433086476912, - 0.017701925105413577, - 0.5253219888177297, - 0.8509035245341184, - -0.4321779448847783, - 0.9017883476488092, - -0.9923354691509287, - 
0.123573122745224, - -0.6401443394691997, - -0.7682546613236668, - 0.3005925437436371, - -0.9537526527594719, - 0.9649660284921133, - -0.26237485370392877, - 0.7421541968137826, - 0.6702291758433747, - -0.16299078079570548, - 0.9866275920404853, - -0.9182827862121189, - 0.3959251501818342, - -0.8293098328631502, - -0.5587890488516162, - 0.022126756261955732, - -0.9997551733586199, - 0.853220107722584, - -0.5215510020869119, - 0.8998668269691938, - 0.43616475524782494, - 0.11918013544881928, - 0.9928726480845371, - -0.7710802229758452, - 0.6367380071391379, - -0.9524129804151563, - -0.3048106211022167, - -0.25810163593826746, - -0.9661177700083929, - 0.6735071623235862, - -0.7391806966492229, - 0.9858965815825497, - 0.16735570030280694, - 0.39185723042955, - 0.9200260381967907, - -0.562453851238172, - 0.8268286794901035, - -0.99964745596635, - -0.026551154023966794, - -0.5177697997895051, - -0.8555199789753223, - 0.4401430224960407, - -0.8979276806892913, - 0.9933903797222716, - -0.11478481378318722, - 0.6333192030862999, - 0.7738906815578891, - -0.3090227281660707, - 0.9510546532543747, - -0.9672505882738824, - 0.25382336276203626, - -0.7361927182273159, - -0.6767719568873076, - 0.17171734183077755, - -0.9851462604682474, - 0.9217512697247493, - -0.38778163540943045, - 0.8243313311075577, - 0.5661076368981803, - -0.030975031731216456, - 0.9995201585807313, - -0.8578030932449878, - 0.5139784559875352, - -0.8959709467909631, - -0.4441126687075084, - -0.11038724383904756, - -0.9938886539233752, - 0.7766859820216312, - -0.629887994274454, - 0.9496776978825432, - 0.31322878243308516, - 0.2495401179733381, - 0.9683644611001854, - -0.6800234955873388, - 0.7331903200732921, - -0.9843766433940419, - -0.1760756199485871, - -0.38369844494974187, - -0.9234584470040598, - 0.569750334265312, - -0.8218178366308225, - 0.9993732836951247, - 0.03539830273366069, - 0.5101770449416688, - 0.8600694058124533, - -0.4480736161291701, - 0.8939966636005579, - -0.9943674609282015, - 
0.10598751175115685, - -0.6264444479103392, - -0.7794660696158047, - 0.31742870151970165, - -0.9482821412699473, - 0.9694593666699876, - -0.2452519854676543, - 0.7301735609948197, - 0.683261714736121, - -0.18043044929108393, - 0.9835877454343449, - -0.9251475365964139, - 0.37960773902752165, - -0.8192882452914593, - -0.5733818719904229, - 0.0398208803931389, - -0.9992068341863537, - 0.862318872287684, - -0.5063656411097588, - 0.8920048697881602, - 0.45202578717835057, - 0.10158570369662134, - 0.9948267913584063, - -0.782230889887116, - 0.6229886314423488, - -0.9468680107512126, - -0.32162240316253093, - -0.24095904923620143, - -0.9705352835374847, - 0.6864865509069841, - -0.7271425000808527, - 0.9827795820412206, - 0.18478174456066745, - 0.37550959776701204, - 0.926818505417785, - -0.577002178942952, - 0.8167426066363169, - -0.999020813314648, - -0.044242678085070965, - -0.5025443191453852, - -0.8645514486106083, - 0.4559691044442761, - -0.8899956043668333, - 0.9952666362171313, - -0.09718190589320902, - 0.6195206125592099, - 0.7849803886813105, - -0.3258098052199642, - 0.9454353340247703, - -0.9715921906288022, - 0.23666139336428602, - -0.7240971967004738, - -0.689697940935389, - 0.1891294205289584, - -0.9819521690440836, - 0.9284713207390763, - -0.3714041014380903, - 0.8141809705265618, - 0.5806111842123143, - -0.0486636092001539, - 0.9988152247235795, - -0.8667670910519801, - 0.4987131538963941, - -0.8879689066918555, - -0.45990349068959124, - -0.09277620459766088, - -0.9956869868891794 - ], - "y": [ - 1, - 0, - 0.8775825618903728, - 0.479425538604203, - 0.5403023058681398, - 0.8414709848078965, - 0.0707372016677029, - 0.9974949866040544, - -0.4161468365471424, - 0.9092974268256817, - -0.8011436155469337, - 0.5984721441039564, - -0.9899924966004454, - 0.1411200080598672, - -0.9364566872907963, - -0.35078322768961984, - -0.6536436208636119, - -0.7568024953079282, - -0.21079579943077972, - -0.977530117665097, - 0.2836621854632263, - -0.9589242746631385, - 
0.70866977429126, - -0.7055403255703919, - 0.9601702866503661, - -0.27941549819892586, - 0.9765876257280235, - 0.21511998808781552, - 0.7539022543433046, - 0.6569865987187891, - 0.3466353178350258, - 0.9379999767747389, - -0.14550003380861354, - 0.9893582466233818, - -0.6020119026848236, - 0.7984871126234903, - -0.9111302618846769, - 0.4121184852417566, - -0.9971721561963784, - -0.07515112046180931, - -0.8390715290764524, - -0.5440211108893699, - -0.47553692799599256, - -0.87969575997167, - 0.004425697988050786, - -0.9999902065507035, - 0.4833047587530059, - -0.8754521746884285, - 0.8438539587324921, - -0.5365729180004349, - 0.9977982791785807, - -0.06632189735120068, - 0.9074467814501962, - 0.4201670368266409, - 0.594920663309892, - 0.803784426551621, - 0.1367372182078336, - 0.9906073556948704, - -0.354924266788705, - 0.934895055524683, - -0.7596879128588212, - 0.6502878401571169, - -0.9784534628188842, - 0.2064674819377966, - -0.9576594803233847, - -0.2879033166650653, - -0.7023970575027135, - -0.7117853423691232, - -0.27516333805159693, - -0.9613974918795568, - 0.21943996321145934, - -0.9756260054681576, - 0.6603167082440802, - -0.750987246771676, - 0.939524893748256, - -0.34248061846961253, - 0.9887046181866692, - 0.14987720966295234, - 0.7958149698139441, - 0.6055398697196009, - 0.40808206181339196, - 0.9129452507276277, - -0.07956356727854007, - 0.9968297942787993, - -0.5477292602242685, - 0.836655638536056, - -0.8817917275413242, - 0.47163900309419615, - -0.9999608263946371, - -0.008851309290403876, - -0.8733046400935156, - -0.4871745124605095, - -0.5328330203333975, - -0.8462204041751706, - -0.061905293994420546, - -0.9980820279793963, - 0.4241790073369969, - -0.9055783620066239, - 0.8064094939122546, - -0.5913575298651244, - 0.9912028118634736, - -0.13235175009777303, - 0.933315112063922, - 0.35905835402216824, - 0.6469193223286404, - 0.7625584504796028, - 0.202135120387182, - 0.979357643103917, - -0.2921388087338362, - 0.956375928404503, - 
-0.7148869687796651, - 0.6992400316550977, - -0.9626058663135666, - 0.27090578830786904, - -0.9746452757206577, - -0.22375564018679642, - -0.7480575296890004, - -0.6636338842129675, - -0.3383192109710552, - -0.9410314083429536, - 0.15425144988758405, - -0.9880316240928618, - 0.6090559761063562, - -0.7931272394572851, - 0.9147423578045313, - -0.404037645323065, - 0.9964679075571249, - 0.08397445569174683, - 0.8342233605065102, - 0.5514266812416906, - 0.4677318402470736, - 0.8838704235458307, - -0.013276747223059479, - 0.9999118601072672, - -0.49103472393024045, - 0.8711400001691764, - -0.8485702747846052, - 0.5290826861200238, - -0.9983462274487422, - 0.057487478104924564, - -0.9036922050915067, - -0.428182669496151, - -0.587782813560387, - -0.8090187662119064, - -0.12796368962740468, - -0.9917788534431158, - 0.3631854084160624, - -0.9317168878547055, - 0.7654140519453434, - -0.6435381333569994, - 0.9802426408101081, - -0.19779879963646227, - 0.9550736440472949, - 0.2963685787093853, - 0.6960693098638898, - 0.7179745927716441, - 0.2666429323599373, - 0.9637953862840878, - -0.22806693448309956, - 0.9736454556949781, - -0.6669380616522619, - 0.7451131604793488, - -0.9425194910508831, - 0.33415117684842055, - -0.9873392775238264, - -0.158622668804709, - -0.7904239741978156, - -0.6125601529754698, - -0.3999853149883513, - -0.9165215479156338, - 0.08838369930580556, - -0.996086503119594, - 0.5551133015206257, - -0.8317747426285983, - 0.8859318072699817, - -0.4638155159838274, - 0.9998433086476912, - 0.017701925105413577, - 0.8689582973139933, - 0.49488531755262816, - 0.5253219888177297, - 0.8509035245341184, - 0.05306853621402457, - 0.9985908724117705, - -0.4321779448847783, - 0.9017883476488092, - -0.8116121923430246, - 0.5841965844132857, - -0.9923354691509287, - 0.123573122745224, - -0.9301004142012892, - -0.36730534913419133, - -0.6401443394691997, - -0.7682546613236668, - -0.1934586046207122, - -0.981108438603097, - 0.3005925437436371, - -0.9537526527594719, - 
0.7210481538680822, - -0.6928849542336957, - 0.9649660284921133, - -0.26237485370392877, - 0.9726265649744922, - 0.2323737616554845, - 0.7421541968137826, - 0.6702291758433747, - 0.32997659774057014, - 0.9439891127251193, - -0.16299078079570548, - 0.9866275920404853, - -0.616052331690985, - 0.7877052269841179, - -0.9182827862121189, - 0.3959251501818342, - -0.9956855884367365, - -0.09279121175730869, - -0.8293098328631502, - -0.5587890488516162, - -0.45989010701310373, - -0.8879758383376634, - 0.022126756261955732, - -0.9997551733586199, - 0.49872621790648564, - -0.8667595742607592, - 0.853220107722584, - -0.5215510020869119, - 0.9988159580766447, - -0.04864855487508726, - 0.8998668269691938, - 0.43616475524782494, - 0.580598912666927, - 0.8141897215084345, - 0.11918013544881928, - 0.9928726480845371, - -0.37141809547969407, - 0.9284657227653786, - -0.7710802229758452, - 0.6367380071391379, - -0.9819550195245901, - 0.18911462035089152, - -0.9524129804151563, - -0.3048106211022167, - -0.6896870271361664, - -0.7241075918674496, - -0.25810163593826746, - -0.9661177700083929, - 0.23667603734656428, - -0.9715886235161092, - 0.6735071623235862, - -0.7391806966492229 - ], - "z": [ - 1, - 0, - 0.9689124217106447, - 0.247403959254523, - 0.8775825618903726, - 0.4794255386042031, - 0.7316888688738208, - 0.6816387600233343, - 0.5403023058681395, - 0.8414709848078966, - 0.31532236239526845, - 0.9489846193555863, - 0.07073720166770246, - 0.9974949866040544, - -0.17824605564949253, - 0.9839859468739368, - -0.4161468365471428, - 0.9092974268256815, - -0.6281736227227394, - 0.778073196887921, - -0.801143615546934, - 0.5984721441039561, - -0.9243023786324638, - 0.3816609920523313, - -0.9899924966004456, - 0.14112000805986633, - -0.9941296760805461, - -0.10819513453010926, - -0.936456687290796, - -0.35078322768962067, - -0.8205593573395602, - -0.5715613187423445, - -0.6536436208636113, - -0.7568024953079288, - -0.446087489913792, - -0.8949893582285839, - -0.21079579943077884, - 
-0.9775301176650972, - 0.03760215288797745, - -0.9992927889753779, - 0.28366218546322713, - -0.9589242746631382, - 0.5120854772418414, - -0.8589344934265916, - 0.7086697742912607, - -0.7055403255703913, - 0.8611924171615213, - -0.5082790774992575, - 0.9601702866503665, - -0.27941549819892414, - 0.9994494182244995, - -0.03317921654755504, - 0.9765876257280232, - 0.21511998808781727, - 0.8930063446890758, - 0.4500440737806192, - 0.7539022543433034, - 0.6569865987187904, - 0.5679241732886934, - 0.8230808790115065, - 0.34663531783502416, - 0.9379999767747395, - 0.1037943572192512, - 0.9945987791111763, - -0.1455000338086153, - 0.9893582466233816, - -0.3857479374522234, - 0.9226042102393396, - -0.6020119026848251, - 0.7984871126234893, - -0.7808456836057502, - 0.624723953754191, - -0.9111302618846777, - 0.412118485241755, - -0.984765173467324, - 0.17388948538043178, - -0.9971721561963783, - -0.07515112046181109, - -0.9475798039779927, - -0.3195191936222753, - -0.8390715290764514, - -0.5440211108893713, - -0.678393850473844, - -0.7346984304047967, - -0.475536927995991, - -0.8796957599716709, - -0.24311342256102825, - -0.969997867920679, - 0.004425697988052563, - -0.9999902065507035, - 0.25168965007175614, - -0.967807997511261, - 0.48330475875300744, - -0.8754521746884276, - 0.6848703183835546, - -0.7286649758271688, - 0.843853958732494, - -0.5365729180004319, - 0.9503708470676746, - -0.311119354981124, - 0.9977982791785809, - -0.06632189735119715, - 0.9831874470475911, - 0.1825991346311375, - 0.9074467814501948, - 0.42016703682664414, - 0.7752854701292857, - 0.6316109877182414, - 0.5949206633098891, - 0.8037844265516231, - 0.37756657109729, - 0.9259824428086285, - 0.13673721820783008, - 0.9906073556948708, - -0.11259379263383901, - 0.9936411011327622, - -0.3549242667887083, - 0.9348950555246818, - -0.5751872690824057, - 0.8180217634546921, - -0.7596879128588235, - 0.6502878401571142, - -0.8969548417022905, - 0.4421221685765362, - -0.9784534628188849, - 
0.2064674819377931, - -0.9991165866797339, - -0.042024352718844346, - -0.9576594803233837, - -0.2879033166650687, - -0.8566597458288405, - -0.5158818468181123, - -0.702397057502711, - -0.7117853423691256, - -0.5044627221459249, - -0.8634334728079074, - -0.2751633380515935, - -0.9613974918795578, - -0.02875563032918361, - -0.9995864713592173, - 0.2194399632114628, - -0.9756260054681568, - 0.45399184267981, - -0.8910058399248518, - 0.6603167082440828, - -0.7509872467716737, - 0.8255862790817411, - -0.5642759039618523, - 0.9395248937482572, - -0.3424806184696092, - 0.9950484010363791, - -0.09939154689884465, - 0.9887046181866688, - 0.14987720966295587, - 0.9208879708911081, - 0.38982732724638186, - 0.7958149698139418, - 0.6055398697196038, - 0.621262048380912, - 0.7836028759783575, - 0.40808206181338874, - 0.9129452507276291, - 0.16952950915565496, - 0.9855251115651202, - -0.0795635672785436, - 0.9968297942787989, - -0.323709766459238, - 0.9461564284508697, - -0.5477292602242714, - 0.8366556385360541, - -0.7376936014721196, - 0.6751356532927985, - -0.8817917275413258, - 0.471639003094193, - -0.9710643148808387, - 0.23881812402957928, - -0.9999608263946371, - -0.008851309290407429, - -0.9666846169547724, - -0.2559704110693365, - -0.8733046400935138, - -0.4871745124605126, - -0.7256268104935268, - -0.6880884622582994, - -0.5328330203333945, - -0.8462204041751725, - -0.30691025370372627, - -0.9517384599623546, - -0.061905293994417, - -0.9980820279793965, - 0.1869486370620462, - -0.9823696896284226, - 0.42417900733700337, - -0.9055783620066209, - 0.635035981413377, - -0.772482557932766, - 0.8064094939122588, - -0.5913575298651187, - 0.9276443698605873, - -0.3734647547841081, - 0.9912028118634746, - -0.13235175009776598, - 0.9931330638374922, - 0.1169902453743711, - 0.9333151120639195, - 0.3590583540221749, - 0.8154681470604958, - 0.578801953287756, - 0.6469193223286349, - 0.7625584504796074, - 0.43814818743719797, - 0.8989027566124703, - 0.20213512038717504, - 
0.9793576431039185, - -0.046445729422976985, - 0.9989208147888238, - -0.29213880873384296, - 0.9563759284045009, - -0.5196681118689644, - 0.8543682189235187, - -0.7148869687796701, - 0.6992400316550927, - -0.8656576164704202, - 0.5006364859324088, - -0.9626058663135685, - 0.27090578830786216, - -0.9997039456950856, - 0.02433148087719517, - -0.9746452757206561, - -0.22375564018680333, - -0.8889878831195944, - -0.4579307192868178, - -0.7480575296889956, - -0.6636338842129729, - -0.5606165822201001, - -0.8280755084772485, - -0.33831921097104856, - -0.941031408342956, - -0.09498678980628619, - -0.9954785330494558, - 0.15425144988759107, - -0.9880316240928607, - 0.39389908153221426, - -0.9191536942035744, - 0.6090559761063619, - -0.7931272394572808, - 0.786344720000897, - -0.6177879744108904, - 0.9147423578045342, - -0.4040376453230585 - ] - } - ], - "layout": { - "coloraxis": { - "colorbar": { - "title": { - "text": "c" - } - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "legend": { - "tracegroupgap": 0 - }, - "margin": { - "t": 60 - }, - "scene": { - "domain": { - "x": [ - 0, - 1 - ], - "y": [ - 0, - 1 - ] - }, - "xaxis": { - "title": { - "text": "x" - } - }, - "yaxis": { - "title": { - "text": "y" - } - }, - "zaxis": { - "title": { - "text": "z" - } - } - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - } - }, - "type": "barpolar" - } - 
], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - 
"#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - 
"marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - 
"#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - 
"gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } - } - } - }, - "text/html": [ - "
\n", - " \n", - " \n", - "
\n", - " \n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fig = px.scatter_3d(pa3_df, x='x', y='y', z='z', color='c')\n", - "fig.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Py3 research env", - "language": "python", - "name": "py3_research" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/week06_bert/bert_for_text_classification.ipynb b/week06_bert/bert_for_text_classification.ipynb deleted file mode 100644 index 3412b19..0000000 --- a/week06_bert/bert_for_text_classification.ipynb +++ /dev/null @@ -1,2895 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "izA3-6kffbdT" - }, - "source": [ - "# Practice: A Visual Notebook to Using BERT for the First Time" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "izA3-6kffbdT" - }, - "source": [ - "*Credits: first part of this notebook is strongly based on Jay Alammar's [great blog post](http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/). His blog is a great way to dive into the DL and NLP concepts.*\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "izA3-6kffbdT" - }, - "source": [ - "In this notebook, we will use a pre-trained deep learning model to process some text. We will then use the output of that model to classify the text. The text is a list of sentences from film reviews. And we will classify each sentence as either speaking \"positively\" about its subject or \"negatively\"." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "izA3-6kffbdT" - }, - "source": [ - "## Models: Sentence Sentiment Classification" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "izA3-6kffbdT" - }, - "source": [ - "Our goal is to create a model that takes a sentence (just like the ones in our dataset) and produces either 1 (indicating the sentence carries a positive sentiment) or a 0 (indicating the sentence carries a negative sentiment). We can think of it as looking like this:\n", - "\n", - "\n", - "\n", - "Under the hood, the model is actually made up of two models.\n", - "\n", - "* DistilBERT processes the sentence and passes along some information it extracted from it on to the next model. DistilBERT is a smaller version of BERT developed and open sourced by the team at HuggingFace. It’s a lighter and faster version of BERT that roughly matches its performance.\n", - "* The next model, a basic Logistic Regression model from scikit learn will take in the result of DistilBERT’s processing, and classify the sentence as either positive or negative (1 or 0, respectively).\n", - "\n", - "The data we pass between the two models is a vector of size 768. 
We can think of this of vector as an embedding for the sentence that we can use for classification.\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "izA3-6kffbdT" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "izA3-6kffbdT" - }, - "source": [ - "The dataset we will use in this example is [SST2](https://nlp.stanford.edu/sentiment/index.html), which contains sentences from movie reviews, each labeled as either positive (has the value 1) or negative (has the value 0):\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " sentence\n", - " \n", - " label\n", - "
\n", - " a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films\n", - " \n", - " 1\n", - "
\n", - " apparently reassembled from the cutting room floor of any given daytime soap\n", - " \n", - " 0\n", - "
\n", - " they presume their audience won't sit still for a sociology lesson\n", - " \n", - " 0\n", - "
\n", - " this is a visually stunning rumination on love , memory , history and the war between art and commerce\n", - " \n", - " 1\n", - "
\n", - " jonathan parker 's bartleby should have been the be all end all of the modern office anomie films\n", - " \n", - " 1\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "izA3-6kffbdT" - }, - "source": [ - "## Installing the transformers library" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "izA3-6kffbdT" - }, - "source": [ - "Let's start by installing the huggingface transformers library so we can load our deep learning NLP model." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "To9ENLU90WGl", - "outputId": "6e5be1cb-9674-40e5-cfe7-17f562537556" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install -Uqq transformers" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zQ-42fh0hjsF" - }, - "source": [ - "## Part 1. Using BERT for text classification." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading pretrained BERT." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we will be using the pretrained DistilBERT model. 
Here is an example of it:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'sequence': \"hello i'm a role model.\",\n", - " 'score': 0.052928753197193146,\n", - " 'token': 2535,\n", - " 'token_str': 'role'},\n", - " {'sequence': \"hello i'm a fashion model.\",\n", - " 'score': 0.03968575596809387,\n", - " 'token': 4827,\n", - " 'token_str': 'fashion'},\n", - " {'sequence': \"hello i'm a business model.\",\n", - " 'score': 0.03474372997879982,\n", - " 'token': 2449,\n", - " 'token_str': 'business'},\n", - " {'sequence': \"hello i'm a model model.\",\n", - " 'score': 0.03462280333042145,\n", - " 'token': 2944,\n", - " 'token_str': 'model'},\n", - " {'sequence': \"hello i'm a modeling model.\",\n", - " 'score': 0.018145091831684113,\n", - " 'token': 11643,\n", - " 'token_str': 'modeling'}]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from transformers import pipeline\n", - "\n", - "model_name = 'distilbert-base-uncased'\n", - "unmasker = pipeline('fill-mask', model_name)\n", - "unmasker(\"Hello I'm a [MASK] model.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is how we can use the same model to extract features from our text:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([1, 23, 768])" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import torch\n", - "\n", - "from transformers import logging; logging.set_verbosity_error() # Ignore warning on model loading.\n", - "from transformers import DistilBertModel, DistilBertTokenizer\n", - "\n", - "tokenizer = DistilBertTokenizer.from_pretrained(model_name)\n", - "model = DistilBertModel.from_pretrained(model_name)\n", - "\n", - "text = 'Lorem ipsum dolor sit amet, consectetur 
adipiscing elit.'\n", - "tokenized_text = tokenizer(text, return_tensors='pt')\n", - "\n", - "with torch.no_grad():\n", - " output = model(**tokenized_text)\n", - "\n", - "output.last_hidden_state.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zQ-42fh0hjsF" - }, - "source": [ - "## Loading the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zQ-42fh0hjsF" - }, - "source": [ - "We'll use pandas to read the dataset and load it into a dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "cyoj29J24hPX" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "dataset_url = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'\n", - "dataset = pd.read_csv(dataset_url, delimiter='\\t', header=None)\n", - "dataset.columns = ['text', 'label']" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dMVE3waNhuNj" - }, - "source": [ - "For performance reasons, we'll only use 2,000 sentences from the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "gTM3hOHW4hUY" - }, - "outputs": [], - "source": [ - "dataset = dataset[:2000]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PRc2L89hh1Tf" - }, - "source": [ - "We can ask pandas how many sentences are labeled as \"positive\" (value 1) and how many are labeled \"negative\" (having the value 0)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "jGvcfcCP5xpZ", - "outputId": "2679553c-f061-4254-8b9d-b005529de44d" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "1 1041\n", - "0 959\n", - "Name: label, dtype: int64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset['label'].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"lZDBMn3wiSX6" - }, - "source": [ - "## Preparing the Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lZDBMn3wiSX6" - }, - "source": [ - "Before we can hand our sentences to BERT, we need to do some minimal processing to put them in the format it requires." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lZDBMn3wiSX6" - }, - "source": [ - "### Tokenization" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lZDBMn3wiSX6" - }, - "source": [ - "Our first step is to tokenize the sentences -- break them up into words and subwords in the format BERT is comfortable with." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "Dg82ndBA5xlN" - }, - "outputs": [], - "source": [ - "tokenized_texts = dataset['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True)).values" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mHwjUwYgi-uL" - }, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mHwjUwYgi-uL" - }, - "source": [ - "### Padding" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mHwjUwYgi-uL" - }, - "source": [ - "After tokenization, `tokenized_texts` is a list of sentences -- each sentence is represented as a list of tokens. We want BERT to process our examples all at once (as one batch). It's just faster that way. For that reason, we need to pad all lists to the same size, so we can represent the input as one 2-d array, rather than a list of lists (of different lengths)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "URn-DWJt5xhP" - }, - "outputs": [], - "source": [ - "max_len = max(len(text) for text in tokenized_texts)\n", - "padded_texts = torch.tensor([text + [0] * (max_len - len(text)) for text in tokenized_texts])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mdjg306wjjmL" - }, - "source": [ - "Our dataset is now in the `padded_texts` variable; we can view its dimensions below:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "jdi7uXo95xeq", - "outputId": "cc0bd2d3-921f-4dcc-d619-bcb621b2e707" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2000, 59])" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "padded_texts.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sDZBsYSDjzDV" - }, - "source": [ - "### Masking" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sDZBsYSDjzDV" - }, - "source": [ - "If we directly send `padded_texts` to BERT, that would slightly confuse it. We need to create another variable to tell it to ignore (mask) the padding we've added when it's processing its input. That's what `attention_mask` is for:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4K_iGRNa_Ozc", - "outputId": "8802e284-ba3c-4bd3-bc6c-d9830c033b30" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2000, 59])" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "attention_mask = torch.where(padded_texts > 0, 1, 0)\n", - "attention_mask.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "plt.pcolormesh(attention_mask)\n", - "plt.colorbar();" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jK-CQB9-kN99" - }, - "source": [ - "## And Now, Deep Learning!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jK-CQB9-kN99" - }, - "source": [ - "Now that we have our model and inputs ready, let's run our model!\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "39UVjAV56PJz" - }, - "outputs": [], - "source": [ - "with torch.no_grad():\n", - " output = model(padded_texts, attention_mask)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FoCep_WVuB3v" - }, - "source": [ - "Let's slice only the part of the output that we need. That is the output corresponding to the first token of each sentence. The way BERT does sentence classification, is that it adds a token called `[CLS]` (for classification) at the beginning of every sentence. The output corresponding to that token can be thought of as an embedding for the entire sentence.\n", - "\n", - "\n", - "\n", - "We'll save those in the `features` variable, as they'll serve as the features to our logistic regression model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "id": "C9t60At16PVs" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(2000, 768)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "features = output.last_hidden_state[:, 0, :].numpy()\n", - "features.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_VZVU66Gurr-" - }, - "source": [ - "The labels indicating which sentence is positive and negative now go into the `labels` variable" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "JD3fX2yh6PTx" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(2000,)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "labels = dataset['label'].values\n", - "labels.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iaoEvM2evRx1" - }, - "source": [ - "## Classifier training" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iaoEvM2evRx1" - }, - "source": [ - "Let's now split our dataset into a training set and a testing set (even though we're using 2,000 sentences from the SST2 training set)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "id": "ddAqbkoU6PP9" - }, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "train_features, test_features, train_labels, test_labels = train_test_split(features, labels)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B9bhSJpcv1Bl" - }, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B9bhSJpcv1Bl" - }, - "source": [ - "## [Extra] Grid Search for Parameters" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B9bhSJpcv1Bl" - }, - "source": [ - "We can dive into Logistic regression directly with the Scikit Learn default parameters, but sometimes it's worth searching for the best value of the C parameter, which determines regularization strength." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "cyEwr7yYD3Ci" - }, - "outputs": [], - "source": [ - "# from sklearn.model_selection import GridSearchCV\n", - "\n", - "# parameters = {'C': np.linspace(0.0001, 100, 20)}\n", - "# grid_search = GridSearchCV(LogisticRegression(), parameters)\n", - "# grid_search.fit(train_features, train_labels)\n", - "\n", - "# print('best parameters: ', grid_search.best_params_)\n", - "# print('best scrores: ', grid_search.best_score_)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KCT9u8vAwnID" - }, - "source": [ - "We now train the LogisticRegression model. If you've chosen to do the gridsearch, you can plug the value of C into the model declaration (e.g. `LogisticRegression(C=5.2)`)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "gG-EVWx4CzBc", - "outputId": "9ae43345-4003-4dfe-bb22-cd9c2fa82fe0" - }, - "outputs": [], - "source": [ - "import warnings; warnings.simplefilter('ignore') # Ignore warning on model fitting.\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "lr_clf = LogisticRegression().fit(train_features, train_labels)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3rUMKuVgwzkY" - }, - "source": [ - "\n", - "\n", - "So how well does our model do in classifying sentences? One way is to check the accuracy against the testing dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iCoyxRJ7ECTA", - "outputId": "45b90744-a478-45db-a420-2ebbaf0b9236" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.84" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lr_clf.score(test_features, test_labels)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "75oyhr3VxHoE" - }, - "source": [ - "How good is this score? What can we compare it against? 
Let's first look at a dummy classifier:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "lnwgmqNG7i5l", - "outputId": "fe7730c4-446b-4a1d-f4f2-1164c45a0a31" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dummy classifier score: 0.519 (+/- 0.00)\n" - ] - } - ], - "source": [ - "from sklearn.dummy import DummyClassifier\n", - "from sklearn.model_selection import cross_val_score\n", - "\n", - "clf = DummyClassifier()\n", - "\n", - "scores = cross_val_score(clf, train_features, train_labels)\n", - "print(\"Dummy classifier score: %0.3f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7Lg4LOpoxSOR" - }, - "source": [ - "So our model clearly does better than a dummy classifier. But how does it compare against the best models?\n", - "\n", - "For reference, the [highest accuracy score](http://nlpprogress.com/english/sentiment_analysis.html) for this dataset is currently **96.8**. DistilBERT can be trained to improve its score on this task – a process called **fine-tuning** which updates BERT’s weights to make it achieve a better performance in this sentence classification task (which we can call the downstream task). The fine-tuned DistilBERT turns out to achieve an accuracy score of **90.7**. The full size BERT model achieves **94.9**.\n", - "\n", - "And that’s it! That’s a good first contact with BERT. The next step would be to head over to the documentation and try your hand at [fine-tuning](https://huggingface.co/transformers/examples.html#glue). You can also go back and switch from distilBERT to BERT and see how that works." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 152 - }, - "id": "EJQuqV6cnWQu", - "outputId": "402d109c-01bb-485d-a510-4be8684c9c06" - }, - "source": [ - "## Part 2: Looking back." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 152 - }, - "id": "EJQuqV6cnWQu", - "outputId": "402d109c-01bb-485d-a510-4be8684c9c06" - }, - "source": [ - "__Now it is your turn to reproduce the steps above.__\n", - "\n", - "We shall revisit the first homework and see whether we could improve the results a little bit more. The average ROC-AUC on the test set was around $0.9$ (using word embeddings). \n", - "\n", - "__Let's see whether we can beat it.__" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kz8QBEXozHJx", - "outputId": "bdf0a0d8-2ac5-4dfd-a609-72011121abda" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
should_bancomment_text
500\"Those who're in advantageous positions are th...
2501Fartsalot56 says f**k you motherclucker!!
4501Are you a fool? \\n\\nI am sorry, but you seem t...
6501I AM NOT A VANDAL!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
8500Citing sources\\n\\nCheck out the Wikipedia:Citi...
\n", - "
" - ], - "text/plain": [ - " should_ban comment_text\n", - "50 0 \"Those who're in advantageous positions are th...\n", - "250 1 Fartsalot56 says f**k you motherclucker!!\n", - "450 1 Are you a fool? \\n\\nI am sorry, but you seem t...\n", - "650 1 I AM NOT A VANDAL!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", - "850 0 Citing sources\\n\\nCheck out the Wikipedia:Citi..." - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os; os.environ['TOKENIZERS_PARALLELISM'] = 'false' # Ignore warning on wget.\n", - "\n", - "!wget -q -nc https://raw.githubusercontent.com/neychev/made_nlp_course/master/datasets/comments_small_dataset/comments.tsv\n", - "dataset = pd.read_csv('comments.tsv', sep='\\t')\n", - "dataset[50::200]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's prepare data for our BERT model." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "tokenized_texts = dataset['comment_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)).values\n", - "\n", - "max_len = max(len(text) for text in tokenized_texts)\n", - "padded_texts = torch.tensor([text + [0] * (max_len - len(text)) for text in tokenized_texts])\n", - "\n", - "attention_mask = torch.where(padded_texts > 0, 1, 0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now move the model to GPU and use it for feature extraction." 
- ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "device(type='cuda', index=2)" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gpu_num = 0\n", - "device = torch.device(f'cuda:{gpu_num}' if torch.cuda.is_available() else 'cpu')\n", - "device" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1000, 768)" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numpy as np\n", - "\n", - "model.to(device)\n", - "batch_size = 16\n", - "features = []\n", - "with torch.no_grad():\n", - " for i in range(0, len(padded_texts), batch_size):\n", - " texts_batch = padded_texts[i : i + batch_size].to(device)\n", - " mask_batch = attention_mask[i : i + batch_size].to(device)\n", - " output = model(texts_batch, mask_batch)\n", - " batch_features = output.last_hidden_state[:, 0, :].cpu().numpy()\n", - " features.append(batch_features)\n", - "\n", - "features = np.concatenate(features, axis=0)\n", - "features.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now it is time to split our objects into train and test and train our classifier." 
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "target = dataset['should_ban'].values\n", - "train_features, test_features, y_train, y_test = train_test_split(features, target, test_size=0.5, random_state=42)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.862" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lr_clf = LogisticRegression(C=0.1)\n", - "lr_clf.fit(train_features, y_train)\n", - "lr_clf.score(test_features, y_test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's also plot the ROC curve and calculate the AUC metric." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "from sklearn.metrics import roc_auc_score, roc_curve\n", - "\n", - "proba = lr_clf.predict_proba(train_features)[:, 1]\n", - "auc = roc_auc_score(y_train, proba)\n", - "plt.plot(*roc_curve(y_train, proba)[:2], label='%s AUC=%.4f' % ('train', auc))\n", - "\n", - "proba = lr_clf.predict_proba(test_features)[:, 1]\n", - "auc = roc_auc_score(y_test, proba)\n", - "plt.plot(*roc_curve(y_test, proba)[:2], label='%s AUC=%.4f' % ('test', auc))\n", - "\n", - "plt.legend();" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cc1hBVfbzHJ7" - }, - "source": [ - "So, how does it look? Did we achieve better results? \n", - "\n", - "Here come some further ideas:\n", - "\n", - "* Try using the larger BERT (e.g. BERT-base or BERT-large) and compare the results (be careful, they require more memory).\n", - "\n", - "* Using BERT output for translation? Why not ;)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "machine_shape": "hm", - "name": "A Visual Notebook to Using BERT for the First Time.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "057e5786db794af7ae7d9cb86cf271a7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": 
"HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ddb91ccd465f4fb586b4e65681ed421e", - "placeholder": "​", - "style": "IPY_MODEL_18877a9f1bef436da6294b8e239a3e87", - "value": " 255M/255M [00:08<00:00, 29.4MB/s]" - } - }, - "08e91434c84e461fb028d5aaafa0d2ac": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_69a2d9186a1f4f0dbe07bf0cffa9efde", - "placeholder": "​", - "style": "IPY_MODEL_d89ac674a7d042aa9640cac0d852b103", - "value": " 32/32 [00:10<00:00, 3.12it/s]" - } - }, - "095377480b704abb80aee1dfc023594e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "16598e7fb085414abedff55c7422bac3": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - 
"grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "18877a9f1bef436da6294b8e239a3e87": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "1902ef0b5d7e42cab7323b99d5a5e0f7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "100%", - "description_tooltip": null, - "layout": "IPY_MODEL_394aae946bd5416db64691a92fd222e8", - "max": 32, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_6dabfd45bcf04128932f72f8e79fb126", - "value": 32 - } - }, - "1f288559046f4b15916737cfc6887c29": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1fce08ae597a49c6844825f9a52e2d20": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - 
"overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "21f5cf249e2e46dc896a537304bc6ecb": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "281af34bb99e4436b0f49c5cf46822ac": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c1b26841521d43ea9f4f21fa15f7b762", - "IPY_MODEL_d866f8b057c74513bf21b3a217718a30" - ], - "layout": "IPY_MODEL_c906b5aa2d0c4a9f81580e5dfc6c3d03" - } - }, - "285eb13a7fe34b31b74e6c56084e6c49": { - 
"model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3500f61b19834c6c99701d7b24a17bbb": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_89374ef6e7634341b1f9d1cc2c045064", - "placeholder": "​", - "style": "IPY_MODEL_6e5f2aea17ed4f1d8bd83a3c9fccfb2e", - "value": " 49.0/49.0 [00:00<00:00, 706B/s]" - } - }, - "350fc6d45be242678a9d36d6da366f01": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - 
"_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "394aae946bd5416db64691a92fd222e8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - 
"min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4a3ec2955c694bf49cc39913b1a104dd": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4ec263486149477382f71f37759929ba": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - 
"grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "504cad20350e43a19dbb367cda255729": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "5b2bb84e7788484dbc9949ebb0d91ec7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_6be0571f20634e3d9a20314eee0aba64", - "IPY_MODEL_98bd16e5bf0f4604b84b17dbf3260070" - ], - "layout": "IPY_MODEL_4a3ec2955c694bf49cc39913b1a104dd" - } - }, - "5cd796c3da374bc4ba3d607bcd0f2377": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - 
"_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "601f514e09c54843a4ec85f69176161d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "626e3aeb683c42a7bd6d057cf78ee082": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - 
"grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "69a2d9186a1f4f0dbe07bf0cffa9efde": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6af0fc5df8ab44ce980d711e9cef851e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": 
"@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6be0571f20634e3d9a20314eee0aba64": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "Downloading: 100%", - "description_tooltip": null, - "layout": "IPY_MODEL_5cd796c3da374bc4ba3d607bcd0f2377", - "max": 465, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_985a11a5b169401f82f25f72cc81053f", - "value": 465 - } - }, - "6dabfd45bcf04128932f72f8e79fb126": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "6e5f2aea17ed4f1d8bd83a3c9fccfb2e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "84108be2c2604819a38a16a624318313": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "87b7803428d44e6c9807e64ae03c4d29": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - 
"state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "Downloading: 100%", - "description_tooltip": null, - "layout": "IPY_MODEL_21f5cf249e2e46dc896a537304bc6ecb", - "max": 173939, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_095377480b704abb80aee1dfc023594e", - "value": 173939 - } - }, - "87f20435e6254088b85f8397750b2ba1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_1902ef0b5d7e42cab7323b99d5a5e0f7", - "IPY_MODEL_b7fb8fc65d084795a52eb4d80d549e43" - ], - "layout": "IPY_MODEL_84108be2c2604819a38a16a624318313" - } - }, - "89374ef6e7634341b1f9d1cc2c045064": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - 
"justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "985a11a5b169401f82f25f72cc81053f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "98bd16e5bf0f4604b84b17dbf3260070": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_16598e7fb085414abedff55c7422bac3", - "placeholder": "​", - "style": "IPY_MODEL_c861b31fa3ea4b139a0f2b940abaedd1", - "value": " 465/465 [00:00<00:00, 1.98kB/s]" - } - }, - "a23b1080e91f44b7947bb0d90b72eddf": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": 
"IPY_MODEL_6af0fc5df8ab44ce980d711e9cef851e", - "placeholder": "​", - "style": "IPY_MODEL_f2fea47b9f834911899bf975fa5ea737", - "value": " 174k/174k [00:01<00:00, 111kB/s]" - } - }, - "a3b23cadc2844a38857d546fdd1d1cc1": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a3f0428ab70f4c9aa5e3e40518e11098": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "a72c347172ec4184b64a92678b6d6f8f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": 
"@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a8f9ae6ba52d4bcea7966b1c72748bfb": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_87b7803428d44e6c9807e64ae03c4d29", - "IPY_MODEL_a23b1080e91f44b7947bb0d90b72eddf" - ], - "layout": "IPY_MODEL_1f288559046f4b15916737cfc6887c29" - } - }, - "b37801f82e8f4a3fb673061e0cc49cdc": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - 
"_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_be08b9fe75a543ea92a5cbd3943a961d", - "IPY_MODEL_057e5786db794af7ae7d9cb86cf271a7" - ], - "layout": "IPY_MODEL_285eb13a7fe34b31b74e6c56084e6c49" - } - }, - "b63953a573c948e19bb175f46acf02af": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "initial" - } - }, - "b7fb8fc65d084795a52eb4d80d549e43": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a72c347172ec4184b64a92678b6d6f8f", - "placeholder": "​", - "style": "IPY_MODEL_bc9cc9adc1c044629bcee301749dfc9f", - "value": " 32/32 [01:14<00:00, 2.34s/it]" - } - }, - "bc9cc9adc1c044629bcee301749dfc9f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "bd9d17c4142046c4971bb28b8dd8f82f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - 
"_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "Downloading: 100%", - "description_tooltip": null, - "layout": "IPY_MODEL_626e3aeb683c42a7bd6d057cf78ee082", - "max": 49, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_a3f0428ab70f4c9aa5e3e40518e11098", - "value": 49 - } - }, - "be08b9fe75a543ea92a5cbd3943a961d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "Downloading: 100%", - "description_tooltip": null, - "layout": "IPY_MODEL_a3b23cadc2844a38857d546fdd1d1cc1", - "max": 255182217, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_504cad20350e43a19dbb367cda255729", - "value": 255182217 - } - }, - "bf196950f21649a3bac4154bd02c673c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c13f14f2652e48368e4ca0f9690dd430", - "IPY_MODEL_08e91434c84e461fb028d5aaafa0d2ac" - ], - "layout": "IPY_MODEL_4ec263486149477382f71f37759929ba" - } - }, - "c13f14f2652e48368e4ca0f9690dd430": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "100%", - "description_tooltip": null, - "layout": "IPY_MODEL_350fc6d45be242678a9d36d6da366f01", - "max": 32, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_601f514e09c54843a4ec85f69176161d", - "value": 32 - } - }, - "c1b26841521d43ea9f4f21fa15f7b762": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "100%", - "description_tooltip": null, - "layout": "IPY_MODEL_c4fc0e5df81e47ee8193f04118fac0d5", - "max": 32, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_b63953a573c948e19bb175f46acf02af", - "value": 32 - } - }, - "c4fc0e5df81e47ee8193f04118fac0d5": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": 
null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c861b31fa3ea4b139a0f2b940abaedd1": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c906b5aa2d0c4a9f81580e5dfc6c3d03": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": 
null, - "top": null, - "visibility": null, - "width": null - } - }, - "d7d44fa068b048d093c4eac1f3d46497": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_bd9d17c4142046c4971bb28b8dd8f82f", - "IPY_MODEL_3500f61b19834c6c99701d7b24a17bbb" - ], - "layout": "IPY_MODEL_1fce08ae597a49c6844825f9a52e2d20" - } - }, - "d866f8b057c74513bf21b3a217718a30": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fe5fdb885d93428485a31b1d5260c09e", - "placeholder": "​", - "style": "IPY_MODEL_db1c11f8d801430ca11917ccbbdabd02", - "value": " 32/32 [01:05<00:00, 2.06s/it]" - } - }, - "d89ac674a7d042aa9640cac0d852b103": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "db1c11f8d801430ca11917ccbbdabd02": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": 
"DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ddb91ccd465f4fb586b4e65681ed421e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f2fea47b9f834911899bf975fa5ea737": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fe5fdb885d93428485a31b1d5260c09e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_model_module": 
"@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/week06_bert/lect07_BERT.pdf b/week06_bert/lect07_BERT.pdf new file mode 100644 index 0000000..d4dcf0f Binary files /dev/null and b/week06_bert/lect07_BERT.pdf differ diff --git a/week06_bert/practice_bert_for_text_classification.ipynb b/week06_bert/practice_bert_for_text_classification.ipynb new file mode 100644 index 0000000..e5711b9 --- /dev/null +++ b/week06_bert/practice_bert_for_text_classification.ipynb @@ -0,0 +1,902 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "izA3-6kffbdT" + }, + "source": [ + "# Practice: A Visual Notebook to Using BERT for the First Time" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEBgv15zoaX6" + }, + "source": [ + "*Credits: first part of this notebook is strongly based on Jay Alammar's [great blog 
post](http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/). His blog is a great way to dive into the DL and NLP concepts.*\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dVgtANpYoaX7" + }, + "source": [ + "In this notebook, we will use a pre-trained deep learning model to process some text. We will then use the output of that model to classify the text. The text is a list of sentences from film reviews. And we will classify each sentence as either speaking \"positively\" about its subject or \"negatively\"." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0oCi6ZSnoaX7" + }, + "source": [ + "## Models: Sentence Sentiment Classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pyRwVEI4oaX7" + }, + "source": [ + "Our goal is to create a model that takes a sentence (just like the ones in our dataset) and produces either 1 (indicating the sentence carries a positive sentiment) or a 0 (indicating the sentence carries a negative sentiment). We can think of it as looking like this:\n", + "\n", + "\n", + "\n", + "Under the hood, the model is actually made up of two models.\n", + "\n", + "* DistilBERT processes the sentence and passes along some information it extracted from it on to the next model. DistilBERT is a smaller version of BERT developed and open sourced by the team at HuggingFace. It’s a lighter and faster version of BERT that roughly matches its performance.\n", + "* The next model, a basic Logistic Regression model from scikit learn will take in the result of DistilBERT’s processing, and classify the sentence as either positive or negative (1 or 0, respectively).\n", + "\n", + "The data we pass between the two models is a vector of size 768. 
We can think of this of vector as an embedding for the sentence that we can use for classification.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mQVOYe4PoaX8" + }, + "source": [ + "## Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3S7DFxaeoaX9" + }, + "source": [ + "The dataset we will use in this example is [SST2](https://nlp.stanford.edu/sentiment/index.html), which contains sentences from movie reviews, each labeled as either positive (has the value 1) or negative (has the value 0):\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " sentence\n", + " \n", + " label\n", + "
\n", + " a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films\n", + " \n", + " 1\n", + "
\n", + " apparently reassembled from the cutting room floor of any given daytime soap\n", + " \n", + " 0\n", + "
\n", + " they presume their audience won't sit still for a sociology lesson\n", + " \n", + " 0\n", + "
\n", + " this is a visually stunning rumination on love , memory , history and the war between art and commerce\n", + " \n", + " 1\n", + "
\n", + " jonathan parker 's bartleby should have been the be all end all of the modern office anomie films\n", + " \n", + " 1\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kPMv-7fOoaX-" + }, + "source": [ + "## Installing the transformers library" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S9zZk1UgoaX_" + }, + "source": [ + "Let's start by installing the huggingface transformers library so we can load our deep learning NLP model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "To9ENLU90WGl" + }, + "outputs": [], + "source": [ + "!pip install -Uqq transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zQ-42fh0hjsF" + }, + "source": [ + "## Part 1. Using BERT for text classification." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SwbK19NFoaYB" + }, + "source": [ + "## Loading pretrained BERT." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GITfp914oaYB" + }, + "source": [ + "Here we will be using the pretrained DistilBERT model from `transformers` library. The easiest way to use such model is to use a `pipeline`. This can be done as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4LyOoG8IoaYB" + }, + "outputs": [], + "source": [ + "from transformers import pipeline\n", + "\n", + "\n", + "unmasker = pipeline('fill-mask', 'distilbert-base-uncased')\n", + "unmasker(\"Hello I'm a [MASK] model.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NnA23_CpoaYC" + }, + "source": [ + "However, such approach is not very flexible and certainly doesn't allow you to fine-tune a model. For this reason we will use the model in a more manual way. For this we load the model and appropriate tokenizer and use them together. 
Here is how we can use them to extract features from our text:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "O8gHVTAEoaYC" + }, + "outputs": [], + "source": [ + "import torch\n", + "from transformers import DistilBertModel, DistilBertTokenizer, logging\n", + "\n", + "\n", + "logging.set_verbosity_error() # Ignore warning on model loading.\n", + "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", + "model = DistilBertModel.from_pretrained('distilbert-base-uncased')\n", + "\n", + "text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'\n", + "tokenized_text = tokenizer(text, return_tensors='pt')\n", + "\n", + "with torch.no_grad():\n", + " output = model(**tokenized_text)\n", + "\n", + "output.last_hidden_state.shape" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Except for the `logging` part, everything looks very similar to the code we saw in previous practice notebooks. The first thing we do is, just like always, tokenize our text.\n", + "\n", + "> **Note:** as you can see, we used the `return_tensors` keyword argument in the code above. This parameter just tells the tokenizer to convert the result into PyTorch tensors so that we can use them with our model. If we don't specify this parameter, we will get exactly the same results, but packed into Python `list` objects.\n", + "\n", + "Let's look at this step a little closer. What exactly does the `tokenizer.__call__` return? 
Let's find out:" + ], + "metadata": { + "id": "P5DAZw0tN8wi" + } + }, + { + "cell_type": "code", + "source": [ + "tokenized_text = tokenizer(text)\n", + "\n", + "for key, values in tokenized_text.items():\n", + " values_type = type(values).__name__\n", + " item_type = type(values[0]).__name__\n", + " values_sample = f\"[{', '.join(str(value) for value in values[:5])}, ...]\"\n", + " print(f\"{key}: {values_type}[{item_type}], length {len(values)}: {values_sample}\")" + ], + "metadata": { + "id": "KH77JkiNPCxA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The contents may differ for different models, however for the `DistilBert` model the tokenizer returns a `dict`-like object with two python lists under keys `\"input_ids\"` and `\"attention_mask\"`. Both lists have the same length and the attention mask seems to only have ones. We'll deal with the mask later, for now let's focus on `\"input_ids\"` which holds the token ids for the tokenized sequence. Let's decode them to make sure and see what is actually going on:" + ], + "metadata": { + "id": "MzDnPSvrQdCB" + } + }, + { + "cell_type": "code", + "source": [ + "print(f\"Tokens: {tokenizer.convert_ids_to_tokens(tokenized_text['input_ids'])}\")\n", + "print(f\"Decoded sequence: '{tokenizer.decode(tokenized_text['input_ids'])}'\")" + ], + "metadata": { + "id": "SK-Yg-FDMN3I" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We see that the tokenizer actually does quite a lot of work behind the curtains: it lowercases the sequence (remember, we use an `*-uncased` model, which implies that it doesn't understand the upper case), adds special tokens (`[CLS]` and `[SEP]`) and applies WordPiece subword tokenization. That is how we get 23 tokens for such a short text." 
+ ], + "metadata": { + "id": "syJ8naWFT87_" + } + }, + { + "cell_type": "markdown", + "source": [ + "" + ], + "metadata": { + "id": "pUmYkLeHd1m7" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SVlcZqXqoaYC" + }, + "source": [ + "## Loading the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M3YBY3DWoaYC" + }, + "source": [ + "However, working with manually edited sentence is not interesting. Let's use our model to work with a dataset for sentiment classification. We'll use pandas to read the dataset and load it into a dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyoj29J24hPX" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "\n", + "dataset_url = (\n", + " 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'\n", + ")\n", + "dataset = pd.read_csv(dataset_url, delimiter='\\t', header=None)\n", + "dataset.columns = ['text', 'label']\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dMVE3waNhuNj" + }, + "source": [ + "For performance reasons, we'll only use 2,000 sentences from the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gTM3hOHW4hUY" + }, + "outputs": [], + "source": [ + "dataset = dataset[:2000]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PRc2L89hh1Tf" + }, + "source": [ + "We can ask pandas how many sentences are labeled as \"positive\" (value 1) and how many are labeled \"negative\" (having the value 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jGvcfcCP5xpZ" + }, + "outputs": [], + "source": [ + "dataset['label'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lZDBMn3wiSX6" + }, + "source": [ + "## Preparing the Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NiNzCErkoaYE" + }, + 
"source": [ + "Before we can hand our sentences to BERT, we need to so some processing to put them in the format it requires. First, let's split our `dataset` into separate `texts` and `labels`." + ] + }, + { + "cell_type": "code", + "source": [ + "texts = dataset['text'].tolist()\n", + "labels = dataset['label'].values" + ], + "metadata": { + "id": "m8ipGP0raY7r" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now we need to tokenize our texts." + ], + "metadata": { + "id": "PmNUir6fcLbl" + } + }, + { + "cell_type": "code", + "source": [ + "# YOUR CODE HERE\n", + "# Tokenize the texts in dataset.\n", + "# Hint: our tokenizer can also work with lists of strings.\n", + "# tokenized_texts = ...\n", + "\n", + "for key, values in tokenized_texts.items():\n", + " values_type = type(values).__name__\n", + " item_type = type(values[0]).__name__\n", + " print(f\"{key}: {values_type}[{item_type}], length {len(values)}\")" + ], + "metadata": { + "id": "PyLowtCxajkq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We obtained a list of lists. However, what we want is `torch.tensor` so that we could use it with our model. It's time to remember how we used to specify the `return_tensors` option! However, if we were to just specify it blindly, we would get an error. The problem here lies in the fact that sequences tend to have different lenghts:" + ], + "metadata": { + "id": "ooqGLMl7cTe-" + } + }, + { + "cell_type": "code", + "source": [ + "for seq in tokenized_texts[\"input_ids\"][:5]:\n", + " print(len(seq))" + ], + "metadata": { + "id": "7SMycKghdkh2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The most common solution to this problem, often used in NLP is the use of padding. 
Luckily for us, the tokenizer from `transformers` can do the padding for us:" + ], + "metadata": { + "id": "_MjmmfMseJVf" + } + }, + { + "cell_type": "code", + "source": [ + "tokenized_texts = tokenizer(dataset['text'].tolist(), return_tensors=\"pt\", padding=True)\n", + "\n", + "for key, values in tokenized_texts.items():\n", + " values_type = type(values).__name__\n", + " print(f\"{key}: {values_type}, {values.shape}\")" + ], + "metadata": { + "id": "ONgnhkuxe9kB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "However, we just added lots of extra items into most of our sequences:" + ], + "metadata": { + "id": "u1AlfPcefj-m" + } + }, + { + "cell_type": "code", + "source": [ + "tokenized_texts[\"input_ids\"]" + ], + "metadata": { + "id": "XjfXDg8pgsFd" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Note how all the sequences end with zeros. We already encountered such a problem when we trained our RNN in previous lessons, and we tackled it by specifying the padding index to the `CrossEntropyLoss` so that our model doesn't train to predict padding. Right now we don't want to train our model to do anything, however, we are working with a transformer, which means that it uses the self-attention operation extensively. If we were to simply add these extra items, it would be likely to affect our results. And this is exactly the place where the `\"attention_mask\"` comes into play. It is used exactly to mask the padding from getting in the way of attention!" 
+ ], + "metadata": { + "id": "zkhDolkRgrti" + } + }, + { + "cell_type": "code", + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "plt.pcolormesh(tokenized_texts[\"attention_mask\"])\n", + "plt.axis(\"off\")\n", + "plt.colorbar()\n", + "plt.show()" + ], + "metadata": { + "id": "gSCXbXVLg_7T" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jK-CQB9-kN99" + }, + "source": [ + "## And Now, Deep Learning!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a0UsQGQ2oaYF" + }, + "source": [ + "Now that we have our model and inputs ready, let's run our model! However, running it on a cpu takes several minutes. We can speed this up via using the GPU. For this, however, we would need to split our dataset into batches of data.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Let's slice only the part of the output that we need. That is the output corresponding to the first token of each sentence. The way BERT does sentence classification, is that it adds a token called `[CLS]` (for classification) at the beginning of every sentence. The output corresponding to that token can be thought of as an embedding for the entire sentence.\n", + "\n", + "\n", + "\n", + "We'll save those in the `features` variable, as they'll serve as the features to our logistic regression model. Also remember, how we created the `labels` variable to hold our labels." 
+ ], + "metadata": { + "id": "KBqHs4d_ObTc" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "39UVjAV56PJz" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "\n", + "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "\n", + "batch_size = 32\n", + "features = []\n", + "with torch.no_grad():\n", + " for i in range(0, len(texts), batch_size):\n", + " texts_batch = tokenized_texts[\"input_ids\"][i : i + batch_size].to(device)\n", + " masks_batch = tokenized_texts[\"attention_mask\"][i : i + batch_size].to(device)\n", + " output = model(texts_batch, masks_batch)\n", + " batch_features = output.last_hidden_state[:, 0, :].cpu().numpy()\n", + " features.append(batch_features)\n", + "\n", + "features = np.concatenate(features, axis=0)\n", + "features.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iaoEvM2evRx1" + }, + "source": [ + "## Classifier training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "02wlGPSLoaYG" + }, + "source": [ + "Let's now split our dataset into a training set and testing set (even though we're using 2,000 sentences from the SST2 training set)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ddAqbkoU6PP9" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "train_features, test_features, train_labels, test_labels = train_test_split(features, labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B9bhSJpcv1Bl" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uW_IiKvToaYG" + }, + "source": [ + "We can dive into Logistic regression directly with the Scikit Learn default parameters, but sometimes it's worth searching for the best value of the C parameter, which determines regularization strength." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cyEwr7yYD3Ci" + }, + "outputs": [], + "source": [ + "# YOUR CODE HERE\n", + "# [EXTRA] Grid search for parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KCT9u8vAwnID" + }, + "source": [ + "We now train the LogisticRegression model. If you've chosen to do the gridsearch, you can plug the best hyperparameter values into the model declaration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gG-EVWx4CzBc" + }, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "\n", + "warnings.simplefilter('ignore') # Ignore warning on model fitting.\n", + "lr_clf = LogisticRegression().fit(train_features, train_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3rUMKuVgwzkY" + }, + "source": [ + "\n", + "\n", + "So how well does our model do in classifying sentences? One way is to check the accuracy against the testing dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iCoyxRJ7ECTA" + }, + "outputs": [], + "source": [ + "lr_clf.score(test_features, test_labels)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Another way to evaluate classification model is to plot the ROC curve and compute the area under it." 
+ ], + "metadata": { + "id": "2xA5YIJPDYek" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import roc_auc_score, roc_curve\n", + "\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "proba = lr_clf.predict_proba(train_features)[:, 1]\n", + "auc = roc_auc_score(train_labels, proba)\n", + "plt.plot(*roc_curve(train_labels, proba)[:2], label=f'train AUC={auc:.4f}')\n", + "\n", + "proba = lr_clf.predict_proba(test_features)[:, 1]\n", + "auc = roc_auc_score(test_labels, proba)\n", + "plt.plot(*roc_curve(test_labels, proba)[:2], label=f'test AUC={auc:.4f}')\n", + "\n", + "plt.legend()\n", + "plt.show()" + ], + "metadata": { + "id": "186Bwg7UDLkU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "75oyhr3VxHoE" + }, + "source": [ + "How good is this score? What can we compare it against? Let's first look at a dummy classifier:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lnwgmqNG7i5l" + }, + "outputs": [], + "source": [ + "from sklearn.dummy import DummyClassifier\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "\n", + "clf = DummyClassifier()\n", + "scores = cross_val_score(clf, train_features, train_labels)\n", + "print(f\"Dummy classifier score: {scores.mean():.3f} (+/- {2 * scores.std():.3f})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7Lg4LOpoxSOR" + }, + "source": [ + "So our model clearly does better than a dummy classifier. But how does it compare against the best models?\n", + "\n", + "For reference, the [highest accuracy score](http://nlpprogress.com/english/sentiment_analysis.html) for this dataset is currently **96.8**. DistilBERT can be trained to improve its score on this task – a process called **fine-tuning** which updates BERT’s weights to make it achieve a better performance in this sentence classification task (which we can call the downstream task). 
The fine-tuned DistilBERT turns out to achieve an accuracy score of **90.7**. The full size BERT model achieves **94.9**.\n", + "\n", + "And that’s it! That’s a good first contact with BERT. The next step would be to head over to the documentation and try your hand at [fine-tuning](https://huggingface.co/transformers/examples.html#glue). You can also go back and switch from distilBERT to BERT and see how that works." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EJQuqV6cnWQu", + "outputId": "402d109c-01bb-485d-a510-4be8684c9c06" + }, + "source": [ + "## Part 2: Looking back." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "outputId": "402d109c-01bb-485d-a510-4be8684c9c06", + "id": "zIBbP_oroaYH" + }, + "source": [ + "Now it is your turn to reproduce the steps above.\n", + "\n", + "We shall revisit the first homework and see whether we could improve the results a little bit more. The average ROC-AUC on test set was around $0.9$ (using the words embeddings). \n", + "\n", + "__Let's see whether we can beat it.__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kz8QBEXozHJx" + }, + "outputs": [], + "source": [ + "dataset_url = 'https://raw.githubusercontent.com/neychev/made_nlp_course/master/datasets/comments_small_dataset/comments.tsv'\n", + "dataset = pd.read_csv(dataset_url, sep='\\t')\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "One last note: this dataset contains some very long sentences, while the vast majority of sequences fall into category of 500 tokens and less:" + ], + "metadata": { + "id": "ZPyxeBCgGtfg" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XdZjuVxRoaYH" + }, + "outputs": [], + "source": [ + "texts = dataset[\"comment_text\"].tolist()\n", + "tokenized_texts = tokenizer(texts)\n", + "ids_lens = list(len(toks) for toks in tokenized_texts[\"input_ids\"])\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + 
"plt.hist(ids_lens)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "We already know, how to tackle the problem of different sizes of sequences with padding. However, blind padding here would make pad all the sequenes to the size of the largest one, which seems to be an overkill. In such case it might be sensible to actually truncate the too-long sequences into a fixed length, say 512. And we can do this easily by specifying the `max_length` and `truncation=True` arguments to the tokenizer." + ], + "metadata": { + "id": "66wufVncHF8f" + } + }, + { + "cell_type": "code", + "source": [ + "# YOUR CODE HERE" + ], + "metadata": { + "id": "_LOYXhz_FKTr" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cc1hBVfbzHJ7" + }, + "source": [ + "So, how does it look? Did we achieve better results? \n", + "\n", + "Here come some further ideas:\n", + "\n", + "* Try using the larger BERT (e.g. BERT-base or BERT-large) and compare the results (be careful, they require more memory).\n", + "\n", + "* Using BERT output for translation? 
Why not ;)" + ] + } + ], + "metadata": { + "colab": { + "machine_shape": "hm", + "name": "practice_bert_for_text_classification.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "accelerator": "GPU" + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/week06_bert/practice_bert_for_text_classification_solved.ipynb b/week06_bert/practice_bert_for_text_classification_solved.ipynb new file mode 100644 index 0000000..54726e4 --- /dev/null +++ b/week06_bert/practice_bert_for_text_classification_solved.ipynb @@ -0,0 +1,1560 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "izA3-6kffbdT" + }, + "source": [ + "# Practice: A Visual Notebook to Using BERT for the First Time" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEBgv15zoaX6" + }, + "source": [ + "*Credits: first part of this notebook is strongly based on Jay Alammar's [great blog post](http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/). His blog is a great way to dive into the DL and NLP concepts.*\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dVgtANpYoaX7" + }, + "source": [ + "In this notebook, we will use a pre-trained deep learning model to process some text. We will then use the output of that model to classify the text. The text is a list of sentences from film reviews. And we will classify each sentence as either speaking \"positively\" about its subject or \"negatively\"." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0oCi6ZSnoaX7" + }, + "source": [ + "## Models: Sentence Sentiment Classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pyRwVEI4oaX7" + }, + "source": [ + "Our goal is to create a model that takes a sentence (just like the ones in our dataset) and produces either 1 (indicating the sentence carries a positive sentiment) or a 0 (indicating the sentence carries a negative sentiment). We can think of it as looking like this:\n", + "\n", + "\n", + "\n", + "Under the hood, the model is actually made up of two models.\n", + "\n", + "* DistilBERT processes the sentence and passes along some information it extracted from it on to the next model. DistilBERT is a smaller version of BERT developed and open sourced by the team at HuggingFace. It’s a lighter and faster version of BERT that roughly matches its performance.\n", + "* The next model, a basic Logistic Regression model from scikit learn will take in the result of DistilBERT’s processing, and classify the sentence as either positive or negative (1 or 0, respectively).\n", + "\n", + "The data we pass between the two models is a vector of size 768. 
We can think of this of vector as an embedding for the sentence that we can use for classification.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mQVOYe4PoaX8" + }, + "source": [ + "## Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3S7DFxaeoaX9" + }, + "source": [ + "The dataset we will use in this example is [SST2](https://nlp.stanford.edu/sentiment/index.html), which contains sentences from movie reviews, each labeled as either positive (has the value 1) or negative (has the value 0):\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " sentence\n", + " \n", + " label\n", + "
\n", + " a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films\n", + " \n", + " 1\n", + "
\n", + " apparently reassembled from the cutting room floor of any given daytime soap\n", + " \n", + " 0\n", + "
\n", + " they presume their audience won't sit still for a sociology lesson\n", + " \n", + " 0\n", + "
\n", + " this is a visually stunning rumination on love , memory , history and the war between art and commerce\n", + " \n", + " 1\n", + "
\n", + " jonathan parker 's bartleby should have been the be all end all of the modern office anomie films\n", + " \n", + " 1\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kPMv-7fOoaX-" + }, + "source": [ + "## Installing the transformers library" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S9zZk1UgoaX_" + }, + "source": [ + "Let's start by installing the huggingface transformers library so we can load our deep learning NLP model." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "To9ENLU90WGl" + }, + "outputs": [], + "source": [ + "!pip install -Uqq transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zQ-42fh0hjsF" + }, + "source": [ + "## Part 1. Using BERT for text classification." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SwbK19NFoaYB" + }, + "source": [ + "## Loading pretrained BERT." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GITfp914oaYB" + }, + "source": [ + "Here we will be using the pretrained DistilBERT model from `transformers` library. The easiest way to use such model is to use a `pipeline`. 
This can be done as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "4LyOoG8IoaYB", + "outputId": "ffc9b3d4-f5c6-43ee-c31c-947f25e5e9f7", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[{'score': 0.052928660064935684,\n", + " 'sequence': \"hello i'm a role model.\",\n", + " 'token': 2535,\n", + " 'token_str': 'role'},\n", + " {'score': 0.03968597203493118,\n", + " 'sequence': \"hello i'm a fashion model.\",\n", + " 'token': 4827,\n", + " 'token_str': 'fashion'},\n", + " {'score': 0.0347437709569931,\n", + " 'sequence': \"hello i'm a business model.\",\n", + " 'token': 2449,\n", + " 'token_str': 'business'},\n", + " {'score': 0.034622907638549805,\n", + " 'sequence': \"hello i'm a model model.\",\n", + " 'token': 2944,\n", + " 'token_str': 'model'},\n", + " {'score': 0.01814514584839344,\n", + " 'sequence': \"hello i'm a modeling model.\",\n", + " 'token': 11643,\n", + " 'token_str': 'modeling'}]" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "from transformers import pipeline\n", + "\n", + "\n", + "unmasker = pipeline('fill-mask', 'distilbert-base-uncased')\n", + "unmasker(\"Hello I'm a [MASK] model.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NnA23_CpoaYC" + }, + "source": [ + "However, such approach is not very flexible and certainly doesn't allow you to fine-tune a model. For this reason we will use the model in a more manual way. For this we load the model and appropriate tokenizer and use them together. 
Here is how we can use them to extract features from our text:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "O8gHVTAEoaYC", + "outputId": "42d75b3d-0ff7-4061-ef1b-efb09ca7ca49", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "torch.Size([1, 23, 768])" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "import torch\n", + "from transformers import DistilBertModel, DistilBertTokenizer, logging\n", + "\n", + "\n", + "logging.set_verbosity_error() # Ignore warning on model loading.\n", + "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n", + "model = DistilBertModel.from_pretrained('distilbert-base-uncased')\n", + "\n", + "text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.'\n", + "tokenized_text = tokenizer(text, return_tensors='pt')\n", + "\n", + "with torch.no_grad():\n", + " output = model(**tokenized_text)\n", + "\n", + "output.last_hidden_state.shape" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Except for the `logging` part, everything looks very similar to the code we saw in previous practice notebooks. The first thing we do is, just like always, tokenize our text.\n", + "\n", + "> **Note:** as you can see, we used the `return_tensors` keyword argument in the code above. This parameter just tells the tokenizer to convert the result into PyTorch tensors so that we can use them with our model. If we don't specify this parameter, we will get exactly the same results, but packed into Python `list` objects.\n", + "\n", + "Let's look at this step a little closer. What exactly does the `tokenizer.__call__` return? 
Let's find out:" + ], + "metadata": { + "id": "P5DAZw0tN8wi" + } + }, + { + "cell_type": "code", + "source": [ + "tokenized_text = tokenizer(text)\n", + "\n", + "for key, values in tokenized_text.items():\n", + " values_type = type(values).__name__\n", + " item_type = type(values[0]).__name__\n", + " values_sample = f\"[{', '.join(str(value) for value in values[:5])}, ...]\"\n", + " print(f\"{key}: {values_type}[{item_type}], length {len(values)}: {values_sample}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KH77JkiNPCxA", + "outputId": "13ff8021-d2b2-4760-eb79-73d5d7cc5cc0" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "input_ids: list[int], length 23: [101, 19544, 2213, 12997, 17421, ...]\n", + "attention_mask: list[int], length 23: [1, 1, 1, 1, 1, ...]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The contents may differ for different models, however for the `DistilBert` model tokenizer returns a `dict`-like object with two python lists under keys `\"input_ids\"` and `\"attention_mask\"`. Both lists have the same length and the attention mask seems to only have ones. We'll deal with the mask later, for now let's focus on `\"input_ids\"` which is the token ids for the tokenized sequence. 
Let's decode them to make sure and see what is actually going on:" + ], + "metadata": { + "id": "MzDnPSvrQdCB" + } + }, + { + "cell_type": "code", + "source": [ + "print(f\"Tokens: {tokenizer.convert_ids_to_tokens(tokenized_text['input_ids'])}\")\n", + "print(f\"Decoded sequence: '{tokenizer.decode(tokenized_text['input_ids'])}'\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SK-Yg-FDMN3I", + "outputId": "3b76b053-d60a-414e-9557-8681cd77fe55" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tokens: ['[CLS]', 'lore', '##m', 'ip', '##sum', 'do', '##lor', 'sit', 'am', '##et', ',', 'con', '##se', '##ct', '##et', '##ur', 'adi', '##pis', '##cing', 'eli', '##t', '.', '[SEP]']\n", + "Decoded sequence: '[CLS] lorem ipsum dolor sit amet, consectetur adipiscing elit. [SEP]'\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "We see that the tokenizer actually does quite a lot of work behind the curtains: it lowercases the sequence (remember, we use an `*-uncased` model, which implies that it doesn't understand the upper case), adds special tokens (`[CLS]` and `[SEP]`) and splits words into subword pieces using WordPiece (the `##` prefix marks a piece that continues the previous one). That is how we get 23 tokens for such a short text." + ], + "metadata": { + "id": "syJ8naWFT87_" + } + }, + { + "cell_type": "markdown", + "source": [ + "" + ], + "metadata": { + "id": "pUmYkLeHd1m7" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SVlcZqXqoaYC" + }, + "source": [ + "## Loading the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M3YBY3DWoaYC" + }, + "source": [ + "However, working with a single hand-written sentence is not interesting. Let's use our model to work with a dataset for sentiment classification. We'll use pandas to read the dataset and load it into a dataframe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "cyoj29J24hPX", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "908c0b1c-1a7a-44e5-942b-3e18cd3c5e1f" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textlabel
0a stirring , funny and finally transporting re...1
1apparently reassembled from the cutting room f...0
2they presume their audience wo n't sit still f...0
3this is a visually stunning rumination on love...1
4jonathan parker 's bartleby should have been t...1
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " text label\n", + "0 a stirring , funny and finally transporting re... 1\n", + "1 apparently reassembled from the cutting room f... 0\n", + "2 they presume their audience wo n't sit still f... 0\n", + "3 this is a visually stunning rumination on love... 1\n", + "4 jonathan parker 's bartleby should have been t... 1" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "\n", + "dataset_url = (\n", + " 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'\n", + ")\n", + "dataset = pd.read_csv(dataset_url, delimiter='\\t', header=None)\n", + "dataset.columns = ['text', 'label']\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dMVE3waNhuNj" + }, + "source": [ + "For performance reasons, we'll only use 2,000 sentences from the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "gTM3hOHW4hUY" + }, + "outputs": [], + "source": [ + "dataset = dataset[:2000]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PRc2L89hh1Tf" + }, + "source": [ + "We can ask pandas how many sentences are labeled as \"positive\" (value 1) and how many are labeled \"negative\" (having the value 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jGvcfcCP5xpZ", + "outputId": "a6214c63-65d0-42a4-f78d-d94fdd3bc4cc" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1 1041\n", + "0 959\n", + "Name: label, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "dataset['label'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lZDBMn3wiSX6" + }, + "source": [ + "## Preparing the Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NiNzCErkoaYE" + }, + 
"source": [ + "Before we can hand our sentences to BERT, we need to do some processing to put them in the format it requires. First, let's split our `dataset` into separate `texts` and `labels`." + ] + }, + { + "cell_type": "code", + "source": [ + "texts = dataset['text'].tolist()\n", + "labels = dataset['label'].values" + ], + "metadata": { + "id": "m8ipGP0raY7r" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now we need to tokenize our texts." + ], + "metadata": { + "id": "PmNUir6fcLbl" + } + }, + { + "cell_type": "code", + "source": [ + "# YOUR CODE HERE\n", + "# Tokenize the texts in dataset.\n", + "# Hint: our tokenizer can also work with lists of strings.\n", + "# tokenized_texts = ...\n", + "tokenized_texts = tokenizer(texts)\n", + "\n", + "for key, values in tokenized_texts.items():\n", + " values_type = type(values).__name__\n", + " item_type = type(values[0]).__name__\n", + " print(f\"{key}: {values_type}[{item_type}], length {len(values)}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PyLowtCxajkq", + "outputId": "9d386cd9-3c37-45cd-f8fe-34f054285270" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "input_ids: list[list], length 2000\n", + "attention_mask: list[list], length 2000\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "We obtained a list of lists. However, what we want is `torch.tensor` so that we could use it with our model. It's time to remember how we used to specify the `return_tensors` option! However, if we were to just specify it blindly, we would get an error. 
The problem here lies in the fact that sequences tend to have different lengths:" + ], + "metadata": { + "id": "ooqGLMl7cTe-" + } + }, + { + "cell_type": "code", + "source": [ + "for seq in tokenized_texts[\"input_ids\"][:5]:\n", + " print(len(seq))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7SMycKghdkh2", + "outputId": "dcb95cd1-907e-4edb-8fcb-df6cfd67d6b2" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "20\n", + "16\n", + "45\n", + "22\n", + "25\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The most common solution to this problem, often used in NLP, is the use of padding. Luckily for us, the tokenizer from `transformers` can do the padding for us:" + ], + "metadata": { + "id": "_MjmmfMseJVf" + } + }, + { + "cell_type": "code", + "source": [ + "tokenized_texts = tokenizer(dataset['text'].tolist(), return_tensors=\"pt\", padding=True)\n", + "\n", + "for key, values in tokenized_texts.items():\n", + " values_type = type(values).__name__\n", + " print(f\"{key}: {values_type}, {values.shape}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ONgnhkuxe9kB", + "outputId": "a63fa412-5317-4788-8050-4d73c38a507b" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "input_ids: Tensor, torch.Size([2000, 59])\n", + "attention_mask: Tensor, torch.Size([2000, 59])\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "However, we just added lots of extra items into most of our sequences:" + ], + "metadata": { + "id": "u1AlfPcefj-m" + } + }, + { + "cell_type": "code", + "source": [ + "tokenized_texts[\"input_ids\"]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XjfXDg8pgsFd", + "outputId": "b1635101-98de-48bb-b7cf-5b9560c96edb" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": 
"execute_result", + "data": { + "text/plain": [ + "tensor([[ 101, 1037, 18385, ..., 0, 0, 0],\n", + " [ 101, 4593, 2128, ..., 0, 0, 0],\n", + " [ 101, 2027, 3653, ..., 0, 0, 0],\n", + " ...,\n", + " [ 101, 2023, 2028, ..., 0, 0, 0],\n", + " [ 101, 1999, 1996, ..., 0, 0, 0],\n", + " [ 101, 1996, 3185, ..., 0, 0, 0]])" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Note how all the sequences end with zeros. We already encountered such a problem when we trained our RNN in previous lessons, and we tackled it by specifying the padding index to the `CrossEntropyLoss` so that our model doesn't train to predict padding. Right now we don't want to train our model to do anything, however, we are working with a transformer, which means that it uses the self-attention operation extensively. If we were to simply add these extra items, it would be likely to affect our results. And this is exactly the place where the `\"attention_mask\"` comes into play. It is used exactly to keep the padding from getting in the way of attention!" 
+ ], + "metadata": { + "id": "zkhDolkRgrti" + } + }, + { + "cell_type": "code", + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "plt.pcolormesh(tokenized_texts[\"attention_mask\"])\n", + "plt.axis(\"off\")\n", + "plt.colorbar()\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + }, + "id": "gSCXbXVLg_7T", + "outputId": "6cd0e97a-4127-43fe-8342-02b8117d00da" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAUwAAADxCAYAAACgTY5AAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAPh0lEQVR4nO3dX4hk6VnH8e+zvepCjAk4SOLMBBecgBIR47iD5CIr7prRi8yF4s4uAaOrgjgiRgOKsi7rzURRWHGItuu4JhcOZi+kwdFJRENATOwRSXDmIg6jZnqibPaPixASd7ofL6qyW91MVZ+qeqvOOe/5fqCh/px+T13U+b3P+77nnIrMRJJ0uHva/gCS1BcGpiQ1ZGBKUkMGpiQ1ZGBKUkMGpiQ1ZGBKqlJEXIyI5yPiX6e8HxHx+xFxIyI+FxHvPKxNA1NSrZ4FTs94/4eBE+O/nwU+fFiDBqakKmXmp4CXZmxyBvhIjnwaeHNEvHVWm/fOenPvv9/uZUBq7D3f+t1tfwS16BN7H4tl23jPD7whX3xpt9G2//y5r14DvjLx0mZmbs6xu6PArYnnO+PX/mvaP8wMTK2GwSLd3Ysv7fJPV97WaNuNt/7bVzLz5Io/0j6DCEwDSuqHBPbYW9fubgPHJ54fG782VXWBaThK/ZUkr2azIXkBW8C5iLgEnAJeycypw3GoMDCvfPGzbX+EuRny0utKVZgR8efAg8CRiNgBfhP4OoDM/EPgMvAjwA3gy8BPHtZmdYFp+Ej9lSS7hW45mZmPHvJ+Aj8/T5vVBWYfK8ym7Aw0BHt09+Sc6gKzZl3vDAx0LSuBXQOzDgaCtHqDqjANFUmLSuDVDv9sTvHA7PqwURqKPhYvSTokl4agjwHVOQm73c1LA1P9YBgNw+hKn+4yMKfwAJXaEOyy9D08VsbAnGJIc7F2DuqK0aKPgdkpBoTUTaPzMA3MThlS9VgLO7nh2LPClOZjQA6TFaaKMURUuyTY7fAv5xiYHWAQSq9zSK6ZnFM9nJ3KMCTB/+VG2x9jqlYC0y+/pLsZnbjukHyfJhWVoSoNk4s+C+jDMNVQl8rKDHbTCnNtDDGp3/asMOdn8EnDM1r06WwsdTcw+zAklybZyS/PRR/pAINFs+x6Hqb0unWOHgznfvFKn57wwJK6Yc9V8u7r2pypAa4hGt18w8DUnBYNcINWfZYEr3pppNala5VyLeyI1iMTT1yXJhk+mi48cV3zM1Q0RIkVphZQamht8KpvXPRRa9qe0zSwNY8kvIGwhqvtwC7F4F+P0c/sdjeWuvvJesCDSCotvB9mrWqpnlbJTkXzSLzSR4UYPhoCK8xKGWBSWZlRrMKMiNPA08AG8Exmnj/w/tuAPwPePN7mVzPz8qw2DcwlOCTvHzu5bhst+ix/aWREbAAXgIeBHWA7IrYy8/r
EZr8B/EVmfjgivhO4DHzbrHYNTK2doaXpiv2mzwPAjcy8CRARl4AzwGRgJvBN48dvAr54WKMGZmUMI/XZaNGn8RzmkYi4OvF8MzM3x4+PArcm3tsBTh34/yeBj0fELwBvAB46bIcGZmX6ME1gqGuWOa70eSEzTy6xq0eBZzPzdyPi+4GPRsQ7MnNv2j8YmB1ggEgjBa/0uQ0cn3h+bPzapMeB0wCZ+Y8RcR9wBHh+WqMGZgf0oSrsKzuj/in0I2jbwImIuJ9RUJ4FHjuwzReAHwSejYjvAO4DvjSr0eKB6RdU0qIy4dW95QMzM+9ExDngCqNThi5m5rWIeAq4mplbwC8DfxwRv8Ro+vT9mZmz2i0emOuulgxoqR6jIXmZ8zDH51RePvDaExOPrwPvmqfN3g/JHc7uZweivvNKH61Nkw7EUFVXzXla0doZmANUS1Vu8Neo3JB8FQzMwjyIpeX4mz4NGDSSRqvk/szuoWoZJkptqaHo8CcqpANqOLC1Og7JpQltjiYM625zlVyDYRipBFfJF+DBJw1PZnDHwJyfi0Bl2PGobxySS1IDzmGqVYtW6lamaouBWQlDRFotz8OsiPOqZdjxaBbPw9Q+BoZ0d5lwp8ANhFfFwGzBkCpVOwfNyyH5gBgQ0uKcwxyYIVWPOpwd6PzSwJyfXzRpmFz0WcDQKzU7DA1RpnOYvWRgSW0Idl0l7x9/TExqh3OYlXLawA5DZXkt+Yp50EoVydE8Zlf1PjCHXuWpO+y8y3CVfI380kr9lS76rNcqK07DWFo9h+SVqGX4b/Cry1wlV6fUEvxN2Dn0S6aBqR4yaNQWTyvSPoaRNJ1zmJLUQBLsuUquSaXmEK1UVaMOF5gGZp/VvHhjZzBQBRd9IuI08DSwATyTmefvss2PA0+O9sxnM/OxWW0amB1lYGiwCpSYEbEBXAAeBnaA7YjYyszrE9ucAH4NeFdmvhwR33JYuwZmR9VcPWo1aulkC1WYDwA3MvMmQERcAs4A1ye2+RngQma+PNpvPn9YowamBqWWUKlVAnt7jQPzSERcnXi+mZmb48dHgVsT7+0Apw78/9sBIuIfGA3bn8zMv5m1QwOzRzzYVb0EmleYL2TmySX2di9wAngQOAZ8KiK+KzP/Z9Y/qGUGofS6Qudh3gaOTzw/Nn5t0g7wmcx8Ffj3iPg8owDdntaogdkBtcxXGvwqokxgbgMnIuJ+RkF5Fji4Av6XwKPAn0bEEUZD9JuzGjUwZdCpQ6LIok9m3omIc8AVRvOTFzPzWkQ8BVzNzK3xez8UEdeBXeCDmfnirHYNzB4x2DQIhc5cz8zLwOUDrz0x8TiBD4z/GjEwe2SRobshq15JyOar5GvX+8A0EKTaGJgrU8uCyUF2BBqsDl9M3vvA7BqDTlqSgTk/g0caoPlOXF+7zgZmrUPtkuxUVCNvIKyVsFNZHzunNXKVXCV40GoIwgpTkhpIXPSRpGbCRR+V4Zzlejj10bKhV5h+ASU1ttf2B5huLYHpNdCSGvE8zMU4/NRQWBzs5yq5OsUDVJ1mYHaLgSFpEYMMTIf7mmQH2i0OyVWEB7aql3hppMqwMp7NDqUSVphaBQNCNXJIrn0MOmkGA1OSGjIwy7Ayk+oW6ZC8GBc95mcno95xlXy4DCxpPlaYA9akKjZUpQkGpmZxqmHY7DAnOIc5LH75pSUZmGUYRlL9osM3EL6n7Q8gSX3RqwrTuT5NcsRRKYfkUnn+9EmFXPSplweftAIGZp2cIlgfO6cBMTCl5ZTqnAzebgu6vUpuYBbmASktoeAcZkScBp4GNoBnMvP8lO1+FHgO+L7MvDqrTQOzMIfp/WMn1zEFAjMiNoALwMPADrAdEVuZef3Adm8EfhH4TJN2DUz1lkFXqTIV5gPAjcy8CRARl4AzwPUD2/0W8CHgg00aNTCX4AErlTfHkPxIREwOoTczc3P8+Chwa+K9HeDUvv1EvBM4npl/FREG5qp5JyJpBZoH5guZeXK
RXUTEPcDvAe+f5/8MzBWreU7TzkDFZbFV8tvA8Ynnx8avfc0bgXcAn4wIgLcAWxHx3lkLPwamFuapPlqJMnOY28CJiLifUVCeBR57bReZrwBHvvY8Ij4J/Iqr5DqUgaUuKXFaUWbeiYhzwBVGpxVdzMxrEfEUcDUztxZptzOB6UErCSh2pU9mXgYuH3jtiSnbPtikzc4EZs1zfTWwQ9NaJF4aqfYYdOqTwLsVVcPwkVbPwKzEkKcN7Cy0Ngam+m7dnYUBPWAGpjQf76Y+UN5xXYvw4NdgGZia1yqHwIaxuswbCKtTVhXGBrFKcEguSU144roOY2UmTTAwNcuiQ2SDVrXxSp+KGVhSebHX3cQ0MJcw5Ct/7sYOREtzDlNtMsTUNw7J1ZqhV8F2GD1kYPaPB5rUDivMHupiZWaIaxAMzP4xnKQWlPvVyJUwMKewwpTWz/MwVUwXQ7wGdkQdk91NTANzCR5oUnlWmJUaesVnh6HiPHFdkppz0Ue9Y/WothiYLfPgl3oicdGnbbXONdoRqEYu+mglau0ImrLDqJSBuZ9fdEl344nrdzH0yqhtdljqrExvIFyKB7o0AN3NS4fkkrrFIfkBDsn3swORxhJwSK5ZutiBGOJqTXfz0sCchyEirV6pIXlEnAaeBjaAZzLz/IH3PwD8NHAH+BLwU5n5n7PaNDDn0MVKUMuzI+yWEqvkEbEBXAAeBnaA7YjYyszrE5v9C3AyM78cET8H/DbwyKx2DcwpPIikFpS7W9EDwI3MvAkQEZeAM8BrgZmZfz+x/aeB9x3WqIE5RRerSUNctRuduN44MY9ExNWJ55uZuTl+fBS4NfHeDnBqRluPA3992A4NzB7pYoi3yQ6kUs3vVvRCZp5cdncR8T7gJPDuw7Y1MHvEgNAQzFFhznIbOD7x/Nj4tf37ingI+HXg3Zn51cMaNTB7pNYK045Aryk3h7kNnIiI+xkF5VngsckNIuJ7gD8CTmfm800aNTC1MINO5ZW5ljwz70TEOeAKo9OKLmbmtYh4CriamVvA7wDfCHwsIgC+kJnvndWugdlRhpEGq9ANhDPzMnD5wGtPTDx+aN42DcwxA0rqgPQnKhZigEkD5U9UzK/WBY6S7FRUpe7mZXcDU4e7W6diiKrvYq+7Y3IDszJ9rMwNeb0mmefE9bUzMLUwg06lBVnqxPWVWEtgemBJamzogdnHYeI62aFIE4YemJptaB2KHYSmcg5ztTz4pLq4Sr5CQ6vOSrCTUXelQ/JaGTxSYYmBWauhV7d2GFqJ7o7IDUwtbugdRhN2KvMb/HmYQ+IBIi3JwByOWqoug1+tyITd7o7JDUxJ3WKFKUkNGZjqm1qmFhbhdESLEijwmz6rMojA9ACQ+iIhncNs1SLVkiErtSBx0aePhjwkXZSdjIpwDlNdYrCp0wxMlWDQqX7efEOFrHOawHBWKxLw9m5qi8Gn3rHCVFtcvCrDjmddvDSyc/zySx2VkJ6H2S1WXathR6QivNLncB5skgDnMJtYtOozaKWKZLpKvko1DK8NfWmCFeb6GD5SnyW5u9v2h5iqusCsoeLsKzsrLc3bu3WPB7bUYZ5W1C01V6F2BuqzBLJQhRkRp4GngQ3gmcw8f+D9bwA+Anwv8CLwSGb+x6w2BxmYfWUYqnpZ5gbCEbEBXAAeBnaA7YjYyszrE5s9Drycmd8eEWeBDwGPzGrXwCzMUJOWU2jR5wHgRmbeBIiIS8AZYDIwzwBPjh8/B/xBRETm9GX6mYF5z1s+H8t84iH6RHenX6TO+19evvK3+dyRhpvfFxFXJ55vZubm+PFR4NbEezvAqQP//9o2mXknIl4Bvhl4YdoOrTAldUZmnm77M8xyT9sfQJJW4DZwfOL5sfFrd90mIu4F3sRo8WcqA1NSjbaBExFxf0R8PXAW2DqwzRbwE+PHPwb83az5S3BILqlC4znJc8AVRqcVXczMaxHxFHA1M7eAPwE+GhE3gJcYhepMcUigSpLGHJJ
LUkMGpiQ1ZGBKUkMGpiQ1ZGBKUkMGpiQ1ZGBKUkP/D1kzYjRAuknzAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jK-CQB9-kN99" + }, + "source": [ + "## And Now, Deep Learning!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a0UsQGQ2oaYF" + }, + "source": [ + "Now that we have our model and inputs ready, let's run our model! However, running it on a CPU takes several minutes. We can speed this up by using the GPU. For this, however, we would need to split our dataset into batches of data.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Let's slice only the part of the output that we need. That is the output corresponding to the first token of each sentence. The way BERT does sentence classification, is that it adds a token called `[CLS]` (for classification) at the beginning of every sentence. The output corresponding to that token can be thought of as an embedding for the entire sentence.\n", + "\n", + "\n", + "\n", + "We'll save those in the `features` variable, as they'll serve as the features to our logistic regression model. Also remember, how we created the `labels` variable to hold our labels." 
+ ], + "metadata": { + "id": "KBqHs4d_ObTc" + } + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "39UVjAV56PJz", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "78b7f067-325f-47f2-c218-a99331f2c7d2" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(2000, 768)" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ], + "source": [ + "import numpy as np\n", + "\n", + "\n", + "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "\n", + "batch_size = 32\n", + "features = []\n", + "with torch.no_grad():\n", + " for i in range(0, len(texts), batch_size):\n", + " texts_batch = tokenized_texts[\"input_ids\"][i : i + batch_size].to(device)\n", + " masks_batch = tokenized_texts[\"attention_mask\"][i : i + batch_size].to(device)\n", + " output = model(texts_batch, masks_batch)\n", + " batch_features = output.last_hidden_state[:, 0, :].cpu().numpy()\n", + " features.append(batch_features)\n", + "\n", + "features = np.concatenate(features, axis=0)\n", + "features.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iaoEvM2evRx1" + }, + "source": [ + "## Classifier training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "02wlGPSLoaYG" + }, + "source": [ + "Let's now split our dataset into a training set and a testing set (even though we're using 2,000 sentences from the SST2 training set)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "ddAqbkoU6PP9" + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "\n", + "train_features, test_features, train_labels, test_labels = train_test_split(features, labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B9bhSJpcv1Bl" + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uW_IiKvToaYG" + }, + "source": [ + "We can dive into Logistic regression directly with the Scikit Learn default parameters, but sometimes it's worth searching for the best value of the C parameter, which determines regularization strength." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "cyEwr7yYD3Ci" + }, + "outputs": [], + "source": [ + "# YOUR CODE HERE\n", + "# [EXTRA] Grid search for parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KCT9u8vAwnID" + }, + "source": [ + "We now train the LogisticRegression model. If you've chosen to do the gridsearch, you can plug the best hyperparameter values into the model declaration." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "gG-EVWx4CzBc" + }, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "\n", + "warnings.simplefilter('ignore') # Ignore warning on model fitting.\n", + "lr_clf = LogisticRegression().fit(train_features, train_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3rUMKuVgwzkY" + }, + "source": [ + "\n", + "\n", + "So how well does our model do in classifying sentences? 
One way is to check the accuracy against the testing dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iCoyxRJ7ECTA", + "outputId": "3472fa11-e7bb-45aa-b5c9-d23db7392423" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.868" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ], + "source": [ + "lr_clf.score(test_features, test_labels)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Another way to evaluate classification model is to plot the ROC curve and compute the area under it." + ], + "metadata": { + "id": "2xA5YIJPDYek" + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import roc_auc_score, roc_curve\n", + "\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "\n", + "proba = lr_clf.predict_proba(train_features)[:, 1]\n", + "auc = roc_auc_score(train_labels, proba)\n", + "plt.plot(*roc_curve(train_labels, proba)[:2], label=f'train AUC={auc:.4f}')\n", + "\n", + "proba = lr_clf.predict_proba(test_features)[:, 1]\n", + "auc = roc_auc_score(test_labels, proba)\n", + "plt.plot(*roc_curve(test_labels, proba)[:2], label=f'test AUC={auc:.4f}')\n", + "\n", + "plt.legend()\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 374 + }, + "id": "186Bwg7UDLkU", + "outputId": "26f970d5-4bad-44d4-fa09-cbe42c1c9d6c" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "75oyhr3VxHoE" + }, + "source": [ + "How good is this score? What can we compare it against? Let's first look at a dummy classifier:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lnwgmqNG7i5l", + "outputId": "2425fa54-58a7-4224-effe-f9c7f37d3a05" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Dummy classifier score: 0.518 (+/- 0.003)\n" + ] + } + ], + "source": [ + "from sklearn.dummy import DummyClassifier\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "\n", + "clf = DummyClassifier()\n", + "scores = cross_val_score(clf, train_features, train_labels)\n", + "print(f\"Dummy classifier score: {scores.mean():.3f} (+/- {2 * scores.std():.3f})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7Lg4LOpoxSOR" + }, + "source": [ + "So our model clearly does better than a dummy classifier. But how does it compare against the best models?\n", + "\n", + "For reference, the [highest accuracy score](http://nlpprogress.com/english/sentiment_analysis.html) for this dataset is currently **96.8**. DistilBERT can be trained to improve its score on this task – a process called **fine-tuning** which updates BERT’s weights to make it achieve a better performance in this sentence classification task (which we can call the downstream task). The fine-tuned DistilBERT turns out to achieve an accuracy score of **90.7**. The full size BERT model achieves **94.9**.\n", + "\n", + "And that’s it! That’s a good first contact with BERT. The next step would be to head over to the documentation and try your hand at [fine-tuning](https://huggingface.co/transformers/examples.html#glue). You can also go back and switch from distilBERT to BERT and see how that works." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EJQuqV6cnWQu", + "outputId": "402d109c-01bb-485d-a510-4be8684c9c06" + }, + "source": [ + "## Part 2: Looking back." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "outputId": "402d109c-01bb-485d-a510-4be8684c9c06", + "id": "zIBbP_oroaYH" + }, + "source": [ + "Now it is your turn to reproduce the steps above.\n", + "\n", + "We shall revisit the first homework and see whether we could improve the results a little bit more. The average ROC-AUC on test set was around $0.9$ (using the words embeddings). \n", + "\n", + "__Let's see whether we can beat it.__" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "kz8QBEXozHJx", + "outputId": "d4ac5072-c505-4c8f-89a6-b09eac02856a" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
should_bancomment_text
00The picture on the article is not of the actor...
11Its madness. Shes of Chinese heritage, but JAP...
21Fuck You. Why don't you suck a turd out of my ...
31God is dead\\nI don't mean to startle anyone bu...
41THIS USER IS A PLANT FROM BRUCE PERENS AND GRO...
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " should_ban comment_text\n", + "0 0 The picture on the article is not of the actor...\n", + "1 1 Its madness. Shes of Chinese heritage, but JAP...\n", + "2 1 Fuck You. Why don't you suck a turd out of my ...\n", + "3 1 God is dead\\nI don't mean to startle anyone bu...\n", + "4 1 THIS USER IS A PLANT FROM BRUCE PERENS AND GRO..." + ] + }, + "metadata": {}, + "execution_count": 22 + } + ], + "source": [ + "dataset_url = 'https://raw.githubusercontent.com/neychev/made_nlp_course/master/datasets/comments_small_dataset/comments.tsv'\n", + "dataset = pd.read_csv(dataset_url, sep='\\t')\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "One last note: this dataset contains some very long sentences, while the vast majority of sequences fall into category of 500 tokens and less:" + ], + "metadata": { + "id": "ZPyxeBCgGtfg" + } + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "id": "XdZjuVxRoaYH", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 374 + }, + "outputId": "b3c3d368-87b5-4c37-b24c-0507f2a8ad0d" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAlYAAAFlCAYAAAApo6aBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAARCElEQVR4nO3dbYyld1nH8d9ll6I8hBa6aXDbuFUaTWMiNBuswRBCDY/GrQkQjIENaVJfFAXRyMIbiL4pRkFIDEmlmJIgDwFMG6lgw0OML6hsoQJtRdZSaDeFLlAKShALly/mLgxl6852r9k5M/18ksncT2fOf/Lfc/ab+z5zTnV3AAA4eT+11QMAANgphBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMCQXVs9gCQ566yzeu/evVs9DACA47rxxhu/1t27j7VvJcJq7969OXTo0FYPAwDguKrqSw+2z6VAAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhu7Z6AKfK3oMf3OohjLn9iudv9RAAgGNwxgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGLKhsKqqP6yqm6vqc1X1rqr66ao6r6puqKrDVfWeqjp9OfaRy/rhZf/ezfwFAABWxXHDqqr2JPmDJPu6+5eTnJbkxUnekORN3f2kJPckuXS5yaVJ7lm2v2k5DgBgx9vopcBdSX6mqnYleVSSu5I8M8n7lv1XJ7lkWd6/rGfZf3FV1cxwAQBW13HDqruPJPmLJF/OWlDdm+TGJN/s7vuWw+5MsmdZ3pPkjuW29y3HP+GBP7eqLquqQ1V16OjRoyf7ewAAbLmNXAo8M2tnoc5L8rNJHp3kOSd7x919ZXfv6+59u3fvPtkfBwCw5TZyKfA3knyxu4929/8m+UCSpyU5Y7k0mCTnJDmyLB9Jcm6SLPsfl+Tro6MGAFhBGwmrLye5qKoetbxW6uIktyT5WJIXLMccSHLNsnztsp5l/0e7u+eGDACwmjbyGqsbsvYi9E8l+exymyuTvDrJq6rqcNZeQ3XVcpOrkjxh2f6qJAc3YdwAACtn1/EPSbr7dUle94DNtyV56jGO/W6SF5780AAAthfvvA4AMERYAQAMEVYAAEOEFQDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMAQYQUAMERYAQAMEVYAAEOEFQDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMAQYQUAMERYAQAMEVYAAEOEFQDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMAQYQUAMERYAQAMEVYAAEOEFQDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMAQYQUAMERYAQAMEVYAAEOEFQDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwZENhVVVnVNX7qurfq+rWqvq1qnp8VV1fVV9Yvp+5HFtV9ZaqOlxVn6mqCzf3VwAAWA0bPWP15iQf6u5fSvIrSW5NcjDJR7r7/CQfWdaT5LlJzl++Lkvy1tERAwCsqOOGVVU9LsnTk1yVJN39ve7+ZpL9Sa5eDrs6ySXL8v4k7+g1n0hyRlU9cXzkAAArZiNnrM5LcjTJ31bVp6vqbVX16CRnd/ddyzFfSXL2srwnyR3rbn/nsg0AYEf
bSFjtSnJhkrd291OS/Hd+dNkvSdLdnaRP5I6r6rKqOlRVh44ePXoiNwUAWEkbCas7k9zZ3Tcs6+/LWmh99f5LfMv3u5f9R5Kcu+725yzbfkx3X9nd+7p73+7dux/q+AEAVsZxw6q7v5Lkjqr6xWXTxUluSXJtkgPLtgNJrlmWr03y0uWvAy9Kcu+6S4YAADvWrg0e9/tJ3llVpye5LcnLshZl762qS5N8KcmLlmOvS/K8JIeTfGc5FgBgx9tQWHX3TUn2HWPXxcc4tpNcfpLjAgDYdrzzOgDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMAQYQUAMERYAQAMEVYAAEOEFQDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMAQYQUAMERYAQAMEVYAAEOEFQDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMAQYQUAMERYAQAMEVYAAEOEFQDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMAQYQUAMERYAQAMEVYAAEOEFQDAEGEFADBEWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMCQDYdVVZ1WVZ+uqn9Y1s+rqhuq6nBVvaeqTl+2P3JZP7zs37s5QwcAWC0ncsbqFUluXbf+hiRv6u4nJbknyaXL9kuT3LNsf9NyHADAjrehsKqqc5I8P8nblvVK8swk71sOuTrJJcvy/mU9y/6Ll+MBAHa0jZ6x+qskf5LkB8v6E5J8s7vvW9bvTLJnWd6T5I4kWfbfuxwPALCjHTesquo3k9zd3TdO3nFVXVZVh6rq0NGjRyd/NADAltjIGaunJfmtqro9ybuzdgnwzUnOqKpdyzHnJDmyLB9Jcm6SLPsfl+TrD/yh3X1ld+/r7n27d+8+qV8CAGAVHDesuvs13X1Od+9N8uIkH+3u303ysSQvWA47kOSaZfnaZT3L/o92d4+OGgBgBZ3M+1i9Osmrqupw1l5DddWy/aokT1i2vyrJwZMbIgDA9rDr+If8SHd/PMnHl+Xbkjz1GMd8N8kLB8YGALCteOd1AIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHHDauqOreqPlZVt1TVzVX1imX746vq+qr6wvL9zGV7VdVbqupwVX2mqi7c7F8CAGAVbOSM1X1J/qi7L0hyUZLLq+qCJAeTfKS7z0/ykWU9SZ6b5Pzl67Ikbx0fNQDACjpuWHX3Xd39qWX520luTbInyf4kVy+HXZ3kkmV5f5J39JpPJDmjqp44PnIAgBVzQq+xqqq9SZ6S5IYkZ3f3XcuuryQ5e1nek+SOdTe7c9kGALCjbTisquoxSd6f5JXd/a31+7q7k/SJ3HFVXVZVh6rq0NGjR0/kpgAAK2lDYVVVj8haVL2zuz+wbP7q/Zf4lu93L9uPJDl33c3PWbb9mO6+srv3dfe+3bt3P9TxAwCsjI38VWAluSrJrd39xnW7rk1yYFk+kOSaddtfuvx14EVJ7l13yRAAYMfatYFjnpbkJUk+W1U3Ldtem+SKJO+tqkuTfCnJi5Z91yV5XpLDSb6T5GWjIwYAWFHHDavu/pck9SC7Lz7G8Z3k8pMcFwDAtuOd1wEAhgg
rAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIcIKAGCIsAIAGCKsAACGCCsAgCHCCgBgiLACABgirAAAhuza6gFw4vYe/OBWD2HM7Vc8f6uHAABjnLECABgirAAAhggrAIAhwgoAYIiwAgAYIqwAAIYIKwCAIZvyPlZV9Zwkb05yWpK3dfcVm3E/sEp2yvuLeW8xgIdu/IxVVZ2W5K+TPDfJBUl+p6oumL4fAIBVsxlnrJ6a5HB335YkVfXuJPuT3LIJ98U2t1PO8uwkO2lOnH0DTrXNCKs9Se5Yt35nkl/dhPsB+H/tpEjcSQTv6tlJj5Wt/ve1ZZ8VWFWXJblsWf2vqvr8Jt7dWUm+tok/n81h3rYfc7Y9ndJ5qzecqnva8TzejuEU/fv6uQfbsRlhdSTJuevWz1m2/ZjuvjLJlZtw/z+hqg51975TcV/MMW/bjznbnszb9mTeVtNmvN3CJ5OcX1XnVdXpSV6c5NpNuB8AgJUyfsaqu++rqpcn+XDW3m7h7d198/T9AACsmk15jVV3X5fkus342Q/RKbnkyDjztv2Ys+3JvG1P5m0FVXdv9RgAAHYEH2kDADBkx4dVVT2nqj5fVYer6uBWj4cfqarbq+qzVXVTVR1atj2+qq6vqi8s389ctldVvWWZx89U1YVbO/qHj6p6e1XdXVWfW7fthOepqg4sx3+hqg5sxe/ycPIg8/b6qjqyPOZuqqrnrdv3mmXePl9Vz1633XPoKVJV51bVx6rqlqq6uapesWz3eNtOunvHfmXtxfP/meTnk5ye5N+SXLDV4/L1w/m5PclZD9j250kOLssHk7xhWX5ekn9MUkkuSnLDVo//4fKV5OlJLkzyuYc6T0ken+S25fuZy/KZW/277eSvB5m31yf542Mce8Hy/PjIJOctz5uneQ495XP2xCQXLsuPTfIfy9x4vG2jr51+xuqHH6/T3d9Lcv/H67C69ie5elm+Oskl67a/o9d8IskZVfXErRjgw013/3OSbzxg84nO07OTXN/d3+jue5Jcn+Q5mz/6h68HmbcHsz/Ju7v7f7r7i0kOZ+3503PoKdTdd3X3p5blbye5NWufZuLxto3s9LA61sfr7NmisfCTOsk/VdWNyzvxJ8nZ3X3XsvyVJGcvy+ZytZzoPJm/1fHy5bLR2++/pBTztnKqam+SpyS5IR5v28pODytW269394VJnpvk8qp6+vqdvXZO25+trjjztK28NckvJHlykruS/OXWDodjqarHJHl/kld297fW7/N4W307Paw29PE6bI3uPrJ8vzvJ32ftssNX77/Et3y/ezncXK6WE50n87cCuvur3f397v5Bkr/J2mMuMW8ro6oekbWoemd3f2DZ7PG2jez0sPLxOiuqqh5dVY+9fznJs5J8Lmvzc/9fsBxIcs2yfG2Sly5/BXNRknvXnRrn1DvRefpwkmdV1ZnL5adnLds4hR7wusTfztpjLlmbtxdX1SOr6rwk5yf513gOPaWqqpJcleTW7n7jul0eb9vIprzz+qpoH6+zys5O8vdrzyPZleTvuvtDVfXJJO+tqkuTfCnJi5bjr8vaX8AcTvKdJC879UN+eKqqdyV5RpKzqurOJK9LckVOYJ66+xtV9WdZ+486Sf60uzf6wmoeggeZt2dU1ZOzdinp9iS/lyTdfXNVvTfJLUnuS3J5d39/+TmeQ0+dpyV5SZLPVtVNy7bXxuNtW/HO6wAAQ3b6pUAAgFNGWAEADBFWAABDhBUAwBBhBQAwRFgBAAwRVgAAQ4QVAMC
Q/wPygunfOAK/eQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ], + "source": [ + "texts = dataset[\"comment_text\"].tolist()\n", + "tokenized_texts = tokenizer(texts)\n", + "ids_lens = list(len(toks) for toks in tokenized_texts[\"input_ids\"])\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.hist(ids_lens)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "We already know how to tackle the problem of different sizes of sequences with padding. However, blind padding here would pad all the sequences to the size of the largest one, which seems to be overkill. In such a case it might be sensible to actually truncate the too-long sequences to a fixed length, say 512. And we can do this easily by specifying the `max_length` and `truncation=True` arguments to the tokenizer." + ], + "metadata": { + "id": "66wufVncHF8f" + } + }, + { + "cell_type": "code", + "source": [ + "# YOUR CODE HERE\n", + "tokenized_texts = tokenizer(\n", + " texts, max_length=512, return_tensors=\"pt\", padding=True, truncation=True\n", + ")\n", + "\n", + "features = []\n", + "with torch.no_grad():\n", + " for i in range(0, len(texts), batch_size):\n", + " texts_batch = tokenized_texts[\"input_ids\"][i : i + batch_size].to(device)\n", + " masks_batch = tokenized_texts[\"attention_mask\"][i : i + batch_size].to(device)\n", + " output = model(texts_batch, masks_batch)\n", + " batch_features = output.last_hidden_state[:, 0, :].cpu().numpy()\n", + " features.append(batch_features)\n", + "\n", + "features = np.concatenate(features, axis=0)\n", + "features.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_LOYXhz_FKTr", + "outputId": "3810844f-2c91-4131-d510-bd75f1593975" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(1000, 768)" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ] + }, + { + "cell_type": "code", + "source": [ + "labels =
dataset[\"should_ban\"].values\n", + "train_features, test_features, train_labels, test_labels = train_test_split(features, labels)\n", + "lr_clf = LogisticRegression(C=0.1)\n", + "lr_clf.fit(train_features, train_labels)\n", + "lr_clf.score(test_features, test_labels)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Q6dOtVJsMXGa", + "outputId": "22d7b8e5-3205-4c95-8f57-e9dcaddc2add" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.832" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ] + }, + { + "cell_type": "code", + "source": [ + "plt.figure(figsize=(10, 6))\n", + "\n", + "proba = lr_clf.predict_proba(train_features)[:, 1]\n", + "auc = roc_auc_score(train_labels, proba)\n", + "plt.plot(*roc_curve(train_labels, proba)[:2], label=f'train AUC={auc:.4f}')\n", + "\n", + "proba = lr_clf.predict_proba(test_features)[:, 1]\n", + "auc = roc_auc_score(test_labels, proba)\n", + "plt.plot(*roc_curve(test_labels, proba)[:2], label=f'test AUC={auc:.4f}')\n", + "\n", + "plt.legend()\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 374 + }, + "id": "spZ4yhrdMm4V", + "outputId": "02c2449b-1786-4de1-cb6b-aeff6de3c775" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cc1hBVfbzHJ7" + }, + "source": [ + "So, how does it look? Did we achieve better results? \n", + "\n", + "Here come some further ideas:\n", + "\n", + "* Try using the larger BERT (e.g. BERT-base or BERT-large) and compare the results (be careful, they require more memory).\n", + "\n", + "* Using BERT output for translation? Why not ;)" + ] + } + ], + "metadata": { + "colab": { + "machine_shape": "hm", + "name": "practice_bert_for_text_classification_solved.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "accelerator": "GPU" + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/week07_bert_finetuning/README.md b/week07_bert_finetuning/README.md deleted file mode 100644 index 0dd49e1..0000000 --- a/week07_bert_finetuning/README.md +++ /dev/null @@ -1,16 +0,0 @@ -How to fine-tune BERT: -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week07_bert_finetuning/bert_finetuning.ipynb) - - -__Further readings__: -* [Blog post](http://mccormickml.com/2019/07/22/BERT-fine-tuning/) about the aforementioned notebook - -* The Illustrated BERT [blog post](http://jalammar.github.io/illustrated-bert/) - -* DistillBERT overview (distillation will be covered later in our course) [blog post](https://medium.com/huggingface/distilbert-8cf3380435b5) - -* Google AI Blog [post about open sourcing 
BERT](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html) - -* One more [blog post explaining BERT](https://yashuseth.blog/2019/06/12/bert-explained-faqs-understand-bert-working/) - -* Great PyTorch library: [pytorch-transformers](https://github.com/huggingface/transformers) diff --git a/week07_bert_finetuning/bert_finetuning.ipynb b/week07_bert_finetuning/bert_finetuning.ipynb deleted file mode 100644 index e281de0..0000000 --- a/week07_bert_finetuning/bert_finetuning.ipynb +++ /dev/null @@ -1,4876 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "EKOTlwcmxmej" - }, - "source": [ - "# BERT Fine-Tuning Tutorial with PyTorch\n", - "\n", - "Source (by Chris McCormick and Nick Ryan):\n", - "[blog post](http://mccormickml.com/2019/07/22/BERT-fine-tuning/) and [notebook](https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qCgvR9INuP5q" - }, - "source": [ - "\n", - "## What is BERT?\n", - "\n", - "BERT (Bidirectional Encoder Representations from Transformers), released in late 2018, is the model we will use in this tutorial to provide readers with a better understanding of and practical guidance for using transfer learning models in NLP. BERT is a method of pretraining language representations that was used to create models that NLP practitioners can then download and use for free. You can either use these models to extract high quality language features from your text data, or you can fine-tune these models on a specific task (classification, entity recognition, question answering, etc.) with your own data to produce state of the art predictions.\n", - "\n", - "This notebook will explain how you can modify and fine-tune BERT to create a powerful NLP model that quickly gives you state of the art results.
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DaVGdtOkuXUZ" - }, - "source": [ - "\n", - "## Advantages of Fine-Tuning\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5llwu8GBuqMb" - }, - "source": [ - "\n", - "In this tutorial, we will use BERT to train a text classifier. Specifically, we will take the pre-trained BERT model, add an untrained layer of neurons on the end, and train the new model for our classification task. Why do this rather than train a specific deep learning model (a CNN, BiLSTM, etc.) that is well suited for the specific NLP task you need? \n", - "\n", - "1. **Quicker Development**\n", - "\n", - " * First, the pre-trained BERT model weights already encode a lot of information about our language. As a result, it takes much less time to train our fine-tuned model - it is as if we have already trained the bottom layers of our network extensively and only need to gently tune them while using their output as features for our classification task. In fact, the authors recommend only 2-4 epochs of training for fine-tuning BERT on a specific NLP task (compared to the hundreds of GPU hours needed to train the original BERT model or an LSTM from scratch!). \n", - "\n", - "2. **Less Data**\n", - "\n", - " * In addition and perhaps just as important, because of the pre-trained weights this method allows us to fine-tune our task on a much smaller dataset than would be required in a model that is built from scratch. A major drawback of NLP models built from scratch is that we often need a prohibitively large dataset in order to train our network to reasonable accuracy, meaning a lot of time and energy had to be put into dataset creation. By fine-tuning BERT, we are now able to get away with training a model to good performance on a much smaller amount of training data.\n", - "\n", - "3.
**Better Results**\n", - "\n", - " * Finally, this simple fine-tuning procedure (typically adding one fully-connected layer on top of BERT and training for a few epochs) was shown to achieve state of the art results with minimal task-specific adjustments for a wide variety of tasks: classification, language inference, semantic similarity, question answering, etc. Rather than implementing custom and sometimes-obscure architectures shown to work well on a specific task, simply fine-tuning BERT is shown to be a better (or at least equal) alternative.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZEynC5F4u7Nb" - }, - "source": [ - "\n", - "### A Shift in NLP\n", - "\n", - "This shift to transfer learning parallels the same shift that took place in computer vision a few years ago. Creating a good deep learning network for computer vision tasks can take millions of parameters and be very expensive to train. Researchers discovered that deep networks learn hierarchical feature representations (simple features like edges at the lowest layers with gradually more complex features at higher layers). Rather than training a new network from scratch each time, the lower layers of a trained network with generalized image features could be copied and transferred for use in another network with a different task. It soon became common practice to download a pre-trained deep network and quickly retrain it for the new task or add additional layers on top - vastly preferable to the expensive process of training a network from scratch. For many, the introduction of deep pre-trained language models in 2018 (ELMO, BERT, ULMFIT, Open-GPT, etc.) signals the same shift to transfer learning in NLP that computer vision saw.\n", - "\n", - "Let's get started!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RX_ZDhicpHkV" - }, - "source": [ - "# 1.
Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nSU7yERLP_66" - }, - "source": [ - "## 1.1. Using Colab GPU for Training\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "oYsV4H8fCpZ-", - "outputId": "a5a4a7ba-264e-4e5f-efe9-b5aade286979" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "There are 1 GPU(s) available.\n", - "We will use the GPU: Tesla K80\n" - ] - } - ], - "source": [ - "import torch\n", - "\n", - "# If there's a GPU available...\n", - "if torch.cuda.is_available(): \n", - "\n", - " # Tell PyTorch to use the GPU. \n", - " device = torch.device(\"cuda\")\n", - "\n", - " print('There are %d GPU(s) available.' % torch.cuda.device_count())\n", - "\n", - " print('We will use the GPU:', torch.cuda.get_device_name(0))\n", - "\n", - "# If not...\n", - "else:\n", - " print('No GPU available, using the CPU instead.')\n", - " device = torch.device(\"cpu\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ElsnSNUridI" - }, - "source": [ - "## 1.2. 
Installing the Hugging Face Library\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0NmMdkZO8R6q", - "outputId": "8f8e7b63-7398-4b9f-e9d7-ca9898a019d4" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting transformers\n", - " Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)\n", - "\u001b[K |████████████████████████████████| 3.4 MB 10.8 MB/s \n", - "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.3)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.3)\n", - "Collecting huggingface-hub<1.0,>=0.1.0\n", - " Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)\n", - "\u001b[K |████████████████████████████████| 61 kB 502 kB/s \n", - "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n", - " Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n", - "\u001b[K |████████████████████████████████| 3.3 MB 23.5 MB/s \n", - "\u001b[?25hRequirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.8.2)\n", - "Collecting sacremoses\n", - " Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n", - "\u001b[K |████████████████████████████████| 895 kB 38.6 MB/s \n", - "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.4.0)\n", - "Collecting pyyaml>=5.1\n", - " Downloading 
PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", - "\u001b[K |████████████████████████████████| 596 kB 36.1 MB/s \n", - "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers) (3.10.0.2)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (3.0.6)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.6.0)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.10.8)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.1.0)\n", - "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n", - " Attempting uninstall: pyyaml\n", - " Found existing installation: PyYAML 3.13\n", - " Uninstalling PyYAML-3.13:\n", - " Successfully uninstalled PyYAML-3.13\n", - 
"Successfully installed huggingface-hub-0.2.1 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.14.1\n" - ] - } - ], - "source": [ - "!pip install transformers" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lxddqmruamSj" - }, - "source": [ - "The code in this notebook is actually a simplified version of the [run_glue.py](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py) example script from huggingface.\n", - "\n", - "`run_glue.py` is a helpful utility which allows you to pick which GLUE benchmark task you want to run on, and which pre-trained model you want to use (you can see the list of possible models [here](https://github.com/huggingface/transformers/blob/e6cff60b4cbc1158fbd6e4a1c3afda8dc224f566/examples/run_glue.py#L69)). It also supports using either the CPU, a single GPU, or multiple GPUs. It even supports using 16-bit precision if you want further speed up." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "guw6ZNtaswKc" - }, - "source": [ - "# 2. Loading CoLA Dataset\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_9ZKxKc04Btk" - }, - "source": [ - "We'll use [The Corpus of Linguistic Acceptability (CoLA)](https://nyu-mll.github.io/CoLA/) dataset for single sentence classification. It's a set of sentences labeled as grammatically correct or incorrect. It was first published in May of 2018, and is one of the tests included in the \"GLUE Benchmark\" on which models like BERT are competing.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4JrUHXms16cn" - }, - "source": [ - "## 2.1. Download & Extract" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3ZNVW6xd0T0X" - }, - "source": [ - "We'll use the `wget` package to download the dataset to the Colab instance's file system. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5m6AnuFv0QXQ", - "outputId": "07b55b91-6eed-4fc1-dedd-63cc69449f27" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting wget\n", - " Downloading wget-3.2.zip (10 kB)\n", - "Building wheels for collected packages: wget\n", - " Building wheel for wget (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=e92c9692ca261f97aa7a0da639679a843eac2f0500957b6ea7b85fbff5897f5d\n", - " Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02\n", - "Successfully built wget\n", - "Installing collected packages: wget\n", - "Successfully installed wget-3.2\n" - ] - } - ], - "source": [ - "!pip install wget" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "08pO03Ff1BjI" - }, - "source": [ - "The dataset is hosted on GitHub in this repo: https://nyu-mll.github.io/CoLA/" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pMtmPMkBzrvs", - "outputId": "3faece61-284e-4ff3-c69d-d6c0cb186e63" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading dataset...\n" - ] - } - ], - "source": [ - "import wget\n", - "import os\n", - "\n", - "print('Downloading dataset...')\n", - "\n", - "# The URL for the dataset zip file.\n", - "url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'\n", - "\n", - "# Download the file (if we haven't already)\n", - "if not os.path.exists('./cola_public_1.1.zip'):\n", - " wget.download(url, './cola_public_1.1.zip')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_mKctx-ll2FB" - }, - "source": [ - "Unzip the dataset to the file system. 
You can browse the file system of the Colab instance in the sidebar on the left." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0Yv-tNv20dnH", - "outputId": "483ba743-4dc9-45e0-f430-356520b3b7d8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Archive: cola_public_1.1.zip\n", - " creating: cola_public/\n", - " inflating: cola_public/README \n", - " creating: cola_public/tokenized/\n", - " inflating: cola_public/tokenized/in_domain_dev.tsv \n", - " inflating: cola_public/tokenized/in_domain_train.tsv \n", - " inflating: cola_public/tokenized/out_of_domain_dev.tsv \n", - " creating: cola_public/raw/\n", - " inflating: cola_public/raw/in_domain_dev.tsv \n", - " inflating: cola_public/raw/in_domain_train.tsv \n", - " inflating: cola_public/raw/out_of_domain_dev.tsv \n" - ] - } - ], - "source": [ - "# Unzip the dataset (if we haven't already)\n", - "if not os.path.exists('./cola_public/'):\n", - " !unzip cola_public_1.1.zip" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oQUy9Tat2EF_" - }, - "source": [ - "## 2.2. Parse" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xeyVCXT31EZQ" - }, - "source": [ - "We can see from the file names that both `tokenized` and `raw` versions of the data are available. \n", - "\n", - "We can't use the pre-tokenized version because, in order to apply the pre-trained BERT, we *must* use the tokenizer provided by the model. This is because (1) the model has a specific, fixed vocabulary and (2) the BERT tokenizer has a particular way of handling out-of-vocabulary words." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MYWzeGSY2xh3" - }, - "source": [ - "We'll use pandas to parse the \"in-domain\" training set and look at a few of its properties and data points." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 398 - }, - "id": "_UkeC7SG2krJ", - "outputId": "c71135dd-b9f1-4f66-ebfe-1e4502524d6a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of training sentences: 8,551\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sentence_sourcelabellabel_notessentence
1815r-671NaNI made it easy to get along with John.
6660m_021NaNThe mouse jumped out of the cheese box.
1228r-671NaNSam progressed.
4048ks081NaNTom tried to ask a question.
3815ks081NaNShe disappeared when the main party arrived.
5179kl931NaNAlmost every lawyer could answer that question.
6352d_981NaNAny woman who heard the news contributed to th...
6744m_021NaNBecause the bus drivers were on strike, the co...
4988ks081NaNThe fact that scientists have now established ...
3678ks081NaNChocolate cakes and pies are my favorite desse...
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ], - "text/plain": [ - " sentence_source ... sentence\n", - "1815 r-67 ... I made it easy to get along with John.\n", - "6660 m_02 ... The mouse jumped out of the cheese box.\n", - "1228 r-67 ... Sam progressed.\n", - "4048 ks08 ... Tom tried to ask a question.\n", - "3815 ks08 ... She disappeared when the main party arrived.\n", - "5179 kl93 ... Almost every lawyer could answer that question.\n", - "6352 d_98 ... Any woman who heard the news contributed to th...\n", - "6744 m_02 ... Because the bus drivers were on strike, the co...\n", - "4988 ks08 ... The fact that scientists have now established ...\n", - "3678 ks08 ... Chocolate cakes and pies are my favorite desse...\n", - "\n", - "[10 rows x 4 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "# Load the dataset into a pandas dataframe.\n", - "df = pd.read_csv(\"./cola_public/raw/in_domain_train.tsv\", delimiter='\\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])\n", - "\n", - "# Report the number of sentences.\n", - "print('Number of training sentences: {:,}\\n'.format(df.shape[0]))\n", - "\n", - "# Display 10 random rows from the data.\n", - "df.sample(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kfWzpPi92UAH" - }, - "source": [ - "The two properties we actually care about are the the `sentence` and its `label`, which is referred to as the \"acceptibility judgment\" (0=unacceptable, 1=acceptable)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H_LpQfzCn9_o" - }, - "source": [ - "Here are five sentences which are labeled as not grammatically acceptible. Note how much more difficult this task is than something like sentiment analysis!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 195 - }, - "id": "blqIvQaQncdJ", - "outputId": "ea5e6a0b-23b0-42d6-9b05-0e136f6ee5b9" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sentencelabel
4821I don't know that to agree with him or not.0
7669John convinced that Bill has slept.0
1731Hank plays the guitar and finds arrangements f...0
588the branch dropped bare of its apple.0
4341John is likely to appear that he will win the ...0
\n", - "
" - ], - "text/plain": [ - " sentence label\n", - "4821 I don't know that to agree with him or not. 0\n", - "7669 John convinced that Bill has slept. 0\n", - "1731 Hank plays the guitar and finds arrangements f... 0\n", - "588 the branch dropped bare of its apple. 0\n", - "4341 John is likely to appear that he will win the ... 0" - ] - }, - "execution_count": 8, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "df.loc[df.label == 0].sample(5)[['sentence', 'label']]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4SMZ5T5Imhlx" - }, - "source": [ - "\n", - "\n", - "Let's extract the sentences and labels of our training set as numpy ndarrays." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "GuE5BqICAne2" - }, - "outputs": [], - "source": [ - "# Get the lists of sentences and their labels.\n", - "sentences = df.sentence.values\n", - "labels = df.label.values" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ex5O1eV-Pfct" - }, - "source": [ - "# 3. Tokenization & Input Formatting" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-8kEDRvShcU5" - }, - "source": [ - "## 3.1. BERT Tokenizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bWOPOyWghJp2" - }, - "source": [ - "\n", - "To feed our text to BERT, it must be split into tokens, and then these tokens must be mapped to their index in the tokenizer vocabulary.\n", - "\n", - "The tokenization must be performed by the tokenizer included with BERT--the below cell will download this for us. 
We'll be using the \"uncased\" version here.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 162, - "referenced_widgets": [ - "93853fa079c546b58b1c9ed8e586ae6c", - "8f66495735614898b617c27c0a89255d", - "e4306d918bbb47abae367df2c7ac2e9d", - "ac1cb16ed2ef4e6ab40e790e5e9ce47f", - "89c02af3d0f74c18a8e294f78b1f961e", - "a868dfc37b55447d8bab7350ca8cd0cf", - "ab13bda5dc204c3cbb633535494eb453", - "3eb262cbc05a4a48808623fbd11d7649", - "dbb229077bcf437aa25c5a16276e9c48", - "3dfad6bb2985460da31cc42d7e88168c", - "972be3faee024479ab0667892594d00c", - "747ba5c2117a42b28539371fe04b19ad", - "c91181d3293d457a8c6d32aca47e5c27", - "37ea42eb1ecd44eab1ea46db51fe2800", - "ef3a445c93764273b132d56d8b7545a3", - "ca26fc3ff341416f820e3ea94aa83188", - "24fd7d7781a448f6b4164bbeb64888b3", - "0d0fa745757f4c93a5e3ce9c2fadc9a3", - "da0235bd5a3642eba1b5749afe2c6f61", - "edeb0997fdb84daba10b4a9a102adee5", - "879a098d294d444c8176ce6221846d95", - "e389ecd683b64c28be15da07866f73c1", - "44e2066d588b496f97aa00cf51c9447c", - "176c3ac86a8f4dfa9ff221d67076edc5", - "93d601fafc26454b8de64f626daf3074", - "2a70c1b9bda34e6a89ca14b3b663b959", - "e9afd7edefb44fe4847ab5a9c0b24999", - "7d6e576e3c464b5693c51d35032107cb", - "fb5ff76f955942dda1bb6c98f827b893", - "f57a3f9b473543bda0c27ebcba001fad", - "b59dd15b11a64b8cace572993e8323ba", - "d0970cd03bc74b5da9f913fda325c1e0", - "9110261cd8b94147b72dec393886b308", - "521bcdb6786b4ba1a361cc8de1c97036", - "da7f2189149d4dcb8d012ffa2b9e34bb", - "04b9332a3fa9454da1a7169353b99221", - "a29e31e998974e758d09daf1a6186610", - "d2f54cdaba8e4a04bdb3b8d279afe5ef", - "ce8f59b53f894031b4ff4e0e1b63a61f", - "cf5d78f166d54aadbd8f68976f1e9ccc", - "c9f13c482ecc4448816bec89e13816ab", - "3a5ad7be77ce4127b434b749ad14b00a", - "f2dcfd7f0f9a48b08730714d44d18218", - "86fc378943aa48ee893bef7404abc96a" - ] - }, - "id": "Z474sSC6oe7A", - "outputId": "d7ac0ff8-54b1-4e2d-c3ea-a9a578fe46c6" - }, - "outputs": [ - 
{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading BERT tokenizer...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "93853fa079c546b58b1c9ed8e586ae6c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading: 0%| | 0.00/226k [00:00 \"The first token of every sequence is always a special classification token (`[CLS]`). The final hidden state\n", - "corresponding to this token is used as the aggregate sequence representation for classification\n", - "tasks.\" (from the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))\n", - "\n", - "You might think to try some pooling strategy over the final embeddings, but this isn't necessary. Because BERT is trained to only use this [CLS] token for classification, we know that the model has been motivated to encode everything it needs for the classification step into that single 768-value embedding vector. It's already done the pooling for us!\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u51v0kFxeteu" - }, - "source": [ - "### Sentence Length & Attention Mask\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qPNuwqZVK3T6" - }, - "source": [ - "The sentences in our dataset obviously have varying lengths, so how does BERT handle this?\n", - "\n", - "BERT has two constraints:\n", - "1. All sentences must be padded or truncated to a single, fixed length.\n", - "2. The maximum sentence length is 512 tokens.\n", - "\n", - "Padding is done with a special `[PAD]` token, which is at index 0 in the BERT vocabulary. The below illustration demonstrates padding out to a \"MAX_LEN\" of 8 tokens.\n", - "\n", - "\n", - "\n", - "The \"Attention Mask\" is simply an array of 1s and 0s indicating which tokens are padding and which aren't (seems kind of redundant, doesn't it?!). 
This mask tells the \"Self-Attention\" mechanism in BERT not to incorporate these PAD tokens into its interpretation of the sentence.\n", - "\n", - "The maximum length does impact training and evaluation speed, however. \n", - "For example, with a Tesla K80:\n", - "\n", - "`MAX_LEN = 128 --> Training epochs take ~5:28 each`\n", - "\n", - "`MAX_LEN = 64 --> Training epochs take ~2:57 each`\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l6w8elb-58GJ" - }, - "source": [ - "## 3.3. Tokenize Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U28qy4P-NwQ9" - }, - "source": [ - "The transformers library provides a helpful `encode` function which will handle most of the parsing and data prep steps for us.\n", - "\n", - "Before we are ready to encode our text, though, we need to decide on a **maximum sentence length** for padding / truncating to.\n", - "\n", - "The below cell will perform one tokenization pass of the dataset in order to measure the maximum sentence length." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cKsH2sU0OCQA", - "outputId": "eb654716-a9b6-4cbd-ab4d-73eb7057d3ad" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Max sentence length: 47\n" - ] - } - ], - "source": [ - "max_len = 0\n", - "\n", - "# For every sentence...\n", - "for sent in sentences:\n", - "\n", - " # Tokenize the text and add `[CLS]` and `[SEP]` tokens.\n", - " input_ids = tokenizer.encode(sent, add_special_tokens=True)\n", - "\n", - " # Update the maximum sentence length.\n", - " max_len = max(max_len, len(input_ids))\n", - "\n", - "print('Max sentence length: ', max_len)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1M296yz577fV" - }, - "source": [ - "Just in case there are some longer test sentences, I'll set the maximum length to 64.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tIWAoWL2RK1p" - }, - "source": [ - "Now we're ready to perform the real tokenization.\n", - "\n", - "The `tokenizer.encode_plus` function combines multiple steps for us:\n", - "\n", - "1. Split the sentence into tokens.\n", - "2. Add the special `[CLS]` and `[SEP]` tokens.\n", - "3. Map the tokens to their IDs.\n", - "4. Pad or truncate all sentences to the same length.\n", - "5. Create the attention masks which explicitly differentiate real tokens from `[PAD]` tokens.\n", - "\n", - "The first four features are in `tokenizer.encode`, but I'm using `tokenizer.encode_plus` to get the fifth item (attention masks). 
Documentation is [here](https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus).\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2bBdb3pt8LuQ", - "outputId": "2c69be67-ed4e-4fde-b82a-1e20e2a493d0" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", - "/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py:2227: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
512 for Bert).\n", - " FutureWarning,\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original: Our friends won't buy this analysis, let alone the next one we propose.\n", - "Token IDs: tensor([ 101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010,\n", - " 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0])\n" - ] - } - ], - "source": [ - "# Tokenize all of the sentences and map the tokens to thier word IDs.\n", - "input_ids = []\n", - "attention_masks = []\n", - "\n", - "# For every sentence...\n", - "for sent in sentences:\n", - " # `encode_plus` will:\n", - " # (1) Tokenize the sentence.\n", - " # (2) Prepend the `[CLS]` token to the start.\n", - " # (3) Append the `[SEP]` token to the end.\n", - " # (4) Map tokens to their IDs.\n", - " # (5) Pad or truncate the sentence to `max_length`\n", - " # (6) Create attention masks for [PAD] tokens.\n", - " encoded_dict = tokenizer.encode_plus(\n", - " sent, # Sentence to encode.\n", - " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", - " max_length = 64, # Pad & truncate all sentences.\n", - " pad_to_max_length = True,\n", - " return_attention_mask = True, # Construct attn. masks.\n", - " return_tensors = 'pt', # Return pytorch tensors.\n", - " )\n", - " \n", - " # Add the encoded sentence to the list. 
\n", - " input_ids.append(encoded_dict['input_ids'])\n", - " \n", - " # And its attention mask (simply differentiates padding from non-padding).\n", - " attention_masks.append(encoded_dict['attention_mask'])\n", - "\n", - "# Convert the lists into tensors.\n", - "input_ids = torch.cat(input_ids, dim=0)\n", - "attention_masks = torch.cat(attention_masks, dim=0)\n", - "labels = torch.tensor(labels)\n", - "\n", - "# Print sentence 0, now as a list of IDs.\n", - "print('Original: ', sentences[0])\n", - "print('Token IDs:', input_ids[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aRp4O7D295d_" - }, - "source": [ - "## 3.4. Training & Validation Split\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qu0ao7p8rb06" - }, - "source": [ - "Divide up our training set to use 90% for training and 10% for validation." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GEgLpFVlo1Z-", - "outputId": "9e68f506-acf3-4f45-f430-dc00cea3326b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7,695 training samples\n", - " 856 validation samples\n" - ] - } - ], - "source": [ - "from torch.utils.data import TensorDataset, random_split\n", - "\n", - "# Combine the training inputs into a TensorDataset.\n", - "dataset = TensorDataset(input_ids, attention_masks, labels)\n", - "\n", - "# Create a 90-10 train-validation split.\n", - "\n", - "# Calculate the number of samples to include in each set.\n", - "train_size = int(0.9 * len(dataset))\n", - "val_size = len(dataset) - train_size\n", - "\n", - "# Divide the dataset by randomly selecting samples.\n", - "train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n", - "\n", - "print('{:>5,} training samples'.format(train_size))\n", - "print('{:>5,} validation samples'.format(val_size))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"dD9i6Z2pG-sN" - }, - "source": [ - "We'll also create an iterator for our dataset using the torch DataLoader class. This helps save on memory during training because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "XGUqOCtgqGhP" - }, - "outputs": [], - "source": [ - "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n", - "\n", - "# The DataLoader needs to know our batch size for training, so we specify it \n", - "# here. For fine-tuning BERT on a specific task, the authors recommend a batch \n", - "# size of 16 or 32.\n", - "batch_size = 32\n", - "\n", - "# Create the DataLoaders for our training and validation sets.\n", - "# We'll take training samples in random order. \n", - "train_dataloader = DataLoader(\n", - " train_dataset, # The training samples.\n", - " sampler = RandomSampler(train_dataset), # Select batches randomly\n", - " batch_size = batch_size # Trains with this batch size.\n", - " )\n", - "\n", - "# For validation the order doesn't matter, so we'll just read them sequentially.\n", - "validation_dataloader = DataLoader(\n", - " val_dataset, # The validation samples.\n", - " sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.\n", - " batch_size = batch_size # Evaluate with this batch size.\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8bwa6Rts-02-" - }, - "source": [ - "# 4. Train Our Classification Model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3xYQ3iLO08SX" - }, - "source": [ - "Now that our input data is properly formatted, it's time to fine tune the BERT model. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D6TKgyUzPIQc" - }, - "source": [ - "## 4.1. 
BertForSequenceClassification" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1sjzRT1V0zwm" - }, - "source": [ - "For this task, we first want to modify the pre-trained BERT model to give outputs for classification, and then we want to continue training the model on our dataset until the entire model, end-to-end, is well-suited for our task. \n", - "\n", - "Thankfully, the huggingface pytorch implementation includes a set of interfaces designed for a variety of NLP tasks. Though these interfaces are all built on top of a trained BERT model, each has different top layers and output types designed to accommodate their specific NLP task. \n", - "\n", - "Here is the current list of classes provided for fine-tuning:\n", - "* BertModel\n", - "* BertForPreTraining\n", - "* BertForMaskedLM\n", - "* BertForNextSentencePrediction\n", - "* **BertForSequenceClassification** - The one we'll use.\n", - "* BertForTokenClassification\n", - "* BertForQuestionAnswering\n", - "\n", - "The documentation for these can be found [here](https://huggingface.co/transformers/v2.2.0/model_doc/bert.html)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BXYitPoE-cjH" - }, - "source": [ - "\n", - "\n", - "We'll be using [BertForSequenceClassification](https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#bertforsequenceclassification). This is the normal BERT model with an added single linear layer on top for classification that we will use as a sentence classifier. As we feed input data, the entire pre-trained BERT model and the additional untrained classification layer are trained on our specific task. \n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WnQW9E-bBCRt" - }, - "source": [ - "OK, let's load BERT! There are a few different pre-trained BERT models available. 
\"bert-base-uncased\" means the version that has only lowercase letters (\"uncased\") and is the smaller version of the two (\"base\" vs \"large\").\n", - "\n", - "The documentation for `from_pretrained` can be found [here](https://huggingface.co/transformers/v2.2.0/main_classes/model.html#transformers.PreTrainedModel.from_pretrained), with the additional parameters defined [here](https://huggingface.co/transformers/v2.2.0/main_classes/configuration.html#transformers.PretrainedConfig)." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000, - "referenced_widgets": [ - "bf663b176284485e8db6b888bbd73d00", - "0358864aed7145e5a8b1ae70b33d8011", - "0a2a650a8e68491c85be1d84112645c3", - "6494768c95954aeaa092d1b8f494bf65", - "d4bd45ba11a94400a6b2afb62638506c", - "0a49f12eb51244f2882a979483acd105", - "801f8b39de0c488baf8c92705d3e2905", - "17b79c0e5d194a07bfff70f84ec10858", - "cca16e2b7d7446ccb05a2a09fdb1134b", - "54829969375b46d0a07900700a6e21a3", - "962dda4c15404a2c84caf50e208164b9" - ] - }, - "id": "gFsCTp_mporB", - "outputId": "2f63a147-b192-4f71-d1b6-93edc2abddcd" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bf663b176284485e8db6b888bbd73d00", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading: 0%| | 0.00/420M [00:0012}\".format(p[0], str(tuple(p[1].size()))))\n", - "\n", - "print('\\n==== First Transformer ====\\n')\n", - "\n", - "for p in params[5:21]:\n", - " print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))\n", - "\n", - "print('\\n==== Output Layer ====\\n')\n", - "\n", - "for p in params[-4:]:\n", - " print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qRWT-D4U_Pvx" - }, - "source": [ - "## 4.2. 
Optimizer & Learning Rate Scheduler" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8o-VEBobKwHk" - }, - "source": [ - "Now that we have our model loaded we need to grab the training hyperparameters from within the stored model.\n", - "\n", - "For the purposes of fine-tuning, the authors recommend choosing from the following values (from Appendix A.3 of the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf)):\n", - "\n", - ">- **Batch size:** 16, 32 \n", - "- **Learning rate (Adam):** 5e-5, 3e-5, 2e-5 \n", - "- **Number of epochs:** 2, 3, 4 \n", - "\n", - "We chose:\n", - "* Batch size: 32 (set when creating our DataLoaders)\n", - "* Learning rate: 2e-5\n", - "* Epochs: 4 (we'll see that this is probably too many...)\n", - "\n", - "The epsilon parameter `eps = 1e-8` is \"a very small number to prevent any division by zero in the implementation\" (from [here](https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/)).\n", - "\n", - "You can find the creation of the AdamW optimizer in `run_glue.py` [here](https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L109)." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "id": "GLs72DuMODJO" - }, - "outputs": [], - "source": [ - "# Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n", - "# I believe the 'W' stands for 'Weight Decay fix\"\n", - "optimizer = AdamW(model.parameters(),\n", - " lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n", - " eps = 1e-8 # args.adam_epsilon - default is 1e-8.\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "id": "-p0upAhhRiIx" - }, - "outputs": [], - "source": [ - "from transformers import get_linear_schedule_with_warmup\n", - "\n", - "# Number of training epochs. The BERT authors recommend between 2 and 4. 
\n", - "# We chose to run for 4, but we'll see later that this may be over-fitting the\n", - "# training data.\n", - "epochs = 4\n", - "\n", - "# Total number of training steps is [number of batches] x [number of epochs]. \n", - "# (Note that this is not the same as the number of training samples).\n", - "total_steps = len(train_dataloader) * epochs\n", - "\n", - "# Create the learning rate scheduler.\n", - "scheduler = get_linear_schedule_with_warmup(optimizer, \n", - " num_warmup_steps = 0, # Default value in run_glue.py\n", - " num_training_steps = total_steps)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RqfmWwUR_Sox" - }, - "source": [ - "## 4.3. Training Loop" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_QXZhFb4LnV5" - }, - "source": [ - "Below is our training loop. There's a lot going on, but fundamentally for each pass in our loop we have a training phase and a validation phase. \n", - "\n", - "> *Thank you to [Stas Bekman](https://ca.linkedin.com/in/stasbekman) for contributing the insights and code for using validation loss to detect over-fitting!*\n", - "\n", - "**Training:**\n", - "- Unpack our data inputs and labels\n", - "- Load data onto the GPU for acceleration\n", - "- Clear out the gradients calculated in the previous pass. 
\n", - " - In pytorch the gradients accumulate by default (useful for things like RNNs) unless you explicitly clear them out.\n", - "- Forward pass (feed input data through the network)\n", - "- Backward pass (backpropagation)\n", - "- Tell the network to update parameters with optimizer.step()\n", - "- Track variables for monitoring progress\n", - "\n", - "**Evaluation:**\n", - "- Unpack our data inputs and labels\n", - "- Load data onto the GPU for acceleration\n", - "- Forward pass (feed input data through the network)\n", - "- Compute loss on our validation data and track variables for monitoring progress\n", - "\n", - "Pytorch hides all of the detailed calculations from us, but we've commented the code to point out which of the above steps are happening on each line. \n", - "\n", - "> *PyTorch also has some [beginner tutorials](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py) which you may also find helpful.*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pE5B99H5H2-W" - }, - "source": [ - "Define a helper function for calculating accuracy." 
- ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "id": "9cQNvaZ9bnyy" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "# Function to calculate the accuracy of our predictions vs labels\n", - "def flat_accuracy(preds, labels):\n", - " pred_flat = np.argmax(preds, axis=1).flatten()\n", - " labels_flat = labels.flatten()\n", - " return np.sum(pred_flat == labels_flat) / len(labels_flat)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KNhRtWPXH9C3" - }, - "source": [ - "Helper function for formatting elapsed times as `hh:mm:ss`\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "id": "gpt6tR83keZD" - }, - "outputs": [], - "source": [ - "import time\n", - "import datetime\n", - "\n", - "def format_time(elapsed):\n", - " '''\n", - " Takes a time in seconds and returns a string hh:mm:ss\n", - " '''\n", - " # Round to the nearest second.\n", - " elapsed_rounded = int(round((elapsed)))\n", - " \n", - " # Format as hh:mm:ss\n", - " return str(datetime.timedelta(seconds=elapsed_rounded))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cfNIhN19te3N" - }, - "source": [ - "We're ready to kick off the training!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6J-FYdx6nFE_", - "outputId": "6847197f-8ee4-40fe-a3ff-c354aefba69a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======== Epoch 1 / 4 ========\n", - "Training...\n", - " Batch 40 of 241. Elapsed: 0:00:26.\n", - " Batch 80 of 241. Elapsed: 0:00:52.\n", - " Batch 120 of 241. Elapsed: 0:01:17.\n", - " Batch 160 of 241. Elapsed: 0:01:43.\n", - " Batch 200 of 241. Elapsed: 0:02:09.\n", - " Batch 240 of 241. 
Elapsed: 0:02:35.\n", - "\n", - " Average training loss: 0.49\n", - " Training epcoh took: 0:02:36\n", - "\n", - "Running Validation...\n", - " Accuracy: 0.82\n", - " Validation Loss: 0.41\n", - " Validation took: 0:00:06\n", - "\n", - "======== Epoch 2 / 4 ========\n", - "Training...\n", - " Batch 40 of 241. Elapsed: 0:00:26.\n", - " Batch 80 of 241. Elapsed: 0:00:52.\n", - " Batch 120 of 241. Elapsed: 0:01:18.\n", - " Batch 160 of 241. Elapsed: 0:01:44.\n", - " Batch 200 of 241. Elapsed: 0:02:10.\n", - " Batch 240 of 241. Elapsed: 0:02:36.\n", - "\n", - " Average training loss: 0.30\n", - " Training epcoh took: 0:02:36\n", - "\n", - "Running Validation...\n", - " Accuracy: 0.84\n", - " Validation Loss: 0.39\n", - " Validation took: 0:00:06\n", - "\n", - "======== Epoch 3 / 4 ========\n", - "Training...\n", - " Batch 40 of 241. Elapsed: 0:00:26.\n", - " Batch 80 of 241. Elapsed: 0:00:52.\n" - ] - } - ], - "source": [ - "import random\n", - "import numpy as np\n", - "\n", - "# This training code is based on the `run_glue.py` script here:\n", - "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n", - "\n", - "# Set the seed value all over the place to make this reproducible.\n", - "seed_val = 42\n", - "\n", - "random.seed(seed_val)\n", - "np.random.seed(seed_val)\n", - "torch.manual_seed(seed_val)\n", - "torch.cuda.manual_seed_all(seed_val)\n", - "\n", - "# We'll store a number of quantities such as training and validation loss, \n", - "# validation accuracy, and timings.\n", - "training_stats = []\n", - "\n", - "# Measure the total training time for the whole run.\n", - "total_t0 = time.time()\n", - "\n", - "# For each epoch...\n", - "for epoch_i in range(0, epochs):\n", - " \n", - " # ========================================\n", - " # Training\n", - " # ========================================\n", - " \n", - " # Perform one full pass over the training set.\n", - "\n", - " print(\"\")\n", - " 
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n", - " print('Training...')\n", - "\n", - " # Measure how long the training epoch takes.\n", - " t0 = time.time()\n", - "\n", - " # Reset the total loss for this epoch.\n", - " total_train_loss = 0\n", - "\n", - " # Put the model into training mode. Don't be mislead--the call to \n", - " # `train` just changes the *mode*, it doesn't *perform* the training.\n", - " # `dropout` and `batchnorm` layers behave differently during training\n", - " # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)\n", - " model.train()\n", - "\n", - " # For each batch of training data...\n", - " for step, batch in enumerate(train_dataloader):\n", - "\n", - " # Progress update every 40 batches.\n", - " if step % 40 == 0 and not step == 0:\n", - " # Calculate elapsed time in minutes.\n", - " elapsed = format_time(time.time() - t0)\n", - " \n", - " # Report progress.\n", - " print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n", - "\n", - " # Unpack this training batch from our dataloader. \n", - " #\n", - " # As we unpack the batch, we'll also copy each tensor to the GPU using the \n", - " # `to` method.\n", - " #\n", - " # `batch` contains three pytorch tensors:\n", - " # [0]: input ids \n", - " # [1]: attention masks\n", - " # [2]: labels \n", - " b_input_ids = batch[0].to(device)\n", - " b_input_mask = batch[1].to(device)\n", - " b_labels = batch[2].to(device)\n", - "\n", - " # Always clear any previously calculated gradients before performing a\n", - " # backward pass. PyTorch doesn't do this automatically because \n", - " # accumulating the gradients is \"convenient while training RNNs\". 
\n", - " # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n", - " model.zero_grad() \n", - "\n", - " # Perform a forward pass (evaluate the model on this training batch).\n", - " # In PyTorch, calling `model` will in turn call the model's `forward` \n", - " # function and pass down the arguments. The `forward` function is \n", - " # documented here: \n", - " # https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification\n", - " # The results are returned in a results object, documented here:\n", - " # https://huggingface.co/transformers/main_classes/output.html#transformers.modeling_outputs.SequenceClassifierOutput\n", - " # Specifically, we'll get the loss (because we provided labels) and the\n", - " # \"logits\"--the model outputs prior to activation.\n", - " result = model(b_input_ids, \n", - " token_type_ids=None, \n", - " attention_mask=b_input_mask, \n", - " labels=b_labels,\n", - " return_dict=True)\n", - "\n", - " loss = result.loss\n", - " logits = result.logits\n", - "\n", - " # Accumulate the training loss over all of the batches so that we can\n", - " # calculate the average loss at the end. 
`loss` is a Tensor containing a\n", - " # single value; the `.item()` function just returns the Python value \n", - " # from the tensor.\n", - " total_train_loss += loss.item()\n", - "\n", - " # Perform a backward pass to calculate the gradients.\n", - " loss.backward()\n", - "\n", - " # Clip the norm of the gradients to 1.0.\n", - " # This is to help prevent the \"exploding gradients\" problem.\n", - " torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n", - "\n", - " # Update parameters and take a step using the computed gradient.\n", - " # The optimizer dictates the \"update rule\"--how the parameters are\n", - " # modified based on their gradients, the learning rate, etc.\n", - " optimizer.step()\n", - "\n", - " # Update the learning rate.\n", - " scheduler.step()\n", - "\n", - " # Calculate the average loss over all of the batches.\n", - " avg_train_loss = total_train_loss / len(train_dataloader) \n", - " \n", - " # Measure how long this epoch took.\n", - " training_time = format_time(time.time() - t0)\n", - "\n", - " print(\"\")\n", - " print(\" Average training loss: {0:.2f}\".format(avg_train_loss))\n", - " print(\" Training epcoh took: {:}\".format(training_time))\n", - " \n", - " # ========================================\n", - " # Validation\n", - " # ========================================\n", - " # After the completion of each training epoch, measure our performance on\n", - " # our validation set.\n", - "\n", - " print(\"\")\n", - " print(\"Running Validation...\")\n", - "\n", - " t0 = time.time()\n", - "\n", - " # Put the model in evaluation mode--the dropout layers behave differently\n", - " # during evaluation.\n", - " model.eval()\n", - "\n", - " # Tracking variables \n", - " total_eval_accuracy = 0\n", - " total_eval_loss = 0\n", - " nb_eval_steps = 0\n", - "\n", - " # Evaluate data for one epoch\n", - " for batch in validation_dataloader:\n", - " \n", - " # Unpack this training batch from our dataloader. 
\n", - " #\n", - " # As we unpack the batch, we'll also copy each tensor to the GPU using \n", - " # the `to` method.\n", - " #\n", - " # `batch` contains three pytorch tensors:\n", - " # [0]: input ids \n", - " # [1]: attention masks\n", - " # [2]: labels \n", - " b_input_ids = batch[0].to(device)\n", - " b_input_mask = batch[1].to(device)\n", - " b_labels = batch[2].to(device)\n", - " \n", - " # Tell pytorch not to bother with constructing the compute graph during\n", - " # the forward pass, since this is only needed for backprop (training).\n", - " with torch.no_grad(): \n", - "\n", - " # Forward pass, calculate logit predictions.\n", - " # token_type_ids is the same as the \"segment ids\", which \n", - " # differentiates sentence 1 and 2 in 2-sentence tasks.\n", - " result = model(b_input_ids, \n", - " token_type_ids=None, \n", - " attention_mask=b_input_mask,\n", - " labels=b_labels,\n", - " return_dict=True)\n", - "\n", - " # Get the loss and \"logits\" output by the model. The \"logits\" are the \n", - " # output values prior to applying an activation function like the \n", - " # softmax.\n", - " loss = result.loss\n", - " logits = result.logits\n", - " \n", - " # Accumulate the validation loss.\n", - " total_eval_loss += loss.item()\n", - "\n", - " # Move logits and labels to CPU\n", - " logits = logits.detach().cpu().numpy()\n", - " label_ids = b_labels.to('cpu').numpy()\n", - "\n", - " # Calculate the accuracy for this batch of test sentences, and\n", - " # accumulate it over all batches.\n", - " total_eval_accuracy += flat_accuracy(logits, label_ids)\n", - " \n", - "\n", - " # Report the final accuracy for this validation run.\n", - " avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n", - " print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n", - "\n", - " # Calculate the average loss over all of the batches.\n", - " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", - " \n", - " # Measure how long the validation run 
took.\n", - " validation_time = format_time(time.time() - t0)\n", - " \n", - " print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n", - " print(\" Validation took: {:}\".format(validation_time))\n", - "\n", - " # Record all statistics from this epoch.\n", - " training_stats.append(\n", - " {\n", - " 'epoch': epoch_i + 1,\n", - " 'Training Loss': avg_train_loss,\n", - " 'Valid. Loss': avg_val_loss,\n", - " 'Valid. Accur.': avg_val_accuracy,\n", - " 'Training Time': training_time,\n", - " 'Validation Time': validation_time\n", - " }\n", - " )\n", - "\n", - "print(\"\")\n", - "print(\"Training complete!\")\n", - "\n", - "print(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VQTvJ1vRP7u4" - }, - "source": [ - "Let's view the summary of the training process." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 195 - }, - "id": "6O_NbXFGMukX", - "outputId": "a9e51eda-5eae-4800-87d5-8d016ff25bb2" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Training LossValid. LossValid. Accur.Training TimeValidation Time
epoch
10.490.440.810:00:350:00:01
20.300.450.810:00:340:00:01
30.180.500.830:00:340:00:01
40.120.570.840:00:340:00:01
\n", - "
" - ], - "text/plain": [ - " Training Loss Valid. Loss Valid. Accur. Training Time Validation Time\n", - "epoch \n", - "1 0.49 0.44 0.81 0:00:35 0:00:01\n", - "2 0.30 0.45 0.81 0:00:34 0:00:01\n", - "3 0.18 0.50 0.83 0:00:34 0:00:01\n", - "4 0.12 0.57 0.84 0:00:34 0:00:01" - ] - }, - "execution_count": 23, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "# Display floats with two decimal places.\n", - "pd.set_option('precision', 2)\n", - "\n", - "# Create a DataFrame from our training statistics.\n", - "df_stats = pd.DataFrame(data=training_stats)\n", - "\n", - "# Use the 'epoch' as the row index.\n", - "df_stats = df_stats.set_index('epoch')\n", - "\n", - "# A hack to force the column headers to wrap.\n", - "#df = df.style.set_table_styles([dict(selector=\"th\",props=[('max-width', '70px')])])\n", - "\n", - "# Display the table.\n", - "df_stats" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1-G03mmwH3aI" - }, - "source": [ - "Notice that, while the training loss is going down with each epoch, the validation loss is increasing! This suggests that we are training our model too long, and it's over-fitting on the training data. \n", - "\n", - "(For reference, we are using 7,695 training samples and 856 validation samples).\n", - "\n", - "Validation Loss is a more precise measure than accuracy, because with accuracy we don't care about the exact output value, but just which side of a threshold it falls on. \n", - "\n", - "If we are predicting the correct answer, but with less confidence, then validation loss will catch this, while accuracy will not." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 427 - }, - "id": "68xreA9JAmG5", - "outputId": "70b8500d-7efc-4c99-de1f-05e8795e6298" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "% matplotlib inline\n", - "\n", - "import seaborn as sns\n", - "\n", - "# Use plot styling from seaborn.\n", - "sns.set(style='darkgrid')\n", - "\n", - "# Increase the plot size and font size.\n", - "sns.set(font_scale=1.5)\n", - "plt.rcParams[\"figure.figsize\"] = (12,6)\n", - "\n", - "# Plot the learning curve.\n", - "plt.plot(df_stats['Training Loss'], 'b-o', label=\"Training\")\n", - "plt.plot(df_stats['Valid. Loss'], 'g-o', label=\"Validation\")\n", - "\n", - "# Label the plot.\n", - "plt.title(\"Training & Validation Loss\")\n", - "plt.xlabel(\"Epoch\")\n", - "plt.ylabel(\"Loss\")\n", - "plt.legend()\n", - "plt.xticks([1, 2, 3, 4])\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mkyubuJSOzg3" - }, - "source": [ - "# 5. Performance On Test Set" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DosV94BYIYxg" - }, - "source": [ - "Now we'll load the holdout dataset and prepare inputs just as we did with the training set. Then we'll evaluate predictions using [Matthew's correlation coefficient](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html) because this is the metric used by the wider NLP community to evaluate performance on CoLA. With this metric, +1 is the best score, and -1 is the worst score. This way, we can see how well we perform against the state of the art models for this specific task." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Tg42jJqqM68F" - }, - "source": [ - "### 5.1. Data Preparation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xWe0_JW21MyV" - }, - "source": [ - "\n", - "We'll need to apply all of the same steps that we did for the training data to prepare our test data set." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mAN0LZBOOPVh", - "outputId": "7385ca3f-72d5-45f0-bbfe-5056c2f62c4f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of test sentences: 516\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils_base.py:2143: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", - " FutureWarning,\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "# Load the dataset into a pandas dataframe.\n", - "df = pd.read_csv(\"./cola_public/raw/out_of_domain_dev.tsv\", delimiter='\\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])\n", - "\n", - "# Report the number of sentences.\n", - "print('Number of test sentences: {:,}\\n'.format(df.shape[0]))\n", - "\n", - "# Create sentence and label lists\n", - "sentences = df.sentence.values\n", - "labels = df.label.values\n", - "\n", - "# Tokenize all of the sentences and map the tokens to their word IDs.\n", - "input_ids = []\n", - "attention_masks = []\n", - "\n", - "# For every sentence...\n", - "for sent in sentences:\n", - " # `encode_plus` will:\n", - " # (1) Tokenize the sentence.\n", - " # (2) Prepend the `[CLS]` token to the start.\n", - " # (3) Append the `[SEP]` token to the end.\n", - " # (4) Map tokens to their IDs.\n", - " # (5) Pad or truncate the sentence to `max_length`\n", - " # (6) Create attention masks for [PAD] tokens.\n", - " 
encoded_dict = tokenizer.encode_plus(\n", - " sent, # Sentence to encode.\n", - " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", - " max_length = 64, # Pad & truncate all sentences.\n", - " pad_to_max_length = True,\n", - " return_attention_mask = True, # Construct attn. masks.\n", - " return_tensors = 'pt', # Return pytorch tensors.\n", - " )\n", - " \n", - " # Add the encoded sentence to the list. \n", - " input_ids.append(encoded_dict['input_ids'])\n", - " \n", - " # And its attention mask (simply differentiates padding from non-padding).\n", - " attention_masks.append(encoded_dict['attention_mask'])\n", - "\n", - "# Convert the lists into tensors.\n", - "input_ids = torch.cat(input_ids, dim=0)\n", - "attention_masks = torch.cat(attention_masks, dim=0)\n", - "labels = torch.tensor(labels)\n", - "\n", - "# Set the batch size. \n", - "batch_size = 32 \n", - "\n", - "# Create the DataLoader.\n", - "prediction_data = TensorDataset(input_ids, attention_masks, labels)\n", - "prediction_sampler = SequentialSampler(prediction_data)\n", - "prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "16lctEOyNFik" - }, - "source": [ - "## 5.2. Evaluate on Test Set\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rhR99IISNMg9" - }, - "source": [ - "\n", - "With the test set prepared, we can apply our fine-tuned model to generate predictions on the test set." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Hba10sXR7Xi6", - "outputId": "e35f0a6e-72c5-4bd0-9c4b-dcec9ef5059d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Predicting labels for 516 test sentences...\n", - " DONE.\n" - ] - } - ], - "source": [ - "# Prediction on test set\n", - "\n", - "print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))\n", - "\n", - "# Put model in evaluation mode\n", - "model.eval()\n", - "\n", - "# Tracking variables \n", - "predictions , true_labels = [], []\n", - "\n", - "# Predict \n", - "for batch in prediction_dataloader:\n", - " # Add batch to GPU\n", - " batch = tuple(t.to(device) for t in batch)\n", - " \n", - " # Unpack the inputs from our dataloader\n", - " b_input_ids, b_input_mask, b_labels = batch\n", - " \n", - " # Telling the model not to compute or store gradients, saving memory and \n", - " # speeding up prediction\n", - " with torch.no_grad():\n", - " # Forward pass, calculate logit predictions.\n", - " result = model(b_input_ids, \n", - " token_type_ids=None, \n", - " attention_mask=b_input_mask,\n", - " return_dict=True)\n", - "\n", - " logits = result.logits\n", - " \n", - " # Move logits and labels to CPU\n", - " logits = logits.detach().cpu().numpy()\n", - " label_ids = b_labels.to('cpu').numpy()\n", - " \n", - " # Store predictions and true labels\n", - " predictions.append(logits)\n", - " true_labels.append(label_ids)\n", - "\n", - "print(' DONE.')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-5jscIM8R4Gv" - }, - "source": [ - "Accuracy on the CoLA benchmark is measured using the \"[Matthews correlation coefficient](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)\" (MCC).\n", - "\n", - "We use MCC here because the classes are imbalanced:\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "hWcy0X1hirdx", - "outputId": "ef5e6753-c244-406a-8141-5078d71b04ee" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Positive samples: 354 of 516 (68.60%)\n" - ] - } - ], - "source": [ - "print('Positive samples: %d of %d (%.2f%%)' % (df.label.sum(), len(df.label), (df.label.sum() / len(df.label) * 100.0)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cRaZQ4XC7kLs", - "outputId": "d922af70-1216-4cfb-ac37-1dde75744fd5" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Calculating Matthews Corr. Coef. for each batch...\n" - ] - } - ], - "source": [ - "from sklearn.metrics import matthews_corrcoef\n", - "\n", - "matthews_set = []\n", - "\n", - "# Evaluate each test batch using Matthew's correlation coefficient\n", - "print('Calculating Matthews Corr. Coef. for each batch...')\n", - "\n", - "# For each input batch...\n", - "for i in range(len(true_labels)):\n", - " # The predictions for this batch are a 2-column ndarray (one column for \"0\" \n", - " # and one column for \"1\"). Pick the label with the highest value and turn this\n", - " # in to a list of 0s and 1s.\n", - " pred_labels_i = np.argmax(predictions[i], axis=1).flatten()\n", - " \n", - " # Calculate and store the coef for this batch. \n", - " matthews = matthews_corrcoef(true_labels[i], pred_labels_i) \n", - " matthews_set.append(matthews)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IUM0UA1qJaVB" - }, - "source": [ - "The final score will be based on the entire test set, but let's take a look at the scores on the individual batches to get a sense of the variability in the metric between batches. 
\n", - "\n", - "Each batch has 32 sentences in it, except the last batch which has only (516 % 32) = 4 test sentences in it.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 427 - }, - "id": "pyfY1tqxU0t9", - "outputId": "5e477de2-e6a9-466a-9b36-f3651f2996df" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "# Create a barplot showing the MCC score for each batch of test samples.\n", - "ax = sns.barplot(x=list(range(len(matthews_set))), y=matthews_set, ci=None)\n", - "\n", - "plt.title('MCC Score per Batch')\n", - "plt.ylabel('MCC Score (-1 to +1)')\n", - "plt.xlabel('Batch #')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1YrjAPX2V-l4" - }, - "source": [ - "Now we'll combine the results for all of the batches and calculate our final MCC score." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "oCYZa1lQ8Jn8", - "outputId": "b4650298-0e35-4ed8-be13-83f074a617ed" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total MCC: 0.514\n" - ] - } - ], - "source": [ - "# Combine the results across all batches. \n", - "flat_predictions = np.concatenate(predictions, axis=0)\n", - "\n", - "# For each sample, pick the label (0 or 1) with the higher score.\n", - "flat_predictions = np.argmax(flat_predictions, axis=1).flatten()\n", - "\n", - "# Combine the correct labels for each batch into a single list.\n", - "flat_true_labels = np.concatenate(true_labels, axis=0)\n", - "\n", - "# Calculate the MCC\n", - "mcc = matthews_corrcoef(flat_true_labels, flat_predictions)\n", - "\n", - "print('Total MCC: %.3f' % mcc)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jXx0jPc4HUfZ" - }, - "source": [ - "Cool! In about half an hour and without doing any hyperparameter tuning (adjusting the learning rate, epochs, batch size, ADAM properties, etc.) we are able to get a good score. 
\n", - "\n", - "> *Note: To maximize the score, we should remove the \"validation set\" (which we used to help determine how many epochs to train for) and train on the entire training set.*\n", - "\n", - "The library documents the expected accuracy for this benchmark [here](https://huggingface.co/transformers/examples.html#glue) as `49.23`.\n", - "\n", - "You can also look at the official leaderboard [here](https://gluebenchmark.com/leaderboard/submission/zlssuBTm5XRs0aSKbFYGVIVdvbj1/-LhijX9VVmvJcvzKymxy). \n", - "\n", - "Note that (due to the small dataset size?) the accuracy can vary significantly between runs.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GfjYoa6WmkN6" - }, - "source": [ - "# Conclusion" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xlQG7qgkmf4n" - }, - "source": [ - "This notebook demonstrates that with a pre-trained BERT model you can quickly and effectively create a high quality model with minimal effort and training time using the pytorch interface, regardless of the specific NLP task you are interested in." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YUmsUOIv8EUO" - }, - "source": [ - "# Appendix\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "q2079Qyn8Mt8" - }, - "source": [ - "### Saving & Loading Fine-Tuned Model\n", - "\n", - "This first cell (taken from `run_glue.py` [here](https://github.com/huggingface/transformers/blob/35ff345fc9df9e777b27903f11fa213e4052595b/examples/run_glue.py#L495)) writes the model and tokenizer out to disk." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6ulTWaOr8QNY", - "outputId": "1b73b37b-2598-4992-d6d7-0649f410b5c0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving model to ./model_save/\n" - ] - }, - { - "data": { - "text/plain": [ - "('./model_save/tokenizer_config.json',\n", - " './model_save/special_tokens_map.json',\n", - " './model_save/vocab.txt',\n", - " './model_save/added_tokens.json')" - ] - }, - "execution_count": 31, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "\n", - "# Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()\n", - "\n", - "output_dir = './model_save/'\n", - "\n", - "# Create output directory if needed\n", - "if not os.path.exists(output_dir):\n", - " os.makedirs(output_dir)\n", - "\n", - "print(\"Saving model to %s\" % output_dir)\n", - "\n", - "# Save a trained model, configuration and tokenizer using `save_pretrained()`.\n", - "# They can then be reloaded using `from_pretrained()`\n", - "model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training\n", - "model_to_save.save_pretrained(output_dir)\n", - "tokenizer.save_pretrained(output_dir)\n", - "\n", - "# Good practice: save your training arguments together with the trained model\n", - "# torch.save(args, os.path.join(output_dir, 'training_args.bin'))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Z-tjHkR7lc1I" - }, - "source": [ - "Let's check out the file sizes, out of curiosity." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mqMzI3VTCZo5", - "outputId": "96104fe5-67d0-4310-d778-58da5194c2e1" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 428000K\n", - "-rw-r--r-- 1 root root 1K Feb 2 17:10 config.json\n", - "-rw-r--r-- 1 root root 427757K Feb 2 17:10 pytorch_model.bin\n", - "-rw-r--r-- 1 root root 1K Feb 2 17:10 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 1K Feb 2 17:10 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 227K Feb 2 17:10 vocab.txt\n" - ] - } - ], - "source": [ - "!ls -l --block-size=K ./model_save/" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fr_bt2rFlgDn" - }, - "source": [ - "The largest file is the model weights, at around 418 megabytes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "-WUFUIQ8Cu8D", - "outputId": "b0c9b6c6-5fb8-4d61-d28a-be4324be5a5b" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-rw-r--r-- 1 root root 418M Feb 2 17:10 ./model_save/pytorch_model.bin\n" - ] - } - ], - "source": [ - "!ls -l --block-size=M ./model_save/pytorch_model.bin" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dzGKvOFAll_e" - }, - "source": [ - "To save your model across Colab Notebook sessions, download it to your local machine, or ideally copy it to your Google Drive." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Trr-A-POC18_" - }, - "outputs": [], - "source": [ - "# Mount Google Drive to this Notebook instance.\n", - "from google.colab import drive\n", - "drive.mount('/content/drive')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NxlZsafTC-V5" - }, - "outputs": [], - "source": [ - "# Copy the model files to a directory in your Google Drive.\n", - "!cp -r ./model_save/ \"./drive/Shared drives/ChrisMcCormick.AI/Blog Posts/BERT Fine-Tuning/\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W0vstijw85SZ" - }, - "source": [ - "The following functions will load the model back from disk." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nskPzUM084zL" - }, - "outputs": [], - "source": [ - "# Load a trained model and vocabulary that you have fine-tuned\n", - "model = model_class.from_pretrained(output_dir)\n", - "tokenizer = tokenizer_class.from_pretrained(output_dir)\n", - "\n", - "# Copy the model to the GPU.\n", - "model.to(device)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "BERT Fine-Tuning Sentence Classification v4.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "0358864aed7145e5a8b1ae70b33d8011": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - 
"_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "04b9332a3fa9454da1a7169353b99221": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_cf5d78f166d54aadbd8f68976f1e9ccc", - "placeholder": "​", - "style": "IPY_MODEL_ce8f59b53f894031b4ff4e0e1b63a61f", - "value": "Downloading: 100%" - } - }, - "0a2a650a8e68491c85be1d84112645c3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - 
"_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_801f8b39de0c488baf8c92705d3e2905", - "placeholder": "​", - "style": "IPY_MODEL_0a49f12eb51244f2882a979483acd105", - "value": "Downloading: 100%" - } - }, - "0a49f12eb51244f2882a979483acd105": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0d0fa745757f4c93a5e3ce9c2fadc9a3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": 
null, - "visibility": null, - "width": null - } - }, - "176c3ac86a8f4dfa9ff221d67076edc5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "17b79c0e5d194a07bfff70f84ec10858": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "24fd7d7781a448f6b4164bbeb64888b3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "2a70c1b9bda34e6a89ca14b3b663b959": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b59dd15b11a64b8cace572993e8323ba", - "max": 466062, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_f57a3f9b473543bda0c27ebcba001fad", - "value": 466062 - } - }, - "37ea42eb1ecd44eab1ea46db51fe2800": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0d0fa745757f4c93a5e3ce9c2fadc9a3", - "placeholder": "​", - "style": "IPY_MODEL_24fd7d7781a448f6b4164bbeb64888b3", - "value": "Downloading: 100%" - } - }, - "3a5ad7be77ce4127b434b749ad14b00a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3dfad6bb2985460da31cc42d7e88168c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3eb262cbc05a4a48808623fbd11d7649": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "44e2066d588b496f97aa00cf51c9447c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", 
- "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_93d601fafc26454b8de64f626daf3074", - "IPY_MODEL_2a70c1b9bda34e6a89ca14b3b663b959", - "IPY_MODEL_e9afd7edefb44fe4847ab5a9c0b24999" - ], - "layout": "IPY_MODEL_176c3ac86a8f4dfa9ff221d67076edc5" - } - }, - "521bcdb6786b4ba1a361cc8de1c97036": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_04b9332a3fa9454da1a7169353b99221", - "IPY_MODEL_a29e31e998974e758d09daf1a6186610", - "IPY_MODEL_d2f54cdaba8e4a04bdb3b8d279afe5ef" - ], - "layout": "IPY_MODEL_da7f2189149d4dcb8d012ffa2b9e34bb" - } - }, - "54829969375b46d0a07900700a6e21a3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "6494768c95954aeaa092d1b8f494bf65": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_cca16e2b7d7446ccb05a2a09fdb1134b", - "max": 440473133, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_17b79c0e5d194a07bfff70f84ec10858", - "value": 440473133 - } - }, - "747ba5c2117a42b28539371fe04b19ad": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_37ea42eb1ecd44eab1ea46db51fe2800", - "IPY_MODEL_ef3a445c93764273b132d56d8b7545a3", - "IPY_MODEL_ca26fc3ff341416f820e3ea94aa83188" - ], - "layout": "IPY_MODEL_c91181d3293d457a8c6d32aca47e5c27" - } - }, - "7d6e576e3c464b5693c51d35032107cb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "801f8b39de0c488baf8c92705d3e2905": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": 
"LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "86fc378943aa48ee893bef7404abc96a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, 
- "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "879a098d294d444c8176ce6221846d95": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "89c02af3d0f74c18a8e294f78b1f961e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_972be3faee024479ab0667892594d00c", - "placeholder": "​", - "style": "IPY_MODEL_3dfad6bb2985460da31cc42d7e88168c", - "value": " 226k/226k [00:00<00:00, 568kB/s]" - } - }, - "8f66495735614898b617c27c0a89255d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": 
null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9110261cd8b94147b72dec393886b308": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "93853fa079c546b58b1c9ed8e586ae6c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": 
"1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_e4306d918bbb47abae367df2c7ac2e9d", - "IPY_MODEL_ac1cb16ed2ef4e6ab40e790e5e9ce47f", - "IPY_MODEL_89c02af3d0f74c18a8e294f78b1f961e" - ], - "layout": "IPY_MODEL_8f66495735614898b617c27c0a89255d" - } - }, - "93d601fafc26454b8de64f626daf3074": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fb5ff76f955942dda1bb6c98f827b893", - "placeholder": "​", - "style": "IPY_MODEL_7d6e576e3c464b5693c51d35032107cb", - "value": "Downloading: 100%" - } - }, - "962dda4c15404a2c84caf50e208164b9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - 
"margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "972be3faee024479ab0667892594d00c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a29e31e998974e758d09daf1a6186610": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - 
"_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3a5ad7be77ce4127b434b749ad14b00a", - "max": 570, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_c9f13c482ecc4448816bec89e13816ab", - "value": 570 - } - }, - "a868dfc37b55447d8bab7350ca8cd0cf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ab13bda5dc204c3cbb633535494eb453": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - 
"width": null - } - }, - "ac1cb16ed2ef4e6ab40e790e5e9ce47f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_dbb229077bcf437aa25c5a16276e9c48", - "max": 231508, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3eb262cbc05a4a48808623fbd11d7649", - "value": 231508 - } - }, - "b59dd15b11a64b8cace572993e8323ba": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - 
}, - "bf663b176284485e8db6b888bbd73d00": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_0a2a650a8e68491c85be1d84112645c3", - "IPY_MODEL_6494768c95954aeaa092d1b8f494bf65", - "IPY_MODEL_d4bd45ba11a94400a6b2afb62638506c" - ], - "layout": "IPY_MODEL_0358864aed7145e5a8b1ae70b33d8011" - } - }, - "c91181d3293d457a8c6d32aca47e5c27": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c9f13c482ecc4448816bec89e13816ab": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "ca26fc3ff341416f820e3ea94aa83188": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e389ecd683b64c28be15da07866f73c1", - "placeholder": "​", - "style": "IPY_MODEL_879a098d294d444c8176ce6221846d95", - "value": " 28.0/28.0 [00:00<00:00, 741B/s]" - } - }, - "cca16e2b7d7446ccb05a2a09fdb1134b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - 
"left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ce8f59b53f894031b4ff4e0e1b63a61f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "cf5d78f166d54aadbd8f68976f1e9ccc": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - 
"top": null, - "visibility": null, - "width": null - } - }, - "d0970cd03bc74b5da9f913fda325c1e0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d2f54cdaba8e4a04bdb3b8d279afe5ef": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_86fc378943aa48ee893bef7404abc96a", - "placeholder": "​", - "style": "IPY_MODEL_f2dcfd7f0f9a48b08730714d44d18218", - "value": " 570/570 [00:00<00:00, 14.3kB/s]" - } - }, - "d4bd45ba11a94400a6b2afb62638506c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_962dda4c15404a2c84caf50e208164b9", - "placeholder": "​", - "style": "IPY_MODEL_54829969375b46d0a07900700a6e21a3", - "value": " 420M/420M [00:13<00:00, 35.5MB/s]" - } - }, - "da0235bd5a3642eba1b5749afe2c6f61": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": 
"1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "da7f2189149d4dcb8d012ffa2b9e34bb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "dbb229077bcf437aa25c5a16276e9c48": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - 
"_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e389ecd683b64c28be15da07866f73c1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - 
"overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e4306d918bbb47abae367df2c7ac2e9d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ab13bda5dc204c3cbb633535494eb453", - "placeholder": "​", - "style": "IPY_MODEL_a868dfc37b55447d8bab7350ca8cd0cf", - "value": "Downloading: 100%" - } - }, - "e9afd7edefb44fe4847ab5a9c0b24999": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9110261cd8b94147b72dec393886b308", - "placeholder": "​", - "style": "IPY_MODEL_d0970cd03bc74b5da9f913fda325c1e0", - "value": " 455k/455k [00:00<00:00, 867kB/s]" - } - }, - "edeb0997fdb84daba10b4a9a102adee5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": 
null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ef3a445c93764273b132d56d8b7545a3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_edeb0997fdb84daba10b4a9a102adee5", - "max": 28, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_da0235bd5a3642eba1b5749afe2c6f61", - "value": 28 - } - }, - "f2dcfd7f0f9a48b08730714d44d18218": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f57a3f9b473543bda0c27ebcba001fad": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "fb5ff76f955942dda1bb6c98f827b893": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week08_question_answering/README.md b/week08_question_answering/README.md deleted file mode 100644 index f2091ab..0000000 --- a/week08_question_answering/README.md +++ /dev/null @@ -1,2 +0,0 @@ -Question Answering and TTS: -[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week08_question_answering/practice_question_answering_and_tts.ipynb) \ No newline at end of file diff --git a/week08_question_answering/lect08_Question_Answering.pdf b/week08_question_answering/lect08_Question_Answering.pdf deleted file mode 100644 index 33e1740..0000000 Binary files a/week08_question_answering/lect08_Question_Answering.pdf and /dev/null differ diff --git a/week08_question_answering/practice_question_answering_and_tts.ipynb b/week08_question_answering/practice_question_answering_and_tts.ipynb deleted file mode 100644 index 0bb69a6..0000000 --- a/week08_question_answering/practice_question_answering_and_tts.ipynb +++ /dev/null @@ -1,3261 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "DphyQXreodzp" - }, - "source": [ - "# Practice: Question Answering with a Fine-Tuned BERT (and TTS example)\n", - "\n", - "This notebook is based on great [post and corresponding notebook](https://mccormickml.com/2020/03/10/question-answering-with-a-fine-tuned-BERT/) *by Chris McCormick*. It contains some minor changes and additions (especially parts 3 and 4)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6_mAnIPKaXyw" - }, - "source": [ - "What does it mean for BERT to achieve \"human-level performance on Question Answering\"? Is BERT the greatest search engine ever, able to find the answer to any question we pose it?\n", - "\n", - "In **Part 1** of this notebook, we will discuss what it really means to apply BERT to QA, and illustrate the details.\n", - "\n", - "**Part 2** contains example code--we'll be downloading a model that's *already been fine-tuned* for question answering, and try it out on our own text! 
\n", - "\n", - "In **Part 3** we will apply the same approach to Russian language using the model pre-trained on SberQuAD dataset.\n", - "\n", - "And in **Part 4** and **Part 5** we will generate question and answer as audio in english and russian languages.\n", - "\n", - "**Links**\n", - "\n", - "* The [video walkthrough](https://youtu.be/l8ZYCvgGu0o) on this topic. \n", - "* The [original blog post](https://mccormickml.com/2020/03/10/question-answering-with-a-fine-tuned-BERT/) version.\n", - "* The [original Colab Notebook](https://colab.research.google.com/drive/1uSlWtJdZmLrI3FCNIlUHFxwAJiSu2J0-)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "-5uC3kgC7rRK", - "outputId": "97c24371-376c-4f56-a8fa-460d0d398bca" - }, - "outputs": [], - "source": [ - "!pip install -U transformers deeppavlov unidecode omegaconf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "d5YCFVasLbaM" - }, - "outputs": [], - "source": [ - "# This cell is optional and needed only for Russian language inference\n", - "\n", - "\n", - "# !python -m deeppavlov install squad_ru_rubert\n", - "\n", - "# # Pre-downloading the BERT for Russian language. 
Same result can be achieved with\n", - "# # `!python -m deeppavlov download squad_ru_rubert`\n", - "# # But it works significantly slower.\n", - "# !wget -nc https://www.dropbox.com/s/7za1o6vaffbdlcg/rubert_cased_L-12_H-768_A-12_v1.tar.gz\n", - "# !mkdir -p /root/.deeppavlov/downloads/bert_models/\n", - "# !tar -xzvf rubert_cased_L-12_H-768_A-12_v1.tar.gz -C /root/.deeppavlov/downloads/bert_models\n", - "\n", - "# !wget -nc https://www.dropbox.com/s/ns8280pd9t9n9dc/squad_model_ru_rubert.tar.gz\n", - "# !mkdir -p /root/.deeppavlov/models/\n", - "# !tar -xzvf squad_model_ru_rubert.tar.gz -C /root/.deeppavlov/models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OfzJwB_17rRM", - "outputId": "f48ad518-2d66-4912-b720-2d772fb04b4f" - }, - "outputs": [], - "source": [ - "import torch\n", - "\n", - "assert torch.cuda.is_available(), 'Tacotron2 by NVIDIA infers only on GPU, so the Part 4 will not work on CPU-only machine'\n", - "\n", - "device = torch.device('cuda:0')\n", - "tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2', **{'map_location': device})\n", - "tacotron2.to(device)\n", - "tacotron2.eval()\n", - "\n", - "waveglow = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow')\n", - "waveglow = waveglow.remove_weightnorm(waveglow)\n", - "waveglow.to(device)\n", - "waveglow.eval();" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X2bUvKUffHNY" - }, - "source": [ - "## Part 1: Applying BERT to Question Answering" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Su7fixBdiUex" - }, - "source": [ - "### The SQuAD v1.1 Benchmark" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bT5ESKDxfnLf" - }, - "source": [ - "When someone mentions \"Question Answering\" as an application of BERT, what they are really referring to is applying BERT to the Stanford Question Answering 
Dataset (SQuAD).\n", - "\n", - "The task posed by the SQuAD benchmark is a little different than you might think. Given a question, and *a passage of text containing the answer* (often referred to as context), BERT needs to highlight the \"span\" of text corresponding to the correct answer. \n", - "\n", - "The SQuAD homepage has a fantastic tool for exploring the questions and reference text for this dataset, and even shows the predictions made by top-performing models.\n", - "\n", - "For example, here are some [interesting examples](https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/Super_Bowl_50.html?model=r-net+%20(ensemble)%20(Microsoft%20Research%20Asia)&version=1.1) on the topic of Super Bowl 50.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_xN5f1bxf6K_" - }, - "source": [ - "### BERT Input Format" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ctum5SK6f9uP" - }, - "source": [ - "To feed a QA task into BERT, we pack both the question and the reference text into the input.\n", - "\n", - "![Input format for QA](https://raw.githubusercontent.com/neychev/made_nlp_course/master/week10_speech_distillation_and_perspectives/img/input_formatting_image.png)\n", - "*Image credits: [Chris McCormick](https://mccormickml.com/2020/03/10/question-answering-with-a-fine-tuned-BERT/)*\n", - "\n", - "The two pieces of text are separated by the special `[SEP]` token. \n", - "\n", - "> _Side note:_ Original BERT also uses \"Segment Embeddings\" to differentiate the question from the reference text. These are simply two embeddings (for segments \"A\" and \"B\") that BERT learned, and which it adds to the token embeddings before feeding them into the input layer. However, today we will be using the DistilBERT model, which relies solely on the special tokens." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xs31dcrPg5Tg" - }, - "source": [ - "### Start & End Token Classifiers" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lvOdUa9Wg-Uv" - }, - "source": [ - "BERT needs to highlight a \"span\" of text containing the answer--this is represented as simply predicting which token marks the start of the answer, and which token marks the end.\n", - "\n", - "![Start token classification](https://raw.githubusercontent.com/neychev/made_nlp_course/master/week10_speech_distillation_and_perspectives/img/start_token_classification_image.png)\n", - "*Image credits: [Chris McCormick](https://mccormickml.com/2020/03/10/question-answering-with-a-fine-tuned-BERT/)*\n", - "\n", - "For every token in the text, we feed its final embedding into the start token classifier. The start token classifier only has a single set of weights (represented by the blue \"start\" rectangle in the above illustration) which it applies to every word.\n", - "\n", - "After taking the dot product between the output embeddings and the 'start' weights, we apply the softmax activation to produce a probability distribution over all of the words. 
Whichever word has the highest probability of being the start token is the one that we pick.\n", - "\n", - "We repeat this process for the end token--we have a separate weight vector for this.\n", - "\n", - "![End token classification](https://raw.githubusercontent.com/neychev/made_nlp_course/master/week10_speech_distillation_and_perspectives/img/end_token_classification_image.png)\n", - "*Image credits: [Chris McCormick](https://mccormickml.com/2020/03/10/question-answering-with-a-fine-tuned-BERT/)*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "457VPa20fZzY" - }, - "source": [ - "## Part 2: Example Code" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9wpCLgVCkki5" - }, - "source": [ - "In the example code below, we'll be downloading a model that's *already been fine-tuned* for question answering, and try it out on our own text.\n", - "\n", - "If you do want to fine-tune on your own dataset, it is possible to fine-tune BERT for question answering yourself. See [run_squad.py](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py) in the `transformers` library. However, you may find that the \"fine-tuned-on-squad\" model already does a good job, even if your text is from a different domain." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gVq-TuylYRDW" - }, - "source": [ - "### 1. Load Fine-Tuned BERT" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f9nhy3PzGQ44" - }, - "source": [ - "This example uses the `transformers` [library](https://github.com/huggingface/transformers/) by huggingface. 
We've already installed it at the top of this notebook.\n", - "\n", - "For Question Answering we use the `DistilBertForQuestionAnswering` class from the `transformers` library.\n", - "\n", - "This class supports fine-tuning, but for this example we will keep things simpler and load a BERT model that has already been fine-tuned for the SQuAD benchmark.\n", - "\n", - "The `transformers` library has a large collection of pre-trained models which you can reference by name and load easily. The full list is in their documentation [here](https://huggingface.co/transformers/pretrained_models.html)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 231, - "referenced_widgets": [ - "f10f304cd83a424bb1acce2a79cac5a5", - "60050037056f4231ab5c6afa8ae9ec27", - "8d9aded51b004aababcda8e76798a534", - "e4c848c91a394012bbab804ff3e888a8", - "71f49045ee564ca3b485270db1e69792", - "1b981c0d4a0a4bdca32bb12960ce8a7a", - "07f8ddf6b1ae4143862d4e1888b253cb", - "32364789d1ff47fc8c4056a7b48a7634", - "7d1f7a87ed484be999ea793bf4c34dd3", - "a51916d0bd0a43fdba9e46f20a80d6a2", - "eef6db037885440f891824e5a8f5358f", - "1d6782097f9a45758fea597727c42da3", - "301ac3b9e5d04242b05d5943367bf048", - "18a7445e957446ffa372185756f6b64c", - "c6e9545c4c03427591aac4f49c774ff7", - "0ae8437438404098a5a6206b05ecf453", - "7669b34be42a4453b3e3ba0e60faee75", - "e36ef4cbcb4c466283f85f5d523d66cd", - "49107e49101e469abe491d5fb4e4466a", - "d434a8d87ce24bf7865ba75298d95729", - "2165f1cd0739407a92079f643ac74ae3", - "494fb6f653f845b3837db00241bf4798", - "3056ad61695a42ffadd7954a5a7503ed", - "2061f0fcebbd4806a5b7308ea3079a7b", - "18a3cd219c8642d3b6001f4dbed8a1a9", - "e4099cf7a2c1426d844c1febc644001a", - "fe8d0c53f5704e42b7c271fecaa48851", - "4f9a99283eec4c31b8ed463a0a5c4366", - "c30803de30894426958ad02fbc9f0002", - "3ffe97d1a93d46c5ad72625841831419", - "b12f548d82ca4586b61f701cb1abacb4", - "f7819d286ecc424e83c1d6e509ab1486", - 
"ed11dd778e2f49af96e269c4c37c6417", - "17f2ae98a8d1440b8beebc2f8aaff051", - "7ce0c95f422148ec92feee15262845f7", - "0bae2a0bb31b46f39a29c450e8c983c9", - "3d5da11d2072432face40e6f729f644b", - "f8318de1c5c74051adcf59d5f6261903", - "228cd2bb1c5a48088cd48fed5e8ee2e5", - "1c750fddb4f14ac397026785a004921d", - "e99dd095993f4053b5ba2a9c306582b9", - "f7d2834f66ac48719f1fd47dda33d7dc", - "3f4e292341444c128a6941020bf78ea7", - "0695270fa12047a69a2dcef569155b25", - "85e24abbc05d4937ac86793c9c36e1f5", - "3ec7e9e71b5e4feea47fe12b5f751d57", - "8ca5505516334e50b305f559633213fd", - "96eb7cdf8e9b48bfb3a619c3a6b4e3e0", - "c97f4996f8d14b9c94cfefaf1c7adfea", - "59df97cee8854be19b28d63f714816be", - "28e9a4b59d8940228874918132a4379a", - "26506f018a824fd2981562f17ada25e9", - "0e8242194958409abe8c5830767ae7b8", - "68976e9d2ac1420cb548307c96c5a8a6", - "1aaa4d2569574c89a646d202738a829e" - ] - }, - "id": "apS1yS6CdRyX", - "outputId": "077a5f84-dcf5-45ec-973c-944f48a4d101" - }, - "outputs": [], - "source": [ - "from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering\n", - "\n", - "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')\n", - "model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8imoOxoqGZ0h" - }, - "source": [ - "> _Side note:_ Apparently the vocabulary of this model is identical to the one in bert-base-uncased. You can load the tokenizer from `bert-base-uncased` and that works just as well." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "I__1ubvcZYow" - }, - "source": [ - "### 2. Ask a Question" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "o8MQ7b-GJIcM" - }, - "source": [ - "Now we're ready to feed in an example!\n", - "\n", - "A QA example consists of a question and a passage of text containing the answer to that question." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kWzZP4EN-Zxg" - }, - "outputs": [], - "source": [ - "question = \"How many parameters does BERT-large have?\"\n", - "context = (\n", - " \"BERT-large is really big... it has 24-layers and an embedding size of 1,024, \"\n", - " \"for a total of 340M parameters! Altogether it is 1.34GB, so expect it to \"\n", - " \"take a couple minutes to download to your Colab instance.\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "llLvxhScKLZn" - }, - "source": [ - "We'll need to run the BERT tokenizer against both the `question` and the `context`. To feed these into BERT, we actually concatenate them together and place the special `[SEP]` token in between.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tYoX33CfKGsr", - "outputId": "033d60bf-6741-441b-d3ef-bfab2050c046" - }, - "outputs": [], - "source": [ - "# Apply the tokenizer to the input text, treating them as a text-pair.\n", - "input_ids = tokenizer.encode(question, context)\n", - "print(f'The input has a total of {len(input_ids)} tokens.')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pNRVuaKSNFG8" - }, - "source": [ - "Just to see exactly what the tokenizer is doing, let's print out the tokens with their IDs." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Iow838yPNDTv", - "outputId": "9d3bfc3a-2386-4d1a-ad49-b35f4d959c98" - }, - "outputs": [], - "source": [ - "# BERT only needs the token IDs, but for the purpose of inspecting the \n", - "# tokenizer's behavior, let's also get the token strings and display them.\n", - "tokens = tokenizer.convert_ids_to_tokens(input_ids)\n", - "\n", - "# Display tokens and ids as table.\n", - "# For each token and its id...\n", - "for token, token_id in zip(tokens, input_ids):\n", - " \n", - " # If this is the [SEP] token, add some space around it to make it stand out.\n", - " if token_id == tokenizer.sep_token_id:\n", - " print()\n", - " \n", - " # Print the token string and its ID in two columns.\n", - " print('{:<12} {:>6,}'.format(token, token_id))\n", - "\n", - " if token_id == tokenizer.sep_token_id:\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CNwhEw0kQPBN" - }, - "source": [ - "We're ready to feed our example into the model!\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HK0obn5x-1EI", - "outputId": "89898262-bfe5-4fac-ea23-f211f3703faf" - }, - "outputs": [], - "source": [ - "import torch\n", - "\n", - "inputs = tokenizer(question, context, return_tensors='pt')\n", - "with torch.no_grad():\n", - " outputs = model(**inputs)\n", - "\n", - "start_scores = outputs.start_logits\n", - "end_scores = outputs.end_logits\n", - "start_scores" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a30sBTcqQv6X" - }, - "source": [ - ">*Side Note: Where's the padding?*\n", - ">\n", - "> The original [example code](https://huggingface.co/transformers/model_doc/bert.html?highlight=bertforquestionanswering#transformers.BertForQuestionAnswering) does not perform any padding. 
I suspect that this is because we are only feeding in a *single example*. If we instead fed in a batch of examples, then we would need to pad or truncate all of the samples in the batch to a single length, and supply an attention mask to tell BERT to ignore the padding tokens. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mBdS_QkIbDzh" - }, - "source": [ - "Now we can highlight the answer just by looking at the most probable start and end words. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LeUQ44hAJmn9", - "outputId": "c9afd2de-5a24-480d-cb76-fa3726b549cd" - }, - "outputs": [], - "source": [ - "# Find the tokens with the highest `start` and `end` scores.\n", - "answer_start = torch.argmax(start_scores)\n", - "answer_end = torch.argmax(end_scores)\n", - "\n", - "# Combine the tokens in the answer and print it out.\n", - "answer = ' '.join(tokens[answer_start : answer_end + 1])\n", - "\n", - "print(f'Answer: \"{answer}\"')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "twMUWmr2brRw" - }, - "source": [ - "It got it right! Awesome :)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cERYCGKMbOXX" - }, - "source": [ - "> *Side Note: It's a little naive to pick the highest scores for start and end--what if it predicts an end word that's before the start word?! The correct implementation is to pick the highest total score for which end >= start.*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N6j2znkwXYsn" - }, - "source": [ - "With a little more effort, we can reconstruct any words that got broken down into subwords." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pBrAsWMJrw7i", - "outputId": "dd9159ea-7377-4a5c-c499-e9815f8c677f" - }, - "outputs": [], - "source": [ - "answer = tokenizer.convert_tokens_to_string(tokens[answer_start : answer_end + 1])\n", - "print(f'Answer: \"{answer}\"')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-hh6nkIdXq-O" - }, - "source": [ - "### 3. Visualizing Scores" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-hG2YCHYXtg-" - }, - "source": [ - "Let's see what the scores were for all of the words. The following cells generate bar plots showing the start and end scores for every word in the input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gkKFa73eJkPE" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "# Use plot styling from seaborn.\n", - "sns.set(style='darkgrid')\n", - "\n", - "# Increase the plot size and font size.\n", - "plt.rcParams['figure.figsize'] = (16, 8)\n", - "plt.rcParams['font.size'] = 16" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W7kazkb2iEuQ" - }, - "source": [ - "Retrieve all of the start and end scores, and use all of the tokens as x-axis labels." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "C56AtMg2UBxN" - }, - "outputs": [], - "source": [ - "# Pull the scores out of PyTorch Tensors and convert them to 1D numpy arrays.\n", - "start_scores = start_scores.numpy().flatten()\n", - "end_scores = end_scores.numpy().flatten()\n", - "\n", - "# We'll use the tokens as the x-axis labels. 
In order to do that, they all need\n", - "# to be unique, so we'll add the token index to the end of each one.\n", - "token_labels = []\n", - "for (i, token) in enumerate(tokens):\n", - " token_labels.append('{:} - {:>2}'.format(token, i))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NIaW7RyTiLeu" - }, - "source": [ - "Create a bar plot showing the score for every input word being the \"start\" word." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 581 - }, - "id": "y6OAV1dL3-UB", - "outputId": "4b56edcf-a145-4212-ac1b-0cb5c6b187bf" - }, - "outputs": [], - "source": [ - "# Create a barplot showing the start word score for all of the tokens.\n", - "ax = sns.barplot(x=token_labels, y=start_scores, ci=None)\n", - "\n", - "# Turn the xlabels vertical.\n", - "ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha=\"center\")\n", - "\n", - "# Turn on the vertical grid to help align words to scores.\n", - "ax.grid(True)\n", - "\n", - "plt.title('Start Word Scores');" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zIwrF7y6iS1l" - }, - "source": [ - "Create a second bar plot showing the score for every input word being the \"end\" word." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 581 - }, - "id": "6tXEqIp-Tzou", - "outputId": "a2aff3b3-8767-4870-bb78-d32fae66ce29" - }, - "outputs": [], - "source": [ - "# Create a barplot showing the end word score for all of the tokens.\n", - "ax = sns.barplot(x=token_labels, y=end_scores, ci=None)\n", - "\n", - "# Turn the xlabels vertical.\n", - "ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha=\"center\")\n", - "\n", - "# Turn on the vertical grid to help align words to scores.\n", - "ax.grid(True)\n", - "\n", - "plt.title('End Word Scores');" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "awgi7Z_a9KSq" - }, - "source": [ - "**Alternate View**\n", - "\n", - "I also tried visualizing both the start and end scores on a single bar plot, but I think it may actually be more confusing than seeing them separately. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "m4VUk6R05uXS" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "# Store the tokens and scores in a DataFrame. \n", - "# Each token will have two rows, one for its start score and one for its end\n", - "# score. The \"marker\" column will differentiate them. 
A little wacky, I know.\n", - "scores = []\n", - "for (i, token_label) in enumerate(token_labels):\n", - "\n", - " # Add the token's start score as one row.\n", - " scores.append({'token_label': token_label, \n", - " 'score': start_scores[i],\n", - " 'marker': 'start'})\n", - " \n", - " # Add the token's end score as another row.\n", - " scores.append({'token_label': token_label, \n", - " 'score': end_scores[i],\n", - " 'marker': 'end'})\n", - " \n", - "df = pd.DataFrame(scores)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 508 - }, - "id": "07xyo-I97Ntt", - "outputId": "0cd0ac4d-d9b2-4d0b-86b2-0ee77241e2bc" - }, - "outputs": [], - "source": [ - "# Draw a grouped barplot to show start and end scores for each word.\n", - "# The \"hue\" parameter is where we tell it which datapoints belong to which\n", - "# of the two series.\n", - "plot = sns.catplot(\n", - " x=\"token_label\", y=\"score\", hue=\"marker\",\n", - " data=df, kind=\"bar\", height=6, aspect=4\n", - ")\n", - "\n", - "# Turn the xlabels vertical.\n", - "plot.set_xticklabels(plot.ax.get_xticklabels(), rotation=90, ha=\"center\")\n", - "\n", - "# Turn on the vertical grid to help align words to scores.\n", - "plot.ax.grid(True);" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8UyBYNmeegGf" - }, - "source": [ - "### 4. More Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MWtcRpPef-Ce" - }, - "source": [ - "Turn the QA process into a function so we can easily try out other examples." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rH8NbBlsfxZ_" - }, - "outputs": [], - "source": [ - "def answer_question(question, context):\n", - " # ======== Tokenize ========\n", - " # Apply the tokenizer to the input text, treating them as a text-pair.\n", - " inputs = tokenizer(question, context, return_tensors='pt')\n", - " input_ids = inputs.input_ids.numpy().flatten()\n", - "\n", - " # ======== Evaluate ========\n", - " # Run our example question through the model.\n", - " outputs = model(**inputs)\n", - " start_scores = outputs.start_logits\n", - " end_scores = outputs.end_logits\n", - "\n", - " # ======== Reconstruct Answer ========\n", - " # Find the tokens with the highest `start` and `end` scores.\n", - " answer_start = torch.argmax(start_scores)\n", - " answer_end = torch.argmax(end_scores)\n", - "\n", - " # Get the string versions of the input tokens.\n", - " token_ids = input_ids[answer_start : answer_end + 1]\n", - " tokens = tokenizer.convert_ids_to_tokens(token_ids)\n", - " answer = tokenizer.convert_tokens_to_string(tokens)\n", - "\n", - " return answer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DVlKTK-njWrX" - }, - "source": [ - "As our reference text, we've taken the Abstract of the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mqfBGgc8AKk2" - }, - "outputs": [], - "source": [ - "bert_abstract = (\n", - " 'We introduce a new language representation model called BERT, which stands for '\n", - " 'Bidirectional Encoder Representations from Transformers. Unlike recent language '\n", - " 'representation models (Peters et al., 2018a; Radford et al., 2018), BERT is '\n", - " 'designed to pretrain deep bidirectional representations from unlabeled text by '\n", - " 'jointly conditioning on both left and right context in all layers. 
As a result, '\n", - " 'the pre-trained BERT model can be finetuned with just one additional output '\n", - " 'layer to create state-of-the-art models for a wide range of tasks, such as '\n", - " 'question answering and language inference, without substantial taskspecific '\n", - " 'architecture modifications. BERT is conceptually simple and empirically '\n", - " 'powerful. It obtains new state-of-the-art results on eleven natural language '\n", - " 'processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute '\n", - " 'improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 '\n", - " 'question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD '\n", - " 'v2.0 Test F1 to 83.1 (5.1 point absolute improvement).'\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ay_mwbBJAP87" - }, - "source": [ - "Let's ask BERT what its name stands for (the answer is in the first sentence of the abstract)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "y4VPq6FdjxyX", - "outputId": "feaefc87-1e67-4ce9-f24f-b5c732e0c989" - }, - "outputs": [], - "source": [ - "question = \"What does the 'B' in BERT stand for?\"\n", - "answer = answer_question(question, bert_abstract)\n", - "print(f'Answer: \"{answer}\"')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B6HcijzxkTO9" - }, - "source": [ - "Let's ask BERT about example applications of itself :)\n", - "\n", - "The answer to the question comes from this passage from the abstract: \n", - "\n", - "> \"...BERT model can be finetuned with just one additional output\n", - "layer to create state-of-the-art models for **a wide range of tasks, such as\n", - "question answering and language inference,** without substantial taskspecific\n", - "architecture modifications.\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": 
{ - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "MVNVGN5-gI06", - "outputId": "8ab4d754-6d14-4515-8772-bf08553ea7c9" - }, - "outputs": [], - "source": [ - "question = \"What are some example applications of BERT?\"\n", - "answer = answer_question(question, bert_abstract)\n", - "print(f'Answer: \"{answer}\"')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WXAJ2wkV7rRl" - }, - "source": [ - "## [Optional] Part 3. RuBERT for question answering." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TcnPsGzEbGL6" - }, - "source": [ - "Here we will use the model pre-trained on the SberQuAD dataset from the [SDSJ-2017 challenge problem B](https://github.com/sberbank-ai/data-science-journey-2017/tree/master/problem_B)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3JslS5CG7rRl", - "outputId": "e1cca72a-e1ce-423a-a2f9-ed51b7ce4340" - }, - "outputs": [], - "source": [ - "from deeppavlov import build_model, configs\n", - "\n", - "model_ru = build_model(configs.squad.squad_ru_rubert, download=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GHVayqJ37rRl" - }, - "source": [ - "The following text is copied from [habr post on Crew Dragon flight](https://habr.com/ru/news/t/504642/)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "l5pDyTRL7rRl" - }, - "outputs": [], - "source": [ - "context = (\n", - " 'Первая многоразовая ступень ракеты-носителя Falcon 9 успешно отделилась через две с половиной '\n", - " 'минуты после старта и автоматически приземлилась на плавучую платформу Of Course I Still '\n", - " 'Love You у берегов Флориды. 
Через 12 минут после запуска космический корабль Crew Dragon '\n", - " 'вышел на расчетную орбиту и отделился от второй ступени ракеты.'\n", - " '\\n\\n'\n", - " 'Сближение корабля Crew Dragon с Международной космической станцией запланировано на 31 мая. '\n", - " 'К стыковочному адаптеру на узловом модуле «Гармония» американского сегмента МКС Crew Dragon '\n", - " 'должен причалить в ручном или, при необходимости, в автоматическом режиме. Эта процедура '\n", - " 'запланирована на 10:29 по времени Восточного побережья США (17:29 по московскому времени).'\n", - " '\\n\\n'\n", - " 'В испытательном полете DM2 астронавт Херли является командиром космического корабля (spacecraft '\n", - " 'commander), а его напарник Бенкен — командир по операциям стыковки и расстыковки (joint '\n", - " 'operations commander). Фактически это означает, что именно Херли управляет Crew Dragon в '\n", - " 'полете к МКС, к которой они должны пристыковаться в течение суток после старта. Херли и Бенкен '\n", - " 'также будут выполнять необходимые для сертификации НАСА проверки систем корабля в полете.'\n", - " '\\n\\n'\n", - " 'Во время полета Херли и Бенкен провели небольшую экскурсию по Crew Dragon.'\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5tVX9PJ_GPE-" - }, - "source": [ - "And here is how to use deeppavlov's model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "05BDo1IjGFPG", - "outputId": "fba7a5d0-d835-47b3-beb1-19f9cadcaf1e" - }, - "outputs": [], - "source": [ - "question = 'Когда отделилась первая ступень?'\n", - "model_ru([context], [question])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yyRAYAc_GxAL" - }, - "source": [ - "The model returns a list with the answer, the answer's starting position in the context and the answer logit.\n", - "\n", - "This yields the following `answer_question_ru` function."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UXi5hm_AEFB4" - }, - "outputs": [], - "source": [ - "def answer_question_ru(question, context):\n", - " output = model_ru([context], [question])\n", - " return output[0][0]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0wfCi3FvHBuL" - }, - "source": [ - "Let's ask a bunch of other questions to the model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "98PIvy4g7rRm", - "outputId": "6af3638d-bf18-4a48-9832-947b37cc336c" - }, - "outputs": [], - "source": [ - "question = 'На какую дату запланирована стыковка?'\n", - "answer = answer_question_ru(question, context)\n", - "print(f'Ответ: \"{answer}\"')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "K1BK3PAm7rRn", - "outputId": "90665a68-76d3-46e0-9622-34617b142b19" - }, - "outputs": [], - "source": [ - "question = 'Кто участвует в полете?'\n", - "answer = answer_question_ru(question, context)\n", - "print(f'Ответ: \"{answer}\"')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Ugo2Wyd57rRn", - "outputId": "da924f61-81fa-446c-f1aa-0972289703d2" - }, - "outputs": [], - "source": [ - "question = 'Кто участвует в полете кроме астронавта Херли?'\n", - "answer = answer_question_ru(question, context)\n", - "print(f'Ответ: \"{answer}\"')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5B4vytlCYTvs", - "outputId": "2fdf869c-dcdd-405c-9542-f3d259191ddc" - }, - "outputs": [], - "source": [ - "question = 'Какие астронавты участвовали в полете?'\n", - "answer = answer_question_ru(question, context)\n", - "\n", - "# Notice how model finds 
the appropriate answer despite slightly different context.\n", - "print(f'Ответ: \"{answer}\"')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "I-AwQsIU7rRn", - "outputId": "d72a2aed-32ad-4b70-a146-ec8febc7f7d1" - }, - "outputs": [], - "source": [ - "question = 'Какая ступень приземлилась на плавучую платформу Of Course I Still Love You?'\n", - "answer = answer_question_ru(question, context)\n", - "print(f'Ответ: \"{answer}\"')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZU4ZxB6o7rRp" - }, - "source": [ - "## Part 4. Question answering with speech using Tacotron 2." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NVstcIVxadEr" - }, - "source": [ - "### Text to speech using Tacotron 2." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aoj8MMIJbTji" - }, - "source": [ - "Tacotron 2 is a network proposed in 2017 in [Natural TTS Synthesis By Conditioning\n", - "Wavenet On Mel Spectrogram Predictions](https://arxiv.org/pdf/1712.05884.pdf) paper. This network takes an input text and maps it into the mel-frequency spectrogram. This spectrogram is then passed through a modified WaveNet (generative model for audio, original paper can be found [here](https://arxiv.org/pdf/1609.03499.pdf)) to generate the actual speech." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "btil2I1zejk3" - }, - "source": [ - "Let's look more closely at a mel spectrogram (for more info on its nature please refer to the [Tacotron 2 paper](https://arxiv.org/pdf/1712.05884.pdf))."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 298 - }, - "id": "xDAPjR_Lx-uZ", - "outputId": "a12ee1da-0bbe-4914-e572-fb0c5986e0a7" - }, - "outputs": [], - "source": [ - "assert tacotron2 is not None and waveglow is not None, 'Tacotron2 by NVIDIA infers only on GPU, so the Part 4 will not work on CPU-only machine'\n", - "utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')\n", - "\n", - "text = 'Some test text.'\n", - "sequences, lengths = utils.prepare_input_sequence([text])\n", - "with torch.no_grad():\n", - " mel, _, _ = tacotron2.infer(sequences, lengths)\n", - "\n", - "sns.reset_orig()\n", - "plt.imshow(mel[0].cpu().numpy())\n", - "plt.title('mel-frequency spectrogram');" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9odRMQS3fISF" - }, - "source": [ - "After obtaining this spectrogram, we can generate the audio with the `waveglow` model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - }, - "id": "BdFfCjmsUUxQ", - "outputId": "8d94bfcd-4157-4416-e6cc-b93a40edaea0" - }, - "outputs": [], - "source": [ - "from IPython.display import Audio\n", - "\n", - "sampling_rate = 22050\n", - "\n", - "with torch.no_grad():\n", - " audio = waveglow.infer(mel)\n", - "\n", - "audio_numpy = audio[0].cpu().numpy()\n", - "Audio(audio_numpy, rate=sampling_rate)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W0mnDWpdhpdi" - }, - "source": [ - "We've generated a raw audio waveform. We can save it as a `.wav` file using `scipy.io.wavfile.write`."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rTMNu9Krh2cW" - }, - "outputs": [], - "source": [ - "from scipy.io.wavfile import write\n", - "\n", - "write('audio.wav', sampling_rate, audio_numpy)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0pjEZy4TfT3w" - }, - "source": [ - "This yields the following `text_to_speech` function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YEvDynkPVscj" - }, - "outputs": [], - "source": [ - "def text_to_speech(text):\n", - " # preprocessing\n", - " sequences, lengths = utils.prepare_input_sequence([text])\n", - "\n", - " # run the models\n", - " with torch.no_grad():\n", - " mel, _, _ = tacotron2.infer(sequences, lengths)\n", - " audio = waveglow.infer(mel)\n", - "\n", - " audio_numpy = audio[0].cpu().numpy()\n", - " return audio_numpy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - }, - "id": "RwqwZoxsWNmq", - "outputId": "69f79ad9-2300-4a8c-bf4c-a4b952128f3c" - }, - "outputs": [], - "source": [ - "text = 'Another test text.'\n", - "audio_numpy = text_to_speech(text)\n", - "Audio(audio_numpy, rate=sampling_rate)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gYittYwlfZfU" - }, - "source": [ - "### Tying text to speech with question answering." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "83WLbVnv7rRq" - }, - "source": [ - "Let's take a look at [Mail.ru group blog post on Computer Vision on habr.com](https://habr.com/ru/company/mailru/blog/467905/)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "m2eMzsMY7rRq" - }, - "outputs": [], - "source": [ - "context = (\n", - " 'One of Mail.ru Cloud’s objectives is to provide the handiest means for accessing '\n", - " 'and searching your own photo and video archives. 
For this purpose, we at Mail.ru '\n", - " 'Computer Vision Team have created and implemented systems for smart image '\n", - " 'processing: search by object, by scene, by face, etc. Another spectacular '\n", - " 'technology is landmark recognition. Today, I am going to tell you how we made '\n", - " 'this a reality using Deep Learning.'\n", - " '\\n\\n'\n", - " 'Imagine the situation: you return from your vacation with a load of photos. Talking '\n", - " 'to your friends, you are asked to show a picture of a place worth seeing, like '\n", - " 'palace, castle, pyramid, temple, lake, waterfall, mountain, and so on. You rush to '\n", - " 'scroll your gallery folder trying to find one that is really good. Most likely, it '\n", - " 'is lost amongst hundreds of images, and you say you will show it later.'\n", - " '\\n\\n'\n", - " 'We solve this problem by grouping user photos in albums. This will let you find '\n", - " 'pictures you need just in few clicks. Now we have albums compiled by face, by '\n", - " 'object and by scene, and also by landmark.'\n", - " '\\n\\n'\n", - " 'Photos with landmarks are essential because they often capture highlights of our '\n", - " 'lives (journeys, for example). These can be pictures with some architecture or '\n", - " 'wilderness in the background. This is why we seek to locate such images and make '\n", - " 'them readily available to users.'\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4tkwZk-B7rRq", - "outputId": "31b6de57-91a3-4bed-8b64-00cfa9ad0e14" - }, - "outputs": [], - "source": [ - "question = 'Why photos with landmarks are essential?'\n", - "answer = answer_question(question, context)\n", - "print(f'Answer: \"{answer}\"')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "chBm8WdIh_Bc" - }, - "source": [ - "Let's concatenate the question and the answer into one phrase and convert it to audio!"
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - }, - "id": "j9NhT0Np7rRr", - "outputId": "c09ed603-a4cc-423c-b8b9-f8f4def9b42f" - }, - "outputs": [], - "source": [ - "text = f'{question}\\n{answer}'\n", - "audio_numpy = text_to_speech(text)\n", - "Audio(audio_numpy, rate=sampling_rate)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cFDgT4OuijIp" - }, - "source": [ - "And another one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 69 - }, - "id": "8iqcYLwgy1nT", - "outputId": "03d7ca7d-cf6d-4b5d-b88a-b810f874f117" - }, - "outputs": [], - "source": [ - "question = \"Which places except mountain are worth seeing?\"\n", - "answer = answer_question(question, context)\n", - "print(f'Answer: \"{answer}\"')\n", - "\n", - "text = f'{question}\\n{answer}'\n", - "audio_numpy = text_to_speech(text)\n", - "Audio(audio_numpy, rate=sampling_rate)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NIUIia_G7rRs" - }, - "outputs": [], - "source": [ - "# Take your time, experiment with questions and the generated audio" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vv5jEpkw7rRs" - }, - "source": [ - "## [Optional] Part 5. Russian language speech generation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_0uhvcBhj2k6" - }, - "source": [ - "Of course, text to speech is not specific to the English language. Here is how you can do it with Russian."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "TD79JX0g7rRs" - }, - "outputs": [], - "source": [ - "from omegaconf import OmegaConf\n", - "\n", - "torch.hub.download_url_to_file(\n", - " 'https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml',\n", - " 'latest_silero_models.yml',\n", - " progress=False\n", - ")\n", - "models = OmegaConf.load('latest_silero_models.yml')\n", - "\n", - "# see latest available models\n", - "available_languages = list(models['tts_models'].keys())\n", - "print(f'Available languages {available_languages}')\n", - "\n", - "for lang in available_languages:\n", - " speakers = list(models['tts_models'][lang].keys())\n", - " print(f'Available speakers for {lang}: {speakers}')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZVaR1xG8k94K" - }, - "source": [ - "Let's choose our language and speaker and try using them!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zKCT72on7rRt" - }, - "outputs": [], - "source": [ - "language = 'ru'\n", - "speaker = 'kseniya_16khz'\n", - "device = torch.device('cpu')\n", - "model, symbols, sample_rate, example_text, apply_tts = torch.hub.load(\n", - " 'snakers4/silero-models', 'silero_tts',\n", - " language=language, speaker=speaker\n", - ")\n", - "model = model.to(device)\n", - "\n", - "\n", - "audio = apply_tts(\n", - " texts=[example_text],\n", - " model=model,\n", - " sample_rate=sample_rate,\n", - " symbols=symbols,\n", - " device=device\n", - ")\n", - "\n", - "print(example_text)\n", - "Audio(audio[0], rate=sample_rate)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "U0O3eCX87rRt" - }, - "outputs": [], - "source": [ - "audio = apply_tts(\n", - " texts=[\"Дерзайте знать! 
Спасибо за внимание!\"],\n", - " model=model,\n", - " sample_rate=sample_rate,\n", - " symbols=symbols,\n", - " device=device\n", - ")\n", - "Audio(audio[0], rate=sample_rate)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "N2_1sCYvALxG" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "practice_question_answering_and_tts.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "0695270fa12047a69a2dcef569155b25": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - 
"overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "07f8ddf6b1ae4143862d4e1888b253cb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0ae8437438404098a5a6206b05ecf453": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_494fb6f653f845b3837db00241bf4798", - "placeholder": "​", - "style": 
"IPY_MODEL_2165f1cd0739407a92079f643ac74ae3", - "value": " 28.0/28.0 [00:00<00:00, 487B/s]" - } - }, - "0bae2a0bb31b46f39a29c450e8c983c9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1c750fddb4f14ac397026785a004921d", - "placeholder": "​", - "style": "IPY_MODEL_228cd2bb1c5a48088cd48fed5e8ee2e5", - "value": "Downloading: 100%" - } - }, - "0e8242194958409abe8c5830767ae7b8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - 
} - }, - "17f2ae98a8d1440b8beebc2f8aaff051": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_0bae2a0bb31b46f39a29c450e8c983c9", - "IPY_MODEL_3d5da11d2072432face40e6f729f644b", - "IPY_MODEL_f8318de1c5c74051adcf59d5f6261903" - ], - "layout": "IPY_MODEL_7ce0c95f422148ec92feee15262845f7" - } - }, - "18a3cd219c8642d3b6001f4dbed8a1a9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c30803de30894426958ad02fbc9f0002", - "placeholder": "​", - "style": "IPY_MODEL_4f9a99283eec4c31b8ed463a0a5c4366", - "value": "Downloading: 100%" - } - }, - "18a7445e957446ffa372185756f6b64c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e36ef4cbcb4c466283f85f5d523d66cd", - "placeholder": "​", - "style": "IPY_MODEL_7669b34be42a4453b3e3ba0e60faee75", - "value": "Downloading: 
100%" - } - }, - "1aaa4d2569574c89a646d202738a829e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1b981c0d4a0a4bdca32bb12960ce8a7a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "1c750fddb4f14ac397026785a004921d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", 
- "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1d6782097f9a45758fea597727c42da3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_18a7445e957446ffa372185756f6b64c", - "IPY_MODEL_c6e9545c4c03427591aac4f49c774ff7", - "IPY_MODEL_0ae8437438404098a5a6206b05ecf453" - ], - "layout": "IPY_MODEL_301ac3b9e5d04242b05d5943367bf048" - } - }, - "2061f0fcebbd4806a5b7308ea3079a7b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2165f1cd0739407a92079f643ac74ae3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "228cd2bb1c5a48088cd48fed5e8ee2e5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "26506f018a824fd2981562f17ada25e9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - 
"state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "28e9a4b59d8940228874918132a4379a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "301ac3b9e5d04242b05d5943367bf048": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - 
"align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3056ad61695a42ffadd7954a5a7503ed": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_18a3cd219c8642d3b6001f4dbed8a1a9", - "IPY_MODEL_e4099cf7a2c1426d844c1febc644001a", - "IPY_MODEL_fe8d0c53f5704e42b7c271fecaa48851" - ], - "layout": "IPY_MODEL_2061f0fcebbd4806a5b7308ea3079a7b" - } - }, - "32364789d1ff47fc8c4056a7b48a7634": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - 
"3d5da11d2072432face40e6f729f644b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f7d2834f66ac48719f1fd47dda33d7dc", - "max": 451, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e99dd095993f4053b5ba2a9c306582b9", - "value": 451 - } - }, - "3ec7e9e71b5e4feea47fe12b5f751d57": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - 
"3f4e292341444c128a6941020bf78ea7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3ffe97d1a93d46c5ad72625841831419": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "49107e49101e469abe491d5fb4e4466a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "494fb6f653f845b3837db00241bf4798": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - 
"grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4f9a99283eec4c31b8ed463a0a5c4366": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "59df97cee8854be19b28d63f714816be": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "60050037056f4231ab5c6afa8ae9ec27": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": 
"LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "68976e9d2ac1420cb548307c96c5a8a6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "71f49045ee564ca3b485270db1e69792": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_eef6db037885440f891824e5a8f5358f", - "placeholder": "​", - "style": "IPY_MODEL_a51916d0bd0a43fdba9e46f20a80d6a2", - "value": " 226k/226k [00:00<00:00, 477kB/s]" - } - }, - 
"7669b34be42a4453b3e3ba0e60faee75": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7ce0c95f422148ec92feee15262845f7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7d1f7a87ed484be999ea793bf4c34dd3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": 
null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "85e24abbc05d4937ac86793c9c36e1f5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_8ca5505516334e50b305f559633213fd", - "IPY_MODEL_96eb7cdf8e9b48bfb3a619c3a6b4e3e0", - "IPY_MODEL_c97f4996f8d14b9c94cfefaf1c7adfea" - ], - "layout": "IPY_MODEL_3ec7e9e71b5e4feea47fe12b5f751d57" - } - }, - "8ca5505516334e50b305f559633213fd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_28e9a4b59d8940228874918132a4379a", - "placeholder": "​", - "style": "IPY_MODEL_59df97cee8854be19b28d63f714816be", - "value": "Downloading: 100%" - } - }, - "8d9aded51b004aababcda8e76798a534": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_07f8ddf6b1ae4143862d4e1888b253cb", - "placeholder": "​", - "style": "IPY_MODEL_1b981c0d4a0a4bdca32bb12960ce8a7a", - "value": "Downloading: 100%" - } - }, - "96eb7cdf8e9b48bfb3a619c3a6b4e3e0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0e8242194958409abe8c5830767ae7b8", - "max": 265481570, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_26506f018a824fd2981562f17ada25e9", - "value": 265481570 - } - }, - "a51916d0bd0a43fdba9e46f20a80d6a2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b12f548d82ca4586b61f701cb1abacb4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c30803de30894426958ad02fbc9f0002": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - 
"flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c6e9545c4c03427591aac4f49c774ff7": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d434a8d87ce24bf7865ba75298d95729", - "max": 28, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_49107e49101e469abe491d5fb4e4466a", - "value": 28 - } - }, - "c97f4996f8d14b9c94cfefaf1c7adfea": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1aaa4d2569574c89a646d202738a829e", - "placeholder": "​", - "style": 
"IPY_MODEL_68976e9d2ac1420cb548307c96c5a8a6", - "value": " 253M/253M [00:16<00:00, 9.50MB/s]" - } - }, - "d434a8d87ce24bf7865ba75298d95729": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e36ef4cbcb4c466283f85f5d523d66cd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - 
"grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e4099cf7a2c1426d844c1febc644001a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b12f548d82ca4586b61f701cb1abacb4", - "max": 466062, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3ffe97d1a93d46c5ad72625841831419", - "value": 466062 - } - }, - "e4c848c91a394012bbab804ff3e888a8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7d1f7a87ed484be999ea793bf4c34dd3", - "max": 231508, - "min": 0, - "orientation": "horizontal", - "style": 
"IPY_MODEL_32364789d1ff47fc8c4056a7b48a7634", - "value": 231508 - } - }, - "e99dd095993f4053b5ba2a9c306582b9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "ed11dd778e2f49af96e269c4c37c6417": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "eef6db037885440f891824e5a8f5358f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": 
"@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f10f304cd83a424bb1acce2a79cac5a5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_8d9aded51b004aababcda8e76798a534", - "IPY_MODEL_e4c848c91a394012bbab804ff3e888a8", - "IPY_MODEL_71f49045ee564ca3b485270db1e69792" - ], - "layout": "IPY_MODEL_60050037056f4231ab5c6afa8ae9ec27" - } - }, - "f7819d286ecc424e83c1d6e509ab1486": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": 
"1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f7d2834f66ac48719f1fd47dda33d7dc": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f8318de1c5c74051adcf59d5f6261903": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": 
"IPY_MODEL_0695270fa12047a69a2dcef569155b25", - "placeholder": "​", - "style": "IPY_MODEL_3f4e292341444c128a6941020bf78ea7", - "value": " 451/451 [00:00<00:00, 8.05kB/s]" - } - }, - "fe8d0c53f5704e42b7c271fecaa48851": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ed11dd778e2f49af96e269c4c37c6417", - "placeholder": "​", - "style": "IPY_MODEL_f7819d286ecc424e83c1d6e509ab1486", - "value": " 455k/455k [00:00<00:00, 720kB/s]" - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week09_pagerank/README.md b/week09_pagerank/README.md deleted file mode 100644 index 649531c..0000000 --- a/week09_pagerank/README.md +++ /dev/null @@ -1,5 +0,0 @@ -Page Rank explanation: -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/girafe-ai/natural-language-processing/blob/master/week09_pagerank/practice_pagerank.ipynb) - -Further readings: -* https://en.wikipedia.org/wiki/PageRank \ No newline at end of file diff --git a/week09_pagerank/practice_pagerank.ipynb b/week09_pagerank/practice_pagerank.ipynb deleted file mode 100644 index 145d4c4..0000000 --- a/week09_pagerank/practice_pagerank.ipynb +++ /dev/null @@ -1,271 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PageRank" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This page demonstrates the use of a short Python implementation of the PageRank algorithm on the link structure contained in the graph on the [PageRank 
Wikipedia](http://en.wikipedia.org/wiki/PageRank) page:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import Image\n", - "Image(url='http://upload.wikimedia.org/wikipedia/commons/f/fb/PageRanks-Example.svg')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, we will encode the links present on this graph as a count matrix `M_counts`." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[ 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n", - " [ 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0.]\n", - " [ 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", - " [ 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n", - " [ 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.]\n", - " [ 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n", - " [ 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", - " [ 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", - " [ 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", - " [ 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", - " [ 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
# %%
import numpy as np

# %% [markdown]
# First, we will encode the links present on this graph as a count matrix
# `M_counts`.

# %%
n_pages = 11  # numbering pages A through K as 0 to 10
# Holds the link counts (assumed 0 or 1).  Columns are the starting page and
# rows the destination page, i.e. M_counts[i, j] tells whether page j links
# to page i.
M_counts = np.zeros((n_pages, n_pages))

# Page 0 (A in the graphic) is a sink because it has no outgoing links at all.
# M cannot contain an all-zero column, so we act as if A was linking to every
# page (i.e. put 1's everywhere in its column).
M_counts[:, 0] = 1

# Every explicit link of the graph as a (source page, destination page) pair.
links = [
    (1, 2),                   # B->C
    (2, 1),                   # C->B
    (3, 0), (3, 1),           # D->A, D->B
    (4, 1), (4, 3), (4, 5),   # E->B, E->D, E->F
    (5, 1), (5, 4),           # F->B, F->E
    (6, 1), (6, 4),           # G->B, G->E
    (7, 1), (7, 4),           # H->B, H->E
    (8, 1), (8, 4),           # I->B, I->E
    (9, 4),                   # J->E
    (10, 4),                  # K->E
]
for source, destination in links:
    M_counts[destination, source] = 1
print(M_counts)

# %% [markdown]
# Now we can make an adjacency matrix `M` out of `M_counts` by dividing each
# column by its sum, i.e. we make sure every column sums to 1.

# %%
column_sums = M_counts.sum(axis=0)
if not column_sums.all():
    # Dividing by a zero column sum would silently fill the column with NaN.
    raise ValueError('M_counts has a column without outgoing links; '
                     'treat sink pages as linking to every page first')
# Broadcasting divides column j by column_sums[j].
M = M_counts / column_sums
np.set_printoptions(precision=3)
print(M)

# %% [markdown]
# Let us check that all the conditions on M are fulfilled.

# %%
def check_M(M):
    """
    Check that M has the right format to be used by the pagerank function.

    M : 2-D numpy array to validate.

    The checks, in order, are: M is square, every column sums to one
    (M is assumed column-stochastic), and within each column all non-zero
    entries are equal (and therefore equal to 1 divided by their number).

    Returns None; raises AssertionError (from np.testing) on the first
    condition that does not hold.
    """
    n_pages = M.shape[0]  # n_pages is the number of rows of M
    np.testing.assert_equal(M.shape[0], M.shape[1], err_msg='M should be square')
    np.testing.assert_array_almost_equal(
        M.sum(axis=0), np.ones(n_pages),
        err_msg='assert each column sums to one (M is assumed column-stochastic)')
    for j in range(n_pages):
        M_column = M[:, j]
        nonzero_entries = M_column[M_column.nonzero()]
        n_nonzero = nonzero_entries.size
        np.testing.assert_array_almost_equal(
            nonzero_entries, np.ones(n_nonzero) / n_nonzero,
            err_msg='in column %d, all non-zero entries should be equal '
                    '(and equal to 1 divided by their number)' % j)


# %%
check_M(M)  # will produce error if M does not have the right format

# %% [markdown]
# And we are now ready to apply the `pagerank` function, which will
# iteratively apply page transitions to a randomly initialized distribution
# over the pages, until convergence.

# %%
def pagerank(M, d=0.85, square_error=1e-6, seed=None, max_iter=None):
    """
    Compute the PageRanks of all pages by power iteration.

    M : the adjacency matrix of the pages (array-like, n_pages x n_pages).
        It is assumed to be column-stochastic (ie columns sum to 1); all
        links have equal weight.  A page with no outgoing links (sink) is
        represented as a page with outgoing links to each other page
        (ie restart page).
    d : damping factor.
    square_error : the algorithm iterates until the difference between two
        successive PageRank vectors v is less than this (in squared norm).
    seed : optional seed for the random starting vector.  With the default
        None the global numpy random state is used, as before; passing an
        integer makes the result reproducible.
    max_iter : optional cap on the number of iterations.  With the default
        None the loop is unbounded, as before.

    Returns the PageRanks of all pages as a 1-D array of length n_pages.
    Raises RuntimeError if max_iter is given and the iteration has not
    converged after that many steps.
    """
    M = np.asarray(M, dtype=float)  # also accept nested lists
    n_pages = M.shape[0]  # n_pages is the number of rows of M
    # Only build a private generator when a seed is requested, so callers
    # relying on np.random.seed(...) keep getting the same draws.
    rng = np.random if seed is None else np.random.RandomState(seed)
    v = rng.rand(n_pages)  # initialize to random vector
    v = v / v.sum()  # make v sum to 1
    last_v = np.ones(n_pages)  # will contain the previous v
    # Damped transition matrix: follow a link with probability d, otherwise
    # jump to a uniformly chosen page.
    M_hat = d * M + (1 - d) / n_pages * np.ones((n_pages, n_pages))
    n_iter = 0
    while np.square(v - last_v).sum() > square_error:
        if max_iter is not None and n_iter >= max_iter:
            raise RuntimeError(
                'pagerank did not converge within %d iterations' % max_iter)
        last_v = v
        v = M_hat.dot(v)  # at each iteration, progress one timestep
        n_iter += 1
    return v


# %%
# A fixed seed makes the displayed ranks identical on every run.
pagerank(M, seed=0)

# %% [markdown]
# These are the numbers (within the allowed error) displayed on the graph
# (the numbers on the graph are rounded exact values).
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -}