Commit

refactor: some improvements
marcpinet committed Dec 2, 2024
1 parent ffab2fe commit a538367
Showing 10 changed files with 129 additions and 145 deletions.
120 changes: 60 additions & 60 deletions examples/compression/autoencoder_fashonized_mnist_basic.ipynb

Large diffs are not rendered by default.

43 changes: 26 additions & 17 deletions examples/compression/autoencoder_fashonized_mnist_convolution.ipynb

Large diffs are not rendered by default.

11 changes: 4 additions & 7 deletions examples/generation/gan-image-generation/gan-mnist.ipynb
Original file line number Diff line number Diff line change
@@ -17,10 +17,7 @@
"\n",
"from neuralnetlib.preprocessing import one_hot_encode\n",
"from neuralnetlib.models import Sequential, GAN\n",
"from neuralnetlib.layers import Input, Dense, BatchNormalization, Dropout, Activation\n",
"from neuralnetlib.activations import LeakyReLU\n",
"from neuralnetlib.optimizers import Adam\n",
"from neuralnetlib.losses import Wasserstein"
"from neuralnetlib.layers import Input, Dense"
]
},
{
@@ -197,14 +194,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GIF créé avec succès sous le nom 'output.gif'\n"
"GIF 'output.gif' succesffuly created!\n"
]
}
],
@@ -217,7 +214,7 @@
"if images:\n",
" images[0].save('output.gif', save_all=True, append_images=images[1:], duration=100, loop=0)\n",
"\n",
"print(\"GIF créé avec succès sous le nom 'output.gif'\")"
"print(\"GIF 'output.gif' succesffuly created!\")"
]
},
{
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "a036f9b8eee0491",
"metadata": {
"ExecuteTime": {
@@ -14,6 +14,7 @@
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from neuralnetlib.layers import Input, Embedding, LSTM, Dense\n",
"from neuralnetlib.models import Sequential\n",
"from neuralnetlib.preprocessing import one_hot_encode\n",
Original file line number Diff line number Diff line change
@@ -2,48 +2,36 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"import re\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from neuralnetlib.models import Transformer\n",
"from neuralnetlib.preprocessing import Tokenizer, pad_sequences\n",
"from neuralnetlib.utils import train_test_split\n",
"from neuralnetlib.losses import CrossEntropyWithLabelSmoothing\n",
"from neuralnetlib.optimizers import Adam\n",
"from neuralnetlib.callbacks import EarlyStopping, Callback, LearningRateScheduler"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def prepare_causal_lm_data(text_data, tokenizer, max_length=512, stride=256):\n",
" \"\"\"\n",
" Prépare les données pour l'entraînement causal LM en utilisant une fenêtre glissante\n",
" \"\"\"\n",
" # Tokenisation du texte complet\n",
" tokens = tokenizer.texts_to_sequences([text_data], add_special_tokens=True)[0]\n",
" \n",
" # Création des séquences d'entraînement avec une fenêtre glissante\n",
" sequences = []\n",
" for i in range(0, len(tokens) - max_length + 1, stride):\n",
" sequence = tokens[i:i + max_length]\n",
" if len(sequence) == max_length:\n",
" sequences.append(sequence)\n",
" \n",
" # Conversion en array numpy\n",
" sequences = np.array(sequences)\n",
" \n",
" # Création des entrées et cibles (décalées d'une position)\n",
" X = sequences[:, :-1]\n",
" y = sequences[:, 1:]\n",
" \n",
@@ -52,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -75,7 +63,7 @@
" input_sequence,\n",
" max_length=self.max_length,\n",
" temperature=self.temperature,\n",
" beam_size=1 # On utilise un beam search de 1 pour la génération simple\n",
" beam_size=1\n",
" )\n",
" \n",
" generated_text = self.tokenizer.sequences_to_texts(generated.tolist())[0]\n",
@@ -85,26 +73,20 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train_causal_lm(text_data, model, tokenizer, max_length=512, batch_size=32, epochs=10):\n",
" \"\"\"\n",
" Entraîne le modèle comme un LLM causal\n",
" \"\"\"\n",
" # Préparation des données\n",
" X, y = prepare_causal_lm_data(text_data, tokenizer, max_length)\n",
" \n",
" # Prompts de test pour la génération\n",
" test_prompts = [\n",
" \"Il était une fois\",\n",
" \"Le chat\",\n",
" \"Je pense que\",\n",
" \"Dans la forêt\"\n",
" ]\n",
" \n",
" # Callbacks\n",
" callbacks = [\n",
" EarlyStopping(monitor='loss', patience=5, restore_best_weights=True),\n",
" TextGenerationCallback(model, tokenizer, test_prompts),\n",
@@ -115,7 +97,6 @@
" )\n",
" ]\n",
" \n",
" # Entraînement\n",
" history = model.fit(\n",
" X, y,\n",
" epochs=epochs,\n",
@@ -129,14 +110,11 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_text(model, tokenizer, prompt, max_length=50, temperature=0.8):\n",
" \"\"\"\n",
" Génère du texte à partir d'un prompt\n",
" \"\"\"\n",
" sequence = tokenizer.texts_to_sequences([prompt], add_special_tokens=True)[0]\n",
" input_sequence = pad_sequences([sequence], max_length=model.max_sequence_length, \n",
" padding='post', pad_value=model.PAD_IDX)\n",
@@ -153,15 +131,13 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Charger votre corpus de texte\n",
"with open('text8_light.txt', 'r', encoding='utf-8') as f:\n",
" text_data = f.read()\n",
"\n",
"# Créer et entraîner le tokenizer\n",
"tokenizer = Tokenizer(filters='')\n",
"tokenizer.fit_on_texts([text_data])\n",
"\n",
@@ -201,7 +177,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -239,14 +215,12 @@
}
],
"source": [
"# Entraîner le modèle\n",
"history = train_causal_lm(text_data, model, tokenizer)\n",
"\n"
"history = train_causal_lm(text_data, model, tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -258,7 +232,6 @@
}
],
"source": [
"# Générer du texte\n",
"generated = generate_text(model, tokenizer, \"this would ensure that\", temperature=0.8)\n",
"print(generated)"
]
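The prepare_causal_lm_data function in the diff above builds training pairs by sliding a fixed-length window over the tokenized corpus and shifting the targets one position to the right. Below is a minimal standalone sketch of that idea using plain numpy on toy token ids; the function name and toy parameters are illustrative only, and the notebook's real version works on sequences produced by tokenizer.texts_to_sequences with max_length=512.

import numpy as np

def sliding_window_lm_pairs(tokens, max_length=8, stride=4):
    # collect every full window of max_length tokens, advancing by `stride`
    sequences = [tokens[i:i + max_length]
                 for i in range(0, len(tokens) - max_length + 1, stride)]
    sequences = np.array(sequences)
    # inputs are all tokens but the last; targets are the same window shifted by one
    X = sequences[:, :-1]
    y = sequences[:, 1:]
    return X, y

tokens = list(range(20))            # toy stand-in for a tokenized corpus
X, y = sliding_window_lm_pairs(tokens)
print(X.shape, y.shape)             # (4, 7) (4, 7)
print(X[0], y[0])                   # [0 1 2 3 4 5 6] [1 2 3 4 5 6 7]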
Original file line number Diff line number Diff line change
@@ -2,20 +2,22 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import logging\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"from typing import Optional, Tuple, Dict\n",
"import logging\n",
"import re\n",
"import pandas as pd\n",
"from collections import Counter\n",
"\n",
"from neuralnetlib.layers import MultiHeadAttention, PositionalEncoding, AddNorm, FeedForward, TransformerEncoderLayer, TransformerDecoderLayer, Embedding\n",
"from neuralnetlib.preprocessing import PCA, Tokenizer, one_hot_encode, pad_sequences, normalize_gradient\n",
"from neuralnetlib.preprocessing import PCA, Tokenizer, pad_sequences, normalize_gradient\n",
"\n",
"logging.getLogger('matplotlib').setLevel(logging.ERROR)\n"
]
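The notebook above imports MultiHeadAttention, PositionalEncoding and related layers from neuralnetlib.layers to inspect attention behaviour. For reference, here is a minimal numpy sketch of single-head scaled dot-product attention, the operation those layers are built around; it is not neuralnetlib's implementation and omits masking and the learned projections.

import numpy as np

def scaled_dot_product_attention(Q, K, V):
    # Q, K, V: (seq_len, d_k) for a single head, no mask
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                    # pairwise similarity scores
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)     # row-wise softmax
    return weights @ V, weights                        # context vectors and attention map

rng = np.random.default_rng(0)
Q, K, V = (rng.normal(size=(4, 8)) for _ in range(3))
context, attn = scaled_dot_product_attention(Q, K, V)
print(context.shape, attn.shape)                       # (4, 8) (4, 4)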
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "a036f9b8eee0491",
"metadata": {
"ExecuteTime": {
@@ -14,12 +14,11 @@
"outputs": [],
"source": [
"import re\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from neuralnetlib.models import Transformer\n",
"from neuralnetlib.preprocessing import Tokenizer, pad_sequences\n",
"from neuralnetlib.utils import train_test_split\n",
"from neuralnetlib.losses import CrossEntropyWithLabelSmoothing\n",
"from neuralnetlib.optimizers import Adam\n",
"from neuralnetlib.callbacks import EarlyStopping, Callback, LearningRateScheduler"
]
@@ -229,7 +228,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "e3bdab93",
"metadata": {},
"outputs": [
@@ -288,7 +287,7 @@
" verbose=True,\n",
" callbacks=[\n",
" EarlyStopping(monitor='loss', patience=20),\n",
" #LearningRateScheduler(schedule=\"warmup_cosine\", initial_learning_rate=0.0001, verbose=True),\n",
" LearningRateScheduler(schedule=\"warmup_cosine\", initial_learning_rate=5e-5, verbose=True),\n",
" DebugCallback(model, fr_tokenizer, en_tokenizer)\n",
" ],\n",
" validation_data=(x_test, y_test),\n",
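The change above replaces a commented-out scheduler with an active LearningRateScheduler(schedule="warmup_cosine", initial_learning_rate=5e-5). The scheduler's actual implementation is not part of this diff; the sketch below only illustrates the usual shape of a warmup-then-cosine-decay curve, with warmup_cosine_lr being a local helper (not a neuralnetlib API) and the warmup and total step counts chosen arbitrarily.

import math

def warmup_cosine_lr(step, base_lr=5e-5, warmup_steps=100, total_steps=1000):
    # linear warmup from ~0 to base_lr, then cosine decay back towards 0
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

for step in (0, 50, 100, 500, 999):
    print(step, f"{warmup_cosine_lr(step):.2e}")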
Original file line number Diff line number Diff line change
@@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "4977326d",
"metadata": {
"ExecuteTime": {
@@ -28,16 +28,12 @@
},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.datasets import fetch_openml\n",
"from sklearn.model_selection import train_test_split\n",
"from neuralnetlib.utils import train_test_split\n",
"from neuralnetlib.models import Autoencoder\n",
"from scipy.stats import kurtosis, skew, pearsonr\n",
"from neuralnetlib.layers import *\n",
"from neuralnetlib.preprocessing import PCA\n",
"from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
"import matplotlib.pyplot as plt\n",
"from tqdm import tqdm"
"\n",
"import matplotlib.pyplot as plt"
]
},
{
3 changes: 3 additions & 0 deletions neuralnetlib/metrics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import math
import numpy as np

from collections import namedtuple


def _reshape_inputs(y_pred: np.ndarray, y_true: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
y_pred = np.asarray(y_pred)
12 changes: 8 additions & 4 deletions neuralnetlib/utils.py
Original file line number Diff line number Diff line change
@@ -67,29 +67,33 @@ def progress_bar(current: int, total: int, width: int = 30, message: str = "") -
sys.stdout.flush()


def train_test_split(x: np.ndarray, y: np.ndarray, test_size: float = 0.2, random_state: int = None, shuffle: bool = True) -> tuple:
def train_test_split(x: np.ndarray, y: np.ndarray = None, test_size: float = 0.2, random_state: int = None, shuffle: bool = True) -> tuple:
"""
Splits the data into training and test sets.
Args:
x (np.ndarray or list): input data
y (np.ndarray or list): target data
y (np.ndarray or list, optional): target data. If None, only x will be split
test_size (float): the proportion of the dataset to include in the test split
random_state (int): seed for the random number generator
shuffle (bool): whether to shuffle the data before splitting
Returns:
tuple: (x_train, x_test, y_train, y_test)
tuple: (x_train, x_test) if y is None, else (x_train, x_test, y_train, y_test)
"""
x = np.array(x)
y = np.array(y)
rng = np.random.default_rng(random_state if random_state is not None else int(time.time_ns()))
indices = np.arange(len(x))
if shuffle:
rng.shuffle(indices)
split_index = int(len(x) * (1 - test_size))
x_train = x[indices[:split_index]]
x_test = x[indices[split_index:]]

if y is None:
return x_train, x_test

y = np.array(y)
y_train = y[indices[:split_index]]
y_test = y[indices[split_index:]]
return x_train, x_test, y_train, y_test
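A quick usage sketch of the updated train_test_split, based on the signature and return values shown in the diff above; the unsupervised call is the behaviour added by this commit.

import numpy as np
from neuralnetlib.utils import train_test_split

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

# supervised split: four arrays, as before
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# y can now be omitted, in which case only x is split
x_train, x_test = train_test_split(X, test_size=0.2, random_state=42)
print(x_train.shape, x_test.shape)   # (8, 2) (2, 2)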
