Commit

refactor: some improvements
marcpinet committed Dec 2, 2024
1 parent ffab2fe commit a538367
Showing 10 changed files with 129 additions and 145 deletions.
120 changes: 60 additions & 60 deletions examples/compression/autoencoder_fashonized_mnist_basic.ipynb

Large diffs are not rendered by default.

43 changes: 26 additions & 17 deletions examples/compression/autoencoder_fashonized_mnist_convolution.ipynb

Large diffs are not rendered by default.

11 changes: 4 additions & 7 deletions examples/generation/gan-image-generation/gan-mnist.ipynb
Original file line number Diff line number Diff line change
@@ -17,10 +17,7 @@
"\n",
"from neuralnetlib.preprocessing import one_hot_encode\n",
"from neuralnetlib.models import Sequential, GAN\n",
"from neuralnetlib.layers import Input, Dense, BatchNormalization, Dropout, Activation\n",
"from neuralnetlib.activations import LeakyReLU\n",
"from neuralnetlib.optimizers import Adam\n",
"from neuralnetlib.losses import Wasserstein"
"from neuralnetlib.layers import Input, Dense"
]
},
{
@@ -197,14 +194,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GIF créé avec succès sous le nom 'output.gif'\n"
"GIF 'output.gif' succesffuly created!\n"
]
}
],
@@ -217,7 +214,7 @@
"if images:\n",
" images[0].save('output.gif', save_all=True, append_images=images[1:], duration=100, loop=0)\n",
"\n",
"print(\"GIF créé avec succès sous le nom 'output.gif'\")"
"print(\"GIF 'output.gif' succesffuly created!\")"
]
},
{
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "a036f9b8eee0491",
"metadata": {
"ExecuteTime": {
@@ -14,6 +14,7 @@
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from neuralnetlib.layers import Input, Embedding, LSTM, Dense\n",
"from neuralnetlib.models import Sequential\n",
"from neuralnetlib.preprocessing import one_hot_encode\n",
Original file line number Diff line number Diff line change
@@ -2,48 +2,36 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"import re\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from neuralnetlib.models import Transformer\n",
"from neuralnetlib.preprocessing import Tokenizer, pad_sequences\n",
"from neuralnetlib.utils import train_test_split\n",
"from neuralnetlib.losses import CrossEntropyWithLabelSmoothing\n",
"from neuralnetlib.optimizers import Adam\n",
"from neuralnetlib.callbacks import EarlyStopping, Callback, LearningRateScheduler"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def prepare_causal_lm_data(text_data, tokenizer, max_length=512, stride=256):\n",
" \"\"\"\n",
" Prépare les données pour l'entraînement causal LM en utilisant une fenêtre glissante\n",
" \"\"\"\n",
" # Tokenisation du texte complet\n",
" tokens = tokenizer.texts_to_sequences([text_data], add_special_tokens=True)[0]\n",
" \n",
" # Création des séquences d'entraînement avec une fenêtre glissante\n",
" sequences = []\n",
" for i in range(0, len(tokens) - max_length + 1, stride):\n",
" sequence = tokens[i:i + max_length]\n",
" if len(sequence) == max_length:\n",
" sequences.append(sequence)\n",
" \n",
" # Conversion en array numpy\n",
" sequences = np.array(sequences)\n",
" \n",
" # Création des entrées et cibles (décalées d'une position)\n",
" X = sequences[:, :-1]\n",
" y = sequences[:, 1:]\n",
" \n",
@@ -52,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -75,7 +63,7 @@
" input_sequence,\n",
" max_length=self.max_length,\n",
" temperature=self.temperature,\n",
" beam_size=1 # On utilise un beam search de 1 pour la génération simple\n",
" beam_size=1\n",
" )\n",
" \n",
" generated_text = self.tokenizer.sequences_to_texts(generated.tolist())[0]\n",
@@ -85,26 +73,20 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train_causal_lm(text_data, model, tokenizer, max_length=512, batch_size=32, epochs=10):\n",
" \"\"\"\n",
" Entraîne le modèle comme un LLM causal\n",
" \"\"\"\n",
" # Préparation des données\n",
" X, y = prepare_causal_lm_data(text_data, tokenizer, max_length)\n",
" \n",
" # Prompts de test pour la génération\n",
" test_prompts = [\n",
" \"Il était une fois\",\n",
" \"Le chat\",\n",
" \"Je pense que\",\n",
" \"Dans la forêt\"\n",
" ]\n",
" \n",
" # Callbacks\n",
" callbacks = [\n",
" EarlyStopping(monitor='loss', patience=5, restore_best_weights=True),\n",
" TextGenerationCallback(model, tokenizer, test_prompts),\n",
@@ -115,7 +97,6 @@
" )\n",
" ]\n",
" \n",
" # Entraînement\n",
" history = model.fit(\n",
" X, y,\n",
" epochs=epochs,\n",
@@ -129,14 +110,11 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_text(model, tokenizer, prompt, max_length=50, temperature=0.8):\n",
" \"\"\"\n",
" Génère du texte à partir d'un prompt\n",
" \"\"\"\n",
" sequence = tokenizer.texts_to_sequences([prompt], add_special_tokens=True)[0]\n",
" input_sequence = pad_sequences([sequence], max_length=model.max_sequence_length, \n",
" padding='post', pad_value=model.PAD_IDX)\n",
@@ -153,15 +131,13 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Charger votre corpus de texte\n",
"with open('text8_light.txt', 'r', encoding='utf-8') as f:\n",
" text_data = f.read()\n",
"\n",
"# Créer et entraîner le tokenizer\n",
"tokenizer = Tokenizer(filters='')\n",
"tokenizer.fit_on_texts([text_data])\n",
"\n",
@@ -201,7 +177,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -239,14 +215,12 @@
}
],
"source": [
"# Entraîner le modèle\n",
"history = train_causal_lm(text_data, model, tokenizer)\n",
"\n"
"history = train_causal_lm(text_data, model, tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -258,7 +232,6 @@
}
],
"source": [
"# Générer du texte\n",
"generated = generate_text(model, tokenizer, \"this would ensure that\", temperature=0.8)\n",
"print(generated)"
]
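The prepare_causal_lm_data function in the diff above builds training pairs by sliding a fixed-length window over the tokenized corpus and shifting the targets one position to the right. Below is a minimal standalone sketch of that idea using plain numpy on toy token ids; the function name and toy parameters are illustrative only, and the notebook's real version works on sequences produced by tokenizer.texts_to_sequences with max_length=512.

import numpy as np

def sliding_window_lm_pairs(tokens, max_length=8, stride=4):
    # collect every full window of max_length tokens, advancing by `stride`
    sequences = [tokens[i:i + max_length]
                 for i in range(0, len(tokens) - max_length + 1, stride)]
    sequences = np.array(sequences)
    # inputs are all tokens but the last; targets are the same window shifted by one
    X = sequences[:, :-1]
    y = sequences[:, 1:]
    return X, y

tokens = list(range(20))            # toy stand-in for a tokenized corpus
X, y = sliding_window_lm_pairs(tokens)
print(X.shape, y.shape)             # (4, 7) (4, 7)
print(X[0], y[0])                   # [0 1 2 3 4 5 6] [1 2 3 4 5 6 7]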
Original file line number Diff line number Diff line change
@@ -2,20 +2,22 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import logging\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"from typing import Optional, Tuple, Dict\n",
"import logging\n",
"import re\n",
"import pandas as pd\n",
"from collections import Counter\n",
"\n",
"from neuralnetlib.layers import MultiHeadAttention, PositionalEncoding, AddNorm, FeedForward, TransformerEncoderLayer, TransformerDecoderLayer, Embedding\n",
"from neuralnetlib.preprocessing import PCA, Tokenizer, one_hot_encode, pad_sequences, normalize_gradient\n",
"from neuralnetlib.preprocessing import PCA, Tokenizer, pad_sequences, normalize_gradient\n",
"\n",
"logging.getLogger('matplotlib').setLevel(logging.ERROR)\n"
]
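The notebook above imports MultiHeadAttention, PositionalEncoding and related layers from neuralnetlib.layers to inspect attention behaviour. For reference, here is a minimal numpy sketch of single-head scaled dot-product attention, the operation those layers are built around; it is not neuralnetlib's implementation and omits masking and the learned projections.

import numpy as np

def scaled_dot_product_attention(Q, K, V):
    # Q, K, V: (seq_len, d_k) for a single head, no mask
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                    # pairwise similarity scores
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)     # row-wise softmax
    return weights @ V, weights                        # context vectors and attention map

rng = np.random.default_rng(0)
Q, K, V = (rng.normal(size=(4, 8)) for _ in range(3))
context, attn = scaled_dot_product_attention(Q, K, V)
print(context.shape, attn.shape)                       # (4, 8) (4, 4)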
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "a036f9b8eee0491",
"metadata": {
"ExecuteTime": {
@@ -14,12 +14,11 @@
"outputs": [],
"source": [
"import re\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from neuralnetlib.models import Transformer\n",
"from neuralnetlib.preprocessing import Tokenizer, pad_sequences\n",
"from neuralnetlib.utils import train_test_split\n",
"from neuralnetlib.losses import CrossEntropyWithLabelSmoothing\n",
"from neuralnetlib.optimizers import Adam\n",
"from neuralnetlib.callbacks import EarlyStopping, Callback, LearningRateScheduler"
]
@@ -229,7 +228,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "e3bdab93",
"metadata": {},
"outputs": [
@@ -288,7 +287,7 @@
" verbose=True,\n",
" callbacks=[\n",
" EarlyStopping(monitor='loss', patience=20),\n",
" #LearningRateScheduler(schedule=\"warmup_cosine\", initial_learning_rate=0.0001, verbose=True),\n",
" LearningRateScheduler(schedule=\"warmup_cosine\", initial_learning_rate=5e-5, verbose=True),\n",
" DebugCallback(model, fr_tokenizer, en_tokenizer)\n",
" ],\n",
" validation_data=(x_test, y_test),\n",
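The change above replaces a commented-out scheduler with an active LearningRateScheduler(schedule="warmup_cosine", initial_learning_rate=5e-5). The scheduler's actual implementation is not part of this diff; the sketch below only illustrates the usual shape of a warmup-then-cosine-decay curve, with warmup_cosine_lr being a local helper (not a neuralnetlib API) and the warmup and total step counts chosen arbitrarily.

import math

def warmup_cosine_lr(step, base_lr=5e-5, warmup_steps=100, total_steps=1000):
    # linear warmup from ~0 to base_lr, then cosine decay back towards 0
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

for step in (0, 50, 100, 500, 999):
    print(step, f"{warmup_cosine_lr(step):.2e}")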
Original file line number Diff line number Diff line change
@@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "4977326d",
"metadata": {
"ExecuteTime": {
@@ -28,16 +28,12 @@
},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.datasets import fetch_openml\n",
"from sklearn.model_selection import train_test_split\n",
"from neuralnetlib.utils import train_test_split\n",
"from neuralnetlib.models import Autoencoder\n",
"from scipy.stats import kurtosis, skew, pearsonr\n",
"from neuralnetlib.layers import *\n",
"from neuralnetlib.preprocessing import PCA\n",
"from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
"import matplotlib.pyplot as plt\n",
"from tqdm import tqdm"
"\n",
"import matplotlib.pyplot as plt"
]
},
{
3 changes: 3 additions & 0 deletions neuralnetlib/metrics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import math
import numpy as np

from collections import namedtuple


def _reshape_inputs(y_pred: np.ndarray, y_true: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
y_pred = np.asarray(y_pred)
12 changes: 8 additions & 4 deletions neuralnetlib/utils.py
Original file line number Diff line number Diff line change
@@ -67,29 +67,33 @@ def progress_bar(current: int, total: int, width: int = 30, message: str = "") -
sys.stdout.flush()


def train_test_split(x: np.ndarray, y: np.ndarray, test_size: float = 0.2, random_state: int = None, shuffle: bool = True) -> tuple:
def train_test_split(x: np.ndarray, y: np.ndarray = None, test_size: float = 0.2, random_state: int = None, shuffle: bool = True) -> tuple:
"""
Splits the data into training and test sets.
Args:
x (np.ndarray or list): input data
y (np.ndarray or list): target data
y (np.ndarray or list, optional): target data. If None, only x will be split
test_size (float): the proportion of the dataset to include in the test split
random_state (int): seed for the random number generator
shuffle (bool): whether to shuffle the data before splitting
Returns:
tuple: (x_train, x_test, y_train, y_test)
tuple: (x_train, x_test) if y is None, else (x_train, x_test, y_train, y_test)
"""
x = np.array(x)
y = np.array(y)
rng = np.random.default_rng(random_state if random_state is not None else int(time.time_ns()))
indices = np.arange(len(x))
if shuffle:
rng.shuffle(indices)
split_index = int(len(x) * (1 - test_size))
x_train = x[indices[:split_index]]
x_test = x[indices[split_index:]]

if y is None:
return x_train, x_test

y = np.array(y)
y_train = y[indices[:split_index]]
y_test = y[indices[split_index:]]
return x_train, x_test, y_train, y_test
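A quick usage sketch of the updated train_test_split, based on the signature and return values shown in the diff above; the unsupervised call is the behaviour added by this commit.

import numpy as np
from neuralnetlib.utils import train_test_split

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

# supervised split: four arrays, as before
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# y can now be omitted, in which case only x is split
x_train, x_test = train_test_split(X, test_size=0.2, random_state=42)
print(x_train.shape, x_test.shape)   # (8, 2) (2, 2)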
