Initial AI exploration for the backend of the applications #50
base: main
Changes from all commits: 7cbcc38, 2489374, 1885c64, e0fea15, c591f45, c8fa4ca, d7d6379, 0151a05, 2c3c2fb, ed091f8, 96d74da, 89dfc3d.
[Five files with large diffs are not rendered.]
New file (11 lines): program-type lookup (CSV), mapping ids to program-type titles.

```csv
"id","title"
697435,Maîtrise avec projet
738239,Microprogramme
697451,Maîtrise avec mémoire
915770,Concentration en technologies de la santé
697388,Doctorat
700093,Programme court
700092,Certificat
700095,DESS
700094,Cheminement universitaire en technologie (CUT)
699771,Baccalauréat
```
[Two files with large diffs are not rendered.]
New file (39 lines): clustering benchmark results (CSV). Silhouette score and k-means inertia for 5 clusters per model; Time Taken is in seconds; the `dataframe` column distinguishes the `course` and `combined` input sets.

```csv
Model Name,Silhouette Score,Number of Clusters,Inertia,Time Taken,dataframe
MiniLM,0.06354355,5,2104.3017578125,62.3960816860199,course
Multilingual MiniLM-L12,0.06972561,5,1927.947265625,88.9396481513977,course
MiniLM-L6,0.06067439,5,6040.69970703125,48.64740014076233,course
Multilingual MPNet,0.055970665,5,1884.258056640625,254.99050879478455,course
DistilRoBERTa,0.049180128,5,13973.9873046875,159.84893226623535,course
MiniLM-L12,0.04269758,5,3714.1708984375,98.60889649391174,course
MiniLM-L3,0.038517494,5,2678.4150390625,23.103832721710205,course
XLM-RoBERTa Base Multilingual,0.0772015,5,49950.2265625,277.00257778167725,course
STSB XLM-R Multilingual,0.0772015,5,49950.2265625,330.6788206100464,course
MS MARCO DistilBERT,0.0336069,5,5619.89453125,149.96267318725586,course
MS MARCO BERT,0.044978958,5,16743.794921875,338.83691787719727,course
T5 Small,0.14505102,5,204.54937744140625,77.6448757648468,course
T5 Base,0.06801192,5,996.11279296875,396.1023998260498,course
T5 Large,0.041167583,5,1061.4305419921875,1004.1282534599304,course
FLAN-T5 Small,0.10541956,5,162.1136932373047,71.80396556854248,course
FLAN-T5 Base,0.06095163,5,388.6905517578125,278.6274485588074,course
FLAN-T5 Large,0.10817401,5,129.44412231445312,915.0140993595123,course
RoBERTa Large v1,0.051263746,5,224872.84375,903.1814665794373,course
MPNet Base v2,0.06346093,5,3160.8056640625,266.6551468372345,course
MiniLM,0.06354355,5,2104.3017578125,35.63298296928406,combined
Multilingual MiniLM-L12,0.06972561,5,1927.947265625,59.94369435310364,combined
MiniLM-L6,0.06067439,5,6040.69921875,35.490906953811646,combined
Multilingual MPNet,0.055970665,5,1884.258056640625,177.7440221309662,combined
DistilRoBERTa,0.049180128,5,13973.9873046875,117.59407949447632,combined
MiniLM-L12,0.04269758,5,3714.1708984375,68.83575224876404,combined
MiniLM-L3,0.038517494,5,2678.4150390625,18.43731689453125,combined
XLM-RoBERTa Base Multilingual,0.0772015,5,49950.2265625,177.5923249721527,combined
STSB XLM-R Multilingual,0.0772015,5,49950.2265625,177.61623072624207,combined
MS MARCO DistilBERT,0.0336069,5,5619.89453125,105.29467010498047,combined
MS MARCO BERT,0.044978958,5,16743.794921875,209.93503665924072,combined
T5 Small,0.14505102,5,204.54937744140625,59.6424446105957,combined
T5 Base,0.06801192,5,996.11279296875,227.7394471168518,combined
T5 Large,0.041167583,5,1061.4306640625,745.5525488853455,combined
FLAN-T5 Small,0.10541956,5,162.1136932373047,67.67559218406677,combined
FLAN-T5 Base,0.06095163,5,388.6905517578125,252.0490207672119,combined
FLAN-T5 Large,0.10817401,5,129.44412231445312,818.083892583847,combined
RoBERTa Large v1,0.051263746,5,224872.84375,748.9289219379425,combined
MPNet Base v2,0.06346093,5,3160.8056640625,219.38725876808167,combined
```
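Silhouette score (higher means better-separated clusters) is the metric that is comparable across rows; inertia is not, because embedding dimensionality and scale differ between models. As a quick way to rank the runs, here is a minimal pandas sketch, assuming the table above is saved under the hypothetical filename `model_benchmark_results.csv`:

```python
import pandas as pd

# Hypothetical filename for the benchmark table above.
results = pd.read_csv("model_benchmark_results.csv")

# Rank by silhouette score only; inertia is scale-dependent and should
# not be compared across models.
top = (results[results["dataframe"] == "course"]
       .sort_values("Silhouette Score", ascending=False)
       .head(5))
print(top[["Model Name", "Silhouette Score", "Time Taken"]])
```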
[Three files with large diffs are not rendered.]
New file (14 lines): Python dependency list (requirements-style; only numpy and transformers are pinned).

```text
torch
numpy==1.26
pandas
scikit-learn
plotly
nltk
transformers==4.46.3
sentence-transformers
einops
datasets
gradio
networkx
umap-learn
ipywidgets
```
New file (9 lines): k-means clustering evaluation.

```python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def evaluate_clustering(embeddings, n_clusters=5):
    """Cluster embeddings with k-means; return labels, silhouette score, and inertia."""
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    silhouette_avg = silhouette_score(embeddings, labels)
    inertia = kmeans.inertia_
    return labels, silhouette_avg, inertia
```
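A minimal usage sketch, with random vectors standing in for real sentence embeddings (384 dimensions matches the MiniLM-style models benchmarked above):

```python
import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(200, 384))  # stand-in for real 384-dim embeddings

labels, silhouette_avg, inertia = evaluate_clustering(embeddings, n_clusters=5)
print(f"silhouette={silhouette_avg:.4f}, inertia={inertia:.1f}")
```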
New file (29 lines): embedding generation with a T5 encoder.

```python
import torch
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def generate_embeddings(texts, model, tokenizer, batch_size=32):
    """
    Generate embeddings using the encoder of a T5 model.
    Returns embeddings and time taken in seconds.
    """
    model.to(device)
    model.eval()
    embeddings = []
    start_time = time.time()

    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]
            tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
            tokens = {k: v.to(device) for k, v in tokens.items()}

            # Use only the T5 encoder, then mean-pool over real tokens;
            # the attention mask excludes padding so it does not skew the average.
            encoder_outputs = model.encoder(input_ids=tokens['input_ids'], attention_mask=tokens['attention_mask'])
            mask = tokens['attention_mask'].unsqueeze(-1)
            summed = (encoder_outputs.last_hidden_state * mask).sum(dim=1)
            batch_embeddings = summed / mask.sum(dim=1).clamp(min=1)
            embeddings.append(batch_embeddings.cpu())

    embeddings = torch.cat(embeddings, dim=0).numpy()
    time_taken = time.time() - start_time
    return embeddings, time_taken
```
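A usage sketch for this helper, assuming `t5-small` weights can be downloaded; since `t5-small` has hidden size 512, the output shape is `(len(texts), 512)`:

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

texts = ["Introduction à la programmation", "Analyse de circuits électriques"]
embeddings, seconds = generate_embeddings(texts, model, tokenizer, batch_size=2)
print(embeddings.shape, f"{seconds:.2f}s")  # expected: (2, 512)
```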
New file (55 lines): model registry and loader.

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel

# General models to benchmark, mapped to their architecture type.
models_to_test_general = {
    # Encoder models
    "sentence-transformers/all-MiniLM-L6-v2": "encoder",
    "sentence-transformers/all-distilroberta-v1": "encoder",
    "sentence-transformers/all-MiniLM-L12-v2": "encoder",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": "encoder",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": "encoder",
    "sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens": "encoder",
    "sentence-transformers/stsb-xlm-r-multilingual": "encoder",
    "sentence-transformers/msmarco-distilbert-base-v2": "encoder",
    "sentence-transformers/msmarco-bert-base-dot-v5": "encoder",
    "sentence-transformers/all-mpnet-base-v2": "encoder",
    "sentence-transformers/roberta-large-v1": "encoder",
    "sentence-transformers/miniLM-L6": "encoder",
    "sentence-transformers/miniLM-L3": "encoder",
    "sentence-transformers/miniLM": "encoder",

    # Seq2seq models (only their encoder is used for embeddings)
    "t5-small": "seq2seq",
    "t5-base": "seq2seq",
    "t5-large": "seq2seq",
    "google/flan-t5-small": "seq2seq",
    "google/flan-t5-base": "seq2seq",
    "google/flan-t5-large": "seq2seq",
}

# French-capable (multilingual) subset.
models_to_test_french = {
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": "encoder",
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": "encoder",
    "sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens": "encoder",
    "sentence-transformers/stsb-xlm-r-multilingual": "encoder",
    "t5-small": "seq2seq",
    "t5-base": "seq2seq",
    "google/flan-t5-small": "seq2seq",
    "google/flan-t5-base": "seq2seq",
}


def load_model_and_tokenizer(model_name):
    """
    Load tokenizer and model based on the model type.

    :param model_name: Hugging Face model identifier (a key of models_to_test_general)
    :return: tokenizer and model
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if models_to_test_general[model_name] == "encoder":
        model = AutoModel.from_pretrained(model_name)
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model
```
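A sketch of how the registry and loader compose; each call downloads weights on first use, so iterating the full general list is heavy and only the French subset is shown:

```python
for model_name, model_type in models_to_test_french.items():
    tokenizer, model = load_model_and_tokenizer(model_name)
    print(f"loaded {model_name} ({model_type})")
```

Note that the sentence-transformers checkpoints are loaded here through plain `AutoModel`, which returns raw token states; pooling must then be done manually, whereas the sentence-transformers library would apply each model's bundled pooling automatically.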
New file (18 lines): French text preprocessing.

```python
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('french'))


def remove_stopwords(text):
    """Drop French stopwords and non-alphabetic tokens."""
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(filtered_words)


def preprocess_text(text):
    """Lowercase, strip punctuation, then remove stopwords."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    return remove_stopwords(text)
```
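A quick example of the pipeline on a French sentence; the exact output depends on NLTK's French stopword list, so the result shown is approximate:

```python
raw = "Les étudiants doivent compléter tous les cours obligatoires du programme."
print(preprocess_text(raw))
# Roughly: "étudiants doivent compléter tous cours obligatoires programme"
```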
New file (140 lines): embedding visualization utilities.

```python
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap


def visualize_embeddings(embeddings, labels, method='PCA'):
    # Reducers are seeded for reproducibility (SonarCloud code-scanning notice:
    # "Results that depend on random number generation should be reproducible").
    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=42)
    elif method == 't-SNE':
        # Note: in scikit-learn >= 1.5, n_iter is renamed max_iter.
        reducer = TSNE(n_components=2, perplexity=30, n_iter=1000, random_state=42)
    elif method == 'UMAP':
        reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
    else:
        raise ValueError("Unsupported method")

    reduced = reducer.fit_transform(embeddings)
    fig = px.scatter(
        x=reduced[:, 0],
        y=reduced[:, 1],
        color=labels,
        title=f"Embeddings Visualization - {method}"
    )
    fig.show()


def visualize_embeddings_programs_type(embeddings, program_df, method='PCA'):
    # Assign a thematic category based on keywords found in the program title.
    def assign_category(title):
        title = title.lower()
        # The colors in this map are currently unused: only the matched
        # keyword is returned as the category name.
        color_map = {
            "logiciel": "blue",
            "mécanique": "green",
            "construction": "orange",
            "information": "purple",
            "production": "red",
            "logistique": "yellow",
            "électrique": "pink",
            "santé": "brown",
            "financière": "gray",
            "télécom": "black",
            "gestion": "cyan",
            "environnement": "magenta",
            "aérospatiale": "olive",
            "expérience": "teal",
            "interna": "navy",
            "entreprenariat": "maroon",
            "projet": "gold",
            "génie": "silver",
            "informatique": "indigo",
            "aérospatial": "orchid",
        }

        for category, color in color_map.items():
            if category in title:
                return category

        return "Other"

    # Assign categories to the program DataFrame
    program_df['category'] = program_df['title'].apply(assign_category)

    # Choose dimensionality reduction method (seeded for reproducibility)
    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=42)
    elif method == 't-SNE':
        reducer = TSNE(n_components=2, perplexity=30, n_iter=1000, random_state=42)
    elif method == 'UMAP':
        reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
    else:
        raise ValueError("Unsupported method")

    # Reduce embeddings to 2D
    reduced = reducer.fit_transform(embeddings)

    # Create scatter plot
    fig = px.scatter(
        x=reduced[:, 0],
        y=reduced[:, 1],
        color=program_df['category'],
        labels={'color': 'Category'},
        title=f"Embeddings Visualization - {method}",
        hover_data={'Title': program_df['title']}
    )

    # Show the figure
    fig.show()


def visualize_embeddings_programs(embeddings, program_df, method='PCA'):
    # Map each program title to a degree-type category.
    def assign_category(title):
        title = title.lower()
        if "dess" in title:
            return "DESS"
        elif "micro" in title:
            return "Microprogramme"
        elif "certificat" in title:
            return "Certificat"
        elif "court" in title:
            return "Programme court"
        elif "bac" in title:
            return "Baccalauréat"
        elif "maîtrise" in title or "maitrise" in title:
            return "Maîtrise"
        elif "doc" in title:
            return "Doctorat"
        elif "cheminement" in title:  # keyword lowercased: title was lowercased above
            return "Cheminement"
        elif "annee" in title:  # keyword lowercased: title was lowercased above
            return "Année"
        else:
            return "Other"

    # Assign categories to the program DataFrame
    program_df['category'] = program_df['title'].apply(assign_category)

    # Choose dimensionality reduction method (seeded for reproducibility)
    if method == 'PCA':
        reducer = PCA(n_components=2, random_state=42)
    elif method == 't-SNE':
        reducer = TSNE(n_components=2, perplexity=30, n_iter=1000, random_state=42)
    elif method == 'UMAP':
        reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
    else:
        raise ValueError("Unsupported method")

    # Reduce embeddings to 2D
    reduced = reducer.fit_transform(embeddings)

    # Create scatter plot
    fig = px.scatter(
        x=reduced[:, 0],
        y=reduced[:, 1],
        color=program_df['category'],
        labels={'color': 'Category'},
        title=f"Embeddings Visualization - {method}",
        hover_data={'Title': program_df['title']}
    )

    # Show the figure
    fig.show()
```
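A minimal end-to-end sketch with hypothetical data: random vectors stand in for real embeddings, and PCA is used because t-SNE (perplexity 30) and UMAP (15 neighbors) need more samples than this toy frame provides:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
program_df = pd.DataFrame({"title": [
    "Maîtrise en génie logiciel",
    "Certificat en gestion de projet",
    "Doctorat en génie électrique",
    "Microprogramme en technologies de la santé",
]})
embeddings = rng.normal(size=(len(program_df), 384))  # stand-in for real embeddings

visualize_embeddings_programs(embeddings, program_df, method='PCA')
```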