Merge pull request #1 from jiucchu/feat/215/setting
[AI/FEAT] Initial setup for API connection
jiucchu authored Jan 12, 2024
2 parents c9e4bcc + a19e9e4 commit 549347b
Showing 17 changed files with 64,644 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.safetensors filter=lfs diff=lfs merge=lfs -text
11 changes: 11 additions & 0 deletions AI/main.py
@@ -0,0 +1,11 @@
from typing import List
from pydantic import BaseModel

from fastapi import FastAPI, Depends, File, Request, APIRouter
import uvicorn
import requests

from model import sbert

app = FastAPI()
app.include_router(sbert)
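
main.py imports uvicorn but never calls it, so the application has to be started externally. A minimal sketch of a local launch, assuming the working directory is AI/ and that the host and port are free choices, not something fixed by this commit:

```python
# Minimal sketch: run the FastAPI app defined in AI/main.py locally.
# Host and port are illustrative assumptions, not part of the original commit.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
```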
116 changes: 116 additions & 0 deletions AI/model.py
@@ -0,0 +1,116 @@
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
import urllib.request
import torch
from pydantic import BaseModel
from typing import List

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

from fastapi import APIRouter

# Load the model
model_path = './models'
model = SentenceTransformer(model_path)


sbert = APIRouter(prefix='/sbert')

# Route setup
@sbert.get('/')
async def start_sbert():
    return {'message': "sbert is starting"}

class Item(BaseModel):
    name: str
    sentence: str

class Item_and_num(BaseModel):
    item_list: List[Item]
    min_n: int
    max_n: int

@sbert.post('/clustering')
async def start_sbert(data: Item_and_num):

    items = data.item_list
    min_num = data.min_n
    max_num = data.max_n

    # Sentence embedding
    corpus = [item.sentence for item in items]
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

    # Clustering (KMeans is already imported at module level)
    num_clusters = 4
    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(corpus_embeddings.cpu())
    cluster_assignment = clustering_model.labels_

    clustered_sentences = [[] for _ in range(num_clusters)]
    for sentence_id, cluster_id in enumerate(cluster_assignment):
        clustered_sentences[cluster_id].append(corpus[sentence_id])

    cluster_list = list()
    for i, cluster in enumerate(clustered_sentences):
        cluster_list.append(cluster)

    # Adjust group sizes
    copy_list = list()
    for cluster in cluster_list:
        copy_list.append(cluster)

    # Split groups that exceed the maximum size in half
    st_len = len(copy_list)
    for i in range(st_len):
        if len(copy_list[i]) > max_num:
            copy_list.append(copy_list[i][:(len(copy_list[i]) // 2)])
            copy_list.append(copy_list[i][(len(copy_list[i]) // 2):])
            copy_list[i] = []

    # Collect all singleton groups into one group
    st_len = len(copy_list)
    one_list = []
    for i in range(st_len):
        if len(copy_list[i]) == 1:
            one_list += copy_list[i]
            copy_list[i] = []

    copy_list.append(one_list)

    # Merge groups smaller than min_num with each other
    st_len = len(copy_list)
    for i in range(st_len - 1):
        for j in range(1, st_len - i - 1):
            team_check = copy_list[i] + copy_list[i + j]
            if (len(team_check) <= max_num) and (len(team_check) >= min_num):
                copy_list.append(team_check)
                copy_list[i + j] = []
                copy_list[i] = []

    # Drop the groups that were emptied above
    while [] in copy_list:
        copy_list.remove([])

    # Replace each sentence with the name of the member who wrote it
    for i in range(len(copy_list)):
        for j in range(len(copy_list[i])):
            target_value = copy_list[i][j]
            for item in items:
                if item.sentence == target_value:
                    copy_list[i][j] = item.name

    return copy_list
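
To illustrate the request shape that the Item_and_num schema expects, here is a minimal client sketch. The server address, member names, and sentences are made up for illustration; the printed result is only an example of the grouped-names output shape, not a guaranteed clustering.

```python
# Minimal sketch of a client call to the /sbert/clustering endpoint.
# Assumes the API is running on localhost:8000; payload contents are invented.
import requests

payload = {
    "item_list": [
        {"name": "Alice", "sentence": "I want to build a recommendation service."},
        {"name": "Bob", "sentence": "Interested in recommender systems and ranking."},
        {"name": "Carol", "sentence": "I would like to work on a mobile UI project."},
        {"name": "Dave", "sentence": "Frontend and mobile app design are my focus."},
    ],
    "min_n": 2,
    "max_n": 3,
}

resp = requests.post("http://localhost:8000/sbert/clustering", json=payload)
print(resp.json())  # e.g. [["Alice", "Bob"], ["Carol", "Dave"]]
```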
7 changes: 7 additions & 0 deletions AI/models/1_Pooling/config.json
@@ -0,0 +1,7 @@
{
  "word_embedding_dimension": 768,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false
}
126 changes: 126 additions & 0 deletions AI/models/README.md
@@ -0,0 +1,126 @@
---
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
- transformers

---

# {MODEL_NAME}

This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.

<!--- Describe your model here -->

## Usage (Sentence-Transformers)

Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('{MODEL_NAME}')
embeddings = model.encode(sentences)
print(embeddings)
```



## Usage (HuggingFace Transformers)
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.

```python
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
model = AutoModel.from_pretrained('{MODEL_NAME}')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings)
```
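
The card is tagged sentence-similarity, so a natural follow-up (an addition here, not part of the original card) is to score the two embeddings produced by the snippet above with cosine similarity:

```python
# Sketch: cosine similarity between the two sentence embeddings computed above.
import torch.nn.functional as F

emb = F.normalize(sentence_embeddings, p=2, dim=1)  # L2-normalize each embedding
similarity = emb[0] @ emb[1]                         # dot product of unit vectors = cosine
print(f"cosine similarity: {similarity.item():.4f}")
```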



## Evaluation Results

<!--- Describe how your model was evaluated -->

For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})


## Training
The model was trained with the parameters:

**DataLoader**:

`torch.utils.data.dataloader.DataLoader` of length 506 with parameters:
```
{'batch_size': 32, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
```

**Loss**:

`sentence_transformers.losses.CosineSimilarityLoss.CosineSimilarityLoss`

Parameters of the fit()-Method:
```
{
    "epochs": 4,
    "evaluation_steps": 50,
    "evaluator": "sentence_transformers.evaluation.EmbeddingSimilarityEvaluator.EmbeddingSimilarityEvaluator",
    "max_grad_norm": 1,
    "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
    "optimizer_params": {
        "lr": 2e-05
    },
    "scheduler": "WarmupLinear",
    "steps_per_epoch": null,
    "warmup_steps": 405,
    "weight_decay": 0.01
}
```
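
A hedged reconstruction of what a fit() call with these parameters could look like. The training data is not part of this commit, so train_examples below is only a placeholder pair, the EmbeddingSimilarityEvaluator over a dev set is omitted, and the base checkpoint is taken from AI/models/config.json (klue/roberta-base).

```python
# Sketch under stated assumptions: rebuild the training setup implied by the
# parameters above. train_examples is a placeholder; the real dataset is not included.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("klue/roberta-base")  # base model per AI/models/config.json

train_examples = [
    InputExample(texts=["sentence A", "sentence B"], label=0.8),  # placeholder pair
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=4,
    evaluation_steps=50,        # only meaningful with an evaluator, omitted here
    scheduler="WarmupLinear",
    warmup_steps=405,
    optimizer_params={"lr": 2e-05},
    weight_decay=0.01,
    max_grad_norm=1,
    output_path="./models",     # presumably where AI/model.py loads the model from
)
```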


## Full Model Architecture
```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': True}) with Transformer model: RobertaModel
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
```
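
For reference, the same two-module stack can be assembled explicitly with sentence_transformers.models. This sketch mirrors modules.json, sentence_bert_config.json, and 1_Pooling/config.json from this commit; it builds an untrained copy of the architecture rather than loading the trained weights.

```python
# Sketch (assumption): explicit Transformer + mean-pooling stack matching the
# architecture printed above, starting from the klue/roberta-base checkpoint.
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer(
    "klue/roberta-base", max_seq_length=256, do_lower_case=True
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),  # 768
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```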

## Citing & Authors

<!--- Describe where people can find more information -->
29 changes: 29 additions & 0 deletions AI/models/config.json
@@ -0,0 +1,29 @@
{
  "_name_or_path": "klue/roberta-base",
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.36.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}
7 changes: 7 additions & 0 deletions AI/models/config_sentence_transformers.json
@@ -0,0 +1,7 @@
{
  "__version__": {
    "sentence_transformers": "2.2.2",
    "transformers": "4.36.2",
    "pytorch": "2.0.0+cpu"
  }
}
45 changes: 45 additions & 0 deletions AI/models/eval/similarity_evaluation_sts-dev_results.csv
@@ -0,0 +1,45 @@
epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
0,50,0.8213558834738078,0.8232169418951444,0.7960443162793418,0.8059073638615916,0.7983373729106155,0.8080682267991697,0.7212937358126283,0.7356543370390979
0,100,0.8604063495395026,0.8524774968017809,0.8442028945043516,0.845487564474039,0.8443316192870001,0.8455816491737835,0.8351245841635585,0.828604777399964
0,150,0.876880588195492,0.867319439422618,0.8595302195800993,0.8589863803103351,0.8595293794267221,0.859014369575217,0.8579298413530259,0.8497061498737797
0,200,0.8874622581860445,0.8779408294631077,0.8741335415038562,0.873446072780437,0.8742320194905383,0.8734333212174922,0.8737387599841837,0.8640406898326938
0,250,0.8998911072146302,0.8932404108389441,0.8882516373010673,0.8894609576478131,0.8881477936466039,0.8895964433753458,0.8855084573685533,0.8778331050332694
0,300,0.9053480735441131,0.8982859751527836,0.8945325030211617,0.8961267904167957,0.8947878974184137,0.8965537037329793,0.8935626635810684,0.8851951230705846
0,350,0.9080900611412226,0.9005862887071884,0.8978376298865633,0.89831960573774,0.8979769743121653,0.8983667574393941,0.8941250997289429,0.8852259103671599
0,400,0.9089203129272925,0.9017823538201424,0.8975328694575475,0.8979906172759377,0.8974256075512494,0.8978544509621934,0.8993626418587692,0.8902685577895166
0,450,0.9100148839601779,0.9036438203915619,0.8988456132212452,0.8996695294581922,0.8989107348163481,0.8997896788924492,0.8925793078320046,0.8840176505227004
0,500,0.9091293510155692,0.900967838306071,0.897025504928038,0.8969233145401189,0.8969613126008412,0.8968674343265658,0.8907536077134277,0.8810413210733496
0,-1,0.9093999595823966,0.9014580723053609,0.8951307116692413,0.8963418687676254,0.8949511225765099,0.8962193691155657,0.8912864397818828,0.8827138200707191
1,50,0.9098958415825638,0.9026071137758258,0.8959905559134265,0.8980815553136738,0.895829007759678,0.8980499658153012,0.8983298552843442,0.88950634946343
1,100,0.9113061589355321,0.9042655402883096,0.8984767150326062,0.8994501609163165,0.8985144111654215,0.8994702814087927,0.8964503071325304,0.8870732689585261
1,150,0.9100862002363415,0.9029342114235758,0.8998470616425711,0.8994875958873592,0.8999197653674252,0.8996330501447899,0.8958547131430264,0.8856324376898586
1,200,0.9108590373029742,0.903057476186006,0.8988525404208976,0.9000881514540889,0.898805631024423,0.900045869530118,0.897157705875127,0.8871499988452332
1,250,0.9111987392881561,0.9055415833585166,0.9001171059653101,0.9033308746811642,0.900023747722682,0.903304681451952,0.8999071094842706,0.8918822032640259
1,300,0.9133735425293207,0.9053990544741953,0.8999597090440623,0.90179633752211,0.899850814678237,0.9016495741470723,0.8993635994761471,0.8894113878854429
1,350,0.9156699328588741,0.907892268273922,0.9019569035633918,0.9042916125607924,0.9018923413743889,0.9044277310680321,0.9015062830063499,0.8918961070256621
1,400,0.9163846156994926,0.907708815507214,0.9030782087359049,0.9042378095573449,0.9030396740527544,0.9042074632606978,0.9034620233810849,0.8925564358399269
1,450,0.9147554915390246,0.906892114021664,0.9025702127042678,0.9042587607789744,0.9023596521588005,0.9041098403037665,0.9023666619771951,0.8923460762702662
1,500,0.9159177849566856,0.9087732383522906,0.9056520534636011,0.906279951024877,0.9053479781852499,0.9060276673899722,0.9040158216330119,0.8935049214818698
1,-1,0.9155156435924803,0.9081267200189171,0.9038380514107146,0.905382845283694,0.9035465038826089,0.9051115502156257,0.9042877530248404,0.8940577181171764
2,50,0.9174646500486721,0.9100582430966995,0.9057666370461361,0.9071310137794958,0.9056321753609117,0.9071038189494635,0.9064782666204766,0.8963628959459086
2,100,0.9178690322067143,0.9103774085057775,0.9061887206298532,0.90679097672793,0.9060856452314302,0.9068009021009794,0.9062513854223387,0.8959950464320343
2,150,0.917511195448178,0.9104323757739448,0.9054339678865194,0.906710787509189,0.9051740030971264,0.9066257412662804,0.9053548431767278,0.8957341519983978
2,200,0.9168705050990384,0.9102359090719631,0.9044827132081518,0.9061221156761637,0.9043311704918802,0.9060578676816395,0.90239226617239,0.8933248523264196
2,250,0.9172578223584863,0.9105036562008504,0.9044593352133885,0.9063151350806621,0.904264759650337,0.9062764007635956,0.9054439937229677,0.8962336699965426
2,300,0.9168046789666718,0.9103056546487991,0.9030841277987778,0.9059281646914303,0.9028868379188827,0.9058810611088021,0.9050803285112552,0.8965990692130592
2,350,0.9173767679457552,0.9102573243605926,0.9053261024122603,0.9061604744497893,0.9050261197343531,0.9058984614099856,0.906276026079678,0.8962961706778504
2,400,0.9170215409107296,0.9103286908084964,0.9045717339082588,0.9060791842767477,0.9042784784235123,0.9058303977070038,0.9034026516611994,0.8939083204087628
2,450,0.9176180576347966,0.9111013007498091,0.9046564736217279,0.9070018076032861,0.9043559571395178,0.9068845159789399,0.9053441337705441,0.8963542468633958
2,500,0.9175804309032914,0.910972311783262,0.9049577550876283,0.907141624354291,0.9046417909939524,0.9070234750074041,0.9049206468609045,0.8957832206978227
2,-1,0.9174441148636763,0.9107327413253089,0.9048047744303681,0.9070424718055843,0.9044815593621378,0.9068967980462956,0.9043267145989402,0.8952630008247983
3,50,0.9175365345252344,0.9108241770174955,0.9048845715190894,0.9073817112523234,0.9046408485689816,0.9073091477767476,0.9049773943228198,0.8961400784924454
3,100,0.9179663557563542,0.9113891347555866,0.9060880899801388,0.9080287121097788,0.905854853026103,0.907811424520173,0.9064843435578074,0.8974639096695648
3,150,0.9182955623970648,0.9116088981519387,0.9061603883627756,0.9080784483963724,0.9060127981806363,0.9079494208538067,0.9059739375683986,0.8967601725507954
3,200,0.9183153160599816,0.911371225099685,0.9060338347795299,0.9076340026164751,0.9058621217995846,0.9076160809159833,0.906126876431413,0.8964431548546076
3,250,0.9180840731296855,0.9114594163979561,0.9056372728854591,0.9078520961689196,0.905474049239513,0.9078170740872004,0.9058184752797055,0.8966899336051473
3,300,0.9178286502507732,0.9114159810522678,0.9052806873237578,0.9075266035873079,0.9050487217730196,0.9074398515752434,0.9061460798081795,0.8971067821291782
3,350,0.9182063310615758,0.911645751063719,0.9053877567215127,0.9076447090997396,0.9051674309205792,0.9075767108033967,0.9066061727513136,0.8975103287089705
3,400,0.9182574735828971,0.9117431535152695,0.9052570456999282,0.9075097668719755,0.9050255464922876,0.9074616140483948,0.9066058743020591,0.8975567955378192
3,450,0.9183982738737914,0.9118205368336817,0.9056085282657284,0.9077127655519093,0.9053634166576782,0.9076180949938418,0.9069137357243465,0.8977486091611588
3,500,0.918400498393795,0.9117449106836713,0.9053827981293202,0.9075114253304576,0.9051418254241795,0.9074220600597875,0.9068004204110657,0.897609836546096
3,-1,0.9184001943260172,0.9117420721702916,0.9053768498671652,0.9075047435268219,0.9051358794669365,0.9074146599468511,0.9067969229042736,0.897609755286942
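
A minimal sketch for inspecting the dev results above, e.g. to find the checkpoint with the best cosine Spearman score; the path is the file added in this commit.

```python
# Sketch: summarize the dev evaluation CSV written during training.
import pandas as pd

df = pd.read_csv("AI/models/eval/similarity_evaluation_sts-dev_results.csv")
best = df.loc[df["cosine_spearman"].idxmax()]
print(best[["epoch", "steps", "cosine_spearman"]])
```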
3 changes: 3 additions & 0 deletions AI/models/model.safetensors
Git LFS file not shown
14 changes: 14 additions & 0 deletions AI/models/modules.json
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
4 changes: 4 additions & 0 deletions AI/models/sentence_bert_config.json
@@ -0,0 +1,4 @@
{
  "max_seq_length": 256,
  "do_lower_case": true
}
2 changes: 2 additions & 0 deletions AI/models/similarity_evaluation_sts-test_results.csv
@@ -0,0 +1,2 @@
epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
-1,-1,0.8423922245967316,0.840173399111646,0.8350402889850528,0.835533483492224,0.8347274295138905,0.8351793428867286,0.8290534626479439,0.8228540098597439