Merge pull request #1 from jiucchu/feat/215/setting
[AI/FEAT] Initial setup for API connection
jiucchu authored Jan 12, 2024
2 parents c9e4bcc + a19e9e4 commit 549347b
Showing 17 changed files with 64,644 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.safetensors filter=lfs diff=lfs merge=lfs -text
11 changes: 11 additions & 0 deletions AI/main.py
@@ -0,0 +1,11 @@
from typing import List
from pydantic import BaseModel

from fastapi import FastAPI, Depends, File, Request, APIRouter
import uvicorn
import requests

from model import sbert

app = FastAPI()
app.include_router(sbert)
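
main.py imports uvicorn but never calls it, so the application has to be started externally. A minimal sketch of a local launch, assuming the working directory is AI/ and that the host and port are free choices, not something fixed by this commit:

```python
# Minimal sketch: run the FastAPI app defined in AI/main.py locally.
# Host and port are illustrative assumptions, not part of the original commit.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
```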
116 changes: 116 additions & 0 deletions AI/model.py
@@ -0,0 +1,116 @@
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
import urllib.request
import torch
from pydantic import BaseModel
from typing import List

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

from fastapi import APIRouter

# Load the model
model_path = './models'
model = SentenceTransformer(model_path)


sbert = APIRouter(prefix='/sbert')

# Route setup
@sbert.get('/')
async def start_sbert():
    return {'message': "sbert is starting"}

class Item(BaseModel):
    name: str
    sentence: str

class Item_and_num(BaseModel):
    item_list: List[Item]
    min_n: int
    max_n: int

@sbert.post('/clustering')
async def start_sbert(data: Item_and_num):

    items = data.item_list
    min_num = data.min_n
    max_num = data.max_n

    # Sentence embedding
    corpus = [item.sentence for item in items]
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

    # Clustering (KMeans is already imported at module level)
    num_clusters = 4
    clustering_model = KMeans(n_clusters=num_clusters)
    clustering_model.fit(corpus_embeddings.cpu())
    cluster_assignment = clustering_model.labels_

    clustered_sentences = [[] for _ in range(num_clusters)]
    for sentence_id, cluster_id in enumerate(cluster_assignment):
        clustered_sentences[cluster_id].append(corpus[sentence_id])

    cluster_list = list()
    for i, cluster in enumerate(clustered_sentences):
        cluster_list.append(cluster)

    # Adjust group sizes
    copy_list = list()
    for cluster in cluster_list:
        copy_list.append(cluster)

    # Split groups that exceed the maximum size in half
    st_len = len(copy_list)
    for i in range(st_len):
        if len(copy_list[i]) > max_num:
            copy_list.append(copy_list[i][:(len(copy_list[i]) // 2)])
            copy_list.append(copy_list[i][(len(copy_list[i]) // 2):])
            copy_list[i] = []

    # Collect all singleton groups into one group
    st_len = len(copy_list)
    one_list = []
    for i in range(st_len):
        if len(copy_list[i]) == 1:
            one_list += copy_list[i]
            copy_list[i] = []

    copy_list.append(one_list)

    # Merge groups smaller than min_num with each other
    st_len = len(copy_list)
    for i in range(st_len - 1):
        for j in range(1, st_len - i - 1):
            team_check = copy_list[i] + copy_list[i + j]
            if (len(team_check) <= max_num) and (len(team_check) >= min_num):
                copy_list.append(team_check)
                copy_list[i + j] = []
                copy_list[i] = []

    # Drop the groups that were emptied above
    while [] in copy_list:
        copy_list.remove([])

    # Replace each sentence with the name of the member who wrote it
    for i in range(len(copy_list)):
        for j in range(len(copy_list[i])):
            target_value = copy_list[i][j]
            for item in items:
                if item.sentence == target_value:
                    copy_list[i][j] = item.name

    return copy_list
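
To illustrate the request shape that the Item_and_num schema expects, here is a minimal client sketch. The server address, member names, and sentences are made up for illustration; the printed result is only an example of the grouped-names output shape, not a guaranteed clustering.

```python
# Minimal sketch of a client call to the /sbert/clustering endpoint.
# Assumes the API is running on localhost:8000; payload contents are invented.
import requests

payload = {
    "item_list": [
        {"name": "Alice", "sentence": "I want to build a recommendation service."},
        {"name": "Bob", "sentence": "Interested in recommender systems and ranking."},
        {"name": "Carol", "sentence": "I would like to work on a mobile UI project."},
        {"name": "Dave", "sentence": "Frontend and mobile app design are my focus."},
    ],
    "min_n": 2,
    "max_n": 3,
}

resp = requests.post("http://localhost:8000/sbert/clustering", json=payload)
print(resp.json())  # e.g. [["Alice", "Bob"], ["Carol", "Dave"]]
```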
7 changes: 7 additions & 0 deletions AI/models/1_Pooling/config.json
@@ -0,0 +1,7 @@
{
  "word_embedding_dimension": 768,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false
}
126 changes: 126 additions & 0 deletions AI/models/README.md
@@ -0,0 +1,126 @@
---
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
- transformers

---

# {MODEL_NAME}

This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.

<!--- Describe your model here -->

## Usage (Sentence-Transformers)

Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('{MODEL_NAME}')
embeddings = model.encode(sentences)
print(embeddings)
```



## Usage (HuggingFace Transformers)
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.

```python
from transformers import AutoTokenizer, AutoModel
import torch


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
model = AutoModel.from_pretrained('{MODEL_NAME}')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings)
```
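
The card is tagged sentence-similarity, so a natural follow-up (an addition here, not part of the original card) is to score the two embeddings produced by the snippet above with cosine similarity:

```python
# Sketch: cosine similarity between the two sentence embeddings computed above.
import torch.nn.functional as F

emb = F.normalize(sentence_embeddings, p=2, dim=1)  # L2-normalize each embedding
similarity = emb[0] @ emb[1]                         # dot product of unit vectors = cosine
print(f"cosine similarity: {similarity.item():.4f}")
```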



## Evaluation Results

<!--- Describe how your model was evaluated -->

For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})


## Training
The model was trained with the parameters:

**DataLoader**:

`torch.utils.data.dataloader.DataLoader` of length 506 with parameters:
```
{'batch_size': 32, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
```

**Loss**:

`sentence_transformers.losses.CosineSimilarityLoss.CosineSimilarityLoss`

Parameters of the fit()-Method:
```
{
    "epochs": 4,
    "evaluation_steps": 50,
    "evaluator": "sentence_transformers.evaluation.EmbeddingSimilarityEvaluator.EmbeddingSimilarityEvaluator",
    "max_grad_norm": 1,
    "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
    "optimizer_params": {
        "lr": 2e-05
    },
    "scheduler": "WarmupLinear",
    "steps_per_epoch": null,
    "warmup_steps": 405,
    "weight_decay": 0.01
}
```
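
A hedged reconstruction of what a fit() call with these parameters could look like. The training data is not part of this commit, so train_examples below is only a placeholder pair, the EmbeddingSimilarityEvaluator over a dev set is omitted, and the base checkpoint is taken from AI/models/config.json (klue/roberta-base).

```python
# Sketch under stated assumptions: rebuild the training setup implied by the
# parameters above. train_examples is a placeholder; the real dataset is not included.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("klue/roberta-base")  # base model per AI/models/config.json

train_examples = [
    InputExample(texts=["sentence A", "sentence B"], label=0.8),  # placeholder pair
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=4,
    evaluation_steps=50,        # only meaningful with an evaluator, omitted here
    scheduler="WarmupLinear",
    warmup_steps=405,
    optimizer_params={"lr": 2e-05},
    weight_decay=0.01,
    max_grad_norm=1,
    output_path="./models",     # presumably where AI/model.py loads the model from
)
```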


## Full Model Architecture
```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': True}) with Transformer model: RobertaModel
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
```
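
For reference, the same two-module stack can be assembled explicitly with sentence_transformers.models. This sketch mirrors modules.json, sentence_bert_config.json, and 1_Pooling/config.json from this commit; it builds an untrained copy of the architecture rather than loading the trained weights.

```python
# Sketch (assumption): explicit Transformer + mean-pooling stack matching the
# architecture printed above, starting from the klue/roberta-base checkpoint.
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer(
    "klue/roberta-base", max_seq_length=256, do_lower_case=True
)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),  # 768
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```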

## Citing & Authors

<!--- Describe where people can find more information -->
29 changes: 29 additions & 0 deletions AI/models/config.json
@@ -0,0 +1,29 @@
{
  "_name_or_path": "klue/roberta-base",
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.36.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}
7 changes: 7 additions & 0 deletions AI/models/config_sentence_transformers.json
@@ -0,0 +1,7 @@
{
  "__version__": {
    "sentence_transformers": "2.2.2",
    "transformers": "4.36.2",
    "pytorch": "2.0.0+cpu"
  }
}
45 changes: 45 additions & 0 deletions AI/models/eval/similarity_evaluation_sts-dev_results.csv
@@ -0,0 +1,45 @@
epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
0,50,0.8213558834738078,0.8232169418951444,0.7960443162793418,0.8059073638615916,0.7983373729106155,0.8080682267991697,0.7212937358126283,0.7356543370390979
0,100,0.8604063495395026,0.8524774968017809,0.8442028945043516,0.845487564474039,0.8443316192870001,0.8455816491737835,0.8351245841635585,0.828604777399964
0,150,0.876880588195492,0.867319439422618,0.8595302195800993,0.8589863803103351,0.8595293794267221,0.859014369575217,0.8579298413530259,0.8497061498737797
0,200,0.8874622581860445,0.8779408294631077,0.8741335415038562,0.873446072780437,0.8742320194905383,0.8734333212174922,0.8737387599841837,0.8640406898326938
0,250,0.8998911072146302,0.8932404108389441,0.8882516373010673,0.8894609576478131,0.8881477936466039,0.8895964433753458,0.8855084573685533,0.8778331050332694
0,300,0.9053480735441131,0.8982859751527836,0.8945325030211617,0.8961267904167957,0.8947878974184137,0.8965537037329793,0.8935626635810684,0.8851951230705846
0,350,0.9080900611412226,0.9005862887071884,0.8978376298865633,0.89831960573774,0.8979769743121653,0.8983667574393941,0.8941250997289429,0.8852259103671599
0,400,0.9089203129272925,0.9017823538201424,0.8975328694575475,0.8979906172759377,0.8974256075512494,0.8978544509621934,0.8993626418587692,0.8902685577895166
0,450,0.9100148839601779,0.9036438203915619,0.8988456132212452,0.8996695294581922,0.8989107348163481,0.8997896788924492,0.8925793078320046,0.8840176505227004
0,500,0.9091293510155692,0.900967838306071,0.897025504928038,0.8969233145401189,0.8969613126008412,0.8968674343265658,0.8907536077134277,0.8810413210733496
0,-1,0.9093999595823966,0.9014580723053609,0.8951307116692413,0.8963418687676254,0.8949511225765099,0.8962193691155657,0.8912864397818828,0.8827138200707191
1,50,0.9098958415825638,0.9026071137758258,0.8959905559134265,0.8980815553136738,0.895829007759678,0.8980499658153012,0.8983298552843442,0.88950634946343
1,100,0.9113061589355321,0.9042655402883096,0.8984767150326062,0.8994501609163165,0.8985144111654215,0.8994702814087927,0.8964503071325304,0.8870732689585261
1,150,0.9100862002363415,0.9029342114235758,0.8998470616425711,0.8994875958873592,0.8999197653674252,0.8996330501447899,0.8958547131430264,0.8856324376898586
1,200,0.9108590373029742,0.903057476186006,0.8988525404208976,0.9000881514540889,0.898805631024423,0.900045869530118,0.897157705875127,0.8871499988452332
1,250,0.9111987392881561,0.9055415833585166,0.9001171059653101,0.9033308746811642,0.900023747722682,0.903304681451952,0.8999071094842706,0.8918822032640259
1,300,0.9133735425293207,0.9053990544741953,0.8999597090440623,0.90179633752211,0.899850814678237,0.9016495741470723,0.8993635994761471,0.8894113878854429
1,350,0.9156699328588741,0.907892268273922,0.9019569035633918,0.9042916125607924,0.9018923413743889,0.9044277310680321,0.9015062830063499,0.8918961070256621
1,400,0.9163846156994926,0.907708815507214,0.9030782087359049,0.9042378095573449,0.9030396740527544,0.9042074632606978,0.9034620233810849,0.8925564358399269
1,450,0.9147554915390246,0.906892114021664,0.9025702127042678,0.9042587607789744,0.9023596521588005,0.9041098403037665,0.9023666619771951,0.8923460762702662
1,500,0.9159177849566856,0.9087732383522906,0.9056520534636011,0.906279951024877,0.9053479781852499,0.9060276673899722,0.9040158216330119,0.8935049214818698
1,-1,0.9155156435924803,0.9081267200189171,0.9038380514107146,0.905382845283694,0.9035465038826089,0.9051115502156257,0.9042877530248404,0.8940577181171764
2,50,0.9174646500486721,0.9100582430966995,0.9057666370461361,0.9071310137794958,0.9056321753609117,0.9071038189494635,0.9064782666204766,0.8963628959459086
2,100,0.9178690322067143,0.9103774085057775,0.9061887206298532,0.90679097672793,0.9060856452314302,0.9068009021009794,0.9062513854223387,0.8959950464320343
2,150,0.917511195448178,0.9104323757739448,0.9054339678865194,0.906710787509189,0.9051740030971264,0.9066257412662804,0.9053548431767278,0.8957341519983978
2,200,0.9168705050990384,0.9102359090719631,0.9044827132081518,0.9061221156761637,0.9043311704918802,0.9060578676816395,0.90239226617239,0.8933248523264196
2,250,0.9172578223584863,0.9105036562008504,0.9044593352133885,0.9063151350806621,0.904264759650337,0.9062764007635956,0.9054439937229677,0.8962336699965426
2,300,0.9168046789666718,0.9103056546487991,0.9030841277987778,0.9059281646914303,0.9028868379188827,0.9058810611088021,0.9050803285112552,0.8965990692130592
2,350,0.9173767679457552,0.9102573243605926,0.9053261024122603,0.9061604744497893,0.9050261197343531,0.9058984614099856,0.906276026079678,0.8962961706778504
2,400,0.9170215409107296,0.9103286908084964,0.9045717339082588,0.9060791842767477,0.9042784784235123,0.9058303977070038,0.9034026516611994,0.8939083204087628
2,450,0.9176180576347966,0.9111013007498091,0.9046564736217279,0.9070018076032861,0.9043559571395178,0.9068845159789399,0.9053441337705441,0.8963542468633958
2,500,0.9175804309032914,0.910972311783262,0.9049577550876283,0.907141624354291,0.9046417909939524,0.9070234750074041,0.9049206468609045,0.8957832206978227
2,-1,0.9174441148636763,0.9107327413253089,0.9048047744303681,0.9070424718055843,0.9044815593621378,0.9068967980462956,0.9043267145989402,0.8952630008247983
3,50,0.9175365345252344,0.9108241770174955,0.9048845715190894,0.9073817112523234,0.9046408485689816,0.9073091477767476,0.9049773943228198,0.8961400784924454
3,100,0.9179663557563542,0.9113891347555866,0.9060880899801388,0.9080287121097788,0.905854853026103,0.907811424520173,0.9064843435578074,0.8974639096695648
3,150,0.9182955623970648,0.9116088981519387,0.9061603883627756,0.9080784483963724,0.9060127981806363,0.9079494208538067,0.9059739375683986,0.8967601725507954
3,200,0.9183153160599816,0.911371225099685,0.9060338347795299,0.9076340026164751,0.9058621217995846,0.9076160809159833,0.906126876431413,0.8964431548546076
3,250,0.9180840731296855,0.9114594163979561,0.9056372728854591,0.9078520961689196,0.905474049239513,0.9078170740872004,0.9058184752797055,0.8966899336051473
3,300,0.9178286502507732,0.9114159810522678,0.9052806873237578,0.9075266035873079,0.9050487217730196,0.9074398515752434,0.9061460798081795,0.8971067821291782
3,350,0.9182063310615758,0.911645751063719,0.9053877567215127,0.9076447090997396,0.9051674309205792,0.9075767108033967,0.9066061727513136,0.8975103287089705
3,400,0.9182574735828971,0.9117431535152695,0.9052570456999282,0.9075097668719755,0.9050255464922876,0.9074616140483948,0.9066058743020591,0.8975567955378192
3,450,0.9183982738737914,0.9118205368336817,0.9056085282657284,0.9077127655519093,0.9053634166576782,0.9076180949938418,0.9069137357243465,0.8977486091611588
3,500,0.918400498393795,0.9117449106836713,0.9053827981293202,0.9075114253304576,0.9051418254241795,0.9074220600597875,0.9068004204110657,0.897609836546096
3,-1,0.9184001943260172,0.9117420721702916,0.9053768498671652,0.9075047435268219,0.9051358794669365,0.9074146599468511,0.9067969229042736,0.897609755286942
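
A minimal sketch for inspecting the dev results above, e.g. to find the checkpoint with the best cosine Spearman score; the path is the file added in this commit.

```python
# Sketch: summarize the dev evaluation CSV written during training.
import pandas as pd

df = pd.read_csv("AI/models/eval/similarity_evaluation_sts-dev_results.csv")
best = df.loc[df["cosine_spearman"].idxmax()]
print(best[["epoch", "steps", "cosine_spearman"]])
```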
3 changes: 3 additions & 0 deletions AI/models/model.safetensors
Git LFS file not shown
14 changes: 14 additions & 0 deletions AI/models/modules.json
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
4 changes: 4 additions & 0 deletions AI/models/sentence_bert_config.json
@@ -0,0 +1,4 @@
{
  "max_seq_length": 256,
  "do_lower_case": true
}
2 changes: 2 additions & 0 deletions AI/models/similarity_evaluation_sts-test_results.csv
@@ -0,0 +1,2 @@
epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
-1,-1,0.8423922245967316,0.840173399111646,0.8350402889850528,0.835533483492224,0.8347274295138905,0.8351793428867286,0.8290534626479439,0.8228540098597439