-
Notifications
You must be signed in to change notification settings - Fork 0
/
BERT_colab.py
65 lines (47 loc) · 1.9 KB
/
BERT_colab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
"""142project.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1SoXREjCqnJG3zfYju0utrwPO7fhYv-7z
"""
# NOTE: '!' lines are IPython/Colab shell escapes — this file only runs as a
# notebook cell, not as plain Python.
!pip3 install simpletransformers
# Commented out IPython magic to ensure Python compatibility.
# %%writefile setup.sh
#
# git clone https://github.com/NVIDIA/apex
# pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex
# Runs the setup.sh written by the (commented-out) %%writefile cell above:
# builds NVIDIA apex with CUDA/C++ extensions for mixed-precision training.
!sh setup.sh
import logging
import re

import pandas as pd
from bs4 import BeautifulSoup
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
# Show INFO-level progress from training, but quiet the very chatty
# HuggingFace 'transformers' logger down to WARNING.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)
def dataForBert(data):
    """Clean review text and pair it with its label for simpletransformers.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'review' column (HTML-laden strings) and a
        'sentiment' column (labels).

    Returns
    -------
    pandas.DataFrame
        Two unnamed columns — cleaned review text (col 0) and label
        (col 1) — the [text, label] layout ClassificationModel expects.
    """
    clean_list = []
    for review, label in zip(data['review'], data['sentiment']):
        # Strip HTML markup; the explicit 'html.parser' pins the stdlib
        # parser (avoids bs4's GuessedAtParserWarning and makes output
        # independent of which optional parsers are installed).
        clean_review = BeautifulSoup(review, 'html.parser').get_text()
        # Drop literal double quotes left in the text because the TSV is
        # read with quoting=3 (QUOTE_NONE).
        clean_review = re.sub('["]', '', clean_review)
        clean_list.append([clean_review, label])
    return pd.DataFrame(clean_list)
# Load the labeled IMDB review TSV (Kaggle "Bag of Words Meets Bags of
# Popcorn" format — TODO confirm). quoting=3 is csv.QUOTE_NONE, so embedded
# double quotes survive into the text; dataForBert strips them later.
data = pd.read_csv('labeledTrainData.tsv', header=0, quoting=3, delimiter='\t')
# 80/20 random split. NOTE(review): no random_state, so the split (and any
# reported accuracy) differs on every run — consider fixing a seed.
train_df, test_df = train_test_split(data, test_size=0.2)
# Convert each split into the two-column [text, label] frame the model expects.
train_data = dataForBert(train_df)
test_data = dataForBert(test_df)
print(f'Train data has {len(train_data)} entries')
print(f'Test data has {len(test_data)} entries')
# Training configuration passed to simpletransformers' ClassificationModel.
model_args = {
    'reprocess_input_data': True,   # re-tokenize input instead of reusing cached features
    'overwrite_output_dir': True,   # let repeated runs write into the same output dir
    'sliding_window': True,         # split reviews longer than the max sequence length
    'num_train_epochs': 1,
    'do_lower_case': False,         # roberta-base is case-sensitive
}
# Fine-tune pretrained roberta-base for (binary) sentiment classification.
# NOTE(review): downloads the pretrained weights on first run and defaults
# to CUDA — pass use_cuda=False if no GPU is available; confirm in Colab.
model = ClassificationModel('roberta', 'roberta-base', args=model_args)
model.train_model(train_data)
# accuracy_score is imported but unused here; classification_report is
# forwarded to eval_model as an extra metric.
from sklearn.metrics import accuracy_score, classification_report
# eval_model treats extra kwargs as metric callables applied to
# (true_labels, predictions); the result dict is keyed by the kwarg name,
# so result['acc'] holds the sklearn classification report for the test set.
result, model_outputs, wrong_predictions = model.eval_model(test_data, acc=classification_report)
print(result['acc'])