forked from MM-IR/rank4_NLP_textclassification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
easyEDA.py
27 lines (24 loc) · 815 Bytes
/
easyEDA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
train_df = pd.read_csv('./train_set.csv', sep='\t')
test_df = pd.read_csv('./test_a.csv',sep='\t')
train_df['len_text'] = train_df['text'].apply(lambda x: len(x.split()))
test_df['len_text'] = train_df['text'].apply(lambda x: len(x.split()))
# length
fig, ax = plt.subplots(1,1,figsize=(12,6))
ax = plt.hist(x=train_df['len_text'],bins=100,label='train')
ax = plt.hist(x=test_df['len_text'],bins=100,label='test')
plt.axis('tight')
plt.xlabel("length of sample")
plt.ylabel("count")
plt.legend()
# seaborn length
import seaborn as sns
plt.figure(figsize=(15,5))
ax = sns.distplot(train_df['len_text'],bins=100)
ax = sns.distplot(test_df['len_text'],bins=100)
plt.axis('tight')
plt.xlabel('length of sample')
plt.ylabel('count')
plt.legend()