-
Notifications
You must be signed in to change notification settings - Fork 0
/
stats.py
85 lines (68 loc) · 2.46 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import cm as cm
def ed2cen(ed):
cen = .5*(ed[1:]+ed[:-1])
return cen
data = np.loadtxt('raw.dat')
stars = data[:,0]
approval_ratio = data[:,1]/data[:,2]
bins = [i+.5 for i in range(0,11)]
H, ed = np.histogram(stars, bins=bins)#, density=True)
mean_confidence=np.empty(10)
mean_weight=np.empty(10)
for i in range(1,11):
mask=data[:,0]==int(i)
mean_confidence[i-1] = np.mean(approval_ratio[mask])
mean_weight[i-1] = np.mean(data[:,2][mask])
print(np.min(mean_weight), np.max(mean_weight))
#fig, (ax,ax2) = plt.subplots(2,1)
##l1 = ax.plot(ed2cen(ed), H, 'o', label="1")
#l1 = ax.bar(ed2cen(ed), H,facecolor='#9999ff', edgecolor='white', label="1")
##ax2 = ax.twinx()
#l2 = ax2.scatter(ed2cen(ed), mean_confidence, marker='s', s=mean_weight,
# color='r', label='2')
fig, ax = plt.subplots(1,1)
l1 = ax.bar(ed2cen(ed), H,facecolor='#9999ff', edgecolor='white', label="Reviews")
#plt.legend(bbox_to_anchor=(0., 1.0), loc=3, mode="expand", borderaxespad=0.,
# frameon=False)
plt.legend(bbox_to_anchor=(1.1, 0.9), loc=4, mode="expand", borderaxespad=0.,
frameon=False)
ax2 = ax.twinx()
l2 = ax2.scatter(ed2cen(ed), mean_confidence, marker='s', s=mean_weight,
color='r', label='Approval')
#plt.legend(bbox_to_anchor=(0.3, 1.0), loc=3, mode="expand", borderaxespad=0.,
# frameon=False)
plt.legend(bbox_to_anchor=(1.1, 0.8), loc=4, mode="expand", borderaxespad=0.,
frameon=False)
#ax2.fill_between(ed2cen(ed),mean_confidence)
ax2.set_ylabel("Approval")
ax2.set_xlabel("Stars")
ax2.text(0.65,.4,"Approval := Usefulness/Total\nSize := Total")
ax.set_ylabel("#Reviews")
fig.suptitle("Star Wars - The Force Awakens IMDB Reviews")
#plt.legend()
#ax2.plot(ed2cen(ed), mean_weight, 'g^')
#ax.set_label("1")
#ax2.set_label("2", )
mask=data[:,0]<6
bad = len(data[mask])
mask=data[:,0]>6
good = len(data[mask])
mask=data[:,0]==6
even = len(data[mask])
print("TOTAL = ", bad+good+even)
print("bad = %i, even = %i, good = %i" % (bad, even, good))
average=0
for i in range(1,11): average+=i*H[i-1]
print("average = %f" % (average/(bad+good+even)) )
fig2, ax3 = plt.subplots(1,1)
J, st_ed, ap_ed = np.histogram2d(stars,approval_ratio )
X, Y = np.meshgrid(st_ed, ap_ed)
M = ax3.pcolormesh(X, Y, J)
ax3.set_xlabel("Stars")
ax3.set_ylabel("Approval")
cb = plt.colorbar(M)
cb.set_label("Reviews")
plt.title("Stars vs Approval review distribution")
plt.show()