-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepdata.py
129 lines (114 loc) · 3.89 KB
/
prepdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import codecs
import ast
import numpy as np
import operator
from scipy.io import savemat
# Metadata dump to process; each line is later parsed with ast.literal_eval().
filename = "media_metadata2.txt"
# The context manager closes the file on exit, so no explicit close() is
# needed. Whitespace-only lines (e.g. a trailing blank line) are dropped
# because ast.literal_eval() raises SyntaxError on them.
with codecs.open(filename, "r", "utf-8") as f:
    lines = [line for line in f.readlines() if line.strip()]
# First pass: collect the raw 'signed', 'dating' and 'medium' values of every
# record that carries the corresponding key.
alldating = []
allsigned = []
allmedium = []
for row_no, raw_line in enumerate(lines):
    # Progress output: index of the record being parsed.
    print(row_no)
    record = ast.literal_eval(raw_line)
    for key, bucket in (('signed', allsigned),
                        ('dating', alldating),
                        ('medium', allmedium)):
        if key in record:
            bucket.append(record[key])
# Second pass: count how often each distinct 'signed', 'dating' and 'medium'
# value occurs, and record the artist of every piece (duplicates included).
datingdict = {}
signeddict = {}
mediumdict = {}
artistlist = []
for row_no, raw_line in enumerate(lines):
    # Progress output: index of the record being parsed.
    print(row_no)
    record = ast.literal_eval(raw_line)
    artistlist.append(record['artist'])
    for key, tally in (('signed', signeddict),
                       ('dating', datingdict),
                       ('medium', mediumdict)):
        if key in record:
            value = record[key]
            # A value seen for the first time starts from zero.
            tally[value] = tally.get(value, 0) + 1
# Rank the values of each field from most to least frequent.
sorted_signed = sorted(signeddict.items(), key=operator.itemgetter(1), reverse=True)
sorted_dating = sorted(datingdict.items(), key=operator.itemgetter(1), reverse=True)
sorted_medium = sorted(mediumdict.items(), key=operator.itemgetter(1), reverse=True)
# Report the frequency of every 'signed' and every 'dating' value.
for value, count in sorted_signed:
    print(value + ': ' + str(count))
for value, count in sorted_dating:
    print(value + ': ' + str(count))
# Keep (and report) only the mediums with at least 100 pieces. The ranking is
# descending, so the first medium below the threshold ends the scan.
mediumlist = []
for value, count in sorted_medium:
    if count < 100:
        break
    mediumlist.append(value)
    print(value + ': ' + str(count))
# Deduplicate the artists in first-seen order. Going through set() alone made
# the order depend on string hashing, which changes between interpreter runs,
# so the artist feature columns were not reproducible from run to run.
seen_artists = set()
unique_artists = []
for artist in artistlist:
    if artist not in seen_artists:
        seen_artists.add(artist)
        unique_artists.append(artist)
artistlist = unique_artists
# Should contain 250 artists
print('# of artists: ' + str(len(artistlist)))
# Should contain about 15 mediums
print('# of mediums: ' + str(len(mediumlist)))
# Feature columns: 'Signed' and 'Dating' first, then one indicator column per
# artist and one per retained medium.
featlist = ['Signed', 'Dating'] + artistlist + mediumlist
# All-zero template row; every piece starts from a copy of it.
featdict = {el: 0 for el in featlist}
# Input matrix: one feature row per kept piece
inputmat = []
# Price classes, tested as half-open [low, high) intervals
classes = [[0,101],[101,196],[196,321],[321,499],[499,760],[760,1202],[1202,2033],[2033,3857],[3857,9643],[9643,60130038]]
# Class matrix: one one-hot label row per kept piece
classmat = []
# Final pass over the records: build a feature row and a label row per piece.
for ind, dic in enumerate(lines):
    # Progress output: index of the record being parsed.
    print(ind)
    tempdict = ast.literal_eval(dic)
    # The year is read from the last four characters of 'dating'. Pieces whose
    # dating is missing (KeyError), not a sliceable/convertible value
    # (TypeError) or not numeric (ValueError) are skipped instead of aborting
    # the whole run.
    try:
        date = int(tempdict['dating'][-4:])
    except (KeyError, TypeError, ValueError):
        continue
    # Pieces without a medium, or with a medium that was not retained, are
    # skipped too. Checking before the copy below avoids building a row that
    # would be thrown away.
    if 'medium' not in tempdict or tempdict['medium'] not in mediumlist:
        continue
    piecedict = dict(featdict)
    # 'Dating' holds the difference between 2017 and the parsed year.
    piecedict['Dating'] = 2017 - date
    # 'Signed' is 1 only for an explicit 'yes'; a missing key or any other
    # value gives 0.
    piecedict['Signed'] = 1 if tempdict.get('signed') == 'yes' else 0
    # Switch on the indicator columns of this piece's artist and medium.
    piecedict[tempdict['artist']] = 1
    piecedict[tempdict['medium']] = 1
    # One-hot encode the price class.
    # NOTE(review): a price outside every interval leaves the label all
    # zeros -- confirm that this is intended.
    price_label = tempdict['sell_price_adjusted']
    yvec = np.zeros((len(classes),))
    for i, (low, high) in enumerate(classes):
        if low <= price_label < high:
            yvec[i] = 1
    # Flatten the row; every row is a copy of the same template, so the
    # values come out in the same key order for each piece.
    piecelist = list(piecedict.values())
    inputmat.append(piecelist)
    classmat.append(yvec)
# Write the feature rows and label rows to a single MATLAB file, stored under
# the variable names 'xarr' and 'yarr'.
matlab_payload = {'xarr': inputmat, 'yarr' : classmat}
savemat('NNdata.mat', mdict=matlab_payload)
# Write the same two collections as separate NumPy files.
for npy_path, samples in (('xdata.npy', inputmat), ('ydata.npy', classmat)):
    np.save(npy_path, samples)