diff --git a/Clusters.py b/Clusters.py index ac8c23d..cca3512 100644 --- a/Clusters.py +++ b/Clusters.py @@ -4,21 +4,15 @@ @author: Vadim Shkaberda """ -import os -os.chdir(os.getcwd() + '\\Silpo') - - -#%% - -#from matplotlib import cm +from load_data import DBConnect from matplotlib import rc -#from mpl_toolkits.mplot3d import Axes3D +from mpl_toolkits.mplot3d import Axes3D +from sklearn.decomposition import PCA +from sklearn.cluster import DBSCAN, KMeans +from sklearn.metrics import silhouette_score from sklearn.preprocessing import scale -#from sklearn.linear_model import LinearRegression, Ridge -#from sklearn.pipeline import Pipeline from time import time -import csv_loader import matplotlib.patches as mpatches import matplotlib.pyplot as plt import numpy as np @@ -29,28 +23,34 @@ rc('font', **font) # Loading data -data = np.genfromtxt('CSV/Clusters_Pal_Silpo_2017.csv', - delimiter=';') +business = '' # business name -# Loading references -to_load = ('regions', 'RC', 'filials', 'post_city_id') +with DBConnect() as dbc: + data = dbc.get_data(3, business) -for current in to_load: - globals()['{}'.format(current)] = {} - csv_loader.load_csv(globals()['{}'.format(current)], 'CSV/' + current + '.csv') + # Loading lists of id + to_load = ('regions', 'filials') -#%% + for cur_load in to_load: + globals()['{}'.format(cur_load)] = dbc.get_id_lists(cur_load) + +# number of monthes in data +feautures_num = data.shape[1] - 3 # number of ID after data import -def plot_clusters(n_clusters, title): +#%% +def plot_clusters(n_clusters, reduced_data, labels, title): + ''' 2D plot of clustered data. + Input: + n_clusters: int - number of clusters; + reduced_data: (x, 2) shape array - data to plot; + labels: (x) shape array - list of clusters; + title: string - title of plot. + ''' fig = plt.gcf() fig.set_size_inches(15, 10) plt.figure(1) plt.clf() - #plt.imshow(Z, interpolation='nearest', - # extent=(xx.min(), xx.max(), yy.min(), yy.max()), - # cmap=plt.cm.Paired, - # aspect='auto', origin='lower') plt.scatter(reduced_data[:-n_clusters, 0], reduced_data[:-n_clusters, 1], @@ -64,10 +64,11 @@ def plot_clusters(n_clusters, title): # Plot the centroids as a numbers for i in range(reduced_data.shape[0]-n_clusters, reduced_data.shape[0]): - plt.text(reduced_data[i, 0], reduced_data[i, 1], str(labels[i]), + plt.text(reduced_data[i, 0], reduced_data[i, 1], + str(labels[i]), color=plt.cm.gist_ncar(labels[i] / float(n_clusters)), fontdict={'weight': 'bold', 'size': 12}) - + # Plot specific filial names/ID's # for r_data, filID, label in zip(reduced_data, filIDs, labels): # if label == 3: @@ -78,58 +79,131 @@ def plot_clusters(n_clusters, title): plt.title(title + u' кластеризация\n' + \ u'Цифры являются центроидами\n' u'Кол-во кластеров: {}'.format(n_clusters)) -# plt.xlim(-4, 4) -# plt.ylim(-5, 3) - #plt.xticks(()) - #plt.yticks(()) + plt.show() #plt.savefig('KMeans_{}_PCA_scaled.png'.format(clusters)) plt.close(fig) -#%% -# fit to an order-3 polynomial data +def plot_clusters_3D(n_clusters, reduced_data, labels, title, region=None): + ''' 3D plot of clustered data. + Input: + n_clusters: int - number of clusters; + reduced_data: (x, 3) shape array - data to plot; + labels: (x) shape array - list of clusters; + title: string - title of plot; + region: int - region to plot, uses array "outliers". If mentioned, wull + be plotted data with mask outliers[region], otherwise - + all data from reduced_data will be plotted. 
+ ''' + # 3-d plot + fig = plt.gcf() + fig.set_size_inches(15, 10) + plt.clf() -# closed for Thrash -stable = np.ma.masked_equal(data[:, 2], 1) -data_scaled_all = scale(data[:, 3:], axis=1) -data_cleared = data[np.where(~(data[:, 2] == 0))] + # angle from which we will be looking at plot + ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134) + #ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=4, azim=54) + #ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=38, azim=-74) + + if region: + ax.scatter(reduced_data[outliers[region].mask][:, 0], + reduced_data[outliers[region].mask][:, 1], + reduced_data[outliers[region].mask][:, 2], + c=plt.cm.gist_ncar(labels[:-n_clusters] / float(n_clusters)) + ) + + ax.scatter(reduced_data[~outliers[region].mask][:, 0], + reduced_data[~outliers[region].mask][:, 1], + reduced_data[~outliers[region].mask][:, 2], + c='k', marker='x') + else: + ax.scatter(reduced_data[outliers.mask][:, 0], + reduced_data[outliers.mask][:, 1], + reduced_data[outliers.mask][:, 2], + c=plt.cm.gist_ncar(labels[:-n_clusters] / float(n_clusters)) + ) + + ax.scatter(reduced_data[~outliers.mask][:, 0], + reduced_data[~outliers.mask][:, 1], + reduced_data[~outliers.mask][:, 2], + c='k', marker='x') -#data_diff = np.diff(data_cleared[:, 2:-2]) / data_cleared[:, 2:-3] + plt.title(title + u' кластеризация\n' + \ + u'Цифры являются центроидами\n' + u'Кол-во кластеров: {}'.format(n_clusters)) + + plt.show() + plt.close(fig) -# closed for Thrash -data_scaled = data_scaled_all[stable.mask, :] +#%% +# identify satable data +stable = np.ma.masked_equal(data[:, 2], 1) +data_scaled = scale(data[stable.mask, 3:], axis=1) +data_cleared = data[np.where(data[:, 2] == 1)] -# use for Thrash -#data_cleared = data[np.where(~(data[:, 2:-2] == 0).any(axis=1))[0], :] -#data_scaled = scale(data_cleared[:, 2:-2], axis=1) +# not used at this time +#data_scaled = data_scaled_all[stable.mask, :] + +# staorage for data after PCA decomposition reduced_data = {} -#data_normalized = normalize(data_cleared[:, 2:], axis=1) + #%% -from sklearn.decomposition import PCA -from sklearn.cluster import KMeans -from sklearn.metrics import silhouette_score -from sklearn.cluster import DBSCAN +def plot_rc_bindings(): + ''' Function to plot all filials with color, respectively to binded RC. 
+ ''' + LABEL_COLOR_MAP = {1 : 'red', + 2 : 'blue', + 3 : 'green', + 4 : 'purple', + 5 : 'orange'} + label_color = [LABEL_COLOR_MAP[l] for l in data_cleared[:, 1]] + + fig = plt.gcf() + fig.set_size_inches(15, 10) + plt.scatter(reduced_data[0][:, 0], reduced_data[0][:, 1], s=5, color=label_color) + + # labels for lowest 6 filials + #lowest = np.where(reduced_data[:, 1]<-3.5) + #for low in lowest[0]: + # skew = 0.05 if data_cleared[low, 0] != 518 else -0.25 + # plt.text(reduced_data[low, 0]+skew, reduced_data[low, 1]+0.05, str(int(data_cleared[low, 0])), + # #color=plt.cm.spectral(labels[i] / float(n_clusters)), + # fontdict={'weight': 'bold', 'size': 10}) + + # Legend + classes = [] + recs = [] + for i in LABEL_COLOR_MAP.keys(): + recs.append(mpatches.Rectangle((0,0),1,1,fc=LABEL_COLOR_MAP[i])) + classes.append(regions[i]) + plt.legend(recs, classes, loc=4) + + plt.title(u'Распределение филиалов по привязкам к РЦ.\n' + u'Регионы (Бизнес --)') + plt.show() + plt.close(fig) #%% -# detecting filials far away -db = DBSCAN(eps=2.8, min_samples=2, algorithm='brute').fit(data_scaled) -outliers = np.ma.masked_not_equal(db.labels_, -1) +################################################# +# Part for clustering without region separation # +################################################# +# detecting outliers - filials far away, which have less than 2 neighbors +db = DBSCAN(eps=2.5, min_samples=3, algorithm='brute').fit(data_scaled) +outliers = np.ma.masked_not_equal(db.labels_, -1) # for plotting outliers after DBSCAN reduced_data[0] = PCA(n_components=2).fit_transform(data_scaled) - #%% - - +# plot data with outliers fig = plt.gcf() fig.set_size_inches(15, 10) @@ -152,88 +226,19 @@ def plot_clusters(n_clusters, title): plt.legend(handles=(fils, fils_out)) plt.title(u'Филиалы-outliers, не подлежащие дальнейшей кластеризации\n' - u'(Бизнес Фора)') - -plt.show() -plt.close(fig) - -#%% - -LABEL_COLOR_MAP = {1 : 'red', - 2 : 'blue', - 3 : 'green', - 4 : 'purple', - 5 : 'orange'} - -label_color = [LABEL_COLOR_MAP[l] for l in data_cleared[:, 1]] - -fig = plt.gcf() -fig.set_size_inches(15, 10) -plt.scatter(reduced_data[0][:, 0], reduced_data[0][:, 1], s=5, color=label_color) - -# labels for lowest 6 filials -#lowest = np.where(reduced_data[:, 1]<-3.5) -#for low in lowest[0]: -# skew = 0.05 if data_cleared[low, 0] != 518 else -0.25 -# plt.text(reduced_data[low, 0]+skew, reduced_data[low, 1]+0.05, str(int(data_cleared[low, 0])), -# #color=plt.cm.spectral(labels[i] / float(n_clusters)), -# fontdict={'weight': 'bold', 'size': 10}) - -# Legend -classes = [] -recs = [] -for i in LABEL_COLOR_MAP.keys(): - recs.append(mpatches.Rectangle((0,0),1,1,fc=LABEL_COLOR_MAP[i])) - classes.append(regions[i]) -plt.legend(recs, classes, loc=4) - -plt.title(u'Распределение филиалов в пространстве уменьшенной размерности\n' - u'Регионы (Бизнес --)') -plt.show() -plt.close(fig) - -#%% - -RC_COLOR_MAP = {0 : 'gray', - 2080 : 'red', - 2101 : 'blue', - 2103 : 'green', - 2137 : 'orange'} - -rc_color = [RC_COLOR_MAP[int(l)] for l in data_cleared[:, -2]] - -fig = plt.gcf() -fig.set_size_inches(15, 10) -plt.scatter(reduced_data[0][:, 0], reduced_data[0][:, 1], s=5, color=rc_color) - -# Legend -classes = [] -recs = [] -for i in RC_COLOR_MAP.keys(): - recs.append(mpatches.Rectangle((0,0),1,1,fc=RC_COLOR_MAP[i])) - classes.append(RC[i]) -plt.legend(recs, classes, loc=3) - -#plt.ylim(-6, 4) - -plt.title(u'Распределение филиалов в пространстве уменьшенной размерности\n' - u'Привязки к РЦ (Бизнес --)') + u'(Бизнес {})'.format(business)) 
plt.show() plt.close(fig) +region = None #%% -################################################# -# Part for clustering without region separation # -################################################# - -region = None - +# compute clusters start_time = time() data_scaled_ready = data_scaled[outliers.mask] -cluster_limit = 5 +cluster_limit = 7 silhouettes = np.zeros((cluster_limit - 2, 4)) clusters = range(2, cluster_limit) @@ -254,22 +259,17 @@ def plot_clusters(n_clusters, title): print("--- %f seconds ---" % (time() - start_time)) -#kmeans = KMeans(init='k-means++', n_clusters=12, n_init=20, random_state=15) -#kmeans.fit(data_scaled) # reduced_data or data_scaled -#silhouette_avg = silhouette_score(data_scaled, kmeans.labels_) -#print ("The average silhouette_score is :", silhouette_avg) - #%% - +# plot silhouette and inertia fig = plt.gcf() fig.set_size_inches(10, 7.5) plt.plot(clusters, silhouettes[:, 2]) plt.title(u'Показатель silhouette для алгоритма K-means\n' - u'Бизнес Фора, с исключением outliers') + u'Бизнес {}, с исключением outliers'.format(business)) plt.xlabel(u'Количество кластеров') plt.ylabel(u'Значение Silhouette') @@ -283,7 +283,7 @@ def plot_clusters(n_clusters, title): plt.plot(clusters, silhouettes[:, 3], marker='s') plt.title(u'Показатель inertia для алгоритма K-means\n' - u'Бизнес Фора, с исключением outliers') + u'Бизнес {}, с исключением outliers'.format(business)) plt.xlabel(u'Количество кластеров, n') plt.ylabel(u'Inertia $J(C_n)$') @@ -292,31 +292,65 @@ def plot_clusters(n_clusters, title): plt.close(fig) - #%% +def plot_cluster_one_region(n_clusters, seed, dim=2): -data_scaled_ready = data_scaled[outliers.mask] -n_clusters = 4 + global kmeans + + data_scaled_ready = data_scaled[outliers.mask] + + kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=100, random_state=seed) + kmeans.fit(data_scaled_ready) # reduced_data or data_scaled + silhouette_avg = silhouette_score(data_scaled_ready, kmeans.labels_) + print ("The average silhouette_score is :", silhouette_avg) + + labels = np.hstack((kmeans.labels_, np.arange(n_clusters))) + Title = u'Бизнес {}. K-means'.format(business) + + if dim == 2: + reduced_data = PCA(n_components=dim).fit_transform(np.vstack((data_scaled_ready, kmeans.cluster_centers_))) + plot_clusters(n_clusters, reduced_data, labels, Title) + + if dim == 3: + reduced_data = PCA(n_components=dim).fit_transform(data_scaled) + plot_clusters_3D(n_clusters, reduced_data, labels, Title) + +# storage for current run agter choosing appropriate seed and clustersnumber + +# data from silhouettes (F) +plot_cluster_one_region(n_clusters=5, seed=66129, dim=2) + +# data from silhouettes (T) +#plot_cluster_one_region(n_clusters=3, seed=81829, dim=2) + +#%% +''' Plot all centroids. +''' + +x_plot = np.arange(1, feautures_num+1) +lw = 2 -kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=100, random_state=67066) -kmeans.fit(data_scaled_ready) # reduced_data or data_scaled -silhouette_avg = silhouette_score(data_scaled_ready, kmeans.labels_) -print ("The average silhouette_score is :", silhouette_avg) +fig = plt.gcf() +fig.set_size_inches(15, 10) -reduced_data = PCA(n_components=2).fit_transform(np.vstack((data_scaled_ready, kmeans.cluster_centers_))) -labels = np.hstack((kmeans.labels_, np.arange(n_clusters))) +for clust_num in range(max(kmeans.labels_)+1): + plt.plot(x_plot, kmeans.cluster_centers_[clust_num], linewidth=lw, label='Cluster {}'.format(clust_num)) -Title = u'Бизнес Фора. 
K-means' -plot_clusters(n_clusters, Title) +Title = 'Динамика центроидов для бизнеcа {}'.format(business) +plt.legend() +plt.title(Title) +#plt.show() +plt.savefig('Centroids_{}.png'.format(business)) +plt.close(fig) #%% u_clusters = {} -u_clusters[0] = np.zeros((max(kmeans.labels_) + 1, 15)) +u_clusters[0] = np.zeros((max(kmeans.labels_) + 1, feautures_num)) for j in range((max(kmeans.labels_) + 1)): clust_mask = np.ma.masked_equal(kmeans.labels_, j) u_clusters[0][j] = np.average(data_cleared[outliers.mask][clust_mask.mask], axis=0)[3:] @@ -326,7 +360,10 @@ def plot_clusters(n_clusters, title): for i, fil in enumerate(data[~stable.mask, :]): # Mask for stable months - fil_mask = np.ma.masked_not_equal(fil[3:], 0) + #fil_mask = np.ma.masked_not_equal(fil[3:], 0) + # In case of NaN instead of zeros use next mask for stable months + fil_mask = np.ma.masked_invalid(fil[3:]) + fil_mask.mask = np.invert(fil_mask.mask) # Scaled data of stable months for filial fil_stable = scale(fil[3:][fil_mask.mask]) # Creating scaled clusters @@ -339,67 +376,52 @@ def plot_clusters(n_clusters, title): #%% -file_to_write = 'output_Kmeans_Pallets_Fora_2017.csv' +# Writing final data (in brackets - number of clusters according to the last run) +file_to_write = 'output_Kmeans_Pallets_' + business + '_2018.csv' with open(file_to_write, 'w') as f: - f.write('FilID;FilialName;MacroRegionName;Stable;Outlier;Cluster\n') + f.write('FilID;FilialName;MacroRegionName;Stable;Outlier;Cluster;ClusterID\n') #%% +# Stable filials and outliers data_scaled_ready = data_scaled[~outliers.mask] with open(file_to_write, 'a') as f: for dat, label in zip(data_cleared[outliers.mask], kmeans.labels_): - f.write("{0:d};{1};{2};{3};0;{5} {4:d}\n".format(int(dat[0]), + f.write("{0:d};{1};{2};{3:n};0;{5} {4:d};{4:d}\n".format(int(dat[0]), filials[int(dat[0])], regions[int(dat[1])], dat[2], label, - 'Фора')) + business)) if not all(outliers.mask == True): Z = kmeans.predict(data_scaled_ready) for dat, label in zip(data_cleared[~outliers.mask], Z): - f.write("{0:d};{1};{2};{3};1;{5} {4:d}\n".format(int(dat[0]), + f.write("{0:d};{1};{2};{3:n};1;{5} {4:d};{4:d}\n".format(int(dat[0]), filials[int(dat[0])], regions[int(dat[1])], dat[2], label, - 'Фора')) + business)) #with open('output_Kmeans_reg_5_6_cleared_centroids.csv', 'w') as f: # for label, centroid in zip(np.arange(n_clusters), kmeans.cluster_centers_): # f.write(("{}"+15*";{}"+"\n").format(label, *centroid.tolist())) -#%% -# Unstable filials with NO zeros -#no_zeros = np.where(~(data[~stable.mask, 2:-2] == 0).any(axis=1)) -#data_scaled_ready = data_scaled_all[~stable.mask][no_zeros] -#Z = kmeans.predict(data_scaled_ready) -# -#with open('output_Kmeans_pal_Fora_5_clusters_2017.csv', 'a') as f: -# for dat, label in zip(data[~stable.mask][no_zeros], Z): -# f.write("{0:d};{1};{2};{3};{4};1;{6} {5:d}\n".format(int(dat[0]), -# filials[int(dat[0])].encode('cp1251'), -# post_city_id[int(dat[-1])].encode('cp1251'), -# regions[int(dat[1])].encode('cp1251'), -# dat[-2], -# label, -# u'Фора'.encode('cp1251'))) - - #%% # Unstable filials with open(file_to_write, 'a') as f: for dat, label in zip(data[~stable.mask], unstable_clusters): - f.write("{0:d};{1};{2};{3};0;{5} {4:n}\n".format(int(dat[0]), + f.write("{0:d};{1};{2};{3:n};0;{5} {4:n};{4:n}\n".format(int(dat[0]), filials[int(dat[0])], regions[int(dat[1])], dat[2], label, - u'Фора')) + business)) #%% @@ -411,7 +433,7 @@ def plot_clusters(n_clusters, title): new_mask = np.ma.masked_equal(kmeans.labels_, label) X = data_scaled[new_mask.mask, :] 
-x_plot = np.arange(1, 16) +x_plot = np.arange(1, feautures_num+1) lw = 2 for row, filid in zip(X, data_cleared[new_mask.mask, 0]): @@ -432,12 +454,10 @@ def plot_clusters(n_clusters, title): ''' Outliers after K-means. ''' -label = 2 - data_scaled_ready = data_scaled[~outliers.mask] Z = kmeans.predict(data_scaled_ready) -x_plot = np.arange(1, 16) +x_plot = np.arange(1, feautures_num+1) lw = 2 for row, filid in zip(range(len(data_scaled_ready)), data_cleared[~outliers.mask, 0]): @@ -447,11 +467,10 @@ def plot_clusters(n_clusters, title): plt.scatter(x_plot, data_scaled_ready[row], color='navy', s=30, marker='o', label="training points") plt.plot(x_plot, kmeans.cluster_centers_[Z[row]], color='g', linewidth=lw) - Title = 'FilID {0}, {1}. Label {2}'.format(int(filid), filials[int(filid)].encode('UTF-8'), Z[row]).decode('UTF-8') + Title = 'FilID {0}, {1}. Label {2}'.format(int(filid), filials[int(filid)], Z[row]) plt.title(Title) -# plt.title("Total, label {1}".format(int(filid), label)) -# plt.show() + #plt.show() plt.savefig('{}_{}.png'.format(Z[row], int(filid))) plt.close(fig) @@ -461,8 +480,7 @@ def plot_clusters(n_clusters, title): # Part for clustering with region separation # ################################################# -# Reg bindings - +# Region bindings reg = {} for i in range(1, 6): @@ -471,13 +489,14 @@ def plot_clusters(n_clusters, title): #%% +# plot initial data, divided by regions fig, axarr = plt.subplots(3, 2) -fig.set_size_inches(15, 10) +fig.set_size_inches(18, 12) subplots = (((0, 0)), ((0, 1)), ((1, 0)), ((1, 1)), ((2, 0))) for i, sp in enumerate(subplots, 1): axarr[sp].scatter(reduced_data[i][:, 0], reduced_data[i][:, 1]) - axarr[sp].set_title(u'Бизнес Сильпо, регион {}'.format(regions[i])) + axarr[sp].set_title(u'Бизнес {1}, регион {0}'.format(regions[i], business)) # Fine-tune figure; hide x ticks for top plots and y ticks for right plots #plt.setp([a.get_xticklabels() for a in axarr[0, :]], visible=False) @@ -489,24 +508,26 @@ def plot_clusters(n_clusters, title): #%% +# detecting outliers - filials far away, which have less than 2 neighbors outliers = {} for i in range(1, 6): # if i == 2 or i == 4: # eps = 2.75 # else: - eps = 2.25 if not (i == 5 or i == 4) else 2.4 + eps = 2.25# if not (i == 4) else 2.4 db = DBSCAN(eps=eps, min_samples=3, algorithm='brute').fit(data_scaled[reg[i].mask, :]) outliers[i] = np.ma.masked_not_equal(db.labels_, -1) -outliers[3].mask[np.where(data_cleared[reg[3].mask, 0] == 2125)] = False -outliers[3].mask[np.where(data_cleared[reg[3].mask, 0] == 2122)] = False +outliers[4].mask[np.where(data_cleared[reg[4].mask, 0] == 2272)] = False +outliers[4].mask[np.where(data_cleared[reg[4].mask, 0] == 2273)] = False +outliers[4].mask[np.where(data_cleared[reg[4].mask, 0] == 2280)] = False #%% - +# plot outliers fig, axarr = plt.subplots(3, 2) -fig.set_size_inches(15, 10) +fig.set_size_inches(18, 12) for i, sp in enumerate(subplots, 1): xy = reduced_data[i][outliers[i].mask] @@ -516,12 +537,12 @@ def plot_clusters(n_clusters, title): axarr[sp].plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor='k', markeredgecolor='k', markersize=10) for outlier, filID in zip(xy, data_cleared[reg[i].mask, :][~outliers[i].mask][:, 0]): - skew = -0.35 if int(filID) == 2059 else 0.1 + skew = -0.35 if int(filID) == 2154 else 0.1 axarr[sp].text(outlier[0]+0.05, outlier[1]+skew, filials[int(filID)], #color=plt.cm.spectral(labels[i] / float(n_clusters)), fontdict={'weight': 'bold', 'size': 8}) - axarr[sp].set_title(u'Бизнес Сильпо, регион {}'.format(regions[i])) + 
axarr[sp].set_title(u'Бизнес {1}, регион {0}'.format(regions[i], business)) # Fine-tune figure; hide x ticks for top plots and y ticks for right plots #plt.setp([a.get_xticklabels() for a in axarr[0, :]], visible=False) @@ -533,9 +554,11 @@ def plot_clusters(n_clusters, title): #%% -region = 3 -cluster_limit = 8 +# parameters for clusterization +region = 4 +cluster_limit = 9 +# compute clusters start_time = time() data_scaled_ready = data_scaled[reg[region].mask, :][outliers[region].mask] @@ -550,7 +573,6 @@ def plot_clusters(n_clusters, title): kmeans.fit(data_scaled_ready) # reduced_data or data_scaled silhouette_avg = silhouette_score(data_scaled_ready, kmeans.labels_) scores[seed] = (silhouette_avg, kmeans.inertia_) - #print seed, silhouette_avg best_silhouette = max(scores, key=scores.get) silhouettes[cluster-2] = [cluster] + [best_silhouette] + list(scores[max(scores, key=scores.get)]) @@ -566,13 +588,14 @@ def plot_clusters(n_clusters, title): #%% +# plot silhouette and inertia fig = plt.gcf() fig.set_size_inches(10, 7.5) plt.plot(clusters, silhouettes[:, 2]) plt.title(u'Показатель silhouette для алгоритма K-means\n' - u'Бизнес Сильпо, Регион {0} с исключением outliers'.format(regions[region])) + u'Бизнес {1}, Регион {0} с исключением outliers'.format(regions[region], business)) plt.xlabel(u'Количество кластеров') plt.ylabel(u'Значение Silhouette') @@ -586,7 +609,7 @@ def plot_clusters(n_clusters, title): plt.plot(clusters, silhouettes[:, 3], marker='s') plt.title(u'Показатель inertia для алгоритма K-means\n' - u'Бизнес Сильпо, Регион {0} с исключением outliers'.format(regions[region])) + u'Бизнес {1}, Регион {0} с исключением outliers'.format(regions[region], business)) plt.xlabel(u'Количество кластеров, n') plt.ylabel(u'Inertia $J(C_n)$') @@ -596,31 +619,60 @@ def plot_clusters(n_clusters, title): #%% -# cluster_data[number of clusters, seed] -#cluster_data = ((3, 36909), (3, 90903), (4, 78816), (4, 51184), (6, 4344)) -cluster_data = ((3, 10000), (3, 208), (4, 7750), (4, 4740), (6, 921)) -cluster_data = ((3, 4286),) +# storage for current run agter choosing appropriate seed and clustersnumber +# cluster_data[number of clusters, seed] - order of final decision is Region order +cluster_data = ((5, 8332)) +# Pallets +#cluster_data = ((3, 9742), (3, 2137), (4, 3872), [(3, 5146), (4, 4415), (5, 2066)], [(5, 8506), (6, 7778), (7, 4943)]) + +# Final data M3 V1 +cluster_data = ((3, 8012), (3, 7941), (4, 3348), (4, 1152), (7, 5756)) + +# Final data M3 V2 +cluster_data = ((3, 8012), (3, 7941), (3, 2394), (4, 1152), (5, 5253)) + +# Final data M3 V2 +cluster_data = ((3, 8012), (3, 7941), (3, 2394), (3, 5532), (5, 5253)) + +# storage of the trained K-means kmeans = {} -for i, cdata in enumerate(cluster_data, 1): - i = 3 - data_scaled_ready = data_scaled[reg[i].mask, :][outliers[i].mask] - filIDs = data_cleared[reg[i].mask, :][outliers[i].mask][:, 0] - n_clusters = cdata[0] - - kmeans[i] = KMeans(init='k-means++', n_clusters=n_clusters, n_init=100, random_state=cdata[1]) - kmeans[i].fit(data_scaled_ready) # reduced_data or data_scaled - silhouette_avg = silhouette_score(data_scaled_ready, kmeans[i].labels_) - print ("Region:{}. The average silhouette_score is :{}".format(regions[i], silhouette_avg)) - - reduced_data = PCA(n_components=2).fit_transform(np.vstack((data_scaled_ready, kmeans[i].cluster_centers_))) - labels = np.hstack((kmeans[i].labels_, np.arange(n_clusters))) - - Title = 'Регион {0}. 
K-means'.format(regions[i]) - plot_clusters(n_clusters, Title) - - +def plot_cluster_versions(cluster_data, region=None, dim=2): + ''' Plots clusters, PCA reduced to dim dimentions. + Input: + cluster_data - tuple: tuple of (number of clusters, seed) tuples; + region - int: None if i-th tuple in cluster_data corresponds to region, + number = n - if all data in cluster_data are from the n-th cluster; + dim - int: 2 or 3 - dimension of plot. + ''' + for i, cdata in enumerate(cluster_data, 1): + if region: + i = region + data_scaled_ready = data_scaled[reg[i].mask, :][outliers[i].mask] + #filIDs = data_cleared[reg[i].mask, :][outliers[i].mask][:, 0] + n_clusters = cdata[0] + + kmeans[i] = KMeans(init='k-means++', n_clusters=n_clusters, n_init=100, random_state=cdata[1]) + kmeans[i].fit(data_scaled_ready) # reduced_data or data_scaled + silhouette_avg = silhouette_score(data_scaled_ready, kmeans[i].labels_) + print ("Region:{}. The average silhouette_score is :{}".format(regions[i], silhouette_avg)) + + labels = np.hstack((kmeans[i].labels_, np.arange(n_clusters))) + Title = 'Регион {0}. K-means'.format(regions[i]) + + if dim == 2: + reduced_data = PCA(n_components=dim).fit_transform( + np.vstack((data_scaled_ready, kmeans[i].cluster_centers_))) + plot_clusters(n_clusters, reduced_data, labels, Title) + + if dim == 3: + reduced_data = PCA(n_components=dim).fit_transform(data_scaled[reg[region].mask, :]) + plot_clusters_3D(n_clusters, reduced_data, labels, Title, region=i) + +# disable region for the final run +plot_cluster_versions(cluster_data, region=None, dim=2) + #%% ################################################# @@ -629,10 +681,13 @@ def plot_clusters(n_clusters, title): u_clusters = {} +# for each region for i in range(1, 6): - u_clusters[i] = np.zeros((max(kmeans[i].labels_) + 1, 15)) + u_clusters[i] = np.zeros((max(kmeans[i].labels_) + 1, feautures_num)) for j in range((max(kmeans[i].labels_) + 1)): + # mask of cluster j in region i clust_mask = np.ma.masked_equal(kmeans[i].labels_, j) + # calculate centroid of cluster j in region i u_clusters[i][j] = np.average(data_cleared[reg[i].mask, :][outliers[i].mask][clust_mask.mask], axis=0)[3:] #%% @@ -642,7 +697,10 @@ def plot_clusters(n_clusters, title): for i, fil in enumerate(data[~stable.mask, :]): # Mask for stable months - fil_mask = np.ma.masked_not_equal(fil[3:], 0) + #fil_mask = np.ma.masked_not_equal(fil[3:], 0) + # In case of NaN instead of zeros use next mask for stable months + fil_mask = np.ma.masked_invalid(fil[3:]) + fil_mask.mask = np.invert(fil_mask.mask) # Scaled data of stable months for filial fil_stable = scale(fil[3:][fil_mask.mask]) # Creating scaled clusters @@ -651,44 +709,16 @@ def plot_clusters(n_clusters, title): dist = np.sum((scaled_clusters - fil_stable) ** 2, axis=1) # Save the nearest cluster into storage unstable_clusters[i] = np.argmin(dist) - -#%% - -# 3-d plot -#reduced_data = PCA(n_components=3).fit_transform(data_scaled[reg_5.mask, :]) -# -#fig = plt.gcf() -#fig.set_size_inches(15, 10) -#plt.clf() -# -#ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134) -##ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=4, azim=54) -##ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=38, azim=-74) -# -#ax.scatter(reduced_data[outliers_5.mask][:, 0], -# reduced_data[outliers_5.mask][:, 1], -# reduced_data[outliers_5.mask][:, 2], c=kmeans.labels_.astype(np.float)) -# -#ax.scatter(reduced_data[~outliers_5.mask][:, 0], -# reduced_data[~outliers_5.mask][:, 1], -# reduced_data[~outliers_5.mask][:, 2], c='k', marker='x') -# 
-#Title = 'Kmeans, регион {}\n'.format(regions[4].encode('UTF-8')).decode('UTF-8') -#Title += 'Кол-во кластеров: {}\n'.format(n_clusters).decode('UTF-8') -#plt.title(Title) -# -#plt.show() -#plt.close(fig) #%% -# Writing final data -file_to_write = 'output_Kmeans_Pallets_Silpo_2017(3,3,4,4,6)_V2.csv' +# Writing final data (in brackets - number of clusters according to the last run) +file_to_write = 'output_Kmeans_m3_' + business + '_2018(3,3,3,3,5).csv' with open(file_to_write, 'w') as f: - f.write('FilID;FilialName;MacroRegionName;Stable;Outlier;Cluster\n') + f.write('FilID;FilialName;MacroRegionName;Stable;Outlier;Cluster;ClusterID\n') #%% @@ -696,57 +726,40 @@ def plot_clusters(n_clusters, title): # Stable filials and outliers with open(file_to_write, 'a') as f: for i in range(1, 6): - data_scaled_ready = data_scaled[reg[i].mask, :][~outliers[i].mask] - + data_scaled_ready = data_scaled[reg[i].mask, :][~outliers[i].mask] + for dat, label in zip(data_cleared[reg[i].mask, :][outliers[i].mask], kmeans[i].labels_): - f.write("{0:d};{1};{2};{3};0;{2} {4:d}\n".format(int(dat[0]), + f.write("{0:d};{1};{2};{3:n};0;{2} {4:d};{4:d}\n".format(int(dat[0]), filials[int(dat[0])], regions[int(dat[1])], dat[2], label, - u'Фора')) + business)) # Check if there exist outliers if not all(outliers[i].mask == True): Z = kmeans[i].predict(data_scaled_ready) for dat, label in zip(data_cleared[reg[i].mask, :][~outliers[i].mask], Z): - f.write("{0:d};{1};{2};{3};1;{2} {4:d}\n".format(int(dat[0]), + f.write("{0:d};{1};{2};{3:n};1;{2} {4:d};{4:d}\n".format(int(dat[0]), filials[int(dat[0])], regions[int(dat[1])], dat[2], label, - u'Фора')) + business)) #with open('output_Kmeans_reg_5_6_cleared_centroids.csv', 'w') as f: # for label, centroid in zip(np.arange(n_clusters), kmeans.cluster_centers_): # f.write(("{}"+15*";{}"+"\n").format(label, *centroid.tolist())) -#%% -# Unstable filials with NO zeros -#region_mask = np.ma.masked_equal(data[~stable.mask, 1], region) -#no_zeros = np.where(~(data[~stable.mask][region_mask.mask, 2:-2] == 0).any(axis=1)) -#data_scaled_ready = data_scaled_all[~stable.mask][region_mask.mask][no_zeros] -#Z = kmeans.predict(data_scaled_ready) -# -#with open('output_Kmeans_Sales_Fora_4_clusters_2017.csv', 'a') as f: -# for dat, label in zip(data[~stable.mask][region_mask.mask][no_zeros], Z): -# f.write("{0:d};{1};{2};{3};{4};1;{6} {5:d}\n".format(int(dat[0]), -# filials[int(dat[0])].encode('cp1251'), -# post_city_id[int(dat[-1])].encode('cp1251'), -# regions[int(dat[1])].encode('cp1251'), -# dat[-2], -# label, -# u'Фора'.encode('cp1251'))) - #%% # Unstable filials with open(file_to_write, 'a') as f: for dat, label in zip(data[~stable.mask], unstable_clusters): - f.write("{0:d};{1};{2};{3};0;{2} {4:n}\n".format(int(dat[0]), + f.write("{0:d};{1};{2};{3:n};0;{2} {4:n};{4:n}\n".format(int(dat[0]), filials[int(dat[0])], regions[int(dat[1])], dat[2], label, - u'Фора')) + business)) diff --git a/Clusters_interact.py b/Clusters_interact.py new file mode 100644 index 0000000..015a8b5 --- /dev/null +++ b/Clusters_interact.py @@ -0,0 +1,331 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Jan 04 17:59:02 2017 +@author: Vadim Shkaberda +""" + +from load_data import DBConnect +from matplotlib import rc +from mpl_toolkits.mplot3d import Axes3D +from sklearn.decomposition import PCA +from sklearn.cluster import DBSCAN, KMeans +from sklearn.metrics import silhouette_score +from sklearn.preprocessing import scale +from time import time + +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt 
+import numpy as np + +# для корректного отображения кириллицы +font = {'family': 'Verdana', + 'weight': 'normal'} +rc('font', **font) + +# Loading data +business = 'Сильпо' # business name + +with DBConnect() as dbc: + data = dbc.get_data(3, business) + + # Loading lists of id + to_load = ('regions', 'filials') + + for cur_load in to_load: + globals()['{}'.format(cur_load)] = dbc.get_id_lists(cur_load) + +#%% + +def plot_clusters(n_clusters, reduced_data, labels, title): + + fig = plt.gcf() + fig.set_size_inches(15, 10) + plt.figure(1) + plt.clf() + + plt.scatter(reduced_data[:-n_clusters, 0], + reduced_data[:-n_clusters, 1], + color=plt.cm.gist_ncar(labels[:-n_clusters] / float(n_clusters)), + marker='o') + + # Plot the centroids as a white X + #plt.scatter(reduced_data[-12:, 0], reduced_data[-12:, 1], + # marker='x', s=169, linewidths=2, + # color=plt.cm.spectral(labels[-12:] / 12.), zorder=10) + + # Plot the centroids as a numbers + for i in range(reduced_data.shape[0]-n_clusters, reduced_data.shape[0]): + plt.text(reduced_data[i, 0], reduced_data[i, 1], + str(labels[i]), + color=plt.cm.gist_ncar(labels[i] / float(n_clusters)), + fontdict={'weight': 'bold', 'size': 12}) + + # Plot specific filial names/ID's +# for r_data, filID, label in zip(reduced_data, filIDs, labels): +# if label == 3: +# plt.text(r_data[0]+0.05, r_data[1]+0.1, filials[int(filID)], +# #color=plt.cm.spectral(labels[i] / float(n_clusters)), +# fontdict={'weight': 'bold', 'size': 8}) + + plt.title(title + u' кластеризация\n' + \ + u'Цифры являются центроидами\n' + u'Кол-во кластеров: {}'.format(n_clusters)) + + plt.show() + #plt.savefig('KMeans_{}_PCA_scaled.png'.format(clusters)) + plt.close(fig) + + +def plot_clusters_3D(n_clusters, outliers, reduced_data, labels, title, region=None): + + # 3-d plot + fig = plt.gcf() + fig.set_size_inches(15, 10) + plt.clf() + + #ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134) + #ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=4, azim=54) + ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=38, azim=-74) + + if region: + ax.scatter(reduced_data[outliers[region].mask][:, 0], + reduced_data[outliers[region].mask][:, 1], + reduced_data[outliers[region].mask][:, 2], + c=plt.cm.gist_ncar(labels[:-n_clusters] / float(n_clusters)) + #c=kmeans.labels_.astype(np.float) + ) + + ax.scatter(reduced_data[~outliers[region].mask][:, 0], + reduced_data[~outliers[region].mask][:, 1], + reduced_data[~outliers[region].mask][:, 2], + c='k', marker='x') + else: + ax.scatter(reduced_data[outliers.mask][:, 0], + reduced_data[outliers.mask][:, 1], + reduced_data[outliers.mask][:, 2], + c=plt.cm.gist_ncar(labels[:-n_clusters] / float(n_clusters)) + #c=kmeans.labels_.astype(np.float) + ) + + ax.scatter(reduced_data[~outliers.mask][:, 0], + reduced_data[~outliers.mask][:, 1], + reduced_data[~outliers.mask][:, 2], + c='k', marker='x') + + plt.title(title + u' кластеризация\n' + \ + u'Цифры являются центроидами\n' + u'Кол-во кластеров: {}'.format(n_clusters)) + + plt.show() + plt.close(fig) + +#%% +# identify satable data +stable = np.ma.masked_equal(data[:, 2], 1) +data_scaled = scale(data[stable.mask, 3:], axis=1) +data_cleared = data[np.where(data[:, 2] == 1)] + +# not used at this time +#data_diff = np.diff(data_cleared[:, 2:-2]) / data_cleared[:, 2:-3] + +reduced_data = {} + +#%% + +################################################# +# Part for clustering without region separation # +################################################# + +def clustering_one_region(cluster_data): + + # detecting filials far away + 
db = DBSCAN(eps=2.5, min_samples=3, algorithm='brute').fit(data_scaled) + outliers = np.ma.masked_not_equal(db.labels_, -1) + +#%% + # for plotting outliers after DBSCAN + reduced_data[0] = PCA(n_components=2).fit_transform(data_scaled) + + fig = plt.gcf() + fig.set_size_inches(15, 10) + + + xy = reduced_data[0][outliers.mask] + fils, = plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor='g', + markeredgecolor='k', markersize=16, label=u'Филиалы') + + xy = reduced_data[0][~outliers.mask] + fils_out, = plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor='k', + markeredgecolor='k', markersize=10, label=u'Исключения') + + # labels for outliers + for outlier, filID in zip(xy, data_cleared[~outliers.mask][:, 0]): + #skew = 0.05 if data_cleared[low, 0] != 518 else -0.25 + plt.text(outlier[0]+0.05, outlier[1]+0.1, filials[int(filID)], + #color=plt.cm.spectral(labels[i] / float(n_clusters)), + fontdict={'weight': 'bold', 'size': 10}) + + plt.legend(handles=(fils, fils_out)) + + plt.title(u'Филиалы-outliers, не подлежащие дальнейшей кластеризации\n' + u'(Бизнес {})'.format(business)) + + plt.show() + plt.close(fig) + + # data from silhouettes + plot_cluster_one_region(n_clusters=cluster_data[0], + seed=cluster_data[1], + outliers=outliers, + dim=3) + + + #%% + + +def plot_cluster_one_region(n_clusters, seed, outliers, dim=2): + + data_scaled_ready = data_scaled[outliers.mask] + + kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=100, random_state=seed) + kmeans.fit(data_scaled_ready) # reduced_data or data_scaled + silhouette_avg = silhouette_score(data_scaled_ready, kmeans.labels_) + print ("The average silhouette_score is :", silhouette_avg) + + labels = np.hstack((kmeans.labels_, np.arange(n_clusters))) + Title = u'Бизнес {}. K-means'.format(business) + + if dim == 2: + reduced_data = PCA(n_components=dim).fit_transform(np.vstack((data_scaled_ready, kmeans.cluster_centers_))) + plot_clusters(n_clusters, reduced_data, labels, Title) + + if dim == 3: + reduced_data = PCA(n_components=dim).fit_transform(data_scaled) + plot_clusters_3D(n_clusters, outliers, reduced_data, labels, Title) + + +#%% + +################################################# +# Part for clustering with region separation # +################################################# + +def clustering_with_region_separation(cluster_data, region): +# Reg bindings + + reg = {} + + for i in range(1, 6): + reg[i] = np.ma.masked_equal(data_cleared[:, 1], i) + reduced_data[i] = PCA(n_components=2).fit_transform(data_scaled[reg[i].mask, :]) + + #%% + + fig, axarr = plt.subplots(3, 2) + fig.set_size_inches(18, 12) + subplots = (((0, 0)), ((0, 1)), ((1, 0)), ((1, 1)), ((2, 0))) + + for i, sp in enumerate(subplots, 1): + axarr[sp].scatter(reduced_data[i][:, 0], reduced_data[i][:, 1]) + axarr[sp].set_title(u'Бизнес Сильпо, регион {}'.format(regions[i])) + + # Fine-tune figure; hide x ticks for top plots and y ticks for right plots + #plt.setp([a.get_xticklabels() for a in axarr[0, :]], visible=False) + #plt.setp([a.get_yticklabels() for a in axarr[:, 1]], visible=False) + + plt.show() + + plt.close(fig) + + #%% + + outliers = {} + + for i in range(1, 6): + # if i == 2 or i == 4: + # eps = 2.75 + # else: + eps = 2.25 if not (i == 4) else 2.4 + db = DBSCAN(eps=eps, min_samples=3, algorithm='brute').fit(data_scaled[reg[i].mask, :]) + outliers[i] = np.ma.masked_not_equal(db.labels_, -1) + + outliers[4].mask[np.where(data_cleared[reg[4].mask, 0] == 2272)] = False + outliers[4].mask[np.where(data_cleared[reg[4].mask, 0] == 2273)] = False + 
+    outliers[4].mask[np.where(data_cleared[reg[4].mask, 0] == 2280)] = False
+
+    #%%
+
+
+    fig, axarr = plt.subplots(3, 2)
+    fig.set_size_inches(18, 12)
+
+    for i, sp in enumerate(subplots, 1):
+        xy = reduced_data[i][outliers[i].mask]
+        axarr[sp].plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor='g',
+                       markeredgecolor='k', markersize=16)
+        xy = reduced_data[i][~outliers[i].mask]
+        axarr[sp].plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor='k',
+                       markeredgecolor='k', markersize=10)
+        for outlier, filID in zip(xy, data_cleared[reg[i].mask, :][~outliers[i].mask][:, 0]):
+            skew = -0.35 if int(filID) == 2059 else 0.1
+            axarr[sp].text(outlier[0]+0.05, outlier[1]+skew, filials[int(filID)],
+                           #color=plt.cm.spectral(labels[i] / float(n_clusters)),
+                           fontdict={'weight': 'bold', 'size': 8})
+
+        axarr[sp].set_title(u'Бизнес Сильпо, регион {}'.format(regions[i]))
+
+    # Fine-tune figure; hide x ticks for top plots and y ticks for right plots
+    #plt.setp([a.get_xticklabels() for a in axarr[0, :]], visible=False)
+    #plt.setp([a.get_yticklabels() for a in axarr[:, 1]], visible=False)
+
+    plt.show()
+
+    plt.close(fig)
+
+    plot_cluster_versions(cluster_data, outliers, reg, region, dim=3)
+
+#%%
+
+def plot_cluster_versions(cluster_data, outliers, reg, region=None, dim=2):
+    ''' Plot clusters after PCA reduction to dim dimensions.
+        Input:
+        cluster_data - tuple: tuple of (number of clusters, seed) tuples;
+        region - int: None if the i-th tuple in cluster_data corresponds to the i-th region;
+            n - if all tuples in cluster_data refer to region n;
+        dim - int: 2 or 3 - dimension of plot.
+    '''
+    kmeans = {}
+    for i, cdata in enumerate(cluster_data, 1):
+        if region:
+            i = region
+        data_scaled_ready = data_scaled[reg[i].mask, :][outliers[i].mask]
+        #filIDs = data_cleared[reg[i].mask, :][outliers[i].mask][:, 0]
+        n_clusters = cdata[0]
+
+        kmeans[i] = KMeans(init='k-means++', n_clusters=n_clusters, n_init=100, random_state=cdata[1])
+        kmeans[i].fit(data_scaled_ready) # reduced_data or data_scaled
+        silhouette_avg = silhouette_score(data_scaled_ready, kmeans[i].labels_)
+        print ("Region:{}. The average silhouette_score is :{}".format(regions[i], silhouette_avg))
+
+        labels = np.hstack((kmeans[i].labels_, np.arange(n_clusters)))
+        Title = 'Регион {0}. K-means'.format(regions[i])
+
+        if dim == 2:
+            reduced_data = PCA(n_components=dim).fit_transform(
+                    np.vstack((data_scaled_ready, kmeans[i].cluster_centers_)))
+            plot_clusters(n_clusters, reduced_data, labels, Title)
+        if dim == 3:
+            reduced_data = PCA(n_components=dim).fit_transform(data_scaled[reg[region].mask, :])
+            plot_clusters_3D(n_clusters, outliers, reduced_data, labels, Title, region=i)
+
+
+#%%
+
+if __name__ == '__main__':
+    # cluster_data: tuple of (number of clusters, seed); the order of the tuples follows the region order
+    cluster_data = ((3, 5532),)
+    clustering_with_region_separation(cluster_data, region=4)
+
+    # cluster_data: (number of clusters, seed)
+    #cluster_data = (3, 81829)
+    #clustering_one_region(cluster_data)
\ No newline at end of file
diff --git a/load_data.py b/load_data.py
new file mode 100644
index 0000000..f5af1d2
--- /dev/null
+++ b/load_data.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 30 15:16:55 2018
+
+@author: v.shkaberda
+"""
+from numpy import asarray
+
+import pyodbc
+
+class DBConnect(object):
+    ''' Provides a connection to the database and functions to work with the server.
+    '''
+    def __enter__(self):
+        # Connection properties
+        conn_str = (
+            r'Driver={SQL Server};'
+            r'Server=s-kv-center-s64;'
+            r'Database=SILPOAnalitic;'
+            r'Trusted_Connection=yes;'
+            )
+        self.__db = pyodbc.connect(conn_str)
+        self.__cursor = self.__db.cursor()
+        return self
+
+
+    def __exit__(self, type, value, traceback):
+        self.__db.close()
+
+
+    def connection_check(self):
+        self.__cursor.execute('''SELECT SUBSTRING(ORIGINAL_LOGIN(),CHARINDEX('\\',
+                              ORIGINAL_LOGIN())+1,
+                              LEN(ORIGINAL_LOGIN()))''')
+        return self.__cursor.fetchone()[0]
+
+
+    def get_data(self, Type, BusinessName):
+        ''' Download working data from the server.
+            Type - int: 2 - Pallet; 3 - m3.
+        '''
+        self.__cursor.execute('''SELECT FilID, MacroRegionID, Stable,
+        [201706], [201707], [201708], [201709], [201710], [201711], [201712],
+        [201801], [201802], [201803], [201804], [201805], [201806], [201807]
+        FROM SILPOAnalitic.dbo.aid_budget_Filial_Stable_ForClaster
+        WHERE EdIzm = ? and BusinessName = ?
+        ''', (Type, BusinessName))
+
+        # convert the fetched rows into a 2D NumPy array of floats,
+        # preserving the original dimensions of the result set
+
+        return asarray(self.__cursor.fetchall(), dtype=float)
+
+
+    def get_id_lists(self, list_name):
+        ''' Download an {id: name} mapping from the server.
+        '''
+        if list_name == 'regions':
+            self.__cursor.execute('SELECT [macroRegionId], [macroRegionNameRu] \
+                                  FROM [MasterData].[fil].[ListMacroRegions]')
+        elif list_name == 'filials':
+            self.__cursor.execute('SELECT [FilID], [FilialName] \
+                                  FROM [SILPOAnalitic].[dbo].[aid_FilialsAll]')
+        return dict(self.__cursor.fetchall())
+
+
+if __name__ == '__main__':
+    import getpass
+    with DBConnect() as dbconn:
+        assert dbconn.connection_check() == getpass.getuser(), 'Connection check failed.'
+        print('Connected successfully.')
+        input('Press Enter to exit.')
\ No newline at end of file
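A minimal usage sketch of the new load_data.DBConnect helper together with the clustering steps above; it is illustrative only: the business name, cluster count and random seed are placeholder values, and it assumes the SQL Server connection configured in load_data.py is reachable.

from load_data import DBConnect
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

business = 'Сильпо'  # placeholder business name

with DBConnect() as dbc:
    # columns: FilID, MacroRegionID, Stable, then the monthly values
    data = dbc.get_data(3, business)
    regions = dbc.get_id_lists('regions')   # {macroRegionId: name}
    filials = dbc.get_id_lists('filials')   # {FilID: name}

stable_rows = data[data[:, 2] == 1]          # keep only the stable filials
X = scale(stable_rows[:, 3:], axis=1)        # scale each filial's monthly profile

# cluster count and seed are placeholders; pick them from the silhouette/inertia plots
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=100, random_state=0).fit(X)
for fil_id, label in zip(stable_rows[:, 0], kmeans.labels_):
    print(int(fil_id), filials.get(int(fil_id)), label)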