From 3054a1772863d51cc2ae41fe26d1374c901997bf Mon Sep 17 00:00:00 2001
From: SabeenN <32990272+SabeenN@users.noreply.github.com>
Date: Mon, 20 May 2019 00:22:04 +0200
Subject: [PATCH] Add files via upload

---
 .../k-layer net w BN/BatchNormBackPass.m    | 12 +++
 .../k-layer net w BN/BatchNormalize.m       |  4 +
 .../k-layer net w BN/ComputeAccuracy.m      | 12 +++
 .../k-layer net w BN/ComputeCost.m          | 18 ++++
 .../k-layer net w BN/ComputeGradients.m     | 35 ++++++++
 .../k-layer net w BN/ComputeGradsNumSlow.m  | 71 ++++++++++++++++
 .../k-layer net w BN/ComputeGradsNumSlow1.m | 85 +++++++++++++++++++
 .../k-layer net w BN/EvaluateClassifier.m   | 28 ++++++
 .../k-layer net w BN/InitializeWb.m         | 20 +++++
 .../k-layer net w BN/LoadBatch.m            | 19 +++++
 .../k-layer net w BN/MiniBatchGD.m          | 67 +++++++++++++++
 Speech_Recognition/k-layer net w BN/main.m  | 77 +++++++++++++++++
 12 files changed, 448 insertions(+)
 create mode 100644 Speech_Recognition/k-layer net w BN/BatchNormBackPass.m
 create mode 100644 Speech_Recognition/k-layer net w BN/BatchNormalize.m
 create mode 100644 Speech_Recognition/k-layer net w BN/ComputeAccuracy.m
 create mode 100644 Speech_Recognition/k-layer net w BN/ComputeCost.m
 create mode 100644 Speech_Recognition/k-layer net w BN/ComputeGradients.m
 create mode 100644 Speech_Recognition/k-layer net w BN/ComputeGradsNumSlow.m
 create mode 100644 Speech_Recognition/k-layer net w BN/ComputeGradsNumSlow1.m
 create mode 100644 Speech_Recognition/k-layer net w BN/EvaluateClassifier.m
 create mode 100644 Speech_Recognition/k-layer net w BN/InitializeWb.m
 create mode 100644 Speech_Recognition/k-layer net w BN/LoadBatch.m
 create mode 100644 Speech_Recognition/k-layer net w BN/MiniBatchGD.m
 create mode 100644 Speech_Recognition/k-layer net w BN/main.m

diff --git a/Speech_Recognition/k-layer net w BN/BatchNormBackPass.m b/Speech_Recognition/k-layer net w BN/BatchNormBackPass.m
new file mode 100644
index 0000000..8cc8fc1
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/BatchNormBackPass.m
@@ -0,0 +1,12 @@
+function g = BatchNormBackPass(g, si, mui, vari)
+    eps = 1e-6;
+    n = size(g,2);
+    one_v = ones(n,1);
+    sigma1 = ((vari + eps).^(-0.5));
+    sigma2 = ((vari + eps).^(-1.5));
+    G1 = g.*(sigma1*one_v');
+    G2 = g.*(sigma2*one_v');
+    D = si - (repmat(mui,1,size(mui,2))*one_v');
+    c = (G2.*D)*one_v;
+    g = G1 - (1/n)*G1*one_v-(1/n)*(D.*(c*one_v'));
+end
\ No newline at end of file
diff --git a/Speech_Recognition/k-layer net w BN/BatchNormalize.m b/Speech_Recognition/k-layer net w BN/BatchNormalize.m
new file mode 100644
index 0000000..90399c8
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/BatchNormalize.m
@@ -0,0 +1,4 @@
+function [shat] = BatchNormalize(s,mean,var)
+    eps=1e-6;
+    shat=(diag(var+eps)^(-0.5))*(s-repmat(mean,1,size(s,2)));
+end
\ No newline at end of file
diff --git a/Speech_Recognition/k-layer net w BN/ComputeAccuracy.m b/Speech_Recognition/k-layer net w BN/ComputeAccuracy.m
new file mode 100644
index 0000000..4e4e553
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/ComputeAccuracy.m
@@ -0,0 +1,12 @@
+function acc = ComputeAccuracy(X, y, W, b,gamma,beta)
+    [P,~,~,~,~]=EvaluateClassifier(X,W,b,gamma,beta);
+    [~,I]= max(P,[],1);
+    correct=0;
+    img=size(I,2);
+    for i=1:img
+        if I(i)==y(i) %y are the true labels
+            correct=correct + 1;
+        end
+    end
+    acc= correct/img;
+end
\ No newline at end of file
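As a quick sanity check (a sketch, not part of the patch), BatchNormalize should map every feature dimension of a mini-batch to roughly zero mean and unit variance. The sizes below are made up; the only assumption is that BatchNormalize.m is on the MATLAB path, and the biased variance is computed the same way EvaluateClassifier does further down.

% Illustrative check of BatchNormalize on a random mini-batch.
s    = randn(5, 100);                              % 5 features, 100 samples (made-up sizes)
mu   = mean(s, 2);
v    = var(s, 0, 2) * (size(s,2)-1) / size(s,2);   % biased variance, as in EvaluateClassifier
shat = BatchNormalize(s, mu, v);
disp(max(abs(mean(shat, 2))));                     % should be ~0
disp(max(abs(var(shat, 1, 2) - 1)));               % should be ~0, up to the eps added for stability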
diff --git a/Speech_Recognition/k-layer net w BN/ComputeCost.m b/Speech_Recognition/k-layer net w BN/ComputeCost.m
new file mode 100644
index 0000000..65013e9
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/ComputeCost.m
@@ -0,0 +1,18 @@
+function J = ComputeCost(X, Y, W, b,lambda,gamma,beta,varargin)
+    k=size(W,2);
+    if numel(varargin)==2
+        [P,~,~,~,~]=EvaluateClassifier(X,W,b,gamma,beta,varargin{1},varargin{2});
+    else
+        [P,~,~,~,~]=EvaluateClassifier(X,W,b,gamma,beta);
+    end
+    n=size(X,2);
+    py= Y'*P; %Y is one hot representation of labels
+    l=-log(py);
+    L2=0;
+    for i=1:k
+        L2=L2+sumsqr(W{i});
+    end
+    regularization= lambda*L2;
+    J= sum(diag(l))/n;
+    J = J + regularization;
+end
\ No newline at end of file
diff --git a/Speech_Recognition/k-layer net w BN/ComputeGradients.m b/Speech_Recognition/k-layer net w BN/ComputeGradients.m
new file mode 100644
index 0000000..8308430
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/ComputeGradients.m
@@ -0,0 +1,35 @@
+function [gradW, gradb,gradGamma,gradBeta] = ComputeGradients(X, Y, P, s, W,lambda,shat,mu,v,gamma,beta)
+    k=size(W,2);
+    n=size(X,2);
+    gradb=cell(1,k); %size(b) is number of layers
+    gradW=cell(1,k);
+    Ic=ones(n,1); % n x 1 ones
+
+    g= -(Y-P); %Y is K x N and P is K x N
+    gradW{k}=((1/n)*(g*s{k-1}'))+(2*lambda*W{k});
+    gradb{k}=((1/n)*(g*Ic));
+    g=W{k}'*g;
+    Indsk=s{k-1}>0;
+    g=g.*Indsk;
+
+
+    for l=k-1:-1:1
+        gradGamma{l}=((g .* shat{l})*Ic)/n;
+        gradBeta{l}=(g*Ic)/n;
+        g=g .* (gamma{l}*Ic');
+
+        g=BatchNormBackPass(g,s{l},mu{l},v{l});
+        if l == 1
+            si_1=X;
+        else
+            si_1=s{l-1};
+        end
+        gradW{l}=((g*si_1')/n)+(2*lambda*W{l});
+        gradb{l}=(g*Ic)/n;
+        if l > 1
+            g=(W{l}')*g;
+            si_1=si_1 > 0;
+            g=g .* si_1;
+        end
+    end
+end
diff --git a/Speech_Recognition/k-layer net w BN/ComputeGradsNumSlow.m b/Speech_Recognition/k-layer net w BN/ComputeGradsNumSlow.m
new file mode 100644
index 0000000..093b4a5
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/ComputeGradsNumSlow.m
@@ -0,0 +1,71 @@
+function [grad_b, grad_W,grad_gamma,grad_beta] = ComputeGradsNumSlow(X, Y, W, b, lambda, h,gamma,beta)
+
+grad_W = cell(1,numel(W));
+grad_b = cell(1,numel(b));
+
+grad_gamma = cell(1,numel(gamma));
+grad_beta = cell(1,numel(beta));
+
+for j=1:length(b)
+    grad_b{j} = zeros(size(b{j}));
+
+    for i=1:length(b{j})
+
+        b_try = b;
+        b_try{j}(i) = b_try{j}(i) - h;
+        c1 = ComputeCost(X, Y, W, b_try, lambda,gamma,beta);
+
+        b_try = b;
+        b_try{j}(i) = b_try{j}(i) + h;
+        c2 = ComputeCost(X, Y, W, b_try, lambda,gamma,beta);
+
+        grad_b{j}(i) = (c2-c1) / (2*h);
+    end
+end
+
+for j=1:length(W)
+    grad_W{j} = zeros(size(W{j}));
+
+    for i=1:numel(W{j})
+
+        W_try = W;
+        W_try{j}(i) = W_try{j}(i) - h;
+        c1 = ComputeCost(X, Y, W_try, b, lambda,gamma,beta);
+
+        W_try = W;
+        W_try{j}(i) = W_try{j}(i) + h;
+        c2 = ComputeCost(X, Y, W_try, b, lambda,gamma,beta);
+
+        grad_W{j}(i) = (c2-c1) / (2*h);
+    end
+end
+
+for j=1:length(gamma)
+    grad_gamma{j} = zeros(size(gamma{j}));
+    for i=1:numel(gamma{j})
+
+        gammas_try = gamma;
+        gammas_try{j}(i) = gamma{j}(i) - h;
+        c1 = ComputeCost(X, Y, W, b, lambda,gammas_try,beta);
+
+        gammas_try = gamma;
+        gammas_try{j}(i) = gamma{j}(i) + h;
+        c2 = ComputeCost(X, Y, W, b, lambda,gammas_try,beta);
+
+        grad_gamma{j}(i) = (c2-c1) / (2*h);
+    end
+end
+
+for j=1:length(beta)
+    grad_beta{j} = zeros(size(beta{j}));
+    for i=1:numel(beta{j})
+
+        beta_try = beta;
+        beta_try{j}(i) = beta{j}(i) - h;
+        c1 = ComputeCost(X, Y, W, b, lambda,gamma,beta_try);
+
+        beta_try = beta;
+        beta_try{j}(i) = beta{j}(i) + h;
+        c2 = ComputeCost(X, Y, W, b, lambda,gamma,beta_try);
+        grad_beta{j}(i) = (c2-c1) / (2*h);
+    end
+end
\ No newline at end of file
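A small usage sketch (not part of the original files): after running ComputeGradients and ComputeGradsNumSlow on the same small mini-batch, comparing them with the usual relative error and accepting values around 1e-6 or below for a step h = 1e-5 is a reasonable criterion. The variables gradW, gradb, gradWNum and gradbNum below are assumed to come from those two calls, as in the commented-out check in main.m.

% Illustrative helper: maximum relative error between analytic and numerical gradients.
relErr = @(A, B) max(abs(A(:) - B(:)) ./ max(1e-6, abs(A(:)) + abs(B(:))));
for i = 1:numel(gradW)
    fprintf('layer %d: W %.2e  b %.2e\n', i, relErr(gradW{i}, gradWNum{i}), relErr(gradb{i}, gradbNum{i}));
end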
diff --git a/Speech_Recognition/k-layer net w BN/ComputeGradsNumSlow1.m b/Speech_Recognition/k-layer net w BN/ComputeGradsNumSlow1.m
new file mode 100644
index 0000000..edc75b1
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/ComputeGradsNumSlow1.m
@@ -0,0 +1,85 @@
+function Grads = ComputeGradsNumSlow1(X, Y, NetParams, lambda, h)
+
+Grads.W = cell(numel(NetParams.W), 1);
+Grads.b = cell(numel(NetParams.b), 1);
+if NetParams.use_bn
+    Grads.gammas = cell(numel(NetParams.gammas), 1);
+    Grads.betas = cell(numel(NetParams.betas), 1);
+end
+
+for j=1:length(NetParams.b)
+    Grads.b{j} = zeros(size(NetParams.b{j}));
+    NetTry = NetParams;
+    for i=1:length(NetParams.b{j})
+        b_try = NetParams.b;
+        b_try{j}(i) = b_try{j}(i) - h;
+        NetTry.b = b_try;
+        c1 = ComputeCost(X, Y, NetTry, lambda);
+
+        b_try = NetParams.b;
+        b_try{j}(i) = b_try{j}(i) + h;
+        NetTry.b = b_try;
+        c2 = ComputeCost(X, Y, NetTry, lambda);
+
+        Grads.b{j}(i) = (c2-c1) / (2*h);
+    end
+end
+
+for j=1:length(NetParams.W)
+    Grads.W{j} = zeros(size(NetParams.W{j}));
+    NetTry = NetParams;
+    for i=1:numel(NetParams.W{j})
+
+        W_try = NetParams.W;
+        W_try{j}(i) = W_try{j}(i) - h;
+        NetTry.W = W_try;
+        c1 = ComputeCost(X, Y, NetTry, lambda);
+
+        W_try = NetParams.W;
+        W_try{j}(i) = W_try{j}(i) + h;
+        NetTry.W = W_try;
+        c2 = ComputeCost(X, Y, NetTry, lambda);
+
+        Grads.W{j}(i) = (c2-c1) / (2*h);
+    end
+end
+
+if NetParams.use_bn
+    for j=1:length(NetParams.gammas)
+        Grads.gammas{j} = zeros(size(NetParams.gammas{j}));
+        NetTry = NetParams;
+        for i=1:numel(NetParams.gammas{j})
+
+            gammas_try = NetParams.gammas;
+            gammas_try{j}(i) = gammas_try{j}(i) - h;
+            NetTry.gammas = gammas_try;
+            c1 = ComputeCost(X, Y, NetTry, lambda);
+
+            gammas_try = NetParams.gammas;
+            gammas_try{j}(i) = gammas_try{j}(i) + h;
+            NetTry.gammas = gammas_try;
+            c2 = ComputeCost(X, Y, NetTry, lambda);
+
+            Grads.gammas{j}(i) = (c2-c1) / (2*h);
+        end
+    end
+
+    for j=1:length(NetParams.betas)
+        Grads.betas{j} = zeros(size(NetParams.betas{j}));
+        NetTry = NetParams;
+        for i=1:numel(NetParams.betas{j})
+
+            betas_try = NetParams.betas;
+            betas_try{j}(i) = betas_try{j}(i) - h;
+            NetTry.betas = betas_try;
+            c1 = ComputeCost(X, Y, NetTry, lambda);
+
+            betas_try = NetParams.betas;
+            betas_try{j}(i) = betas_try{j}(i) + h;
+            NetTry.betas = betas_try;
+            c2 = ComputeCost(X, Y, NetTry, lambda);
+
+            Grads.betas{j}(i) = (c2-c1) / (2*h);
+        end
+    end
+end
\ No newline at end of file
diff --git a/Speech_Recognition/k-layer net w BN/EvaluateClassifier.m b/Speech_Recognition/k-layer net w BN/EvaluateClassifier.m
new file mode 100644
index 0000000..2426544
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/EvaluateClassifier.m
@@ -0,0 +1,28 @@
+function [P,s,shat,mu,v] = EvaluateClassifier(X,W,b,gamma,beta,varargin)
+    k=size(W,2);
+    s=cell(1,k);
+    shat=cell(1,k);
+    shift=cell(1,k);
+    n=size(X,2);
+
+    if numel(varargin)==2
+        mu=varargin{1};
+        v=varargin{2};
+    else
+        mu=cell(1,k);
+        v=cell(1,k);
+    end
+
+    for l=1:k-1
+        s{l}= W{l}*X + b{l}*ones(1,n);
+        if numel(varargin)~=2
+            mu{l}=mean(s{l},2);
+            v{l}=((var(s{l},0,2)*(n-1)) / n);
+        end
+        shat{l}=BatchNormalize(s{l},mu{l},v{l});
+        shift{l}=repmat(gamma{l},1,size(shat{l},2)) .* shat{l} + repmat(beta{l},1,size(shat{l},2));
+        X=max(0,shift{l});
+    end
+    s{k}= W{k}*X + b{k}*ones(1,n);
+    P=softmax(s{k});
+end
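EvaluateClassifier switches to externally supplied batch-norm statistics when two extra arguments are passed. A minimal sketch of the test-time call (not in the patch), assuming W, b, gamma, beta are the trained parameters and movingMu, movingVar are the exponential moving averages accumulated in MiniBatchGD below (which currently does not return them):

% Illustrative test-time forward pass with fixed (moving-average) statistics.
[P, ~, ~, ~, ~] = EvaluateClassifier(testX, W, b, gamma, beta, movingMu, movingVar);
[~, pred] = max(P, [], 1);
acc = mean(pred(:) == testy(:));   % testy holds the true class labels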
diff --git a/Speech_Recognition/k-layer net w BN/InitializeWb.m b/Speech_Recognition/k-layer net w BN/InitializeWb.m
new file mode 100644
index 0000000..49e83b9
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/InitializeWb.m
@@ -0,0 +1,20 @@
+function [W,b,gamma,beta] = InitializeWb(m,d)
+    k=size(m,2); %number of layers
+    W=cell(1,size(m,2));
+    b=cell(1,size(m,2));
+    gamma=cell(1,size(m,2)-1);
+    beta=cell(1,size(m,2)-1);
+    rng(400);
+    sigma=0; %1e-3 1e-4
+    sigma2=1e-1;%1/sqrt(d)
+    for i=k:-1:2
+        W{i}=normrnd(0,1/sqrt(m(i-1)),m(i),m(i-1)); % m(i) x m(i-1)
+        b{i}=zeros(m(i),1);
+    end
+    for i=k-1:-1:1
+        gamma{i}=normrnd(1,0.0005,m(i),1);%ones(m(i),1);
+        beta{i}=zeros(m(i),1);
+    end
+    W{1}=normrnd(0,1/sqrt(d),m(1),d);%*(2/sqrt(m(1))); % m(1) x d random initialization of weights and biases
+    b{1}=zeros(m(1),1);
+end
\ No newline at end of file
diff --git a/Speech_Recognition/k-layer net w BN/LoadBatch.m b/Speech_Recognition/k-layer net w BN/LoadBatch.m
new file mode 100644
index 0000000..ac6586a
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/LoadBatch.m
@@ -0,0 +1,19 @@
+function [X, Y, y] = LoadBatch(filename)
+    A = load(filename);
+    X=A.mfccs;
+    %X = double(X) / double(255);
+    y=A.labels';
+    y=cast(y,'single');
+    Y=A.onehot';
+    Y=cast(Y,'single');
+    X=reshape(X, [], size(X,1));
+    maxX=max(abs(X));
+
+    % X=X ./max(maxX);
+
+    meanX = mean(X, 2);
+    stdX = std(X, 0, 2);
+
+    X = X - repmat(meanX, [1, size(X, 2)]);
+    X = X ./ repmat(stdX, [1, size(X, 2)]);
+end
\ No newline at end of file
diff --git a/Speech_Recognition/k-layer net w BN/MiniBatchGD.m b/Speech_Recognition/k-layer net w BN/MiniBatchGD.m
new file mode 100644
index 0000000..33e43b9
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/MiniBatchGD.m
@@ -0,0 +1,67 @@
+function [W,b,gamma,beta,costs,costsv,xaxis] = MiniBatchGD(X, Y,valx,valy, cycleparams, W, b, lambda,gamma,beta)
+
+%% cyclic learning rate hyperparameters
+N=size(X,2);
+k=size(W,2);
+costs=[];
+costsv=[];
+xaxis=[];
+freq=10; %number of times the cost should be computed per cycle
+nmin=cycleparams(1);
+nmax=cycleparams(2);
+ns=cycleparams(3); % bsize =100 --> 1 cycle=10 epochs
+nbatch=cycleparams(4); %ns=k*10000/100 = k*100
+epochs=cycleparams(5); %so that training stops after one cycle
+lmax=cycleparams(6);
+alpha=0.99;
+%%
+t=0;
+for l=0:lmax
+    for epoch=1:epochs
+        for j=1:N/nbatch
+            if (t >= 2*l*ns) && (t <= (2*l+1)*ns)
+                nt=nmin + ((t-2*l*ns)*(nmax-nmin))/ns;
+            end
+            if ((2*l+1)*ns < t) && (2*(l+1)*ns >=t)
+                nt=nmax - ((t-((2*l+1)*ns))*(nmax-nmin))/ns;
+            end
+            jstart = (j-1)*nbatch + 1;
+            jend = j*nbatch;
+            inds = jstart:jend;
+            Xbatch = X(:, jstart:jend);
+            Ybatch = Y(:, jstart:jend);
+            [P,s,shat,mu,v]=EvaluateClassifier(Xbatch,W,b,gamma,beta); % K x n
+            [gW,gb,ggamma,gbeta]=ComputeGradients(Xbatch,Ybatch,P,s,W,lambda,shat,mu,v,gamma,beta);
+            for i=1:k
+                W{i}= W{i} - (nt * gW{i});
+                b{i}= b{i} - (nt * gb{i});
+            end
+            for i=1:k-1
+                gamma{i}= gamma{i} - (nt * ggamma{i});
+                beta{i}= beta{i} - (nt * gbeta{i});
+            end
+            if t==0
+                movingMu=mu;
+                movingVar=v;
+            end
+            for i=1:size(mu,2)
+                movingMu{i}=alpha*movingMu{i} + (1-alpha)*mu{i};
+                movingVar{i}=alpha*movingVar{i} + (1-alpha)*v{i};
+            end
+            if mod(t,500)==0 % calculate cost 10 times per cycle, 1 cycle with ns=800 is 1600 updates
+                costs=[costs, ComputeCost(Xbatch,Ybatch,W,b,lambda,gamma,beta,movingMu,movingVar)]; % which is 16 epochs when
+                costsv=[costsv, ComputeCost(valx,valy,W,b,lambda,gamma,beta,movingMu,movingVar)]; % batchsize is 100
+                xaxis=[xaxis, t];
+            end
+            t=t+1;
+        end
+        seed=randperm(size(X,2)); % shuffle order of data
+        X=X(:,seed);
+        Y=Y(:,seed);
+    end
+end
+% costs=[costs, ComputeCost(X,Y,W,b,lambda)]; % which is 16 epochs when
+% costsv=[costsv, ComputeCost(valx,valy,W,b,lambda)]; % when batchsize is 100
+% xaxis=[xaxis, t];
+
+end
\ No newline at end of file
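The learning rate in MiniBatchGD follows a triangular cyclic schedule: it rises linearly from nmin to nmax over ns update steps and falls back to nmin over the next ns steps. A standalone sketch of the same schedule (not part of the patch), handy for plotting it before training; the values mirror those used in main.m:

% Illustrative plot of the cyclic learning-rate schedule used in MiniBatchGD.
nmin = 1e-6; nmax = 1e-2; ns = 440; lmax = 5;    % example values, as in main.m
nt = zeros(1, 2*ns*(lmax+1));
for t = 0:numel(nt)-1
    l = floor(t / (2*ns));                       % index of the current cycle
    if t < (2*l+1)*ns
        nt(t+1) = nmin + (t - 2*l*ns) * (nmax - nmin) / ns;     % rising half of the cycle
    else
        nt(t+1) = nmax - (t - (2*l+1)*ns) * (nmax - nmin) / ns; % falling half of the cycle
    end
end
figure; plot(0:numel(nt)-1, nt); xlabel('update step t'); ylabel('\eta_t');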
diff --git a/Speech_Recognition/k-layer net w BN/main.m b/Speech_Recognition/k-layer net w BN/main.m
new file mode 100644
index 0000000..6e64fe9
--- /dev/null
+++ b/Speech_Recognition/k-layer net w BN/main.m
@@ -0,0 +1,77 @@
+%% Loading all sets
+[trainX,trainY,trainy]= LoadBatch('../train20.mat');
+[testX,testY,testy]= LoadBatch('../test20.mat');
+
+seed=randperm(size(trainX,2)); % shuffle order of data
+trainX=trainX(:,seed);
+trainY=trainY(:,seed);
+trainy=trainy(seed);
+seed=randperm(size(testX,2)); % shuffle order of data
+testX=testX(:,seed);
+testY=testY(:,seed);
+testy=testy(seed);
+sTest=10000;
+sVal=3000;
+
+% testX1=testX(:,1:sTest); testY1=testY(:,1:sTest); testy1=testy(1:sTest); %take the first sTest samples for the test set
+% testX(:,1:sTest)=[];testY(:,1:sTest)=[];testy(1:sTest)=[]; %remove the test samples from the training set
+%
+% trainX=[trainX testX1]; trainY=[trainY testY1];trainy=[trainy;testy1];
+
+valX=trainX(:,1:sVal); valY=trainY(:,1:sVal); valy=trainy(1:sVal); %take the first sVal samples for the validation set
+trainX(:,1:sVal)=[];trainY(:,1:sVal)=[];trainy(1:sVal)=[]; %remove the validation samples from the training set
+
+%% Hyperparameter settings of network
+m=[100,100,100,100,70,50,50,30];%[30,100,100,100,70,50,20,30];%8.94 %hidden nodes per layer
+d=880; %dimension of the input samples
+lambda=0;
+N=size(trainX,2);
+%%[220,200,180,160,140,130,120,110,100,90,80,70,60,50,20,30]; %hidden nodes per layer
+
+%5*110 b=100 lambda=0.0001 l=10 nmin=1e-3; nmax=1e-1;[30,100,100,100,70,50,20,30]
+%% Check analytically computed gradients against numerically computed (WORKS)
+% steps=1e-5;
+% eps=1e-3;
+% [W,b,gamma,beta] = InitializeWb(m,d);
+% [P,s,shat,mu,v] = EvaluateClassifier(trainX(1:20,1:5),W,b,gamma,beta);
+% [gradW,gradb,gradgamma,gradbeta] = ComputeGradients(trainX(1:20,1:5), trainY(:,1:5),P, s, W,lambda,shat,mu,v,gamma,beta);
+% [gradbNum, gradWNum,gradGNum,gradBNum] = ComputeGradsNumSlow(trainX(1:20,1:5), trainY(:,1:5), W, b,lambda, steps,gamma,beta);
+%
+% for i=1:size(m,2)
+%     relDiffNumb(i)=sum(abs(gradb{i} - gradbNum{i})/max(eps, sum(abs(gradb{i}) + abs(gradbNum{i}))));
+%     relDiffNumW(i)=sum(abs(gradW{i} - gradWNum{i})/max(eps, sum(abs(gradW{i}) + abs(gradWNum{i}))));
+%     centrDiffNumb(i)=sum(sum(abs(gradb{i}-gradbNum{i})));
+%     centrDiffNumW(i)=sum(sum(abs(gradW{i}-gradWNum{i})));
+% end
+%
+% for i=1:size(gamma,2)
+%     relDiffNumG(i)=sum(abs(gradgamma{i} - gradGNum{i})/max(eps, sum(abs(gradgamma{i}) + abs(gradGNum{i}))));
+%     relDiffNumB(i)=sum(abs(gradbeta{i} - gradBNum{i})/max(eps, sum(abs(gradbeta{i}) + abs(gradBNum{i}))));
+%     centrDiffNumG(i)=sum(sum(abs(gradgamma{i}-gradGNum{i})));
+%     centrDiffNumB(i)=sum(sum(abs(gradbeta{i}-gradBNum{i})));
+% end
+
+%% Check mini-batch GD with batch normalization (WORKS)
+nmin=1e-6;
+nmax=1e-2;
+ns=4*110;%5*450; % ns=500 & bsize =100 --> 1 cycle=10 epochs
+l=5;
+lambda=0.0001;
+nbatch=100; %ns=k*10000/100 = k*100
+epochs=(2*ns)/(11000/nbatch);
+cycleparams=[nmin,nmax,ns,nbatch,epochs,l];
+allAcc=[];
+
+
+for lambda=0.0001 %0.002:0.0004:0.004
+    [W,b,gamma,beta] = InitializeWb(m,d);
+    [Wstar,bstar,gstar,bestar,costs,costsv,xaxis]=MiniBatchGD(trainX,trainY,valX,valY,cycleparams,W,b,lambda,gamma,beta);
+    accFinal=ComputeAccuracy(testX,testy,Wstar,bstar,gstar,bestar); %accuracy after training
+    allAcc=[allAcc, accFinal];
+end
+figure
+plot(xaxis,costs)
+ylim([0 4])
+hold on;
+plot(xaxis,costsv);
+legend('training cost','validation cost')
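After the script finishes, allAcc holds one test accuracy per lambda value tried by the loop above. A short sketch (not in the patch) for summarizing a wider lambda search; the lambdas vector is an assumption and must match the range used in the loop:

% Illustrative summary of a lambda search.
lambdas = 0.0001;                 % widen to e.g. 0.002:0.0004:0.004 for a real search
[bestAcc, idx] = max(allAcc);
fprintf('best test accuracy %.4f at lambda = %g\n', bestAcc, lambdas(idx));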