diff --git a/examples/keyword_spotting/README.md b/examples/keyword_spotting/README.md index 3b0c2cec..fdef4915 100644 --- a/examples/keyword_spotting/README.md +++ b/examples/keyword_spotting/README.md @@ -164,8 +164,33 @@ Total Memory cost (Network and NNoM): 15020 msh > ~~~ +## Testing it on PC +You can now test your KWS model on a PC, using KWS test data that is preprocessed by the C MFCC implementation. First, download the raw test data from Google Drive [here](https://drive.google.com/drive/folders/1gS2klWb02YvaoE5UTNDy9SQsS5ZTsvNN?usp=sharing). Then, use main_pc.c as your main file. It prints each model prediction followed by the ground truth, and also prints the cumulative Top-1 accuracy every 100 samples. - +~~~ +0 right : 100% - Ground Truth is: right +1 nine : 100% - Ground Truth is: right +2 right : 100% - Ground Truth is: right +3 right : 100% - Ground Truth is: right +4 right : 100% - Ground Truth is: right +5 right : 100% - Ground Truth is: right +6 right : 100% - Ground Truth is: right +7 right : 100% - Ground Truth is: right +8 right : 100% - Ground Truth is: right +9 forward : 66% - Ground Truth is: right +10 right : 100% - Ground Truth is: right +11 right : 100% - Ground Truth is: right +12 right : 100% - Ground Truth is: right +13 right : 100% - Ground Truth is: right +14 right : 52% - Ground Truth is: right +15 right : 100% - Ground Truth is: right +16 right : 100% - Ground Truth is: right +17 right : 100% - Ground Truth is: right +18 right : 100% - Ground Truth is: right +19 right : 74% - Ground Truth is: right +20 right : 100% - Ground Truth is: right +... 
+~~~ diff --git a/examples/keyword_spotting/main_pc.c b/examples/keyword_spotting/main_pc.c new file mode 100644 index 00000000..1db3db50 --- /dev/null +++ b/examples/keyword_spotting/main_pc.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2018-2020, Jianjia Ma + * + * SPDX-License-Identifier: Apache-2.0 + * + * Change Logs: + * Date Author Notes + * 2019-03-29 Jianjia Ma first implementation + * + * Notes: + * This is a keyword spotting example using NNoM + * + */ + +#include +#include +#include +#include +#include +#include + +#include "nnom.h" +#include "kws_weights.h" + +#include "mfcc.h" +#include "math.h" + + +// NNoM model +nnom_model_t *model; + +// 10 labels-1 +//const char label_name[][10] = {"yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "unknow"}; + +// 10 labels-2 +//const char label_name[][10] = {"marvin", "sheila", "yes", "no", "left", "right", "forward", "backward", "stop", "go", "unknow"}; + +// full 34 labels +const char label_name[][10] = {"backward", "bed", "bird", "cat", "dog", "down", "eight","five", "follow", "forward", + "four", "go", "happy", "house", "learn", "left", "marvin", "nine", "no", "off", "on", "one", "right", + "seven", "sheila", "six", "stop", "three", "tree", "two", "up", "visual", "yes", "zero", "unknow"}; + + + +int16_t audio[512]; +char ground_truth[12000][10]; +#define SAMP_FREQ 16000 +#define AUDIO_FRAME_LEN (512) //31.25ms * 16000hz = 512, // FFT (windows size must be 2 power n) + +mfcc_t * mfcc; +//int32_t audio_data[4000]; //32000/8 +int dma_audio_buffer[AUDIO_FRAME_LEN]; //512 +int16_t audio_buffer_16bit[(int)(AUDIO_FRAME_LEN*1.5)]; // an easy method for 50% overlapping +int audio_sample_i = 0; + +//the mfcc feature for kws +#define MFCC_LEN (62) +#define MFCC_COEFFS_FIRST (1) // ignore the mfcc feature before this number +#define MFCC_COEFFS_LEN (13) // the total coefficient to calculate +#define MFCC_TOTAL_NUM_BANK (26) // total number of filter bands +#define MFCC_COEFFS 
(MFCC_COEFFS_LEN-MFCC_COEFFS_FIRST) + +#define MFCC_FEAT_SIZE (MFCC_LEN * MFCC_COEFFS) +float mfcc_features_f[MFCC_COEFFS]; // output of mfcc +int8_t mfcc_features[MFCC_LEN][MFCC_COEFFS]; // ring buffer +int8_t mfcc_features_seq[MFCC_LEN][MFCC_COEFFS]; // sequencial buffer for neural network input. +uint32_t mfcc_feat_index = 0; + +// msh debugging controls +bool is_print_abs_mean = false; // to print the mean of absolute value of the mfcc_features_seq[][] +bool is_print_mfcc = false; // to print the raw mfcc features at each update +void Error_Handler() +{ + printf("error\n"); +} + +static int32_t abs_mean(int8_t *p, size_t size) +{ + int64_t sum = 0; + for(size_t i = 0; i (y)) ? (x) : (y)) + #define _MIN(x, y) (((x) < (y)) ? (x) : (y)) + float limit = (1 << int_bit); + float d; + for(uint32_t i=0; i(H))?(H):(N))) + int *p_raw_audio; + + + // calculate 13 coefficient, use number #2~13 coefficient. discard #1 + // features, offset, bands, 512fft, 0 preempha, attached_energy_to_band0 + mfcc = mfcc_create(MFCC_COEFFS_LEN, MFCC_COEFFS_FIRST, MFCC_TOTAL_NUM_BANK, AUDIO_FRAME_LEN, 0.97f, true); + + if (audio_sample_i == 15872) + memset(&dma_audio_buffer[128], 0, sizeof(int) * 128); //to fill the latest quarter in the latest frame + p_raw_audio = dma_audio_buffer; + + + // memory move + // audio buffer = | 256 byte old data | 256 byte new data 1 | 256 byte new data 2 | + // ^------------------------------------------| + memcpy(audio_buffer_16bit, &audio_buffer_16bit[AUDIO_FRAME_LEN], (AUDIO_FRAME_LEN/2)*sizeof(int16_t)); + + // convert it to 16 bit. + // volume*4 + for(int i = 0; i < AUDIO_FRAME_LEN; i++) + { + audio_buffer_16bit[AUDIO_FRAME_LEN/2+i] = p_raw_audio[i]; + } + + // MFCC + // do the first mfcc with half old data(256) and half new data(256) + // then do the second mfcc with all new data(512). 
+ // take mfcc buffer + + for(int i=0; i<2; i++) + { + if ((audio_sample_i != 0 || i==1) && (audio_sample_i != 15872 || i==0)) //to skip computing first mfcc block that's half empty + { + mfcc_compute(mfcc, &audio_buffer_16bit[i*AUDIO_FRAME_LEN/2], mfcc_features_f); + + + // quantise them using the same scale as training data (in keras), by 2^n. + quantize_data(mfcc_features_f, mfcc_features[mfcc_feat_index], MFCC_COEFFS, 3); + + // debug only, to print mfcc data on console + if(0) + { + for(int q=0; q= MFCC_LEN) + mfcc_feat_index = 0; + } + + } +} + + + +int main(void) +{ + uint32_t last_mfcc_index = 0; + uint32_t label; + float prob; + audio_sample_i = 0; + int s = 0; //number of audio samples to scan + float acc; + int correct = 0; + FILE * file; + FILE * ground_truth_f; + char str[10]; + int j=0; + int F = 512; + + file = fopen ("test_x.txt","r"); //the audio data stored in a textfile + ground_truth_f = fopen ("test_y.txt","r"); //the ground truth textfile + + while (!feof (ground_truth_f)) + { + fscanf (ground_truth_f, "%s", ground_truth[j]); + j++; + } + fclose (ground_truth_f); + + int p = 0; + + // create and compile the model + model = nnom_model_create(); + + while(1) + { + while (p=16000) + { + // ML + memcpy(nnom_input_data, mfcc_features, MFCC_FEAT_SIZE); + nnom_predict(model, &label, &prob); + + // output + printf("%d %s : %d%% - Ground Truth is: %s\n", s, (char*)&label_name[label], (int)(prob * 100),ground_truth[s]); + if(strcmp(ground_truth[s], label_name[label])==0) correct++; + if(s%100==0 && s > 0) + { + acc = ((float)correct/(s) * 100); + printf("Accuracy : %.6f%%\n",acc); + } + audio_sample_i = 0; + F = 512; + s=s+1; + } + + if(s>=11000) break; + } + acc = ((float)correct/(s) * 100); + printf("Accuracy : %.6f%%\n",acc); + fclose(file); + +} diff --git a/examples/keyword_spotting/mfcc.c b/examples/keyword_spotting/mfcc.c index efee1a59..6e16aea4 100644 --- a/examples/keyword_spotting/mfcc.c +++ b/examples/keyword_spotting/mfcc.c @@ -34,7 +34,7 
@@ #ifndef MFCC_PLATFORM_ARM // FFT code from arduino_fft: https://github.com/lloydroc/arduino_fft -// change to float dataŁ¬ modify to fit within this file +// change to float dataŁ¬ modify to fit within this file // see the above link for license( MIT license). #include #include @@ -108,16 +108,16 @@ static void *mfcc_malloc(size_t size) memset(p, 0, size); return p; } - + static void mfcc_free(void*p){ if(p!=NULL) free(p); } -mfcc_t *mfcc_create(int num_mfcc_features, int feature_offset, int num_fbank, int frame_len, float preempha, int is_append_energy) +mfcc_t *mfcc_create(int num_mfcc_features, int feature_offset, int num_fbank, int frame_len, float preempha, int is_append_energy) { mfcc_t * mfcc; mfcc = mfcc_malloc(sizeof(mfcc_t)); - + mfcc->num_mfcc_features = num_mfcc_features; mfcc->num_features_offset = feature_offset; mfcc->num_fbank = num_fbank; @@ -132,7 +132,7 @@ mfcc_t *mfcc_create(int num_mfcc_features, int feature_offset, int num_fbank, in mfcc->buffer = mfcc_malloc(sizeof(float)* mfcc->frame_len_padded); mfcc->mel_energies = mfcc_malloc(sizeof(float)*mfcc->num_fbank ); - //create window function, hanning + //create window function, hanning mfcc->window_func = mfcc_malloc(sizeof(float)*frame_len); for (int i = 0; i < frame_len; i++) mfcc->window_func[i] = 0.5f - 0.5f*cosf((float)M_2PI * ((float)i) / (frame_len)); @@ -178,7 +178,7 @@ void mfcc_delete(mfcc_t* mfcc) mfcc_free(mfcc); } -float * create_dct_matrix(int32_t input_length, int32_t coefficient_count) +float * create_dct_matrix(int32_t input_length, int32_t coefficient_count) { int32_t k, n; float * M = mfcc_malloc(sizeof(float) * input_length * coefficient_count); @@ -188,9 +188,9 @@ float * create_dct_matrix(int32_t input_length, int32_t coefficient_count) #else normalizer = sqrtf(2.0f/(float)input_length); #endif - for (k = 0; k < coefficient_count; k++) + for (k = 0; k < coefficient_count; k++) { - for (n = 0; n < input_length; n++) + for (n = 0; n < input_length; n++) { 
M[k*input_length+n] = normalizer * cosf( ((float)M_PI)/input_length * (n + 0.5f) * k ); } @@ -198,75 +198,46 @@ float * create_dct_matrix(int32_t input_length, int32_t coefficient_count) return M; } -float ** create_mel_fbank(mfcc_t *mfcc) { - - int32_t bin, i; - - int32_t num_fft_bins = mfcc->frame_len_padded/2; - float fft_bin_width = ((float)SAMP_FREQ) / mfcc->frame_len_padded; - float mel_low_freq = MelScale(MEL_LOW_FREQ); - float mel_high_freq = MelScale(MEL_HIGH_FREQ); - float mel_freq_delta = (mel_high_freq - mel_low_freq) / (mfcc->num_fbank +1); - - float *this_bin = mfcc_malloc(sizeof(float) * num_fft_bins); - - float ** mel_fbank = mfcc_malloc(sizeof(float*) * mfcc->num_fbank); - - for (bin = 0; bin < mfcc->num_fbank ; bin++) { - - float left_mel = mel_low_freq + bin * mel_freq_delta; - float center_mel = mel_low_freq + (bin + 1) * mel_freq_delta; - float right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; - - int32_t first_index = -1, last_index = -1; - - for (i = 0; i < num_fft_bins; i++) { - - float freq = (fft_bin_width * i); // center freq of this fft bin. 
- float mel = MelScale(freq); - this_bin[i] = 0.0; - - if (mel > left_mel && mel < right_mel) { - float weight; - if (mel <= center_mel) { - weight = (mel - left_mel) / (center_mel - left_mel); - } else { - weight = (right_mel-mel) / (right_mel-center_mel); - } - this_bin[i] = weight; - if (first_index == -1) - first_index = i; - last_index = i; - } - } - - mfcc->fbank_filter_first[bin] = first_index; - mfcc->fbank_filter_last[bin] = last_index; - //size = size + size % 16; - mel_fbank[bin] = mfcc_malloc(sizeof(float) * ((size_t)last_index - first_index + 1)); - - int32_t j = 0; - //copy the part we care about - for (i = first_index; i <= last_index; i++) { - mel_fbank[bin][j++] = this_bin[i]; - } - } - mfcc_free(this_bin); - return mel_fbank; +float ** create_mel_fbank(mfcc_t *mfcc) { + + // compute points evenly spaced in mels + float mel_low_freq = MelScale(MEL_LOW_FREQ); + float mel_high_freq = MelScale(MEL_HIGH_FREQ); + float mel_freq_delta = (mel_high_freq - mel_low_freq) / (mfcc->num_fbank +1); + + float * bin = mfcc_malloc(sizeof(float) * mfcc->num_fbank+2); + for (int i=0; inum_fbank+2; i++) + { + bin[i] = mel_low_freq + mel_freq_delta*i; + bin[i] = floor((mfcc->frame_len_padded+1)*InverseMelScale(bin[i])/SAMP_FREQ); + } + + float ** mel_fbank = mfcc_malloc(sizeof(float*) * mfcc->num_fbank); + + for (int j=0; jnum_fbank; j++) { + mel_fbank[j] = mfcc_malloc(sizeof(float) * (mfcc->frame_len_padded/2+1)); + for (int i=(int)bin[j]; i<(int)bin[j+1]; i++) + mel_fbank[j][i] = (i - bin[j]) / (bin[j+1]-bin[j]); + for (int i=(int)bin[j+1]; i<(int)bin[j+2]; i++) + mel_fbank[j][i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1]); + } + + mfcc_free(bin); + return mel_fbank; + } -void mfcc_compute(mfcc_t *mfcc, const int16_t * audio_data, float* mfcc_out) +void mfcc_compute(mfcc_t *mfcc, const int16_t * audio_data, float* mfcc_out) { int32_t i, j, bin; - //1. TensorFlow way of normalizing .wav data to (-1,1) and 2. do pre-emphasis. 
- float last = (float)audio_data[0] / (1 << 15); - mfcc->frame[0] = last; + //1. TensorFlow way of normalizing .wav data to (-1,1) and 2. do pre-emphasis. + float last = (float)audio_data[0]; + mfcc->frame[0] = last / (1 << 15); for (i = 1; i < mfcc->frame_len; i++) { mfcc->frame[i] = ((float)audio_data[i] - last * mfcc->preempha) / (1<<15); last = (float)audio_data[i]; } - //Fill up remaining with zeros if(mfcc->frame_len_padded - mfcc->frame_len) memset(&mfcc->frame[mfcc->frame_len], 0, sizeof(float) * (mfcc->frame_len_padded - mfcc->frame_len)); @@ -291,8 +262,8 @@ void mfcc_compute(mfcc_t *mfcc, const int16_t * audio_data, float* mfcc_out) mfcc->buffer[i] = real*real + im*im; } mfcc->buffer[0] = first_energy; - mfcc->buffer[half_dim] = last_energy; - + mfcc->buffer[half_dim] = last_energy; + #else // end of ARM_fft // not yet optimized for memory float *data_re = mfcc->fft_buffer; @@ -304,54 +275,44 @@ void mfcc_compute(mfcc_t *mfcc, const int16_t * audio_data, float* mfcc_out) fft(data_re, data_im, mfcc->frame_len_padded); // only need half (N/2+1) for (int i = 0; i <= mfcc->frame_len_padded/2; i++) { - mfcc->buffer[i] = data_re[i] * data_re[i] + data_im[i]* data_im[i]; + mfcc->buffer[i] = (data_re[i] * data_re[i] + data_im[i]* data_im[i])/mfcc->frame_len_padded; } #endif float sqrt_data; //Apply mel filterbanks - for (bin = 0; bin < mfcc->num_fbank ; bin++) - { - j = 0; + for (bin = 0; bin < mfcc->num_fbank ; bin++) + { float mel_energy = 0; - int32_t first_index = mfcc->fbank_filter_first[bin]; - int32_t last_index = mfcc->fbank_filter_last[bin]; - for (i = first_index; i <= last_index; i++) { - mel_energy += mfcc->buffer[i] * mfcc->mel_fbank[bin][j++]; + for (i = 0; i < mfcc->frame_len_padded/2+1; i++) { + mel_energy += mfcc->buffer[i] * mfcc->mel_fbank[bin][i]; } - mfcc->mel_energies[bin] = mel_energy / mfcc->frame_len_padded; + mfcc->mel_energies[bin] = mel_energy; //avoid log of zero if (mel_energy == 0.0f) mfcc->mel_energies[bin] = FLT_MIN; - } + } 
//Take log float total_energy = 0; for (bin = 0; bin < mfcc->num_fbank; bin++) { total_energy += mfcc->mel_energies[bin]; - mfcc->mel_energies[bin] = logf(mfcc->mel_energies[bin]); + mfcc->mel_energies[bin] = logf(mfcc->mel_energies[bin]); } - //Take DCT. Uses matrix mul. int out_index = 0; - for (i = mfcc->num_features_offset; i < mfcc->num_mfcc_features; i++) + for (i = mfcc->num_features_offset; i < mfcc->num_mfcc_features; i++) { float sum = 0.0; - for (j = 0; j < mfcc->num_fbank ; j++) + for (j = 0; j < mfcc->num_fbank ; j++) { sum += mfcc->dct_matrix[i*mfcc->num_fbank +j] * mfcc->mel_energies[j]; } mfcc_out[out_index] = sum; out_index ++; - } - - // whether replace the first energy by log of total energy - if (mfcc->is_append_energy) - { - mfcc_out[0] = logf(total_energy); - } + } } diff --git a/examples/keyword_spotting/mfcc.h b/examples/keyword_spotting/mfcc.h index dfef67a2..d19a14eb 100644 --- a/examples/keyword_spotting/mfcc.h +++ b/examples/keyword_spotting/mfcc.h @@ -23,7 +23,7 @@ #define __MFCC_H__ -// in main.c define "PLATFORM_ARM" before including 'mfcc.h' to use ARM optimized FFT +// in main.c define "PLATFORM_ARM" before including 'mfcc.h' to use ARM optimized FFT #ifdef PLATFORM_ARM #include "arm_math.h" #define MFCC_PLATFORM_ARM @@ -63,14 +63,14 @@ typedef struct _mfcc_t{ } mfcc_t; static inline float InverseMelScale(float mel_freq) { - return 700.0f * (expf (mel_freq / 1127.0f) - 1.0f); + return 700.0f * (pow(10,(mel_freq / 2595.0f)) - 1.0f); } static inline float MelScale(float freq) { - return 1127.0f * logf (1.0f + freq / 700.0f); + return 2595.0f * log10(1.0f + freq / 700.0f); } -float * create_dct_matrix(int32_t input_length, int32_t coefficient_count); +float * create_dct_matrix(int32_t input_length, int32_t coefficient_count); float ** create_mel_fbank(mfcc_t* mfcc); mfcc_t *mfcc_create(int num_mfcc_features, int feature_offset, int num_fbank, int frame_len, float preempha, int is_append_energy); diff --git 
a/examples/keyword_spotting/mfcc.py b/examples/keyword_spotting/mfcc.py index 0a046d79..a7906f07 100644 --- a/examples/keyword_spotting/mfcc.py +++ b/examples/keyword_spotting/mfcc.py @@ -27,8 +27,7 @@ def generate_mfcc(sig, rate, sig_len, noise=None, noise_weight=0.1, winlen=0.032 if(len(sig) >sig_len): sig = sig[0:sig_len] # i dont know, 'tensorflow' normalization - sig = sig.astype('float') / 32768 - + sig = sig.astype('float32') / 32768 if(noise is not None): noise = noise[random.randint(0, len(noise)-1)] # pick a noise start = random.randint(0, len(noise)-sig_len) # pick a sequence @@ -59,7 +58,6 @@ def merge_mfcc_file(input_path='dat/', mix_noise=True, sig_len=16000, winlen=0.0 test_list = f.read() with open(input_path + 'validation_list.txt', 'r') as f: validate_list = f.read() - files = os.listdir(input_path) for fi in files: fi_d = os.path.join(input_path, fi) @@ -68,6 +66,7 @@ def merge_mfcc_file(input_path='dat/', mix_noise=True, sig_len=16000, winlen=0.0 label = fi_d.split('/')[1] # get the label from the dir print(label) # noise in training + if 'noise' in label: for f in os.listdir(fi_d): filename = f @@ -76,6 +75,7 @@ def merge_mfcc_file(input_path='dat/', mix_noise=True, sig_len=16000, winlen=0.0 f = os.path.join(fi_d, f) (rate, sig) = wav.read(f) for i in range(0, len(sig), sig_len): + data = generate_mfcc(sig[i:i+sig_len], rate, sig_len, winlen=winlen, winstep=winstep, numcep=numcep, nfilt=nfilt, nfft=nfft, lowfreq=lowfreq, highfreq=highfreq, winfunc=winfunc, ceplifter=ceplifter, preemph=preemph) @@ -87,22 +87,31 @@ def merge_mfcc_file(input_path='dat/', mix_noise=True, sig_len=16000, winlen=0.0 # dataset for f in os.listdir(fi_d): filename = f + f = os.path.join(fi_d, f) (rate, sig) = wav.read(f) - data = generate_mfcc(sig, rate, sig_len, noise=noise, winlen=winlen, winstep=winstep, numcep=numcep, nfilt=nfilt, nfft=nfft, lowfreq=lowfreq, - highfreq=highfreq, winfunc=winfunc, ceplifter=ceplifter, preemph=preemph) - data = np.array(data) # ?? 
no idea why this works # split dataset into train, test, validate + if filename in test_list: + data = generate_mfcc(sig, rate, sig_len, winlen=winlen, winstep=winstep, numcep=numcep, nfilt=nfilt, nfft=nfft, lowfreq=lowfreq, highfreq=highfreq, winfunc=winfunc, ceplifter=ceplifter, preemph=preemph) + data = np.array(data) # ?? no idea why this works + test_data.append(data) test_label.append(label) + elif filename in validate_list: + data = generate_mfcc(sig, rate, sig_len, winlen=winlen, winstep=winstep, numcep=numcep, nfilt=nfilt, nfft=nfft, lowfreq=lowfreq, highfreq=highfreq, winfunc=winfunc, ceplifter=ceplifter, preemph=preemph) + data = np.array(data) # ?? no idea why this works validate_data.append(data) validate_label.append(label) else: + data = generate_mfcc(sig, rate, sig_len, noise=noise, winlen=winlen, winstep=winstep, numcep=numcep, nfilt=nfilt, nfft=nfft, lowfreq=lowfreq, highfreq=highfreq, winfunc=winfunc, ceplifter=ceplifter, preemph=preemph) + data = np.array(data) # ?? no idea why this works train_data.append(data) train_lable.append(label) + + # finalize train_data = np.array(train_data) @@ -116,14 +125,14 @@ def merge_mfcc_file(input_path='dat/', mix_noise=True, sig_len=16000, winlen=0.0 # test (x_train, y_train), (x_test, y_test), (x_val, y_val) = merge_mfcc_file() - + np.save('train_data.npy', x_train) np.save('train_label.npy', y_train) np.save('test_data.npy', x_test) np.save('test_label.npy', y_test) np.save('val_data.npy', x_val) np.save('val_label.npy', y_val) - + print('x_train shape:', x_train.shape, 'max', x_train.max(), 'min', x_train.min()) mfcc_feat = x_train[3948] diff --git a/examples/keyword_spotting/python_speech_features/base.py b/examples/keyword_spotting/python_speech_features/base.py index acf72444..1afe8ee0 100644 --- a/examples/keyword_spotting/python_speech_features/base.py +++ b/examples/keyword_spotting/python_speech_features/base.py @@ -44,10 +44,13 @@ def 
mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, """ nfft = nfft or calculate_nfft(samplerate, winlen) feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc) + #print(type(feat[0][0])) + feat = numpy.log(feat) - feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] + feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] feat = lifter(feat,ceplifter) if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy + return feat def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, @@ -70,15 +73,15 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, """ highfreq= highfreq or samplerate/2 signal = sigproc.preemphasis(signal,preemph) - frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) + frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) pspec = sigproc.powspec(frames,nfft) + + energy = numpy.sum(pspec,1) # this stores the total energy in each frame energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log - fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) feat = numpy.dot(pspec,fb.T) # compute the filterbank energies feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log - return feat,energy def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, @@ -136,7 +139,7 @@ def hz2mel(hz): :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. :returns: a value in Mels. If an array was passed in, an identical sized array is returned. """ - return 2595 * numpy.log10(1+hz/700.) 
+ return 2595 * numpy.log10(1+hz/700.).astype('float32') def mel2hz(mel): """Convert a value in Mels to Hertz @@ -163,12 +166,12 @@ def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): # compute points evenly spaced in mels lowmel = hz2mel(lowfreq) highmel = hz2mel(highfreq) - melpoints = numpy.linspace(lowmel,highmel,nfilt+2) + melpoints = numpy.linspace(lowmel,highmel,nfilt+2).astype('float32') # our points are in Hz, but we use fft bins, so we have to convert # from Hz to fft bin number - bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate) + bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate).astype('float32') - fbank = numpy.zeros([nfilt,nfft//2+1]) + fbank = numpy.zeros([nfilt,nfft//2+1]).astype('float32') for j in range(0,nfilt): for i in range(int(bin[j]), int(bin[j+1])): fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j]) diff --git a/examples/keyword_spotting/python_speech_features/sigproc.py b/examples/keyword_spotting/python_speech_features/sigproc.py index a786c4fb..9aba4ea9 100644 --- a/examples/keyword_spotting/python_speech_features/sigproc.py +++ b/examples/keyword_spotting/python_speech_features/sigproc.py @@ -38,10 +38,13 @@ def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), str padlen = int((numframes - 1) * frame_step + frame_len) - zeros = numpy.zeros((padlen - slen,)) + zeros = numpy.zeros((padlen - slen,)).astype('float32') padsignal = numpy.concatenate((sig, zeros)) if stride_trick: - win = winfunc(frame_len) + win = winfunc(frame_len).astype('float32') + for i in range (0,512): + win[i] = 0.5 - 0.5*numpy.cos(numpy.pi*2 * (i / 512)) + win = win.astype('float32') frames = rolling_window(padsignal, window=frame_len, step=frame_step) else: indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( @@ -49,7 +52,6 @@ def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), str indices = numpy.array(indices, dtype=numpy.int32) frames = 
padsignal[indices] win = numpy.tile(winfunc(frame_len), (numframes, 1)) - return frames * win @@ -100,7 +102,7 @@ def magspec(frames, NFFT): 'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', numpy.shape(frames)[1], NFFT) complex_spec = numpy.fft.rfft(frames, NFFT) - return numpy.absolute(complex_spec) + return numpy.absolute(complex_spec).astype('float32') def powspec(frames, NFFT): @@ -110,7 +112,7 @@ def powspec(frames, NFFT): :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. """ - return 1.0 / NFFT * numpy.square(magspec(frames, NFFT)) + return 1.0 / NFFT * numpy.square(magspec(frames, NFFT)).astype('float32') def logpowspec(frames, NFFT, norm=1): @@ -123,7 +125,7 @@ def logpowspec(frames, NFFT, norm=1): """ ps = powspec(frames, NFFT); ps[ps <= 1e-30] = 1e-30 - lps = 10 * numpy.log10(ps) + lps = 10 * numpy.log10(ps).astype('float32') if norm: return lps - numpy.max(lps) else: @@ -137,4 +139,4 @@ def preemphasis(signal, coeff=0.95): :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. :returns: the filtered signal. """ - return numpy.append(signal[0], signal[1:] - coeff * signal[:-1]) + return numpy.append(signal[0], signal[1:] - coeff * signal[:-1]).astype('float32')