Tasmota/lib/libesp32_ml/mel_freq_extractor/src/mfcc.h
Christian Baars 83f039cdf7
Berry: add tensorflow lite for microcontrollers (#18119)
Co-authored-by: Christian Baars <christianbaars@MacBook-Pro-von-Christian.local>
2023-03-05 16:46:18 +01:00

315 lines
10 KiB
C++

/*
mfcc.h - mel frequency extractor for ESP32
Computes features for slizes of audio data similiar to speechpy
This is intended to provide a stripped down implementation that can work with Edgempulse trained models
based on:
https://github.com/astorfi/speechpy
https://github.com/AIWintermuteAI/Speech-to-Intent-Micro/blob/main/inference_code/Wio_Terminal/wio_speech_to_intent_150_10/mfcc.cpp
Copyright (C) 2022 Christian Baars
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.@
*/
#ifndef MELFREQUENCYEXTRACTOR_H
#define MELFREQUENCYEXTRACTOR_H
#include <Arduino.h>
#include <string.h>
#include "float.h"
#include "esp_dsp.h"
class MFCC{
private:
int num_mfcc_features;
int frame_len;
int frame_len_padded;
int num_bank_bins;
float * m_frame;
float * m_buffer;
float * m_mel_energies;
float * m_dct_matrix;
float * m_mel_fbank;
uint8_t m_amplification;
float m_preemphasis;
float * create_dct_matrix(int32_t input_length, int32_t coefficient_count);
void create_mel_filterbank(int samp_freq, int low_freq, int high_freq);
static inline float InverseMelScale(float mel_freq) {
return 700.0f * (expf (mel_freq / 1127.0f) - 1.0f);
}
static inline float MelScale(float freq) {
return 1127.0f * logf (1.0f + freq / 700.0f);
}
public:
MFCC(int num_mfcc_features, int frame_len,int num_bank_bins, int samp_freq, int low_freq, int high_freq);
~MFCC();
void set_preamp(uint8_t amplification);
void set_preemphasis(float preemphasis);
void mfcc_compute(const int16_t* data, float* mfcc_out);
void log10_normalize(float* out_buf, int out_buf_len, int noise_floor_db);
};
MFCC::MFCC(int num_mfcc_features, int frame_len, int num_bank_bins, int samp_freq, int low_freq, int high_freq)
:num_mfcc_features(num_mfcc_features),
frame_len(frame_len),
num_bank_bins(num_bank_bins)
{
// Round-up to nearest power of 2.
frame_len_padded = pow(2,ceil((log(frame_len)/log(2))));
m_frame = new float[frame_len_padded];
m_buffer = new float[frame_len_padded * 2];
m_mel_energies = new float[num_bank_bins];
//create window function
// window_func = new float[frame_len];
// dsps_wind_hann_f32(window_func, frame_len);
m_amplification = 1;
m_preemphasis = 0.0;
//create mel filterbank
create_mel_filterbank(samp_freq, low_freq, high_freq);
//create DCT matrix for mfcc mode
if(num_mfcc_features != 0){
m_dct_matrix = create_dct_matrix(num_bank_bins, num_mfcc_features);
}
//initialize FFT
int ret = dsps_fft2r_init_fc32(NULL, frame_len_padded);
if(ret==0){
MicroPrintf("Framelength: %u, (rounded: %u)", frame_len,frame_len_padded);
}
else{
MicroPrintf("dsps_fft2r_init_fc32 error: %d",ret);
}
}
MFCC::~MFCC() {
delete []m_frame;
delete []m_buffer;
delete []m_mel_energies;
// delete []window_func;
delete []m_dct_matrix;
// for(int i=0;i<num_bank_bins;i++)
// delete mel_fbank[i];
delete [] m_mel_fbank;
dsps_fft2r_deinit_fc32();
}
float * MFCC::create_dct_matrix(int32_t input_length, int32_t coefficient_count) {
int32_t k, n;
float * M = new float[input_length*coefficient_count];
float normalizer = sqrt(2.0/(float)input_length);
for (k = 0; k < coefficient_count; k++) {
for (n = 0; n < input_length; n++) {
M[k*input_length+n] = normalizer * cos( ((double)M_PI)/input_length * (n + 0.5) * k );
}
}
return M;
}
void MFCC::create_mel_filterbank(int samp_freq, int low_freq, int high_freq) {
// MicroPrintf("Create FB ...");
int coefficients = frame_len_padded/2 + 1;
m_mel_fbank = new float[num_bank_bins * coefficients](); // zero-init
uint32_t delta = (MelScale(high_freq) - MelScale(low_freq))/(num_bank_bins + 1);
float mels[num_bank_bins+2];
float hertz[num_bank_bins+2];
int freq_index[num_bank_bins+2];
for (int i = 0;i<num_bank_bins+2;i++){
mels[i] = MelScale(low_freq) + (i * delta);
hertz[i] = InverseMelScale(mels[i]);
freq_index[i] = (coefficients + 1) * hertz[i] /samp_freq;
}
for (int i = 0;i<num_bank_bins;i++ ){
int left = int(freq_index[i]);
int middle = int(freq_index[i + 1]);
int right = int(freq_index[i + 2]);
float slope_up = 1/float(middle-left);
float slope_down = 1/float(right-middle);
// MicroPrintf("%u %u %u %f %f",left,middle,right,slope_up,slope_down);
for (int j = 0;j<(right-left+1);j++){
if (j < middle-left + 1){
m_mel_fbank[(i*coefficients)+left+j] = j*slope_up;
}
else{
m_mel_fbank[(i*coefficients)+left+j] = m_mel_fbank[(i*coefficients)+left+j-1]-slope_down;
}
}
}
// MicroPrintf("%f %f %f %f %f %f %f %f ",m_mel_fbank[5],m_mel_fbank[6] ,m_mel_fbank[7] ,m_mel_fbank[8] ,m_mel_fbank[9] ,m_mel_fbank[10] ,m_mel_fbank[11] ,m_mel_fbank[12]);
// MicroPrintf("FB done");
}
void MFCC::log10_normalize(float* out_buf, int out_buf_len, int noise_floor_db) {
const float noise = static_cast<float>(noise_floor_db * -1);
const float noise_scale = 1.0f / (static_cast<float>(noise_floor_db * -1) + 12.0f);
for (size_t ix = 0; ix < out_buf_len; ix++) {
float f = out_buf[ix];
if (f < 1e-30) {
out_buf[ix] = 0;
return;
}
f = 10.0f * log10(f); // scale by 10
f += noise;
f *= noise_scale;
// clip again
if (f < 0.0f) f = 0.0f;
else if (f > 1.0f) f = 1.0f;
out_buf[ix] = f;
}
}
void MFCC::set_preamp(uint8_t amplification){
m_amplification = amplification;
}
void MFCC::set_preemphasis(float preemphasis){
m_preemphasis = preemphasis;
// Speechpy computes this over the window of a sample, here we will compute only over the slize !!
}
void MFCC::mfcc_compute(const int16_t * audio_data, float* mfcc_out) {
int32_t i, j, bin;
int coefficients = frame_len_padded/2 + 1;
int data_clipped = 0;
int data_clipped_low = 0;
float conv_factor = m_amplification;
float clip_thres = 0.99f * (float)(1<<15);
// MicroPrintf("%d %d %d %d %d %d %d %d",audio_data[0],audio_data[1] ,audio_data[2] ,audio_data[3] ,audio_data[4] ,audio_data[5] ,audio_data[6] ,audio_data[7]);
//TensorFlow way of normalizing .wav data to (-1,1) for speechpy's MFE
if(num_mfcc_features == 0){
conv_factor /= (float)(1<<15);
clip_thres /= (float)(1<<15);
}
for (int i = 0; i < frame_len; i++) {
m_buffer[i] = audio_data[i] * conv_factor; //mfe -1..1, mfcc int16_t as float, both with additional pre_amp factor
}
if(m_buffer[i]> clip_thres){
m_buffer[i] /= m_amplification;
data_clipped++;
}
else if( m_buffer[i]< -clip_thres){
m_buffer[i] /= m_amplification;
data_clipped_low++;
}
if(data_clipped>0)
MicroPrintf("Clip: %d __ %d",data_clipped, data_clipped_low);
// MicroPrintf("%f %f %f %f %f %f %f %f ",m_buffer[0],m_buffer[1] ,m_buffer[2] ,m_buffer[3] ,m_buffer[4] ,m_buffer[5] ,m_buffer[6] ,m_buffer[7]);
//pre-emphasis
if(m_preemphasis!=0.0){
m_frame[0] = m_buffer[0] - m_preemphasis * m_buffer[frame_len - 1]; // roll through the frame "back" to the end
for (i = 1; i < frame_len; i++){
m_frame[i] = m_buffer[i] - m_preemphasis * m_buffer[i - 1];
}
}
else{
for (i = 1; i < frame_len; i++){
m_frame[i] = m_buffer[i];
}
}
// prepare buffer for FFT
for (i = 0; i < frame_len_padded; i++) {
m_buffer[i * 2] = i<frame_len ? m_frame[i] : 0;
// m_buffer[i * 2] = i<frame_len ? frame[i] * window_func[i] : 0; // in case we want to use a window function
m_buffer[i*2 + 1] = 0;
}
// MicroPrintf("%f %f %f %f %f %f %f %f ",frame[0],frame[1] ,frame[2] ,frame[3] ,frame[4] ,frame[5] ,frame[6] ,frame[7]);
//Compute FFT
int err = dsps_fft2r_fc32(m_buffer, frame_len_padded);
err += dsps_bit_rev_fc32(m_buffer, frame_len_padded); //Bit reverse
// err += dsps_cplx2reC_fc32(m_buffer, frame_len_padded);// Complex spectrum in y_cf
if(err!=0){
MicroPrintf("dsps_fft2r error: %u %f %f %f %f %f %f %f %f ",err,m_buffer[0],m_buffer[1] ,m_buffer[2] ,m_buffer[3] ,m_buffer[4] ,m_buffer[5] ,m_buffer[512] ,m_buffer[513]);
}
for (int i = 0 ; i < coefficients ; i++) {
m_buffer[i] = (m_buffer[i*2] * m_buffer[i*2] + m_buffer[i*2 + 1] * m_buffer[i*2 + 1])/frame_len_padded;
}
// MicroPrintf(" pow spec: %f %f %f %f %f %f %f %f ",m_buffer[0],m_buffer[1] ,m_buffer[2] ,m_buffer[3] ,m_buffer[4] ,m_buffer[5] ,m_buffer[255] ,m_buffer[256]);
//Apply mel filterbanks
for (int i = 0;i<num_bank_bins;i++ ){
m_mel_energies[i] = 0;
for (int j = 0;j<coefficients;j++ ){
m_mel_energies[i] += m_buffer[j] * m_mel_fbank[(i * coefficients) + j];
}
}
// for MFE copy and return - compute 10 * log10() later explicitely
if(num_mfcc_features == 0){
for (bin = 0; bin < num_bank_bins; bin++){
mfcc_out[bin] = m_mel_energies[bin];
}
// MicroPrintf("%u feat: %f %f %f %f %f %f %f %f ",num_bank_bins,mfcc_out[0],mfcc_out[1] ,mfcc_out[2] ,mfcc_out[3] ,mfcc_out[4] ,mfcc_out[5] ,mfcc_out[6] ,mfcc_out[7]);
return;
}
// Continue for MFCC
// Take log
for (bin = 0; bin < num_bank_bins; bin++){
m_mel_energies[bin] = logf(m_mel_energies[bin]);
}
// Take DCT. Uses matrix mul.
for (i = 1; i < num_mfcc_features; i++) {
float sum = 0.0;
for (j = 0; j < num_bank_bins; j++) {
sum += m_dct_matrix[i*num_bank_bins+j] * m_mel_energies[j];
}
mfcc_out[i] = sum;
}
// replace first cepstral coefficient with log of frame energy for DC elimination
for (i=0; i<frame_len_padded; i++){
mfcc_out[0] += m_buffer[i];
}
mfcc_out[0] = logf(mfcc_out[0]);
}
#endif //MELFREQUENCYEXTRACTOR_H