Sophie

Sophie

distrib > Mandriva > cooker > i586 > by-pkgid > a3f7190ce7be4b857b18774d441886bf > files > 38

cvoicecontrol-debug-0.9-0.alpha.7mdv2011.0.i586.rpm

/***************************************************************************
                          preprocess.h  -  Preprocessing of a wave file for
                          								 speech recognition
                             -------------------
    begin                : Sat Feb 12 2000
    copyright            : (C) 2000 by Daniel Kiecza
    email                : daniel@kiecza.de
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#ifndef PREPROCESS_H
#define PREPROCESS_H

/********************************************************************************
 * fft_size        Size of short-time window
 * fft_size_char   Size of short-time window (in number of char values)
 *
 * fft_size is size in number of 16-bit samples,
 * a 16-bit sample is a two-byte value, thus:
 * fft_size_char == 2 * fft_size
 *
 * as the power spectrum is symmetric, we have:
 *   power_spec_size = 0.5 * fft_size
 ********************************************************************************/

/* Length of the short-time analysis window, in 16-bit samples. */
#define FFT_SIZE        256
/* The power spectrum is symmetric, so only half of the window is kept. */
#define POWER_SPEC_SIZE (FFT_SIZE / 2)
/* The same window measured in bytes: two bytes per 16-bit sample. */
#define FFT_SIZE_CHAR   (FFT_SIZE * 2)
/* The Hamming window covers exactly one analysis window. */
#define HAMMING_SIZE    FFT_SIZE
/* Number of coefficients in one feature vector. */
#define VECSIZE         16
#define FEAT_VEC_SIZE   VECSIZE

/********************************************************************************
 * Offset = 10ms (160 16bit-values = 320 uchars)
 *
 * the distance that lies between two audio frames
 * generally, it is  offset < fft_size, and thus
 * subsequent frames do overlap!
 ********************************************************************************/

/* Frame step in bytes: 160 16-bit samples, two bytes each. */
#define OFFSET          (160 * 2)

/*****
  Table used to reduce the short-time spectrum to mel scale
  coefficients.
  It holds 17 entries; presumably these are the VECSIZE + 1 band
  edges of the mel filters -- TODO confirm against the implementation.

  NOTE(review): this is a tentative definition placed in a header, so
  every source file that includes the header emits the object. Linkers
  accept that only when common symbols are enabled (GCC >= 10 defaults
  to -fno-common). Consider declaring it 'extern' here and defining it
  in exactly one .c file.
  *****/
int   filter_banks[17];

/*****
  Buffer that contains the power spectrum
  (POWER_SPEC_SIZE float values).

  NOTE(review): tentative definition in a header; it links only when
  common symbols are enabled. Consider 'extern' here plus a single
  definition in one .c file.
  *****/
float power_spec[POWER_SPEC_SIZE];

/********************************************************************************
 * Hamming window, width = 16ms (256 samples)
 * (hamming_size == fft_size)
 *
 * NOTE(review): 16ms for 256 samples presumably assumes a 16 kHz sampling
 * rate -- confirm against the recording setup.
 *
 * A window function that each frame of audio values is multiplied with.
 * This smooths the beginning and the end of a frame, i.e. it reduces the
 * discontinuities at the two ends and, as a result, the number of
 * artefacts in the power spectrum.
 *
 * NOTE(review): tentative definition in a header; it links only when common
 * symbols are enabled. Consider 'extern' here plus a single definition in
 * one .c file.
 ********************************************************************************/

float hamming_window[HAMMING_SIZE];

/*****
  The characteristics of the recording channel.
  channel_mean is subtracted from each feature vector to reduce
  channel effects; it holds one value per feature vector component
  (FEAT_VEC_SIZE entries).
  do_mean_sub is presumably the switch that enables this subtraction
  (non-zero = on) -- TODO confirm against the implementation.

  NOTE(review): both objects are tentative definitions in a header;
  they link only when common symbols are enabled. Consider 'extern'
  here plus a single definition in one .c file.
  *****/
int   do_mean_sub;
float channel_mean[FEAT_VEC_SIZE];


/*****
  Public interface of the preprocessing module.

  The parameterless functions are declared with (void) so that the
  compiler has a real prototype and rejects calls that pass arguments;
  an empty parameter list would switch that checking off.

  initPreprocess   sets the module up; takes no arguments.
                   NOTE(review): the meaning of the int return value is
                   not visible in this header -- confirm in preprocess.c.
  preprocessFrame  processes one frame of audio values.
                   frame:  input values; presumably FFT_SIZE floats
                           -- TODO confirm.
                   result: output; presumably FEAT_VEC_SIZE floats
                           -- TODO confirm.
                   NOTE(review): return value semantics are not visible
                   in this header.
  endPreprocess    shuts the module down; takes no arguments and
                   returns nothing.
  *****/
int  initPreprocess(void);
int  preprocessFrame(float *frame, float *result);
void endPreprocess(void);

#endif