<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> <title>SphinxBase: src/sphinx_lmtools/sphinx_lm_eval.c Source File</title> <link href="tabs.css" rel="stylesheet" type="text/css"/> <link href="navtree.css" rel="stylesheet" type="text/css"/> <script type="text/javascript" src="jquery.js"></script> <script type="text/javascript" src="navtree.js"></script> <script type="text/javascript" src="resize.js"></script> <script type="text/javascript"> $(document).ready(initResizable); </script> <link href="doxygen.css" rel="stylesheet" type="text/css"/> </head> <body> <!-- Generated by Doxygen 1.7.3 --> <div id="top"> <div id="titlearea"> <table cellspacing="0" cellpadding="0"> <tbody> <tr style="height: 56px;"> <td style="padding-left: 0.5em;"> <div id="projectname">SphinxBase <span id="projectnumber">0.6</span></div> </td> </tr> </tbody> </table> </div> <div id="navrow1" class="tabs"> <ul class="tablist"> <li><a href="index.html"><span>Main Page</span></a></li> <li><a href="pages.html"><span>Related Pages</span></a></li> <li><a href="annotated.html"><span>Data Structures</span></a></li> <li class="current"><a href="files.html"><span>Files</span></a></li> </ul> </div> <div id="navrow2" class="tabs2"> <ul class="tablist"> <li><a href="files.html"><span>File List</span></a></li> <li><a href="globals.html"><span>Globals</span></a></li> </ul> </div> </div> <div id="side-nav" class="ui-resizable side-nav-resizable"> <div id="nav-tree"> <div id="nav-tree-contents"> </div> </div> <div id="splitbar" style="-moz-user-select:none;" class="ui-resizable-handle"> </div> </div> <script type="text/javascript"> initNavTree('sphinx__lm__eval_8c.html',''); </script> <div id="doc-content"> <div class="header"> <div class="headertitle"> <h1>src/sphinx_lmtools/sphinx_lm_eval.c</h1> </div> </div> <div 
class="contents"> <a href="sphinx__lm__eval_8c.html">Go to the documentation of this file.</a><div class="fragment"><pre class="fragment"><a name="l00001"></a>00001 <span class="comment">/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */</span> <a name="l00002"></a>00002 <span class="comment">/* ====================================================================</span> <a name="l00003"></a>00003 <span class="comment"> * Copyright (c) 2008 Carnegie Mellon University. All rights </span> <a name="l00004"></a>00004 <span class="comment"> * reserved.</span> <a name="l00005"></a>00005 <span class="comment"> *</span> <a name="l00006"></a>00006 <span class="comment"> * Redistribution and use in source and binary forms, with or without</span> <a name="l00007"></a>00007 <span class="comment"> * modification, are permitted provided that the following conditions</span> <a name="l00008"></a>00008 <span class="comment"> * are met:</span> <a name="l00009"></a>00009 <span class="comment"> *</span> <a name="l00010"></a>00010 <span class="comment"> * 1. Redistributions of source code must retain the above copyright</span> <a name="l00011"></a>00011 <span class="comment"> * notice, this list of conditions and the following disclaimer. </span> <a name="l00012"></a>00012 <span class="comment"> *</span> <a name="l00013"></a>00013 <span class="comment"> * 2. 
Redistributions in binary form must reproduce the above copyright</span> <a name="l00014"></a>00014 <span class="comment"> * notice, this list of conditions and the following disclaimer in</span> <a name="l00015"></a>00015 <span class="comment"> * the documentation and/or other materials provided with the</span> <a name="l00016"></a>00016 <span class="comment"> * distribution.</span> <a name="l00017"></a>00017 <span class="comment"> *</span> <a name="l00018"></a>00018 <span class="comment"> * This work was supported in part by funding from the Defense Advanced </span> <a name="l00019"></a>00019 <span class="comment"> * Research Projects Agency and the National Science Foundation of the </span> <a name="l00020"></a>00020 <span class="comment"> * United States of America, and the CMU Sphinx Speech Consortium.</span> <a name="l00021"></a>00021 <span class="comment"> *</span> <a name="l00022"></a>00022 <span class="comment"> * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND </span> <a name="l00023"></a>00023 <span class="comment"> * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, </span> <a name="l00024"></a>00024 <span class="comment"> * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR</span> <a name="l00025"></a>00025 <span class="comment"> * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY</span> <a name="l00026"></a>00026 <span class="comment"> * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,</span> <a name="l00027"></a>00027 <span class="comment"> * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT </span> <a name="l00028"></a>00028 <span class="comment"> * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, </span> <a name="l00029"></a>00029 <span class="comment"> * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY </span> <a name="l00030"></a>00030 <span class="comment"> * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT </span> <a name="l00031"></a>00031 <span class="comment"> * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE </span> <a name="l00032"></a>00032 <span class="comment"> * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.</span> <a name="l00033"></a>00033 <span class="comment"> *</span> <a name="l00034"></a>00034 <span class="comment"> * ====================================================================</span> <a name="l00035"></a>00035 <span class="comment"> *</span> <a name="l00036"></a>00036 <span class="comment"> */</span> <a name="l00041"></a>00041 <span class="preprocessor">#include &lt;sphinxbase/logmath.h&gt;</span> <a name="l00042"></a>00042 <span class="preprocessor">#include &lt;sphinxbase/ngram_model.h&gt;</span> <a name="l00043"></a>00043 <span class="preprocessor">#include &lt;sphinxbase/cmd_ln.h&gt;</span> <a name="l00044"></a>00044 <span class="preprocessor">#include &lt;sphinxbase/ckd_alloc.h&gt;</span> <a name="l00045"></a>00045 <span class="preprocessor">#include &lt;sphinxbase/err.h&gt;</span> <a name="l00046"></a>00046 <span class="preprocessor">#include &lt;sphinxbase/pio.h&gt;</span> <a name="l00047"></a>00047 <span class="preprocessor">#include &lt;sphinxbase/strfuncs.h&gt;</span> <a name="l00048"></a>00048 <a name="l00049"></a>00049 <span class="preprocessor">#include 
&lt;stdio.h&gt;</span> <a name="l00050"></a>00050 <span class="preprocessor">#include &lt;string.h&gt;</span> <a name="l00051"></a>00051 <span class="preprocessor">#include &lt;math.h&gt;</span> <a name="l00052"></a>00052 <a name="l00053"></a>00053 <span class="keyword">static</span> <span class="keyword">const</span> <a class="code" href="structarg__t.html" title="Argument definition structure.">arg_t</a> defn[] = { <a name="l00054"></a>00054 { <span class="stringliteral">"-help"</span>, <a name="l00055"></a>00055 <a class="code" href="cmd__ln_8h.html#ac7d08ff59bb6905c3375162e75913e88" title="Boolean (true/false) argument (optional).">ARG_BOOLEAN</a>, <a name="l00056"></a>00056 <span class="stringliteral">"no"</span>, <a name="l00057"></a>00057 <span class="stringliteral">"Shows the usage of the tool"</span>}, <a name="l00058"></a>00058 <a name="l00059"></a>00059 { <span class="stringliteral">"-logbase"</span>, <a name="l00060"></a>00060 <a class="code" href="cmd__ln_8h.html#aceb617c8572cf5ad6257b35e6d8919e7">ARG_FLOAT64</a>, <a name="l00061"></a>00061 <span class="stringliteral">"1.0001"</span>, <a name="l00062"></a>00062 <span class="stringliteral">"Base in which all log-likelihoods calculated"</span> }, <a name="l00063"></a>00063 <a name="l00064"></a>00064 { <span class="stringliteral">"-lm"</span>, <a name="l00065"></a>00065 <a class="code" href="cmd__ln_8h.html#a4de5ed5fcf59a18b24bc9f6449cc9356" title="String argument (optional).">ARG_STRING</a>, <a name="l00066"></a>00066 NULL, <a name="l00067"></a>00067 <span class="stringliteral">"Language model file"</span>}, <a name="l00068"></a>00068 <a name="l00069"></a>00069 { <span class="stringliteral">"-probdef"</span>, <a name="l00070"></a>00070 <a class="code" href="cmd__ln_8h.html#a4de5ed5fcf59a18b24bc9f6449cc9356" title="String argument (optional).">ARG_STRING</a>, <a name="l00071"></a>00071 NULL, <a name="l00072"></a>00072 <span class="stringliteral">"Probability definition file for classes in LM"</span>}, <a 
name="l00073"></a>00073 <a name="l00074"></a>00074 { <span class="stringliteral">"-lmctlfn"</span>, <a name="l00075"></a>00075 <a class="code" href="cmd__ln_8h.html#a4de5ed5fcf59a18b24bc9f6449cc9356" title="String argument (optional).">ARG_STRING</a>, <a name="l00076"></a>00076 NULL, <a name="l00077"></a>00077 <span class="stringliteral">"Control file listing a set of language models"</span>}, <a name="l00078"></a>00078 <a name="l00079"></a>00079 { <span class="stringliteral">"-lmname"</span>, <a name="l00080"></a>00080 <a class="code" href="cmd__ln_8h.html#a4de5ed5fcf59a18b24bc9f6449cc9356" title="String argument (optional).">ARG_STRING</a>, <a name="l00081"></a>00081 NULL, <a name="l00082"></a>00082 <span class="stringliteral">"Name of language model in -lmctlfn to use for all utterances"</span> }, <a name="l00083"></a>00083 <a name="l00084"></a>00084 { <span class="stringliteral">"-lsn"</span>, <a name="l00085"></a>00085 <a class="code" href="cmd__ln_8h.html#a4de5ed5fcf59a18b24bc9f6449cc9356" title="String argument (optional).">ARG_STRING</a>, <a name="l00086"></a>00086 NULL, <a name="l00087"></a>00087 <span class="stringliteral">"Transcription file to evaluate"</span>}, <a name="l00088"></a>00088 <a name="l00089"></a>00089 { <span class="stringliteral">"-text"</span>, <a name="l00090"></a>00090 <a class="code" href="cmd__ln_8h.html#a4de5ed5fcf59a18b24bc9f6449cc9356" title="String argument (optional).">ARG_STRING</a>, <a name="l00091"></a>00091 <span class="stringliteral">"Text string to evaluate"</span>}, <a name="l00092"></a>00092 <a name="l00093"></a>00093 { <span class="stringliteral">"-mmap"</span>, <a name="l00094"></a>00094 <a class="code" href="cmd__ln_8h.html#ac7d08ff59bb6905c3375162e75913e88" title="Boolean (true/false) argument (optional).">ARG_BOOLEAN</a>, <a name="l00095"></a>00095 <span class="stringliteral">"no"</span>, <a name="l00096"></a>00096 <span class="stringliteral">"Use memory-mapped I/O for reading binary LM files"</span>}, <a 
name="l00097"></a>00097 <a name="l00098"></a>00098 { <span class="stringliteral">"-lw"</span>, <a name="l00099"></a>00099 <a class="code" href="cmd__ln_8h.html#ad9b1952e4f1def9ee6a88791375b3901">ARG_FLOAT32</a>, <a name="l00100"></a>00100 <span class="stringliteral">"1.0"</span>, <a name="l00101"></a>00101 <span class="stringliteral">"Language model weight"</span> }, <a name="l00102"></a>00102 <a name="l00103"></a>00103 { <span class="stringliteral">"-wip"</span>, <a name="l00104"></a>00104 <a class="code" href="cmd__ln_8h.html#ad9b1952e4f1def9ee6a88791375b3901">ARG_FLOAT32</a>, <a name="l00105"></a>00105 <span class="stringliteral">"1.0"</span>, <a name="l00106"></a>00106 <span class="stringliteral">"Word insertion probability"</span> }, <a name="l00107"></a>00107 <a name="l00108"></a>00108 { <span class="stringliteral">"-uw"</span>, <a name="l00109"></a>00109 <a class="code" href="cmd__ln_8h.html#ad9b1952e4f1def9ee6a88791375b3901">ARG_FLOAT32</a>, <a name="l00110"></a>00110 <span class="stringliteral">"1.0"</span>, <a name="l00111"></a>00111 <span class="stringliteral">"Unigram probability weight (interpolated with uniform distribution)"</span>}, <a name="l00112"></a>00112 <a name="l00113"></a>00113 { <span class="stringliteral">"-verbose"</span>, <a name="l00114"></a>00114 <a class="code" href="cmd__ln_8h.html#ac7d08ff59bb6905c3375162e75913e88" title="Boolean (true/false) argument (optional).">ARG_BOOLEAN</a>, <a name="l00115"></a>00115 <span class="stringliteral">"no"</span>, <a name="l00116"></a>00116 <span class="stringliteral">"Print details of perplexity calculation"</span> }, <a name="l00117"></a>00117 <a name="l00118"></a>00118 <span class="comment">/* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */</span> <a name="l00119"></a>00119 { NULL, 0, NULL, NULL } <a name="l00120"></a>00120 }; <a name="l00121"></a>00121 <a name="l00122"></a>00122 <span class="keyword">static</span> <span class="keywordtype">int</span> verbose; <a 
name="l00123"></a>00123 <a name="l00124"></a>00124 <span class="keyword">static</span> <span class="keywordtype">int</span> <a name="l00125"></a>00125 calc_entropy(<a class="code" href="structngram__model__s.html" title="Common implementation of ngram_model_t.">ngram_model_t</a> *lm, <span class="keywordtype">char</span> **words, int32 n, <a name="l00126"></a>00126 int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score) <a name="l00127"></a>00127 { <a name="l00128"></a>00128 int32 *wids; <a name="l00129"></a>00129 int32 startwid; <a name="l00130"></a>00130 int32 i, ch, nccs, noovs, unk; <a name="l00131"></a>00131 <a name="l00132"></a>00132 <span class="keywordflow">if</span> (n == 0) <a name="l00133"></a>00133 <span class="keywordflow">return</span> 0; <a name="l00134"></a>00134 <a name="l00135"></a>00135 unk = <a class="code" href="ngram__model_8h.html#a1469e9e1c8516a77c9ac1e248a61ef4e" title="Get the unknown word ID for a language model.">ngram_unknown_wid</a>(lm); <a name="l00136"></a>00136 <a name="l00137"></a>00137 <span class="comment">/* Reverse this array into an array of word IDs. */</span> <a name="l00138"></a>00138 wids = <a class="code" href="ckd__alloc_8h.html#aa00ef21903bc4f8a972488417adc8d2e" title="Macros to simplify the use of above functions.">ckd_calloc</a>(n, <span class="keyword">sizeof</span>(*wids)); <a name="l00139"></a>00139 <span class="keywordflow">for</span> (i = 0; i &lt; n; ++i) <a name="l00140"></a>00140 wids[n-i-1] = <a class="code" href="ngram__model_8h.html#ad03d4355d4ea659815dc25bce8d83880" title="Look up numerical word ID.">ngram_wid</a>(lm, words[i]); <a name="l00141"></a>00141 <span class="comment">/* Skip &lt;s&gt; as it's a context cue (HACK, this should be configurable). 
*/</span> <a name="l00142"></a>00142 startwid = <a class="code" href="ngram__model_8h.html#ad03d4355d4ea659815dc25bce8d83880" title="Look up numerical word ID.">ngram_wid</a>(lm, <span class="stringliteral">"&lt;s&gt;"</span>); <a name="l00143"></a>00143 <a name="l00144"></a>00144 <span class="comment">/* Now evaluate the list of words in reverse using the</span> <a name="l00145"></a>00145 <span class="comment"> * remainder of the array as the history. */</span> <a name="l00146"></a>00146 ch = noovs = nccs = 0; <a name="l00147"></a>00147 <span class="keywordflow">for</span> (i = 0; i &lt; n; ++i) { <a name="l00148"></a>00148 int32 n_used; <a name="l00149"></a>00149 int32 prob; <a name="l00150"></a>00150 <a name="l00151"></a>00151 <span class="comment">/* Skip &lt;s&gt; as it's a context cue (HACK, this should be configurable). */</span> <a name="l00152"></a>00152 <span class="keywordflow">if</span> (wids[i] == startwid) { <a name="l00153"></a>00153 ++nccs; <a name="l00154"></a>00154 <span class="keywordflow">continue</span>; <a name="l00155"></a>00155 } <a name="l00156"></a>00156 <span class="comment">/* Skip and count OOVs. 
*/</span> <a name="l00157"></a>00157 <span class="keywordflow">if</span> (wids[i] == <a class="code" href="ngram__model_8h.html#a3d4b3dddd0ff67e13d30c9bf053d01ab" title="Impossible word ID.">NGRAM_INVALID_WID</a> || wids[i] == unk) { <a name="l00158"></a>00158 ++noovs; <a name="l00159"></a>00159 <span class="keywordflow">continue</span>; <a name="l00160"></a>00160 } <a name="l00161"></a>00161 <span class="comment">/* Sum up information for each N-gram */</span> <a name="l00162"></a>00162 prob = <a class="code" href="ngram__model_8h.html#a6ac5799e78ea4ad82a11e2439016471e" title="Quick general N-Gram score lookup.">ngram_ng_score</a>(lm, <a name="l00163"></a>00163 wids[i], wids + i + 1, <a name="l00164"></a>00164 n - i - 1, &amp;n_used); <a name="l00165"></a>00165 <span class="keywordflow">if</span> (verbose) { <a name="l00166"></a>00166 <span class="keywordtype">int</span> m; <a name="l00167"></a>00167 printf(<span class="stringliteral">"log P(%s|"</span>, <a class="code" href="ngram__model_8h.html#a96e36290a005c03464ea6c637ccde2f5" title="Look up word string for numerical word ID.">ngram_word</a>(lm, wids[i])); <a name="l00168"></a>00168 m = i + <a class="code" href="ngram__model_8h.html#a462d374099a4fe8b3c3195b5e2013545" title="Get the order of the N-gram model (i.e.">ngram_model_get_size</a>(lm) - 1; <a name="l00169"></a>00169 <span class="keywordflow">if</span> (m &gt;= n) <a name="l00170"></a>00170 m = n - 1; <a name="l00171"></a>00171 <span class="keywordflow">while</span> (m &gt; i) { <a name="l00172"></a>00172 printf(<span class="stringliteral">"%s "</span>, <a class="code" href="ngram__model_8h.html#a96e36290a005c03464ea6c637ccde2f5" title="Look up word string for numerical word ID.">ngram_word</a>(lm, wids[m--])); <a name="l00173"></a>00173 } <a name="l00174"></a>00174 printf(<span class="stringliteral">") = %d\n"</span>, prob); <a name="l00175"></a>00175 } <a name="l00176"></a>00176 ch -= prob; <a name="l00177"></a>00177 } <a name="l00178"></a>00178 <a 
name="l00179"></a>00179 <span class="keywordflow">if</span> (out_n_ccs) *out_n_ccs = nccs; <a name="l00180"></a>00180 <span class="keywordflow">if</span> (out_n_oovs) *out_n_oovs = noovs; <a name="l00181"></a>00181 <a name="l00182"></a>00182 <span class="comment">/* Calculate cross-entropy CH = - 1/N sum log P(W|H) */</span> <a name="l00183"></a>00183 n -= (nccs + noovs); <a name="l00184"></a>00184 <span class="keywordflow">if</span> (n &lt;= 0) <a name="l00185"></a>00185 <span class="keywordflow">return</span> 0; <a name="l00186"></a>00186 <span class="keywordflow">if</span> (out_lm_score) <a name="l00187"></a>00187 *out_lm_score = -ch; <a name="l00188"></a>00188 <span class="keywordflow">return</span> ch / n; <a name="l00189"></a>00189 } <a name="l00190"></a>00190 <a name="l00191"></a>00191 <span class="keyword">static</span> <span class="keywordtype">void</span> <a name="l00192"></a>00192 evaluate_file(<a class="code" href="structngram__model__s.html" title="Common implementation of ngram_model_t.">ngram_model_t</a> *lm, <a class="code" href="structlogmath__s.html">logmath_t</a> *lmath, <span class="keyword">const</span> <span class="keywordtype">char</span> *lsnfn) <a name="l00193"></a>00193 { <a name="l00194"></a>00194 FILE *fh; <a name="l00195"></a>00195 <a class="code" href="structlineiter__t.html" title="Line iterator for files.">lineiter_t</a> *litor; <a name="l00196"></a>00196 int32 nccs, noovs, nwords, lscr; <a name="l00197"></a>00197 float64 ch, log_to_log2;; <a name="l00198"></a>00198 <a name="l00199"></a>00199 <span class="keywordflow">if</span> ((fh = fopen(lsnfn, <span class="stringliteral">"r"</span>)) == NULL) <a name="l00200"></a>00200 <a class="code" href="err_8h.html#a5229a1d58f5f5e69963a8d038ff5bc3e" title="Print error text; Call perror(&quot;&quot;); exit(errno);.">E_FATAL_SYSTEM</a>(<span class="stringliteral">"failed to open transcript file %s"</span>, lsnfn); <a name="l00201"></a>00201 <a name="l00202"></a>00202 <span class="comment">/* We 
have to keep ch in floating-point to avoid overflows, so</span> <a name="l00203"></a>00203 <span class="comment"> * we might as well use log2. */</span> <a name="l00204"></a>00204 log_to_log2 = log(<a class="code" href="logmath_8h.html#a6114206ec0321d7015c42fc7b81cb83e" title="Get the log base.">logmath_get_base</a>(lmath)) / log(2); <a name="l00205"></a>00205 nccs = noovs = nwords = 0; <a name="l00206"></a>00206 ch = 0.0; <a name="l00207"></a>00207 <span class="keywordflow">for</span> (litor = <a class="code" href="pio_8h.html#a22d0125ab198f02f8bbe543417d99566" title="Start reading lines from a file.">lineiter_start</a>(fh); litor; litor = <a class="code" href="pio_8h.html#aff8df0b6928746d61b3520555263f71e" title="Move to the next line in the file.">lineiter_next</a>(litor)) { <a name="l00208"></a>00208 <span class="keywordtype">char</span> **words; <a name="l00209"></a>00209 int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr; <a name="l00210"></a>00210 <a name="l00211"></a>00211 n = <a class="code" href="strfuncs_8h.html#a5b520fdebcca599db86faaf75a82173f" title="Convert a line to an array of &quot;words&quot;, based on whitespace separators.">str2words</a>(litor-&gt;buf, NULL, 0); <a name="l00212"></a>00212 <span class="keywordflow">if</span> (n &lt; 0) <a name="l00213"></a>00213 <a class="code" href="err_8h.html#a1a4495946ab2449d61108fe829a94613" title="Exit with non-zero status after error message.">E_FATAL</a>(<span class="stringliteral">"str2words(line, NULL, 0) = %d, should not happen\n"</span>, n); <a name="l00214"></a>00214 <span class="keywordflow">if</span> (n == 0) <span class="comment">/* Do nothing! 
*/</span> <a name="l00215"></a>00215 <span class="keywordflow">continue</span>; <a name="l00216"></a>00216 words = <a class="code" href="ckd__alloc_8h.html#aa00ef21903bc4f8a972488417adc8d2e" title="Macros to simplify the use of above functions.">ckd_calloc</a>(n, <span class="keyword">sizeof</span>(*words)); <a name="l00217"></a>00217 <a class="code" href="strfuncs_8h.html#a5b520fdebcca599db86faaf75a82173f" title="Convert a line to an array of &quot;words&quot;, based on whitespace separators.">str2words</a>(litor-&gt;buf, words, n); <a name="l00218"></a>00218 <a name="l00219"></a>00219 <span class="comment">/* Remove any utterance ID (FIXME: has to be a single "word") */</span> <a name="l00220"></a>00220 <span class="keywordflow">if</span> (words[n-1][0] == <span class="charliteral">'('</span> <a name="l00221"></a>00221 &amp;&amp; words[n-1][strlen(words[n-1])-1] == <span class="charliteral">')'</span>) <a name="l00222"></a>00222 n = n - 1; <a name="l00223"></a>00223 <a name="l00224"></a>00224 tmp_ch = calc_entropy(lm, words, n, &amp;tmp_nccs, <a name="l00225"></a>00225 &amp;tmp_noovs, &amp;tmp_lscr); <a name="l00226"></a>00226 <a name="l00227"></a>00227 ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2; <a name="l00228"></a>00228 nccs += tmp_nccs; <a name="l00229"></a>00229 noovs += tmp_noovs; <a name="l00230"></a>00230 lscr += tmp_lscr; <a name="l00231"></a>00231 nwords += n; <a name="l00232"></a>00232 <a name="l00233"></a>00233 <a class="code" href="ckd__alloc_8h.html#a31c6b405558620ac37599737b5722fbf" title="Test and free a 1-D array.">ckd_free</a>(words); <a name="l00234"></a>00234 } <a name="l00235"></a>00235 <a name="l00236"></a>00236 ch /= (nwords - nccs - noovs); <a name="l00237"></a>00237 printf(<span class="stringliteral">"cross-entropy: %f bits\n"</span>, ch); <a name="l00238"></a>00238 <a name="l00239"></a>00239 <span class="comment">/* Calculate perplexity pplx = exp CH */</span> <a name="l00240"></a>00240 printf(<span class="stringliteral">"perplexity: 
%f\n"</span>, pow(2.0, ch)); <a name="l00241"></a>00241 printf(<span class="stringliteral">"lm score: %d\n"</span>, lscr); <a name="l00242"></a>00242 <a name="l00243"></a>00243 <span class="comment">/* Report OOVs and CCs */</span> <a name="l00244"></a>00244 printf(<span class="stringliteral">"%d words evaluated\n"</span>, nwords); <a name="l00245"></a>00245 printf(<span class="stringliteral">"%d OOVs (%.2f%%), %d context cues removed\n"</span>, <a name="l00246"></a>00246 noovs, (<span class="keywordtype">double</span>)noovs / nwords * 100, nccs); <a name="l00247"></a>00247 } <a name="l00248"></a>00248 <a name="l00249"></a>00249 <span class="keyword">static</span> <span class="keywordtype">void</span> <a name="l00250"></a>00250 evaluate_string(<a class="code" href="structngram__model__s.html" title="Common implementation of ngram_model_t.">ngram_model_t</a> *lm, <a class="code" href="structlogmath__s.html">logmath_t</a> *lmath, <span class="keyword">const</span> <span class="keywordtype">char</span> *text) <a name="l00251"></a>00251 { <a name="l00252"></a>00252 <span class="keywordtype">char</span> *textfoo; <a name="l00253"></a>00253 <span class="keywordtype">char</span> **words; <a name="l00254"></a>00254 int32 n, ch, noovs, nccs, lscr; <a name="l00255"></a>00255 <a name="l00256"></a>00256 <span class="comment">/* Split it into an array of strings. 
*/</span> <a name="l00257"></a>00257 textfoo = <a class="code" href="ckd__alloc_8h.html#ad313f92478859f9e4ea99d0f6e78c393" title="Macro for __ckd_salloc__.">ckd_salloc</a>(text); <a name="l00258"></a>00258 n = <a class="code" href="strfuncs_8h.html#a5b520fdebcca599db86faaf75a82173f" title="Convert a line to an array of &quot;words&quot;, based on whitespace separators.">str2words</a>(textfoo, NULL, 0); <a name="l00259"></a>00259 <span class="keywordflow">if</span> (n &lt; 0) <a name="l00260"></a>00260 <a class="code" href="err_8h.html#a1a4495946ab2449d61108fe829a94613" title="Exit with non-zero status after error message.">E_FATAL</a>(<span class="stringliteral">"str2words(textfoo, NULL, 0) = %d, should not happen\n"</span>, n); <a name="l00261"></a>00261 <span class="keywordflow">if</span> (n == 0) <span class="comment">/* Do nothing! */</span> <a name="l00262"></a>00262 <span class="keywordflow">return</span>; <a name="l00263"></a>00263 words = <a class="code" href="ckd__alloc_8h.html#aa00ef21903bc4f8a972488417adc8d2e" title="Macros to simplify the use of above functions.">ckd_calloc</a>(n, <span class="keyword">sizeof</span>(*words)); <a name="l00264"></a>00264 <a class="code" href="strfuncs_8h.html#a5b520fdebcca599db86faaf75a82173f" title="Convert a line to an array of &quot;words&quot;, based on whitespace separators.">str2words</a>(textfoo, words, n); <a name="l00265"></a>00265 <a name="l00266"></a>00266 ch = calc_entropy(lm, words, n, &amp;nccs, &amp;noovs, &amp;lscr); <a name="l00267"></a>00267 <a name="l00268"></a>00268 printf(<span class="stringliteral">"input: %s\n"</span>, text); <a name="l00269"></a>00269 printf(<span class="stringliteral">"cross-entropy: %f bits\n"</span>, <a name="l00270"></a>00270 ch * log(<a class="code" href="logmath_8h.html#a6114206ec0321d7015c42fc7b81cb83e" title="Get the log base.">logmath_get_base</a>(lmath)) / log(2)); <a name="l00271"></a>00271 <a name="l00272"></a>00272 <span class="comment">/* Calculate perplexity pplx = exp CH */</span> 
<a name="l00273"></a>00273 printf(<span class="stringliteral">"perplexity: %f\n"</span>, <a class="code" href="logmath_8h.html#ae8b0a168e29e448c0d6de66dc46e099e" title="Convert integer log in base B to linear floating point.">logmath_exp</a>(lmath, ch)); <a name="l00274"></a>00274 printf(<span class="stringliteral">"lm score: %d\n"</span>, lscr); <a name="l00275"></a>00275 <a name="l00276"></a>00276 <span class="comment">/* Report OOVs and CCs */</span> <a name="l00277"></a>00277 printf(<span class="stringliteral">"%d words evaluated\n"</span>, n); <a name="l00278"></a>00278 printf(<span class="stringliteral">"%d OOVs, %d context cues removed\n"</span>, <a name="l00279"></a>00279 noovs, nccs); <a name="l00280"></a>00280 <a name="l00281"></a>00281 <a class="code" href="ckd__alloc_8h.html#a31c6b405558620ac37599737b5722fbf" title="Test and free a 1-D array.">ckd_free</a>(textfoo); <a name="l00282"></a>00282 <a class="code" href="ckd__alloc_8h.html#a31c6b405558620ac37599737b5722fbf" title="Test and free a 1-D array.">ckd_free</a>(words); <a name="l00283"></a>00283 } <a name="l00284"></a>00284 <a name="l00285"></a>00285 <span class="keywordtype">int</span> <a name="l00286"></a>00286 main(<span class="keywordtype">int</span> argc, <span class="keywordtype">char</span> *argv[]) <a name="l00287"></a>00287 { <a name="l00288"></a>00288 <a class="code" href="structcmd__ln__t.html" title="Opaque structure used to hold the results of command-line parsing.">cmd_ln_t</a> *<a class="code" href="structsphinx__wave2feat__s.html#a484d308befbde315664da8520ebc410d" title="Configuration parameters.">config</a>; <a name="l00289"></a>00289 <a class="code" href="structngram__model__s.html" title="Common implementation of ngram_model_t.">ngram_model_t</a> *lm = NULL; <a name="l00290"></a>00290 <a class="code" href="structlogmath__s.html">logmath_t</a> *lmath; <a name="l00291"></a>00291 <span class="keyword">const</span> <span class="keywordtype">char</span> *lmfn, *probdefn, *lsnfn, *text; 
<a name="l00292"></a>00292 <a name="l00293"></a>00293 <span class="keywordflow">if</span> ((config = <a class="code" href="cmd__ln_8h.html#aa5a3a9e49198d8fd0dd3424fb880b6b6" title="Parse a list of strings into argumetns.">cmd_ln_parse_r</a>(NULL, defn, argc, argv, TRUE)) == NULL) <a name="l00294"></a>00294 <span class="keywordflow">return</span> 1; <a name="l00295"></a>00295 <a name="l00296"></a>00296 verbose = <a class="code" href="cmd__ln_8h.html#a159e691c95089689cf9a8f85a67830a6" title="Retrieve a boolean value from a command-line object.">cmd_ln_boolean_r</a>(config, <span class="stringliteral">"-verbose"</span>); <a name="l00297"></a>00297 <a name="l00298"></a>00298 <span class="comment">/* Create log math object. */</span> <a name="l00299"></a>00299 <span class="keywordflow">if</span> ((lmath = logmath_init <a name="l00300"></a>00300 (cmd_ln_float64_r(config, <span class="stringliteral">"-logbase"</span>), 0, 0)) == NULL) { <a name="l00301"></a>00301 <a class="code" href="err_8h.html#a1a4495946ab2449d61108fe829a94613" title="Exit with non-zero status after error message.">E_FATAL</a>(<span class="stringliteral">"Failed to initialize log math\n"</span>); <a name="l00302"></a>00302 } <a name="l00303"></a>00303 <a name="l00304"></a>00304 <span class="comment">/* Load the language model. 
*/</span> <a name="l00305"></a>00305 lmfn = <a class="code" href="cmd__ln_8h.html#af0aa15288e06fc8271298e4fa7cdc91a" title="Retrieve a string from a command-line object.">cmd_ln_str_r</a>(config, <span class="stringliteral">"-lm"</span>); <a name="l00306"></a>00306 <span class="keywordflow">if</span> (lmfn == NULL <a name="l00307"></a>00307 || (lm = <a class="code" href="ngram__model_8h.html#ab0c840f2bdfc38cea08bb70054f76624" title="Read an N-Gram model from a file on disk.">ngram_model_read</a>(config, lmfn, <a name="l00308"></a>00308 <a class="code" href="ngram__model_8h.html#a406c0d64c15a9d1749d07c8ab6e0ae74a441701bf8ae0a2b79716feb31b5f257a" title="Determine file type automatically.">NGRAM_AUTO</a>, lmath)) == NULL) { <a name="l00309"></a>00309 <a class="code" href="err_8h.html#a1a4495946ab2449d61108fe829a94613" title="Exit with non-zero status after error message.">E_FATAL</a>(<span class="stringliteral">"Failed to load language model from %s\n"</span>, <a name="l00310"></a>00310 <a class="code" href="cmd__ln_8h.html#af0aa15288e06fc8271298e4fa7cdc91a" title="Retrieve a string from a command-line object.">cmd_ln_str_r</a>(config, <span class="stringliteral">"-lm"</span>)); <a name="l00311"></a>00311 } <a name="l00312"></a>00312 <span class="keywordflow">if</span> ((probdefn = <a class="code" href="cmd__ln_8h.html#af0aa15288e06fc8271298e4fa7cdc91a" title="Retrieve a string from a command-line object.">cmd_ln_str_r</a>(config, <span class="stringliteral">"-probdef"</span>)) != NULL) <a name="l00313"></a>00313 <a class="code" href="ngram__model_8h.html#a9b2a86c23543158754373c5456fe890d" title="Read a class definition file and add classes to a language model.">ngram_model_read_classdef</a>(lm, probdefn); <a name="l00314"></a>00314 <a class="code" href="ngram__model_8h.html#aa4b8d7c1f3d873b8458c0cfee13af4da" title="Apply a language weight, insertion penalty, and unigram weight to a language model.">ngram_model_apply_weights</a>(lm, <a name="l00315"></a>00315 
cmd_ln_float32_r(config, <span class="stringliteral">"-lw"</span>), <a name="l00316"></a>00316 cmd_ln_float32_r(config, <span class="stringliteral">"-wip"</span>), <a name="l00317"></a>00317 cmd_ln_float32_r(config, <span class="stringliteral">"-uw"</span>)); <a name="l00318"></a>00318 <a name="l00319"></a>00319 <span class="comment">/* Now evaluate some text. */</span> <a name="l00320"></a>00320 lsnfn = <a class="code" href="cmd__ln_8h.html#af0aa15288e06fc8271298e4fa7cdc91a" title="Retrieve a string from a command-line object.">cmd_ln_str_r</a>(config, <span class="stringliteral">"-lsn"</span>); <a name="l00321"></a>00321 text = <a class="code" href="cmd__ln_8h.html#af0aa15288e06fc8271298e4fa7cdc91a" title="Retrieve a string from a command-line object.">cmd_ln_str_r</a>(config, <span class="stringliteral">"-text"</span>); <a name="l00322"></a>00322 <span class="keywordflow">if</span> (lsnfn) { <a name="l00323"></a>00323 evaluate_file(lm, lmath, lsnfn); <a name="l00324"></a>00324 } <a name="l00325"></a>00325 <span class="keywordflow">else</span> <span class="keywordflow">if</span> (text) { <a name="l00326"></a>00326 evaluate_string(lm, lmath, text); <a name="l00327"></a>00327 } <a name="l00328"></a>00328 <a name="l00329"></a>00329 <span class="keywordflow">return</span> 0; <a name="l00330"></a>00330 } </pre></div></div> </div> <div id="nav-path" class="navpath"> <ul> <li class="navelem"><a class="el" href="sphinx__lm__eval_8c.html">sphinx_lm_eval.c</a> </li> <li class="footer">Generated on Tue Apr 19 2011 for SphinxBase by  <a href="http://www.doxygen.org/index.html"> <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.7.3 </li> </ul> </div> </body> </html>