<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> <html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1"> <title>hebrewmcim.cpp Source File</title> <link href="doxygen.css" rel="stylesheet" type="text/css"> </head><body> <!-- Generated by Doxygen 1.2.15 --> <center> <a class="qindex" href="index.html">Main Page</a> <a class="qindex" href="namespaces.html">Namespace List</a> <a class="qindex" href="hierarchy.html">Class Hierarchy</a> <a class="qindex" href="classes.html">Alphabetical List</a> <a class="qindex" href="annotated.html">Compound List</a> <a class="qindex" href="files.html">File List</a> <a class="qindex" href="functions.html">Compound Members</a> </center> <hr><h1>hebrewmcim.cpp</h1><div class="fragment"><pre>00001 00011 <font class="preprocessor">#include &lt;hebrewmcim.h&gt;</font> 00012 00013 HebrewMCIM::HebrewMCIM() 00014 :<a class="code" href="class_s_w_input_method.html">SWInputMethod</a>() { 00015 00016 init(); 00017 } 00018 00019 00020 <font class="keywordtype">int</font> *HebrewMCIM::translate(<font class="keywordtype">char</font> in) { 00021 <font class="keywordtype">int</font> retVal = 0; 00022 <font class="keyword">static</font> <font class="keywordtype">int</font> retString[5]; 00023 <font class="keywordtype">int</font> retStringIndex = 0; 00024 00025 memset(retString, 0, 5); 00026 00027 <font class="keywordflow">if</font> (getState() > 1) { 00028 <font class="keywordflow">if</font> (getState() >= 12) { <font class="comment">// serious issue with internal structure</font> 00029 setState(0); 00030 retString[retStringIndex++] = in; 00031 <font class="keywordflow">return</font> retString; 00032 } 00033 map&lt;int, int&gt;::iterator find = subst2[getState()].find(in); 00034 <font class="keywordflow">if</font> (find != subst2[getState()].end()) 00035 retVal = find->second; 00036 <font class="keywordflow">else</font> retVal = in; 00037 00038 setState(0); 00039 retString[retStringIndex++] = retVal; 00040 <font 
class="keywordflow">return</font> retString; 00041 } 00042 <font class="keywordflow">else</font> { 00043 retVal = subst[in]; 00044 00045 <font class="keywordflow">if</font> (retVal == 0) { 00046 setState(0); 00047 retString[retStringIndex++] = in; 00048 <font class="keywordflow">return</font> retString; 00049 } 00050 <font class="keywordflow">if</font> (retVal > 100) { 00051 setState(1); 00052 retString[retStringIndex++] = retVal; 00053 <font class="keywordflow">return</font> retString; 00054 } 00055 <font class="keywordflow">if</font> (retVal == 50) { <font class="comment">// multiChar</font> 00056 setState(1); 00057 <font class="keywordtype">int</font> *chars = multiChars[in]; 00058 <font class="keywordflow">if</font> (chars != 0) { 00059 retString[retStringIndex++] = chars[0]; 00060 retString[retStringIndex++] = chars[1]; 00061 <font class="keywordflow">return</font> retString; 00062 } 00063 } 00064 } 00065 setState(retVal); 00066 <font class="keywordflow">return</font> 0; 00067 } 00068 00069 00070 <font class="keywordtype">void</font> HebrewMCIM::init() { 00071 memset(subst, 0, 255); 00072 00073 subst[<font class="charliteral">')'</font>] = 1488; 00074 subst[<font class="charliteral">'B'</font>] = 1489; 00075 subst[<font class="charliteral">'G'</font>] = 1490; 00076 subst[<font class="charliteral">'D'</font>] = 1491; 00077 subst[<font class="charliteral">'H'</font>] = 1492; 00078 subst[<font class="charliteral">'W'</font>] = 1493; 00079 subst[<font class="charliteral">'Z'</font>] = 1494; 00080 subst[<font class="charliteral">'X'</font>] = 1495; 00081 subst[<font class="charliteral">'+'</font>] = 1496; 00082 subst[<font class="charliteral">'Y'</font>] = 1497; 00083 00084 subst[<font class="charliteral">'k'</font>] = 1498; <font class="comment">// finals</font> 00085 subst[<font class="charliteral">'m'</font>] = 1501; 00086 subst[<font class="charliteral">'n'</font>] = 1503; 00087 subst[<font class="charliteral">'c'</font>] = 1509; 00088 00089 subst[<font 
class="charliteral">'P'</font>] = 1508; 00090 subst[<font class="charliteral">'K'</font>] = 1499; 00091 subst[<font class="charliteral">'L'</font>] = 1500; 00092 subst[<font class="charliteral">'M'</font>] = 1502; 00093 subst[<font class="charliteral">'N'</font>] = 1504; 00094 subst[<font class="charliteral">'S'</font>] = 1505; 00095 subst[<font class="charliteral">'('</font>] = 1506; 00096 subst[<font class="charliteral">'p'</font>] = 1507; 00097 subst[<font class="charliteral">'C'</font>] = 1510; 00098 subst[<font class="charliteral">'Q'</font>] = 1511; 00099 subst[<font class="charliteral">'R'</font>] = 1512; 00100 subst[<font class="charliteral">'#'</font>] = 1513; 00101 00102 <font class="comment">// special multiChars</font> 00103 subst[<font class="charliteral">'&'</font>] = 50; 00104 subst[<font class="charliteral">'$'</font>] = 50; 00105 00106 <font class="keyword">static</font> <font class="keywordtype">int</font> x[] = {1513, 1474}; 00107 multiChars[<font class="charliteral">'&'</font>] = x; 00108 <font class="keyword">static</font> <font class="keywordtype">int</font> y[] = {1513, 1473}; 00109 multiChars[<font class="charliteral">'$'</font>] = y; 00110 00111 subst[<font class="charliteral">'T'</font>] = 1514; 00112 00113 <font class="comment">// VOWELS</font> 00114 subst[<font class="charliteral">'A'</font>] = 1463; 00115 subst[<font class="charliteral">'F'</font>] = 1464; 00116 subst[<font class="charliteral">'E'</font>] = 1462; 00117 subst[<font class="charliteral">'"'</font>] = 1461; 00118 subst[<font class="charliteral">'I'</font>] = 1460; 00119 subst[<font class="charliteral">'O'</font>] = 1465; 00120 subst[<font class="charliteral">'U'</font>] = 1467; 00121 00122 00123 00124 <font class="comment">// OTHER DIACRITICS</font> 00125 subst[<font class="charliteral">'.'</font>] = 1468; 00126 subst[<font class="charliteral">'-'</font>] = 1470; 00127 subst[<font class="charliteral">','</font>] = 1471; 00128 00129 <font class="comment">// Compound 
input</font> 00130 00131 <font class="comment">// CANTILLATION</font> 00132 00133 subst[<font class="charliteral">':'</font>] = 2; 00134 subst2[2][<font class="charliteral">'A'</font>] = 1458; 00135 subst2[2][<font class="charliteral">'E'</font>] = 1457; 00136 subst2[2][<font class="charliteral">'F'</font>] = 1459; 00137 00138 00139 <font class="comment">/* Telisha qetana is postpositive as in '04' above. However, Michigan</font> 00140 <font class="comment"># code '24' is for a medial telisha. Graphically, there is no</font> 00141 <font class="comment"># difference.</font> 00142 <font class="comment"> */</font> 00143 subst[<font class="charliteral">'2'</font>] = 5; 00144 subst2[5][<font class="charliteral">'4'</font>] = 1449; 00145 00146 00147 <font class="comment">/* Note Michigan encoding distinguishes between medial metheg '35' (occurring</font> 00148 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occurring on the</font> 00149 <font class="comment"># right of the vowel). It is also used for silluq.</font> 00150 <font class="comment"> */</font> 00151 subst[<font class="charliteral">'3'</font>] = 6; 00152 subst2[6][<font class="charliteral">'3'</font>] = 1433; 00153 subst2[6][<font class="charliteral">'5'</font>] = 1469; 00154 00155 00156 <font class="comment">/* The Michigan code of telisha gedola in medial position. 
Graphically,</font> 00157 <font class="comment"># there is no difference.</font> 00158 <font class="comment"> */</font> 00159 subst[<font class="charliteral">'4'</font>] = 7; 00160 subst2[7][<font class="charliteral">'4'</font>] = 1440; 00161 00162 subst[<font class="charliteral">'6'</font>] = 8; 00163 subst2[8][<font class="charliteral">'0'</font>] = 1451; 00164 subst2[8][<font class="charliteral">'1'</font>] = 1436; 00165 00166 subst[<font class="charliteral">'1'</font>] = 4; 00167 subst2[4][<font class="charliteral">'0'</font>] = 1434; 00168 00169 <font class="comment">/* In the poetic books, prepositive dehi occurs; it's unclear whether</font> 00170 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font> 00171 <font class="comment"># check for what book in the Tanach we are in. Michigan uses the same</font> 00172 <font class="comment"># code for each.</font> 00173 <font class="comment"> */</font> 00174 00175 subst2[4][<font class="charliteral">'3'</font>] = 1430; 00176 00177 <font class="comment">/* This is the poetic accent mugrash, which also includes rebia, but is</font> 00178 <font class="comment"># encoded separately as '81' in the Michigan text.</font> 00179 <font class="comment"> */</font> 00180 subst2[4][<font class="charliteral">'1'</font>] = 1437; 00181 subst2[4][<font class="charliteral">'4'</font>] = 1440; 00182 00183 00184 subst[<font class="charliteral">'0'</font>] = 3; 00185 subst2[3][<font class="charliteral">'0'</font>] = 1475; 00186 subst2[3][<font class="charliteral">'1'</font>] = 1426; 00187 00188 <font class="comment">/* According to BHS, zarqa and sinnor are both postpositive. However,</font> 00189 <font class="comment"># the Michigan encoding uses one code for both. The Unicode zarqa</font> 00190 <font class="comment"># (0x0598) is definitely NOT postpositive. And further, the shape of</font> 00191 <font class="comment"># the symbol is different in BHS and Unicode. 
This needs further</font> 00192 <font class="comment"># research to determine what's going on here. For now, we follow BHS</font> 00193 <font class="comment"># and use the postpositive Unicode zinor or both accents.</font> 00194 <font class="comment"> */</font> 00195 00196 subst2[3][<font class="charliteral">'2'</font>] = 1454; 00197 00198 <font class="comment">/* Pashta is postpositive, and the Unicode equivalent reflects</font> 00199 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font> 00200 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font> 00201 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font> 00202 <font class="comment"># although it could be algorithmically determined.</font> 00203 <font class="comment"> */</font> 00204 00205 subst2[3][<font class="charliteral">'3'</font>] = 1433; 00206 subst2[3][<font class="charliteral">'4'</font>] = 1449; 00207 subst2[3][<font class="charliteral">'5'</font>] = 1472; 00208 00209 00210 <font class="comment">/* This is the Unicode Hebrew *accent*; there is also another Hebrew</font> 00211 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. 
I'm using the more</font> 00212 <font class="comment"># traditional rounded marks, rather than the alternate straight</font> 00213 <font class="comment"># marks.</font> 00214 <font class="comment"> */</font> 00215 00216 subst2[8][<font class="charliteral">'2'</font>] = 1438; 00217 00218 <font class="comment">// Also known as azla</font> 00219 subst2[8][<font class="charliteral">'3'</font>] = 1448; 00220 subst2[8][<font class="charliteral">'4'</font>] = 1452; 00221 subst2[8][<font class="charliteral">'5'</font>] = 1427; 00222 00223 00224 subst[<font class="charliteral">'8'</font>] = 9; 00225 subst2[9][<font class="charliteral">'0'</font>] = 1428; 00226 subst2[9][<font class="charliteral">'1'</font>] = 1431; 00227 00228 <font class="comment">/* Note, this accent is actually sinnorit, but it does not exist as a</font> 00229 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font> 00230 <font class="comment"># is postpositive, while sinnorit is not. ZARQA is as close as I can</font> 00231 <font class="comment"># get to this.</font> 00232 <font class="comment"> */</font> 00233 subst2[9][<font class="charliteral">'2'</font>] = 1432; 00234 00235 <font class="comment">/* The Unicode form does not match the form used by BHS, but the names</font> 00236 <font class="comment"># are the same.</font> 00237 <font class="comment"> */</font> 00238 subst2[9][<font class="charliteral">'3'</font>] = 1441; 00239 subst2[9][<font class="charliteral">'4'</font>] = 1439; 00240 subst2[9][<font class="charliteral">'5'</font>] = 1429; 00241 00242 subst[<font class="charliteral">'7'</font>] = 10; 00243 subst2[10][<font class="charliteral">'0'</font>] = 1444; 00244 subst2[10][<font class="charliteral">'1'</font>] = 1445; 00245 subst2[10][<font class="charliteral">'2'</font>] = 1446; 00246 subst2[10][<font class="charliteral">'3'</font>] = 1430; <font class="comment">// also '13', '73' also is used for majela</font> 00247 subst2[10][<font 
class="charliteral">'4'</font>] = 1443; 00248 subst2[10][<font class="charliteral">'5'</font>] = 1469; <font class="comment">// this is silluq; should appear to the left of the vowel</font> 00249 00250 subst[<font class="charliteral">'9'</font>] = 11; 00251 subst2[11][<font class="charliteral">'1'</font>] = 1435; 00252 subst2[11][<font class="charliteral">'2'</font>] = 1425; 00253 subst2[11][<font class="charliteral">'3'</font>] = 1450; 00254 subst2[11][<font class="charliteral">'4'</font>] = 1447; 00255 subst2[11][<font class="charliteral">'5'</font>] = 1469; <font class="comment">// should appear to the right of the vowel</font> 00256 00257 } 00258 00259 <font class="comment">/*</font> 00260 <font class="comment"></font> 00261 <font class="comment"></font> 00262 <font class="comment"># CANTILLION MARKS</font> 00263 <font class="comment"></font> 00264 <font class="comment"> my $ETNAHTA = '&#1425;';</font> 00265 <font class="comment"># officially the Unicode name for this symbol was "SEGOL." However, that is</font> 00266 <font class="comment"># not a unique name, conflicting with the vowel of the same name. Further,</font> 00267 <font class="comment"># the position of the symbol is different. 
I have changed the name of the</font> 00268 <font class="comment"># accent to "SEGOLTA," the traditional name for this accent.</font> 00269 <font class="comment"> my $SEGOLTA = '&#1426;';</font> 00270 <font class="comment"> my $SHALSHELET = '&#1427;';</font> 00271 <font class="comment"> my $ZAQEF_QATAN = '&#1428;';</font> 00272 <font class="comment"> my $ZAQEF_GADOL = '&#1429;';</font> 00273 <font class="comment"> my $TIPEHA = '&#1430;';</font> 00274 <font class="comment"> my $REVIA = '&#1431;';</font> 00275 <font class="comment"> my $ZARQA = '&#1432;';</font> 00276 <font class="comment"> my $PASHTA = '&#1433;';</font> 00277 <font class="comment"> my $YETIV = '&#1434;';</font> 00278 <font class="comment"> my $TEVIR = '&#1435;';</font> 00279 <font class="comment"> my $GERESH = '&#1436;';</font> 00280 <font class="comment"> my $GERESH_MUQDAM = '&#1437;';</font> 00281 <font class="comment"> my $GERSHAYIM = '&#1438;';</font> 00282 <font class="comment"> my $QARNEY_PARA = '&#1439;';</font> 00283 <font class="comment"> my $TELISHA_GEDOLA = '&#1440;';</font> 00284 <font class="comment"> my $PAZER = '&#1441;';</font> 00285 <font class="comment"> my $MUNAH = '&#1443;';</font> 00286 <font class="comment"> my $MAHAPAKH = '&#1444;';</font> 00287 <font class="comment"> my $MERKHA = '&#1445;';</font> 00288 <font class="comment"> my $MERKHA_KEFULA = '&#1446;';</font> 00289 <font class="comment"> my $DARGA = '&#1447;';</font> 00290 <font class="comment"> my $QADMA = '&#1448;';</font> 00291 <font class="comment"> my $TELISHA_QETANA = '&#1449;';</font> 00292 <font class="comment"> my $YERAH_BEN_YOMO = '&#1450;';</font> 00293 <font class="comment"> my $OLE = '&#1451;';</font> 00294 <font class="comment"> my $ILUY = '&#1452;';</font> 00295 <font class="comment"> my $DEHI = '&#1453;';</font> 00296 <font class="comment"> my $ZINOR = '&#1454;';</font> 00297 <font class="comment"># HEBREW MARK</font> 00298 <font class="comment"> my $MASORA_CIRCLE = '&#1455;';</font> 00299 <font 
class="comment"># HEBREW EXTENDED-A points and punctuation</font> 00300 <font class="comment"> my $SHEVA = '&#1456;';</font> 00301 <font class="comment"> my $HATAF_SEGOL = '&#1457;';</font> 00302 <font class="comment"> my $HATAF_PATAH = '&#1458;';</font> 00303 <font class="comment"> my $HATAF_QAMATS = '&#1459;';</font> 00304 <font class="comment"> my $HIRIQ = '&#1460;';</font> 00305 <font class="comment"> my $TSERE = '&#1461;';</font> 00306 <font class="comment"> my $SEGOL = '&#1462;';</font> 00307 <font class="comment"># furtive Patah is not a distinct character</font> 00308 <font class="comment"> my $PATAH = '&#1463;';</font> 00309 <font class="comment"> my $QAMATS = '&#1464;';</font> 00310 <font class="comment"> my $HOLAM = '&#1465;';</font> 00311 <font class="comment"> my $QUBUTS = '&#1467;';</font> 00312 <font class="comment"># also used as shuruq</font> 00313 <font class="comment"># falls within the base letter</font> 00314 <font class="comment"> my $DAGESH_OR_MAPIQ = '&#1468;';</font> 00315 <font class="comment"># also used as siluq</font> 00316 <font class="comment"> my $METAG = '&#1469;';</font> 00317 <font class="comment"> my $MAQAF = '&#1470;';</font> 00318 <font class="comment"> my $RAFE = '&#1471;';</font> 00319 <font class="comment"># Also used for legarmeh</font> 00320 <font class="comment"># may be treated as spacing punctuation, not as a point</font> 00321 <font class="comment"> my $PASEQ = '&#1472;';</font> 00322 <font class="comment"> my $SHIN_DOT = '&#1473;';</font> 00323 <font class="comment"> my $SIN_DOT = '&#1474;';</font> 00324 <font class="comment"> my $SOF_PASUQ = '&#1475;';</font> 00325 <font class="comment"># HEBREW MARK</font> 00326 <font class="comment"> my $UPPER_DOT = '&#1476;';</font> 00327 <font class="comment"># HEBREW LETTERS based on ISO 8859-8</font> 00328 <font class="comment"># aleph</font> 00329 <font class="comment"># x (alef symbol - 2135)</font> 00330 <font class="comment"> my $ALEF = '&#1488;';</font> 00331 <font 
class="comment"># x (bet symbol - 2136)</font> 00332 <font class="comment"> my $BET = '&#1489;';</font> 00333 <font class="comment"># x (gimel symbol - 2137)</font> 00334 <font class="comment"> my $GIMEL = '&#1490;';</font> 00335 <font class="comment"># x (dalet symbol - 2138)</font> 00336 <font class="comment"> my $DALET = '&#1491;';</font> 00337 <font class="comment"> my $HE = '&#1492;';</font> 00338 <font class="comment"> my $VAV = '&#1493;';</font> 00339 <font class="comment"> my $ZAYIN = '&#1494;';</font> 00340 <font class="comment"> my $HET = '&#1495;';</font> 00341 <font class="comment"> my $TET = '&#1496;';</font> 00342 <font class="comment"> my $YOD = '&#1497;';</font> 00343 <font class="comment"> my $FINAL_KAF = '&#1498;';</font> 00344 <font class="comment"> my $KAF = '&#1499;';</font> 00345 <font class="comment"> my $LAMED = '&#1500;';</font> 00346 <font class="comment"> my $FINAL_MEM = '&#1501;';</font> 00347 <font class="comment"> my $MEM = '&#1502;';</font> 00348 <font class="comment"> my $FINAL_NUN = '&#1503;';</font> 00349 <font class="comment"> my $NUN = '&#1504;';</font> 00350 <font class="comment"> my $SAMEKH = '&#1505;';</font> 00351 <font class="comment"> my $AYIN = '&#1506;';</font> 00352 <font class="comment"> my $FINAL_PE = '&#1507;';</font> 00353 <font class="comment"> my $PE = '&#1508;';</font> 00354 <font class="comment"> my $FINAL_TSADI = '&#1509;';</font> 00355 <font class="comment"># also known as zade</font> 00356 <font class="comment"> my $TSADI = '&#1510;';</font> 00357 <font class="comment"> my $QOF = '&#1511;';</font> 00358 <font class="comment"> my $RESH = '&#1512;';</font> 00359 <font class="comment"> my $SHIN = '&#1513;';</font> 00360 <font class="comment"> my $TAV = '&#1514;';</font> 00361 <font class="comment"># Yiddish digraphs</font> 00362 <font class="comment"># Hebrew Ligature</font> 00363 <font class="comment"># tsvey vovn</font> 00364 <font class="comment"> my $DOUBLE_VAV = '&#1520;';</font> 00365 <font class="comment"> 
my $VAV_YOD = '&#1521;';</font> 00366 <font class="comment"># tsvey yudn</font> 00367 <font class="comment"> my $DOUBLE_YOD = '&#1522;';</font> 00368 <font class="comment"></font> 00369 <font class="comment"># Additional punctuation</font> 00370 <font class="comment"> my $PUNCT_GERESH = '&#1523;';</font> 00371 <font class="comment"> my $PUNCT_GERSHAYIM = '&#1524;';</font> 00372 <font class="comment"># Reserved: 0x05F5"</font> 00373 <font class="comment"># x (hebrew point judeo-spanish varika - FB1E)</font> 00374 <font class="comment">#my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 0xFB1E</font> 00375 <font class="comment"></font> 00376 <font class="comment">#############################</font> 00377 <font class="comment"># End of Unicode 2.0 Hebrew #</font> 00378 <font class="comment">#############################</font> 00379 <font class="comment"></font> 00380 <font class="comment"># A hash whose key is a Michigan code, and whose value is a Unicode</font> 00381 <font class="comment"># equivalent</font> 00382 <font class="comment"></font> 00383 <font class="comment"> char subst[] = new char [255];</font> 00384 <font class="comment"> subst[')'] = 1488;</font> 00385 <font class="comment"> 'B' => $BET,</font> 00386 <font class="comment"> 'G' => $GIMEL,</font> 00387 <font class="comment"> 'D' => $DALET,</font> 00388 <font class="comment"> 'H' => $HE,</font> 00389 <font class="comment"> 'W' => $VAV,</font> 00390 <font class="comment"> 'Z' => $ZAYIN,</font> 00391 <font class="comment"> 'X' => $HET,</font> 00392 <font class="comment"> '+' => $TET,</font> 00393 <font class="comment"> 'Y' => $YOD,</font> 00394 <font class="comment"> 'K' => $KAF,</font> 00395 <font class="comment"> 'L' => $LAMED,</font> 00396 <font class="comment"> 'M' => $MEM,</font> 00397 <font class="comment"> 'N' => $NUN,</font> 00398 <font class="comment"> 'S' => $SAMEKH,</font> 00399 <font class="comment"> '(' => $AYIN,</font> 00400 <font class="comment"> 'P' => $PE,</font> 00401 <font 
class="comment"> 'C' => $TSADI,</font> 00402 <font class="comment"> 'Q' => $QOF,</font> 00403 <font class="comment"> 'R' => $RESH,</font> 00404 <font class="comment"> '#' => $SHIN, # the letter shin without a point</font> 00405 <font class="comment"> '&' => ($SHIN . $SIN_DOT),</font> 00406 <font class="comment"> '$' => ($SHIN . $SHIN_DOT), # '</font> 00407 <font class="comment"> 'T' => $TAV,</font> 00408 <font class="comment"># VOWELS</font> 00409 <font class="comment"> 'A' => $PATAH,</font> 00410 <font class="comment"> 'F' => $QAMATS,</font> 00411 <font class="comment"> 'E' => $SEGOL,</font> 00412 <font class="comment"> '"' => $TSERE,</font> 00413 <font class="comment"> 'I' => $HIRIQ,</font> 00414 <font class="comment"> 'O' => $HOLAM,</font> 00415 <font class="comment"> 'U' => $QUBUTS,</font> 00416 <font class="comment"> ':' => $SHEVA,</font> 00417 <font class="comment"> ':A' => $HATAF_PATAH,</font> 00418 <font class="comment"> ':E' => $HATAF_SEGOL,</font> 00419 <font class="comment"> ':F' => $HATAF_QAMATS,</font> 00420 <font class="comment"># OTHER DIACRITICS</font> 00421 <font class="comment"> '.' => $DAGESH_OR_MAPIQ,</font> 00422 <font class="comment"> '-' => $MAQAF,</font> 00423 <font class="comment"> ',' => $RAFE,</font> 00424 <font class="comment"># CANTILLATION</font> 00425 <font class="comment"> '00' => $SOF_PASUQ,</font> 00426 <font class="comment"> '01' => $SEGOLTA,</font> 00427 <font class="comment"># According to BHS, zarqa and sinnor are both postpositive. However,</font> 00428 <font class="comment"># the Michigan encoding uses one code for both. The Unicode zarqa</font> 00429 <font class="comment"># (0x0598) is definitely NOT postpositive. And further, the shape of</font> 00430 <font class="comment"># the symbol is different in BHS and Unicode. This needs further</font> 00431 <font class="comment"># research to determine what's going on here. 
For now, we follow BHS</font> 00432 <font class="comment"># and use the postpositive Unicode zinor or both accents.</font> 00433 <font class="comment"> '02' => $ZINOR,</font> 00434 <font class="comment"># Pashta is postpositive, and the Unicode equivalent reflects</font> 00435 <font class="comment"># this. However, there is a poetic equivalent -- azla legarmeh --</font> 00436 <font class="comment"># which is not postpositive, but no equivalent code point exists in</font> 00437 <font class="comment"># Unicode. The Michigan encoding does not distinguish between the two,</font> 00438 <font class="comment"># although it could be algorithmically determined.</font> 00439 <font class="comment"> '03' => $PASHTA,</font> 00440 <font class="comment"> '04' => $TELISHA_QETANA,</font> 00441 <font class="comment"> '05' => $PASEQ,</font> 00442 <font class="comment"> '10' => $YETIV,</font> 00443 <font class="comment"># In the poetic books, prepositive dehi occurs; it's unclear whether</font> 00444 <font class="comment"># tipeha also occurs in the poetic books. Otherwise, we could simply</font> 00445 <font class="comment"># check for what book in the Tanach we are in. Michigan uses the same</font> 00446 <font class="comment"># code for each.</font> 00447 <font class="comment"> '13' => $TIPEHA, # also $DEHI</font> 00448 <font class="comment"># This is the poetic accent mugrash, which also includes rebia, but is</font> 00449 <font class="comment"># encoded separately as '81' in the Michigan text.</font> 00450 <font class="comment"> '11' => $GERESH_MUQDAM,</font> 00451 <font class="comment"> '14' => $TELISHA_GEDOLA,</font> 00452 <font class="comment"># Telisha qetana is postpositive as in '04' above. However, Michigan</font> 00453 <font class="comment"># code '24' is for a medial telisha. 
Graphically, there is no</font> 00454 <font class="comment"># difference.</font> 00455 <font class="comment"> '24' => $TELISHA_QETANA,</font> 00456 <font class="comment"> '33' => $PASHTA,</font> 00457 <font class="comment"># The Michigan code of telisha gedola in medial position. Graphically,</font> 00458 <font class="comment"># there is no difference.</font> 00459 <font class="comment"> '44' => $TELISHA_GEDOLA,</font> 00460 <font class="comment"> '60' => $OLE,</font> 00461 <font class="comment"> '61' => $GERESH,</font> 00462 <font class="comment"># This is the Unicode Hebrew *accent*; there is also another Hebrew</font> 00463 <font class="comment"># *punctuation* called GERSHAYIM 0x05F4. I'm using the more</font> 00464 <font class="comment"># traditional rounded marks, rather than the alternate straight</font> 00465 <font class="comment"># marks.</font> 00466 <font class="comment"> '62' => $GERSHAYIM,</font> 00467 <font class="comment"># Also known as azla</font> 00468 <font class="comment"> '63' => $QADMA,</font> 00469 <font class="comment"> '64' => $ILUY,</font> 00470 <font class="comment"> '65' => $SHALSHELET,</font> 00471 <font class="comment"> '80' => $ZAQEF_QATAN,</font> 00472 <font class="comment"> '81' => $REVIA,</font> 00473 <font class="comment"># Note, this accent is actually sinnorit, but it does not exist as a</font> 00474 <font class="comment"># separate glyph in the Unicode standard. The 'ZINOR' Unicode accent</font> 00475 <font class="comment"># is postpositive, while sinnorit is not. 
ZARQA is as close as I can</font> 00476 <font class="comment"># get to this.</font> 00477 <font class="comment"> '82' => $ZARQA,</font> 00478 <font class="comment"># The Unicode form does not match the form used by BHS, but the names</font> 00479 <font class="comment"># are the same.</font> 00480 <font class="comment"> '83' => $PAZER,</font> 00481 <font class="comment"> '84' => $QARNEY_PARA,</font> 00482 <font class="comment"> '85' => $ZAQEF_GADOL,</font> 00483 <font class="comment"># Note Michigan encoding distinguishes between medial metheg '35' (occurring</font> 00484 <font class="comment"># on the left of the vowel), and the ordinary meteg '95' (occurring on the</font> 00485 <font class="comment"># right of the vowel). It is also used for silluq.</font> 00486 <font class="comment"> '35' => $METAG,</font> 00487 <font class="comment"> '70' => $MAHAPAKH,</font> 00488 <font class="comment"> '71' => $MERKHA,</font> 00489 <font class="comment"> '72' => $MERKHA_KEFULA,</font> 00490 <font class="comment"> '73' => $TIPEHA, # also '13', '73' also is used for majela</font> 00491 <font class="comment"> '74' => $MUNAH,</font> 00492 <font class="comment"> '75' => $METAG, # this is silluq; should appear to the left of the vowel</font> 00493 <font class="comment"> '91' => $TEVIR,</font> 00494 <font class="comment"> '92' => $ETNAHTA,</font> 00495 <font class="comment"> '93' => $YERAH_BEN_YOMO,</font> 00496 <font class="comment"> '94' => $DARGA,</font> 00497 <font class="comment"> '95' => $METAG, # should appear to the right of the vowel</font> 00498 <font class="comment"></font> 00499 <font class="comment"># Not used by the Michigan Encoding</font> 00500 <font class="comment"># $UPPER_DOT = '05C4';</font> 00501 <font class="comment"> );</font> 00502 <font class="comment"></font> 00503 <font class="comment"># declare other variables</font> 00504 <font class="comment"> my (@bhsLines,</font> 00505 <font class="comment"> @bhsVerse,</font> 00506 <font class="comment"> @entity_line) = 
();</font> 00507 <font class="comment"></font> 00508 <font class="comment"> my ($i,</font> 00509 <font class="comment"> $verse,</font> 00510 <font class="comment"> $word,</font> 00511 <font class="comment"> $character) = 0;</font> 00512 <font class="comment"></font> 00513 <font class="comment"> my ($element,</font> 00514 <font class="comment"> $saveGuttural) = "";</font> 00515 <font class="comment"></font> 00516 <font class="comment"># read in a line</font> 00517 <font class="comment"> while (<>) {</font> 00518 <font class="comment"># Process one verse</font> 00519 <font class="comment"># iterate over every character and change to XML decimal entity</font> 00520 <font class="comment"> CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) {</font> 00521 <font class="comment"> # find and convert final kaf, mem, nun, pe, tsade</font> 00522 <font class="comment"> ( # if final form</font> 00523 <font class="comment"> $bhsVerse[$i] =~ /[KMNPC]/</font> 00524 <font class="comment"> )</font> 00525 <font class="comment"> &&</font> 00526 <font class="comment"> (</font> 00527 <font class="comment"> ( # whitespace or</font> 00528 <font class="comment"> $bhsVerse[$i+1] =~ /[ \-?]/</font> 00529 <font class="comment"> )</font> 00530 <font class="comment"> ||</font> 00531 <font class="comment"> ( # EOL or</font> 00532 <font class="comment"> $i == ( scalar(@bhsVerse) - 1 )</font> 00533 <font class="comment"> )</font> 00534 <font class="comment"> ||</font> 00535 <font class="comment"> ( # sof pasuq or</font> 00536 <font class="comment"> ( $bhsVerse[$i+1] =~ /0/ ) &&</font> 00537 <font class="comment"> ( $bhsVerse[$i+2] =~ /0/ )</font> 00538 <font class="comment"> )</font> 00539 <font class="comment"> ||</font> 00540 <font class="comment"> ( # one accent followed by white, eol or</font> 00541 <font class="comment"> (</font> 00542 <font class="comment"> ( $bhsVerse[$i+1] =~ /\d/ ) &&</font> 00543 <font class="comment"> ( $bhsVerse[$i+2] =~ /\d/ )</font> 00544 <font class="comment"> ) 
&&</font> 00545 <font class="comment"> (</font> 00546 <font class="comment"> ( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||</font> 00547 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) )</font> 00548 <font class="comment"> )</font> 00549 <font class="comment"> )</font> 00550 <font class="comment"> ||</font> 00551 <font class="comment"> ( # two accents followed by white, eol</font> 00552 <font class="comment"> (</font> 00553 <font class="comment"> ( $bhsVerse[$i+1] =~ /\d/ ) &&</font> 00554 <font class="comment"> ( $bhsVerse[$i+2] =~ /\d/ ) &&</font> 00555 <font class="comment"> ( $bhsVerse[$i+3] =~ /\d/ ) &&</font> 00556 <font class="comment"> ( $bhsVerse[$i+4] =~ /\d/ )</font> 00557 <font class="comment"> ) &&</font> 00558 <font class="comment"> (</font> 00559 <font class="comment"> ( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||</font> 00560 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) )</font> 00561 <font class="comment"> )</font> 00562 <font class="comment"> )</font> 00563 <font class="comment"> ||</font> 00564 <font class="comment"> ( # followed by a vowel and white, eol, sof pasuq</font> 00565 <font class="comment"> ( $bhsVerse[$i+1] =~ /[:F]/ ) &&</font> 00566 <font class="comment"> ( # followed by</font> 00567 <font class="comment"> ( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or</font> 00568 <font class="comment"> ( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or</font> 00569 <font class="comment"> ( # sof pasuq</font> 00570 <font class="comment"> ( $bhsVerse[$i+2] =~ /0/ ) &&</font> 00571 <font class="comment"> ( $bhsVerse[$i+3] =~ /0/ )</font> 00572 <font class="comment"> )</font> 00573 <font class="comment"> )</font> 00574 <font class="comment"> )</font> 00575 <font class="comment"> ) # end of what follows after final letter</font> 00576 <font class="comment"> &&</font> 00577 <font class="comment"> do {</font> 00578 <font class="comment"> $bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; }</font> 00579 <font class="comment"> && next CHAR;</font> 
00580 <font class="comment"> $bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; }</font> 00581 <font class="comment"> && next CHAR;</font> 00582 <font class="comment"> $bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; }</font> 00583 <font class="comment"> && next CHAR;</font> 00584 <font class="comment"> $bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; }</font> 00585 <font class="comment"> && next CHAR;</font> 00586 <font class="comment"> $bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; }</font> 00587 <font class="comment"> && next CHAR;</font> 00588 <font class="comment"> };</font> 00589 <font class="comment"> # find and convert "furtive patach"</font> 00590 <font class="comment"> ( $bhsVerse[$i] =~ /A/ ) && # If the letter is a patach</font> 00591 <font class="comment"> ( $bhsVerse[$i-1] =~ /[)HX(]/ ) && # and is preceded by a guttural</font> 00592 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceded by a vowel</font> 00593 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /\./ ) && # or by suruq</font> 00594 <font class="comment"> ( $bhsVerse[$i-3] =~ /W/ ) ) || #</font> 00595 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /W/ ) && # or by holem (written plene)</font> 00596 <font class="comment"> ( $bhsVerse[$i-3] =~ /O/ ) ) || #</font> 00597 <font class="comment"> ( ( $bhsVerse[$i-2] =~ /Y/ ) && # or by hiriq-yod</font> 00598 <font class="comment"> ( $bhsVerse[$i-3] =~ /I/ ) ) ) &&</font> 00599 <font class="comment"> do {</font> 00600 <font class="comment"> $saveGuttural = pop @entity_line; # snip off the guttural</font> 00601 <font class="comment"> push @entity_line,$PATAH; # push on the patach</font> 00602 <font class="comment"> push @entity_line,$saveGuttural; # push back on the guttural</font> 00603 <font class="comment"> next CHAR;</font> 00604 <font class="comment"> };</font> 00605 <font class="comment"> # convert cantillation</font> 00606 <font class="comment"> # since we have
previously dealt with all other cases of</font> 00607 <font class="comment"> # numbers, two digit patterns are all we have to search for</font> 00608 <font class="comment"> $bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do {</font> 00609 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font> 00610 <font class="comment"> $i++; # accents are two digits long, so advance past the 2nd digit</font> 00611 <font class="comment"> next CHAR;</font> 00612 <font class="comment"> };</font> 00613 <font class="comment"> # convert katef vowels, which are two characters long</font> 00614 <font class="comment"> $bhsVerse[$i] =~ /:/ && $bhsVerse[$i+1] =~ /[AEF]/ && do {</font> 00615 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};</font> 00616 <font class="comment"> $i++;</font> 00617 <font class="comment"> next CHAR;</font> 00618 <font class="comment"> };</font> 00619 <font class="comment"> # convert everything else</font> 00620 <font class="comment"> push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};</font> 00621 <font class="comment"> } # end CHAR</font> 00622 <font class="comment"># print the line to standard output with XML character-level encoding</font> 00623 <font class="comment"># each character has the following format:</font> 00624 <font class="comment"># <c id="1kg1.verse#.word#.character#">&#1234;</c></font> 00625 <font class="comment"></font> 00626 <font class="comment"># set up the verse element</font> 00627 <font class="comment"> $word = 1;</font> 00628 <font class="comment"> $character = 1;</font> 00629 <font class="comment"> print "<verse>\n<word>\n";</font> 00630 <font class="comment"># print each character element</font> 00631 <font class="comment"># if there is a space, then close the word element, open a new word</font> 00632 <font class="comment"># element, increment the word number, reset the character number to</font> 00633 <font class="comment"># one.</font>
00634 <font class="comment"> foreach $element (@entity_line) {</font> 00635 <font class="comment"> if ( $element =~ " " ) {</font> 00636 <font class="comment"> $word++;</font> 00637 <font class="comment"> $character = 1;</font> 00638 <font class="comment"> print "</word>\n<word>\n";</font> 00639 <font class="comment"> next;</font> 00640 <font class="comment"> }</font> 00641 <font class="comment"> print "<c id=\"1kg1.$verse.$word.$character\">$element</c>\n";</font> 00642 <font class="comment"> $character++;</font> 00643 <font class="comment"> }</font> 00644 <font class="comment"># close the verse element</font> 00645 <font class="comment"> print "</word></verse>\n";</font> 00646 <font class="comment"># reinitialize variables</font> 00647 <font class="comment"> @bhsVerse = ();</font> 00648 <font class="comment"> @entity_line = ();</font> 00649 <font class="comment"> @bhsLines = ();</font> 00650 <font class="comment"> } # end while</font> 00651 <font class="comment"># close the XML document</font> 00652 <font class="comment"> print "</body>\n";</font> 00653 <font class="comment"> */</font> </pre></div><hr><address align="right"><small>Generated on Thu Jun 20 22:12:59 2002 for The Sword Project by <a href="http://www.doxygen.org/index.html"> <img src="doxygen.png" alt="doxygen" align="middle" border=0 width=110 height=53></a>1.2.15 </small></address> </body> </html>