3
* Title: Keyboard mapping for Michigan-Claremont Hebrew input
5
* Copyright: Copyright (c) 2001 CrossWire Bible Society under the terms of the GNU GPL
7
* @author Troy A. Griffitts
11
#include <hebrewmcim.h>
13
HebrewMCIM::HebrewMCIM()
20
int *HebrewMCIM::translate(char in) {
22
static int retString[5];
23
int retStringIndex = 0;
25
// Clear the whole static return buffer. memset counts bytes, not elements,
// so the size must be sizeof(retString) (5 ints); a literal 5 would leave
// most of the buffer holding values from the previous call.
memset(retString, 0, sizeof(retString));
28
if (getState() >= 12) { // serious issue with internal structure
30
retString[retStringIndex++] = in;
33
map<int, int>::iterator find = subst2[getState()].find(in);
34
if (find != subst2[getState()].end())
35
retVal = find->second;
39
retString[retStringIndex++] = retVal;
47
retString[retStringIndex++] = in;
52
retString[retStringIndex++] = retVal;
55
if (retVal == 50) { // multiChar
57
int *chars = multiChars[in];
59
retString[retStringIndex++] = chars[0];
60
retString[retStringIndex++] = chars[1];
70
void HebrewMCIM::init() {
71
// NOTE(review): memset's count is in bytes. subst is later assigned values
// such as 1498, which do not fit in a single byte, so if its elements are
// wider than char this clears only part of the table. The declaration of
// subst is not visible here -- confirm its type and consider sizeof(subst).
memset(subst, 0, 255);
84
subst['k'] = 1498; // finals
102
// special multiChars
106
static int x[] = {1513, 1474};
108
static int y[] = {1513, 1473};
134
subst2[2]['A'] = 1458;
135
subst2[2]['E'] = 1457;
136
subst2[2]['F'] = 1459;
139
/* Telisha qetana is postpositive as in '04' above. However, Michigan
140
# code '24' is for a medial telisha. Graphically, there is no
144
subst2[5]['4'] = 1449;
147
/* Note Michigan encoding distinguishes between medial metheg '35' (occurring
148
# on the left of the vowel), and the ordinary meteg '95' (occurring on the
149
# right of the vowel). It is also used for silluq.
152
subst2[6]['3'] = 1433;
153
subst2[6]['5'] = 1469;
156
/* The Michigan code of telisha gedola in medial position. Graphically,
157
# there is no difference.
160
subst2[7]['4'] = 1440;
163
subst2[8]['0'] = 1451;
164
subst2[8]['1'] = 1436;
167
subst2[4]['0'] = 1434;
169
/* In the poetic books, prepositive dehi occurs; it's unclear whether
170
# tipeha also occurs in the poetic books. Otherwise, we could simply
171
# check for what book in the Tanach we are in. Michigan uses the same
175
subst2[4]['3'] = 1430;
177
/* This is the poetic accent mugrash, which also includes rebia, but is
178
# encoded separately as '81' in the Michigan text.
180
subst2[4]['1'] = 1437;
181
subst2[4]['4'] = 1440;
185
subst2[3]['0'] = 1475;
186
subst2[3]['1'] = 1426;
188
/* According to BHS, zarqa and sinnor are both postpositive. However,
189
# the Michigan encoding uses one code for both. The Unicode zarqa
190
# (0x0598) is definitely NOT postpositive. And further, the shape of
191
# the symbol is different in BHS and Unicode. This needs further
192
# research to determine what's going on here. For now, we follow BHS
193
# and use the postpositive Unicode zinor for both accents.
196
subst2[3]['2'] = 1454;
198
/* Pashta is postpositive, and the Unicode equivalent reflects
199
# this. However, there is a poetic equivalent -- azla legarmeh --
200
# which is not postpositive, but no equivalent code point exists in
201
# Unicode. The Michigan encoding does not distinguish between the two,
202
# although it could be algorithmically determined.
205
subst2[3]['3'] = 1433;
206
subst2[3]['4'] = 1449;
207
subst2[3]['5'] = 1472;
210
/* This is the Unicode Hebrew *accent*; there is also another Hebrew
211
# *punctuation* called GERSHAYIM 0x05F4. I'm using the more
212
# traditional rounded marks, rather than the alternate straight
216
subst2[8]['2'] = 1438;
218
// Also known as azla
219
subst2[8]['3'] = 1448;
220
subst2[8]['4'] = 1452;
221
subst2[8]['5'] = 1427;
225
subst2[9]['0'] = 1428;
226
subst2[9]['1'] = 1431;
228
/* Note, this accent is actually sinnorit, but it does not exist as a
229
# separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
230
# is postpositive, while sinnorit is not. ZARQA is as close as I can
233
subst2[9]['2'] = 1432;
235
/* The Unicode form does not match the form used by BHS, but the names
238
subst2[9]['3'] = 1441;
239
subst2[9]['4'] = 1439;
240
subst2[9]['5'] = 1429;
243
subst2[10]['0'] = 1444;
244
subst2[10]['1'] = 1445;
245
subst2[10]['2'] = 1446;
246
subst2[10]['3'] = 1430; // also '13', '73' also is used for majela
247
subst2[10]['4'] = 1443;
248
subst2[10]['5'] = 1469; // this is silluq; should appear to the left of the vowel
251
subst2[11]['1'] = 1435;
252
subst2[11]['2'] = 1425;
253
subst2[11]['3'] = 1450;
254
subst2[11]['4'] = 1447;
255
subst2[11]['5'] = 1469; // should appear to the right of the vowel
264
my $ETNAHTA = '֑';
265
# officially the Unicode name for this symbol was "SEGOL." However, that is
266
# not a unique name, conflicting with the vowel of the same name. Further,
267
# the position of the symbol is different. I have changed the name of the
268
# accent to "SEGOLTA," the traditional name for this accent.
269
my $SEGOLTA = '֒';
270
my $SHALSHELET = '֓';
271
my $ZAQEF_QATAN = '֔';
272
my $ZAQEF_GADOL = '֕';
273
my $TIPEHA = '֖';
274
my $REVIA = '֗';
275
my $ZARQA = '֘';
276
my $PASHTA = '֙';
277
my $YETIV = '֚';
278
my $TEVIR = '֛';
279
my $GERESH = '֜';
280
my $GERESH_MUQDAM = '֝';
281
my $GERSHAYIM = '֞';
282
my $QARNEY_PARA = '֟';
283
my $TELISHA_GEDOLA = '֠';
284
my $PAZER = '֡';
285
my $MUNAH = '֣';
286
my $MAHAPAKH = '֤';
287
my $MERKHA = '֥';
288
my $MERKHA_KEFULA = '֦';
289
my $DARGA = '֧';
290
my $QADMA = '֨';
291
my $TELISHA_QETANA = '֩';
292
my $YERAH_BEN_YOMO = '֪';
294
my $ILUY = '֬';
295
my $DEHI = '֭';
296
my $ZINOR = '֮';
298
my $MASORA_CIRCLE = '֯';
299
# HEBREW EXTENDED-A points and punctuation
300
my $SHEVA = 'ְ';
301
my $HATAF_SEGOL = 'ֱ';
302
my $HATAF_PATAH = 'ֲ';
303
my $HATAF_QAMATS = 'ֳ';
304
my $HIRIQ = 'ִ';
305
my $TSERE = 'ֵ';
306
my $SEGOL = 'ֶ';
307
# furtive Patah is not a distinct character
308
my $PATAH = 'ַ';
309
my $QAMATS = 'ָ';
310
my $HOLAM = 'ֹ';
311
my $QUBUTS = 'ֻ';
312
# also used as shuruq
313
# falls within the base letter
314
my $DAGESH_OR_MAPIQ = 'ּ';
316
my $METAG = 'ֽ';
317
my $MAQAF = '־';
318
my $RAFE = 'ֿ';
319
# Also used for legarmeh
320
# may be treated as spacing punctuation, not as a point
321
my $PASEQ = '׀';
322
my $SHIN_DOT = 'ׁ';
323
my $SIN_DOT = 'ׂ';
324
my $SOF_PASUQ = '׃';
326
my $UPPER_DOT = 'ׄ';
327
# HEBREW LETTERS based on ISO 8859-8
329
# x (alef symbol - 2135)
330
my $ALEF = 'א';
331
# x (bet symbol - 2136)
333
# x (gimel symbol - 2137)
334
my $GIMEL = 'ג';
335
# x (dalet symbol - 2138)
336
my $DALET = 'ד';
339
my $ZAYIN = 'ז';
343
my $FINAL_KAF = 'ך';
345
my $LAMED = 'ל';
346
my $FINAL_MEM = 'ם';
348
my $FINAL_NUN = 'ן';
350
my $SAMEKH = 'ס';
351
my $AYIN = 'ע';
352
my $FINAL_PE = 'ף';
354
my $FINAL_TSADI = 'ץ';
356
my $TSADI = 'צ';
358
my $RESH = 'ר';
359
my $SHIN = 'ש';
364
my $DOUBLE_VAV = 'װ';
365
my $VAV_YOD = 'ױ';
367
my $DOUBLE_YOD = 'ײ';
369
# Additional punctuation
370
my $PUNCT_GERESH = '׳';
371
my $PUNCT_GERSHAYIM = '״';
373
# x (hebrew point judeo-spanish varika - FB1E)
374
#my $JUDEO_SPANISH_VARIKA = pack("U",0xFB1E); # UTF-8 0xFB1E
376
#############################
377
# End of Unicode 2.0 Hebrew #
378
#############################
380
# A hash whose key is a Michigan code, and whose value is a Unicode
383
char subst[] = new char [255];
404
'#' => $SHIN, # the letter shin without a point
405
'&' => ($SHIN . $SIN_DOT),
406
'$' => ($SHIN . $SHIN_DOT), # '
417
':A' => $HATAF_PATAH,
418
':E' => $HATAF_SEGOL,
419
':F' => $HATAF_QAMATS,
421
'.' => $DAGESH_OR_MAPIQ,
427
# According to BHS, zarqa and sinnor are both postpositive. However,
428
# the Michigan encoding uses one code for both. The Unicode zarqa
429
# (0x0598) is definitely NOT postpositive. And further, the shape of
430
# the symbol is different in BHS and Unicode. This needs further
431
# research to determine what's going on here. For now, we follow BHS
432
# and use the postpositive Unicode zinor for both accents.
434
# Pashta is postpositive, and the Unicode equivalent reflects
435
# this. However, there is a poetic equivalent -- azla legarmeh --
436
# which is not postpositive, but no equivalent code point exists in
437
# Unicode. The Michigan encoding does not distinguish between the two,
438
# although it could be algorithmically determined.
440
'04' => $TELISHA_QETANA,
443
# In the poetic books, prepositive dehi occurs; it's unclear whether
444
# tipeha also occurs in the poetic books. Otherwise, we could simply
445
# check for what book in the Tanach we are in. Michigan uses the same
447
'13' => $TIPEHA, # also $DEHI
448
# This is the poetic accent mugrash, which also includes rebia, but is
449
# encoded separately as '81' in the Michigan text.
450
'11' => $GERESH_MUQDAM,
451
'14' => $TELISHA_GEDOLA,
452
# Telisha qetana is postpositive as in '04' above. However, Michigan
453
# code '24' is for a medial telisha. Graphically, there is no
455
'24' => $TELISHA_QETANA,
457
# The Michigan code of telisha gedola in medial position. Graphically,
458
# there is no difference.
459
'44' => $TELISHA_GEDOLA,
462
# This is the Unicode Hebrew *accent*; there is also another Hebrew
463
# *punctuation* called GERSHAYIM 0x05F4. I'm using the more
464
# traditional rounded marks, rather than the alternate straight
471
'80' => $ZAQEF_QATAN,
473
# Note, this accent is actually sinnorit, but it does not exist as a
474
# separate glyph in the Unicode standard. The 'ZINOR' Unicode accent
475
# is postpositive, while sinnorit is not. ZARQA is as close as I can
478
# The Unicode form does not match the form used by BHS, but the names
481
'84' => $QARNEY_PARA,
482
'85' => $ZAQEF_GADOL,
483
# Note Michigan encoding distinguishes between medial metheg '35' (occurring
484
# on the left of the vowel), and the ordinary meteg '95' (occurring on the
485
# right of the vowel). It is also used for silluq.
489
'72' => $MERKHA_KEFULA,
490
'73' => $TIPEHA, # also '13', '73' also is used for majela
492
'75' => $METAG, # this is silluq; should appear to the left of the vowel
495
'93' => $YERAH_BEN_YOMO,
497
'95' => $METAG, # should appear to the right of the vowel
499
# Not used by the Michigan Encoding
500
# $UPPER_DOT = '05C4';
503
# declare other variables
519
# iterate over every character and change to XML decimal entity
520
CHAR: for ( $i = 0; ($i < scalar(@bhsVerse)); $i++) {
521
# find and convert final kaf, mem, nun, pe, tsade
523
$bhsVerse[$i] =~ /[KMNPC]/
528
$bhsVerse[$i+1] =~ /[ \-?]/
532
$i == ( scalar(@bhsVerse) - 1 )
536
( $bhsVerse[$i+1] =~ /0/ ) &&
537
( $bhsVerse[$i+2] =~ /0/ )
540
( # one accent followed by white, eol or
542
( $bhsVerse[$i+1] =~ /\d/ ) &&
543
( $bhsVerse[$i+2] =~ /\d/ )
546
( $bhsVerse[$i+3] =~ /[ \-?]/ ) ||
547
( $i == ( scalar(@bhsVerse) - 1 ) )
551
( # two accents followed by white, eol
553
( $bhsVerse[$i+1] =~ /\d/ ) &&
554
( $bhsVerse[$i+2] =~ /\d/ ) &&
555
( $bhsVerse[$i+3] =~ /\d/ ) &&
556
( $bhsVerse[$i+4] =~ /\d/ )
559
( $bhsVerse[$i+5] =~ /[ \-?]/ ) ||
560
( $i == ( scalar(@bhsVerse) - 1 ) )
564
( # followed by a vowel and white, eol, sof pasuq
565
( $bhsVerse[$i+1] =~ /[:F]/ ) &&
567
( $bhsVerse[$i+2] =~ /[ \-?]/ ) || # whitespace or
568
( $i == ( scalar(@bhsVerse) - 1 ) ) || # eol or
570
( $bhsVerse[$i+2] =~ /0/ ) &&
571
( $bhsVerse[$i+3] =~ /0/ )
575
) # end of what follows after final letter
578
$bhsVerse[$i] =~ /K/ && eval { push @entity_line,$FINAL_KAF; }
580
$bhsVerse[$i] =~ /M/ && eval { push @entity_line,$FINAL_MEM; }
582
$bhsVerse[$i] =~ /N/ && eval { push @entity_line,$FINAL_NUN; }
584
$bhsVerse[$i] =~ /P/ && eval { push @entity_line,$FINAL_PE; }
586
$bhsVerse[$i] =~ /C/ && eval { push @entity_line,$FINAL_TSADI; }
589
# find and convert "furtive patach"
590
( $bhsVerse[$i] =~ /A/ ) && # If the letter is a patach
591
( $bhsVerse[$i-1] =~ /[)HX(]/ ) && # and is preceded by a guttural
592
( ( $bhsVerse[$i-2] =~ /[AEFOU]/ ) || # and is preceded by a vowel
593
( ( $bhsVerse[$i-2] =~ /\./ ) && # or by suruq
594
( $bhsVerse[$i-3] =~ /W/ ) ) || #
595
( ( $bhsVerse[$i-2] =~ /W/ ) && # or by holem (written plene)
596
( $bhsVerse[$i-3] =~ /O/ ) ) || #
597
( ( $bhsVerse[$i-2] =~ /Y/ ) && # or by hiriq-yod
598
( $bhsVerse[$i-3] =~ /I/ ) ) ) &&
600
$saveGuttural = pop @entity_line; # snip off the guttural
601
push @entity_line,$PATAH; # push on the patach
602
push @entity_line,$saveGuttural; # push back on the guttural
605
# convert cantillation
606
# since we have previously dealt with all other cases of
607
# numbers, two digit patterns are all we have to search for
608
$bhsVerse[$i] =~ /\d/ && $bhsVerse[$i+1] =~ /\d/ && do {
609
push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
610
$i++; # accents are two digits long, so advance past the 2nd digit
613
# convert katef vowels, which are two characters long
614
$bhsVerse[$i] =~ /:/ && $bhsVerse[$i+1] =~ /[AEF]/ && do {
615
push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]$bhsVerse[$i+1]"};
619
# convert everything else
620
push @entity_line,$Michigan2XMLentity{"$bhsVerse[$i]"};
622
# print the line to standard output with XML character-level encoding
623
# each character has the following format:
624
# <c id="1kg1.verse#.word#.character#">Ӓ</c>
626
# set up the verse element
629
print "<verse>\n<word>\n";
630
# print each character element
631
# if there is a space, then close the word entity, open a new word
632
# entity, increment the word number, reset the character number to
634
foreach $element (@entity_line) {
635
if ( $element =~ " " ) {
638
print "</word>\n<word>\n";
641
print "<c id=\"1kg1.$verse.$word.$character\">$element</c>\n";
644
# close the verse element
645
print "</word></verse>\n";
646
# reinitialize variables
651
# close the XML document