1
/***************************************************************************
2
copyright : (C) 2006 by Robby Stephenson
3
email : robby@periapsis.org
4
***************************************************************************/
6
/***************************************************************************
8
* This file has been modified to match the requirements of KBibTeX. *
9
* In case of problems or bugs arising from this implementation, please *
10
* contact the KBibTeX team first. *
11
* Thomas Fischer <fischer@unix-ag.uni-kl.de> *
13
***************************************************************************/
15
/***************************************************************************
17
* This program is free software; you can redistribute it and/or modify *
18
* it under the terms of version 2 of the GNU General Public License as *
19
* published by the Free Software Foundation; *
21
***************************************************************************/
23
// This class is adapted from Iso6937ToUnicode from the MARC4J project, available
24
// from http://marc4j.tigris.org, with the following notice:
25
// * Copyright (C) 2002 Bas Peters (mail@bpeters.com)
26
// * Copyright (C) 2002 Yves Pratter (ypratter@club-internet.fr)
28
// That source was released under the terms of the GNU Lesser General Public
29
// License, version 2.1. In accordance with Condition 3 of that license,
30
// I am applying the terms of the GNU General Public License to the source
31
// code, and including a large portion of it here
33
#include "iso6937converter.h"
38
using KBibTeX::Iso6937Converter;
40
QString Iso6937Converter::toUtf8(const QCString& text_) {
41
const uint len = text_.length();
45
for(uint i = 0; i < len; ++i) {
49
} else if(isCombining(c) && hasNext(i, len)) {
50
QChar d = getCombiningChar(c * 256 + text_[i + 1]);
55
result[pos++] = getChar(c);
58
result[pos++] = getChar(c);
66
bool Iso6937Converter::hasNext(uint pos, uint len) {
67
return pos < (len - 1);
71
bool Iso6937Converter::isAscii(uchar c) {
76
bool Iso6937Converter::isCombining(uchar c) {
77
return c >= 0xC0 && c <= 0xDF;
80
// Source : http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/6937cd.pdf
81
QChar Iso6937Converter::getChar(uchar c) {
84
return 0x00A0; // 10/00 NO-BREAK SPACE
86
return 0x00A1; // 10/01 INVERTED EXCLAMATION MARK
88
return 0x00A2; // 10/02 CENT SIGN
90
return 0x00A3; // 10/03 POUND SIGN
91
// 10/04 (This position shall not be used)
93
return 0x00A5; // 10/05 YEN SIGN
94
// 10/06 (This position shall not be used)
96
return 0x00A7; // 10/07 SECTION SIGN
98
return 0x00A4; // 10/08 CURRENCY SIGN
100
return 0x2018; // 10/09 LEFT SINGLE QUOTATION MARK
102
return 0x201C; // 10/10 LEFT DOUBLE QUOTATION MARK
104
return 0x00AB; // 10/11 LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
106
return 0x2190; // 10/12 LEFTWARDS ARROW
108
return 0x2191; // 10/13 UPWARDS ARROW
110
return 0x2192; // 10/14 RIGHTWARDS ARROW
112
return 0x2193; // 10/15 DOWNWARDS ARROW
115
return 0x00B0; // 11/00 DEGREE SIGN
117
return 0x00B1; // 11/01 PLUS-MINUS SIGN
119
return 0x00B2; // 11/02 SUPERSCRIPT TWO
121
return 0x00B3; // 11/03 SUPERSCRIPT THREE
123
return 0x00D7; // 11/04 MULTIPLICATION SIGN
125
return 0x00B5; // 11/05 MICRO SIGN
127
return 0x00B6; // 11/06 PILCROW SIGN
129
return 0x00B7; // 11/07 MIDDLE DOT
131
return 0x00F7; // 11/08 DIVISION SIGN
133
return 0x2019; // 11/09 RIGHT SINGLE QUOTATION MARK
135
return 0x201D; // 11/10 RIGHT DOUBLE QUOTATION MARK
137
return 0x00BB; // 11/11 RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
139
return 0x00BC; // 11/12 VULGAR FRACTION ONE QUARTER
141
return 0x00BD; // 11/13 VULGAR FRACTION ONE HALF
143
return 0x00BE; // 11/14 VULGAR FRACTION THREE QUARTERS
145
return 0x00BF; // 11/15 INVERTED QUESTION MARK
147
// 12/00 to 12/15 are non-spacing diacritical marks; see getCombiningChar()
150
return 0x2015; // 13/00 HORIZONTAL BAR
152
return 0x00B9; // 13/01 SUPERSCRIPT ONE
154
return 0x2117; // 13/02 REGISTERED SIGN
156
return 0x00A9; // 13/03 COPYRIGHT SIGN
158
return 0x00AE; // 13/04 TRADE MARK SIGN
160
return 0x266A; // 13/05 EIGHTH NOTE
162
return 0x00AC; // 13/06 NOT SIGN
164
return 0x00A6; // 13/07 BROKEN BAR
165
// 13/08 (This position shall not be used)
166
// 13/09 (This position shall not be used)
167
// 13/10 (This position shall not be used)
168
// 13/11 (This position shall not be used)
170
return 0x215B; // 13/12 VULGAR FRACTION ONE EIGHTH
172
return 0x215E; // 13/15 VULGAR FRACTION SEVEN EIGHTHS
175
return 0x2126; // 14/00 OHM SIGN
177
return 0x00C6; // 14/01 LATIN CAPITAL LETTER AE
179
return 0x0110; // 14/02 LATIN CAPITAL LETTER D WITH STROKE
181
return 0x00AA; // 14/03 FEMININE ORDINAL INDICATOR
183
return 0x0126; // 14/04 LATIN CAPITAL LETTER H WITH STROKE
184
// 14/05 (This position shall not be used)
186
return 0x0132; // 14/06 LATIN CAPITAL LIGATURE IJ
188
return 0x013F; // 14/07 LATIN CAPITAL LETTER L WITH MIDDLE DOT
190
return 0x0141; // 14/08 LATIN CAPITAL LETTER L WITH STROKE
192
return 0x00D8; // 14/09 LATIN CAPITAL LETTER O WITH STROKE
194
return 0x0152; // 14/10 LATIN CAPITAL LIGATURE OE
196
return 0x00BA; // 14/11 MASCULINE ORDINAL INDICATOR
198
return 0x00DE; // 14/12 LATIN CAPITAL LETTER THORN
200
return 0x0166; // 14/13 LATIN CAPITAL LETTER T WITH STROKE
202
return 0x014A; // 14/14 LATIN CAPITAL LETTER ENG
204
return 0x0149; // 14/15 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
207
return 0x0138; // 15/00 LATIN SMALL LETTER KRA
209
return 0x00E6; // 15/01 LATIN SMALL LETTER AE
211
return 0x0111; // 15/02 LATIN SMALL LETTER D WITH STROKE
213
return 0x00F0; // 15/03 LATIN SMALL LETTER ETH
215
return 0x0127; // 15/04 LATIN SMALL LETTER H WITH STROKE
217
return 0x0131; // 15/05 LATIN SMALL LETTER DOTLESS I
219
return 0x0133; // 15/06 LATIN SMALL LIGATURE IJ
221
return 0x0140; // 15/07 LATIN SMALL LETTER L WITH MIDDLE DOT
223
return 0x0142; // 15/08 LATIN SMALL LETTER L WITH STROKE
225
return 0x00F8; // 15/09 LATIN SMALL LETTER O WITH STROKE
227
return 0x0153; // 15/10 LATIN SMALL LIGATURE OE
229
return 0x00DF; // 15/11 LATIN SMALL LETTER SHARP S
231
return 0x00FE; // 15/12 LATIN SMALL LETTER THORN
233
return 0x0167; // 15/13 LATIN SMALL LETTER T WITH STROKE
235
return 0x014B; // 15/14 LATIN SMALL LETTER ENG
237
return 0x00AD; // 15/15 SOFT HYPHEN
243
QChar Iso6937Converter::getCombiningChar(uint c) {
245
// 12/00 (This position shall not be used)
247
// 12/01 non-spacing grave accent
249
return 0x00C0; // LATIN CAPITAL LETTER A WITH GRAVE
251
return 0x00C8; // LATIN CAPITAL LETTER E WITH GRAVE
253
return 0x00CC; // LATIN CAPITAL LETTER I WITH GRAVE
255
return 0x00D2; // LATIN CAPITAL LETTER O WITH GRAVE
257
return 0x00D9; // LATIN CAPITAL LETTER U WITH GRAVE
259
return 0x00E0; // LATIN SMALL LETTER A WITH GRAVE
261
return 0x00E8; // LATIN SMALL LETTER E WITH GRAVE
263
return 0x00EC; // LATIN SMALL LETTER I WITH GRAVE
265
return 0x00F2; // LATIN SMALL LETTER O WITH GRAVE
267
return 0x00F9; // LATIN SMALL LETTER U WITH GRAVE
269
// 12/02 non-spacing acute accent
271
return 0x00B4; // ACUTE ACCENT
273
return 0x00C1; // LATIN CAPITAL LETTER A WITH ACUTE
275
return 0x0106; // LATIN CAPITAL LETTER C WITH ACUTE
277
return 0x00C9; // LATIN CAPITAL LETTER E WITH ACUTE
279
return 0x00CD; // LATIN CAPITAL LETTER I WITH ACUTE
281
return 0x0139; // LATIN CAPITAL LETTER L WITH ACUTE
283
return 0x0143; // LATIN CAPITAL LETTER N WITH ACUTE
285
return 0x00D3; // LATIN CAPITAL LETTER O WITH ACUTE
287
return 0x0154; // LATIN CAPITAL LETTER R WITH ACUTE
289
return 0x015A; // LATIN CAPITAL LETTER S WITH ACUTE
291
return 0x00DA; // LATIN CAPITAL LETTER U WITH ACUTE
293
return 0x00DD; // LATIN CAPITAL LETTER Y WITH ACUTE
295
return 0x0179; // LATIN CAPITAL LETTER Z WITH ACUTE
297
return 0x00E1; // LATIN SMALL LETTER A WITH ACUTE
299
return 0x0107; // LATIN SMALL LETTER C WITH ACUTE
301
return 0x00E9; // LATIN SMALL LETTER E WITH ACUTE
303
return 0x01F5; // LATIN SMALL LETTER G WITH CEDILLA(4)
305
return 0x00ED; // LATIN SMALL LETTER I WITH ACUTE
307
return 0x013A; // LATIN SMALL LETTER L WITH ACUTE
309
return 0x0144; // LATIN SMALL LETTER N WITH ACUTE
311
return 0x00F3; // LATIN SMALL LETTER O WITH ACUTE
313
return 0x0155; // LATIN SMALL LETTER R WITH ACUTE
315
return 0x015B; // LATIN SMALL LETTER S WITH ACUTE
317
return 0x00FA; // LATIN SMALL LETTER U WITH ACUTE
319
return 0x00FD; // LATIN SMALL LETTER Y WITH ACUTE
321
return 0x017A; // LATIN SMALL LETTER Z WITH ACUTE
323
// 12/03 non-spacing circumflex accent
325
return 0x00C2; // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
327
return 0x0108; // LATIN CAPITAL LETTER C WITH CIRCUMFLEX
329
return 0x00CA; // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
331
return 0x011C; // LATIN CAPITAL LETTER G WITH CIRCUMFLEX
333
return 0x0124; // LATIN CAPITAL LETTER H WITH CIRCUMFLEX
335
return 0x00CE; // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
337
return 0x0134; // LATIN CAPITAL LETTER J WITH CIRCUMFLEX
339
return 0x00D4; // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
341
return 0x015C; // LATIN CAPITAL LETTER S WITH CIRCUMFLEX
343
return 0x00DB; // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
345
return 0x0174; // LATIN CAPITAL LETTER W WITH CIRCUMFLEX
347
return 0x0176; // LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
349
return 0x00E2; // LATIN SMALL LETTER A WITH CIRCUMFLEX
351
return 0x0109; // LATIN SMALL LETTER C WITH CIRCUMFLEX
353
return 0x00EA; // LATIN SMALL LETTER E WITH CIRCUMFLEX
355
return 0x011D; // LATIN SMALL LETTER G WITH CIRCUMFLEX
357
return 0x0125; // LATIN SMALL LETTER H WITH CIRCUMFLEX
359
return 0x00EE; // LATIN SMALL LETTER I WITH CIRCUMFLEX
361
return 0x0135; // LATIN SMALL LETTER J WITH CIRCUMFLEX
363
return 0x00F4; // LATIN SMALL LETTER O WITH CIRCUMFLEX
365
return 0x015D; // LATIN SMALL LETTER S WITH CIRCUMFLEX
367
return 0x00FB; // LATIN SMALL LETTER U WITH CIRCUMFLEX
369
return 0x0175; // LATIN SMALL LETTER W WITH CIRCUMFLEX
371
return 0x0177; // LATIN SMALL LETTER Y WITH CIRCUMFLEX
373
// 12/04 non-spacing tilde
375
return 0x00C3; // LATIN CAPITAL LETTER A WITH TILDE
377
return 0x0128; // LATIN CAPITAL LETTER I WITH TILDE
379
return 0x00D1; // LATIN CAPITAL LETTER N WITH TILDE
381
return 0x00D5; // LATIN CAPITAL LETTER O WITH TILDE
383
return 0x0168; // LATIN CAPITAL LETTER U WITH TILDE
385
return 0x00E3; // LATIN SMALL LETTER A WITH TILDE
387
return 0x0129; // LATIN SMALL LETTER I WITH TILDE
389
return 0x00F1; // LATIN SMALL LETTER N WITH TILDE
391
return 0x00F5; // LATIN SMALL LETTER O WITH TILDE
393
return 0x0169; // LATIN SMALL LETTER U WITH TILDE
395
// 12/05 non-spacing macron
397
return 0x0100; // LATIN CAPITAL LETTER A WITH MACRON
399
return 0x0112; // LATIN CAPITAL LETTER E WITH MACRON
401
return 0x012A; // LATIN CAPITAL LETTER I WITH MACRON
403
return 0x014C; // LATIN CAPITAL LETTER O WITH MACRON
405
return 0x016A; // LATIN CAPITAL LETTER U WITH MACRON
407
return 0x0101; // LATIN SMALL LETTER A WITH MACRON
409
return 0x0113; // LATIN SMALL LETTER E WITH MACRON
411
return 0x012B; // LATIN SMALL LETTER I WITH MACRON
413
return 0x014D; // LATIN SMALL LETTER O WITH MACRON
415
return 0x016B; // LATIN SMALL LETTER U WITH MACRON
417
// 12/06 non-spacing breve
419
return 0x02D8; // BREVE
421
return 0x0102; // LATIN CAPITAL LETTER A WITH BREVE
423
return 0x011E; // LATIN CAPITAL LETTER G WITH BREVE
425
return 0x016C; // LATIN CAPITAL LETTER U WITH BREVE
427
return 0x0103; // LATIN SMALL LETTER A WITH BREVE
429
return 0x011F; // LATIN SMALL LETTER G WITH BREVE
431
return 0x016D; // LATIN SMALL LETTER U WITH BREVE
433
// 12/07 non-spacing dot above
435
return 0x010A; // LATIN CAPITAL LETTER C WITH DOT ABOVE
437
return 0x0116; // LATIN CAPITAL LETTER E WITH DOT ABOVE
439
return 0x0120; // LATIN CAPITAL LETTER G WITH DOT ABOVE
441
return 0x0130; // LATIN CAPITAL LETTER I WITH DOT ABOVE
443
return 0x017B; // LATIN CAPITAL LETTER Z WITH DOT ABOVE
445
return 0x010B; // LATIN SMALL LETTER C WITH DOT ABOVE
447
return 0x0117; // LATIN SMALL LETTER E WITH DOT ABOVE
449
return 0x0121; // LATIN SMALL LETTER G WITH DOT ABOVE
451
return 0x017C; // LATIN SMALL LETTER Z WITH DOT ABOVE
453
// 12/08 non-spacing diaeresis
455
return 0x00A8; // DIAERESIS
457
return 0x00C4; // LATIN CAPITAL LETTER A WITH DIAERESIS
459
return 0x00CB; // LATIN CAPITAL LETTER E WITH DIAERESIS
461
return 0x00CF; // LATIN CAPITAL LETTER I WITH DIAERESIS
463
return 0x00D6; // LATIN CAPITAL LETTER O WITH DIAERESIS
465
return 0x00DC; // LATIN CAPITAL LETTER U WITH DIAERESIS
467
return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS
469
return 0x00E4; // LATIN SMALL LETTER A WITH DIAERESIS
471
return 0x00EB; // LATIN SMALL LETTER E WITH DIAERESIS
473
return 0x00EF; // LATIN SMALL LETTER I WITH DIAERESIS
475
return 0x00F6; // LATIN SMALL LETTER O WITH DIAERESIS
477
return 0x00FC; // LATIN SMALL LETTER U WITH DIAERESIS
479
return 0x00FF; // LATIN SMALL LETTER Y WITH DIAERESIS
481
// 12/09 (This position shall not be used)
483
// 12/10 non-spacing ring above
485
return 0x02DA; // RING ABOVE
487
return 0x00C5; // LATIN CAPITAL LETTER A WITH RING ABOVE
489
return 0x016E; // LATIN CAPITAL LETTER U WITH RING ABOVE
491
return 0x00E5; // LATIN SMALL LETTER A WITH RING ABOVE
493
return 0x016F; // LATIN SMALL LETTER U WITH RING ABOVE
495
// 12/11 non-spacing cedilla
497
return 0x00B8; // CEDILLA
499
return 0x00C7; // LATIN CAPITAL LETTER C WITH CEDILLA
501
return 0x0122; // LATIN CAPITAL LETTER G WITH CEDILLA
503
return 0x0136; // LATIN CAPITAL LETTER K WITH CEDILLA
505
return 0x013B; // LATIN CAPITAL LETTER L WITH CEDILLA
507
return 0x0145; // LATIN CAPITAL LETTER N WITH CEDILLA
509
return 0x0156; // LATIN CAPITAL LETTER R WITH CEDILLA
511
return 0x015E; // LATIN CAPITAL LETTER S WITH CEDILLA
513
return 0x0162; // LATIN CAPITAL LETTER T WITH CEDILLA
515
return 0x00E7; // LATIN SMALL LETTER C WITH CEDILLA
516
// case 0xCB67: return 0x0123; // small g with cedilla
518
return 0x0137; // LATIN SMALL LETTER K WITH CEDILLA
520
return 0x013C; // LATIN SMALL LETTER L WITH CEDILLA
522
return 0x0146; // LATIN SMALL LETTER N WITH CEDILLA
524
return 0x0157; // LATIN SMALL LETTER R WITH CEDILLA
526
return 0x015F; // LATIN SMALL LETTER S WITH CEDILLA
528
return 0x0163; // LATIN SMALL LETTER T WITH CEDILLA
530
// 12/12 (This position shall not be used)
532
// 12/13 non-spacing double acute accent
534
return 0x0150; // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
536
return 0x0170; // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
538
return 0x0151; // LATIN SMALL LETTER O WITH DOUBLE ACUTE
540
return 0x0171; // LATIN SMALL LETTER U WITH DOUBLE ACUTE
542
// 12/14 non-spacing ogonek
544
return 0x02DB; // OGONEK
546
return 0x0104; // LATIN CAPITAL LETTER A WITH OGONEK
548
return 0x0118; // LATIN CAPITAL LETTER E WITH OGONEK
550
return 0x012E; // LATIN CAPITAL LETTER I WITH OGONEK
552
return 0x0172; // LATIN CAPITAL LETTER U WITH OGONEK
554
return 0x0105; // LATIN SMALL LETTER A WITH OGONEK
556
return 0x0119; // LATIN SMALL LETTER E WITH OGONEK
558
return 0x012F; // LATIN SMALL LETTER I WITH OGONEK
560
return 0x0173; // LATIN SMALL LETTER U WITH OGONEK
562
// 12/15 non-spacing caron
564
return 0x02C7; // CARON
566
return 0x010C; // LATIN CAPITAL LETTER C WITH CARON
568
return 0x010E; // LATIN CAPITAL LETTER D WITH CARON
570
return 0x011A; // LATIN CAPITAL LETTER E WITH CARON
572
return 0x013D; // LATIN CAPITAL LETTER L WITH CARON
574
return 0x0147; // LATIN CAPITAL LETTER N WITH CARON
576
return 0x0158; // LATIN CAPITAL LETTER R WITH CARON
578
return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
580
return 0x0164; // LATIN CAPITAL LETTER T WITH CARON
582
return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
584
return 0x010D; // LATIN SMALL LETTER C WITH CARON
586
return 0x010F; // LATIN SMALL LETTER D WITH CARON
588
return 0x011B; // LATIN SMALL LETTER E WITH CARON
590
return 0x013E; // LATIN SMALL LETTER L WITH CARON
592
return 0x0148; // LATIN SMALL LETTER N WITH CARON
594
return 0x0159; // LATIN SMALL LETTER R WITH CARON
596
return 0x0161; // LATIN SMALL LETTER S WITH CARON
598
return 0x0165; // LATIN SMALL LETTER T WITH CARON
600
return 0x017E; // LATIN SMALL LETTER Z WITH CARON
603
kdDebug() << "Iso6937Converter::getCombiningChar() - no match for " << c << endl;