1
/***************************************************************************
2
* Copyright (C) 2004-2009 by Thomas Fischer *
3
* fischer@unix-ag.uni-kl.de *
5
* This program is free software; you can redistribute it and/or modify *
6
* it under the terms of the GNU General Public License as published by *
7
* the Free Software Foundation; either version 2 of the License, or *
8
* (at your option) any later version. *
10
* This program is distributed in the hope that it will be useful, *
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13
* GNU General Public License for more details. *
15
* You should have received a copy of the GNU General Public License *
16
* along with this program; if not, write to the *
17
* Free Software Foundation, Inc., *
18
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19
***************************************************************************/
21
#include <qapplication.h>
24
#include "encoderlatex.h"
28
EncoderLaTeX *EncoderLaTeX::encoderLaTeX = NULL;
30
/**
 * Table of decomposed (combining) characters and the LaTeX command that
 * produces the same accent. buildCombinedMapping() reads the fields
 * latexCommand and unicode of every entry and builds one regular
 * expression per entry that matches any character followed by that
 * code point.
 *
 * The entries kept in comments below (0x0305 ... 0x0314) only carry the
 * placeholder "x" as command name and are therefore not part of the table.
 * NOTE(review): the active entries and the second struct field are not
 * visible in this excerpt - confirm against the full source.
 */
static struct Decomposition
32
const char *latexCommand;
42
/*{"x", 0x0305}, OVERLINE */
45
/*{"x", 0x0309}, HOOK ABOVE */
49
/*{"x", 0x030d}, VERTICAL LINE ABOVE */
50
/*{"x", 0x030e}, DOUBLE VERTICAL LINE ABOVE */
51
/*{"x", 0x030f}, DOUBLE GRAVE ACCENT */
52
/*{"x", 0x0310}, CANDRABINDU */
53
/*{"x", 0x0311}, INVERTED BREVE */
54
/*{"x", 0x0312}, TURNED COMMA ABOVE */
55
/*{"x", 0x0313}, COMMA ABOVE */
56
/*{"x", 0x0314}, REVERSED COMMA ABOVE */
88
static const int decompositionscount = sizeof( decompositions ) / sizeof( decompositions[ 0 ] ) ;
90
/**
 * Table pairing a LaTeX command name (written without the leading
 * backslash) with the Unicode code point it stands for.
 * buildCharMapping() reads the fields letters and unicode of every row,
 * combines the name with each template in expansionsCmd to obtain the
 * decoding regular expressions, and uses "{\name}" as the encoded form.
 *
 * The rows visible here are Cyrillic command names with code points in the
 * 0x04xx range; the awk one-liner quoted below documents how such rows
 * were generated from t2aenc.dfu.
 * commandmappingdatalatexcount is the number of rows in the table.
 */
static const struct EncoderLaTeXCommandMapping
95
commandmappingdatalatex[] =
112
// awk -F '[{}\\\\]+' '/DeclareUnicodeCharacter/ { print "{\""$4"\", 0x"$3"},"}' /usr/share/texmf-dist/tex/latex/base/t2aenc.dfu | grep '0x04' | sort -r -f
113
{"cyrzhdsc", 0x0497},
114
{"CYRZHDSC", 0x0496},
127
{"cyryhcrs", 0x04B1},
128
{"CYRYHCRS", 0x04B0},
135
{"cyrushrt", 0x045E},
136
{"CYRUSHRT", 0x040E},
151
{"cyrsftsn", 0x044C},
152
{"CYRSFTSN", 0x042C},
155
{"cyrschwa", 0x04D9},
156
{"CYRSCHWA", 0x04D8},
161
{"CYRpalochka", 0x04C0},
182
{"cyrkvcrs", 0x049D},
183
{"CYRKVCRS", 0x049C},
190
{"cyrishrt", 0x0439},
191
{"CYRISHRT", 0x0419},
198
{"cyrhrdsn", 0x044A},
199
{"CYRHRDSN", 0x042A},
206
{"cyrghcrs", 0x0493},
207
{"CYRGHCRS", 0x0492},
226
{"cyrchvcrs", 0x04B9},
227
{"CYRCHVCRS", 0x04B8},
228
{"cyrchrdsc", 0x04B7},
229
{"CYRCHRDSC", 0x04B6},
242
static const int commandmappingdatalatexcount = sizeof( commandmappingdatalatex ) / sizeof( commandmappingdatalatex[ 0 ] ) ;
244
/** Command can be either
247
(3) <space>, line end,
248
(4) \following_command (including \<space>, which must be maintained!),
249
(5) } (end of entry or group)
251
/**
 * Regular expression templates for the spellings of a LaTeX command that
 * decoding recognises; %1 is replaced by the command name in
 * buildCharMapping(). In order: the command wrapped in braces, the command
 * followed by an empty brace pair, the command followed by a newline,
 * carriage return, backslash or closing brace (captured in a group), and
 * the command followed by a whitespace character.
 * For the template with a capture group, buildCharMapping() appends "\1"
 * to the replacement so that the captured follow-up character is kept.
 */
const char *expansionsCmd[] = {"\\{\\\\%1\\}", "\\\\%1\\{\\}", "\\\\%1(\\n|\\r|\\\\|\\})", "\\\\%1\\s"};
252
static const int expansionscmdcount = sizeof( expansionsCmd ) / sizeof( expansionsCmd[0] );
254
static const struct EncoderLaTeXModCharMapping
256
const char *modifier;
258
unsigned int unicode;
260
modcharmappingdatalatex[] =
262
{"\\\\`", "A", 0x00C0},
263
{"\\\\'", "A", 0x00C1},
264
{"\\\\\\^", "A", 0x00C2},
265
{"\\\\~", "A", 0x00C3},
266
{"\\\\\"", "A", 0x00C4},
267
{"\\\\r", "A", 0x00C5},
269
{"\\\\c", "C", 0x00C7},
270
{"\\\\`", "E", 0x00C8},
271
{"\\\\'", "E", 0x00C9},
272
{"\\\\\\^", "E", 0x00CA},
273
{"\\\\\"", "E", 0x00CB},
274
{"\\\\`", "I", 0x00CC},
275
{"\\\\'", "I", 0x00CD},
276
{"\\\\\\^", "I", 0x00CE},
277
{"\\\\\"", "I", 0x00CF},
279
{"\\\\~", "N", 0x00D1},
280
{"\\\\`", "O", 0x00D2},
281
{"\\\\'", "O", 0x00D3},
282
{"\\\\\\^", "O", 0x00D4},
284
{"\\\\\"", "O", 0x00D6},
286
{"\\\\", "O", 0x00D8},
287
{"\\\\`", "U", 0x00D9},
288
{"\\\\'", "U", 0x00DA},
289
{"\\\\\\^", "U", 0x00DB},
290
{"\\\\\"", "U", 0x00DC},
291
{"\\\\'", "Y", 0x00DD},
293
{"\\\\\"", "s", 0x00DF},
294
{"\\\\`", "a", 0x00E0},
295
{"\\\\'", "a", 0x00E1},
296
{"\\\\\\^", "a", 0x00E2},
297
{"\\\\~", "a", 0x00E3},
298
{"\\\\\"", "a", 0x00E4},
299
{"\\\\r", "a", 0x00E5},
301
{"\\\\c", "c", 0x00E7},
302
{"\\\\`", "e", 0x00E8},
303
{"\\\\'", "e", 0x00E9},
304
{"\\\\\\^", "e", 0x00EA},
305
{"\\\\\"", "e", 0x00EB},
306
{"\\\\`", "i", 0x00EC},
307
{"\\\\'", "i", 0x00ED},
308
{"\\\\'", "\\\\i", 0x00ED},
309
{"\\\\\\^", "i", 0x00EE},
312
{"\\\\~", "n", 0x00F1},
313
{"\\\\`", "o", 0x00F2},
314
{"\\\\'", "o", 0x00F3},
315
{"\\\\\\^", "o", 0x00F4},
317
{"\\\\\"", "o", 0x00F6},
319
{"\\\\", "o", 0x00F8},
320
{"\\\\`", "u", 0x00F9},
321
{"\\\\'", "u", 0x00FA},
322
{"\\\\\\^", "u", 0x00FB},
323
{"\\\\\"", "u", 0x00FC},
324
{"\\\\'", "y", 0x00FD},
329
{"\\\\u", "A", 0x0102},
330
{"\\\\u", "a", 0x0103},
333
{"\\\\'", "C", 0x0106},
334
{"\\\\'", "c", 0x0107},
339
{"\\\\v", "C", 0x010C},
340
{"\\\\v", "c", 0x010D},
341
{"\\\\v", "D", 0x010E},
351
{"\\\\c", "E", 0x0118},
352
{"\\\\c", "e", 0x0119},
353
{"\\\\v", "E", 0x011A},
354
{"\\\\v", "e", 0x011B},
357
{"\\\\u", "G", 0x011E},
358
{"\\\\u", "g", 0x011F},
371
{"\\\\u", "I", 0x012C},
372
{"\\\\u", "i", 0x012D},
384
{"\\\\'", "L", 0x0139},
385
{"\\\\'", "l", 0x013A},
394
{"\\\\'", "N", 0x0143},
395
{"\\\\'", "n", 0x0144},
398
{"\\\\v", "N", 0x0147},
399
{"\\\\v", "n", 0x0148},
405
{"\\\\u", "O", 0x014E},
406
{"\\\\u", "o", 0x014F},
407
{"\\\\H", "O", 0x0150},
408
{"\\\\H", "o", 0x0151},
411
{"\\\\'", "R", 0x0154},
412
{"\\\\'", "r", 0x0155},
415
{"\\\\v", "R", 0x0158},
416
{"\\\\v", "r", 0x0159},
417
{"\\\\'", "S", 0x015A},
418
{"\\\\'", "s", 0x015B},
421
{"\\\\c", "S", 0x015E},
422
{"\\\\c", "s", 0x015F},
423
{"\\\\v", "S", 0x0160},
424
{"\\\\v", "s", 0x0161},
427
{"\\\\v", "T", 0x0164},
435
{"\\\\u", "U", 0x016C},
436
{"\\\\u", "u", 0x016D},
437
{"\\\\r", "U", 0x016E},
438
{"\\\\r", "u", 0x016F},
447
{"\\\\\"", "Y", 0x0178},
448
{"\\\\'", "Z", 0x0179},
449
{"\\\\'", "z", 0x017A},
452
{"\\\\v", "Z", 0x017D},
453
{"\\\\v", "z", 0x017E},
456
{"\\\\v", "A", 0x01CD},
457
{"\\\\v", "a", 0x01CE},
458
{"\\\\v", "G", 0x01E6},
459
{"\\\\v", "g", 0x01E7}
462
/**
 * Regular expression templates for modifier/letter pairs written with a
 * separator between them; buildCharMapping() substitutes %1 with the
 * modifier pattern and %2 with the letter. In order: braces around the
 * modifier with the letter in its own braces, braces around the modifier
 * and the letter separated by a space, and the modifier followed by the
 * letter in braces.
 */
const char *expansionsMod1[] = {"\\{%1\\{%2\\}\\}", "\\{%1 %2\\}", "%1\\{%2\\}"};
463
static const int expansionsmod1count = sizeof( expansionsMod1 ) / sizeof( expansionsMod1[0] );
464
/**
 * Regular expression templates for modifier/letter pairs written without
 * any separator; buildCharMapping() substitutes %1 with the modifier
 * pattern and %2 with the letter. In order: the pair wrapped in braces,
 * the pair followed by an empty brace pair, and the bare pair.
 * buildCharMapping() applies these templates when the modifier pattern
 * does not end in a letter.
 */
const char *expansionsMod2[] = {"\\{%1%2\\}", "%1%2\\{\\}", "%1%2"};
465
static const int expansionsmod2count = sizeof( expansionsMod2 ) / sizeof( expansionsMod2[0] );
467
static const int modcharmappingdatalatexcount = sizeof( modcharmappingdatalatex ) / sizeof( modcharmappingdatalatex[ 0 ] ) ;
469
/**
 * Table of direct character mappings. buildCharMapping() copies the three
 * fields of every row (regexp, unicode, latex) into a CharMappingItem:
 * the first column is a regular expression matched while decoding, the
 * second the Unicode code point, the third the literal LaTeX text written
 * while encoding. Regular expression metacharacters in the first column
 * are escaped or put in a character class (see the "?" row).
 * charmappingdatalatexcount is the number of rows in the table.
 */
static const struct EncoderLaTeXCharMapping
472
unsigned int unicode;
475
charmappingdatalatex[] =
477
{"\\\\#", 0x0023, "\\#"},
478
{"\\\\&", 0x0026, "\\&"},
479
{"\\\\_", 0x005F, "\\_"},
480
{"!`", 0x00A1, "!`"},
481
{"\"<", 0x00AB, "\"<"},
482
{"\">", 0x00BB, "\">"},
483
{"[?]`", 0x00BF, "?`"},
487
static const int charmappingdatalatexcount = sizeof( charmappingdatalatex ) / sizeof( charmappingdatalatex[ 0 ] ) ;
489
/**
 * Constructor. The visible body fills m_combinedMapping by calling
 * buildCombinedMapping().
 * NOTE(review): further initialisation lines are not visible in this
 * excerpt; check whether buildCharMapping() is invoked here as well.
 */
EncoderLaTeX::EncoderLaTeX()
492
buildCombinedMapping();
495
EncoderLaTeX::~EncoderLaTeX()
500
/**
 * Translates LaTeX notation in @p text into Unicode characters.
 *
 * Visible stages: http/ftp URLs are located with httpRegExp so that the
 * originals can be put back at the end; decomposedUTF8toLaTeX() is
 * applied; the text is split at '$' to keep math regions apart; the
 * non-math pieces are joined with splitMarker and every m_charMapping
 * entry replaces matches of its regExp with its unicode string; the
 * result is split again at splitMarker and reassembled, re-inserting the
 * '$'-delimited pieces from the unmodified list.
 * qApp->processEvents() is called between the stages.
 *
 * NOTE(review): several statements (loop headers, the urls container, the
 * return) are not visible in this excerpt; presumably the reassembled
 * string is returned - confirm against the full source.
 */
QString EncoderLaTeX::decode( const QString & text )
502
const QString splitMarker = "|KBIBTEX|";
503
QString result = text;
505
/** Collect (all?) urls from the BibTeX file and store them in urls */
506
/** Problem is that the replace function below will replace
507
* character sequences in the URL rendering the URL invalid.
508
* Later, all URLs will be replaced back to their original
509
* in the hope nothing breaks ... */
511
QRegExp httpRegExp( "(ht|f)tp://[^\"} ]+" );
512
httpRegExp.setMinimal( false );
516
pos = httpRegExp.search( result, pos );
520
QString url = httpRegExp.cap( 0 );
525
/** rewrite character + combining mark pairs as LaTeX commands before the mapping pass */
decomposedUTF8toLaTeX( result );
527
/** split text into math and non-math regions */
528
QStringList intermediate = QStringList::split( '$', result, true );
529
QStringList::Iterator it = intermediate.begin();
530
while ( it != intermediate.end() )
533
* Sometimes we split strings like "\$", which is not intended.
534
* So, we have to manually fix things by checking for strings
535
* ending with "\" and append both the removed dollar sign and
536
* the following string (which was never supposed to be an
537
* independent string). Finally, we remove the unnecessary
538
* string and continue.
540
if (( *it ).endsWith( "\\" ) )
542
QStringList::Iterator cur = it;
544
( *cur ).append( '$' ).append( *it );
545
intermediate.remove( it );
552
qApp->processEvents();
555
for ( QStringList::Iterator it = intermediate.begin(); it != intermediate.end(); ++it )
557
if ( !result.isEmpty() ) result.append( splitMarker );
558
result.append( *it );
561
if ( it == intermediate.end() )
564
if (( *it ).length() > 256 )
565
qDebug( "Very long math equation using $ found, maybe due to broken inline math: %s", ( *it ).left( 48 ).latin1() );
568
qApp->processEvents();
570
for ( QValueList<CharMappingItem>::ConstIterator cmit = m_charMapping.begin(); cmit != m_charMapping.end(); ++cmit )
571
result.replace(( *cmit ).regExp, ( *cmit ).unicode );
573
qApp->processEvents();
575
QStringList transformed = QStringList::split( splitMarker, result, true );
577
qApp->processEvents();
580
for ( QStringList::Iterator itt = transformed.begin(), iti = intermediate.begin(); itt != transformed.end() && iti != intermediate.end(); ++itt, ++iti )
582
result.append( *itt );
585
if ( iti == intermediate.end() )
588
result.append( "$" ).append( *iti ).append( "$" );
591
qApp->processEvents();
593
/** Reinserting original URLs as explained above */
598
pos = httpRegExp.search( result, pos );
602
int len = httpRegExp.cap( 0 ).length();
603
result = result.left( pos - 1 ).append( urls[idx++] ).append( result.mid( pos + len - 1 ) );
610
QString EncoderLaTeX::encode( const QString & text )
612
const QString splitMarker = "|KBIBTEX|";
613
QString result = text;
615
/** Collect (all?) urls from the BibTeX file and store them in urls */
616
/** Problem is that the replace function below will replace
617
* character sequences in the URL rendering the URL invalid.
618
* Later, all URLs will be replaced back to their original
619
* in the hope nothing breaks ... */
621
QRegExp httpRegExp( "(ht|f)tp://[^\"} ]+" );
622
httpRegExp.setMinimal( false );
626
pos = httpRegExp.search( result, pos );
630
QString url = httpRegExp.cap( 0 );
635
/** split text into math and non-math regions */
636
QStringList intermediate = QStringList::split( '$', result, true );
637
QStringList::Iterator it = intermediate.begin();
638
while ( it != intermediate.end() )
641
* Sometimes we split strings like "\$", which is not intended.
642
* So, we have to manually fix things by checking for strings
643
* ending with "\" and append both the removed dollar sign and
644
* the following string (which was never supposed to be an
645
* independent string). Finally, we remove the unnecessary
646
* string and continue.
648
if (( *it ).endsWith( "\\" ) )
650
QStringList::Iterator cur = it;
652
( *cur ).append( '$' ).append( *it );
653
intermediate.remove( it );
660
qApp->processEvents();
663
for ( QStringList::Iterator it = intermediate.begin(); it != intermediate.end(); ++it )
665
if ( !result.isEmpty() ) result.append( splitMarker );
666
result.append( *it );
669
if ( it == intermediate.end() )
672
if (( *it ).length() > 256 )
673
qDebug( "Very long math equation using $ found, maybe due to broken inline math: %s", ( *it ).left( 48 ).latin1() );
676
qApp->processEvents();
678
for ( QValueList<CharMappingItem>::ConstIterator cmit = m_charMapping.begin(); cmit != m_charMapping.end(); ++cmit )
679
result.replace(( *cmit ).unicode, ( *cmit ).latex );
681
qApp->processEvents();
683
QStringList transformed = QStringList::split( splitMarker, result, true );
685
qApp->processEvents();
688
for ( QStringList::Iterator itt = transformed.begin(), iti = intermediate.begin(); itt != transformed.end() && iti != intermediate.end(); ++itt, ++iti )
690
result.append( *itt );
693
if ( iti == intermediate.end() )
696
result.append( "$" ).append( *iti ).append( "$" );
699
qApp->processEvents();
701
/** \url accepts unquoted & and _
702
May introduce new problems though */
703
if ( result.contains( "\\url{" ) )
704
result.replace( "\\&", "&" ).replace( "\\_", "_" ).replace( QChar( 0x2013 ), "--" ).replace( "\\#", "#" );
706
decomposedUTF8toLaTeX( result );
708
/** Reinserting original URLs as explained above */
713
pos = httpRegExp.search( result, pos );
717
int len = httpRegExp.cap( 0 ).length();
718
result = result.left( pos - 1 ).append( urls[idx++] ).append( result.mid( pos + len - 1 ) );
725
/**
 * Encodes a single character only: walks m_charMapping and, for every
 * entry whose unicode string equals @p replace, substitutes that string
 * in a copy of @p text with the LaTeX form stored in the entry. All other
 * characters are left as they are.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the modified copy is returned.
 */
QString EncoderLaTeX::encode( const QString &text, const QChar &replace )
727
QString result = text;
728
for ( QValueList<CharMappingItem>::ConstIterator it = m_charMapping.begin(); it != m_charMapping.end(); ++it )
729
if (( *it ).unicode == replace )
730
result.replace(( *it ).unicode, ( *it ).latex );
734
/**
 * Encodes @p text with encode() and then applies corrections that depend
 * on @p fieldType. Visible cases:
 *  - EntryField::ftPages: U+2013 (en dash) is written as "--".
 *  - EntryField::ftURL: the escaping of "&", "_" and "#" is undone and
 *    U+2013 is written as "--".
 * NOTE(review): the switch header, the break statements, further cases and
 * the return are not visible in this excerpt.
 */
QString EncoderLaTeX::encodeSpecialized( const QString & text, const EntryField::FieldType fieldType )
736
QString result = encode( text );
740
case EntryField::ftPages:
741
result.replace( QChar( 0x2013 ), "--" );
744
case EntryField::ftURL:
745
result.replace( "\\&", "&" ).replace( "\\_", "_" ).replace( QChar( 0x2013 ), "--" ).replace( "\\#", "#" );
755
/**
 * Rewrites decomposed characters in @p text (a base character followed by
 * a combining code point) as LaTeX commands. For every entry of
 * m_combinedMapping the entry's regExp is searched in the text; the
 * two-character match at position i is replaced by a backslash, the
 * entry's latex command and the captured base character in braces, and
 * the search continues from i + 1.
 * @p text is modified in place.
 * NOTE(review): the inner loop header and the return are not visible in
 * this excerpt; presumably a reference to @p text is returned.
 */
QString& EncoderLaTeX::decomposedUTF8toLaTeX( QString &text )
757
for ( QValueList<CombinedMappingItem>::Iterator it = m_combinedMapping.begin(); it != m_combinedMapping.end(); ++it )
759
int i = ( *it ).regExp.search( text );
762
QString a = ( *it ).regExp.cap( 1 );
763
text = text.left( i ) + "\\" + ( *it ).latex + "{" + a + "}" + text.mid( i + 2 );
764
i = ( *it ).regExp.search( text, i + 1 );
771
/**
 * Fills m_combinedMapping from the decompositions table: each table entry
 * becomes a CombinedMappingItem whose regExp matches any single character
 * (captured as group 1) followed by the entry's code point, and whose
 * latex member is the entry's command name.
 */
void EncoderLaTeX::buildCombinedMapping()
773
for ( int i = 0; i < decompositionscount; i++ )
775
CombinedMappingItem item;
776
item.regExp = QRegExp( "(.)" + QString( QChar( decompositions[i].unicode ) ) );
777
item.latex = decompositions[i].latexCommand;
778
m_combinedMapping.append( item );
782
/**
 * Fills m_charMapping, the list of (regExp, unicode, latex) items used by
 * decode() and encode(), from three static tables:
 *  1. charmappingdatalatex: one item per row, fields copied directly.
 *  2. commandmappingdatalatex: one item per row and per template in
 *     expansionsCmd; the encoded form is the command name in braces with
 *     a leading backslash.
 *  3. modcharmappingdatalatex: one item per row and per template in
 *     expansionsMod2 and expansionsMod1; the plain modifier used for the
 *     encoded form is derived from the modifier pattern by removing the
 *     regular expression escaping of "^" and of the backslash.
 * Items are appended in exactly this order.
 */
void EncoderLaTeX::buildCharMapping()
784
/** encoding and decoding for digraphs such as -- or ?` */
785
for ( int i = 0; i < charmappingdatalatexcount; i++ )
787
CharMappingItem charMappingItem;
788
charMappingItem.regExp = QRegExp( charmappingdatalatex[ i ].regexp );
789
charMappingItem.unicode = QChar( charmappingdatalatex[ i ].unicode );
790
charMappingItem.latex = QString( charmappingdatalatex[ i ].latex );
791
m_charMapping.append( charMappingItem );
794
/** encoding and decoding for commands such as \AA or \ss */
795
for ( int i = 0; i < commandmappingdatalatexcount; ++i )
797
/** different types of writing such as {\AA} or \AA{} possible */
798
for ( int j = 0; j < expansionscmdcount; ++j )
800
CharMappingItem charMappingItem;
801
charMappingItem.regExp = QRegExp( QString( expansionsCmd[j] ).arg( commandmappingdatalatex[i].letters ) );
802
charMappingItem.unicode = QChar( commandmappingdatalatex[i].unicode );
/** templates with a capture group also consume the character after the command; "\1" puts it back */
803
if ( charMappingItem.regExp.numCaptures() > 0 )
804
charMappingItem.unicode += QString( "\\1" );
805
charMappingItem.latex = QString( "{\\%1}" ).arg( commandmappingdatalatex[i].letters );
806
m_charMapping.append( charMappingItem );
810
/** encoding and decoding for letters such as \"a */
811
for ( int i = 0; i < modcharmappingdatalatexcount; ++i )
813
QString modifierRegExp = QString( modcharmappingdatalatex[i].modifier );
814
QString modifier = modifierRegExp;
815
modifier.replace( "\\^", "^" ).replace( "\\\\", "\\" );
817
/** first batch of replacement rules, where no separator is required between modifier and character (e.g. \"a) */
818
if ( !modifierRegExp.at( modifierRegExp.length() - 1 ).isLetter() )
819
for ( int j = 0; j < expansionsmod2count; ++j )
821
CharMappingItem charMappingItem;
822
charMappingItem.regExp = QRegExp( QString( expansionsMod2[j] ).arg( modifierRegExp ).arg( modcharmappingdatalatex[i].letter ) );
823
charMappingItem.unicode = QChar( modcharmappingdatalatex[i].unicode );
824
charMappingItem.latex = QString( "{%1%2}" ).arg( modifier ).arg( modcharmappingdatalatex[i].letter );
825
m_charMapping.append( charMappingItem );
828
/** second batch of replacement rules, where a separator is required between modifier and character (e.g. \v{g}) */
829
for ( int j = 0; j < expansionsmod1count; ++j )
831
CharMappingItem charMappingItem;
832
charMappingItem.regExp = QRegExp( QString( expansionsMod1[j] ).arg( modifierRegExp ).arg( modcharmappingdatalatex[i].letter ) );
833
charMappingItem.unicode = QChar( modcharmappingdatalatex[i].unicode );
834
charMappingItem.latex = QString( "%1{%2}" ).arg( modifier ).arg( modcharmappingdatalatex[i].letter );
835
m_charMapping.append( charMappingItem );
840
/**
 * Accessor for the shared EncoderLaTeX instance held in the static member
 * encoderLaTeX; the instance is created on first use, when the pointer is
 * still NULL.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably encoderLaTeX is returned.
 */
EncoderLaTeX* EncoderLaTeX::currentEncoderLaTeX()
842
if ( encoderLaTeX == NULL )
843
encoderLaTeX = new EncoderLaTeX();
848
/**
 * Counterpart of currentEncoderLaTeX(): acts only when the static member
 * encoderLaTeX is not NULL.
 * NOTE(review): the guarded statements are not visible in this excerpt;
 * presumably the instance is deleted and the pointer reset to NULL.
 */
void EncoderLaTeX::deleteCurrentEncoderLaTeX()
850
if ( encoderLaTeX != NULL )
857
char EncoderLaTeX::unicodeToASCII( unsigned int unicode )
859
if ( unicode < 128 ) return ( char )unicode;
860
for ( int i = 0; i < modcharmappingdatalatexcount; ++i )
861
if ( modcharmappingdatalatex[i].unicode == unicode )
862
return *modcharmappingdatalatex[i].letter;