2
/******************************************************************************
3
* MODULE : parsetex.cpp
4
* DESCRIPTION: conversion of tex/latex strings into logical tex/latex trees
5
* COPYRIGHT : (C) 1999 Joris van der Hoeven
6
*******************************************************************************
7
* This software falls under the GNU general public license and comes WITHOUT
8
* ANY WARRANTY WHATSOEVER. See the file $TEXMACS_PATH/LICENSE for more details.
9
* If you don't have this file, write to the Free Software Foundation, Inc.,
10
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
11
******************************************************************************/
13
#include "Tex/convert_tex.hpp"
15
/******************************************************************************
16
* The latex_parser structure
17
*******************************************************************************
19
* During the parsing, the following global variables are used:
21
* command_type Contains the types of all currently defined tex commands.
22
* This is either 'command' 'modifier' 'operator'
23
* 'environment' 'list' 'symbol' 'big-symbol' or 'user'.
24
* command_arity Contains the corresponding arity.
25
* command_def Contains the definitions of user commands.
27
* The command_type hashmap also contains some special fields
29
* \<sub> Stands for the subscript command
30
* \<sup> Stands for the superscript command
32
* !mode Gives the current mode ("text" or "math").
33
* !verbatim Verbatim mode ("true" or "false")
34
* !em Emphasized mode ("true" or "false")
36
*******************************************************************************
37
* WARNING: we recently put the standard LaTeX macros in latex_type and
38
* latex_arity instead of command_type and command_arity.
39
******************************************************************************/
43
// Reports a parse problem on cerr, together with an excerpt of the input s.
void latex_error (string s, int i, string message);
45
// Main routine: parses s starting at position i until the character `stop`
// (or the end of s) is reached; i is advanced past the consumed text.
tree parse (string s, int& i, char stop= '\0', bool change= false);
46
// Parses a construct introduced by a backslash and dispatches on its name.
tree parse_backslash (string s, int& i);
47
// Parses a single symbol (one character or one backslash-name).
tree parse_symbol (string s, int& i);
48
// Parses the command `which` and collects its arguments into a tuple.
tree parse_command (string s, int& i, string which);
49
// Parses a command for which no type is known in the command tables.
tree parse_unknown (string s, int& i, string which);
50
// Scans verbatim text up to the terminator string `end`.
tree parse_verbatim (string s, int& i, string end);
53
/******************************************************************************
55
******************************************************************************/
58
// Prints `message` followed by an excerpt of the input `s` on cerr; both
// lines are prefixed with "Latex error] ".  `i` is the offending position.
latex_parser::latex_error (string s, int i, string message) {
59
cerr << "Latex error] " << message << "\n";
60
// When the error lies beyond position 30, drop the text before i-27 and
// mark the cut with a leading "...".
if (i>30) s= "..." * s (i-27, N(s));
61
// Limit the excerpt to 60 characters, marking a cut with a trailing "...".
if (N(s)>60) s= s (0, 57) * "...";
62
cerr << "Latex error] in " << s << "\n";
65
/******************************************************************************
66
* Main parsing routine
67
******************************************************************************/
70
// Main parsing routine.  Reads s from position i until the character `stop`
// (or the end of s) and appends the parsed items to the tree t; i is advanced
// past everything that was consumed.
//   s      -- the tex/latex source
//   i      -- current position in s (in/out)
//   stop   -- character which terminates this parsing level
//   change -- NOTE(review): presumably selects between merge() and shorten()
//             of the command tables at the end; the test itself is not
//             visible in this listing -- confirm.
// Returns "" when nothing was parsed and the item itself when exactly one
// item was parsed (see the last two statements).
latex_parser::parse (string s, int& i, char stop, bool change) {
74
// Open a new level in the three command tables for this parsing level.
command_type ->extend ();
75
command_arity->extend ();
76
command_def ->extend ();
78
// Skip leading white space.
while ((i<n) && ((s[i]==' ') || (s[i]=='\t') || (s[i]=='\n'))) i++;
79
// Main loop: one lexical item per iteration until the stop character.
while ((i<n) && (s[i]!=stop)) {
84
while ((i<n) && ((s[i]==' ') || (s[i]=='~') || (s[i]=='\t'))) i++;
85
if ((i<n) && (s[i]!='\n')) t << " ";
89
// Skip white space while counting the newlines in ln.
while ((i<n) && ((s[i]==' ') || (s[i]=='\t') || (s[i]=='\n')))
90
if (s[i++]=='\n') ln++;
92
if (ln == 1) t << " ";
98
// Text between the "Start TeXmacs macros" and "End TeXmacs macros" marker
// lines is scanned over here (the loop body is not visible in this listing).
if (test (s, i, "%%%%%%%%%% Start TeXmacs macros\n")) {
99
while ((i<n) && (!test (s, i, "%%%%%%%%%% End TeXmacs macros\n")))
104
// Advance to the end of the current line.
while ((i<n) && (s[i]!='\n')) i++;
107
while ((i<n) && ((s[i]==' ') || (s[i]=='\t') || (s[i]=='\n')))
108
if (s[i++]=='\n') ln++;
110
if ((N(t)>0) && ((t[N(t)-1]==" ") || (t[N(t)-1]=="\n")))
119
if (is_numeric (s[i])) {
123
else t << s (i-1, i);
126
// "\over": the previous item and the argument are combined into a \frac.
if (((i+7)<n) && (s(i,i+5)=="\\over") && (!is_alpha (s (i+5, i+7)))) {
128
tree arg= parse_command (s, i, "\\over");
130
((t[N(t)-1] == " ") || (t[N(t)-1] == tree (TUPLE, "\\ "))))
132
if (is_tuple (arg, "\\over", 1) && (N(t)>=1))
133
t[N(t)-1]= tree (TUPLE, "\\frac", t[N(t)-1], arg[1]);
135
// "\sp" and "\sb" are parsed as the superscript and subscript commands.
else if (((i+5)<n) && (s(i,i+3)=="\\sp") && (!is_alpha(s[i+3]))) {
137
t << parse_command (s, i, "\\<sup>");
139
else if (((i+5)<n) && (s(i,i+3)=="\\sb") && (!is_alpha(s[i+3]))) {
141
t << parse_command (s, i, "\\<sub>");
143
// "\pmatrix" is rewritten as a begin-pmatrix ... end-pmatrix pair.
else if (((i+10)<n) && (s(i,i+8)=="\\pmatrix")) {
145
tree arg= parse_command (s, i, "\\pmatrix");
146
if (is_tuple (arg, "\\pmatrix", 1)) arg= arg[1];
147
t << tree (TUPLE, "\\begin-pmatrix");
148
if (is_concat (arg)) t << A (arg);
150
t << tree (TUPLE, "\\end-pmatrix");
153
tree u= parse_backslash (s, i);
159
// In math mode a run of apostrophes becomes a single \prime tuple.
if (command_type ["!mode"] == "math") {
161
while ((i < N(s)) && (s[i] == '\'')) i++;
162
t << tuple ("\\prime", s (start, i));
164
else t << s (i-1, i);
167
if (command_type ["!mode"] == "math") t << tree (TUPLE, "\\ast");
173
t << parse_command (s, i, "\\<sub>");
175
if (command_type ["!mode"] == "math")
176
t << parse_command (s, i, "\\<sub>");
177
else t << s (i-1, i);
182
t << parse_command (s, i, "\\<sup>");
184
if (command_type ["!mode"] == "math")
185
t << parse_command (s, i, "\\<sup>");
186
else t << s (i-1, i);
190
t << tree (TUPLE, "\\<less>");
194
t << tree (TUPLE, "\\<gtr>");
199
// Verbatim text terminated by the character "\244".
t << parse_verbatim (s, i, "\244");
203
// Nested group: parse recursively up to the closing brace.
t << parse (s, i, '}');
207
if ((i<n) && (s[i]!=' ') && (s[i]!='\t') && (s[i]!='\n')) break;
208
while ((i<n) && ((s[i]==' ') || (s[i]=='\t') || (s[i]=='\n')))
209
if (s[i++]=='\n') ln++;
210
// Two or more newlines yield "\n"; otherwise an explicit space tuple.
if (ln >= 2) t << "\n"; else t << tree (TUPLE, "\\ ");
216
// Fixed: this test used the bitwise '&', which evaluates both operands and
// therefore read s[i] even when i == n.  The short-circuit '&&' only looks
// at s[i] when i is a valid position.
if ((i<n) && (s[i]=='$')) {
219
t << tree (TUPLE, "\\begin-displaymath");
221
else t << tree (TUPLE, "\\begin-math");
222
// The formula is parsed in math mode; text mode is restored afterwards.
command_type ("!mode")= "math";
223
t << parse (s, i, '$');
224
command_type ("!mode")= "text";
225
if ((i<n) && (s[i]=='$')) i++;
227
if ((i<n) && (s[i]=='$')) i++;
228
t << tree (TUPLE, "\\end-displaymath");
230
else t << tree (TUPLE, "\\end-math");
241
// Close the table level opened at the start of this routine.
command_type ->merge ();
242
command_arity->merge ();
243
command_def ->merge ();
246
command_type ->shorten ();
247
command_arity->shorten ();
248
command_def ->shorten ();
251
// Simplify the result: nothing parsed gives "", one item gives that item.
if (N(t)==0) return "";
252
if (N(t)==1) return t[0];
256
/******************************************************************************
258
******************************************************************************/
261
// Parses a construct which starts with a backslash at position i of s.
// Verbatim constructs are passed to parse_verbatim; everything else ends up
// in parse_command with the name of the command that was recognized.
latex_parser::parse_backslash (string s, int& i) {
263
// "\verb": the character just before i is used as the terminator.
// NOTE(review): i is presumably advanced past "\verb" and the delimiter in a
// line not visible in this listing -- confirm.
if (((i+7)<n) && (s(i,i+5)=="\\verb")) {
265
return parse_verbatim (s, i, s(i-1,i));
267
// "\begin{verbatim}": verbatim text up to "\end{verbatim}".
if (((i+29)<n) && (s(i,i+16)=="\\begin{verbatim}")) {
269
return parse_verbatim (s, i, "\\end{verbatim}");
272
/************************ special commands *********************************/
277
return tree (TUPLE, "\\ ");
279
// Backslash followed by a non-letter: the brackets ( ) [ ] open and close
// math and displaymath; any other character gives the two-character command.
if (!is_alpha(s[i])) {
281
if (s[i-1]=='(') return parse_command (s, i, "\\begin-math");
282
if (s[i-1]==')') return parse_command (s, i, "\\end-math");
283
if (s[i-1]=='[') return parse_command (s, i, "\\begin-displaymath");
284
if (s[i-1]==']') return parse_command (s, i, "\\end-displaymath");
285
return parse_command (s, i, s (i-2, i));
288
/************************* normal commands *********************************/
290
// The command name is a run of letters with an optional trailing '*'.
while ((i<n) && is_alpha (s[i])) i++;
291
if ((i<n) && (s[i]=='*')) i++;
292
string r= s (start, i);
293
// For \begin and \end the environment name between braces is appended to
// the command name, separated by "-".
if ((r == "\\begin") || (r == "\\end")) {
294
while ((i<n) && ((s[i]==' ') || (s[i]=='\t') || (s[i]=='\n'))) i++;
295
if ((i==n) || (s[i]!='{')) {
296
latex_error (s, i, "begin or end which environment ?");
300
while ((i<n) && (s[i]!='}')) i++;
301
r = r * "-" * s (start, i);
304
return parse_command (s, i, r);
308
// Substitutes the parameters #1 ... #9 in the string s by the corresponding
// entries of args, converted to strings.
// NOTE(review): what is appended for other characters, and for a parameter
// whose number is not below N(args), is not visible in this listing.
sharp_to_arg (string s, tree args) {
311
for (i=0; i<N(s); i++)
312
// A '#' followed by a digit between '1' and '9' names a parameter.
if ((s[i]=='#') && ((i+1)<N(s)) && (s[i+1]>='1') && (s[i+1]<='9')) {
313
// Convert the digit to its numeric value; this also skips the digit.
int nr= ((int) s[++i]) - ((int) '0');
314
if (N(args)>nr) r << as_string (args[nr]);
321
// Parses one symbol at position i of s and advances i past it.
latex_parser::parse_symbol (string s, int& i) {
323
// '*' in math mode and the characters '<' and '>' are returned as tuples.
if ((s[i] == '*') && (command_type ["!mode"] == "math")) {
324
i++; return tree (TUPLE, "\\ast"); }
325
if (s[i] == '<') { i++; return tree (TUPLE, "\\<less>"); }
326
if (s[i] == '>') { i++; return tree (TUPLE, "\\<gtr>"); }
327
// Any other character which is not a backslash is returned as a string.
if (s[i] != '\\') { i++; return s(start, i); }
329
// A backslash at the very end of the input.
if (i == N(s)) return tree (TUPLE, "\\backslash");
330
// Backslash followed by a non-letter: return s(start, i) after that character.
if (!is_alpha (s[i])) { i++; return s(start, i); }
331
// Otherwise read a run of letters with an optional trailing '*'.
while ((i<N(s)) && is_alpha (s[i])) i++;
332
if ((i<N(s)) && (s[i]=='*')) i++;
337
// Parses the arguments of the command cmd, starting at position i of s.
// The arguments are collected in the tuple t; definitions of new commands
// and environments are stored in command_type, command_arity and command_def.
latex_parser::parse_command (string s, int& i, string cmd) {
339
// NOTE(review): this looks like a debugging trace on cout.  The lines around
// it are not visible in this listing, so it may well be commented out in the
// original file -- confirm.
cout << cmd << " [" << command_type [cmd] << ", "
340
<< command_type ["!mode"] << "]" << LF;
342
// Several commands are treated as aliases of another command.
if (cmd == "\\newcommand") cmd= "\\def";
343
if (cmd == "\\renewcommand") cmd= "\\def";
344
if (cmd == "\\renewenvironment") cmd= "\\newenvironment";
345
if (cmd == "\\begin-split") cmd= "\\begin-eqsplit";
346
if (cmd == "\\end-split") cmd= "\\end-eqsplit";
347
if (cmd == "\\begin-split*") cmd= "\\begin-eqsplit*";
348
if (cmd == "\\end-split*") cmd= "\\end-eqsplit*";
349
// Commands known neither to command_type nor to latex_type are handled by
// parse_unknown.
if ((!command_type->contains (cmd)) &&
350
(latex_type [cmd] == "undefined"))
351
return parse_unknown (s, i, cmd);
353
// Math environments switch to math mode on \begin and to text mode otherwise.
if (latex_type [cmd] == "math-environment") {
354
if (cmd (0, 6) == "\\begin") command_type ("!mode") = "math";
355
else command_type ("!mode") = "text";
359
((cmd == "\\text") || (cmd == "\\mbox")) &&
360
(command_type ["!mode"] == "math");
361
// \text and \mbox inside math: their argument is parsed in text mode and
// math mode is restored at the end of this routine.
if (mbox_flag) command_type ("!mode") = "text";
365
(latex_type [cmd]=="undefined")? command_arity (cmd): latex_arity [cmd];
366
// A negative arity encodes an optional argument: -1-arity is the number of
// mandatory arguments.
bool option= (arity<0);
367
if (option) arity= -1-arity;
369
/************************ retrieve arguments *******************************/
370
tree t (TUPLE, copy (cmd)); // parsed arguments
371
tree u (TUPLE, copy (cmd)); // unparsed arguments
372
while ((i<n) && ((arity>0) || option)) {
374
// j looks ahead past white space without consuming it.
while ((j<n) && ((s[j]==' ') || (s[j]=='\t') || (s[j]=='\n'))) j++;
376
// NOTE(review): j may equal n after the loop above; no bounds check before
// the reads of s[j] below is visible in this listing -- confirm.
// Optional argument between brackets; the label of the tuple gets a "*".
if (option && (s[j]=='[')) {
379
t << parse (s, i, ']');
382
t[0]->label= t[0]->label * "*";
385
// Mandatory argument between braces.
else if ((arity>0) && (s[j]=='{')) {
388
// The first argument of \def is scanned up to '}' instead of being parsed.
if ((N(t)==1) && (cmd == "\\def")) {
389
while ((i<n) && (s[i]!='}')) i++;
392
else t << parse (s, i, '}');
397
// TeX style parameter list #1#2... after \def; also marked with "*".
else if (option && (s[j]=='#') && (cmd == "\\def")) {
398
while ((j+3 <= n) && is_numeric (s[j+1]) && (s[j+2] == '#')) j+=2;
404
t[0]->label= t[0]->label * "*";
410
tree st= parse_symbol (s, i);
418
if (arity>0) latex_error (s, i, "too little arguments for " * cmd);
420
/******************** new commands and environments ************************/
421
// \def without parameters: a user command of arity 0.
if (is_tuple (t, "\\def", 2)) {
422
string var= as_string (t[1]);
423
command_type (var)= "user";
424
command_arity (var)= 0;
425
command_def (var)= as_string (u[2]);
427
// \def with parameters: the arity is taken from t[2].
if (is_tuple (t, "\\def*", 3)) {
428
string var= as_string (t[1]);
429
command_type (var)= "user";
430
command_arity (var)= as_int (t[2]);
431
command_def (var)= as_string (u[3]);
433
// New environment: two user commands "\begin-<name>" and "\end-<name>".
if (is_tuple (t, "\\newenvironment", 3)) {
434
string var= "\\begin-" * as_string (t[1]);
435
command_type (var)= "user";
436
command_arity (var)= 0;
437
command_def (var)= as_string (u[2]);
438
var= "\\end-" * as_string (t[1]);
439
command_type (var)= "user";
440
command_arity (var)= 0;
441
command_def (var)= as_string (u[3]);
443
// New environment with parameters: only the \begin part receives an arity.
if (is_tuple (t, "\\newenvironment*", 4)) {
444
string var= "\\begin-" * as_string (t[1]);
445
command_type (var)= "user";
446
command_arity (var)= as_int (t[2]);
447
command_def (var)= as_string (u[3]);
448
var= "\\end-" * as_string (t[1]);
449
command_type (var)= "user";
450
command_arity (var)= 0;
451
command_def (var)= as_string (u[4]);
454
/***************** environment changes for user commands ******************/
455
// The definition of a user command is parsed with change=true after
// substitution of its arguments; the resulting tree itself is discarded.
if (command_type[cmd] == "user") {
457
(void) parse (sharp_to_arg (command_def[cmd], u), pos, '\0', true);
458
// t= parse (sharp_to_arg (command_def[cmd], u), pos, '\0', true);
459
// variant if you want to replace macros by their definitions
462
if (mbox_flag) command_type ("!mode") = "math";
467
// Parses a command which is not present in the command tables.  Since no
// arity is known, bracketed and braced arguments are collected as they are
// encountered after the command.
latex_parser::parse_unknown (string s, int& i, string cmd) {
471
tree t (TUPLE, copy (cmd));
474
// j looks ahead past white space without consuming it.
while ((j<n) && ((s[j]==' ') || (s[j]=='\t') || (s[j]=='\n'))) j++;
476
// Optional argument between brackets; the label of the tuple gets a "*".
if (option && (s[j]=='[')) {
479
t << parse (s, i, ']');
481
t[0]->label= t[0]->label * "*";
484
// Argument between braces.
else if (s[j]=='{') {
487
t << parse (s, i, '}');
495
/******************************************************************************
496
* Parsing verbatim text
497
******************************************************************************/
500
// Scans s from position i for the terminator string `end`.  The visible
// result is bracketed by a \begin-verbatim and an \end-verbatim tuple.
latex_parser::parse_verbatim (string s, int& i, string end) {
501
int start=i, n= N(s), e= N(end);
502
// Advance i until the terminator starts at i or i reaches n-e.
while ((i<(n-e)) && (s(i,i+e)!=end)) i++;
505
tree (TUPLE, "\\begin-verbatim"),
507
tree (TUPLE, "\\end-verbatim"));
510
/******************************************************************************
511
* This routine may be used to transform accented characters to the Cork format
512
******************************************************************************/
514
// Base letter for each character code in the range 128..255; entry k belongs
// to the character with code k+128 (see the lookup in accented_to_Cork).
// The value 16 is the code which accented_to_Cork uses for "\i".
// NOTE(review): ' ' presumably marks codes without an accent decomposition.
static char Cork_unaccented[128]= {
515
'A', ' ', 'C', 'C', 'D', 'E', ' ', 'G',
516
'L', 'L', ' ', 'N', 'N', ' ', 'O', 'R',
517
'R', 'S', 'S', 'S', 'T', 'T', 'U', 'U',
518
'Y', 'Z', 'Z', 'Z', ' ', 'I', 'd', ' ',
519
'a', ' ', 'c', 'c', 'd', 'e', ' ', 'g',
520
'l', 'l', ' ', 'n', 'n', ' ', 'o', 'r',
521
'r', 's', 's', 's', 't', 't', 'u', 'u',
522
'y', 'z', 'z', 'z', ' ', ' ', ' ', ' ',
523
'A', 'A', 'A', 'A', 'A', 'A', ' ', 'C',
524
'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
525
'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ',
526
' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
527
'a', 'a', 'a', 'a', 'a', 'a', ' ', 'c',
528
'e', 'e', 'e', 'e', 16 , 16 , 16 , 16 ,
529
'd', 'n', 'o', 'o', 'o', 'o', 'o', ' ',
530
' ', 'u', 'u', 'u', 'u', 'y', ' ', ' '
533
// Accent command character for each character code in the range 128..255,
// parallel to Cork_unaccented: entry k gives the character following the
// backslash in the accent command for the character with code k+128.
// NOTE(review): ' ' presumably marks codes without an accent decomposition.
static char Cork_accent[128]= {
534
'u' , ' ' , '\'', 'v' , 'v' , 'v' , ' ' , 'u' ,
535
'\'', ' ' , ' ' , '\'', 'v' , ' ' , 'H' , '\'',
536
'v' , '\'', 'v' , 'c' , 'v' , 'c' , 'H' , ' ' ,
537
'\"', '\'', 'v' , '.' , ' ' , '.' , '=' , ' ' , // "
538
'u' , ' ' , '\'', 'v' , 'v' , 'v' , ' ' , 'u' ,
539
'\'', ' ' , ' ' , '\'', 'v' , ' ' , 'H' , '\'',
540
'v' , '\'', 'v' , 'c' , 'v' , 'c' , 'H' , ' ' ,
541
'\"', '\'', 'v' , '.' , ' ' , ' ' , ' ' , ' ' , // "
542
'`' , '\'', '^' , '~' , '\"', ' ' , ' ' , 'c' , // "
543
'`' , '\'', '^' , '\"', '`' , '\'', '^' , '\"', // "
544
'=' , '~' , '`' , '\'', '^' , '~' , '\"', ' ' , // "
545
' ' , '`' , '\'', '^' , '\"', '\'', ' ' , ' ' , // "
546
'`' , '\'', '^' , '~' , '\"', ' ' , ' ' , 'c' , // "
547
'`' , '\'', '^' , '\"', '`' , '\'', '^' , '\"', // "
548
'=' , '~' , '`' , '\'', '^' , '~' , '\"', ' ' , // "
549
' ' , '`' , '\'', '^' , '\"', '\'', ' ' , ' ' // "
553
// Recursively replaces accent commands in the tree t by single characters,
// using the tables Cork_unaccented and Cork_accent.
accented_to_Cork (tree t) {
554
// Trees without children are returned unchanged.
if (arity (t) == 0) return t;
557
// Convert all children first; r receives the converted children.
for (i=0; i<n; i++) r[i]= accented_to_Cork (t[i]);
558
if (is_compound (t[0])) return r;
560
string s= t[0]->label;
561
// Candidate accent command: the head is a backslash plus one character and
// there is one atomic argument whose label has at most two characters.
// NOTE(review): n is set in a line which is not visible in this listing.
if ((N(s)==2) && (s[0]=='\\') && (n==2) &&
562
is_atomic (t[1]) && (N(t[1]->label)<=2)) {
563
string v= t[1]->label;
570
// Stand-alone accents, selected by the accent character s[1].
// NOTE(review): the condition guarding these returns is not visible here.
if (s[1] == '\'') return "\001";
571
if (s[1] == '^' ) return "\136";
572
if (s[1] == '\"') return "\004"; // "
573
if (s[1] == '~' ) return "\176";
574
if (s[1] == '=' ) return "\026";
575
if (s[1] == '.' ) return "\137";
576
if (s[1] == 'u' ) return "\025";
577
if (s[1] == 'v' ) return "\024";
578
if (s[1] == 'H' ) return "\175";
579
if (s[1] == 'c' ) return "\030";
582
// c1 is the base letter and c2 the accent; "\i" uses the code 16.
char c1= v[0], c2= s[1];
583
if (v=="\\i") c1= (char) 16;
584
// Search the tables for a matching pair; index i stands for code i+128.
// NOTE(review): the loop stops at 127 although the tables have 128 entries,
// so the last entry is never examined -- confirm that this is intended.
if ((N(v)==1) || (v=="\\i"))
585
for (i=0; i<127; i++)
586
if ((Cork_unaccented[i]==c1) && (Cork_accent[i]==c2))
587
return tree (string ((char) (i+128)));
593
/******************************************************************************
595
******************************************************************************/
598
// Parses the latex string s: the input is passed through dos_to_better,
// parsed with the parser ltx and post-processed by accented_to_Cork.
// NOTE(review): ltx, i and t are declared in lines not visible in this listing.
parse_latex (string s) {
600
s= dos_to_better (s);
603
// Start in text mode without emphasis.
command_type ("!mode") = "text";
604
command_type ("!em") = "false";
605
t= accented_to_Cork (ltx.parse (s, i));
606
// Put the global state back to text mode without emphasis.
command_type ("!mode") = "text";
607
command_type ("!em") = "false";
612
// Parses s with parse_latex and wraps the result in a "!file" compound.
parse_latex_document (string s) {
613
return compound ("!file", parse_latex (s));