~ubuntu-branches/ubuntu/jaunty/poppler/jaunty-security

1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
1
//========================================================================
2
//
3
// ABWOutputDev.cc
4
//
1.1.18 by Sebastien Bacher
Import upstream version 0.10.3
5
// Copyright 2006-2007 Jauco Noordzij <jauco@jauco.nl>
6
// Copyright 2007 Dominic Lachowicz <cinamod@hotmail.com>
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
7
//
8
// Based somewhat on HtmlOutputDev.cc
9
//
10
//========================================================================
11
12
#ifdef __GNUC__
13
#pragma implementation
14
#endif
15
16
#include "config.h"
17
#include <stdio.h>
18
#include <stdlib.h>
19
#include <stdarg.h>
20
#include <stddef.h>
21
#include <ctype.h>
22
#include <math.h>
23
#include "goo/GooString.h"
24
#include "goo/GooList.h"
25
#include "UnicodeMap.h"
26
#include "goo/gmem.h"
27
#include "Error.h"
28
#include "GfxState.h"
29
#include "GlobalParams.h"
30
#include "ABWOutputDev.h"
31
#include "PDFDoc.h"
32
33
#include <libxml/parser.h>
34
#include <libxml/tree.h>
35
#include <libxml/xpath.h>
36
#include <libxml/xpathInternals.h>
37
38
39
// Inter-character space width which will cause addChar to start a new
40
// word.
41
#define minWordBreakSpace 0.1
42
43
// Maximum inter-word spacing, as a fraction of the font size.
44
#define maxWordSpacing 1.5
45
46
// Max distance between baselines of two lines within a block, as a
47
// fraction of the font size.
48
#define maxLineSpacingDelta 1.5
49
50
#define C_maxVCutValue 4
51
#define C_maxHCutValue 5
52
//------------------------------------------------------------------------
53
// ABWOutputDev
54
//------------------------------------------------------------------------
55
56
// Builds the output device on top of a caller-supplied libxml2 document.
// An <abiword> root element is installed in ext_doc with two children:
// <styles> (filled by updateFont()) and <content> (receives one <page> per
// endPage()).
ABWOutputDev::ABWOutputDev(xmlDocPtr ext_doc)
{
  pdfdoc = NULL;
  N_page = N_style = N_text = N_styleset = N_Block = N_word = NULL;
  // N_colset is copied into a temporary by ATP_recursive() before anything
  // assigns it; start both column pointers from a defined value.
  N_column = N_colset = NULL;
  // beginWord() reads Y1 (and X2/Y2) before the first word has set them.
  X1 = X2 = Y1 = Y2 = 0;
  horDist = verDist = height = 0;

  doc = ext_doc;
  N_root = xmlNewNode(NULL, BAD_CAST "abiword");
  xmlDocSetRootElement(doc, N_root);
  N_styleset = xmlNewChild(N_root, NULL, BAD_CAST "styles", NULL);
  N_content = xmlNewChild(N_root, NULL, BAD_CAST "content", NULL);
  uMap = globalParams->getTextEncoding();
  maxStyle = Style = 1;
}
68
69
// Releases what the device still owns and shuts libxml2's parser down.
ABWOutputDev::~ABWOutputDev() {
  // N_page is only non-NULL between startPage() and endPage(); during that
  // window it is a detached tree (never added to doc), so nobody else frees it.
  if (N_page) {
    xmlFreeNode(N_page);
    N_page = NULL;
  }
  // NOTE(review): libxml2 describes xmlCleanupParser() as a library-wide
  // cleanup; kept here to preserve existing behaviour -- confirm that calling
  // it per device instance is intended.
  xmlCleanupParser();
}
72
73
void ABWOutputDev::startPage(int pageNum, GfxState *state) {
74
  /*While reading a pdf page this node acts as a placeholder parent.
75
  when conversion is finished and the page is structured as we like it
76
  all text fragments are moved from N_page to N_content.*/
77
  N_page = xmlNewNode(NULL, BAD_CAST "page");
78
  G_pageNum = pageNum;
79
} 
80
81
/*Callback to denote that poppler reached the end of a page
82
here I insert most of the interesting processing stuff*/
83
void ABWOutputDev::endPage() {
84
  //make sure all words are closed
85
  endTextBlock();
86
  cleanUpNode(N_page, true);
87
  //xmlAddChild(N_content, N_page);
88
  //xmlSaveFormatFileEnc("pre-cut.xml", doc, "UTF-8", 1);
89
  //xmlUnlinkNode(N_page);
90
  //call the top down cutting mechanism
91
  recursiveXYC(N_page);
92
  //by stopping to worry about creating empty nodes I made the code quite a 
93
  //bit more robust. This function makes sure we have a nice'n'clean tree
94
  cleanUpNode(N_page, true);
95
  //xmlAddChild(N_content, N_page);
96
  //xmlSaveFormatFileEnc("raw.xml", doc, "UTF-8", 1);
97
  //xmlUnlinkNode(N_page);
98
  
99
  //Interpret the XY tree and infer text blocks and columns
100
  interpretXYTree();
101
  cleanUpNode(N_page, true);
102
  //xmlAddChild(N_content, N_page);
103
  //xmlSaveFormatFileEnc("interpreted.xml", doc, "UTF-8", 1);
104
  //xmlUnlinkNode(N_page);
105
  
106
  //I have blocks and columns, this function will turn that into paragraphs and
107
  //columns
108
  generateParagraphs();
109
  cleanUpNode(N_page, true);
110
  xmlAddChild(N_content, N_page);
111
  N_page = NULL;
112
}
113
114
// Recursive XY cut: finds the widest vertical and horizontal whitespace
// separators among the children of nodeset (getBiggestSeperator()), splits
// the children into two halves along the chosen one (splitNodes()) and then
// recurses into both halves. Stops when neither direction has a separator.
void ABWOutputDev::recursiveXYC(xmlNodePtr nodeset) {
  float vertLo, vertHi, horLo, horHi;

  const float widestVert = getBiggestSeperator(nodeset, VERTICAL, &vertLo, &vertHi);
  const float widestHor = getBiggestSeperator(nodeset, HORIZONTAL, &horLo, &horHi);
  const bool haveVert = !(widestVert == -1);
  const bool haveHor = !(widestHor == -1);

  if (!haveVert && !haveHor) {
    //FIXME: add assertions that both values are >= -1
    printf("No seperators\n");
    return;
  }

  bool cutVertically;
  if (haveVert && haveHor) {
    // When people read a text they prefer vertical cuts over horizontal
    // ones. The 1.7 factor is empirical.
    cutVertically = (widestVert >= (widestHor/1.7));
  }
  else {
    // Only one direction offers a separator at all.
    cutVertically = haveVert;
  }

  if (cutVertically) {
    splitNodes(vertLo, VERTICAL, nodeset, widestVert);
  }
  else {
    splitNodes(horLo, HORIZONTAL, nodeset, widestHor);
  }

  // splitNodes() left exactly two container children behind.
  recursiveXYC(nodeset->children);
  recursiveXYC(nodeset->children->next);
}
152
1.1.8 by Martin Pitt
Import upstream version 0.5.91
153
void ABWOutputDev::splitNodes(float splitValue, unsigned int direction, xmlNodePtr N_parent, double seperator){
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
154
  //This function takes a nodeset and splits it based on a cut value. It returns
155
  //the nodePtr with two childnodes, the both chunks.
156
  xmlNodePtr N_move, N_cur, N_newH, N_newL;
157
  char * propName;
158
  const char *nodeName;
159
  char buf[20];
1.1.8 by Martin Pitt
Import upstream version 0.5.91
160
  if (direction == HORIZONTAL) {
161
    propName = "Y1"; 
162
    nodeName = "horizontal";
163
  }
164
  else { 
165
    propName = "X1"; 
166
    nodeName = "vertical";
167
  }
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
168
  N_newH = xmlNewNode(NULL, BAD_CAST nodeName);
169
  N_newL = xmlNewNode(NULL, BAD_CAST nodeName);
1.1.8 by Martin Pitt
Import upstream version 0.5.91
170
  sprintf(buf, "%f", seperator); 
171
  xmlNewProp(N_newH, BAD_CAST "diff", BAD_CAST buf);
172
  sprintf(buf, "%f", seperator); 
173
  xmlNewProp(N_newL, BAD_CAST "diff", BAD_CAST buf);
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
174
  N_cur = N_parent->children;
175
  while (N_cur){
176
    N_move = N_cur->next;
1.1.8 by Martin Pitt
Import upstream version 0.5.91
177
    xmlUnlinkNode(N_cur);
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
178
    if (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST propName)) > splitValue){
179
      xmlAddChild(N_newH, N_cur);
180
    }
181
    else {
182
      xmlAddChild(N_newL, N_cur);
183
    }
184
    N_cur = N_move;
185
  }
186
  xmlAddChild(N_parent, N_newL);
187
  xmlAddChild(N_parent, N_newH);
188
}
189
190
// Finds the widest strip of whitespace between the children of N_set along
// one axis.
//   N_set     - node whose children carry X1/X2/Y1/Y2 attributes
//   direction - VERTICAL inspects X1/X2, anything else inspects Y1/Y2
//   C1, C2    - receive the start and end coordinate of the widest gap
//               (both 0 when no gap was found)
// Returns the width of that gap, or -1 when there is no gap or no child.
float ABWOutputDev::getBiggestSeperator(xmlNodePtr N_set, unsigned int direction, float * C1, float * C2)
{
  int i = 0;
  int nodeCount = xmlLsCountNode(N_set);
  float store;
  int min;
  float gap, endV;
  float * stt;
  float * end;
  xmlChar *val;
  const char *sttProp = (direction == VERTICAL) ? "X1" : "Y1";
  const char *endProp = (direction == VERTICAL) ? "X2" : "Y2";

  // give the out-parameters a defined value on every path
  *C1 = 0;
  *C2 = 0;
  if (nodeCount <= 0){
    //Add assertion that this shouldn't happen
    fprintf(stderr,"No child nodes");
    return -1;
  }
  stt = new float[nodeCount];
  end = new float[nodeCount];
  //store all variables in two arrays (one for start, one for end coordinates)
  for (xmlNodePtr N_cur = N_set->children; N_cur != NULL && i < nodeCount; N_cur = N_cur->next){
    // xmlGetProp() hands back a copy that must be released with xmlFree()
    val = xmlGetProp(N_cur, BAD_CAST sttProp);
    stt[i] = xmlXPathCastStringToNumber(val);
    if (val)
      xmlFree(val);
    val = xmlGetProp(N_cur, BAD_CAST endProp);
    end[i] = xmlXPathCastStringToNumber(val);
    if (val)
      xmlFree(val);
    i++;
  }
  //Sort both arrays by start coordinate (selection sort)
  for (i = 0; i < nodeCount - 1; i++){
    min = i;
    for (int j = i + 1; j < nodeCount; j++)
      // compare against the smallest value seen so far, not against stt[i];
      // otherwise the last smaller element wins and the result is unsorted
      if (stt[j] < stt[min])
        min = j;
    store = stt[i];
    stt[i] = stt[min];
    stt[min] = store;
    store = end[i];
    end[i] = end[min];
    end[min] = store;
  }
  //find the largest gap
  gap = -1;
  endV = end[0];
  for (int inspect = 1; inspect < nodeCount; inspect++){
    //no gap (or not at least 0.5 wider than the best gap so far)
    if (((stt[inspect] - endV) - gap) < 0.5){ //FIXME: threshold needs checking out
      //partial overlap instead of complete one
      if (end[inspect] > endV)
        endV = end[inspect];
    }
    //gap
    else{
      //gap is larger than any previous gap
      if (gap < (stt[inspect] - endV)){
        gap = stt[inspect] - endV;
        *C1 = endV;
        *C2 = stt[inspect];
      }
      endV = end[inspect];
    }
  }
  delete[] stt;
  delete[] end;
  return gap;
}
261
262
void ABWOutputDev::updateFont(GfxState *state) {
263
  char buf[160];
264
  xmlNodePtr N_cur;
265
  GfxFont *font;
266
  bool found = false;
267
  bool isBold, isItalic, S_isBold, S_isItalic;
268
  isBold = isItalic = S_isBold =  S_isItalic = false;
269
  font = state->getFont();
270
  GooString *ftName;
271
  char *fnEnd, *fnName;
1.1.12 by Loic Minier
Import upstream version 0.8.2
272
  int fnStart, ftSize;
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
273
  //the first time this function is called there is no funt.
274
  //Fixme: find out if that isn'y a bug
275
  if (font){
1.1.8 by Martin Pitt
Import upstream version 0.5.91
276
    isBold = (font->isBold() || font->getWeight() >6 || (strstr(font->getOrigName()->getCString(), "Bold")-font->getOrigName()->getCString() == (font->getOrigName()->getLength()-4)));
277
    isItalic =  (font->isItalic() || (strstr(font->getOrigName()->getCString(), "Italic")-font->getOrigName()->getCString() == (font->getOrigName()->getLength()-6)));
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
278
    ftSize = int(state->getTransformedFontSize())-1;
279
    ftName = new GooString(font->getOrigName());
280
    fnStart = strcspn(ftName->getCString(), "+");
281
    if (fnStart < ftName->getLength())
282
      ftName->del(0,fnStart+1);
283
    fnEnd = strrchr(ftName->getCString(), 44);
284
    if (fnEnd == 0)
285
      fnEnd = strrchr(ftName->getCString(), 45);
286
    if (fnEnd != 0)
287
      ftName->del(fnEnd-ftName->getCString(),ftName->getLength()-1);
288
    
289
/*    fnName = ftName;
290
    if (isBold or isItalic){
291
      fnStart = strcspn(fnName, "+");
292
      if (fnStart == font->getOrigName()->getLength())
293
        fnStart = 0;
294
      else fnStart++;
295
296
      fnEnd = strstr(fnName, ",");
297
      if (fnEnd == 0)
298
        fnEnd = strstr(fnName, "-");
299
      if (fnEnd != 0)
300
        fnName[fnEnd-fnName] = 0;
301
//      char fntName[fnLength];
302
//      strncpy (fntName,fnName+fnStart+1,fnLength);
303
      fnName+=fnStart;
304
//      fnName = fntName;
305
    }
306
    else {*/
307
      fnName = ftName->getCString();
308
//    }
309
    for (N_cur = N_styleset->children; N_cur; N_cur = N_cur ->next){
310
      if (
311
       isBold == (xmlStrcasecmp(xmlGetProp(N_cur,BAD_CAST "bold"),BAD_CAST "bold;") == 0)
1.1.8 by Martin Pitt
Import upstream version 0.5.91
312
       &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
313
       isItalic == (xmlStrcasecmp(xmlGetProp(N_cur,BAD_CAST "italic"),BAD_CAST "italic") == 0)
1.1.8 by Martin Pitt
Import upstream version 0.5.91
314
       &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
315
       xmlStrcasecmp(xmlGetProp(N_cur,BAD_CAST "font"),BAD_CAST fnName) == 0
1.1.8 by Martin Pitt
Import upstream version 0.5.91
316
       &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
317
       xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "size")) == ftSize
318
      ) {
319
        found = true;
320
        Style = int(xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "id")));
321
      }
322
    }
323
    if (!found){
324
      N_cur = xmlNewChild(N_styleset, NULL, BAD_CAST "s", NULL);
325
      xmlSetProp(N_cur, BAD_CAST "type", BAD_CAST "P");
326
      sprintf(buf, "%d", maxStyle++);
327
      xmlSetProp(N_cur, BAD_CAST "name", BAD_CAST buf);
328
      xmlSetProp(N_cur, BAD_CAST "id", BAD_CAST buf);
329
      Style = maxStyle;
330
      sprintf(buf, "%d", ftSize); xmlSetProp(N_cur, BAD_CAST "size", BAD_CAST buf);
331
      isBold   ? xmlSetProp(N_cur, BAD_CAST "bold", BAD_CAST "bold;")  : xmlSetProp(N_cur, BAD_CAST "bold", BAD_CAST "normal;");
332
      isItalic ? xmlSetProp(N_cur, BAD_CAST "italic", BAD_CAST "italic"): xmlSetProp(N_cur, BAD_CAST "italic", BAD_CAST "normal");
333
      xmlSetProp(N_cur, BAD_CAST "font", BAD_CAST fnName);
334
    }
335
  }
336
}
337
338
void ABWOutputDev::drawChar(GfxState *state, double x, double y,
339
			double dx, double dy,
340
			double originX, double originY,
341
			CharCode code, int nBytes, Unicode *u, int uLen)
342
{
343
  //I wouldn't know what size this should safely be. I guess 64 bytes should be
344
  //enough for any unicode character
345
  char buf[64];
346
  int charLen;
347
  x = dx;
348
  y = dy;
349
  //state->textTransformDelta(dx * state->getHorizScaling(), dy, &dx, &dy);
350
  //state->transformDelta(dx, dy, &dx, &dy);
351
  if (uLen == 1 && code == 0x20) {
352
    //If we break a text sequence on space, then the X1 should be increased
353
    //but the Y1 and Y2 should remain the same.
354
    beginWord(state,X2+dx,Y2);
355
  }
356
  else {
357
    X2    += dx;
358
    Y2    += dy;
359
    charLen = uMap->mapUnicode(*u,buf,sizeof(buf));
360
    //Getting Unicode to libxml is something I need to fix.
361
    //simply passing it using a bad-cast isn't working.
362
    //I assume that CharCode code it the U+value of the unicode character
363
    //But for a ligature code gives me DF which is the ringel-s, I guess
364
    //code should be two bytes wide?
365
    xmlNodeAddContentLen(N_word, BAD_CAST buf, charLen);
366
  }
367
}
368
369
// String-start callback. Compares the device-space start of the new string
// with the end of the previous word and decides whether to continue the
// word, start a new word, or start a new text block.
void ABWOutputDev::beginString(GfxState *state, GooString *s) {
  double devX, devY;
  state->transform(state->getCurX(), state->getCurY(), &devX, &devY);

  if (!N_word) {
    //This is the first word. Clear all values and open a block (which in
    //turn opens the word).
    X2 = devX;
    Y2 = devY;
    horDist = 0;
    verDist = 0;
    height  = 0;
    beginTextBlock(state,devX,devY);
    return;
  }

  verDist = devY-Y2;
  horDist = devX-X2;
  const double fontSize = state->getTransformedFontSize();
  const bool lineJump = fabs(verDist) > (fontSize/maxLineSpacingDelta);

  //FIXME: this decision seems awkward.
  if (horDist > (fontSize*maxWordSpacing) || lineJump) {
    // far to the right, or on a different line: new block
    beginTextBlock(state,devX,devY);
  }
  else if (horDist > (fontSize*minWordBreakSpace)) {
    // lineJump is known to be false here, so only the horizontal test is left
    beginWord(state,devX,devY);
  }
}
397
398
// String-end callback. Nothing happens here: words and text blocks are
// closed by beginWord(), beginTextBlock() and endPage().
void ABWOutputDev::endString(GfxState *state) {
}
401
402
// Closes the current word (if any) and opens a new <word> node under N_Block
// whose baseline starts at (x, y). The node receives "X1", "Y1" (top edge:
// baseline minus ascent) and "style" attributes; endWord() adds the rest.
void ABWOutputDev::beginWord(GfxState *state, double x, double y){
  // "%f" of a large coordinate does not fit in 20 bytes; use a roomier
  // buffer and snprintf so oversized values are truncated, not overflowed.
  char buf[64];
  GfxFont *font;
//  printf("***BREAK!***\n");
  endWord();
  X1 = x;
  Y2 = y;

  horDist = X1-X2;
  verDist = Y1-Y2;

  X2 = X1;
  // updateFont() already copes with a state that has no font yet; do the
  // same here instead of dereferencing NULL.
  font = state->getFont();
  height = font ? font->getAscent() * state->getTransformedFontSize() : 0;
  Y1 = Y2-height;

  N_word = xmlNewChild(N_Block, NULL, BAD_CAST "word", NULL);
  snprintf(buf, sizeof(buf), "%f", X1); xmlNewProp(N_word, BAD_CAST "X1", BAD_CAST buf);
  snprintf(buf, sizeof(buf), "%f", Y1); xmlNewProp(N_word, BAD_CAST "Y1", BAD_CAST buf);
  snprintf(buf, sizeof(buf), "%d", Style); xmlNewProp(N_word, BAD_CAST "style", BAD_CAST buf);
}
421
422
void ABWOutputDev::endWord(){
423
  char buf[20];
424
  if (N_word) {
425
    sprintf(buf, "%f", X2);    xmlNewProp(N_word, BAD_CAST "X2", BAD_CAST buf);
426
    sprintf(buf, "%f", Y2);    xmlNewProp(N_word, BAD_CAST "Y2", BAD_CAST buf);
427
    sprintf(buf, "%f", X2-X1); xmlNewProp(N_word, BAD_CAST "width", BAD_CAST buf);
428
    sprintf(buf, "%f", Y2-Y1); xmlNewProp(N_word, BAD_CAST "height", BAD_CAST buf);
429
    N_word = NULL;
430
  }
431
}
432
433
// Closes the open text block (if any), appends a new <Textblock> to the
// current page and opens its first word at (x, y).
void ABWOutputDev::beginTextBlock(GfxState *state, double x, double y){
  endTextBlock();

  N_Block = xmlNewChild(N_page, NULL, BAD_CAST "Textblock", NULL);

  // beginWord() attaches the new word to N_Block, so it has to come last
  beginWord(state,x,y);
}
438
439
void ABWOutputDev::endTextBlock(){
440
  if (N_Block) {
441
    endWord();
442
    N_Block = NULL;  
443
  }
444
}
445
/*
446
This will be a function to retrieve coherent text blocks from the chunk tree.*/
447
void ABWOutputDev::interpretXYTree(){
448
  xmlNodePtr N_oldPage;
449
  N_oldPage = N_page;
450
  N_page = xmlNewNode(NULL, BAD_CAST "page");
451
  N_column = N_page;
452
  //xmlAddChild(N_content, N_page);
453
  N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
454
  ATP_recursive(N_oldPage);
455
}
456
457
// Walks one node of the XY-cut tree and moves its text into the new page
// tree (N_colset / N_column / N_Block). What happens depends on the number
// of children of N_parent:
//   one child   - it is put on a new <line> of the current chunk
//   two children
//     first is <vertical>: both children become <column>s of a <colset>
//       (directly nested verticals reuse the enclosing colset, so three
//       columns end up side by side) and are processed recursively
//     first is <Textblock>: it goes on a new line; a second Textblock joins
//       the same line, anything else is processed recursively
//     otherwise: each child gets a chunk of its own; the first is processed
//       recursively, the second is added as a line or processed recursively
//   more children - every <Textblock> child is put on one common line
void ABWOutputDev::ATP_recursive(xmlNodePtr N_parent){
  xmlNodePtr N_first, N_second, N_line;

  N_first = N_parent->children;
  if (!N_first)
    return;
  N_second = N_first->next;

  const int childCount = xmlLsCountNode(N_parent);

  // --- exactly one child -------------------------------------------------
  if (childCount == 1) {
    N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
    xmlUnlinkNode(N_first);
    xmlAddChild(N_line, N_first);
    return;
  }

  // --- anything but two children ----------------------------------------
  if (childCount != 2) {
    N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
    for (xmlNodePtr N_it = N_first; N_it; ) {
      // fetch the successor before the node is moved away
      xmlNodePtr N_after = N_it->next;
      if (xmlStrcasecmp(N_it->name,BAD_CAST "Textblock") == 0){
        xmlUnlinkNode(N_it);
        xmlAddChild(N_line, N_it);
      }
      // non-Textblock children are not expected here and are left alone
      N_it = N_after;
    }
    return;
  }

  // --- two children, columns --------------------------------------------
  if (xmlStrcasecmp(N_first->name,BAD_CAST "vertical") == 0){
    const bool nestedInVertical = (xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") == 0);
    //store the column for the moment
    xmlNodePtr N_savedCol = N_column;
    xmlNodePtr N_savedColset = NULL;

    if (!nestedInVertical){
      // first level of columns: open a colset inside the current column
      N_savedColset = N_colset;
      N_colset = xmlNewChild(N_column, NULL, BAD_CAST "colset", NULL);
    }
    else {
      // the parent is a vertical as well: the column that was just added
      // for it is replaced by the two created below
      xmlUnlinkNode(N_column);
    }

    // first column
    N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
    N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
    ATP_recursive(N_first);

    // second column
    N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
    N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
    ATP_recursive(N_second);

    //make sure we end the column by continuing in the master column
    N_column = N_savedCol;
    if (!nestedInVertical){
      if (N_savedColset != NULL)
        N_colset = N_savedColset;
      else
        fprintf(stderr,"N_templColset should not! be empty (line 823)");//FIXME: add assert
    }
    return;
  }

  // --- two children, first one is text ----------------------------------
  if (xmlStrcasecmp(N_first->name,BAD_CAST "Textblock") == 0) {
    N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
    xmlUnlinkNode(N_first);
    xmlAddChild(N_line, N_first);
    if (xmlStrcasecmp(N_second->name,BAD_CAST "Textblock") == 0) {
      //FIXME: this is not neat. The cut should not be ignored when there
      //are only two elements; line aggregation doesn't work anyway atm.
      xmlUnlinkNode(N_second);
      xmlAddChild(N_line, N_second);
    }
    else {
      ATP_recursive(N_second);
    }
    return;
  }

  // --- two children, first one is a container ---------------------------
  N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
  ATP_recursive(N_first);
  N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
  if (xmlStrcasecmp(N_second->name,BAD_CAST "Textblock") == 0) {
    N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
    xmlUnlinkNode(N_second);
    xmlAddChild(N_line, N_second);
  }
  else {
    ATP_recursive(N_second);
  }
}
612
613
/*The cleanup function. It started out as a simple function to remove empty nodes
614
so that I could call xmladdnewchildnode as often as I liked so that I wouldn't get seg-faults
615
It is now a bit more advanced, makes sure the tree is as it's supposed to be and adds information too*/
616
void ABWOutputDev::cleanUpNode(xmlNodePtr N_parent, bool aggregateInfo){
617
  double tX1=-1, tX2=-1, tY1=-1, tY2=-1;
618
  xmlNodePtr N_cur, N_next;
619
  N_cur = N_parent->children;
620
  char buf[20];
621
  int prevStyle = -1;
622
  xmlChar *val;
623
  int styleLength = xmlLsCountNode(N_styleset)+1;
624
  float stylePos;
1.1.8 by Martin Pitt
Import upstream version 0.5.91
625
  int *styles = new int[styleLength];
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
626
  for (int i=1; i< styleLength; i++) { styles[i] = 0;}
627
  /*
628
  ignore two horizontal nodes with textBlocks right underneath them. They 
629
  signal the end of a chunk, and the horizontal seperation needs to be 
630
  preserved, because it means they are different lines. The second horizontal 
631
  therefore needs to be kept.
632
  */
633
  if ((xmlLsCountNode(N_parent) == 2)
1.1.8 by Martin Pitt
Import upstream version 0.5.91
634
      &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
635
     xmlStrcasecmp(N_parent->name,BAD_CAST "horizontal") == 0
1.1.8 by Martin Pitt
Import upstream version 0.5.91
636
      && 
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
637
     N_cur
1.1.8 by Martin Pitt
Import upstream version 0.5.91
638
      &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
639
     N_cur->next
1.1.8 by Martin Pitt
Import upstream version 0.5.91
640
      &&
641
     xmlStrcasecmp(N_cur->name,BAD_CAST "horizontal") == 0 && xmlStrcasecmp(N_cur->next->name,BAD_CAST "horizontal") == 0
642
      &&
643
     xmlLsCountNode(N_cur) == 1 && xmlLsCountNode(N_cur->next) == 1
644
      &&
645
     xmlStrcasecmp(N_cur->children->name,BAD_CAST "Textblock") == 0 && xmlStrcasecmp(N_cur->next->children->name,BAD_CAST "Textblock") == 0
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
646
     ) {
647
    xmlAddPrevSibling(N_cur->next,N_cur->children); 
648
    xmlUnlinkNode(N_cur);
649
  } 
650
  /*
651
  This removes columns if one of the parts is actually a single letter.
652
  I found out I liked the columns better, so I have the code commented out.
653
  */
654
/*  else if ((xmlLsCountNode(N_parent) == 2)
1.1.8 by Martin Pitt
Import upstream version 0.5.91
655
             &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
656
            N_cur
1.1.8 by Martin Pitt
Import upstream version 0.5.91
657
             &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
658
            N_cur->next
1.1.8 by Martin Pitt
Import upstream version 0.5.91
659
             && 
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
660
            xmlStrcasecmp(N_cur->name,BAD_CAST "vertical") == 0
1.1.8 by Martin Pitt
Import upstream version 0.5.91
661
             &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
662
            xmlStrcasecmp(N_cur->next->name,BAD_CAST "vertical") == 0
1.1.8 by Martin Pitt
Import upstream version 0.5.91
663
             && 
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
664
            (N_cur->children) 
1.1.8 by Martin Pitt
Import upstream version 0.5.91
665
             &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
666
            (N_cur->children->children)
1.1.8 by Martin Pitt
Import upstream version 0.5.91
667
             &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
668
            (N_cur->children->children->children)
1.1.8 by Martin Pitt
Import upstream version 0.5.91
669
             &&
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
670
            xmlStrlen(N_cur->children->children->children->content) == 1) {
671
    N_next = N_cur->next;
672
    xmlAddChild(N_parent, N_next->children);
673
    xmlAddPrevSibling(N_next->children->children, N_cur->children);
674
    xmlUnlinkNode(N_cur);
675
    xmlUnlinkNode(N_next);
676
  } */else {
677
    while (N_cur){
678
      N_next = N_cur->next;
679
      cleanUpNode(N_cur, aggregateInfo);
1.1.8 by Martin Pitt
Import upstream version 0.5.91
680
      if (xmlLsCountNode(N_cur) == 0 && (xmlStrcasecmp(N_cur->name,BAD_CAST "cbr") != 0) && (xmlStrcasecmp(N_cur->name,BAD_CAST "s") != 0))
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
681
        xmlUnlinkNode(N_cur);
682
      //If the node is still around
683
      N_cur = N_next;
684
    }
685
  }
686
  //If a countainer element has only one child, it can be removed except for vertical
687
  //cuts with only one textElement;
688
  //the main reason for this code is to remove the crumbs after cleaning up in the loop above
1.1.8 by Martin Pitt
Import upstream version 0.5.91
689
  if ((xmlLsCountNode(N_parent) == 1) && ((xmlStrcasecmp(N_parent->name,BAD_CAST "horizontal") == 0) || ((xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") == 0) && (xmlStrcasecmp(N_parent->children->name,BAD_CAST "Textblock") != 0)))){
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
690
    N_cur = N_parent->children;
691
    xmlAddPrevSibling(N_parent,N_cur);
692
    xmlUnlinkNode(N_parent);
693
  }
694
  //We cannot remove the page element so if it has only one childnode, we remove that childnode instead
1.1.8 by Martin Pitt
Import upstream version 0.5.91
695
  if ((xmlStrcasecmp(N_parent->name,BAD_CAST "page") == 0) && (xmlLsCountNode(N_parent) == 1)) {
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
696
    N_cur = N_parent->children->children;
697
    while (N_cur){
698
      N_next = N_cur->next;
699
      xmlUnlinkNode(N_cur);
700
      xmlAddChild(N_parent, N_cur);
701
      N_cur = N_next;
702
    }
703
    xmlUnlinkNode(N_parent->children);
704
  }
705
  //Ok, so by this time the N_parent and his children are guaranteed to be clean
706
  //this for loop gets information from the 'word' elements and propagates it up
707
  //the tree. 
1.1.8 by Martin Pitt
Import upstream version 0.5.91
708
  if (aggregateInfo && xmlStrcasecmp(N_parent->name,BAD_CAST "word") != 0) {
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
709
    for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
710
      val = xmlGetProp(N_cur,BAD_CAST "style");
711
      stylePos = xmlXPathCastStringToNumber(val);
712
      //fprintf(stderr,"1: %f, %d\n",stylePos,int(stylePos));
713
      styles[int(stylePos)]=styles[int(stylePos)]+1;
714
      //fprintf(stderr,"2: styles[%d] = %d\n",int(stylePos),styles[int(stylePos)]);
1.1.8 by Martin Pitt
Import upstream version 0.5.91
715
      (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X1")) < tX1 || tX1 == -1)? tX1 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X1")) : tX1 = tX1;
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
716
      (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X2")) > tX2)             ? tX2 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "X2")) : tX2 = tX2;
1.1.8 by Martin Pitt
Import upstream version 0.5.91
717
      (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y1")) < tY1 || tY1 == -1)? tY1 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y1")) : tY1 = tY1;
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
718
      (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y2")) > tY2)             ? tY2 = xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "Y2")) : tY2 = tY2;
719
    }
720
    sprintf(buf, "%f", tX1);     xmlSetProp(N_parent, BAD_CAST "X1", BAD_CAST buf);
721
    sprintf(buf, "%f", tX2);     xmlSetProp(N_parent, BAD_CAST "X2", BAD_CAST buf);
722
    sprintf(buf, "%f", tY1);     xmlSetProp(N_parent, BAD_CAST "Y1", BAD_CAST buf);
723
    sprintf(buf, "%f", tY2);     xmlSetProp(N_parent, BAD_CAST "Y2", BAD_CAST buf);
724
    sprintf(buf, "%f", tX2-tX1); xmlSetProp(N_parent, BAD_CAST "width", BAD_CAST buf);
725
    sprintf(buf, "%f", tY2-tY1); xmlSetProp(N_parent, BAD_CAST "height", BAD_CAST buf);
726
    prevStyle = 0;
727
    styles[0] = -1;
728
    for (int i=1; i< styleLength; i++) { if (styles[i] > styles[prevStyle]) prevStyle = i; }
729
    //fprintf(stderr,"%d\n", prevStyle);
730
    if (prevStyle > 0){
731
      sprintf(buf, "%d", prevStyle);     xmlSetProp(N_parent, BAD_CAST "style", BAD_CAST buf);
732
    }
733
  }
1.1.8 by Martin Pitt
Import upstream version 0.5.91
734
  if (N_parent->children && xmlStrcasecmp(N_parent->children->name,BAD_CAST "line") == 0 && xmlGetProp(N_parent->children,BAD_CAST "alignment") != NULL)
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
735
    xmlSetProp(N_parent, BAD_CAST "alignment", xmlGetProp(N_parent->children,BAD_CAST "alignment"));
1.1.8 by Martin Pitt
Import upstream version 0.5.91
736
1.1.18 by Sebastien Bacher
Import upstream version 0.10.3
737
   delete[] styles;
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
738
}
739
740
void ABWOutputDev::generateParagraphs() {
741
  xmlNodePtr N_cur, N_parent, N_p, N_line, N_next;
742
  int lvl;
743
  //basically I first detect the text-alignment within blocks.
744
  //ASSUMPTION: my block seperation thing is good enough so I don't need to
745
  //worry about two alignments in one paragraph
746
  
747
  X1 = 0;
748
  X2 = pdfdoc->getPageCropWidth(G_pageNum);
749
  Y1 = 0;
750
  Y2 = pdfdoc->getPageCropHeight(G_pageNum);
751
  addAlignment(N_page);
752
  
753
  //then it's a switch per alignement
754
  N_cur = N_page->children;
755
  N_parent = N_page;
756
  lvl = 1;
757
  while (N_cur) {
758
    if (xmlStrcasecmp(N_cur->name,BAD_CAST "chunk") == 0){
759
      N_p = xmlNewNode(NULL, BAD_CAST "chunk");
760
      xmlAddPrevSibling(N_cur,N_p);
761
      //N_p = xmlNewChild(N_parent, NULL, BAD_CAST "chunk", NULL);
762
      //A new paragraph is created when:
763
      switch (int(xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "alignment")))){
764
      //left
765
      case 1: //the distance between the texblock X2 and the last word X2 is more than
766
         //the following first word width.
767
         N_line = N_cur->children;
768
         while (N_line){
769
           N_next = N_line->next;
770
           xmlUnlinkNode(N_line);
771
           xmlAddChild(N_p,N_line);
772
           xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "1");
1.1.8 by Martin Pitt
Import upstream version 0.5.91
773
           if (N_next && xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
774
             if (xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")) < (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")) - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")))){
775
               N_p = xmlNewNode(NULL, BAD_CAST "chunk");
776
               xmlAddPrevSibling(N_cur,N_p);
777
             }
778
           }
779
           N_line = N_next;
780
         }
781
         break;
782
      //right
783
      case 2: //the same but now with X1 and first word and following last word
784
         N_line = N_cur->children;
785
         while (N_line){
786
           N_next = N_line->next;
787
           xmlUnlinkNode(N_line);
788
           xmlAddChild(N_p,N_line);
789
           xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "2");
1.1.8 by Martin Pitt
Import upstream version 0.5.91
790
           if (N_next && xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
791
             //fprintf(stderr,"width_next=%f, X2_bl=%f, X2_w=%f\n",xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")));
792
             if (xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")) < (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")) - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")))){
793
               N_p = xmlNewNode(NULL, BAD_CAST "chunk");
794
               xmlAddPrevSibling(N_cur,N_p);
795
             }
796
           }
797
           N_line = N_next;
798
         }
799
         break;
800
      //centered
801
      case 3: //the combined left and right space is more than the following first word
802
         N_line = N_cur->children;
803
         while (N_line){
804
           N_next = N_line->next;
805
           xmlUnlinkNode(N_line);
806
           xmlAddChild(N_p,N_line);
807
           xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "3");
1.1.8 by Martin Pitt
Import upstream version 0.5.91
808
           if (N_next && xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
809
             //fprintf(stderr,"width_next=%f, X2_bl=%f, X2_w=%f\n",xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")),xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")));
810
             if (xmlXPathCastStringToNumber(xmlGetProp(N_next->children->children,BAD_CAST "width")) < (xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "width")) - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "width")))){
811
               N_p = xmlNewNode(NULL, BAD_CAST "chunk");
812
               xmlAddPrevSibling(N_cur,N_p);
813
             }
814
           }
815
           N_line = N_next;
816
         }
817
         break;
818
      //justified
819
      case 4:
820
         //we break on all alignment=1 lines. A line with alignment=1 that is the first of a block will
821
         //also initiate a paragraph break before.
822
         N_line = N_cur->children;
823
         if (xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "alignment")) == 1){
824
           N_p = xmlNewNode(NULL, BAD_CAST "chunk");
825
           xmlAddPrevSibling(N_cur,N_p);
826
         }
827
         while (N_line){
828
           N_next = N_line->next;
829
           xmlUnlinkNode(N_line);
830
           xmlAddChild(N_p,N_line);
831
           if (xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "alignment")) == 1){
832
             N_p = xmlNewNode(NULL, BAD_CAST "chunk");
833
             xmlAddPrevSibling(N_cur,N_p);
834
           }
835
           xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "4");
836
           N_line = N_next;
837
         }
838
         break;
839
      }
840
    }
1.1.8 by Martin Pitt
Import upstream version 0.5.91
841
    else if (xmlStrcasecmp(N_cur->name,BAD_CAST "colset") == 0 || xmlStrcasecmp(N_cur->name,BAD_CAST "column") == 0){
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
842
      N_parent = N_cur;
843
      N_cur = N_cur->children;
844
      lvl++;
845
      N_p = xmlNewNode(NULL, BAD_CAST "chunk");
846
      xmlAddPrevSibling(N_cur,N_p);
847
      continue;
848
    }
849
    if (N_cur->next)
850
      N_cur = N_cur->next;
851
    else while (lvl > 0){
852
      N_cur = N_parent;
853
      N_parent = N_cur->parent;
854
      lvl--;
855
      if (N_cur->next){
856
        N_cur = N_cur->next;
857
        break;
858
      }
859
    }
860
    if (lvl==0)
861
      N_cur = NULL;
862
  }
863
}
864
865
//function that adds an 'alignment=' property to the <chunk>s
866
void ABWOutputDev::addAlignment(xmlNodePtr N_parent) {
867
  xmlNodePtr N_chunk, N_line;
868
  double tX1, tX2;
869
  bool leftMatch, rightMatch, centerMatch;
870
  int leftCnt = 0, rightCnt = 0, cntrCnt = 0, justCnt = 0;
871
  //fprintf(stderr,"Entering addAlignment\n");
872
  for (N_chunk = N_parent->children; N_chunk; N_chunk = N_chunk->next) {
873
    if (xmlStrcasecmp(N_chunk->name,BAD_CAST "chunk") == 0){
874
      X1 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"));
875
      X2 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"));
876
      //fprintf(stderr,"Found chunk\n");
877
      //if the chunk contains only one line, we don't need to loop through it.
878
      if (xmlLsCountNode(N_chunk) == 1){
879
        //fprintf(stderr,"Processing line\n");
880
        //fprintf(stderr,"X1=%f, X2=%f, cX1=%f, cX2=%f\n",X1,X2,xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1")), xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2")));
881
        //fprintf(stderr,"%f\n",(xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1")) - X1)-(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"))));
882
        //fprintf(stderr,"cX1-X1=%f, X2-cX2=%f\n",(xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1")) - X1),(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"))));
883
        // a one line chunk, is either centered or left or right-aligned.
884
        if ((xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"))-X1)-(X2-xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"))) > 1) {
885
          xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "2");
886
          xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST "2");
887
          //fprintf(stderr,"alignment = right\n");
888
        }
889
        else { 
890
        if ((xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"))-X1)-(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2")))< -1) {
891
          xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "1");
892
          xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST "1");
893
          //fprintf(stderr,"alignment = left\n");
894
        }
895
        else {
896
          xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "3");
897
          xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST "3");
898
          //fprintf(stderr,"alignment = center\n");
899
        }
900
        }
901
      }
902
      else {
903
      leftCnt = 0;
904
      rightCnt = 0;
905
      cntrCnt = 0;
906
      justCnt = 0;
907
      for (N_line = N_chunk->children; N_line; N_line = N_line->next) {
908
        //fprintf(stderr,"Processing line\n");
909
        /*
910
        |X1 - cX1| == 1
911
        |X2 - cX2| == 1
912
        |(cX1-X1)-(X2-cX2)| == 1
913
        ok, each line can be just as wide as the current set,
914
        it can be smaller and moved to the right
915
        it can be smaller and moved to the left.
916
        it can 
917
        */
918
        //fprintf(stderr,"X1=%f, X2=%f, cX1=%f, cX2=%f\n",X1,X2,xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1")), xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2")));
919
        //fprintf(stderr,"cX1-X1=%f, X2-cX2=%f\n",(xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1")) - X1),(X2 - xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2"))));
920
        leftMatch =  fabs(xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1"))-X1) < 2;
921
        rightMatch =  fabs(X2-xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2"))) < 2;
922
        centerMatch =  fabs((xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X1"))-X1)-(X2-xmlXPathCastStringToNumber(xmlGetProp(N_line,BAD_CAST "X2")))) < 2;
1.1.8 by Martin Pitt
Import upstream version 0.5.91
923
        if (leftMatch && rightMatch) {
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
924
          xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "4");
925
          justCnt++;
926
        }
927
        else if (centerMatch) {
928
          xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "3");
929
          cntrCnt++;
930
        }
931
        else if (rightMatch) {
932
          xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "2");
933
          rightCnt++;
934
        }
935
        else {
936
          xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "1");
937
          leftCnt++;
938
        }
939
      }
940
      //there is almost always one justified line in a centered text
941
      //and most justified blocks have at least one left aligned line
942
      //fprintf(stderr,"1:%d ,2:%d ,3:%d ,4:%d\n",leftCnt,justCnt,cntrCnt,rightCnt);
1.1.8 by Martin Pitt
Import upstream version 0.5.91
943
      if ((leftCnt-1 >= justCnt) && (leftCnt >= rightCnt) && (leftCnt >= cntrCnt))
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
944
        xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "1");
1.1.8 by Martin Pitt
Import upstream version 0.5.91
945
      else if ((justCnt >= leftCnt-1) && (justCnt >= rightCnt) && (justCnt >= cntrCnt))
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
946
        xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "4");
1.1.8 by Martin Pitt
Import upstream version 0.5.91
947
      else if ((cntrCnt >= justCnt-1) && (cntrCnt >= rightCnt) && (cntrCnt >= leftCnt))
1.1.7 by Sebastien Bacher
Import upstream version 0.5.9
948
        xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "3");
949
      else
950
        xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "2");
951
      }
952
    } 
953
    else {
954
      if (xmlStrcasecmp(N_chunk->name,BAD_CAST "colset") == 0){
955
        //fprintf(stderr,"Found a colset\n");
956
        addAlignment(N_chunk);
957
      }
958
      else {
959
        if (xmlStrcasecmp(N_chunk->name,BAD_CAST "column") == 0){
960
          //fprintf(stderr,"Found a column\n");
961
          tX1 = X1;
962
          tX2 = X2;
963
          X1 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X1"));
964
          X2 = xmlXPathCastStringToNumber(xmlGetProp(N_chunk,BAD_CAST "X2"));
965
          addAlignment(N_chunk);
966
          X1 = tX1;
967
          X2 = tX2;
968
        }
969
        else { //fprintf(stderr,"Found something else\n");
970
	}
971
      }
972
    }
973
  }
974
//parse all blocks, and all lines within all blocks
975
//do a set of checks and tick a flag if the check fails
976
//check for line X1 is textBlock X1
977
//check for line X2 is textblock X2
978
//check if line is centered in textBock (LX1 != TX1 && LX2 != TX2 && LX1-TX1 == TX2=LX2)
979
//if the LX1 != TX1 then how much is the difference?
980
//a line isn't left aligned if all lines have a different X1 <= not so strong assumption.
981
982
//justified if both are straight except for a couple of (same factor sized) indents at the left
983
//else centered if above calculation is correct
984
//else left aligned if left side is more straight than right (more lines in the same X1 or common factor
985
//else right
986
}
987
988
// Stores the PDF document pointer in the pdfdoc member. The pointer is only
// stored here; nothing is copied.
void ABWOutputDev::setPDFDoc(PDFDoc *priv_pdfdoc)
{
  this->pdfdoc = priv_pdfdoc;
}
991
992
void ABWOutputDev::createABW() {
993
  //*************************************************************
994
  //change styles to abiword format
995
  xmlNodePtr N_cur, N_next;
996
  xmlAttrPtr N_prop;
997
  char buf[500];
998
  for (N_cur = N_styleset->children; N_cur; N_cur = N_cur->next){
999
    sprintf(buf,"margin-top:0pt; color:000000; margin-left:0pt; text-position:normal; widows:2; text-indent:0in; font-variant:normal; margin-right:0pt; lang:nl-NL; line-height:1.0; font-size:%dpt; text-decoration:none; margin-bottom:0pt; bgcolor:transparent; text-align:left; font-stretch:normal;",int(xmlXPathCastStringToNumber(xmlGetProp(N_cur,BAD_CAST "size"))));
1000
    strncat(buf,"font-family:",12);
1001
    strncat(buf,(char *)xmlGetProp(N_cur,BAD_CAST "font"),strlen((char *)xmlGetProp(N_cur,BAD_CAST "font")));
1002
    strncat(buf,";",1);
1003
    strncat(buf,"font-weight:",12);
1004
    strncat(buf,(char *)xmlGetProp(N_cur,BAD_CAST "bold"),strlen((char *)xmlGetProp(N_cur,BAD_CAST "bold")));
1005
    strncat(buf,"font-style:",12);
1006
    strncat(buf,(char *)xmlGetProp(N_cur,BAD_CAST "italic"),strlen((char *)xmlGetProp(N_cur,BAD_CAST "italic")));
1007
    xmlSetProp(N_cur, BAD_CAST "props", BAD_CAST buf);
1008
    N_prop = xmlHasProp(N_cur, BAD_CAST "id");
1009
    if (N_prop != NULL) xmlRemoveProp(N_prop);
1010
    N_prop = xmlHasProp(N_cur, BAD_CAST "size");
1011
    if (N_prop != NULL) xmlRemoveProp(N_prop);
1012
    N_prop = xmlHasProp(N_cur, BAD_CAST "bold");
1013
    if (N_prop != NULL) xmlRemoveProp(N_prop);
1014
    N_prop = xmlHasProp(N_cur, BAD_CAST "italic");
1015
    if (N_prop != NULL) xmlRemoveProp(N_prop);
1016
    N_prop = xmlHasProp(N_cur, BAD_CAST "font");
1017
    if (N_prop != NULL) xmlRemoveProp(N_prop);
1018
  }
1019
  //*************************************************************
1020
  //Change the rest of the document
1021
  //each child of N_content is a page
1022
  N_cur = N_content->children;
1023
  while (N_cur){
1024
    //we creat a section node and attach it to the root, it will com after all
1025
    //the page nodes. Then we transform the page, and finally remove it
1026
    N_next = N_cur->next;
1027
    //fprintf(stderr,"***Transforming page\n");
1028
    N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
1029
    transformPage(N_cur);
1030
    xmlUnlinkNode(N_cur);
1031
    //fprintf(stderr,"***Finished transforming page\n");
1032
    N_cur = N_next;
1033
  }
1034
  cleanUpNode(N_root, false);
1035
}
1036
1037
// Recursively translates one node of the intermediate page tree into
// AbiWord nodes. What happens depends on the node's name:
//   page   - every child is transformed.
//   chunk  - a <p> is appended to the current section (N_Block) and becomes
//            N_text. A positive "style" attribute is copied onto it and the
//            "alignment" attribute (1..4) is mapped to a text-align
//            property. The first child of every word (chunk -> line -> text
//            -> word) is then moved into the <p>, each followed by a space.
//   column - every child is transformed, then a <cbr> is appended to N_text.
//   colset - a new <section> with "columns:<number of children>" becomes
//            N_Block, every child is transformed, and afterwards a fresh
//            <section> is opened for whatever follows.
// Nodes with any other name are ignored.
void ABWOutputDev::transformPage(xmlNodePtr N_parent){
  char buf[60];
  xmlNodePtr N_cur, N_curLine, N_curText, N_curWord, text, space;
  xmlChar *style, *alignment;
  int alignmentId;

  //translate the nodes into abiword nodes
  if (xmlStrcasecmp(N_parent->name,BAD_CAST "page") == 0){
    for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
      transformPage(N_cur);
    }
  }
  if (xmlStrcasecmp(N_parent->name,BAD_CAST "chunk") == 0){
    //I start a <p> on each chunk and add all word containment
    N_text = xmlNewChild(N_Block, NULL, BAD_CAST "p", NULL);

    // xmlGetProp() returns an allocated copy; fetch it once, use it for both
    // the test and the new attribute, then release it
    style = xmlGetProp(N_parent,BAD_CAST "style");
    if (int(xmlXPathCastStringToNumber(style)) > 0){
      xmlNewProp(N_text, BAD_CAST "style", style);
    }
    if (style)
      xmlFree(style);

    alignment = xmlGetProp(N_parent,BAD_CAST "alignment");
    alignmentId = int(xmlXPathCastStringToNumber(alignment));
    if (alignment)
      xmlFree(alignment);
    switch (alignmentId){
    case 1: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:left");
           break;
    case 2: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:right");
           break;
    case 3: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:center");
           break;
    case 4: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:justify");
           break;
    }

    for (N_curLine = N_parent->children; N_curLine; N_curLine = N_curLine->next){
      for (N_curText = N_curLine->children; N_curText; N_curText = N_curText->next){
        for (N_curWord = N_curText->children; N_curWord; N_curWord = N_curWord->next){
          // move the word's first child into the paragraph and separate it
          // from the next word with a single space
          text = N_curWord->children;
          xmlUnlinkNode(text);
          xmlAddChild(N_text,text);
          space = xmlNewText(BAD_CAST " ");
          xmlAddChild(N_text,space);
        }
      }
    }
  }
  if (xmlStrcasecmp(N_parent->name,BAD_CAST "column") == 0){
    for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
      transformPage(N_cur);
    }
    // column break, attached to the paragraph written last
    xmlNewChild(N_text, NULL, BAD_CAST "cbr", NULL);
  }
  if (xmlStrcasecmp(N_parent->name,BAD_CAST "colset") == 0){
    //create new section columns: count childNodes of N_parent
    //recurse through chunks and create textNodes
    N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
    snprintf(buf, sizeof(buf), "columns:%d", xmlLsCountNode(N_parent));
    xmlNewProp(N_Block, BAD_CAST "props", BAD_CAST buf);
    for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
      transformPage(N_cur);
    }
    // content after the column set goes into a new section
    N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
  }
}
1.1.8 by Martin Pitt
Import upstream version 0.5.91
1100
1101
//Count nodes, copied from debugxml.c from libxml
1102
// libxml copyright file below
1103
/*
1104
Except where otherwise noted in the source code (e.g. the files hash.c,
1105
list.c and the trio files, which are covered by a similar licence but
1106
with different Copyright notices) all the files are:
1107
1108
 Copyright (C) 1998-2003 Daniel Veillard.  All Rights Reserved.
1109
1110
Permission is hereby granted, free of charge, to any person obtaining a copy
1111
of this software and associated documentation files (the "Software"), to deal
1112
in the Software without restriction, including without limitation the rights
1113
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1114
copies of the Software, and to permit persons to whom the Software is fur-
1115
nished to do so, subject to the following conditions:
1116
1117
The above copyright notice and this permission notice shall be included in
1118
all copies or substantial portions of the Software.
1119
1120
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1121
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
1122
NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
1123
DANIEL VEILLARD BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
1124
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CON-
1125
NECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1126
1127
Except as contained in this notice, the name of Daniel Veillard shall not
1128
be used in advertising or otherwise to promote the sale, use or other deal-
1129
ings in this Software without prior written authorization from him.
1130
*/
1131
// Returns a size for `node` that depends on its type:
//   - element, document and attribute nodes: the number of direct children;
//   - text, CDATA, processing-instruction and comment nodes: the length of
//     their content (0 when there is none);
//   - the remaining node types listed below: 1.
// A NULL node yields 0.
int ABWOutputDev::xmlLsCountNode(xmlNodePtr node) {
  if (!node)
    return 0;

  int count = 0;
  xmlNodePtr child = NULL;   // head of the child list to be walked, if any

  switch (node->type) {
    // nodes whose children are counted
    case XML_ELEMENT_NODE:
      child = node->children;
      break;
    case XML_DOCUMENT_NODE:
    case XML_HTML_DOCUMENT_NODE:
#ifdef LIBXML_DOCB_ENABLED
    case XML_DOCB_DOCUMENT_NODE:
#endif
      child = ((xmlDocPtr) node)->children;
      break;
    case XML_ATTRIBUTE_NODE:
      child = ((xmlAttrPtr) node)->children;
      break;

    // nodes measured by the length of their content
    case XML_TEXT_NODE:
    case XML_CDATA_SECTION_NODE:
    case XML_PI_NODE:
    case XML_COMMENT_NODE:
      if (node->content)
        count = xmlStrlen(node->content);
      break;

    // everything else counts as a single item
    case XML_ENTITY_REF_NODE:
    case XML_DOCUMENT_TYPE_NODE:
    case XML_ENTITY_NODE:
    case XML_DOCUMENT_FRAG_NODE:
    case XML_NOTATION_NODE:
    case XML_DTD_NODE:
    case XML_ELEMENT_DECL:
    case XML_ATTRIBUTE_DECL:
    case XML_ENTITY_DECL:
    case XML_NAMESPACE_DECL:
    case XML_XINCLUDE_START:
    case XML_XINCLUDE_END:
      count = 1;
      break;
  }

  while (child != NULL) {
    ++count;
    child = child->next;
  }
  return count;
}