2
* Copyright 2009 CrossWire Bible Society (http://www.crosswire.org)
3
* CrossWire Bible Society
7
* This program is free software; you can redistribute it and/or modify it
8
* under the terms of the GNU General Public License as published by the
9
* Free Software Foundation version 2.
11
* This program is distributed in the hope that it will be useful, but
12
* WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
* General Public License for more details.
38
#include <lzsscomprs.h>
39
#include <zipcomprs.h>
40
#include <cipherfil.h>
44
#include <latin1utf8.h>
47
#ifndef NO_SWORD_NAMESPACE
48
using namespace sword;
53
// Turn debugging on and off
// Each DEBUG_* constant is a distinct bit; the code tests them as `debug & DEBUG_X`,
// so several categories can be enabled at once by OR-ing the values together.
56
const int DEBUG_WRITE = 1; // writing to module
57
const int DEBUG_VERSE = 2; // verse start and end
58
const int DEBUG_QUOTE = 4; // quotes, especially Words of Christ (WOC)
59
const int DEBUG_TITLE = 8; // titles
60
const int DEBUG_INTERVERSE = 16; // inter-verse material
61
const int DEBUG_XFORM = 32; // transformations
62
const int DEBUG_REV11N = 64; // versification
63
const int DEBUG_REF = 128; // parsing of osisID and osisRef
64
const int DEBUG_STACK = 256; // push and pop of the tag stack used for nesting checks
65
const int DEBUG_OTHER = 512; // ins and outs of books, chapters and verses
68
// Process exit codes
const int EXIT_BAD_ARG = 1; // Bad parameter given for program
69
const int EXIT_NO_WRITE = 2; // Could not open the module for writing
70
const int EXIT_NO_CREATE = 3; // Could not create the module
71
const int EXIT_NO_READ = 4; // Could not open the input file for reading.
72
const int EXIT_BAD_NESTING = 5; // BSP or BCV nesting is bad
82
// The verse currently being processed; handleToken() assigns it from osisID attributes
// and writeEntry()/linkToEntry() temporarily repoint it while writing.
VerseKey currentVerse;
84
// osisID of the entry whose text is being accumulated in activeVerseText (see writeEntry)
char activeOsisID[255];
85
// OSIS reference of currentVerse, kept as text for log messages
char currentOsisID[255];
87
// Text collected so far for the entry named by activeOsisID
SWBuf activeVerseText;
89
ListKey currentKeyIDs = ListKey();
91
// Verse lists that name more than one verse; handleToken() saves them here so that
// links can be produced after all verses have been seen.
std::vector<ListKey> linkedVerses;
93
static bool inCanonicalOSISBook = true; // false while the osisID is for a book that is not in Sword's canon
94
static bool normalize = true; // Whether to normalize UTF-8 to NFC
96
// Report whether buf is an OSIS book name known to the versification system
// that currentVerse is using.
// param buf the OSIS book name to look up
// return true when getBookNumberByOSISName() yields a non-negative book number
bool isOSISAbbrev(const char *buf) {
97
VerseMgr *vmgr = VerseMgr::getSystemVerseMgr();
98
// Look the name up in the versification selected for the module being built
const VerseMgr::System *av11n = vmgr->getVersificationSystem(currentVerse.getVersificationSystem());
99
return av11n->getBookNumberByOSISName(buf) >= 0;
103
/**
 * Determine whether the string contains a valid unicode sequence.
 *
 * The following table gives the pattern of a valid UTF-8 character.
 *   Unicode Range               1st       2nd       3rd       4th
 *   U-00000000 - U-0000007F  0nnnnnnn
 *   U-00000080 - U-000007FF  110nnnnn  10nnnnnn
 *   U-00000800 - U-0000FFFF  1110nnnn  10nnnnnn  10nnnnnn
 *   U-00010000 - U-001FFFFF  11110nnn  10nnnnnn  10nnnnnn  10nnnnnn
 *
 * Notes:
 *   1. The latest UTF-8 RFC allows for a max of 4 bytes.
 *   2. The number of bits of the leading byte before the first 0
 *      is the total number of bytes.
 *   3. The "n" are the bits of the unicode codepoint.
 * This routine does not check to see if the code point is in the range;
 * only the byte structure is examined.
 *
 * param  txt the NUL-terminated text to check; a null pointer is treated
 *            like text without high order characters
 * return  1 if all high order characters form a valid unicode sequence
 *        -1 if there are no high order characters.
 *           Note: this is also a valid unicode sequence
 *         0 if there are high order characters that do not form
 *           a valid unicode sequence
 */
int detectUTF8(const char *txt) {
    // Nothing to examine: by definition there are no high order characters.
    if (!txt) return -1;

    unsigned int countUTF8 = 0;
    int count = 0;

    // Cast it to make masking and shifting easier
    const unsigned char *p = reinterpret_cast<const unsigned char *>(txt);

    while (*p) {
        // Is the high order bit set?
        if (*p & 0x80) {
            // Then count the number of high order bits that are set.
            // This determines the number of following bytes
            // that are a part of the unicode character
            unsigned char i = *p;
            for (count = 0; i & 0x80; count++) {
                i <<= 1;
            }

            // Verify that the number of bytes is valid
            // Count 0: bug in code that would cause core walking
            // Count 1: is a pattern of 10nnnnnn,
            //          which does not signal the start of a unicode character
            // Count 5 to 8: 111110nn, 1111110n and 11111110 and 11111111
            //          are not legal starts, either
            if (count < 2 || count > 4) return 0;

            // At this point we expect (count - 1) following characters
            // of the pattern 10nnnnnn
            while (--count && *++p) {
                // The pattern of each following character must be: 10nnnnnn
                // So, compare the top 2 bits.
                if ((0xc0 & *p) != 0x80) return 0;
            }

            // Oops, we've run out of bytes too soon: Cannot be UTF-8
            if (count) return 0;

            // We have a valid UTF-8 character, so count it
            countUTF8++;
        }

        // Advance to the next character to examine.
        p++;
    }

    // At this point it is either UTF-8 or 7-bit ascii
    return countUTF8 ? 1 : -1;
}
175
// Prepare the text of one entry for storage: check whether it is UTF-8, report
// entries that are not, and run the text through the `converter` and `normalizer`
// filters.
// param osisID the reference used to label the diagnostic messages
// param text   the entry text; it is modified in place by the filters
// NOTE(review): the declarations of `converter`, `normalizer` and `before`, and the
// conditions guarding the conversion and normalization steps, are not visible in
// this view of the file -- confirm against the full source.
void prepareSWText(const char *osisID, SWBuf &text)
177
// Always check on UTF8 and report on non-UTF8 entries
178
int utf8State = detectUTF8(text.c_str());
180
// Trust, but verify.
// utf8State is 0 when the text has high order bytes that are not valid UTF-8
181
if (!normalize && !utf8State) {
182
cout << "WARNING(UTF8): " << osisID << ": Should be converted to UTF-8 (" << text << ")" << endl;
187
// Don't need to normalize text that is ASCII
188
// But assume other non-UTF-8 text is Latin1 (cp1252) and convert it to UTF-8
190
cout << "INFO(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl;
191
converter.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
194
// Prepare for double check. This probably can be removed.
195
// But for now we are running the check again.
196
// This is to determine whether we need to normalize output of the conversion.
197
utf8State = detectUTF8(text.c_str());
200
// Double check. This probably can be removed.
202
cout << "ERROR(UTF8): " << osisID << ": Converting to UTF-8 (" << text << ")" << endl;
207
normalizer.processText(text, (SWKey *)2); // note the hack of 2 to mimic a real key. TODO: remove all hacks
208
// Detect whether the normalizer changed anything
if (before != text) {
216
// This routine converts an osisID or osisRef into one that SWORD can parse into a verse list
217
// An osisRef is made up of:
221
// an osisRef osisRef
223
// An osisID can have a work prefix which is terminated by a : and may have a grain
224
// which is started by a !
226
// However, SWORD cannot handle work prefixes or grains and expects ranges to be
227
// separated with a single ';'
//
// param buf the osisID or osisRef text; it is rewritten in place and its size is
//           adjusted to the shortened result
// NOTE(review): the declarations of the cursors `p` and `n` and the outer scanning
// loop are not visible in this view of the file.
228
void prepareSWVerseKey(SWBuf &buf) {
229
// This routine modifies the buf in place
230
// s is the write position within buf's own storage
char* s = buf.getRawData();
232
bool inRange = false;
236
if (debug & DEBUG_REF) {
237
cout << "DEBUG(REF): Copy range marker:" << *p << endl;;
240
// Range markers are copied as is
244
// Look ahead to see if we are in a work prefix
245
// but don't look past an osisID
247
while (*n && *n != ':' && *n != ' ' && *n != '-') {
251
// We have found a work prefix
253
// set p to skip the work prefix
256
if (debug & DEBUG_REF) {
257
cout << "DEBUG(REF): Found a work prefix ";
258
for (char *x = s; x <= n; x++) {
266
// Now we are in the meat of an osisID.
267
// Copy it to its end but stop on a grain marker of '!'
269
if (debug & DEBUG_REF) {
270
cout << "DEBUG(REF): Copy osisID:";
273
while (*p && *p != '!' && *p != ' ' && *p != '-') {
275
if (debug & DEBUG_REF) {
282
if (debug & DEBUG_REF) {
287
// The ! and everything following until we hit
288
// the end of the osisID is part of the grain reference
291
while (*n && *n != ' ' && *n != '-') {
295
if (debug & DEBUG_REF) {
296
cout << "DEBUG(REF): Found a grain suffix ";
297
for (char *x = p; x < n; x++) {
306
// At this point we have processed an osisID
308
// if we are not in a range and the next character is a -
309
// then we are entering a range
310
inRange = !inRange && *p == '-';
313
if (debug & DEBUG_REF) {
315
cout << "DEBUG(REF): Found a range" << endl;
320
// between ranges and stand alone osisIDs we might have whitespace
321
if (!inRange && *p == ' ') {
322
// skip this and subsequent spaces
326
// replacing them all with a ';'
329
if (debug & DEBUG_REF) {
330
cout << "DEBUG(REF): replacing space with ;. Remaining: " << p << endl;
336
// Determine whether we have modified the buffer
337
// We have modified the buffer if s is not sitting on the null byte of the original
339
// null terminate the reference
341
// Since we modified the swbuf, we need to tell it what we have done
342
// The new size is the distance from the start of the buffer to the write position
buf.setSize(s - buf.c_str());
344
if (debug & DEBUG_REF) {
345
cout << "DEBUG(REF): shortended keyVal to`" << buf.c_str() << "`"<< endl;
352
* Determine whether a verse as given is valid for the versification.
353
* This is done by comparing the before and after of normalization.
355
// param buf the reference to test
// The reference is loaded into two keys that use currentVerse's versification:
// `before` with auto normalization off and `after` with it on.
// NOTE(review): the declarations of `before` and `after`, the assignments of buf to
// them, the comparison and the return statements are not visible in this view.
bool isValidRef(const char *buf) {
356
// Create a VerseKey that does not do auto normalization
357
// Note: need to turn on headings so that a heading does not get normalized anyway
358
// And set it to the reference under question
360
before.setVersificationSystem(currentVerse.getVersificationSystem());
361
before.AutoNormalize(0);
365
// If we are a heading we must bail
366
// These will autonormalize to the last verse of the prior chapter
// A zero testament, book, chapter or verse marks a heading position
367
if (!before.Testament() || !before.Book() || !before.Chapter() || !before.Verse()) {
371
// Create a VerseKey that does do auto normalization
372
// And set it to the reference under question
374
after.setVersificationSystem(currentVerse.getVersificationSystem());
375
after.AutoNormalize(1);
383
// If we have gotten here the reference is not in the selected versification.
384
cout << "INFO(V11N): " << before << " is not in the " << currentVerse.getVersificationSystem() << " versification." << endl;
387
if (debug & DEBUG_REV11N) {
388
cout << "DEBUG(V11N): " << before << " normalizes to " << after << endl;
396
* This routine is used to ensure that all the text in the input is saved to the module.
397
* Assumption: The input orders all the verses for a chapter in numerical order. Thus, any
398
* verses that are not in the chosen versification (v11n) follow those that are.
400
* The prior implementation of this adjusted the verse to the last one that is in the chosen v11n.
401
* If the chapter were extra, then it is appended to the last verse of the last
402
* chapter in the chosen v11n for that book. If it is just extra verses for a chapter, then it is
403
* appended to the last verse of the chapter.
405
* The problem with this is when an OSIS verse refers to more than one verse, e.g.
406
* osisID="Gen.1.29 Gen.1.30 Gen.1.31" (Gen.1.31 is the last verse of the chapter in the chosen v11n)
407
* and then it is followed by Gen.1.32.
409
* This routine assumes that linking is postponed to the end so that in the example Gen.1.30-31
410
* are not linked but rather empty. This routine will then find the last verse in the computed
411
* chapter that has content.
413
* Alternatively, we could have done linking as we went, but this routine would have needed
414
* to find the first entry in the link set and elsewhere in the code when appending to a
415
* verse, it would need to be checked for adjacent links and those would have needed to be adjusted.
417
* param key the key that may need to be adjusted
419
// Adjust a key that is outside the versification so that it names an existing
// position to which the extra content can be appended.
// param key the key to adjust; it is modified in place
// NOTE(review): the statements that constrain the verse and the body of the final
// search loop are not visible in this view of the file.
void makeValidRef(VerseKey &key) {
421
// Limits as reported by the key for its current position
int chapterMax = key.getChapterMax();
422
int verseMax = key.getVerseMax();
425
if (debug & DEBUG_REV11N) {
426
cout << "DEBUG(V11N) Chapter max:" << chapterMax << ", Verse Max:" << verseMax << endl;
430
// No endl here: the message is completed by the " Appending content to" output below
cout << "INFO(V11N): " << key.getOSISRef() << " is not in the " << key.getVersificationSystem() << " versification.";
431
// Since isValidRef returned false constrain the key to the nearest prior reference.
432
// If we are past the last chapter set the reference to the last chapter
433
if (key.Chapter() > chapterMax) {
434
key.Chapter(chapterMax);
437
// Either we set the chapter to the last chapter and now need to set to the last verse in the chapter
438
// Or the verse is beyond the end of the chapter.
439
// In any case we need to constrain the verse to its chapter.
442
// There are three cases we want to handle:
443
// In the examples we are using the KJV versification where the last verse of Matt.7 is Matt.7.29.
444
// In each of these cases the out-of-versification, extra verse is Matt.7.30.
445
// 1) The "extra" verse follows the last verse in the chapter.
446
// <verse osisID="Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse>
447
// In this case re-versify Matt.7.30 as Matt.7.29.
449
// 2) The "extra" verse follows a range (a set of linked verses).
450
// <verse osisID="Matt.7.28-Matt.7.29">...</verse><verse osisID="Matt.7.30">...</verse>
451
// In this case, re-versify Matt.7.30 as Matt.7.28, the first verse in the linked set.
452
// Since we are postponing linking, we want to re-versify to the last entry in the module.
454
// 3) The last verse in the chapter is not in the input. There may be other verses missing as well.
455
// <verse osisID="Matt.7.8">...</verse><verse osisID="Matt.7.30">...</verse>
456
// In this case we should re-versify Matt.7.30 as Matt.7.29.
457
// However, since this and 2) are ambiguous, we'll re-versify to the last entry in the module.
459
// Search while the key is still valid and the module has no entry at it
while (!key.Error() && !module->hasEntry(&key)) {
463
cout << " Appending content to " << key.getOSISRef() << endl;
466
// Accumulate text for the current verse and, when the verse changes, write the text
// collected for the previous verse to the module.
// param text  the text to add to the entry being collected
// param force defaults to false; the visible code substitutes the key "-force",
//             which can never equal a real osisID -- presumably so that the
//             collected entry is written out (the guarding condition is not visible)
// NOTE(review): the declarations of `keyOsisID`, `saveKey` and `t`, and several
// branch conditions, are not visible in this view of the file.
void writeEntry(SWBuf &text, bool force = false) {
469
static const char* revision = "<milestone type=\"x-importer\" subType=\"x-osis2mod\" n=\"$Rev: 2400 $\"/>";
470
// Track whether the revision milestone still has to be written for each testament
static bool firstOT = true;
471
static bool firstNT = true;
473
if (!inCanonicalOSISBook) {
477
strcpy(keyOsisID, currentVerse.getOSISRef());
479
// set keyOsisID to anything that an osisID cannot be.
481
strcpy(keyOsisID, "-force");
484
// Key of the entry being collected; kept across calls
static VerseKey lastKey;
485
lastKey.setVersificationSystem(currentVerse.getVersificationSystem());
486
lastKey.AutoNormalize(0);
490
saveKey.setVersificationSystem(currentVerse.getVersificationSystem());
491
saveKey.AutoNormalize(0);
493
// Remember the caller's position; it is restored before returning
saveKey = currentVerse;
495
// If we have seen a verse and the supplied one is different then we output the collected one.
496
if (*activeOsisID && strcmp(activeOsisID, keyOsisID)) {
498
if (!isValidRef(lastKey)) {
499
makeValidRef(lastKey);
502
currentVerse = lastKey;
504
prepareSWText(activeOsisID, activeVerseText);
506
// Put the revision into the module
507
int testmt = currentVerse.Testament();
508
if ((testmt == 1 && firstOT) || (testmt == 2 && firstNT)) {
510
t.setVersificationSystem(currentVerse.getVersificationSystem());
514
// Book, chapter and verse 0 address the testament heading position
currentVerse.Book(0);
515
currentVerse.Chapter(0);
516
currentVerse.Verse(0);
517
module->setEntry(revision);
529
// If the entry already exists, then append this entry to the text.
530
// This is for verses that are outside the chosen versification. They are appended to the prior verse.
531
// The space should not be needed if we retained verse tags.
532
SWBuf currentText = module->getRawEntry();
533
if (currentText.length()) {
534
cout << "INFO(WRITE): Appending entry: " << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
535
activeVerseText = currentText + " " + activeVerseText;
539
if (debug & DEBUG_WRITE) {
540
cout << "DEBUG(WRITE): " << activeOsisID << ":" << currentVerse.getOSISRef() << ": " << activeVerseText << endl;
544
module->setEntry(activeVerseText);
545
activeVerseText = "";
548
// The following is for initial verse content and for appending interverse content.
549
// Eliminate leading whitespace on the beginning of each verse and
550
// before we append to current content, since we just added one
552
if (activeVerseText.length()) {
553
activeVerseText += " ";
554
activeVerseText += text;
557
activeVerseText = text;
559
// text has been consumed so clear it out.
562
currentVerse = saveKey;
563
lastKey = currentVerse;
564
// NOTE(review): unbounded strcpy into activeOsisID, which is a 255-byte array
strcpy(activeOsisID, keyOsisID);
567
// Link the entry at linkKey to the entry at dest in the module.
// param linkKey the verse that is to become a link
// param dest    the verse whose entry is linked to
// currentVerse is pointed at linkKey for the duration of the call and restored afterwards.
// NOTE(review): the declaration of `saveKey` and the body of the validity check are
// not visible in this view of the file.
void linkToEntry(VerseKey &linkKey, VerseKey &dest) {
569
// Only link verses that are in the versification.
570
if (!isValidRef(linkKey)) {
575
saveKey.setVersificationSystem(currentVerse.getVersificationSystem());
576
saveKey.AutoNormalize(0);
578
saveKey = currentVerse;
579
currentVerse = linkKey;
581
cout << "INFO(LINK): Linking " << currentVerse.getOSISRef() << " to " << dest.getOSISRef() << "\n";
582
module->linkEntry(&dest);
584
// Put the caller's position back
currentVerse = saveKey;
587
// Process one XML tag from the OSIS input together with the text collected before it.
// param text  the text accumulated so far
// param token the tag that has just been read
// Return true if the content was handled or is to be ignored.
588
// false if what has been seen is to be accumulated and considered later.
// The function keeps its parsing state (header/verse flags, tag and quote stacks,
// nesting depths) in function-local statics, so that state persists between calls.
// NOTE(review): many statements, including the start-tag/end-tag branch heads and the
// return statements, are not visible in this view of the file.
589
bool handleToken(SWBuf &text, XMLTag token) {
591
// Everything between the begin book tag and the first begin chapter tag is inBookHeader
592
static bool inBookHeader = false;
594
// Everything between the begin chapter tag and the first begin verse tag is inChapterHeader
595
static bool inChapterHeader = false;
597
// Flags indicating whether we are processing the content of a chapter
598
static bool inChapter = false;
600
// Flags indicating whether we are processing the content of a verse
601
static bool inVerse = false;
603
// Flag indicating whether we are processing content that is to be prepended to a verse
604
static bool inPreVerse = false;
605
// Counter used to number the generated x-preverse milestones (pv1, pv2, ...)
static int genID = 1;
607
// Flag indicating whether we are in "Words of Christ"
608
static bool inWOC = false;
609
// Tag for WOC quotes within a verse
610
static XMLTag wocTag = "<q who=\"Jesus\" marker=\"\">";
612
// Flag used to indicate where useful text begins
613
static bool firstDiv = false;
615
// Stack of quote elements used to handle Words of Christ
616
static std::stack<XMLTag> quoteStack;
618
// Stack of elements used to validate that books, chapters and verses are well-formed
619
// This goes beyond simple xml well-formed and also considers milestoned div, chapter and verse
620
// to be begin and end tags, too.
621
// It is an error if books and chapters are not well formed (though not required by OSIS)
622
// It is a warning that verses are not well formed (because some clients are not ready)
623
static std::stack<XMLTag> tagStack;
625
// The following are used to validate well-formedness
626
static int chapterDepth = 0;
627
static int bookDepth = 0;
628
static int verseDepth = 0;
630
int tagDepth = tagStack.size();
631
const char *tokenName = token.getName();
632
// An element carrying an eID attribute is treated as an end tag as well
bool isEndTag = token.isEndTag() || token.getAttribute("eID");
633
const char *typeAttr = token.getAttribute("type");
635
// process start tags
638
// Remember non-empty start tags
639
if (!token.isEmpty()) {
640
tagStack.push(token);
642
if (debug & DEBUG_STACK) {
643
cout << "DEBUG(STACK): " << currentOsisID << ": push (" << tagStack.size() << ") " << token.getName() << endl;
648
// throw away everything up to the first div
650
if (!strcmp(tokenName, "div")) {
652
if (debug & DEBUG_OTHER) {
653
cout << "DEBUG(FOUND): Found first div and pitching prior material: " << text << endl;
656
// TODO: Save off the content to use it to suggest the module's conf.
661
// Collect the content so it can be used to suggest the module's conf.
666
//-- WITH osisID OR annotateRef -------------------------------------------------------------------------
667
// Handle Book, Chapter, and Verse (or commentary equivalent)
668
if (token.getAttribute("osisID") || token.getAttribute("annotateRef")) {
670
// BOOK START, <div type="book" ...>
671
if ((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "book"))) {
672
if (inBookHeader || inChapterHeader) { // this one should never happen, but just in case
674
if (debug & DEBUG_TITLE) {
675
cout << "DEBUG(TITLE): " << currentOsisID << ": OOPS HEADING " << endl;
676
cout << "\tinChapterHeader = " << inChapterHeader << endl;
677
cout << "\tinBookHeader = " << inBookHeader << endl;
680
currentVerse.Testament(0);
681
currentVerse.Book(0);
682
currentVerse.Chapter(0);
683
currentVerse.Verse(0);
686
// Position on the book named by the osisID, at chapter 0 verse 0
currentVerse = token.getAttribute("osisID");
687
currentVerse.Chapter(0);
688
currentVerse.Verse(0);
689
strcpy(currentOsisID, currentVerse.getOSISRef());
695
inChapterHeader = false;
697
// Remember the nesting depth at which this book was opened
bookDepth = tagStack.size();
701
inCanonicalOSISBook = isOSISAbbrev(token.getAttribute("osisID"));
702
if (!inCanonicalOSISBook) {
703
cout << "WARNING(V11N): New book is " << token.getAttribute("osisID") << " and is not in " << v11n << " versification, ignoring" << endl;
706
else if (debug & DEBUG_OTHER) {
707
cout << "DEBUG(FOUND): New book is " << currentVerse.getOSISRef() << endl;
714
// CHAPTER START, <div type="chapter" ...> or <chapter ...>
715
if (((!strcmp(tokenName, "div")) && (typeAttr && !strcmp(typeAttr, "chapter"))) ||
716
(!strcmp(tokenName, "chapter"))
720
if (debug & DEBUG_TITLE) {
721
cout << "DEBUG(TITLE): " << currentOsisID << ": BOOK HEADING "<< text.c_str() << endl;
727
// Position on the chapter named by the osisID, at verse 0
currentVerse = token.getAttribute("osisID");
728
currentVerse.Verse(0);
730
if (debug & DEBUG_OTHER) {
731
cout << "DEBUG(FOUND): Current chapter is " << currentVerse.getOSISRef() << " (" << token.getAttribute("osisID") << ")" << endl;
734
strcpy(currentOsisID, currentVerse.getOSISRef());
739
inBookHeader = false;
740
inChapterHeader = true;
742
// Remember the nesting depth at which this chapter was opened
chapterDepth = tagStack.size();
748
// VERSE, <verse ...> OR COMMENTARY START, <div annotateType="xxx" ...>
749
if (!strcmp(tokenName, "verse") ||
750
(!strcmp(tokenName, "div") && token.getAttribute("annotateType"))) {
752
if (debug & DEBUG_OTHER) {
753
cout << "DEBUG(FOUND): Entering verse" << endl;
756
if (inChapterHeader) {
757
SWBuf heading = text;
760
if (heading.length()) {
762
if (debug & DEBUG_TITLE) {
763
cout << "DEBUG(TITLE): " << currentOsisID << ": CHAPTER HEADING "<< heading.c_str() << endl;
769
inChapterHeader = false;
772
// Did we have pre-verse material that needs to be marked?
775
sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" eID=\"pv%d\"/>", genID++);
779
// Get osisID for verse or annotateRef for commentary
780
SWBuf keyVal = token.getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID");
782
// Massage the key into a form that ParseVerseList can accept
783
prepareSWVerseKey(keyVal);
785
// The osisID or annotateRef can be more than a single verse
786
// The first or only one is the currentVerse
787
// Use the last verse seen (i.e. the currentVerse) as the basis for recovering from bad parsing.
788
// This should never happen if the references are valid OSIS references
789
ListKey verseKeys = currentVerse.ParseVerseList(keyVal, currentVerse, true);
790
int memberKeyCount = verseKeys.Count();
791
if (memberKeyCount) {
792
currentVerse = verseKeys.getElement(0);
793
// See if this osisID or annotateRef refers to more than one verse.
794
// If it does, save it until all verses have been seen.
795
// At that point we will output links.
796
// This can be done by incrementing, which will produce an error
797
// if there is only one verse.
798
verseKeys.setPosition(TOP);
799
verseKeys.increment(1);
800
if (!verseKeys.Error()) {
801
linkedVerses.push_back(verseKeys);
805
cout << "ERROR(REF): Invalid osisID/annotateRef: " << token.getAttribute(strcmp(tokenName, "verse") ? "annotateRef" : "osisID") << endl;
808
strcpy(currentOsisID, currentVerse.getOSISRef());
810
if (debug & DEBUG_OTHER) {
811
cout << "DEBUG(FOUND): New current verse is " << currentVerse.getOSISRef() << endl;
812
cout << "DEBUG(FOUND): osisID/annotateRef is adjusted to: " << keyVal << endl;
818
inBookHeader = false;
819
inChapterHeader = false;
820
// Remember the nesting depth at which this verse was opened
verseDepth = tagStack.size();
822
// Include the token if it is not a verse
823
if (strcmp(tokenName, "verse")) {
827
else if (debug & DEBUG_VERSE)
829
// transform the verse into a milestone
830
XMLTag t = "<milestone resp=\"v\" />";
831
// copy all the attributes of the verse element to the milestone
832
StringList attrNames = token.getAttributeNames();
833
for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) {
834
const char* attr = (*loop).c_str();
835
t.setAttribute(attr, token.getAttribute(attr));
846
} // done with Handle Book, Chapter, and Verse (or commentary equivalent)
848
// Now consider everything else.
850
// Handle WOC quotes.
851
// Note this requires transformBSP to make them into milestones
852
// Otherwise have to do it here
853
if (!strcmp(tokenName, "q")) {
854
quoteStack.push(token);
856
if (debug & DEBUG_QUOTE) {
857
cout << "DEBUG(QUOTE): " << currentOsisID << ": quote top(" << quoteStack.size() << ") " << token << endl;
860
if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) {
863
// Output per verse WOC markup.
866
// Output the quotation mark if appropriate, inside the WOC.
867
// If there is no marker attribute, let the SWORD engine manufacture one.
868
// If there is a marker attribute and it has content, then output that.
869
// If the marker attribute is present and empty, then there is nothing to do.
870
// And have it within the WOC markup
871
if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) {
872
token.setAttribute("who", 0); // remove the who="Jesus"
880
// Have we found the start of pre-verse material?
881
// Pre-verse material follows the following rules
882
// 1) Between the opening of a book and the first chapter, all the material is handled as an introduction to the book.
883
// 2) Between the opening of a chapter and the first verse, the material is split between the introduction of the chapter
884
// and the first verse of the chapter.
885
// A <div> with a type other than section will be taken as a chapter introduction.
886
// A <title> of type acrostic, psalm or no type, will be taken as a title for the verse.
887
// A <title> of type main or chapter will be seen as a chapter title.
888
// 3) Between verses, the material is split between the prior verse and the next verse.
889
// Basically, while end and empty tags are found, they belong to the prior verse.
890
// Once a begin tag is found, it belongs to the next verse.
891
// If the title has an attribute type of "main" or "chapter"
892
// it belongs to its <div> or <chapter> and is treated as part of its heading
893
// Otherwise, if it is a title in a chapter before the first verse, it
894
// is put into the verse as a preverse title.
896
if (!inPreVerse && !inBookHeader) {
897
if (inChapterHeader) {
898
// Determine when we are no longer in a chapter heading, but in pre-verse material:
899
// If we see one of the following:
901
// a title that is not main or chapter
902
if ((!strcmp(tokenName, "div") && (typeAttr && !strcmp(typeAttr, "section"))) ||
903
(!strcmp(tokenName, "title") && (!typeAttr || (strcmp(typeAttr, "main") && strcmp(typeAttr, "chapter"))))
905
// Since we have found the boundary, we need to write out the chapter heading
907
// And we are no longer in the chapter heading
908
inChapterHeader = false;
909
// But rather, we are now in pre-verse material
913
else if (!inVerse && inChapter) {
919
sprintf(genBuf, "<div type=\"x-milestone\" subType=\"x-preverse\" sID=\"pv%d\"/>", genID++);
925
if (debug & DEBUG_INTERVERSE) {
926
if (!inVerse && !inBookHeader && !inChapterHeader) {
927
cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse start token " << token << ":" << text.c_str() << endl;
933
} // Done with processing start and empty tags
938
// An end tag with nothing on the stack cannot be matched to a start tag
if (tagStack.empty()) {
939
cout << "FATAL(NESTING): " << currentOsisID << ": tag expected" << endl;
940
exit(EXIT_BAD_NESTING);
943
// Note: empty end tags have the eID attribute
944
if (!token.isEmpty()) {
945
XMLTag topToken = tagStack.top();
946
tagDepth = tagStack.size();
948
if (debug & DEBUG_STACK) {
949
cout << "DEBUG(STACK): " << currentOsisID << ": pop(" << tagDepth << ") " << topToken.getName() << endl;
954
// The end tag must close the most recently opened element
if (strcmp(topToken.getName(), tokenName)) {
955
cout << "FATAL(NESTING): " << currentOsisID << ": Expected " << topToken.getName() << " found " << tokenName << endl;
956
// exit(EXIT_BAD_NESTING); // (OSK) I'm sure this validity check is a good idea, but there's a bug somewhere that's killing the converter here.
957
// So I'm disabling this line. Unvalidated OSIS files shouldn't be run through the converter anyway.
958
// (DM) This has nothing to do with well-form or valid. It checks milestoned elements for proper nesting.
962
// We haven't seen the first div so there is nothing to do.
964
// Collect the content so it can be used to suggest the module's conf.
968
// VERSE and COMMENTARY END
969
if (!strcmp(tokenName, "verse") || (inVerse && !strcmp(tokenName, "div"))) {
971
if (tagDepth != verseDepth) {
972
cout << "WARNING(NESTING): verse " << currentOsisID << " is not well formed:(" << verseDepth << "," << tagDepth << ")" << endl;
975
// If we are in WOC then we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse.
981
// Include the token if it is not a verse
982
if (strcmp(tokenName, "verse")) {
986
else if (debug & DEBUG_VERSE)
988
// transform the verse into a milestone
989
XMLTag t = "<milestone resp=\"v\" />";
990
// copy all the attributes of the verse element to the milestone
991
StringList attrNames = token.getAttributeNames();
992
for (StringList::iterator loop = attrNames.begin(); loop != attrNames.end(); loop++) {
993
const char* attr = (*loop).c_str();
994
t.setAttribute(attr, token.getAttribute(attr));
1009
// Handle WOC quotes.
1010
// Note this requires transformBSP to make them into milestones
1011
// Otherwise have to manage it here
1012
if (!strcmp(tokenName, "q")) {
1013
// NOTE(review): no emptiness check on quoteStack is visible before this top()
XMLTag topToken = quoteStack.top();
1015
if (debug & DEBUG_QUOTE) {
1016
cout << "DEBUG(QUOTE): " << currentOsisID << ": quote pop(" << quoteStack.size() << ") " << topToken << " -- " << token << endl;
1021
// If we have found an end tag for a <q who="Jesus"> then we are done with the WOC
1022
// and we need to terminate the <q who="Jesus" marker=""> that was added earlier in the verse.
1023
if (token.getAttribute("who") && !strcmp(token.getAttribute("who"), "Jesus")) {
1025
if (debug & DEBUG_QUOTE) {
1026
cout << "DEBUG(QUOTE): " << currentOsisID << ": (" << quoteStack.size() << ") " << topToken << " -- " << token << endl;
1030
// The sID of the opening quote must match the eID of this closing quote
const char *sID = topToken.getAttribute("sID");
1031
const char *eID = token.getAttribute("eID");
1038
if (strcmp(sID, eID)) {
1039
cout << "ERROR(NESTING): improper nesting " << currentOsisID << ": matching (sID,eID) not found. Looking at (" << sID << "," << eID << ")" << endl;
1043
// Output the quotation mark if appropriate, inside the WOC.
1044
// If there is no marker attribute, let the SWORD engine manufacture one.
1045
// If there is a marker attribute and it has content, then output that.
1046
// If the marker attribute is present and empty, then there is nothing to do.
1047
// And have it within the WOC markup
1048
if (!token.getAttribute("marker") || token.getAttribute("marker")[0]) {
1049
token.setAttribute("who", 0); // remove the who="Jesus"
1053
// Now close the WOC
1054
text.append("</q>");
1060
// Look for the end of document, book and chapter
1061
// Also for material that goes with last entry
1062
if (!inVerse && !inBookHeader && !inChapterHeader) {
1063
// Is this the end of a chapter.
1064
if (tagDepth == chapterDepth && (!strcmp(tokenName, "div") || !strcmp(tokenName, "chapter"))) {
1073
// Is it the end of a book
1074
if (tagDepth == bookDepth && (!strcmp(tokenName, "div"))) {
1083
// Do not include the end of an osis document
1084
if (!strcmp(tokenName, "osisText") || !strcmp(tokenName, "osis")) {
1092
// When we are not inPreVerse, the interverse tags get appended to the preceding verse.
1097
if (debug & DEBUG_INTERVERSE) {
1098
cout << "DEBUG(INTERVERSE): " << currentOsisID << ": appending interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
1105
if (debug & DEBUG_INTERVERSE) {
1106
cout << "DEBUG(INTERVERSE): " << currentOsisID << ": interverse end tag: " << tokenName << "(" << tagDepth << "," << chapterDepth << "," << bookDepth << ")" << endl;
1114
} // done with Processing end tags
1120
* Support normalizations necessary for a SWORD module.
1121
* OSIS allows for document structure (Book, Section, Paragraph or BSP)
1122
* to overlap Bible versification (Book, Chapter, Verse).
1123
* Most SWORD applications need to display verses in isolation or in HTML table cells,
1124
* requiring each stored entry (i.e. verses) to be well-formed xml.
1125
* This routine normalizes container elements which could cross verse boundaries into milestones.
1126
* For most of these OSIS elements, there is a milestone form. However, p is not milestoneable.
1127
* For this reason, p is transformed into a milestoned <div type="paragraph"> element.
1128
* param t the tag to transform
1129
* return the transformed tag or the original one
1131
// transformBSP: rewrites container start/end tags into milestone form
// (sID on the start tag, eID on the matching end tag).
// Start tags that are transformed are pushed on a function-local static
// stack so the matching end tag can later be paired with them.
// NOTE(review): `buf` and `sID` used below are declared on lines not visible
// in this excerpt -- confirm their definitions before changing this routine.
XMLTag transformBSP(XMLTag t) {
1132
// Holds the start tags seen so far; static, so it persists across calls.
static std::stack<XMLTag> bspTagStack;
1136
// Support simplification transformations
1139
if (debug & DEBUG_XFORM) {
1140
cout << "DEBUG(XFORM): " << currentOsisID << ": xform empty " << t << endl;
1146
const char* tagName = t.getName();
1147
// Start-tag handling.
if (!t.isEndTag()) {
1148
// Transform <p> into <div type="paragraph"> and milestone it
1149
if (!strcmp(tagName, "p")) {
1150
t.setText("<div type=\"paragraph\" />");
1151
// Generate an identifier of the form "gen<N>" for the milestone.
sprintf(buf, "gen%d", sID++);
1152
t.setAttribute("sID", buf);
1155
// Transform <tag> into <tag sID="">, where tag is a milestoneable element.
1156
// The following containers are milestoneable.
1157
// abbr, closer, div, foreign, l, lg, salute, signed, speech
1159
// abbr When would this ever cross a boundary?
1160
// seg as it is used for a divineName hack
1161
// foreign so that it can be easily italicized
1162
else if (!strcmp(tagName, "chapter") ||
1163
!strcmp(tagName, "closer") ||
1164
!strcmp(tagName, "div") ||
1165
!strcmp(tagName, "l") ||
1166
!strcmp(tagName, "lg") ||
1167
!strcmp(tagName, "q") ||
1168
!strcmp(tagName, "salute") ||
1169
!strcmp(tagName, "signed") ||
1170
!strcmp(tagName, "speech") ||
1171
!strcmp(tagName, "verse")
1174
// Same "gen<N>" identifier scheme as used for <p> above.
sprintf(buf, "gen%d", sID++);
1175
t.setAttribute("sID", buf);
1177
// Remember the start tag so its end tag can be matched later.
bspTagStack.push(t);
1179
if (debug & DEBUG_XFORM) {
1180
cout << "DEBUG(XFORM): " << currentOsisID << ": xform push (" << bspTagStack.size() << ") " << t << " (tagname=" << tagName << ")" << endl;
1181
XMLTag topToken = bspTagStack.top();
1182
cout << "DEBUG(XFORM): " << currentOsisID << ": xform top(" << bspTagStack.size() << ") " << topToken << endl;
1187
// End-tag handling: look at the most recently pushed start tag.
XMLTag topToken = bspTagStack.top();
1189
if (debug & DEBUG_XFORM) {
1190
cout << "DEBUG(XFORM): " << currentOsisID << ": xform pop(" << bspTagStack.size() << ") " << topToken << endl;
1195
// Look for the milestoneable container tags handled above.
1196
if (!strcmp(tagName, "chapter") ||
1197
!strcmp(tagName, "closer") ||
1198
!strcmp(tagName, "div") ||
1199
!strcmp(tagName, "l") ||
1200
!strcmp(tagName, "lg") ||
1201
!strcmp(tagName, "p") ||
1202
!strcmp(tagName, "q") ||
1203
!strcmp(tagName, "salute") ||
1204
!strcmp(tagName, "signed") ||
1205
!strcmp(tagName, "speech") ||
1206
!strcmp(tagName, "verse")
1208
// make this a clone of the start tag with sID changed to eID
1209
// Note: in the case of </p> the topToken is a <div type="paragraph">
1211
// Copy the sID value into eID, then clear sID by passing 0.
t.setAttribute("eID", t.getAttribute("sID"));
1212
t.setAttribute("sID", 0);
1220
* Write out all links in the module.
1221
* Waiting is necessary because writeEntry might ultimately append
1222
* text to a verse moving its offset in the data file.
1223
* While we are minimizing it by postponing the write until we have
1224
* gathered the next verse, the following scenario is happening:
1225
* A module is using linked verses and has some verses that are not
1226
* in the chosen versification. If the out-of-canon verse happens following
1227
* a linked verse, the out-of-canon verse is appended to the prior
1228
* verse. Care has to be taken that the linked verses all point to
1229
* the first of the set.
1233
// Link all the verses
1235
destKey.setVersificationSystem(currentVerse.getVersificationSystem());
1236
destKey.AutoNormalize(0);
1237
destKey.Headings(1);
1240
linkKey.setVersificationSystem(currentVerse.getVersificationSystem());
1241
linkKey.AutoNormalize(0);
1242
linkKey.Headings(1);
1243
for (unsigned int i = 0; i < linkedVerses.size(); i++) {
1244
// The verseKeys is a list of verses
1245
// where the first is the real verse
1246
// and the others link to it.
1247
ListKey verseKeys = linkedVerses[i];
1248
verseKeys.setPosition(TOP);
1249
destKey = verseKeys.getElement();
1250
verseKeys.increment(1);
1252
while (!verseKeys.Error()) {
1253
linkKey = verseKeys.getElement();
1254
verseKeys.increment(1);
1255
linkToEntry(linkKey, destKey);
1260
// usage: writes the command-line help for this program to stderr.
// param app   program name, substituted into the usage line and error prefix
// param error optional message; when non-null it is printed first as "app: error"
void usage(const char *app, const char *error = 0) {
1262
if (error) fprintf(stderr, "\n%s: %s\n", app, error);
1264
fprintf(stderr, "\nusage: %s <output/path> <osisDoc> [OPTIONS]\n", app);
1265
fprintf(stderr, " <output/path>\t\t an existing folder that the module will be written\n");
1266
fprintf(stderr, " <osisDoc>\t\t path to the validated OSIS document, or '-' to read from standard input\n");
1267
fprintf(stderr, " -a\t\t\t augment module if exists (default is to create new)\n");
1268
fprintf(stderr, " -z\t\t\t use ZIP compression (default no compression)\n");
1269
fprintf(stderr, " -Z\t\t\t use LZSS compression (default no compression)\n");
1270
fprintf(stderr, " -b <2|3|4>\t\t compression block size (default 4):\n");
1271
fprintf(stderr, "\t\t\t\t 2 - verse; 3 - chapter; 4 - book\n");
1272
fprintf(stderr, " -c <cipher_key>\t encipher module using supplied key\n");
1273
fprintf(stderr, "\t\t\t\t (default no enciphering)\n");
1274
fprintf(stderr, " -N\t\t\t do not convert UTF-8 or normalize UTF-8 to NFC\n");
1275
fprintf(stderr, "\t\t\t\t (default is to convert to UTF-8, if needed,\n");
1276
fprintf(stderr, "\t\t\t\t and then normalize to NFC)\n");
1277
fprintf(stderr, "\t\t\t\t Note: UTF-8 texts should be normalized to NFC.\n");
1278
fprintf(stderr, " -s <2|4>\t\t max text size per entry (default is 2).\n");
1279
fprintf(stderr, "\t\t\t\t Note: useful for commentaries with very large entries\n");
1280
fprintf(stderr, "\t\t\t\t in uncompressed modules (default is 65535 bytes)\n");
1281
fprintf(stderr, " -v <v11n>\t\t specify a versification scheme to use (default is KJV)\n");
1282
fprintf(stderr, "\t\t\t\t Note: The following are valid values for v11n:\n");
1283
// List every versification system name reported by the system VerseMgr.
VerseMgr *vmgr = VerseMgr::getSystemVerseMgr();
1284
StringList av11n = vmgr->getVersificationSystems();
1285
for (StringList::iterator loop = av11n.begin(); loop != av11n.end(); loop++) {
1286
fprintf(stderr, "\t\t\t\t\t%s\n", (*loop).c_str());
1289
// The numeric values listed below match the DEBUG_* constants at the top of the file.
fprintf(stderr, " -d <flags>\t\t turn on debugging (default is 0)\n");
1290
fprintf(stderr, "\t\t\t\t Note: This flag may change in the future.\n");
1291
fprintf(stderr, "\t\t\t\t Flags: The following are valid values:\n");
1292
fprintf(stderr, "\t\t\t\t\t0 - no debugging\n");
1293
fprintf(stderr, "\t\t\t\t\t1 - writes to module, very verbose\n");
1294
fprintf(stderr, "\t\t\t\t\t2 - verse start and end\n");
1295
fprintf(stderr, "\t\t\t\t\t4 - quotes, especially Words of Christ (WOC)\n");
1296
fprintf(stderr, "\t\t\t\t\t8 - titles\n");
1297
fprintf(stderr, "\t\t\t\t\t16 - inter-verse material\n");
1298
fprintf(stderr, "\t\t\t\t\t32 - BSP to BCV transformations\n");
1299
fprintf(stderr, "\t\t\t\t\t64 - v11n exceptions\n");
1300
fprintf(stderr, "\t\t\t\t\t128 - parsing of osisID and osisRef\n");
1301
fprintf(stderr, "\t\t\t\t\t256 - internal stack\n");
1302
fprintf(stderr, "\t\t\t\t\t512 - miscellaneous\n");
1303
fprintf(stderr, "\t\t\t\t This flag can be used more than once.\n");
1305
fprintf(stderr, "\n");
1306
fprintf(stderr, "See http://www.crosswire.org/wiki/osis2mod for more details.\n");
1307
fprintf(stderr, "\n");
1311
// processOSIS: reads the OSIS document from `infile` one character at a time.
// Characters between '<' and '>' are collected into a token, which is passed
// through transformBSP() and then handleToken(); other characters are
// appended to the running text buffer. After the stream is exhausted the
// final entry is written and conversion/normalization counts are reported.
// param infile the input stream to read the document from
void processOSIS(istream& infile) {
1312
// Reset the global osisID trackers.
activeOsisID[0] = '\0';
1314
strcpy(currentOsisID,"N/A");
1316
// Configure the global verse key for the selected versification.
currentVerse.setVersificationSystem(v11n);
1317
currentVerse.AutoNormalize(0);
1318
currentVerse.Headings(1); // turn on mod/testmnt/book/chap headings
1319
currentVerse.Persist(1);
1321
module->setKey(currentVerse);
1322
module->setPosition(TOP);
1326
// Scanner state: inside a <...> token, and whitespace tracking.
bool intoken = false;
1327
bool inWhitespace = false;
1328
bool seeingSpace = false;
1329
char curChar = '\0';
1331
while (infile.good()) {
1333
curChar = infile.get();
1335
// skip the character if it is bad. infile.good() will catch the problem
1336
// NOTE(review): get() returns int but is stored in a plain char, so this
// comparison depends on char signedness and a 0xFF input byte may also
// compare equal to -1 -- confirm intended behaviour.
if (curChar == -1) {
1340
// A '<' outside of a token begins a new token.
if (!intoken && curChar == '<') {
1346
// Outside of tokens merge adjacent whitespace
1348
seeingSpace = isspace(curChar);
1353
// convert all whitespace to blanks
1356
inWhitespace = seeingSpace;
1359
// A '>' inside a token completes it.
if (intoken && curChar == '>') {
1361
inWhitespace = false;
1363
// take this isalpha if out to check for bugs in text
1364
// NOTE(review): isalpha() receives a plain char here; a negative value
// would be undefined behaviour -- verify token contents.
if ((isalpha(token[1])) || (isalpha(token[2]))) {
1365
//cout << "Handle:" << token.c_str() << endl;
1366
XMLTag t = transformBSP(token.c_str());
1368
if (!handleToken(text, t)) {
1376
token.append(curChar);
1380
case '>' : text.append(">"); break;
1381
case '<' : text.append("<"); break;
1382
default : text.append(curChar); break;
1387
// Force the last entry from the text buffer.
1389
writeEntry(text, true);
1393
// Report counts only when they are non-zero.
if (converted) fprintf(stderr, "osis2mod converted %d verses to UTF-8\n", converted);
1394
if (normalized) fprintf(stderr, "osis2mod normalized %d verses to NFC\n", normalized);
1398
int main(int argc, char **argv) {
1400
fprintf(stderr, "You are running osis2mod: $Rev: 2400 $\n");
1402
// Let's test our command line arguments
1407
// variables for arguments, holding defaults
1408
const char* program = argv[0];
1409
const char* path = argv[1];
1410
const char* osisDoc = argv[2];
1412
SWBuf compType = "";
1413
bool isCommentary = false;
1416
SWBuf cipherKey = "";
1417
SWCompress *compressor = 0;
1419
for (int i = 3; i < argc; i++) {
1420
if (!strcmp(argv[i], "-a")) {
1423
else if (!strcmp(argv[i], "-z")) {
1424
if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
1425
if (entrySize) usage(*argv, "Cannot specify both -z and -s");
1428
else if (!strcmp(argv[i], "-Z")) {
1429
if (compType.size()) usage(*argv, "Cannot specify both -z and -Z");
1430
if (entrySize) usage(*argv, "Cannot specify both -Z and -s");
1433
else if (!strcmp(argv[i], "-b")) {
1435
iType = atoi(argv[++i]);
1436
if ((iType >= 2) && (iType <= 4)) continue;
1438
usage(*argv, "-b requires one of <2|3|4>");
1440
else if (!strcmp(argv[i], "-N")) {
1443
else if (!strcmp(argv[i], "-c")) {
1444
if (i+1 < argc) cipherKey = argv[++i];
1445
else usage(*argv, "-c requires <cipher_key>");
1447
else if (!strcmp(argv[i], "-v")) {
1448
if (i+1 < argc) v11n = argv[++i];
1449
else usage(*argv, "-v requires <v11n>");
1451
else if (!strcmp(argv[i], "-s")) {
1452
if (compType.size()) usage(*argv, "Cannot specify -s and -z or -Z");
1454
entrySize = atoi(argv[++i]);
1455
if (entrySize == 2 || entrySize == 4) {
1459
usage(*argv, "-s requires one of <2|4>");
1461
else if (!strcmp(argv[i], "-C")) {
1462
isCommentary = true;
1465
else if (!strcmp(argv[i], "-d")) {
1466
if (i+1 < argc) debug |= atoi(argv[++i]);
1467
else usage(*argv, "-d requires <flags>");
1470
else usage(*argv, (((SWBuf)"Unknown argument: ")+ argv[i]).c_str());
1473
if (compType == "ZIP") {
1474
compressor = new ZipCompress();
1476
// Compare with == here. The previous '=' assigned "LZSS" to compType, so this
// branch ran whenever "ZIP" was not selected (even with no compression option
// given) and overwrote the user's setting.
else if (compType == "LZSS") {
1477
compressor = new LZSSCompress();
1483
cout << "WARNING(UTF8): " << program << " is not compiled with support for ICU. Assuming -N." << endl;
1488
if (debug & DEBUG_OTHER) {
1489
cout << "DEBUG(ARGS):\n\tpath: " << path << "\n\tosisDoc: " << osisDoc << "\n\tcreate: " << append << "\n\tcompressType: " << compType << "\n\tblockType: " << iType << "\n\tcipherKey: " << cipherKey.c_str() << "\n\tnormalize: " << normalize << endl;
1493
if (!append) { // == 0 then create module
1494
// Try to initialize a default set of datafiles and indices at our
1495
// datapath location passed to us from the user.
1497
if (zText::createModule(path, iType, v11n)) {
1498
fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
1499
exit(EXIT_NO_CREATE);
1502
else if (entrySize == 4) {
1503
if (RawText4::createModule(path, v11n)) {
1504
fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
1505
exit(EXIT_NO_CREATE);
1509
if (RawText::createModule(path, v11n)) {
1510
fprintf(stderr, "ERROR: %s: couldn't create module at path: %s \n", program, path);
1511
exit(EXIT_NO_CREATE);
1516
// Do some initialization stuff
1518
// Create a compressed text module allowing very large entries
1519
// Taking defaults except for first, fourth, fifth and last argument
1524
iType, // iblockType
1525
compressor, // icomp
1528
DIRECTION_LTR, // dir
1529
FMT_UNKNOWN, // markup
1531
v11n // versification
1534
else if (entrySize == 4) {
1535
// Create a raw text module allowing very large entries
1536
// Taking defaults except for first and last argument
1537
module = new RawText4(
1542
ENC_UNKNOWN, // encoding
1543
DIRECTION_LTR, // dir
1544
FMT_UNKNOWN, // markup
1546
v11n // versification
1550
// Create a raw text module allowing reasonable sized entries
1551
// Taking defaults except for first and last argument
1552
module = new RawText(
1557
ENC_UNKNOWN, // encoding
1558
DIRECTION_LTR, // dir
1559
FMT_UNKNOWN, // markup
1561
v11n // versification
1565
SWFilter *cipherFilter = 0;
1567
if (cipherKey.length()) {
1568
fprintf(stderr, "Adding cipher filter with phrase: %s\n", cipherKey.c_str() );
1569
cipherFilter = new CipherFilter(cipherKey.c_str());
1570
module->AddRawFilter(cipherFilter);
1573
if (!module->isWritable()) {
1574
fprintf(stderr, "The module is not writable. Writing text to it will not work.\nExiting.\n" );
1575
exit(EXIT_NO_WRITE);
1578
// Either read from std::cin (aka stdin), when the argument is a '-'
1579
// or from a specified file.
1580
if (!strcmp(osisDoc, "-")) {
1584
// Let's see if we can open our input file
1585
ifstream infile(osisDoc);
1586
if (infile.fail()) {
1587
fprintf(stderr, "ERROR: %s: couldn't open input file: %s \n", program, osisDoc);
1590
processOSIS(infile);
1596
delete cipherFilter;