1
// =================================================================================================
2
// Copyright 2002-2008 Adobe Systems Incorporated
3
// All Rights Reserved.
5
// NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
6
// of the Adobe license agreement accompanying it.
8
// Adobe patent application tracking #P435, entitled 'Unique markers to simplify embedding data of
9
// one format in a file with a different format', inventors: Sean Parent, Greg Gilley.
10
// =================================================================================================
12
#include "XMP_Environment.h" // ! This must be the first include!
13
#include "XMPCore_Impl.hpp"
14
#include "XMPMeta.hpp"
15
#include "XMPUtils.hpp"
16
#include "UnicodeInlines.incl_cpp"
17
#include "UnicodeConversions.hpp"
18
#include "ExpatAdapter.hpp"
27
#pragma warning ( disable : 4533 ) // initialization of '...' is skipped by 'goto ...'
28
#pragma warning ( disable : 4702 ) // unreachable code
29
#pragma warning ( disable : 4800 ) // forcing value to bool 'true' or 'false' (performance warning)
30
#pragma warning ( disable : 4996 ) // '...' was declared deprecated
34
// *** Use the XMP_PropIsXyz (Schema, Simple, Struct, Array, ...) macros
35
// *** Add debug codegen checks, e.g. that typical masking operations really work
36
// *** Change all uses of strcmp and strncmp to XMP_LitMatch and XMP_LitNMatch
39
// =================================================================================================
40
// Local Types and Constants
41
// =========================
44
// =================================================================================================
48
// Compile-time switch, given the default value 0 here unless it is already defined elsewhere.
#ifndef Trace_ParsingHackery
49
#define Trace_ParsingHackery 0
52
// Table of UTF-8 replacement strings for the single bytes 0x80..0xFF, listed in byte order (the
// trailing comment on each row gives the byte range that row covers).
// NOTE(review): presumably indexed by (byte - 0x80); the lookup itself is not in this part of the
// file - confirm against the code that uses the table.
static const char * kReplaceLatin1[128] =
55
// The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code page 1252.
56
// The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined by Windows 1252, but
57
// their conversion API maps them to U+0081, etc. These are in XML's RestrictedChar set, so
58
// we map them to a space.
60
"\xE2\x82\xAC", " ", "\xE2\x80\x9A", "\xC6\x92", // 0x80 .. 0x83
61
"\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1", // 0x84 .. 0x87
62
"\xCB\x86", "\xE2\x80\xB0", "\xC5\xA0", "\xE2\x80\xB9", // 0x88 .. 0x8B
63
"\xC5\x92", " ", "\xC5\xBD", " ", // 0x8C .. 0x8F
65
" ", "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C", // 0x90 .. 0x93
66
"\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94", // 0x94 .. 0x97
67
"\xCB\x9C", "\xE2\x84\xA2", "\xC5\xA1", "\xE2\x80\xBA", // 0x98 .. 0x9B
68
"\xC5\x93", " ", "\xC5\xBE", "\xC5\xB8", // 0x9C .. 0x9F
70
// These are the UTF-8 forms of the official Latin-1 characters in the range 0xA0..0xFF. Not
71
// too surprisingly these map to U+00A0, etc. Which is the Unicode Latin Supplement range.
73
"\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7", // 0xA0 .. 0xA7
74
"\xC2\xA8", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF", // 0xA8 .. 0xAF
76
"\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7", // 0xB0 .. 0xB7
77
"\xC2\xB8", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF", // 0xB8 .. 0xBF
79
"\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87", // 0xC0 .. 0xC7
80
"\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F", // 0xC8 .. 0xCF
82
"\xC3\x90", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97", // 0xD0 .. 0xD7
83
"\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC3\x9D", "\xC3\x9E", "\xC3\x9F", // 0xD8 .. 0xDF
85
"\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7", // 0xE0 .. 0xE7
86
"\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF", // 0xE8 .. 0xEF
88
"\xC3\xB0", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7", // 0xF0 .. 0xF7
89
"\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC3\xBD", "\xC3\xBE", "\xC3\xBF", // 0xF8 .. 0xFF
94
// =================================================================================================
99
// True when ch is '0'..'9' or an upper case 'A'..'F'; lower case 'a'..'f' is not accepted.
// ! The argument is expanded more than once, so pass an expression without side effects.
#define IsHexDigit(ch) ( (((ch) >= '0') && ((ch) <= '9')) || (((ch) >= 'A') && ((ch) <= 'F')) )
100
// Numeric value of a hex digit character: '0'..'9' give 0..9, upper case 'A'..'F' give 10..15.
// The result is only meaningful for characters accepted by IsHexDigit.
// ! The argument is expanded more than once, so pass an expression without side effects.
#define HexDigitValue(ch) ( (10 > ((ch) - '0')) ? ((ch) - '0') : (((ch) - 'A') + 10) )
103
// -------------------------------------------------------------------------------------------------
106
// Choose the node to use as the XMP root from the content below xmlParent. The order of
// preference, as coded below:
//   1. The first x:xmpmeta or x:xapmeta child element; the search continues inside it with the
//      options forced to 0, so a bare rdf:RDF is acceptable there.
//   2. A direct rdf:RDF child element, unless kXMP_RequireXMPMeta is set in options.
//   3. The first non-null result from recursing into each child in order.
// The caller (FindRootNode) treats a null result as "no root found".
static const XML_Node * PickBestRoot ( const XML_Node & xmlParent, XMP_OptionBits options )
109
// Look among this parent's content for x:xmpmeta. The recursion for x:xmpmeta is broader than
110
// the strictly defined choice, but gives us smaller code.
111
for ( size_t childNum = 0, childLim = xmlParent.content.size(); childNum < childLim; ++childNum ) {
112
const XML_Node * childNode = xmlParent.content[childNum];
113
if ( childNode->kind != kElemNode ) continue;
114
if ( (childNode->name == "x:xmpmeta") || (childNode->name == "x:xapmeta") ) return PickBestRoot ( *childNode, 0 );
116
// Look among this parent's content for a bare rdf:RDF if that is allowed.
117
if ( ! (options & kXMP_RequireXMPMeta) ) {
118
for ( size_t childNum = 0, childLim = xmlParent.content.size(); childNum < childLim; ++childNum ) {
119
const XML_Node * childNode = xmlParent.content[childNum];
120
if ( childNode->kind != kElemNode ) continue;
121
if ( childNode->name == "rdf:RDF" ) return childNode;
125
// Recurse into the content.
126
for ( size_t childNum = 0, childLim = xmlParent.content.size(); childNum < childLim; ++childNum ) {
127
const XML_Node * foundRoot = PickBestRoot ( *xmlParent.content[childNum], options );
128
if ( foundRoot != 0 ) return foundRoot;
135
// -------------------------------------------------------------------------------------------------
139
// Find the XML node that is the root of the XMP data tree. Generally this will be an outer node,
140
// but it could be anywhere if a general XML document is parsed (e.g. SVG). The XML parser counted
141
// all possible root nodes, and kept a pointer to the last one. If there is more than one possible
142
// root use PickBestRoot to choose among them.
144
// If there is a root node, try to extract the version of the previous XMP toolkit.
146
// Locate the rdf:RDF root to use and record the previous toolkit version in thiz->prevTkVer.
//   thiz      - receives the decoded version number in prevTkVer.
//   xmlParser - supplies rootNode, rootCount, and the parsed XML tree.
//   options   - with kXMP_RequireXMPMeta, an rdf:RDF whose parent is not x:xmpmeta or x:xapmeta
//               is rejected.
// Returns 0 when there is no candidate root, or when the required outer element is missing.
// NOTE(review): the declaration and resetting of 'part', the advance of verStr inside the digit
// loops, and the final return are not visible in this listing - confirm against the full file.
static const XML_Node * FindRootNode ( XMPMeta * thiz, const XMLParserAdapter & xmlParser, XMP_OptionBits options )
148
const XML_Node * rootNode = xmlParser.rootNode;
150
// With several candidates, let PickBestRoot choose instead of using the parser's pointer.
if ( xmlParser.rootCount > 1 ) rootNode = PickBestRoot ( xmlParser.tree, options );
151
if ( rootNode == 0 ) return 0;
153
// We have a root node. Try to extract previous toolkit version number.
155
XMP_StringPtr verStr = "";
157
XMP_Assert ( rootNode->name == "rdf:RDF" );
159
if ( (options & kXMP_RequireXMPMeta) &&
160
((rootNode->parent == 0) ||
161
((rootNode->parent->name != "x:xmpmeta") && (rootNode->parent->name != "x:xapmeta"))) ) return 0;
163
// The version string is the x:xmptk (or older x:xaptk) attribute of the root's parent element.
for ( size_t attrNum = 0, attrLim = rootNode->parent->attrs.size(); attrNum < attrLim; ++attrNum ) {
164
const XML_Node * currAttr =rootNode->parent->attrs[attrNum];
165
if ( (currAttr->name == "x:xmptk") || (currAttr->name == "x:xaptk") ) {
166
verStr = currAttr->value.c_str();
171
// Decode the version number into MMmmuubbb digits. If any part is too big, peg it at 99 or 999.
174
// Skip any leading text up to the first decimal digit.
while ( (*verStr != 0) && ((*verStr < '0') || (*verStr > '9')) ) ++verStr;
177
// First numeric part, stored in the MM digits.
while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
178
part = (part * 10) + (*verStr - '0');
181
if ( part > 99 ) part = 99;
182
thiz->prevTkVer = part * 100*100*1000;
185
// Second numeric part, after an optional '.', stored in the mm digits.
if ( *verStr == '.' ) ++verStr;
186
while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
187
part = (part * 10) + (*verStr - '0');
190
if ( part > 99 ) part = 99;
191
thiz->prevTkVer += part * 100*1000;
194
// Third numeric part, after an optional '.', stored in the uu digits.
if ( *verStr == '.' ) ++verStr;
195
while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
196
part = (part * 10) + (*verStr - '0');
199
if ( part > 99 ) part = 99;
200
thiz->prevTkVer += part * 1000;
203
// Fourth numeric part, after an optional '-', stored in the bbb digits.
if ( *verStr == '-' ) ++verStr;
204
while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
205
part = (part * 10) + (*verStr - '0');
208
if ( part > 999 ) part = 999;
209
thiz->prevTkVer += part;
215
// -------------------------------------------------------------------------------------------------
219
// Undo the denormalization performed by the XMP used in Acrobat 5. If a Dublin Core array had only
220
// one item, it was serialized as a simple property. The xml:lang attribute was dropped from an
221
// alt-text item if the language was x-default.
223
// *** This depends on the dc: namespace prefix.
226
// Wrap simple Dublin Core properties that are supposed to be arrays in a one-item array.
//   xmpTree - the tree whose dc: schema node (if any) is examined; nothing is done when the
//             schema node does not exist.
// For each matching simple property a new array node takes the property's slot in the schema, and
// the original property becomes the array's only item. For alt-text arrays an xml:lang qualifier
// with the value "x-default" is added as the first qualifier when the item has no language.
NormalizeDCArrays ( XMP_Node * xmpTree )
228
XMP_Node * dcSchema = FindSchemaNode ( xmpTree, kXMP_NS_DC, kXMP_ExistingOnly );
229
if ( dcSchema == 0 ) return;
231
for ( size_t propNum = 0, propLimit = dcSchema->children.size(); propNum < propLimit; ++propNum ) {
232
XMP_Node * currProp = dcSchema->children[propNum];
233
XMP_OptionBits arrayForm = 0;
235
if ( ! XMP_PropIsSimple ( currProp->options ) ) continue; // Nothing to do if not simple.
237
// Ordered arrays.
if ( (currProp->name == "dc:creator" ) || // See if it is supposed to be an array.
238
(currProp->name == "dc:date" ) ) { // *** Think about an array of char* and a loop.
239
arrayForm = kXMP_PropArrayIsOrdered;
241
// Alt-text arrays.
(currProp->name == "dc:description" ) ||
242
(currProp->name == "dc:rights" ) ||
243
(currProp->name == "dc:title" ) ) {
244
arrayForm = kXMP_PropArrayIsAltText;
246
// Arrays that get only the basic kXMP_PropValueIsArray form.
(currProp->name == "dc:contributor" ) ||
247
(currProp->name == "dc:language" ) ||
248
(currProp->name == "dc:publisher" ) ||
249
(currProp->name == "dc:relation" ) ||
250
(currProp->name == "dc:subject" ) ||
251
(currProp->name == "dc:type" ) ) {
252
arrayForm = kXMP_PropValueIsArray;
254
if ( arrayForm == 0 ) continue; // Nothing to do if it isn't supposed to be an array.
256
arrayForm = VerifySetOptions ( arrayForm, 0 ); // Set the implicit array bits.
257
// The new array replaces the property in the schema; the property becomes its single item.
XMP_Node * newArray = new XMP_Node ( dcSchema, currProp->name.c_str(), arrayForm );
258
dcSchema->children[propNum] = newArray;
259
newArray->children.push_back ( currProp );
260
currProp->parent = newArray;
261
currProp->name = kXMP_ArrayItemName;
263
// An alt-text item needs a language; add xml:lang="x-default" as the first qualifier.
if ( XMP_ArrayIsAltText ( arrayForm ) && (! (currProp->options & kXMP_PropHasLang)) ) {
264
XMP_Node * newLang = new XMP_Node ( currProp, "xml:lang", "x-default", kXMP_PropIsQualifier );
265
currProp->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
266
if ( currProp->qualifiers.empty() ) { // *** Need a util?
267
currProp->qualifiers.push_back ( newLang );
269
currProp->qualifiers.insert ( currProp->qualifiers.begin(), newLang );
275
} // NormalizeDCArrays
278
// -------------------------------------------------------------------------------------------------
279
// CompareAliasedSubtrees
280
// ----------------------
282
// *** Change to do some alias-specific setup, then use CompareSubtrees. One special case for
283
// *** aliases is a simple to x-default alias, the options and qualifiers obviously differ.
286
// Recursively check that an alias subtree and its base subtree agree, throwing kXMPErr_BadXMP on
// the first mismatch.
//   aliasNode - root of the alias subtree.
//   baseNode  - root of the base subtree it is compared with.
//   outerCall - true (the default) for the top level call, false for the recursive calls.
CompareAliasedSubtrees ( XMP_Node * aliasNode, XMP_Node * baseNode, bool outerCall = true )
288
// ! The outermost call is special. The names almost certainly differ. The qualifiers (and
289
// ! hence options) will differ for an alias to the x-default item of a langAlt array.
290
// The value and the number of children must match.
if ( (aliasNode->value != baseNode->value) ||
291
(aliasNode->children.size() != baseNode->children.size()) ) {
292
XMP_Throw ( "Mismatch between alias and base nodes", kXMPErr_BadXMP );
295
// Name, options, and qualifier count must match too. Per the note above this is meant for the
// nested calls only. NOTE(review): the condition on outerCall that guards this check is not
// visible in this listing - confirm.
if ( (aliasNode->name != baseNode->name) ||
296
(aliasNode->options != baseNode->options) ||
297
(aliasNode->qualifiers.size() != baseNode->qualifiers.size()) ) {
298
XMP_Throw ( "Mismatch between alias and base nodes", kXMPErr_BadXMP );
302
// Compare the children pairwise, by position.
for ( size_t childNum = 0, childLim = aliasNode->children.size(); childNum < childLim; ++childNum ) {
303
XMP_Node * aliasChild = aliasNode->children[childNum];
304
XMP_Node * baseChild = baseNode->children[childNum];
305
CompareAliasedSubtrees ( aliasChild, baseChild, false );
308
// Compare the qualifiers pairwise, by position.
for ( size_t qualNum = 0, qualLim = aliasNode->qualifiers.size(); qualNum < qualLim; ++qualNum ) {
309
XMP_Node * aliasQual = aliasNode->qualifiers[qualNum];
310
XMP_Node * baseQual = baseNode->qualifiers[qualNum];
311
CompareAliasedSubtrees ( aliasQual, baseQual, false );
314
} // CompareAliasedSubtrees
317
// -------------------------------------------------------------------------------------------------
318
// TransplantArrayItemAlias
319
// ------------------------
322
// Move the child at oldParent->children[oldNum] so that it becomes the first item of the array
// newParent.
//   oldParent - node currently holding the child; the child is erased from its children.
//   oldNum    - index of the child within oldParent->children.
//   newParent - destination array node.
// When newParent is an alt-text array the child gets an xml:lang="x-default" qualifier as its first
// qualifier; kXMPErr_BadXMP is thrown if the child already has a language qualifier.
TransplantArrayItemAlias ( XMP_Node * oldParent, size_t oldNum, XMP_Node * newParent )
324
XMP_Node * childNode = oldParent->children[oldNum];
326
if ( newParent->options & kXMP_PropArrayIsAltText ) {
327
if ( childNode->options & kXMP_PropHasLang ) {
328
XMP_Throw ( "Alias to x-default already has a language qualifier", kXMPErr_BadXMP ); // *** Allow x-default.
330
childNode->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
331
XMP_Node * langQual = new XMP_Node ( childNode, "xml:lang", "x-default", kXMP_PropIsQualifier ); // *** AddLangQual util?
332
if ( childNode->qualifiers.empty() ) {
333
childNode->qualifiers.push_back ( langQual );
335
childNode->qualifiers.insert ( childNode->qualifiers.begin(), langQual );
339
// Detach from the old parent, rename to the array item name, and attach at the front.
oldParent->children.erase ( oldParent->children.begin() + oldNum );
340
childNode->name = kXMP_ArrayItemName;
341
childNode->parent = newParent;
342
if ( newParent->children.empty() ) {
343
newParent->children.push_back ( childNode );
345
newParent->children.insert ( newParent->children.begin(), childNode );
348
} // TransplantArrayItemAlias
351
// -------------------------------------------------------------------------------------------------
352
// TransplantNamedAlias
353
// --------------------
356
// Move the child at oldParent->children[oldNum] to the end of newParent's children, giving it the
// name newName.
//   oldParent - node currently holding the child; the child is erased from its children.
//   oldNum    - index of the child within oldParent->children.
//   newParent - node that receives the child.
//   newName   - name assigned to the moved child.
TransplantNamedAlias ( XMP_Node * oldParent, size_t oldNum, XMP_Node * newParent, XMP_VarString & newName )
358
XMP_Node * childNode = oldParent->children[oldNum];
360
oldParent->children.erase ( oldParent->children.begin() + oldNum );
361
childNode->name = newName;
362
childNode->parent = newParent;
363
newParent->children.push_back ( childNode );
365
} // TransplantNamedAlias
368
// -------------------------------------------------------------------------------------------------
369
// MoveExplicitAliases
370
// -------------------
373
// Resolve the properties in the tree that are marked as aliases (kXMP_PropIsAlias), either moving
// them to their base location or removing them when the base already exists.
//   tree         - root of the XMP tree; its children are the schema nodes that are visited.
//   parseOptions - kXMP_StrictAliasing makes an existing base be compared with the alias, which
//                  throws on a mismatch (see CompareAliasedSubtrees).
// NOTE(review): several short lines of this function (else branches, counter increments, deletes)
// are not visible in this listing - confirm details against the full file.
MoveExplicitAliases ( XMP_Node * tree, XMP_OptionBits parseOptions )
375
// Toggle the "has aliases" flag on the tree; presumably it is set on entry, so this clears it.
tree->options ^= kXMP_PropHasAliases;
376
const bool strictAliasing = ((parseOptions & kXMP_StrictAliasing) != 0);
378
// Visit all of the top level nodes looking for aliases. If there is no base, transplant the
379
// alias subtree. If there is a base and strict aliasing is on, make sure the alias and base
// subtrees match.
382
// ! Use "while" loops not "for" loops since both the schema and property loops can remove the
383
// ! current item from the vector being traversed. And don't increment the counter for a delete.
385
size_t schemaNum = 0;
386
while ( schemaNum < tree->children.size() ) {
387
XMP_Node * currSchema = tree->children[schemaNum];
390
while ( propNum < currSchema->children.size() ) {
391
XMP_Node * currProp = currSchema->children[propNum];
392
// Properties that are not aliases are left alone (the body of this branch is not visible here).
if ( ! (currProp->options & kXMP_PropIsAlias) ) {
396
// The alias bit is known to be set at this point, so the XOR clears it.
currProp->options ^= kXMP_PropIsAlias;
398
// Find the base path, look for the base schema and root node.
400
XMP_AliasMapPos aliasPos = sRegisteredAliasMap->find ( currProp->name );
401
XMP_Assert ( aliasPos != sRegisteredAliasMap->end() );
402
XMP_ExpandedXPath & basePath = aliasPos->second;
403
XMP_OptionBits arrayOptions = (basePath[kRootPropStep].options & kXMP_PropArrayFormMask);
405
// The base schema is created if necessary; the base property is only looked up.
XMP_Node * baseSchema = FindSchemaNode ( tree, basePath[kSchemaStep].step.c_str(), kXMP_CreateNodes );
406
if ( baseSchema->options & kXMP_NewImplicitNode ) baseSchema->options ^= kXMP_NewImplicitNode;
407
XMP_Node * baseNode = FindChildNode ( baseSchema, basePath[kRootPropStep].step.c_str(), kXMP_ExistingOnly );
409
if ( baseNode == 0 ) {
411
if ( basePath.size() == 2 ) {
412
// A top-to-top alias, transplant the property.
413
TransplantNamedAlias ( currSchema, propNum, baseSchema, basePath[kRootPropStep].step );
415
// An alias to an array item, create the array and transplant the property.
416
baseNode = new XMP_Node ( baseSchema, basePath[kRootPropStep].step.c_str(), arrayOptions );
417
baseSchema->children.push_back ( baseNode );
418
TransplantArrayItemAlias ( currSchema, propNum, baseNode );
421
} else if ( basePath.size() == 2 ) {
423
// The base node does exist and this is a top-to-top alias. Check for conflicts if
424
// strict aliasing is on. Remove and delete the alias subtree.
425
if ( strictAliasing ) CompareAliasedSubtrees ( currProp, baseNode );
426
currSchema->children.erase ( currSchema->children.begin() + propNum );
431
// This is an alias to an array item and the array exists. Look for the aliased item.
432
// Then transplant or check & delete as appropriate.
434
XMP_Node * itemNode = 0;
435
// For an alt-text array the aliased item is the x-default item, otherwise the first item.
if ( arrayOptions & kXMP_PropArrayIsAltText ) {
436
XMP_Index xdIndex = LookupLangItem ( baseNode, *xdefaultName );
437
if ( xdIndex != -1 ) itemNode = baseNode->children[xdIndex];
438
} else if ( ! baseNode->children.empty() ) {
439
itemNode = baseNode->children[0];
442
if ( itemNode == 0 ) {
443
TransplantArrayItemAlias ( currSchema, propNum, baseNode );
445
if ( strictAliasing ) CompareAliasedSubtrees ( currProp, itemNode );
446
currSchema->children.erase ( currSchema->children.begin() + propNum );
454
// Increment the counter or remove an empty schema node.
455
if ( currSchema->children.size() > 0 ) {
458
delete tree->children[schemaNum]; // ! Delete the schema node itself.
459
tree->children.erase ( tree->children.begin() + schemaNum );
464
} // MoveExplicitAliases
467
// -------------------------------------------------------------------------------------------------
472
// Give an exif:GPSTimeStamp value that has no date portion (year, month and day all zero) the date
// of exif:DateTimeOriginal, or of exif:DateTimeDigitized when the former does not exist.
//   exifSchema  - the exif: schema node in which the other date properties are looked up.
//   gpsDateTime - the exif:GPSTimeStamp node; its value is rewritten when a date is filled in.
// NOTE(review): the two bare "return" statements below presumably sit inside catch handlers around
// the ConvertToDate calls; the try/catch lines are not visible in this listing - confirm.
FixGPSTimeStamp ( XMP_Node * exifSchema, XMP_Node * gpsDateTime )
474
XMP_DateTime binGPSStamp;
476
XMPUtils::ConvertToDate ( gpsDateTime->value.c_str(), &binGPSStamp );
478
return; // Don't let a bad date stop other things.
480
// Nothing to fix when the stamp already carries any part of a date.
if ( (binGPSStamp.year != 0) || (binGPSStamp.month != 0) || (binGPSStamp.day != 0) ) return;
482
XMP_Node * otherDate = FindChildNode ( exifSchema, "exif:DateTimeOriginal", kXMP_ExistingOnly );
483
if ( otherDate == 0 ) otherDate = FindChildNode ( exifSchema, "exif:DateTimeDigitized", kXMP_ExistingOnly );
484
if ( otherDate == 0 ) return;
486
XMP_DateTime binOtherDate;
488
XMPUtils::ConvertToDate ( otherDate->value.c_str(), &binOtherDate );
490
return; // Don't let a bad date stop other things.
493
// Copy only the date fields, then write the combined value back as a string.
binGPSStamp.year = binOtherDate.year;
494
binGPSStamp.month = binOtherDate.month;
495
binGPSStamp.day = binOtherDate.day;
497
XMP_StringPtr goodStr;
498
XMP_StringLen goodLen;
499
XMPUtils::ConvertFromDate ( binGPSStamp, &goodStr, &goodLen );
501
gpsDateTime->value.assign ( goodStr, goodLen );
506
// -------------------------------------------------------------------------------------------------
507
// MigrateAudioCopyright
508
// ---------------------
510
// The initial support for WAV files mapped a legacy ID3 audio copyright into a new xmpDM:copyright
511
// property. This is special case code to migrate that into dc:rights['x-default']. The rules:
513
// 1. If there is no dc:rights array, or an empty array -
514
// Create one with dc:rights['x-default'] set from double linefeed and xmpDM:copyright.
516
// 2. If there is a dc:rights array but it has no x-default item -
517
// Create an x-default item as a copy of the first item then apply rule #3.
519
// 3. If there is a dc:rights array with an x-default item, look for a double linefeed in the value.
520
// A. If no double linefeed, compare the x-default value to the xmpDM:copyright value.
521
// A1. If they match then leave the x-default value alone.
522
// A2. Otherwise, append a double linefeed and the xmpDM:copyright value to the x-default value.
523
// B. If there is a double linefeed, compare the trailing text to the xmpDM:copyright value.
524
// B1. If they match then leave the x-default value alone.
525
// B2. Otherwise, replace the trailing x-default text with the xmpDM:copyright value.
527
// 4. In all cases, delete the xmpDM:copyright property.
530
// Fold the value of xmpDM:copyright into the x-default item of dc:rights, then delete the
// xmpDM:copyright property. The numbered comments below follow the rules listed before this
// function.
//   xmp         - the metadata object being touched up; the dc: schema node is created if needed.
//   dmCopyright - the existing xmpDM:copyright node.
// NOTE(review): the try/catch and the conditions selecting between the numbered cases are only
// partly visible in this listing - confirm against the full file.
MigrateAudioCopyright ( XMPMeta * xmp, XMP_Node * dmCopyright )
535
// A reference, so the insert in case 1 changes the xmpDM:copyright node's own value.
std::string & dmValue = dmCopyright->value;
536
static const char * kDoubleLF = "\xA\xA";
538
XMP_Node * dcSchema = FindSchemaNode ( &xmp->tree, kXMP_NS_DC, kXMP_CreateNodes );
539
XMP_Node * dcRightsArray = FindChildNode ( dcSchema, "dc:rights", kXMP_ExistingOnly );
541
if ( (dcRightsArray == 0) || dcRightsArray->children.empty() ) {
543
// 1. No dc:rights array, create from double linefeed and xmpDM:copyright.
544
dmValue.insert ( 0, kDoubleLF );
545
xmp->SetLocalizedText ( kXMP_NS_DC, "rights", "", "x-default", dmValue.c_str(), 0 );
549
std::string xdefaultStr ( "x-default" );
551
XMP_Index xdIndex = LookupLangItem ( dcRightsArray, xdefaultStr );
554
// 2. No x-default item, create from the first item.
555
XMP_StringPtr firstValue = dcRightsArray->children[0]->value.c_str();
556
xmp->SetLocalizedText ( kXMP_NS_DC, "rights", "", "x-default", firstValue, 0 );
557
xdIndex = LookupLangItem ( dcRightsArray, xdefaultStr );
560
// 3. Look for a double linefeed in the x-default value.
561
// The x-default item is expected to be the first item of the array.
XMP_Assert ( xdIndex == 0 );
562
std::string & defaultValue = dcRightsArray->children[xdIndex]->value;
563
// NOTE(review): std::string::find returns a size_t (npos when not found) that is stored in an
// XMP_Index here - confirm the narrowing and the not-found comparison are as intended.
XMP_Index lfPos = defaultValue.find ( kDoubleLF );
567
// 3A. No double LF, compare whole values.
568
if ( dmValue != defaultValue ) {
569
// 3A2. Append the xmpDM:copyright to the x-default item.
570
defaultValue += kDoubleLF;
571
defaultValue += dmValue;
576
// 3B. Has double LF, compare the tail.
577
// The tail starts just after the two linefeed characters, hence lfPos+2.
if ( defaultValue.compare ( lfPos+2, std::string::npos, dmValue ) != 0 ) {
578
// 3B2. Replace the x-default tail.
579
defaultValue.replace ( lfPos+2, std::string::npos, dmValue );
586
// 4. Get rid of the xmpDM:copyright.
587
xmp->DeleteProperty ( kXMP_NS_DM, "copyright" );
590
// Don't let failures (like a bad dc:rights form) stop other cleanup.
593
} // MigrateAudioCopyright
596
// -------------------------------------------------------------------------------------------------
600
// Make sure that the array is well-formed AltText. Each item must be simple and have an xml:lang
601
// qualifier. If repairs are needed, keep simple non-empty items by adding the xml:lang.
604
// Turn an existing array property that is not yet flagged as alt-text into a well-formed alt-text
// array.
//   tree      - root of the XMP tree.
//   schemaNS  - namespace URI of the schema holding the array.
//   arrayName - name of the array, including the prefix, since it is used for a direct node lookup.
// Nothing is done when the schema or the property does not exist, when the array is already
// alt-text, or when the property is not an array at all. Otherwise non-simple items and empty items
// without a language are deleted, and the remaining items without a language get an xml:lang
// qualifier with the value "x-repair" as their first qualifier.
RepairAltText ( XMP_Node & tree, XMP_StringPtr schemaNS, XMP_StringPtr arrayName )
606
XMP_Node * schemaNode = FindSchemaNode ( &tree, schemaNS, kXMP_ExistingOnly );
607
if ( schemaNode == 0 ) return;
609
XMP_Node * arrayNode = FindChildNode ( schemaNode, arrayName, kXMP_ExistingOnly );
610
if ( (arrayNode == 0) || XMP_ArrayIsAltText ( arrayNode->options ) ) return; // Already OK.
612
if ( ! XMP_PropIsArray ( arrayNode->options ) ) return; // ! Not even an array, leave it alone.
613
// *** Should probably change simple values to LangAlt with 'x-default' item.
615
arrayNode->options |= (kXMP_PropArrayIsOrdered | kXMP_PropArrayIsAlternate | kXMP_PropArrayIsAltText);
617
// Walk backwards so that erasing an item does not shift the items still to be visited.
for ( int i = arrayNode->children.size()-1; i >= 0; --i ) { // ! Need a signed index type.
619
XMP_Node * currChild = arrayNode->children[i];
621
if ( ! XMP_PropIsSimple ( currChild->options ) ) {
623
// Delete non-simple children.
624
delete ( currChild );
625
arrayNode->children.erase ( arrayNode->children.begin() + i );
627
} else if ( ! XMP_PropHasLang ( currChild->options ) ) {
629
if ( currChild->value.empty() ) {
631
// Delete empty valued children that have no xml:lang.
632
delete ( currChild );
633
arrayNode->children.erase ( arrayNode->children.begin() + i );
637
// Add an xml:lang qualifier with the value "x-repair".
638
XMP_Node * repairLang = new XMP_Node ( currChild, "xml:lang", "x-repair", kXMP_PropIsQualifier );
639
if ( currChild->qualifiers.empty() ) {
640
currChild->qualifiers.push_back ( repairLang );
642
currChild->qualifiers.insert ( currChild->qualifiers.begin(), repairLang );
644
currChild->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
655
// -------------------------------------------------------------------------------------------------
660
// Apply a series of special case repairs to the tree of a parsed XMP object:
//   - exif: fix a date-less exif:GPSTimeStamp, and turn a simple exif:UserComment into an
//     array holding the old value as an x-default item.
//   - xmpDM: migrate xmpDM:copyright into dc:rights.
//   - dc: clear the ordered/alternate/alt-text bits on dc:subject.
//   - Repair the known alt-text arrays.
//   - Copy a UUID-like tree name (from rdf:about) into xmpMM:InstanceID.
//   xmp - the metadata object whose tree is modified in place.
// NOTE(review): some short lines (closing braces, the test of nameIsUUID, and similar) are not
// visible in this listing - confirm details against the full file.
TouchUpDataModel ( XMPMeta * xmp )
662
XMP_Node & tree = xmp->tree;
664
// Do special case touch ups for certain schema.
666
XMP_Node * currSchema = 0;
668
currSchema = FindSchemaNode ( &tree, kXMP_NS_EXIF, kXMP_ExistingOnly );
669
if ( currSchema != 0 ) {
671
// Do a special case fix for exif:GPSTimeStamp.
672
XMP_Node * gpsDateTime = FindChildNode ( currSchema, "exif:GPSTimeStamp", kXMP_ExistingOnly );
673
if ( gpsDateTime != 0 ) FixGPSTimeStamp ( currSchema, gpsDateTime );
675
// *** Should probably have RepairAltText change simple values to LangAlt with 'x-default' item.
676
// *** For now just do this for exif:UserComment, the one case we know about, late in cycle fix.
677
XMP_Node * userComment = FindChildNode ( currSchema, "exif:UserComment", kXMP_ExistingOnly );
678
if ( (userComment != 0) && XMP_PropIsSimple ( userComment->options ) ) {
679
// The new item takes over the old value, options, and qualifiers of the simple property.
XMP_Node * newChild = new XMP_Node ( userComment, kXMP_ArrayItemName,
680
userComment->value.c_str(), userComment->options );
681
newChild->qualifiers.swap ( userComment->qualifiers );
682
if ( ! XMP_PropHasLang ( newChild->options ) ) {
683
XMP_Node * langQual = new XMP_Node ( newChild, "xml:lang", "x-default", kXMP_PropIsQualifier );
684
newChild->qualifiers.insert ( newChild->qualifiers.begin(), langQual );
685
newChild->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
687
// The property itself becomes an array with the new item as its only child.
userComment->value.erase();
688
userComment->options = kXMP_PropArrayFormMask; // ! Happens to have all the right bits.
689
userComment->children.push_back ( newChild );
694
currSchema = FindSchemaNode ( &tree, kXMP_NS_DM, kXMP_ExistingOnly );
695
if ( currSchema != 0 ) {
696
// Do a special case migration of xmpDM:copyright to dc:rights['x-default']. Do this before
697
// the dc: touch up since it can affect the dc: schema.
698
XMP_Node * dmCopyright = FindChildNode ( currSchema, "xmpDM:copyright", kXMP_ExistingOnly );
699
if ( dmCopyright != 0 ) MigrateAudioCopyright ( xmp, dmCopyright );
702
currSchema = FindSchemaNode ( &tree, kXMP_NS_DC, kXMP_ExistingOnly );
703
if ( currSchema != 0 ) {
704
// Do a special case fix for dc:subject, make sure it is an unordered array.
705
XMP_Node * dcSubject = FindChildNode ( currSchema, "dc:subject", kXMP_ExistingOnly );
706
if ( dcSubject != 0 ) {
707
XMP_OptionBits keepMask = ~(kXMP_PropArrayIsOrdered | kXMP_PropArrayIsAlternate | kXMP_PropArrayIsAltText);
708
dcSubject->options &= keepMask; // Make sure any ordered array bits are clear.
712
// Fix any broken AltText arrays that we know about.
714
RepairAltText ( tree, kXMP_NS_DC, "dc:description" ); // ! Note inclusion of prefixes for direct node lookup!
715
RepairAltText ( tree, kXMP_NS_DC, "dc:rights" );
716
RepairAltText ( tree, kXMP_NS_DC, "dc:title" );
717
RepairAltText ( tree, kXMP_NS_XMP_Rights, "xmpRights:UsageTerms" );
718
RepairAltText ( tree, kXMP_NS_EXIF, "exif:UserComment" );
720
// Tweak old XMP: Move an instance ID from rdf:about to the xmpMM:InstanceID property. An old
721
// instance ID usually looks like "uuid:bac965c4-9d87-11d9-9a30-000d936b79c4", plus InDesign
722
// 3.0 wrote them like "bac965c4-9d87-11d9-9a30-000d936b79c4". If the name looks like a UUID
723
// simply move it to xmpMM:InstanceID, don't worry about any existing xmpMM:InstanceID. Both
724
// will only be present when a newer file with the xmpMM:InstanceID property is updated by an
725
// old app that uses rdf:about.
727
if ( ! tree.name.empty() ) {
729
bool nameIsUUID = false;
730
XMP_StringPtr nameStr = tree.name.c_str();
732
if ( XMP_LitNMatch ( nameStr, "uuid:", 5 ) ) {
736
} else if ( tree.name.size() == 36 ) {
738
nameIsUUID = true; // ! Assume true, we'll set it to false below if not.
739
for ( int i = 0; i < 36; ++i ) {
740
char ch = nameStr[i];
742
// Offsets 8, 13, 18, and 23 are the positions of the hyphens in the 36 character form.
if ( (i == 8) || (i == 13) || (i == 18) || (i == 23) ) continue;
746
// Lenient check: any digit or lower case letter is accepted, not only hex digits.
if ( (('0' <= ch) && (ch <= '9')) || (('a' <= ch) && (ch <= 'z')) ) continue;
756
XMP_ExpandedXPath expPath;
757
ExpandXPath ( kXMP_NS_XMP_MM, "InstanceID", &expPath );
758
XMP_Node * idNode = FindNode ( &tree, expPath, kXMP_CreateNodes, 0 );
759
if ( idNode == 0 ) XMP_Throw ( "Failure creating xmpMM:InstanceID", kXMPErr_InternalFailure );
761
idNode->options = 0; // Clobber any existing xmpMM:InstanceID.
762
idNode->value = tree.name;
763
idNode->RemoveChildren();
764
idNode->RemoveQualifiers();
772
} // TouchUpDataModel
775
// -------------------------------------------------------------------------------------------------
776
// DetermineInputEncoding
777
// ----------------------
779
// Try to determine the character encoding, making a guess if the input is too short. We make some
780
// simplifying assumptions: the first character must be U+FEFF or ASCII, U+0000 is not allowed. The
781
// XML 1.1 spec is even more strict, UTF-16 XML documents must begin with U+FEFF, and the first
782
// "real" character must be '<'. Ignoring the XML declaration, the first XML character could be '<',
783
// space, tab, CR, or LF.
785
// The possible input sequences are:
788
// EF BB BF -- - UTF-8
789
// FE FF -- -- - Big endian UTF-16
790
// 00 00 FE FF - Big endian UTF-32
791
// FF FE 00 00 - Little endian UTF-32
792
// FF FE -- -- - Little endian UTF-16
795
// nn mm -- -- - UTF-8
796
// 00 00 00 nn - Big endian UTF-32
797
// 00 nn -- -- - Big endian UTF-16
798
// nn 00 00 00 - Little endian UTF-32
799
// nn 00 -- -- - Little endian UTF-16
801
// ! We don't check for full patterns, or for errors. We just check enough to determine what the
802
// ! only possible (or reasonable) case would be.
804
// Guess the character encoding of the input from its first few bytes.
//   buffer - start of the input bytes.
//   length - number of bytes available; with fewer than 2 the answer is always UTF-8.
// Returns one of the kXMP_EncodeUTF8 / UTF16Big / UTF16Little / UTF32Big / UTF32Little values.
// Bytes at index 2 are only examined when at least 4 bytes are available.
static XMP_OptionBits
805
DetermineInputEncoding ( const XMP_Uns8 * buffer, size_t length )
807
if ( length < 2 ) return kXMP_EncodeUTF8;
809
XMP_Uns8 * uniChar = (XMP_Uns8*)buffer; // ! Make sure comparisons are unsigned.
811
if ( uniChar[0] == 0 ) {
814
// 00 nn -- -- - Big endian UTF-16
815
// 00 00 00 nn - Big endian UTF-32
816
// 00 00 FE FF - Big endian UTF-32
818
if ( (length < 4) || (uniChar[1] != 0) ) return kXMP_EncodeUTF16Big;
819
return kXMP_EncodeUTF32Big;
821
} else if ( uniChar[0] < 0x80 ) {
824
// nn mm -- -- - UTF-8 (the EF BB BF case is not handled here since 0xEF is not below 0x80,
// it is covered by the last group below)
825
// nn 00 00 00 - Little endian UTF-32
826
// nn 00 -- -- - Little endian UTF-16
828
if ( uniChar[1] != 0 ) return kXMP_EncodeUTF8;
829
if ( (length < 4) || (uniChar[2] != 0) ) return kXMP_EncodeUTF16Little;
830
return kXMP_EncodeUTF32Little;
835
// The first byte is 0x80 or above. Only 0xEF and 0xFE are tested explicitly, any other value
// is treated as the FF FE cases.
// EF BB BF -- - UTF-8
836
// FE FF -- -- - Big endian UTF-16
837
// FF FE 00 00 - Little endian UTF-32
838
// FF FE -- -- - Little endian UTF-16
840
if ( uniChar[0] == 0xEF ) return kXMP_EncodeUTF8;
841
if ( uniChar[0] == 0xFE ) return kXMP_EncodeUTF16Big;
842
if ( (length < 4) || (uniChar[2] != 0) ) return kXMP_EncodeUTF16Little;
843
return kXMP_EncodeUTF32Little;
847
} // DetermineInputEncoding
850
// -------------------------------------------------------------------------------------------------
854
// Look for a valid multi-byte UTF-8 sequence and return its length. Returns 0 for an invalid UTF-8
855
// sequence. Returns a negative value for a partial valid sequence at the end of the buffer.
857
// The checking is not strict. We simply count the number of high order 1 bits in the first byte,
858
// then look for n-1 following bytes whose high order 2 bits are 1 and 0. We do not check for a
859
// minimal length representation of the codepoint, or that the codepoint is defined by Unicode.
862
// Measure the multi-byte UTF-8 sequence that starts at charStart.
//   charStart - first byte of the candidate sequence.
//   bufEnd    - one past the last available byte.
// Returns 0 when charStart is not before bufEnd, when the first byte does not have its top two bits
// set, or when a following byte is not of the form 10xxxxxx. Returns the negated byte count when
// the sequence would run past bufEnd.
// NOTE(review): the declaration of byteCount (presumably starting at 2, for the two high bits
// already checked) and the final return are not visible in this listing - confirm.
CountUTF8 ( const XMP_Uns8 * charStart, const XMP_Uns8 * bufEnd )
864
XMP_Assert ( charStart < bufEnd ); // Catch this in debug builds.
865
if ( charStart >= bufEnd ) return 0; // Don't run-on in release builds.
866
if ( (*charStart & 0xC0) != 0xC0 ) return 0; // Must have at least 2 high bits set.
869
XMP_Uns8 firstByte = *charStart;
870
// Shift out the two bits already checked, then count each further leading 1 bit.
for ( firstByte = firstByte << 2; (firstByte & 0x80) != 0; firstByte = firstByte << 1 ) ++byteCount;
872
if ( (charStart + byteCount) > bufEnd ) return -byteCount;
874
// Every byte after the first must have 1 and 0 as its two high order bits.
for ( int i = 1; i < byteCount; ++i ) {
875
if ( (charStart[i] & 0xC0) != 0x80 ) return 0;
883
// -------------------------------------------------------------------------------------------------
884
// CountControlEscape
885
// ------------------
887
// Look for a numeric escape sequence for a "prohibited" ASCII control character. These are 0x7F,
888
// and the range 0x00..0x1F except for tab/LF/CR. Return 0 if this is definitely not a numeric
889
// escape, the length of the escape if found, or a negative value for a partial escape.
892
CountControlEscape ( const XMP_Uns8 * escStart, const XMP_Uns8 * bufEnd )
894
XMP_Assert ( escStart < bufEnd ); // Catch this in debug builds.
895
if ( escStart >= bufEnd ) return 0; // Don't run-on in release builds.
896
XMP_Assert ( *escStart == '&' );
898
size_t tailLen = bufEnd - escStart;
899
if ( tailLen < 5 ) return -1; // Don't need a more thorough check, we'll catch it on the next pass.
901
if ( strncmp ( (char*)escStart, "&#x", 3 ) != 0 ) return 0;
903
XMP_Uns8 escValue = 0;
904
const XMP_Uns8 * escPos = escStart + 3;
906
if ( ('0' <= *escPos) && (*escPos <= '9') ) {
907
escValue = *escPos - '0';
909
} else if ( ('A' <= *escPos) && (*escPos <= 'F') ) {
910
escValue = *escPos - 'A' + 10;
912
} else if ( ('a' <= *escPos) && (*escPos <= 'f') ) {
913
escValue = *escPos - 'a' + 10;
917
if ( ('0' <= *escPos) && (*escPos <= '9') ) {
918
escValue = (escValue << 4) + (*escPos - '0');
920
} else if ( ('A' <= *escPos) && (*escPos <= 'F') ) {
921
escValue = (escValue << 4) + (*escPos - 'A' + 10);
923
} else if ( ('a' <= *escPos) && (*escPos <= 'f') ) {
924
escValue = (escValue << 4) + (*escPos - 'a' + 10);
928
if ( escPos == bufEnd ) return -1; // Partial escape.
929
if ( *escPos != ';' ) return 0;
931
size_t escLen = escPos - escStart + 1;
932
if ( escLen < 5 ) return 0; // ! Catch "&#x;".
934
if ( (escValue == kTab) || (escValue == kLF) || (escValue == kCR) ) return 0; // An allowed escape.
936
return escLen; // Found a full "prohibited" numeric escape.
938
} // CountControlEscape
941
// -------------------------------------------------------------------------------------------------
942
// ProcessUTF8Portion
943
// ------------------
945
// Early versions of the XMP spec mentioned allowing ISO Latin-1 input. There are also problems with
946
// some clients placing ASCII control characters within XMP values. This is an XML problem, the XML
947
// spec only allows tab (0x09), LF (0x0A), and CR (0x0D) from the 0x00..0x1F range. As a concession
948
// to this we scan 8-bit input for byte sequences that are not valid UTF-8 or in the 0x00..0x1F
949
// range and replace each byte as follows:
950
// 0x00..0x1F - Replace with a space, except for tab, CR, and LF.
951
// 0x7F - Replace with a space. This is ASCII Delete, not allowed by ISO Latin-1.
952
// 0x80..0x9F - Replace with the UTF-8 for a corresponding Unicode character.
953
// 0xA0..0xFF - Replace with the UTF-8 for a corresponding Unicode character.
955
// The 0x80..0x9F range is not defined by Latin-1. But the Windows 1252 code page defines these and
956
// is otherwise the same as Latin-1.
958
// For at least historical compatibility reasons we also find and replace singly escaped ASCII
959
// control characters. The Expat parser we're using does not allow numeric escapes like "&#x0B;".
960
// The XML spec is clear that raw controls are not allowed (in the RestrictedChar set), but it isn't
961
// as clear about numeric escapes for them. At any rate, Expat complains, so we treat the numeric
962
// escapes like raw characters and replace them with a space.
964
// We check for 1 or 2 hex digits ("&#x9;" or "&#x09;") and upper or lower case ("&#xA;" or "&#xa;").
965
// The full escape sequence is 5 or 6 bytes.
968
// Scan the bytes from buffer up to bufEnd (buffer + length) and forward them to
// xmlParser->ParseBuffer in spans, substituting replacement text for Latin-1 bytes, prohibited
// ASCII controls, and numeric escapes for those controls. spanStart is the first byte not yet
// forwarded, spanEnd is the scan position.
// The visible early returns give (spanEnd - buffer), the count of bytes consumed, when a partial
// UTF-8 sequence or partial escape sits at the end of the buffer and more input is expected.
// NOTE(review): the declarations of length and last, the opening brace, several branch headers,
// and the final return are not present in this text - confirm them against the full file.
ProcessUTF8Portion ( XMLParserAdapter * xmlParser,
969
const XMP_Uns8 * buffer,
973
const XMP_Uns8 * bufEnd = buffer + length;
975
const XMP_Uns8 * spanStart = buffer;
976
const XMP_Uns8 * spanEnd;
978
for ( spanEnd = spanStart; spanEnd < bufEnd; ++spanEnd ) {
980
if ( (0x20 <= *spanEnd) && (*spanEnd <= 0x7E) && (*spanEnd != '&') ) continue; // A regular ASCII character.
982
if ( *spanEnd >= 0x80 ) {
984
// See if this is a multi-byte UTF-8 sequence, or a Latin-1 character to replace.
986
int uniLen = CountUTF8 ( spanEnd, bufEnd );
990
// A valid UTF-8 character, keep it as-is.
991
spanEnd += uniLen - 1; // ! The loop increment will put back the +1.
993
} else if ( (uniLen < 0) && (! last) ) {
995
// Have a partial UTF-8 character at the end of the buffer and more input coming.
// Forward the span collected so far and report how many bytes were consumed.
996
xmlParser->ParseBuffer ( spanStart, (spanEnd - spanStart), false );
997
return (spanEnd - buffer);
1001
// Not a valid UTF-8 sequence. Replace the first byte with the Latin-1 equivalent.
1002
xmlParser->ParseBuffer ( spanStart, (spanEnd - spanStart), false );
// The replacement table is indexed from 0 for byte 0x80.
1003
const char * replacement = kReplaceLatin1 [ *spanEnd - 0x80 ];
1004
xmlParser->ParseBuffer ( replacement, strlen ( replacement ), false );
1005
spanStart = spanEnd + 1; // ! The loop increment will do "spanEnd = spanStart".
1009
} else if ( (*spanEnd < 0x20) || (*spanEnd == 0x7F) ) {
1011
// Replace ASCII controls other than tab, LF, and CR with a space.
1013
if ( (*spanEnd == kTab) || (*spanEnd == kLF) || (*spanEnd == kCR) ) continue;
1015
xmlParser->ParseBuffer ( spanStart, (spanEnd - spanStart), false );
1016
xmlParser->ParseBuffer ( " ", 1, false );
1017
spanStart = spanEnd + 1; // ! The loop increment will do "spanEnd = spanStart".
1021
// See if this is a numeric escape sequence for a prohibited ASCII control.
1023
XMP_Assert ( *spanEnd == '&' );
1024
int escLen = CountControlEscape ( spanEnd, bufEnd );
1028
// Have a partial numeric escape in this buffer, wait for more input.
1029
if ( last ) continue; // No more buffers, not an escape, absorb as normal input.
1030
xmlParser->ParseBuffer ( spanStart, (spanEnd - spanStart), false );
1031
return (spanEnd - buffer);
1033
} else if ( escLen > 0 ) {
1035
// Have a complete numeric escape to replace.
// The whole escape is dropped and a single space is sent in its place.
1036
xmlParser->ParseBuffer ( spanStart, (spanEnd - spanStart), false );
1037
xmlParser->ParseBuffer ( " ", 1, false );
1038
spanStart = spanEnd + escLen;
1039
spanEnd = spanStart - 1; // ! The loop continuation will increment spanEnd!
1047
XMP_Assert ( spanEnd == bufEnd );
1049
// Forward whatever follows the last replacement.
if ( spanStart < bufEnd ) xmlParser->ParseBuffer ( spanStart, (spanEnd - spanStart), false );
// On the final buffer, send one space with the third ParseBuffer argument set to true.
1050
if ( last ) xmlParser->ParseBuffer ( " ", 1, true );
1054
} // ProcessUTF8Portion
1057
// -------------------------------------------------------------------------------------------------
1061
// Although most clients will probably parse everything in one call, we have a buffered API model
1062
// and need to support even the extreme case of 1 byte at a time parsing. This is considerably
1063
// complicated by some special cases for 8-bit input. Because of this, the first thing we do is
1064
// determine whether the input is 8-bit, UTF-16, or UTF-32.
1066
// Both the 8-bit special cases and the encoding determination are easier to do with 8 bytes or more
1067
// of input. The XMLParserAdapter class has a pending-input buffer for this. At the start of parsing
1068
// we (might) try to fill this buffer before determining the input character encoding. After that,
1069
// we (might) use this buffer with the current input to simplify the logic in ProcessUTF8Portion. The
1070
// "(might)" part means that we don't actually use the pending-input buffer unless we have to. In
1071
// particular, the common case of single-buffer parsing won't use it.
1074
// Feed one client buffer to the XML parser and, on the last buffer, build the XMP tree from the
// parsed XML.
//   buffer  - the input bytes. A null pointer is rejected through XMP_Throw (kXMPErr_BadParam)
//             unless xmpSize is 0.
//   xmpSize - the byte count, or kXMP_UseNullTermination to use strlen(buffer).
//   options - kXMP_ParseMoreBuffers set means this is not the last call. The options are also
//             passed on to FindRootNode, ProcessRDF, and MoveExplicitAliases.
// NOTE(review): the return-type line, the braces, and several else/catch lines are not present in
// this text - confirm the control flow against the full file.
XMPMeta::ParseFromBuffer ( XMP_StringPtr buffer,
1075
XMP_StringLen xmpSize,
1076
XMP_OptionBits options )
1078
if ( (buffer == 0) && (xmpSize != 0) ) XMP_Throw ( "Null parse buffer", kXMPErr_BadParam );
1079
if ( xmpSize == kXMP_UseNullTermination ) xmpSize = strlen ( buffer );
1081
const bool lastClientCall = ((options & kXMP_ParseMoreBuffers) == 0); // *** Could use FlagIsSet & FlagIsClear macros.
1083
this->tree.ClearNode(); // Make sure the target XMP object is totally empty.
1085
// The parser adapter is created on the first call and kept in this->xmlParser between calls.
if ( this->xmlParser == 0 ) {
1086
if ( (xmpSize == 0) && lastClientCall ) return; // Tolerate empty parse. Expat complains if there are no XML elements.
1087
this->xmlParser = XMP_NewExpatAdapter();
1090
XMLParserAdapter& parser = *this->xmlParser;
1092
#if 0 // XMP_DebugBuild
1093
if ( parser.parseLog != 0 ) {
1094
char message [200]; // AUDIT: Using sizeof(message) below for snprintf length is safe.
1095
snprintf ( message, sizeof(message), "<!-- ParseFromBuffer, length = %d, options = %X%s -->", // AUDIT: See above.
1096
xmpSize, options, (lastClientCall ? " (last)" : "") );
1097
fwrite ( message, 1, strlen(message), parser.parseLog );
1098
fflush ( parser.parseLog );
1102
try { // Cleanup the tree and xmlParser if anything fails.
1104
// Determine the character encoding before doing any real parsing. This is needed to do the
1105
// 8-bit special processing.
1107
// A charEncoding of XMP_OptionBits(-1) means the encoding has not been determined yet.
if ( parser.charEncoding == XMP_OptionBits(-1) ) {
1109
if ( (parser.pendingCount == 0) && (xmpSize >= kXMLPendingInputMax) ) {
1111
// This ought to be the common case, the first buffer is big enough.
1112
parser.charEncoding = DetermineInputEncoding ( (XMP_Uns8*)buffer, xmpSize );
1116
// Try to fill the pendingInput buffer before calling DetermineInputEncoding.
1118
// Copy as much of the client buffer as fits, then step buffer and xmpSize past the copied bytes.
size_t pendingOverlap = kXMLPendingInputMax - parser.pendingCount;
1119
if ( pendingOverlap > xmpSize ) pendingOverlap = xmpSize;
1121
memcpy ( &parser.pendingInput[parser.pendingCount], buffer, pendingOverlap ); // AUDIT: Count is safe.
1122
buffer += pendingOverlap;
1123
xmpSize -= pendingOverlap;
1124
parser.pendingCount += pendingOverlap;
1126
// pendingInput is not full yet and more calls are coming, keep the copied bytes and wait.
if ( (! lastClientCall) && (parser.pendingCount < kXMLPendingInputMax) ) return;
1127
parser.charEncoding = DetermineInputEncoding ( parser.pendingInput, parser.pendingCount );
1129
#if Trace_ParsingHackery
1130
fprintf ( stderr, "XMP Character encoding is %d\n", parser.charEncoding );
1137
// We have the character encoding. Process UTF-16 and UTF-32 as is. UTF-8 needs special
1138
// handling to take care of things like ISO Latin-1 or unescaped ASCII controls.
1140
XMP_Assert ( parser.charEncoding != XMP_OptionBits(-1) );
1142
if ( parser.charEncoding != kXMP_EncodeUTF8 ) {
1144
if ( parser.pendingCount > 0 ) {
1145
// Might have pendingInput from the above portion to determine the character encoding.
1146
parser.ParseBuffer ( parser.pendingInput, parser.pendingCount, false );
1148
parser.ParseBuffer ( buffer, xmpSize, lastClientCall );
1152
#if Trace_ParsingHackery
1153
fprintf ( stderr, "Parsing %d bytes @ %.8X, %s, %d pending, context: %.8s\n",
1154
xmpSize, buffer, (lastClientCall ? "last" : "not last"), parser.pendingCount, buffer );
1157
// The UTF-8 processing is a bit complex due to the need to tolerate ISO Latin-1 input.
1158
// This is done by scanning the input for byte sequences that are not valid UTF-8,
1159
// assuming they are Latin-1 characters in the range 0x80..0xFF. This requires saving a
1160
// pending input buffer to handle partial UTF-8 sequences at the end of a buffer.
1162
while ( parser.pendingCount > 0 ) {
1164
// We've got some leftover input, process it first then continue with the current
1165
// buffer. Try to fill the pendingInput buffer before parsing further. We use a loop
1166
// for weird edge cases like a 2 byte input buffer, using 1 byte for pendingInput,
1167
// then having a partial UTF-8 end and need to absorb more.
1169
size_t pendingOverlap = kXMLPendingInputMax - parser.pendingCount;
1170
if ( pendingOverlap > xmpSize ) pendingOverlap = xmpSize;
1172
memcpy ( &parser.pendingInput[parser.pendingCount], buffer, pendingOverlap ); // AUDIT: Count is safe.
1173
parser.pendingCount += pendingOverlap;
1174
buffer += pendingOverlap;
1175
xmpSize -= pendingOverlap;
1177
if ( (! lastClientCall) && (parser.pendingCount < kXMLPendingInputMax) ) return;
1178
size_t bytesDone = ProcessUTF8Portion ( &parser, parser.pendingInput, parser.pendingCount, lastClientCall );
1179
size_t bytesLeft = parser.pendingCount - bytesDone;
1181
#if Trace_ParsingHackery
1182
fprintf ( stderr, " ProcessUTF8Portion handled %d pending bytes\n", bytesDone );
1185
if ( bytesDone == parser.pendingCount ) {
1187
// Done with all of the pending input, move on to the current buffer.
1188
parser.pendingCount = 0;
1190
} else if ( bytesLeft <= pendingOverlap ) {
1192
// The leftover pending input all came from the current buffer. Exit this loop.
// Give those bytes back to the client buffer so the code after the loop sees them.
1193
buffer -= bytesLeft;
1194
xmpSize += bytesLeft;
1195
parser.pendingCount = 0;
1197
} else if ( xmpSize > 0 ) {
1199
// Pull more of the current buffer into the pending input and try again.
1200
// Backup by this pass's overlap so the loop entry code runs OK.
1201
parser.pendingCount -= pendingOverlap;
1202
buffer -= pendingOverlap;
1203
xmpSize += pendingOverlap;
1207
// There is no more of the current buffer. Wait for more. Partial sequences at
1208
// the end of the last buffer should be treated as Latin-1 by ProcessUTF8Portion.
1209
XMP_Assert ( ! lastClientCall );
1210
parser.pendingCount = bytesLeft;
// NOTE(review): source and destination are both inside pendingInput, and memcpy requires
// non-overlapping ranges. Confirm that bytesLeft <= bytesDone always holds, or use memmove.
1211
memcpy ( &parser.pendingInput[0], &parser.pendingInput[bytesDone], bytesLeft ); // AUDIT: Count is safe.
1218
// Done with the pending input, process the current buffer.
1220
size_t bytesDone = ProcessUTF8Portion ( &parser, (XMP_Uns8*)buffer, xmpSize, lastClientCall );
1222
#if Trace_ParsingHackery
1223
fprintf ( stderr, " ProcessUTF8Portion handled %d additional bytes\n", bytesDone );
1226
// Unprocessed bytes at the end are saved in pendingInput for the next call.
if ( bytesDone < xmpSize ) {
1228
XMP_Assert ( ! lastClientCall );
1229
size_t bytesLeft = xmpSize - bytesDone;
// The leftover must fit in pendingInput, anything larger is reported as an internal failure.
1230
if ( bytesLeft > kXMLPendingInputMax ) XMP_Throw ( "Parser bytesLeft too large", kXMPErr_InternalFailure );
1232
memcpy ( parser.pendingInput, &buffer[bytesDone], bytesLeft ); // AUDIT: Count is safe.
1233
parser.pendingCount = bytesLeft;
1234
return; // Wait for the next buffer.
1240
// Only the last client call turns the parsed XML into the XMP data model.
if ( lastClientCall ) {
1242
#if XMP_DebugBuild && DumpXMLParseTree
1243
if ( parser.parseLog == 0 ) parser.parseLog = stdout;
1244
DumpXMLTree ( parser.parseLog, parser.tree, 0 );
1247
const XML_Node * xmlRoot = FindRootNode ( this, *this->xmlParser, options );
1249
if ( xmlRoot != 0 ) {
1251
ProcessRDF ( &this->tree, *xmlRoot, options );
1252
NormalizeDCArrays ( &this->tree );
1253
if ( this->tree.options & kXMP_PropHasAliases ) MoveExplicitAliases ( &this->tree, options );
1254
TouchUpDataModel ( this );
1256
// Delete empty schema nodes. Do this last, other cleanup can make empty schema.
1257
size_t schemaNum = 0;
1258
while ( schemaNum < this->tree.children.size() ) {
1259
XMP_Node * currSchema = this->tree.children[schemaNum];
1260
if ( currSchema->children.size() > 0 ) {
1263
delete this->tree.children[schemaNum]; // ! Delete the schema node itself.
1264
this->tree.children.erase ( this->tree.children.begin() + schemaNum );
1270
// The parser adapter is released once the last buffer has been handled.
delete this->xmlParser;
1271
this->xmlParser = 0;
1277
// NOTE(review): presumably the failure handler for the try above (the catch line is not
// visible here). It releases the parser adapter and empties the tree.
delete this->xmlParser;
1278
this->xmlParser = 0;
1280
this->tree.ClearNode();
1285
} // ParseFromBuffer
1287
// =================================================================================================