1
// This file is part of BOINC.
2
// http://boinc.berkeley.edu
3
// Copyright (C) 2008 University of California
5
// BOINC is free software; you can redistribute it and/or modify it
6
// under the terms of the GNU Lesser General Public License
7
// as published by the Free Software Foundation,
8
// either version 3 of the License, or (at your option) any later version.
10
// BOINC is distributed in the hope that it will be useful,
11
// but WITHOUT ANY WARRANTY; without even the implied warranty of
12
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13
// See the GNU Lesser General Public License for more details.
15
// You should have received a copy of the GNU Lesser General Public License
16
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
18
// A very crude interface for parsing XML files;
19
// assumes all elements are either single-line or
20
// have start and end tags on separate lines.
21
// This is meant to be used ONLY for parsing XML files produced
22
// by the BOINC scheduling server or client.
23
// Could replace this with a more general parser.
25
#if defined(_WIN32) && !defined(__STDWX_H__) && !defined(_BOINC_WIN_) && !defined(_AFX_STDAFX_H_)
26
#include "boinc_win.h"
41
#include "error_numbers.h"
46
#include "boinc_fcgi.h"
53
// Parse a boolean; tag is of form "foobar"
54
// Accept either <foobar/> or <foobar>0|1</foobar>
56
bool parse_bool(const char* buf, const char* tag, bool& result) {
57
char single_tag[256], start_tag[256];
60
sprintf(single_tag, "<%s/>", tag);
61
if (match_tag(buf, single_tag)) {
65
sprintf(start_tag, "<%s>", tag);
66
if (parse_int(buf, start_tag, x)) {
73
// parse a string of the form ...<tag attrs>string</tag>...;
74
// returns the "string" part.
75
// Does XML unescaping (e.g. replaces &lt; with <)
76
// "string" may not include '<'
77
// Strips white space from ends.
78
// Use "<tag", not "<tag>", if there might be attributes
80
bool parse_str(const char* buf, const char* tag, char* dest, int destlen) {
90
const char* q = strchr(p, '<');
93
if (len >= destlen) len = destlen-1;
94
memcpy(tempbuf, p, len);
96
strip_whitespace(tempbuf);
97
xml_unescape(tempbuf, dest, destlen);
101
bool parse_str(const char* buf, const char* tag, string& dest) {
103
if (!parse_str(buf, tag, tempbuf, 1024)) return false;
108
// parse a string of the form 'xxx name="value" xxx';
109
// returns value in dest
111
void parse_attr(const char* buf, const char* name, char* dest, int len) {
116
p = strstr(buf, name);
120
q = strchr(p+1, '"');
122
if (len > q-p) len = (int)(q-p);
123
strlcpy(dest, p+1, len);
126
int copy_stream(FILE* in, FILE* out) {
130
n = (int)fread(buf, 1, 1024, in);
131
m = (int)fwrite(buf, 1, n, out);
132
if (m != n) return ERR_FWRITE;
138
// append to a malloc'd string
140
int strcatdup(char*& p, char* buf) {
141
p = (char*)realloc(p, strlen(p) + strlen(buf)+1);
149
// Copy from a file to a malloc'd string until the end tag is reached
150
// Does NOT copy the start and end tags.
152
int dup_element_contents(FILE* in, const char* end_tag, char** pp) {
154
int bufsize = 4000000;
155
int nused=0; // not counting ending NULL
156
char* buf = (char*)malloc(bufsize);
158
// Start with a big buffer.
159
// When done, copy to an exact-size buffer
161
while (fgets(line, 256, in)) {
162
if (strstr(line, end_tag)) {
163
*pp = (char*)malloc(nused+1);
168
int n = strlen(line);
169
if (nused + n >= bufsize) {
171
buf = (char*)realloc(buf, bufsize);
173
strcpy(buf+nused, line);
177
return ERR_XML_PARSE;
180
int dup_element(FILE* in, const char* tag_name, char** pp) {
181
char buf[256], end_tag[256];
184
sprintf(buf, "<%s>\n", tag_name);
185
sprintf(end_tag, "</%s>", tag_name);
187
char* p = strdup(buf);
188
while (fgets(buf, 256, in)) {
189
if (strstr(buf, end_tag)) {
190
sprintf(buf, "</%s>\n", tag_name);
191
retval = strcatdup(p, buf);
192
if (retval) return retval;
196
retval = strcatdup(p, buf);
197
if (retval) return retval;
199
return ERR_XML_PARSE;
202
// copy from a file to a caller-supplied fixed-size buffer
204
int copy_element_contents(FILE* in, const char* end_tag, char* p, int len) {
209
while (fgets(buf, 256, in)) {
210
if (strstr(buf, end_tag)) {
213
n = (int)strlen(buf);
214
if (n >= len-1) return ERR_XML_PARSE;
218
return ERR_XML_PARSE;
221
int copy_element_contents(FILE* in, const char* end_tag, string& str) {
225
while (fgets(buf, 256, in)) {
226
if (strstr(buf, end_tag)) {
231
return ERR_XML_PARSE;
234
// replace XML element contents (element must be present)
236
void replace_element_contents(
237
char* buf, const char* start, const char* end, const char* replacement
239
char temp[4096], *p, *q;
241
p = strstr(buf, start);
244
strlcpy(temp, q, sizeof(temp));
245
strcpy(p, replacement);
249
// if the string contains a substring of the form X...Y,
250
// remove the first such substring (X is "start", Y is "end").
251
bool remove_element(char* buf, const char* start, const char* end) {
253
p = strstr(buf, start);
254
if (!p) return false;
255
q = strstr(p+strlen(start), end);
256
if (!q) return false;
257
strcpy(p, q+strlen(end));
261
// replace a substring. Do at most one instance.
263
bool str_replace(char* str, const char* substr, const char* replacement) {
266
p = strstr(str, substr);
267
if (!p) return false;
268
int n = (int)strlen(substr);
270
strcpy(p, replacement);
275
// if the given XML has an element of the form
276
// <venue name="venue_name">
279
// then return the contents of that element.
280
// Otherwise strip out all <venue> elements
282
void extract_venue(const char* in, const char* venue_name, char* out) {
286
sprintf(buf, "<venue name=\"%s\">", venue_name);
289
// prefs contain the specified venue
293
wp = strstr(out, "</venue");
296
// prefs don't contain the specified venue
301
p = strstr(q, "<venue");
306
strncat(out, q, p-q);
307
q = strstr(p, "</venue>");
309
q += strlen("</venue>");
314
// copy a line from the given string.
315
// Similar to fgets(), but reads from a string instead of a file
317
char* sgets(char* buf, int len, char*& in) {
320
p = strstr(in, "\n");
323
strlcpy(buf, in, len);
329
// NOTE: these used to take std::string instead of char* args.
330
// But this performed poorly.
332
// NOTE: output buffer should be 6X size of input
334
void xml_escape(const char* in, char* out, int len) {
341
x &= 0xff; // just in case
345
} else if (x == '&') {
349
sprintf(buf, "&#%d;", x);
357
sprintf(buf, "&#%d;", x);
365
if (p > out + len - 8) break;
370
// output buffer need not be larger than input
372
void xml_unescape(const char* in, char* out, int len) {
375
if (*in != '&') { // avoid strncmp's if possible
377
} else if (!strncmp(in, "<", 4)) {
380
} else if (!strncmp(in, "&", 5)) {
383
} else if (!strncmp(in, "&#", 2)) {
387
in = strchr(in, ';');
392
if (p > out + len - 2) break;
397
// we got an unrecognized line.
398
// If it has two <'s (e.g. <foo>xx</foo>) return 0.
399
// If it's of the form <foo/> return 0.
400
// If it's of the form <foo> then scan for </foo> and return 0.
401
// Otherwise return ERR_XML_PARSE
403
int skip_unrecognized(char* buf, MIOFILE& fin) {
404
char* p, *q, buf2[256];
405
std::string close_tag;
407
p = strchr(buf, '<');
409
return ERR_XML_PARSE;
411
if (strchr(p+1, '<')) {
414
q = strchr(p+1, '>');
416
return ERR_XML_PARSE;
418
if (q[-1] == '/') return 0;
420
close_tag = string("</") + string(p+1) + string(">");
421
while (fin.fgets(buf2, 256)) {
422
if (strstr(buf2, close_tag.c_str())) {
427
return ERR_XML_PARSE;
430
XML_PARSER::XML_PARSER(MIOFILE* _f) {
434
// Read until a non-whitespace char is found.
435
// Return the char in the reference param
436
// Return true iff reached EOF
438
bool XML_PARSER::scan_nonws(int& first_char) {
442
if (c == EOF) return true;
443
if (isspace(c)) continue;
449
int XML_PARSER::scan_comment() {
454
if (c == EOF) return 2;
457
if (strstr(buf, "-->")) {
460
if (strlen(buf) > 32) {
467
// we just read a <; read until we find a >,
468
// and copy intervening text to buf.
471
// 1 if got a comment (ignore)
474
int XML_PARSER::scan_tag(
475
char* tag_buf, int tag_len, char* attr_buf, int attr_len
478
char* buf_start = tag_buf;
479
bool found_space = false;
480
for (int i=0; ; i++) {
482
if (c == EOF) return 2;
485
if (attr_buf) *attr_buf = 0;
496
if (found_space && attr_buf) {
497
if (--attr_len > 0) {
507
// check for comment start
509
if (i==2 && !strncmp(buf_start, "!--", 3)) {
510
return scan_comment();
515
// Read and copy text to buf; stop when a < is found;
516
// ungetc() that < so it is read again by the next scan
517
// Return true iff reached EOF
519
bool XML_PARSER::copy_until_tag(char* buf, int len) {
523
if (c == EOF) return true;
535
// Scan something, either tag or text.
536
// Strip whitespace at start and end.
537
// Return true iff reached EOF
539
bool XML_PARSER::get(char* buf, int len, bool& is_tag, char* attr_buf, int attr_len) {
545
if (eof) return true;
547
int retval = scan_tag(buf, len, attr_buf, attr_len);
548
if (retval == 2) return true;
549
if (retval == 1) continue;
553
eof = copy_until_tag(buf+1, len-1);
554
if (eof) return true;
557
strip_whitespace(buf);
562
// We just parsed "parsed_tag".
563
// If it matches "start_tag", and is followed by a string
564
// and by the matching close tag, return the string in "buf",
567
bool XML_PARSER::parse_str(
568
char* parsed_tag, const char* start_tag, char* buf, int len
571
char end_tag[256], tag[256], tmp[64000];
573
// handle the archaic form <tag/>, which means empty string
575
strcpy(tag, start_tag);
577
if (!strcmp(parsed_tag, tag)) {
582
// check for start tag
584
if (strcmp(parsed_tag, start_tag)) return false;
587
strcpy(end_tag+1, start_tag);
589
// get text after start tag
591
eof = get(tmp, 64000, is_tag);
592
if (eof) return false;
594
// if it's the end tag, return empty string
597
if (strcmp(tmp, end_tag)) {
605
eof = get(tag, sizeof(tag), is_tag);
606
if (eof) return false;
607
if (!is_tag) return false;
608
if (strcmp(tag, end_tag)) return false;
609
strlcpy(buf, tmp, len);
613
bool XML_PARSER::parse_string(
614
char* parsed_tag, const char* start_tag, string& str
617
bool flag = parse_str(parsed_tag, start_tag, buf, sizeof(buf));
618
if (!flag) return false;
623
// Same, for integers
625
bool XML_PARSER::parse_int(char* parsed_tag, const char* start_tag, int& i) {
628
char end_tag[256], tag[256];
630
if (strcmp(parsed_tag, start_tag)) return false;
633
strcpy(end_tag+1, start_tag);
635
eof = get(buf, sizeof(buf), is_tag);
636
if (eof) return false;
638
if (!strcmp(buf, end_tag)) {
639
i = 0; // treat <foo></foo> as <foo>0</foo>
645
int val = strtol(buf, &end, 0);
646
if (errno == ERANGE) return false;
647
if (end != buf+strlen(buf)) return false;
649
eof = get(tag, sizeof(tag), is_tag);
650
if (eof) return false;
651
if (!is_tag) return false;
652
if (strcmp(tag, end_tag)) return false;
659
bool XML_PARSER::parse_double(char* parsed_tag, const char* start_tag, double& x) {
662
char end_tag[256], tag[256];
664
if (strcmp(parsed_tag, start_tag)) return false;
667
strcpy(end_tag+1, start_tag);
669
eof = get(buf, sizeof(buf), is_tag);
670
if (eof) return false;
672
if (!strcmp(buf, end_tag)) {
673
x = 0; // treat <foo></foo> as <foo>0</foo>
679
double val = strtod(buf, &end);
680
if (end != buf+strlen(buf)) return false;
682
eof = get(tag, sizeof(tag), is_tag);
683
if (eof) return false;
684
if (!is_tag) return false;
685
if (strcmp(tag, end_tag)) return false;
692
bool XML_PARSER::parse_bool(char* parsed_tag, const char* start_tag, bool& b) {
695
char end_tag[256], tag[256];
697
// handle the archaic form <tag/>, which means true
699
strcpy(tag, start_tag);
701
if (!strcmp(parsed_tag, tag)) {
706
// otherwise look for something of the form <tag>int</tag>
708
if (strcmp(parsed_tag, start_tag)) return false;
710
eof = get(buf, sizeof(buf), is_tag);
711
if (eof) return false;
712
if (is_tag) return false;
713
bool val = (strtol(buf, &end, 0) != 0);
714
if (end != buf+strlen(buf)) return false;
717
strcpy(end_tag+1, start_tag);
718
eof = get(tag, sizeof(tag), is_tag);
719
if (eof) return false;
720
if (!is_tag) return false;
721
if (strcmp(tag, end_tag)) return false;
726
// parse a start tag (optionally preceded by <?xml>)
728
bool XML_PARSER::parse_start(const char* start_tag) {
732
eof = get(tag, sizeof(tag), is_tag);
733
if (eof || !is_tag ) {
736
if (strstr(tag, "?xml")) {
737
eof = get(tag, sizeof(tag), is_tag);
738
if (eof || !is_tag ) {
742
if (strcmp(tag, start_tag)) {
748
// copy everything up to (but not including) the given end tag.
749
// The copied text may include XML tags.
750
// strips whitespace.
752
int XML_PARSER::element_contents(const char* end_tag, char* buf, int buflen) {
757
retval = ERR_XML_PARSE;
762
retval = ERR_XML_PARSE;
767
char* p = strstr(buf, end_tag);
774
strip_whitespace(buf);
778
// We got an unexpected tag.
779
// If it's an end tag, do nothing.
780
// Otherwise skip until the end tag, if any
782
void XML_PARSER::skip_unexpected(
783
const char* start_tag, bool verbose, const char* where
785
char tag[256], end_tag[256];
789
fprintf(stderr, "Unrecognized XML in %s: %s\n", where, start_tag);
791
if (strchr(start_tag, '/')) return;
792
sprintf(end_tag, "/%s", start_tag);
793
while (!get(tag, sizeof(tag), is_tag)) {
795
fprintf(stderr, "Skipping: %s\n", tag);
797
if (!is_tag) continue;
798
if (!strcmp(tag, end_tag)) return;
799
skip_unexpected(tag, verbose, where);
803
// sample use is shown below
806
void parse(FILE* f) {
816
if (!xp.parse_start("blah")) {
817
printf("missing start tag\n");
820
while (!xp.get(tag, sizeof(tag), is_tag)) {
822
printf("unexpected text: %s\n", tag);
825
if (!strcmp(tag, "/blah")) {
828
} else if (xp.parse_str(tag, "str", name, sizeof(name))) {
829
printf("got str: %s\n", name);
830
} else if (xp.parse_int(tag, "int", val)) {
831
printf("got int: %d\n", val);
832
} else if (xp.parse_double(tag, "double", x)) {
833
printf("got double: %f\n", x);
834
} else if (xp.parse_bool(tag, "bool", flag)) {
835
printf("got bool: %d\n", flag);
837
printf("unparsed tag: %s\n", tag);
838
xp.skip_unexpected(tag, true, "xml test");
841
printf("unexpected EOF\n");
845
FILE* f = fopen("foo.xml", "r");
849
... and run it against, e.g.:
851
<?xml version="1.0" encoding="ISO-8859-1" ?>
860
<double>6.555</double>
865
const char *BOINC_RCSID_3f3de9eb18 = "$Id: parse.cpp 16478 2008-11-11 23:07:36Z davea $";