4
* Building upon and improving the CodeMirror 2 XML parser
5
* @author: Dror BG (deebug.dev@gmail.com)
9
CodeMirror.defineMode("xmlpure", function(config, parserConfig) {
11
var STYLE_ERROR = "error";
12
var STYLE_INSTRUCTION = "comment";
13
var STYLE_COMMENT = "comment";
14
var STYLE_ELEMENT_NAME = "tag";
15
var STYLE_ATTRIBUTE = "attribute";
16
var STYLE_WORD = "string";
17
var STYLE_TEXT = "atom";
18
var STYLE_ENTITIES = "string";
20
var TAG_INSTRUCTION = "!instruction";
21
var TAG_CDATA = "!cdata";
22
var TAG_COMMENT = "!comment";
23
var TAG_TEXT = "!text";
33
var indentUnit = config.indentUnit;
35
///////////////////////////////////////////////////////////////////////////
38
// chain a parser to another parser
39
function chain(stream, state, parser) {
40
state.tokenize = parser;
41
return parser(stream, state);
44
// parse a block (comment, CDATA or text)
45
function inBlock(style, terminator, nextTokenize) {
46
return function(stream, state) {
47
while (!stream.eol()) {
48
if (stream.match(terminator)) {
50
state.tokenize = nextTokenize;
59
// go down a level in the document
60
// (hint: look at who calls this function to know what the contexts are)
61
function pushContext(state, tagName) {
62
// a context is left unindented when its tag is listed in doNotIndent, or when the
// enclosing context already carries the noIndent flag (the property indent() checks)
var noIndent = doNotIndent.hasOwnProperty(tagName) || (state.context && state.context.noIndent);
66
indent: state.context ? state.context.indent + indentUnit : 0,
67
lineNumber: state.lineNumber,
68
indented: state.indented,
71
state.context = newContext;
74
// go up a level in the document
75
function popContext(state) {
77
var oldContext = state.context;
78
state.context = oldContext.prev;
82
// we shouldn't be here - it means we didn't have a context to pop
86
// return true if the current token is separated from the tokens before it
87
// which means either this is the start of the line, or there is at least
88
// one space or tab character behind the token
89
// otherwise returns false
90
function isTokenSeparated(stream) {
91
return stream.sol() ||
92
stream.string.charAt(stream.start - 1) == " " ||
93
stream.string.charAt(stream.start - 1) == "\t";
96
///////////////////////////////////////////////////////////////////////////
99
// an XML document can contain:
100
// - a single declaration (if defined, it must be the very first line)
101
// - exactly one root element
102
// @todo try to actually limit the number of root elements to 1
103
// - zero or more comments
104
function parseDocument(stream, state) {
105
if(stream.eat("<")) {
106
if(stream.eat("?")) {
107
// processing instruction
108
pushContext(state, TAG_INSTRUCTION);
109
state.tokenize = parseProcessingInstructionStartTag;
110
return STYLE_INSTRUCTION;
111
} else if(stream.match("!--")) {
112
// new context: comment
113
pushContext(state, TAG_COMMENT);
114
return chain(stream, state, inBlock(STYLE_COMMENT, "-->", parseDocument));
115
} else if(stream.eatSpace() || stream.eol() ) {
120
state.tokenize = parseElementTagName;
121
return STYLE_ELEMENT_NAME;
130
///////////////////////////////////////////////////////////////////////////
131
// context: XML element start-tag or end-tag
133
// - element start-tag can contain attributes
134
// - element start-tag may self-close (or start an element block if it doesn't)
135
// - element end-tag can contain only the tag name
136
function parseElementTagName(stream, state) {
137
// get the name of the tag
138
var startPos = stream.pos;
139
if(stream.match(/^[a-zA-Z_:][-a-zA-Z0-9_:.]*/)) {
141
var tagName = stream.string.substring(startPos, stream.pos);
142
pushContext(state, tagName);
143
state.tokenize = parseElement;
144
return STYLE_ELEMENT_NAME;
145
} else if(stream.match(/^\/[a-zA-Z_:][-a-zA-Z0-9_:.]*( )*>/)) {
147
var endTagName = stream.string.substring(startPos + 1, stream.pos - 1).trim();
148
var oldContext = popContext(state);
149
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
150
if(oldContext == null || endTagName != oldContext.tagName) {
151
// the start and end tag names should match - error
154
return STYLE_ELEMENT_NAME;
156
// no tag name - error
157
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
158
stream.eatWhile(/[^>]/);
167
function parseElement(stream, state) {
168
if(stream.match(/^\/>/)) {
171
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
172
return STYLE_ELEMENT_NAME;
173
} else if(stream.eat(/^>/)) {
174
state.tokenize = parseElementBlock;
175
return STYLE_ELEMENT_NAME;
176
} else if(isTokenSeparated(stream) && stream.match(/^[a-zA-Z_:][-a-zA-Z0-9_:.]*( )*=/)) {
178
state.tokenize = parseAttribute;
179
return STYLE_ATTRIBUTE;
182
// no other options - this is an error
183
// resume in the enclosing element block while a context is still open, as the
// error path of parseElementTagName does; only fall back to document level
// when no context remains
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
184
stream.eatWhile(/[^>]/);
189
///////////////////////////////////////////////////////////////////////////
190
// context: attribute
192
// attribute values may contain everything, except:
193
// - the ending quote (with ' or ") - this marks the end of the value
194
// - the character "<" - should never appear
195
// - ampersand ("&") - unless it starts a reference: a string that ends with a semi-colon (";")
196
// ---> note: this parser is lax in what may be put into a reference string,
197
// ---> consult http://www.w3.org/TR/REC-xml/#NT-Reference if you want to make it tighter
198
function parseAttribute(stream, state) {
199
var quote = stream.next();
200
if(quote != "\"" && quote != "'") {
201
// attribute must be quoted
203
state.tokenize = parseElement;
207
state.tokParams.quote = quote;
208
state.tokenize = parseAttributeValue;
212
// @todo: find out whether this attribute value spans multiple lines,
213
// and if so, push a context for it in order not to indent it
214
// (or something of the sort..)
215
function parseAttributeValue(stream, state) {
217
while(!stream.eol()) {
219
if(ch == state.tokParams.quote) {
221
state.tokenize = parseElement;
223
} else if(ch == "<") {
224
// can't have less-than signs in an attribute value, ever
226
state.tokenize = parseElement;
228
} else if(ch == "&") {
229
// reference - look for a semi-colon, or return error if none found
232
// make sure that semi-colon isn't right after the ampersand
235
state.tokenize = parseElement;
239
// make sure no less-than characters slipped in
240
while(!stream.eol() && ch != ";") {
242
// can't have less-than signs in an attribute value, ever
244
state.tokenize = parseElement;
249
if(stream.eol() && ch != ";") {
250
// no semi-colon found before the end of the line - error
252
state.tokenize = parseElement;
258
// attribute value continues to next line
262
///////////////////////////////////////////////////////////////////////////
263
// context: element block
265
// a block can contain:
270
function parseElementBlock(stream, state) {
271
if(stream.eat("<")) {
272
if(stream.match("?")) {
273
pushContext(state, TAG_INSTRUCTION);
274
state.tokenize = parseProcessingInstructionStartTag;
275
return STYLE_INSTRUCTION;
276
} else if(stream.match("!--")) {
277
// new context: comment
278
pushContext(state, TAG_COMMENT);
279
return chain(stream, state, inBlock(STYLE_COMMENT, "-->",
280
state.context == null ? parseDocument : parseElementBlock));
281
} else if(stream.match("![CDATA[")) {
282
// new context: CDATA section
283
pushContext(state, TAG_CDATA);
284
return chain(stream, state, inBlock(STYLE_TEXT, "]]>",
285
state.context == null ? parseDocument : parseElementBlock));
286
} else if(stream.eatSpace() || stream.eol() ) {
291
state.tokenize = parseElementTagName;
292
return STYLE_ELEMENT_NAME;
294
} else if(stream.eat("&")) {
295
stream.eatWhile(/[^;]/);
297
return STYLE_ENTITIES;
300
pushContext(state, TAG_TEXT);
301
state.tokenize = parseText;
305
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
310
function parseText(stream, state) {
311
stream.eatWhile(/[^<]/);
313
// we cannot possibly be in the document context,
314
// just inside an element block
316
state.tokenize = parseElementBlock;
321
///////////////////////////////////////////////////////////////////////////
322
// context: XML processing instructions
324
// XML processing instructions (PIs) allow documents to contain instructions for applications.
325
// PI format: <?name data?>
326
// - 'name' can be anything other than 'xml' (case-insensitive)
327
// - 'data' can be anything which doesn't contain '?>'
328
// XML declaration is a special PI (see XML declaration context below)
329
function parseProcessingInstructionStartTag(stream, state) {
330
if(stream.match("xml", true, true)) {
332
if(state.lineNumber > 1 || stream.pos > 5) {
333
state.tokenize = parseDocument;
337
state.tokenize = parseDeclarationVersion;
338
return STYLE_INSTRUCTION;
342
// regular processing instruction
343
if(isTokenSeparated(stream) || stream.match("?>")) {
344
// we have a space after the start-tag, or nothing but the end-tag
345
// either way - error!
346
state.tokenize = parseDocument;
351
state.tokenize = parseProcessingInstructionBody;
352
return STYLE_INSTRUCTION;
355
function parseProcessingInstructionBody(stream, state) {
356
stream.eatWhile(/[^?]/);
357
if(stream.eat("?")) {
358
if(stream.eat(">")) {
360
state.tokenize = state.context == null ? parseDocument : parseElementBlock;
363
return STYLE_INSTRUCTION;
367
///////////////////////////////////////////////////////////////////////////
368
// context: XML declaration
370
// XML declaration is of the following format:
371
// <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
372
// - must start at the first character of the first line
373
// - may span multiple lines
374
// - must include 'version'
375
// - may include 'encoding' and 'standalone' (in that order after 'version')
376
// - attribute names must be lowercase
377
// - cannot contain anything else on the line
378
function parseDeclarationVersion(stream, state) {
379
state.tokenize = parseDeclarationEncoding;
381
if(isTokenSeparated(stream) && stream.match(/^version( )*=( )*"([a-zA-Z0-9_.:]|\-)+"/)) {
382
return STYLE_INSTRUCTION;
388
function parseDeclarationEncoding(stream, state) {
389
state.tokenize = parseDeclarationStandalone;
391
if(isTokenSeparated(stream) && stream.match(/^encoding( )*=( )*"[A-Za-z]([A-Za-z0-9._]|\-)*"/)) {
392
return STYLE_INSTRUCTION;
397
function parseDeclarationStandalone(stream, state) {
398
state.tokenize = parseDeclarationEndTag;
400
if(isTokenSeparated(stream) && stream.match(/^standalone( )*=( )*"(yes|no)"/)) {
401
return STYLE_INSTRUCTION;
406
function parseDeclarationEndTag(stream, state) {
407
state.tokenize = parseDocument;
409
if(stream.match("?>") && stream.eol()) {
411
return STYLE_INSTRUCTION;
417
///////////////////////////////////////////////////////////////////////////
422
startState: function() {
424
tokenize: parseDocument,
433
token: function(stream, state) {
435
// initialize a new line
437
state.lineError = false;
438
state.indented = stream.indentation();
441
// eat all (the spaces) you can
442
if(stream.eatSpace()) return null;
444
// run the current tokenize function, according to the state
445
var style = state.tokenize(stream, state);
447
// is there an error somewhere in the line?
448
state.lineError = (state.lineError || style == "error");
453
blankLine: function(state) {
454
// blank lines are lines too!
456
state.lineError = false;
459
indent: function(state, textAfter) {
461
if(state.context.noIndent == true) {
462
// do not indent - no return value at all
465
if(textAfter.match(/^<\/.*/)) {
466
// end-tag - indent back to last context
467
return state.context.indent;
469
if(textAfter.match(/^<!\[CDATA\[/)) {
470
// a stand-alone CDATA start-tag - indent back to column 0
473
// indent to last context + regular indent unit
474
return state.context.indent + indentUnit;
479
compareStates: function(a, b) {
480
if (a.indented != b.indented) return false;
481
for (var ca = a.context, cb = b.context; ; ca = ca.prev, cb = cb.prev) {
482
if (!ca || !cb) return ca == cb;
483
if (ca.tagName != cb.tagName) return false;
489
// the mode is registered under the name "xmlpure", so the MIME type must map to that name
CodeMirror.defineMIME("application/xml", "xmlpure");
490
// the mode is registered under the name "xmlpure", so the MIME type must map to that name
CodeMirror.defineMIME("text/xml", "xmlpure");