slowparse.js | |
---|---|
Slowparse is a token stream parser for HTML and CSS text, recording regions of interest during the parse run and signaling any errors it detects, accompanied by the relevant regions in the text stream, to make debugging easy. Each error type is documented in the error specification. Slowparse also builds a DOM as it goes, attaching metadata to each node it builds that points to where the node came from in the original source. For more information on the rationale behind Slowparse, as well as its design goals, see the README. Implementation: Slowparse is effectively a finite state machine for HTML and CSS strings, and will switch between the HTML and CSS parsers while maintaining a single token stream. | var Slowparse = (function() { |
Character Entity Parsing: We currently only parse the most common named character entities. | var CHARACTER_ENTITY_REFS = {
lt: "<",
gt: ">",
quot: '"',
amp: "&"
};
|
This function does not currently replace numeric character entity
references (e.g., | function replaceEntityRefs(text) {
return text.replace(/&([A-Za-z]+);/g, function(ref, name) {
name = name.toLowerCase();
if (name in CHARACTER_ENTITY_REFS)
return CHARACTER_ENTITY_REFS[name];
return ref;
});
}
|
Errors
The first argument is the name of an error type, followed by
arbitrary positional arguments specific to that error type. Every
instance has a | function ParseError(type) {
this.name = "ParseError";
if (!(type in ParseErrorBuilders))
throw new Error("Unknown ParseError type: " + type);
var args = [];
for (var i = 1; i < arguments.length; i++)
args.push(arguments[i]);
var parseInfo = ParseErrorBuilders[type].apply(ParseErrorBuilders, args);
/* This may seem a weird way of setting an attribute, but we want
* to make the JSON serialize so the 'type' appears first, as it
* makes our documentation read better. */
parseInfo = ParseErrorBuilders._combine({
type: type
}, parseInfo);
this.message = type;
this.parseInfo = parseInfo;
}
ParseError.prototype = Error.prototype; |
Each public factory function returns a | var ParseErrorBuilders = {
/* Create a new object that has the properties of both arguments
* and return it. */
_combine: function(a, b) {
var obj = {}, name;
for (name in a) {
obj[name] = a[name];
}
for (name in b) {
obj[name] = b[name];
}
return obj;
}, |
These are HTML errors. | UNCLOSED_TAG: function(parser) {
return {
openTag: this._combine({
name: parser.domBuilder.currentNode.nodeName.toLowerCase()
}, parser.domBuilder.currentNode.parseInfo.openTag)
};
},
INVALID_TAG_NAME: function(tagName, token) {
return {
openTag: this._combine({
name: tagName
}, token.interval)
};
},
UNEXPECTED_CLOSE_TAG: function(parser, closeTagName, token) {
return {
closeTag: this._combine({
name: closeTagName
}, token.interval)
};
},
MISMATCHED_CLOSE_TAG: function(parser, openTagName, closeTagName, token) {
return {
openTag: this._combine({
name: openTagName
}, parser.domBuilder.currentNode.parseInfo.openTag),
closeTag: this._combine({
name: closeTagName
}, token.interval)
};
},
CLOSE_TAG_FOR_VOID_ELEMENT: function(parser, closeTagName, token) {
return {
closeTag: this._combine({
name: closeTagName
}, token.interval)
};
},
UNTERMINATED_COMMENT: function(token) {
return {
start: token.interval.start
};
},
UNTERMINATED_ATTR_VALUE: function(parser, nameTok) {
return {
openTag: this._combine({
name: parser.domBuilder.currentNode.nodeName.toLowerCase()
}, parser.domBuilder.currentNode.parseInfo.openTag),
attribute: {
name: {
value: nameTok.value,
start: nameTok.interval.start,
end: nameTok.interval.end
},
value: {
start: parser.stream.makeToken().interval.start
}
},
};
},
UNQUOTED_ATTR_VALUE: function(parser) {
var pos = parser.stream.pos;
if (!parser.stream.end())
pos = parser.stream.makeToken().interval.start;
return {
start: pos
};
},
UNTERMINATED_OPEN_TAG: function(parser) {
return {
openTag: {
start: parser.domBuilder.currentNode.parseInfo.openTag.start,
end: parser.stream.pos,
name: parser.domBuilder.currentNode.nodeName.toLowerCase()
}
};
},
SELF_CLOSING_NON_VOID_ELEMENT: function(parser, tagName) {
return {
name: tagName,
start: parser.domBuilder.currentNode.parseInfo.openTag.start,
end: parser.stream.makeToken().interval.end
};
},
UNTERMINATED_CLOSE_TAG: function(parser) {
var end = parser.stream.pos;
if (!parser.stream.end())
end = parser.stream.makeToken().interval.start;
return {
closeTag: {
name: parser.domBuilder.currentNode.nodeName.toLowerCase(),
start: parser.domBuilder.currentNode.parseInfo.closeTag.start,
end: end
}
};
}, |
These are CSS errors. | INVALID_CSS_PROPERTY_NAME: function(parser, start, end, property) {
return {
cssProperty: {
start: start,
end: end,
property: property
}
};
},
MISSING_CSS_SELECTOR: function(parser, start, end) {
return {
cssBlock: {
start: start,
end: end
}
};
},
UNFINISHED_CSS_SELECTOR: function(parser, start, end, selector) {
return {
cssSelector: {
start: start,
end: end,
selector: selector
}
};
},
MISSING_CSS_BLOCK_OPENER: function(parser, start, end, selector) {
return {
cssSelector: {
start: start,
end: end,
selector: selector
}
};
},
/* NOTE(review): this key is also defined earlier in this same object
 * literal with an identical body. In a non-strict object literal the
 * later definition is the one that ends up on the object, so this copy
 * is the effective builder. One of the two definitions is redundant. */
INVALID_CSS_PROPERTY_NAME: function(parser, start, end, property) {
/* Report the region of the rejected property name together with the
 * name itself; `parser` is not consulted. */
return {
cssProperty: {
start: start,
end: end,
property: property
}
};
},
MISSING_CSS_PROPERTY: function(parser, start, end, selector) {
return {
cssSelector: {
start: start,
end: end,
selector: selector
}
};
},
UNFINISHED_CSS_PROPERTY: function(parser, start, end, property) {
return {
cssProperty: {
start: start,
end: end,
property: property
}
};
},
MISSING_CSS_VALUE: function(parser, start, end, property) {
return {
cssProperty: {
start: start,
end: end,
property: property
}
};
},
UNFINISHED_CSS_VALUE: function(parser, start, end, value) {
return {
cssValue: {
start: start,
end: end,
value: value
}
};
},
MISSING_CSS_BLOCK_CLOSER: function(parser, start, end, value) {
return {
cssValue: {
start: start,
end: end,
value: value
}
};
},
UNCAUGHT_CSS_PARSE_ERROR: function(parser, start, end, msg) {
return {
error: {
start: start,
end: end,
msg: msg
}
};
},
UNTERMINATED_CSS_COMMENT: function(start) {
return {
start: start
};
}
};
|
Streams
| function Stream(text) {
/* Cursor over `text`: `pos` is the index of the next character to
 * read, and `tokenStart` is the index at which the token currently
 * being accumulated began. Both start at 0. */
this.text = text;
this.pos = 0;
this.tokenStart = 0;
}
Stream.prototype = { |
| peek: function() {
/* Return the character at the current position without consuming it
 * (undefined once the position is past the last character). */
return this.text[this.pos];
}, |
| next: function() {
/* Consume and return the current character. At end of input nothing
 * is consumed and the function falls through, returning undefined. */
if (!this.end())
return this.text[this.pos++];
}, |
| end: function() {
/* True when the position equals the length of the text. */
return (this.pos == this.text.length);
}, |
| eat: function(match) {
if (this.peek().match(match))
return this.next();
}, |
| eatWhile: function(matcher) {
/* Consume characters for as long as they match `matcher`.
 * Returns true/false (whether anything was consumed) as soon as a
 * non-matching character is met. If the stream ends while characters
 * are still matching, the loop exits without a return statement, so
 * the result is undefined even though characters were consumed.
 * NOTE(review): callers that test the result for truthiness see that
 * case as "nothing eaten" - confirm before changing the return value. */
var wereAnyEaten = false;
while (!this.end()) {
if (this.eat(matcher))
wereAnyEaten = true;
else
return wereAnyEaten;
}
}, |
| eatSpace: function() {
/* Consume a run of whitespace characters; the result is whatever
 * eatWhile returns for that run. */
return this.eatWhile(/[\s\n]/);
}, |
| eatCSSWhile: function(matcher) {
/* Like eatWhile, but when a consumed "/" is immediately followed by
 * "*" the whole block comment is consumed too, whatever it contains.
 * Returns true/false when a non-matching character is met; if the
 * stream ends first the loop exits without a return (undefined). */
var wereAnyEaten = false,
chr = '',
peek = '',
next = '';
while (!this.end()) {
chr = this.eat(matcher);
if (chr)
wereAnyEaten = true;
else
return wereAnyEaten;
if (chr === '/') {
peek = this.peek();
if (peek === '*') {
/* Block comment found. Gobble until resolved. */
/* Each pass skips to the next "*", consumes one character, then
 * reads the character after it; the scan stops when that character
 * is "/" or the stream runs out. */
while(next !== '/' && !this.end()) {
this.eatWhile(/[^*]/);
this.next();
next = this.next();
}
/* Reset so a later comment in the same call is scanned afresh. */
next = '';
}
}
}
}, |
| markTokenStart: function() {
/* Begin a new token at the current position. */
this.tokenStart = this.pos;
}, |
| markTokenStartAfterSpace: function() {
/* Skip any whitespace first, then begin a new token there. */
this.eatSpace();
this.markTokenStart();
}, |
| makeToken: function() {
if (this.pos == this.tokenStart)
return null;
var token = {
value: this.text.slice(this.tokenStart, this.pos),
interval: {
start: this.tokenStart,
end: this.pos
}
};
this.tokenStart = this.pos;
return token;
}, |
| match: function(string, consume, caseFold) {
var substring = this.text.slice(this.pos, this.pos + string.length);
if (caseFold) {
string = string.toLowerCase();
substring = substring.toLowerCase();
}
if (string == substring) {
if (consume)
this.pos += string.length;
return true;
}
return false;
}
}; |
CSS Parsing
| function CSSParser(stream, domBuilder) {
/* Keep references to the token stream to read from and to the DOM
 * builder handed in by the caller. */
this.stream = stream;
this.domBuilder = domBuilder;
}
CSSParser.prototype = { |
We keep a list of all currently valid CSS properties (CSS1-CSS3). This list does not contain vendor prefixes. | cssProperties: [
"alignment-adjust","alignment-baseline","animation","animation-delay",
"animation-direction","animation-duration","animation-iteration-count",
"animation-name","animation-play-state","animation-timing-function",
"appearance","azimuth","backface-visibility","background",
"background-attachment","background-clip","background-color",
"background-image","background-origin","background-position",
"background-repeat","background-size","baseline-shift","binding",
"bleed","bookmark-label","bookmark-level","bookmark-state",
"bookmark-target","border","border-bottom","border-bottom-color",
"border-bottom-left-radius","border-bottom-right-radius",
"border-bottom-style","border-bottom-width","border-collapse",
"border-color","border-image","border-image-outset",
"border-image-repeat","border-image-slice","border-image-source",
"border-image-width","border-left","border-left-color",
"border-left-style","border-left-width","border-radius","border-right",
"border-right-color","border-right-style","border-right-width",
"border-spacing","border-style","border-top","border-top-color",
"border-top-left-radius","border-top-right-radius","border-top-style",
"border-top-width","border-width","bottom","box-decoration-break",
"box-shadow","box-sizing","break-after","break-before","break-inside",
"caption-side","clear","clip","color","color-profile","column-count",
"column-fill","column-gap","column-rule","column-rule-color",
"column-rule-style","column-rule-width","column-span","column-width",
"columns","content","counter-increment","counter-reset","crop","cue",
"cue-after","cue-before","cursor","direction","display",
"dominant-baseline","drop-initial-after-adjust",
"drop-initial-after-align","drop-initial-before-adjust",
"drop-initial-before-align","drop-initial-size","drop-initial-value",
"elevation","empty-cells","filter","fit","fit-position","flex-align",
"flex-flow","flex-line-pack","flex-order","flex-pack","float","float-offset",
"font","font-family","font-size","font-size-adjust","font-stretch",
"font-style","font-variant","font-weight","grid-columns","grid-rows",
"hanging-punctuation","height","hyphenate-after","hyphenate-before",
"hyphenate-character","hyphenate-lines","hyphenate-resource","hyphens",
"icon","image-orientation","image-rendering","image-resolution",
"inline-box-align","left","letter-spacing","line-break","line-height",
"line-stacking","line-stacking-ruby","line-stacking-shift",
"line-stacking-strategy","list-style","list-style-image",
"list-style-position","list-style-type","margin","margin-bottom",
"margin-left","margin-right","margin-top","marker-offset","marks",
"marquee-direction","marquee-loop","marquee-play-count","marquee-speed",
"marquee-style","max-height","max-width","min-height","min-width",
"move-to","nav-down","nav-index","nav-left","nav-right","nav-up",
"opacity","orphans","outline","outline-color","outline-offset",
"outline-style","outline-width","overflow","overflow-style",
"overflow-wrap","overflow-x","overflow-y","padding","padding-bottom",
"padding-left","padding-right","padding-top","page","page-break-after",
"page-break-before","page-break-inside","page-policy","pause",
"pause-after","pause-before","perspective","perspective-origin",
"phonemes","pitch","pitch-range","play-during","position",
"presentation-level","punctuation-trim","quotes","rendering-intent",
"resize","rest","rest-after","rest-before","richness","right",
"rotation","rotation-point","ruby-align","ruby-overhang",
"ruby-position","ruby-span","src","size","speak","speak-header",
"speak-numeral","speak-punctuation","speech-rate","stress","string-set",
"tab-size","table-layout","target","target-name","target-new",
"target-position","text-align","text-align-last","text-decoration",
"text-decoration-color","text-decoration-line","text-decoration-skip",
"text-decoration-style","text-emphasis","text-emphasis-color",
"text-emphasis-position","text-emphasis-style","text-height",
"text-indent","text-justify","text-outline","text-shadow",
"text-space-collapse","text-transform","text-underline-position",
"text-wrap","top","transform","transform-origin","transform-style",
"transition","transition-delay","transition-duration",
"transition-property","transition-timing-function","unicode-bidi",
"vertical-align","visibility","voice-balance","voice-duration",
"voice-family","voice-pitch","voice-pitch-range","voice-rate",
"voice-stress","voice-volume","volume","white-space","widows","width",
"word-break","word-spacing","word-wrap","z-index"], |
This helper verifies that a specific string is a known CSS property.
We include vendor-prefixed known CSS properties, like | _knownCSSProperty: function(propertyName) {
propertyName = propertyName.replace(/^-.+?-/,'');
return this.cssProperties.indexOf(propertyName) > -1;
}, |
The CSS Master Parse Function: Here we process the token stream, which is assumed to have its pointer inside a CSS element, and try to parse the content inside it as CSS until we hit the end of the CSS element. Any parse errors along the way will result in a | parse: function() { |
We'll use some instance variables to keep track of our parse state: | |
| this.rules = [];
|
| this.comments = []; |
Parsing is based on finite states, and a call
to | var sliceStart = this.stream.pos;
this.stream.markTokenStartAfterSpace();
this._parseSelector();
var sliceEnd = this.stream.pos; |
If we get here, the CSS block has no errors,
and we report the start/end of the CSS block
in the stream, as well as the rules/comments
for the calling | var cssBlock = {
value: this.stream.text.slice(sliceStart, sliceEnd),
parseInfo: {
start: sliceStart,
end: sliceEnd,
rules: this.rules,
comments: this.comments
}
};
this.rules = null;
this.comments = null;
return cssBlock;
}, |
CSS Comment ParsingHere we record the position of comments in term in the instance's comment list, and return term with all its comments stripped. | stripComments: function(term, startPos) {
/* `term` is the text to clean and `startPos` is its offset in the
 * stream, so recorded comment positions are stream-relative.
 * Throws ParseError("UNTERMINATED_CSS_COMMENT") when a comment opener
 * has no matching closer inside `term`.
 * NOTE(review): `prev` and `next` are declared but never used. */
var pos,
last = term.length,
commentStart, commentEnd,
prev, next,
stripped = "";
for (pos=0; pos < last; pos++) {
if (term[pos] === '/' && pos<last-1 && term[pos+1] === '*') {
commentStart = startPos + pos;
/* Jump past the opener so that the two-character window
 * (pos-1, pos) checked below begins right after it. */
pos += 3;
while(pos < last-1 && term.substr(pos-1,2) !== "*/") {
pos++;
}
/* Ran out of text without the window ever holding the closer. */
if (pos >= last-1 && term.substr(pos-1,2) !== "*/")
throw new ParseError("UNTERMINATED_CSS_COMMENT", commentStart);
/* `pos` sits on the closing "/", so the end offset is one past it. */
commentEnd = startPos + pos + 1;
this.comments.push({start: commentStart, end: commentEnd});
} else {
stripped += term[pos];
}
}
return stripped;
}, |
CSS Selector ParsingA selector is a string, and terminates on There are a few characters in selectors that are an immediate error:
Note that we cannot flag | _parseSelector: function() { |
Depending on our state, we may be coming from having just parsed a rule. If that's the case, add it to our list of rules. | if (this.currentRule) {
this.rules.push(this.currentRule);
this.currentRule = null;
} |
Gobble all characters that could be part of the selector. | this.stream.eatCSSWhile(/[^\{;\}<]/);
var token = this.stream.makeToken(),
peek = this.stream.peek();
|
If there was nothing to select, we're either done, or an error occurred. | if (token === null) {
if (!this.stream.end() && this.stream.peek() === '<') {
return;
}
throw new ParseError("MISSING_CSS_SELECTOR", this, this.stream.pos-1,
this.stream.pos);
} |
If we get here, we have a selector string. | token.value = token.value.trim();
var selector = token.value,
selectorStart = token.interval.start,
selectorEnd = selectorStart + selector.length;
selector = this.stripComments(selector, selectorStart).trim();
if (selector === '') {
this._parseSelector();
return;
} |
Now we'll set up a ruleset object for this selector. | this.currentRule = {
selector: {
value: selector,
start: selectorStart,
end: selectorEnd
},
declarations: {
start: null,
end: null,
properties: []
}
}; |
Now we start to analyse whether we can continue, or whether we're in a terminal state, based on the next character in the stream. | if (this.stream.end() || peek === '<') {
throw new ParseError("UNFINISHED_CSS_SELECTOR", this, selectorStart,
selectorEnd, selector);
}
if (!this.stream.end()) {
var next = this.stream.next(),
errorMsg = "[_parseSelector] Expected {, }, ; or :, " +
"instead found " + next;
if (next === '{') { |
The only legal continuation after a selector is the opening
| this.currentRule.declarations.start = this.stream.pos-1;
this._parseDeclaration(selector, selectorStart);
} else if (next === ';' || next === '}') { |
Otherwise, this is a parse error; we should have seen | throw new ParseError("MISSING_CSS_BLOCK_OPENER", this,
selectorStart, selectorEnd, selector);
} else { |
We get here if an unexpected character was found. | throw new ParseError("UNCAUGHT_CSS_PARSE_ERROR", this,
token.interval.start, token.interval.end,
errorMsg);
}
} else { |
If the stream ended after the selector, we want the user to follow
up with | throw new ParseError("MISSING_CSS_BLOCK_OPENER", this, selectorStart,
selectorEnd, selector);
}
}, |
CSS Declaration ParsingA declaration is a | _parseDeclaration: function(selector, selectorStart, value) { |
First, we forward the stream to the next non-space character. | this.stream.markTokenStartAfterSpace();
var peek = this.stream.peek();
if (peek === '}') { |
If the next character is | this.stream.next();
this.currentRule.declarations.end = this.stream.pos;
this.stream.markTokenStartAfterSpace();
this._parseSelector();
} |
Administratively important: there are two ways for this function
to have been called. One is from | else if (value && (this.stream.end() || peek === '<')) {
throw new ParseError("MISSING_CSS_BLOCK_CLOSER", this, selectorStart,
selectorStart+value.length, value);
}
|
If we're still in this function at this point, all is well and we can move on to property parsing. | else {
this._parseProperty(selector, selectorStart);
}
}, |
CSS Property ParsingThere is a fixed list of CSS properties, and we must check two things:
Properties are terminated by
| _parseProperty: function(selector, selectorStart) {
var property = this.stream.eatCSSWhile(/[^\{\}<;:]/),
token = this.stream.makeToken();
if (token === null) {
throw new ParseError("MISSING_CSS_PROPERTY", this, selectorStart,
selectorStart + selector.length, selector);
}
var property = token.value.trim();
propertyStart = token.interval.start,
propertyEnd = propertyStart + property.length;
property = this.stripComments(property, propertyStart).trim();
if (property === '') {
this._parseDeclaration(selector, selectorStart);
return;
}
var next = this.stream.next(),
errorMsg = "[_parseProperty] Expected }, <, ; or :, " +
"instead found " + next;
if ((this.stream.end() && next !== ':') || next === '<' ||
next === '}') {
throw new ParseError("UNFINISHED_CSS_PROPERTY", this, propertyStart,
propertyEnd, property);
} |
We record | this.currentProperty = {
name: {
value: property,
start: propertyStart,
end: propertyEnd
}
}; |
If we find a colon, we have a property and now need a value to go along with it. | if (next === ':') { |
Before we continue, we must make sure the string we found is a real CSS property. | if (!( property && property.match(/^[a-z\-]+$/)) ||
!this._knownCSSProperty(property))
throw new ParseError("INVALID_CSS_PROPERTY_NAME", this,
propertyStart, propertyEnd, property);
this.stream.markTokenStartAfterSpace();
this._parseValue(selector, selectorStart, property, propertyStart);
} |
Otherwise, anything else at this point constitutes an error. | else if (next === ';') {
throw new ParseError("MISSING_CSS_VALUE", this, propertyStart,
propertyEnd, property);
}
else if (next === '{') {
throw new ParseError("MISSING_CSS_BLOCK_CLOSER", this, selectorStart,
propertyStart, selector);
}
else {
throw new ParseError("UNCAUGHT_CSS_PARSE_ERROR", this,
token.interval.start, token.interval.end,
errorMsg);
}
}, |
CSS Value ParsingA value must end either in
| _parseValue: function(selector, selectorStart, property, propertyStart) {
var rule = this.stream.eatCSSWhile(/[^}<;]/),
token = this.stream.makeToken();
if(token === null) {
throw new ParseError("MISSING_CSS_VALUE", this, propertyStart,
propertyStart+property.length, property);
}
var next = (!this.stream.end() ? this.stream.next() : "end of stream"),
errorMsg = "[_parseValue] Expected }, <, or ;, instead found "+next;
token.value = token.value.trim();
var value = token.value,
valueStart = token.interval.start,
valueEnd = valueStart + value.length;
value = this.stripComments(value, valueStart).trim();
if (value === '') {
throw new ParseError("MISSING_CSS_VALUE", this, this.stream.pos-1,
this.stream.pos);
} |
At this point we can fill in the value part of the current
| this.currentProperty.value = {
value: value,
start: valueStart,
end: valueEnd
}
if ((this.stream.end() && next !== ';') || next === '<') {
throw new ParseError("UNFINISHED_CSS_VALUE", this, valueStart,
valueEnd, value);
}
if (next === ';') { |
This is normal CSS rule termination; try to read a new property/value pair. | this._bindCurrentRule();
this.stream.markTokenStartAfterSpace();
this._parseDeclaration(selector, valueStart, value);
}
else if (next === '}') { |
This is block level termination; try to read a new selector. | this.currentRule.declarations.end = this.stream.pos;
this._bindCurrentRule();
this.stream.markTokenStartAfterSpace();
this._parseSelector();
}
else {
throw new ParseError("UNCAUGHT_CSS_PARSE_ERROR", this,
token.interval.start, token.interval.end,
errorMsg);
}
}, |
This helper function binds the current | _bindCurrentRule: function() {
this.currentRule.declarations.properties.push(this.currentProperty);
this.currentProperty = null;
}
} |
HTML ParsingThe HTML token stream parser object has references to the stream, as well as a DOM builder that is used to construct the DOM while we run through the token stream. | function HTMLParser(stream, domBuilder) {
this.stream = stream;
this.domBuilder = domBuilder;
/* The CSS parser is built on the SAME stream and DOM builder, so both
 * parsers advance one shared position. */
this.cssParser = new CSSParser(stream, domBuilder);
}
HTMLParser.prototype = {
html5Doctype: '<!DOCTYPE html>', |
Void HTML elements are the ones that don't need to have a closing tag. | voidHtmlElements: ["area", "base", "br", "col", "command", "embed", "hr",
"img", "input", "keygen", "link", "meta", "param",
"source", "track", "wbr"], |
We keep a list of all valid HTML5 elements. | htmlElements: ["a", "abbr", "address", "area", "article", "aside",
"audio", "b", "base", "bdi", "bdo", "bgsound", "blink",
"blockquote", "body", "br", "button", "canvas", "caption",
"cite", "code", "col", "colgroup", "command", "datalist",
"dd", "del", "details", "dfn", "div", "dl", "dt", "em",
"embed", "fieldset", "figcaption", "figure", "footer",
"form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5",
"h6", "head", "header", "hgroup", "hr", "html", "i",
"iframe", "img", "input", "ins", "kbd", "keygen", "label",
"legend", "li", "link", "map", "mark", "marquee", "menu",
"meta", "meter", "nav", "nobr", "noscript", "object", "ol",
"optgroup", "option", "output", "p", "param", "pre",
"progress", "q", "rp", "rt", "ruby", "samp", "script",
"section", "select", "small", "source", "spacer", "span",
"strong", "style", "sub", "summary", "sup", "table",
"tbody", "td", "textarea", "tfoot", "th", "thead", "time",
"title", "tr", "track", "u", "ul", "var", "video", "wbr"], |
We also keep a list of HTML elements that are now obsolete, but may still be encountered in the wild on popular sites. | obsoleteHtmlElements: ["acronym", "applet", "basefont", "big", "center",
"dir", "font", "isindex", "listing", "noframes",
"plaintext", "s", "strike", "tt", "xmp"], |
This is a helper function to determine whether a given string is a legal HTML element tag. | _knownHTMLElement: function(tagName) {
return this.voidHtmlElements.indexOf(tagName) > -1 ||
this.htmlElements.indexOf(tagName) > -1 ||
this.obsoleteHtmlElements.indexOf(tagName) > -1;
}, |
This is a helper function to determine whether a given string is a void HTML element tag. | _knownVoidHTMLElement: function(tagName) {
return this.voidHtmlElements.indexOf(tagName) > -1;
}, |
The HTML Master Parse FunctionThe HTML master parse function works the same as the CSS parser: it takes the token stream and will try to parse the content as a sequence of HTML elements. Any parse errors along the way will result in the code
throwing a | parse: function() { |
First we check to see if the beginning of our stream is an HTML5 doctype tag. We're currently quite strict and don't parse XHTML or other doctypes. | if (this.stream.match(this.html5Doctype, true, true))
this.domBuilder.fragment.parseInfo = {
doctype: {
start: 0,
end: this.stream.pos
}
};
|
Next, we parse "tag soup", creating text nodes and diving into tags as we find them. | while (!this.stream.end()) {
if (this.stream.peek() == '<') {
this._buildTextNode();
this._parseStartTag();
} else
this.stream.next();
}
this._buildTextNode(); |
At the end, it's possible we're left with an open tag, so we test for that. | if (this.domBuilder.currentNode != this.domBuilder.fragment)
throw new ParseError("UNCLOSED_TAG", this);
}, |
This is a helper to build a DOM text node. | _buildTextNode: function() {
var token = this.stream.makeToken();
if (token) {
this.domBuilder.text(replaceEntityRefs(token.value), token.interval);
}
}, |
HTML Tag ParsingThis is the entry point for parsing the beginning of an HTML tag.
It assumes the stream is on a | _parseStartTag: function() {
if (this.stream.next() != '<')
throw new Error('assertion failed, expected to be on "<"');
if (this.stream.match('!--', true)) {
this._parseComment();
return;
}
this.stream.eat(/\//);
this.stream.eatWhile(/[\w\d]/);
var token = this.stream.makeToken();
var tagName = token.value.slice(1).toLowerCase();
|
If the character after the | if (tagName[0] == '/') {
var closeTagName = tagName.slice(1).toLowerCase();
if (this._knownVoidHTMLElement(closeTagName))
throw new ParseError("CLOSE_TAG_FOR_VOID_ELEMENT", this,
closeTagName, token);
if (!this.domBuilder.currentNode.parseInfo)
throw new ParseError("UNEXPECTED_CLOSE_TAG", this, closeTagName,
token);
this.domBuilder.currentNode.parseInfo.closeTag = {
start: token.interval.start
};
var openTagName = this.domBuilder.currentNode.nodeName.toLowerCase();
if (closeTagName != openTagName)
throw new ParseError("MISMATCHED_CLOSE_TAG", this, openTagName,
closeTagName, token);
this._parseEndCloseTag();
}
else { |
We want to make sure that opening tags have valid tag names. | if (!(tagName && this._knownHTMLElement(tagName)))
throw new ParseError("INVALID_TAG_NAME", tagName, token);
this.domBuilder.pushElement(tagName, {
openTag: {
start: token.interval.start
}
});
if (!this.stream.end())
this._parseEndOpenTag(tagName);
}
}, |
This helper parses HTML comments. It assumes the stream has just
passed the beginning | _parseComment: function() {
var token;
while (!this.stream.end()) {
if (this.stream.match('-->', true)) {
token = this.stream.makeToken();
this.domBuilder.comment(token.value.slice(4, -3), token.interval);
return;
}
this.stream.next();
}
token = this.stream.makeToken();
throw new ParseError("UNTERMINATED_COMMENT", token);
}, |
This helper function parses the end of a closing tag. It expects the stream to be right after the end of the closing tag's tag name. | _parseEndCloseTag: function() {
this.stream.eatSpace();
if (this.stream.next() != '>')
throw new ParseError("UNTERMINATED_CLOSE_TAG", this);
var end = this.stream.makeToken().interval.end;
this.domBuilder.currentNode.parseInfo.closeTag.end = end;
this.domBuilder.popElement();
}, |
This helper function parses the rest of an opening tag after
its tag name, looking for | _parseEndOpenTag: function(tagName) {
/* FIXME: we probably don't need while() here, as the parser will
* either cleanly terminate or throw a ParseError anyway? */
while (!this.stream.end()) {
if (this.stream.eatWhile(/[A-Za-z\-]/)) {
this._parseAttribute();
}
else if (this.stream.eatSpace()) {
this.stream.makeToken();
}
else if (this.stream.peek() == '>' || this.stream.match("/>")) {
if (this.stream.match("/>", true)) {
if (!this._knownVoidHTMLElement(tagName))
throw new ParseError("SELF_CLOSING_NON_VOID_ELEMENT", this,
tagName);
} else
this.stream.next();
var end = this.stream.makeToken().interval.end;
this.domBuilder.currentNode.parseInfo.openTag.end = end; |
If the opening tag represents a void element, there will not be a closing element, so we tell our DOM builder that we're done. | if (tagName && this._knownVoidHTMLElement(tagName))
this.domBuilder.popElement();
|
If the opening tag represents a | if (!this.stream.end() && tagName === "style") {
var cssBlock = this.cssParser.parse();
this.domBuilder.text(cssBlock.value, cssBlock.parseInfo);
}
return;
} else
throw new ParseError("UNTERMINATED_OPEN_TAG", this);
}
}, |
This helper function parses an HTML tag attribute. It expects the stream to be right after the end of an attribute name. | _parseAttribute: function() {
var nameTok = this.stream.makeToken();
nameTok.value = nameTok.value.toLowerCase();
this.stream.eatSpace(); |
If the character after the attribute name is a | if (this.stream.peek() == '=') {
this.stream.next(); |
Currently, we only support quoted attribute values, even though the HTML5 standard allows them to sometimes go unquoted. | this.stream.eatSpace();
this.stream.makeToken();
if (this.stream.next() != '"')
throw new ParseError("UNQUOTED_ATTR_VALUE", this);
this.stream.eatWhile(/[^"]/);
if (this.stream.next() != '"')
throw new ParseError("UNTERMINATED_ATTR_VALUE", this, nameTok);
var valueTok = this.stream.makeToken();
var unquotedValue = replaceEntityRefs(valueTok.value.slice(1, -1));
this.domBuilder.attribute(nameTok.value, unquotedValue, {
name: nameTok.interval,
value: valueTok.interval
});
} else {
this.stream.makeToken();
this.domBuilder.attribute(nameTok.value, '', {
name: nameTok.interval
});
}
}
}; |
The DOM BuilderThe DOM builder is used to construct a DOM representation of the
HTML/CSS being parsed. Each node contains a The DOM builder is given a single document DOM object that will be used to create all necessary DOM nodes. | function DOMBuilder(document) {
this.document = document;
/* Everything is built inside a document fragment, which is also the
 * initial "current node". */
this.fragment = document.createDocumentFragment();
this.currentNode = this.fragment;
}
DOMBuilder.prototype = { |
This method pushes a new element onto the DOM builder's stack. The element is appended to the currently active element and is then made the new currently active element. | pushElement: function(tagName, parseInfo) {
var node = this.document.createElement(tagName);
node.parseInfo = parseInfo;
this.currentNode.appendChild(node);
this.currentNode = node;
}, |
This method pops the current element off the DOM builder's stack, making its parent element the currently active element. | popElement: function() {
/* The "stack" is implicit: going up simply follows parentNode. */
this.currentNode = this.currentNode.parentNode;
}, |
This method appends an HTML comment node to the currently active element. | comment: function(data, parseInfo) {
var comment = this.document.createComment('');
comment.nodeValue = data;
comment.parseInfo = parseInfo;
this.currentNode.appendChild(comment);
}, |
This method appends an attribute to the currently active element. | attribute: function(name, value, parseInfo) {
var attrNode = this.document.createAttribute(name);
attrNode.parseInfo = parseInfo;
attrNode.nodeValue = value;
this.currentNode.attributes.setNamedItem(attrNode);
}, |
This method appends a text node to the currently active element. | text: function(text, parseInfo) {
var textNode = this.document.createTextNode(text);
textNode.parseInfo = parseInfo;
this.currentNode.appendChild(textNode);
}
}; |
Exported Symbols
| var Slowparse = { |
We export our list of recognized HTML elements and CSS properties for clients to use if needed. | HTML_ELEMENT_NAMES: HTMLParser.prototype.voidHtmlElements.concat(
HTMLParser.prototype.htmlElements.concat(
HTMLParser.prototype.obsoleteHtmlElements)),
CSS_PROPERTY_NAMES: CSSParser.prototype.cssProperties, |
We also export a few internal symbols for use by Slowparse's testing suite. | replaceEntityRefs: replaceEntityRefs,
Stream: Stream, |
An array of error detector functions can also be passed as a
third argument to this function. An error detector function takes
the HTML and generated document fragment as arguments and returns
an error object if an error is detected, or | HTML: function(document, html, errorDetectors) {
/* Parse `html` and return { document, error }: `document` is the
 * builder's fragment and `error` is the first error found, or null. */
var stream = new Stream(html),
domBuilder,
parser,
error = null;
/* Duck-typing: an object exposing pushElement is used directly as the
 * DOM builder; anything else is wrapped in a new DOMBuilder. */
if (document.pushElement)
domBuilder = document;
else
domBuilder = new DOMBuilder(document);
parser = new HTMLParser(stream, domBuilder);
try {
parser.parse();
} catch (e) {
/* Only exceptions carrying parseInfo are treated as parse errors;
 * anything else is rethrown untouched. */
if (e.parseInfo) {
error = e.parseInfo;
} else
throw e;
}
/* Detectors are called in order, and only while no error has been
 * recorded; a falsy detector result is normalised to null. */
(errorDetectors || []).forEach(function(detector) {
if (!error)
error = detector(html, domBuilder.fragment) || null;
});
return {
document: domBuilder.fragment,
error: error
};
}, |
| findError: function(html, errorDetectors) {
/* Convenience wrapper around HTML() that returns only the error.
 * NOTE(review): `document` is a free variable here and is not defined
 * in the visible code - presumably the host environment's global
 * document; confirm before using this outside a browser. */
return this.HTML(document, html, errorDetectors).error;
}
};
return Slowparse;
})();
|