slowparse.js

Slowparse is a token stream parser for HTML and CSS text, recording regions of interest during the parse run and signaling any errors detected accompanied by relevant regions in the text stream, to make debugging easy. Each error type is documented in the error specification.

Slowparse also builds a DOM as it goes, attaching metadata to each node it builds that points to where that node came from in the original source.

For more information on the rationale behind Slowparse, as well as its design goals, see the README.

Implementation

Slowparse is effectively a finite state machine for HTML and CSS strings, and will switch between the HTML and CSS parsers while maintaining a single token stream.

var Slowparse = (function() {

Character Entity Parsing

We currently only parse the most common named character entities.

  // Lookup table from a lower-cased entity name (the text between "&" and
  // ";") to the literal character that replaces the whole reference.
  var CHARACTER_ENTITY_REFS = {
    "lt": "<",      // &lt;
    "gt": ">",      // &gt;
    "quot": "\"",   // &quot;
    "amp": "&"      // &amp;
  };
  

replaceEntityRefs() will replace named character entity references (e.g. &lt;) in the given text string and return the result. If an entity name is unrecognized, don't replace it at all. Writing HTML would be surprisingly painful without this forgiving behavior.

This function does not currently replace numeric character entity references (e.g., &#160;).

  /**
   * Replace named character entity references (e.g. "&lt;") in `text`.
   *
   * Entity names are matched case-insensitively against
   * CHARACTER_ENTITY_REFS. A reference whose name is not in that table is
   * left in the text untouched. Numeric references (e.g. "&#160;") are not
   * matched by the pattern and are therefore never replaced.
   *
   * @param {string} text - The text to process.
   * @returns {string} The text with all recognized references replaced.
   */
  function replaceEntityRefs(text) {
    var hasOwn = Object.prototype.hasOwnProperty;
    return text.replace(/&([A-Za-z]+);/g, function(ref, name) {
      var key = name.toLowerCase();
      // Only the table's own keys count. The `in` operator also sees
      // properties inherited from Object.prototype, which made
      // "&constructor;" get replaced by the source text of a function.
      if (hasOwn.call(CHARACTER_ENTITY_REFS, key))
        return CHARACTER_ENTITY_REFS[key];
      return ref;
    });
  }
  
  

Errors

ParseError is an internal error class used to indicate a parsing error. Instances are never seen by Slowparse clients, as parse errors are an expected occurrence; they are only used internally to simplify flow control.

The first argument is the name of an error type, followed by arbitrary positional arguments specific to that error type. Every instance has a parseInfo property which contains the error object that will be exposed to Slowparse clients when parsing errors occur.

  /**
   * Internal error class used to signal a parsing error.
   *
   * @param {string} type - Name of the error type; must be one of the
   *   ParseErrorBuilders factory functions. Any further positional
   *   arguments are forwarded to that factory.
   * @throws {Error} If `type` does not name a factory function defined
   *   directly on ParseErrorBuilders.
   *
   * Every instance carries:
   *   - name:      always "ParseError"
   *   - message:   the error type
   *   - parseInfo: the object built by the factory, with `type` added
   */
  function ParseError(type) {
    this.name = "ParseError";
    // Own-property check: a plain `in` test would also accept inherited
    // names such as "toString" and then call them as if they were builders.
    if (!Object.prototype.hasOwnProperty.call(ParseErrorBuilders, type) ||
        typeof ParseErrorBuilders[type] !== "function")
      throw new Error("Unknown ParseError type: " + type);
    var args = Array.prototype.slice.call(arguments, 1);
    var parseInfo = ParseErrorBuilders[type].apply(ParseErrorBuilders, args);

    /* This may seem a weird way of setting an attribute, but we want
     * to make the JSON serialize so the 'type' appears first, as it
     * makes our documentation read better. */
    parseInfo = ParseErrorBuilders._combine({
      type: type
    }, parseInfo);
    this.message = type;
    this.parseInfo = parseInfo;
  }

  // Give ParseError its own prototype object that inherits from
  // Error.prototype. Sharing Error.prototype itself would make every Error
  // pass an `instanceof ParseError` test.
  ParseError.prototype = Object.create(Error.prototype);
  ParseError.prototype.constructor = ParseError;

ParseErrorBuilders contains factory functions for all our types of parse errors, indexed by error type.

Each public factory function returns a parseInfo object, sans the type property. For more information on each type of error, see the error specification.

  // Factory functions for every parse error type, indexed by error type.
  // Each public factory returns a parseInfo object without the `type`
  // property; ParseError adds that afterwards. Factories are invoked with
  // `this` bound to ParseErrorBuilders, so they can use this._combine().
  //
  // Several factories call parser.stream.makeToken(). That call returns
  // null when no text has been consumed since the last token; in that case
  // the token would have started and ended at parser.stream.pos, so that
  // position is used instead.
  var ParseErrorBuilders = {
    /* Create a new object that has the properties of both arguments
     * and return it. When both define a property, the value from `b`
     * is the one that is kept. */
    _combine: function(a, b) {
      var obj = {}, name;
      for (name in a) {
        obj[name] = a[name];
      }
      for (name in b) {
        obj[name] = b[name];
      }
      return obj;
    },

    // These are HTML errors.

    UNCLOSED_TAG: function(parser) {
      return {
        openTag: this._combine({
          name: parser.domBuilder.currentNode.nodeName.toLowerCase()
        }, parser.domBuilder.currentNode.parseInfo.openTag)
      };
    },
    INVALID_TAG_NAME: function(tagName, token) {
      return {
        openTag: this._combine({
          name: tagName
        }, token.interval)
      };
    },
    UNEXPECTED_CLOSE_TAG: function(parser, closeTagName, token) {
      return {
        closeTag: this._combine({
          name: closeTagName
        }, token.interval)
      };
    },
    MISMATCHED_CLOSE_TAG: function(parser, openTagName, closeTagName, token) {
      return {
        openTag: this._combine({
          name: openTagName
        }, parser.domBuilder.currentNode.parseInfo.openTag),
        closeTag: this._combine({
          name: closeTagName
        }, token.interval)
      };
    },
    CLOSE_TAG_FOR_VOID_ELEMENT: function(parser, closeTagName, token) {
      return {
        closeTag: this._combine({
          name: closeTagName
        }, token.interval)
      };
    },
    UNTERMINATED_COMMENT: function(token) {
      return {
        start: token.interval.start
      };
    },
    UNTERMINATED_ATTR_VALUE: function(parser, nameTok) {
      var openTag = this._combine({
        name: parser.domBuilder.currentNode.nodeName.toLowerCase()
      }, parser.domBuilder.currentNode.parseInfo.openTag);
      var valueTok = parser.stream.makeToken();
      return {
        openTag: openTag,
        attribute: {
          name: {
            value: nameTok.value,
            start: nameTok.interval.start,
            end: nameTok.interval.end
          },
          value: {
            start: valueTok ? valueTok.interval.start : parser.stream.pos
          }
        }
      };
    },
    UNQUOTED_ATTR_VALUE: function(parser) {
      var pos = parser.stream.pos,
          token;
      if (!parser.stream.end()) {
        token = parser.stream.makeToken();
        if (token)
          pos = token.interval.start;
      }
      return {
        start: pos
      };
    },
    UNTERMINATED_OPEN_TAG: function(parser) {
      return {
        openTag: {
          start: parser.domBuilder.currentNode.parseInfo.openTag.start,
          end: parser.stream.pos,
          name: parser.domBuilder.currentNode.nodeName.toLowerCase()
        }
      };
    },
    SELF_CLOSING_NON_VOID_ELEMENT: function(parser, tagName) {
      var start = parser.domBuilder.currentNode.parseInfo.openTag.start;
      var token = parser.stream.makeToken();
      return {
        name: tagName,
        start: start,
        end: token ? token.interval.end : parser.stream.pos
      };
    },
    UNTERMINATED_CLOSE_TAG: function(parser) {
      var end = parser.stream.pos,
          token;
      if (!parser.stream.end()) {
        token = parser.stream.makeToken();
        if (token)
          end = token.interval.start;
      }
      return {
        closeTag: {
          name: parser.domBuilder.currentNode.nodeName.toLowerCase(),
          start: parser.domBuilder.currentNode.parseInfo.closeTag.start,
          end: end
        }
      };
    },

    // These are CSS errors.

    // (This key used to be defined twice with identical bodies; a single
    // definition is kept.)
    INVALID_CSS_PROPERTY_NAME: function(parser, start, end, property) {
      return {
        cssProperty: {
          start: start,
          end: end,
          property: property
        }
      };
    },
    MISSING_CSS_SELECTOR: function(parser, start, end) {
      return {
        cssBlock: {
          start: start,
          end: end
        }
      };
    },
    UNFINISHED_CSS_SELECTOR: function(parser, start, end, selector) {
      return {
        cssSelector: {
          start: start,
          end: end,
          selector: selector
        }
      };
    },
    MISSING_CSS_BLOCK_OPENER: function(parser, start, end, selector) {
      return {
        cssSelector: {
          start: start,
          end: end,
          selector: selector
        }
      };
    },
    MISSING_CSS_PROPERTY: function(parser, start, end, selector) {
      return {
        cssSelector: {
          start: start,
          end: end,
          selector: selector
        }
      };
    },
    UNFINISHED_CSS_PROPERTY: function(parser, start, end, property) {
      return {
        cssProperty: {
          start: start,
          end: end,
          property: property
        }
      };
    },
    MISSING_CSS_VALUE: function(parser, start, end, property) {
      return {
        cssProperty: {
          start: start,
          end: end,
          property: property
        }
      };
    },
    UNFINISHED_CSS_VALUE: function(parser, start, end, value) {
      return {
        cssValue: {
          start: start,
          end: end,
          value: value
        }
      };
    },
    MISSING_CSS_BLOCK_CLOSER: function(parser, start, end, value) {
      return {
        cssValue: {
          start: start,
          end: end,
          value: value
        }
      };
    },
    UNCAUGHT_CSS_PARSE_ERROR: function(parser, start, end, msg) {
      return {
        error: {
          start: start,
          end: end,
          msg: msg
        }
      };
    },
    UNTERMINATED_CSS_COMMENT: function(start) {
      return {
        start: start
      };
    }
  };
  

Streams

Stream is an internal class used for tokenization. The interface for this class is inspired by the analogous class in CodeMirror.

  /**
   * Stream is an internal class used for tokenization. It wraps the text
   * being parsed together with a read position and the start position of
   * the token currently being collected.
   *
   * @param {string} text - The complete text to tokenize.
   */
  function Stream(text) {
    this.text = text;
    this.pos = 0;          // index of the next character to be read
    this.tokenStart = 0;   // index at which the next token begins
  }

  Stream.prototype = {

    // peek() returns the next character in the stream without advancing
    // it. It returns undefined at the end of the text.
    peek: function() {
      return this.text[this.pos];
    },

    // next() returns the next character in the stream and advances it.
    // It returns undefined when no more characters are available.
    next: function() {
      if (!this.end())
        return this.text[this.pos++];
    },

    // end() returns true only if the stream is at the end of the text.
    end: function() {
      return (this.pos >= this.text.length);
    },

    // eat() takes a regular expression. If the next character in the
    // stream matches it, that character is consumed and returned.
    // Otherwise, including at the end of the text, undefined is returned.
    eat: function(match) {
      // Guard against the end of the text: peek() is undefined there and
      // calling .match() on it would throw.
      if (!this.end() && this.peek().match(match))
        return this.next();
    },

    // eatWhile() repeatedly calls eat() with the given argument until it
    // fails. Returns true if any characters were eaten, false otherwise;
    // this also holds when the run of matching characters reaches the end
    // of the text.
    eatWhile: function(matcher) {
      var wereAnyEaten = false;
      while (this.eat(matcher))
        wereAnyEaten = true;
      return wereAnyEaten;
    },

    // eatSpace() is a shortcut for eatWhile() when matching white-space
    // (including newlines, which \s already covers).
    eatSpace: function() {
      return this.eatWhile(/\s/);
    },

    // eatCSSWhile() is like eatWhile(), but it automatically deals with
    // eating block comments like /* foo */: once a comment opener has
    // been eaten, everything up to and including the closing */ is
    // consumed regardless of `matcher`. An unterminated comment consumes
    // the rest of the text.
    eatCSSWhile: function(matcher) {
      var wereAnyEaten = false,
          chr,
          closer;
      while ((chr = this.eat(matcher))) {
        wereAnyEaten = true;
        if (chr === '/' && this.peek() === '*') {
          /* Block comment found. Jump past its terminator. The search
           * starts after the opener's "*" so that "/*" followed directly
           * by "/" is not mistaken for a complete comment. */
          closer = this.text.indexOf('*/', this.pos + 1);
          this.pos = (closer === -1) ? this.text.length : closer + 2;
        }
      }
      return wereAnyEaten;
    },

    // markTokenStart() sets the start for the next token to the current
    // stream position (i.e., "where we are now").
    markTokenStart: function() {
      this.tokenStart = this.pos;
    },

    // markTokenStartAfterSpace() eats up white-space, then marks the
    // start for a new token.
    markTokenStartAfterSpace: function() {
      this.eatSpace();
      this.markTokenStart();
    },

    // makeToken() generates a JSON-serializable token object representing
    // the interval of text between the end of the last generated token
    // and the current stream position, and moves the token start up to
    // the current position. It returns null when that interval is empty.
    makeToken: function() {
      if (this.pos == this.tokenStart)
        return null;
      var token = {
        value: this.text.slice(this.tokenStart, this.pos),
        interval: {
          start: this.tokenStart,
          end: this.pos
        }
      };
      this.tokenStart = this.pos;
      return token;
    },

    // match() tests whether the text at the current position starts with
    // `string` (which must be a string) and returns true or false.
    // `caseFold` can be set to true to make the comparison
    // case-insensitive. When the text matches and `consume` is truthy the
    // stream is advanced past the match, like a multi-character eat().
    // NOTE: when `consume` is false or omitted, the position is left
    // untouched and the call acts as a pure look-ahead.
    match: function(string, consume, caseFold) {
      var substring = this.text.slice(this.pos, this.pos + string.length);
      if (caseFold) {
        string = string.toLowerCase();
        substring = substring.toLowerCase();
      }
      if (string == substring) {
        if (consume)
          this.pos += string.length;
        return true;
      }
      return false;
    }
  };

CSS Parsing

CSSParser is our internal CSS token stream parser object. This object has references to the stream, as well as the HTML DOM builder that is used by the HTML parser.

  /**
   * CSSParser is our internal CSS token stream parser object. It holds
   * references to the stream, as well as the HTML DOM builder that is used
   * by the HTML parser.
   *
   * @param {Stream} stream - The token stream to read CSS from.
   * @param {Object} domBuilder - The DOM builder; only stored here.
   */
  function CSSParser(stream, domBuilder) {
    this.stream = stream;
    this.domBuilder = domBuilder;
  }

  CSSParser.prototype = {

    // We keep a list of all currently valid CSS properties (CSS1-CSS3).
    // This list does not contain vendor prefixes.
    cssProperties: [
      "alignment-adjust","alignment-baseline","animation","animation-delay",
      "animation-direction","animation-duration","animation-iteration-count",
      "animation-name","animation-play-state","animation-timing-function",
      "appearance","azimuth","backface-visibility","background",
      "background-attachment","background-clip","background-color",
      "background-image","background-origin","background-position",
      "background-repeat","background-size","baseline-shift","binding",
      "bleed","bookmark-label","bookmark-level","bookmark-state",
      "bookmark-target","border","border-bottom","border-bottom-color",
      "border-bottom-left-radius","border-bottom-right-radius",
      "border-bottom-style","border-bottom-width","border-collapse",
      "border-color","border-image","border-image-outset",
      "border-image-repeat","border-image-slice","border-image-source",
      "border-image-width","border-left","border-left-color",
      "border-left-style","border-left-width","border-radius","border-right",
      "border-right-color","border-right-style","border-right-width",
      "border-spacing","border-style","border-top","border-top-color",
      "border-top-left-radius","border-top-right-radius","border-top-style",
      "border-top-width","border-width","bottom","box-decoration-break",
      "box-shadow","box-sizing","break-after","break-before","break-inside",
      "caption-side","clear","clip","color","color-profile","column-count",
      "column-fill","column-gap","column-rule","column-rule-color",
      "column-rule-style","column-rule-width","column-span","column-width",
      "columns","content","counter-increment","counter-reset","crop","cue",
      "cue-after","cue-before","cursor","direction","display",
      "dominant-baseline","drop-initial-after-adjust",
      "drop-initial-after-align","drop-initial-before-adjust",
      "drop-initial-before-align","drop-initial-size","drop-initial-value",
      "elevation","empty-cells","filter","fit","fit-position","flex-align",
      "flex-flow","flex-line-pack","flex-order","flex-pack","float","float-offset",
      "font","font-family","font-size","font-size-adjust","font-stretch",
      "font-style","font-variant","font-weight","grid-columns","grid-rows",
      "hanging-punctuation","height","hyphenate-after","hyphenate-before",
      "hyphenate-character","hyphenate-lines","hyphenate-resource","hyphens",
      "icon","image-orientation","image-rendering","image-resolution",
      "inline-box-align","left","letter-spacing","line-break","line-height",
      "line-stacking","line-stacking-ruby","line-stacking-shift",
      "line-stacking-strategy","list-style","list-style-image",
      "list-style-position","list-style-type","margin","margin-bottom",
      "margin-left","margin-right","margin-top","marker-offset","marks",
      "marquee-direction","marquee-loop","marquee-play-count","marquee-speed",
      "marquee-style","max-height","max-width","min-height","min-width",
      "move-to","nav-down","nav-index","nav-left","nav-right","nav-up",
      "opacity","orphans","outline","outline-color","outline-offset",
      "outline-style","outline-width","overflow","overflow-style",
      "overflow-wrap","overflow-x","overflow-y","padding","padding-bottom",
      "padding-left","padding-right","padding-top","page","page-break-after",
      "page-break-before","page-break-inside","page-policy","pause",
      "pause-after","pause-before","perspective","perspective-origin",
      "phonemes","pitch","pitch-range","play-during","position",
      "presentation-level","punctuation-trim","quotes","rendering-intent",
      "resize","rest","rest-after","rest-before","richness","right",
      "rotation","rotation-point","ruby-align","ruby-overhang",
      "ruby-position","ruby-span","src","size","speak","speak-header",
      "speak-numeral","speak-punctuation","speech-rate","stress","string-set",
      "tab-size","table-layout","target","target-name","target-new",
      "target-position","text-align","text-align-last","text-decoration",
      "text-decoration-color","text-decoration-line","text-decoration-skip",
      "text-decoration-style","text-emphasis","text-emphasis-color",
      "text-emphasis-position","text-emphasis-style","text-height",
      "text-indent","text-justify","text-outline","text-shadow",
      "text-space-collapse","text-transform","text-underline-position",
      "text-wrap","top","transform","transform-origin","transform-style",
      "transition","transition-delay","transition-duration",
      "transition-property","transition-timing-function","unicode-bidi",
      "vertical-align","visibility","voice-balance","voice-duration",
      "voice-family","voice-pitch","voice-pitch-range","voice-rate",
      "voice-stress","voice-volume","volume","white-space","widows","width",
      "word-break","word-spacing","word-wrap","z-index"],

    // This helper verifies that a specific string is a known CSS
    // property. A leading vendor prefix is stripped first, so
    // vendor-prefixed known properties such as -o-transition are accepted.
    _knownCSSProperty: function(propertyName) {
      var unprefixed = propertyName.replace(/^-.+?-/,'');
      return this.cssProperties.indexOf(unprefixed) > -1;
    },

    // The CSS Master Parse Function
    //
    // Here we process the token stream, assumed to have its pointer inside
    // a CSS element, and will try to parse the content inside it as CSS
    // until we hit the end of the CSS element.
    //
    // Any parse errors along the way will result in a ParseError being
    // thrown. On success an object is returned holding the raw text of
    // the block (`value`) and a `parseInfo` with its start/end positions,
    // rules and comments.
    parse: function() {
      // We'll use some instance variables to keep track of our parse
      // state:
      //   - A list of the CSS rulesets for the CSS block.
      this.rules = [];
      //   - A list of comment blocks inside the CSS.
      this.comments = [];

      // Parsing is based on finite states, and a call to _parseSelector()
      // will run through any number of states until it either throws an
      // error, or terminates cleanly.
      var sliceStart = this.stream.pos;
      this.stream.markTokenStartAfterSpace();
      this._parseSelector();
      var sliceEnd = this.stream.pos;

      // If we get here, the CSS block has no errors, and we report the
      // start/end of the CSS block in the stream, as well as the
      // rules/comments for the calling HTMLParser instance to work with.
      var cssBlock = {
        value: this.stream.text.slice(sliceStart, sliceEnd),
        parseInfo: {
          start: sliceStart,
          end: sliceEnd,
          rules: this.rules,
          comments: this.comments
        }
      };

      this.rules = null;
      this.comments = null;
      return cssBlock;
    },

    // CSS Comment Parsing
    //
    // Here we record the position of comments in `term` in the instance's
    // comment list, and return `term` with all its comments stripped.
    // `startPos` is the stream position of term's first character; it is
    // added to every recorded comment position. Throws a ParseError of
    // type UNTERMINATED_CSS_COMMENT when a comment is never closed.
    stripComments: function(term, startPos) {
      var pos,
          last = term.length,
          commentStart, commentEnd,
          stripped = "";
      for (pos=0; pos < last; pos++) {
        if (term[pos] === '/' && pos<last-1 && term[pos+1] === '*') {
          commentStart = startPos + pos;
          // Skip the opener; the earliest place the closing "/" can sit
          // is three characters on (the empty comment).
          pos += 3;
          while(pos < last-1 && term.substr(pos-1,2) !== "*/") {
            pos++;
          }
          if (pos >= last-1 && term.substr(pos-1,2) !== "*/")
            throw new ParseError("UNTERMINATED_CSS_COMMENT", commentStart);
          commentEnd = startPos + pos + 1;
          this.comments.push({start: commentStart, end: commentEnd});
        } else {
          stripped += term[pos];
        }
      }
      return stripped;
    },

    // CSS Selector Parsing
    //
    // A selector is a string, and terminates on {, which signals the
    // start of a CSS property/value pair (which may be empty).
    //
    // There are a few characters in selectors that are an immediate error:
    //
    //   ;  Rule terminator (ERROR: missing block opener)
    //   }  End of css block (ERROR: missing block opener)
    //   <  End of <style> element, start of </style>
    //      (ERROR: css declaration has no body)
    //
    // Note that we cannot flag : as an error because pseudo-classes use
    // it as their prefix.
    _parseSelector: function() {
      // Depending on our state, we may be coming from having just parsed
      // a rule. If that's the case, add it to our list of rules.
      if (this.currentRule) {
        this.rules.push(this.currentRule);
        this.currentRule = null;
      }

      // Gobble all characters that could be part of the selector.
      this.stream.eatCSSWhile(/[^\{;\}<]/);
      var token = this.stream.makeToken(),
          peek = this.stream.peek();

      // If there was nothing to select, we're either done, or an error
      // occurred.
      if (token === null) {
        if (!this.stream.end() && this.stream.peek() === '<') {
          return;
        }
        throw new ParseError("MISSING_CSS_SELECTOR", this, this.stream.pos-1,
                             this.stream.pos);
      }

      // If we get here, we have a selector string.
      token.value = token.value.trim();
      var selector = token.value,
          selectorStart = token.interval.start,
          selectorEnd = selectorStart + selector.length;

      selector = this.stripComments(selector, selectorStart).trim();
      if (selector === '') {
        // The token held nothing but comments; look for a selector again.
        this._parseSelector();
        return;
      }

      // Now we'll set up a ruleset object for this selector.
      this.currentRule = {
        selector: {
          value: selector,
          start: selectorStart,
          end: selectorEnd
        },
        declarations: {
          start: null,
          end: null,
          properties: []
        }
      };

      // Now we start to analyse whether we can continue, or whether we're
      // in a terminal state, based on the next character in the stream.
      if (this.stream.end() || peek === '<') {
        throw new ParseError("UNFINISHED_CSS_SELECTOR", this, selectorStart,
                             selectorEnd, selector);
      }

      // The stream is known not to be at its end here, so there is always
      // a next character to inspect.
      var next = this.stream.next(),
          errorMsg = "[_parseSelector] Expected {, }, ; or :, " +
                     "instead found " + next;
      if (next === '{') {
        // The only legal continuation after a selector is the opening {
        // character. If that's the character we see, we can mark the
        // start of the declarations block and start parsing them.
        this.currentRule.declarations.start = this.stream.pos-1;
        this._parseDeclaration(selector, selectorStart);
      } else if (next === ';' || next === '}') {
        // Otherwise, this is a parse error; we should have seen { instead.
        throw new ParseError("MISSING_CSS_BLOCK_OPENER", this,
                             selectorStart, selectorEnd, selector);
      } else {
        // We get here if an unexpected character was found.
        throw new ParseError("UNCAUGHT_CSS_PARSE_ERROR", this,
                             token.interval.start, token.interval.end,
                             errorMsg);
      }
    },

    // CSS Declaration Parsing
    //
    // A declaration is a property: value; pair. It can be empty, in which
    // case the next character must be }.
    _parseDeclaration: function(selector, selectorStart, value) {
      // First, we forward the stream to the next non-space character.
      this.stream.markTokenStartAfterSpace();
      var peek = this.stream.peek();
      if (peek === '}') {
        // If the next character is } then this is an empty block, and we
        // should move on to trying to read a new selector ruleset.
        this.stream.next();
        this.currentRule.declarations.end = this.stream.pos;
        this.stream.markTokenStartAfterSpace();
        this._parseSelector();
      }

      // Administratively important: there are two ways for this function
      // to have been called. One is from _parseSelector(), which is "the
      // normal way", the other from _parseValue(), after finding a
      // properly closed property:value; pair. In this case `value` will
      // be the last declaration's value, which will let us throw a
      // sensible debug error in case the stream is empty at this point,
      // or points to </style>.
      else if (value && (this.stream.end() || peek === '<')) {
        throw new ParseError("MISSING_CSS_BLOCK_CLOSER", this, selectorStart,
                             selectorStart+value.length, value);
      }

      // If we're still in this function at this point, all is well and we
      // can move on to property parsing.
      else {
        this._parseProperty(selector, selectorStart);
      }
    },

    // CSS Property Parsing
    //
    // There is a fixed list of CSS properties, and we must check two
    // things:
    //
    //   1. Does the token string contain a syntax-legal property?
    //   2. Is that property in the set of known ones?
    //
    // Properties are terminated by :, but we might also see the following
    // characters, which should signal an error:
    //
    //   ;  rule terminator (ERROR: missing value)
    //   }  end of CSS block (ERROR: missing value)
    //   <  end of <style> element, start of </style>
    //      (ERROR: missing value)
    _parseProperty: function(selector, selectorStart) {
      this.stream.eatCSSWhile(/[^\{\}<;:]/);
      var token = this.stream.makeToken();

      if (token === null) {
        throw new ParseError("MISSING_CSS_PROPERTY", this, selectorStart,
                             selectorStart + selector.length, selector);
      }

      // All three are declared in one statement; a stray semicolon here
      // used to turn propertyStart/propertyEnd into implicit globals.
      var property = token.value.trim(),
          propertyStart = token.interval.start,
          propertyEnd = propertyStart + property.length;

      property = this.stripComments(property, propertyStart).trim();
      if (property === '') {
        // Nothing but comments were found; try the declaration again.
        this._parseDeclaration(selector, selectorStart);
        return;
      }

      var next = this.stream.next(),
          errorMsg = "[_parseProperty] Expected }, <, ; or :, " +
                     "instead found " + next;

      if ((this.stream.end() && next !== ':') || next === '<' ||
          next === '}') {
        throw new ParseError("UNFINISHED_CSS_PROPERTY", this, propertyStart,
                             propertyEnd, property);
      }

      // We record property: value pairs as we run through the stream,
      // which are added to the set of property: value pairs in the
      // current rule's declarations.properties array. The push happens
      // when we have a clean run in _parseValue().
      this.currentProperty = {
        name: {
          value: property,
          start: propertyStart,
          end: propertyEnd
        }
      };

      // If we find a colon, we have a property and now need a value to go
      // along with it.
      if (next === ':') {
        // Before we continue, we must make sure the string we found is a
        // real CSS property.
        if (!( property && property.match(/^[a-z\-]+$/)) ||
            !this._knownCSSProperty(property))
          throw new ParseError("INVALID_CSS_PROPERTY_NAME", this,
                               propertyStart, propertyEnd, property);
        this.stream.markTokenStartAfterSpace();
        this._parseValue(selector, selectorStart, property, propertyStart);
      }

      // Otherwise, anything else at this point constitutes an error.
      else if (next === ';') {
        throw new ParseError("MISSING_CSS_VALUE", this, propertyStart,
                             propertyEnd, property);
      }
      else if (next === '{') {
        throw new ParseError("MISSING_CSS_BLOCK_CLOSER", this, selectorStart,
                             propertyStart, selector);
      }
      else {
        throw new ParseError("UNCAUGHT_CSS_PARSE_ERROR", this,
                             token.interval.start, token.interval.end,
                             errorMsg);
      }
    },

    // CSS Value Parsing
    //
    // A value must end either in ; or in }. However, we may also find:
    //
    //   <  end of <style> element, start of </style>
    //      (ERROR: missing block closer)
    _parseValue: function(selector, selectorStart, property, propertyStart) {
      this.stream.eatCSSWhile(/[^}<;]/);
      var token = this.stream.makeToken();

      if(token === null) {
        throw new ParseError("MISSING_CSS_VALUE", this, propertyStart,
                             propertyStart+property.length, property);
      }

      var next = (!this.stream.end() ? this.stream.next() : "end of stream"),
          errorMsg = "[_parseValue] Expected }, <, or ;, instead found "+next;
      token.value = token.value.trim();
      var value = token.value,
          valueStart = token.interval.start,
          valueEnd = valueStart + value.length;

      value = this.stripComments(value, valueStart).trim();
      if (value === '') {
        throw new ParseError("MISSING_CSS_VALUE", this, this.stream.pos-1,
                             this.stream.pos);
      }

      // At this point we can fill in the value part of the current
      // property: value; pair. However, we hold off binding it until we
      // are sure there are no parse errors.
      this.currentProperty.value = {
        value: value,
        start: valueStart,
        end: valueEnd
      };

      if ((this.stream.end() && next !== ';') || next === '<') {
        throw new ParseError("UNFINISHED_CSS_VALUE", this, valueStart,
                             valueEnd, value);
      }

      if (next === ';') {
        // This is normal CSS rule termination; try to read a new
        // property/value pair.
        this._bindCurrentRule();
        this.stream.markTokenStartAfterSpace();
        this._parseDeclaration(selector, valueStart, value);
      }
      else if (next === '}') {
        // This is block level termination; try to read a new selector.
        this.currentRule.declarations.end = this.stream.pos;
        this._bindCurrentRule();
        this.stream.markTokenStartAfterSpace();
        this._parseSelector();
      }
      else {
        throw new ParseError("UNCAUGHT_CSS_PARSE_ERROR", this,
                             token.interval.start, token.interval.end,
                             errorMsg);
      }
    },

    // This helper function binds the current property: value object in
    // the current ruleset, and resets it for the next property.
    _bindCurrentRule: function() {
      this.currentRule.declarations.properties.push(this.currentProperty);
      this.currentProperty = null;
    }
  };

HTML Parsing

The HTML token stream parser object has references to the stream, as well as a DOM builder that is used to construct the DOM while we run through the token stream.

  /**
   * The HTML token stream parser object. It keeps references to the stream
   * and to the DOM builder, and owns a CSSParser constructed over the same
   * stream and DOM builder.
   *
   * @param {Stream} stream - The token stream to read from.
   * @param {Object} domBuilder - The builder used to construct the DOM.
   */
  function HTMLParser(stream, domBuilder) {
    var embeddedCssParser = new CSSParser(stream, domBuilder);

    this.stream = stream;
    this.domBuilder = domBuilder;
    this.cssParser = embeddedCssParser;
  }

  HTMLParser.prototype = {
    // The HTML5 doctype declaration, stored as a string constant.
    html5Doctype: '<!DOCTYPE html>',

Void HTML elements are the ones that cannot have any content and therefore must not have a closing tag.

    // Lower-case tag names of the void elements. Consulted by
    // _knownVoidHTMLElement() and, together with the two lists below,
    // by _knownHTMLElement().
    voidHtmlElements: ["area", "base", "br", "col", "command", "embed", "hr",
                       "img", "input", "keygen", "link", "meta", "param",
                       "source", "track", "wbr"],

We keep a list of all valid HTML5 elements.

    // Lower-case tag names accepted as known elements by
    // _knownHTMLElement(). The void elements appear in this list as well
    // as in voidHtmlElements.
    htmlElements: ["a", "abbr", "address", "area", "article", "aside",
                   "audio", "b", "base", "bdi", "bdo", "bgsound", "blink",
                   "blockquote", "body", "br", "button", "canvas", "caption",
                   "cite", "code", "col", "colgroup", "command", "datalist",
                   "dd", "del", "details", "dfn", "div", "dl", "dt", "em",
                   "embed", "fieldset", "figcaption", "figure", "footer",
                   "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5",
                   "h6", "head", "header", "hgroup", "hr", "html", "i",
                   "iframe", "img", "input", "ins", "kbd", "keygen", "label",
                   "legend", "li", "link", "map", "mark", "marquee", "menu",
                   "meta", "meter", "nav", "nobr", "noscript", "object", "ol",
                   "optgroup", "option", "output", "p", "param", "pre",
                   "progress", "q", "rp", "rt", "ruby", "samp", "script",
                   "section", "select", "small", "source", "spacer", "span",
                   "strong", "style", "sub", "summary", "sup", "table",
                   "tbody", "td", "textarea", "tfoot", "th", "thead", "time",
                   "title", "tr", "track", "u", "ul", "var", "video", "wbr"],

We also keep a list of HTML elements that are now obsolete, but may still be encountered in the wild on popular sites.

    // _knownHTMLElement() accepts these names exactly like the entries of
    // htmlElements; they are only kept in a separate list.
    obsoleteHtmlElements: ["acronym", "applet", "basefont", "big", "center",
                           "dir", "font", "isindex", "listing", "noframes",
                           "plaintext", "s", "strike", "tt", "xmp"],

This is a helper function to determine whether a given string is a legal HTML element tag.

    _knownHTMLElement: function(tagName) {
      return this.voidHtmlElements.indexOf(tagName) > -1 || 
              this.htmlElements.indexOf(tagName) > -1 ||
              this.obsoleteHtmlElements.indexOf(tagName) > -1;
    },

This is a helper function to determine whether a given string is a void HTML element tag.

    _knownVoidHTMLElement: function(tagName) {
      return this.voidHtmlElements.indexOf(tagName) > -1;
    },

The HTML Master Parse Function

The HTML master parse function works the same as the CSS parser: it takes the token stream and will try to parse the content as a sequence of HTML elements.

Any parse errors along the way will result in the code throwing a ParseError.

    parse: function() {

First we check to see if the beginning of our stream is an HTML5 doctype tag. We're currently quite strict and don't parse XHTML or other doctypes.

      if (this.stream.match(this.html5Doctype, true, true))
        this.domBuilder.fragment.parseInfo = {
          doctype: {
            start: 0,
            end: this.stream.pos
          }
        };
      

Next, we parse "tag soup", creating text nodes and diving into tags as we find them.

      while (!this.stream.end()) {
        if (this.stream.peek() == '<') {
          this._buildTextNode();
          this._parseStartTag();
        } else
          this.stream.next();
      }

      this._buildTextNode();

At the end, it's possible we're left with an open tag, so we test for that.

      if (this.domBuilder.currentNode != this.domBuilder.fragment)
        throw new ParseError("UNCLOSED_TAG", this);
    },

This is a helper to build a DOM text node.

    _buildTextNode: function() {
      var token = this.stream.makeToken();
      if (token) {
        this.domBuilder.text(replaceEntityRefs(token.value), token.interval);
      }
    },

HTML Tag Parsing

This is the entry point for parsing the beginning of an HTML tag. It assumes the stream is on a < character.

    _parseStartTag: function() {
      if (this.stream.next() != '<')
        throw new Error('assertion failed, expected to be on "<"');

      if (this.stream.match('!--', true)) {
        this._parseComment();
        return;
      }
      
      this.stream.eat(/\//);
      this.stream.eatWhile(/[\w\d]/);
      var token = this.stream.makeToken();
      var tagName = token.value.slice(1).toLowerCase();
      

If the character after the < is a /, we're on a closing tag. We want to report useful errors about whether the tag is unexpected or doesn't match with the most recent opening tag.

      if (tagName[0] == '/') {
        var closeTagName = tagName.slice(1).toLowerCase();
        if (this._knownVoidHTMLElement(closeTagName))
          throw new ParseError("CLOSE_TAG_FOR_VOID_ELEMENT", this,
                               closeTagName, token);
        if (!this.domBuilder.currentNode.parseInfo)
          throw new ParseError("UNEXPECTED_CLOSE_TAG", this, closeTagName,
                               token);
        this.domBuilder.currentNode.parseInfo.closeTag = {
          start: token.interval.start
        };
        var openTagName = this.domBuilder.currentNode.nodeName.toLowerCase();
        if (closeTagName != openTagName)
          throw new ParseError("MISMATCHED_CLOSE_TAG", this, openTagName,
                               closeTagName, token);
        this._parseEndCloseTag();
      }
      
      else {

We want to make sure that opening tags have valid tag names.

        if (!(tagName && this._knownHTMLElement(tagName)))
          throw new ParseError("INVALID_TAG_NAME", tagName, token);

        this.domBuilder.pushElement(tagName, {
          openTag: {
            start: token.interval.start
          }
        });
        if (!this.stream.end())
          this._parseEndOpenTag(tagName);
      }
    },

This helper parses HTML comments. It assumes the stream has just passed the beginning <!-- of an HTML comment.

    _parseComment: function() {
      var token;
      while (!this.stream.end()) {
        if (this.stream.match('-->', true)) {
          token = this.stream.makeToken();
          this.domBuilder.comment(token.value.slice(4, -3), token.interval);
          return;
        }
        this.stream.next();
      }
      token = this.stream.makeToken();
      throw new ParseError("UNTERMINATED_COMMENT", token);
    },

This helper function parses the end of a closing tag. It expects the stream to be right after the end of the closing tag's tag name.

    _parseEndCloseTag: function() {
      this.stream.eatSpace();
      if (this.stream.next() != '>')
        throw new ParseError("UNTERMINATED_CLOSE_TAG", this);
      var end = this.stream.makeToken().interval.end;
      this.domBuilder.currentNode.parseInfo.closeTag.end = end;
      this.domBuilder.popElement();
    },

This helper function parses the rest of an opening tag after its tag name, looking for attribute="value" data until a > is encountered.

    _parseEndOpenTag: function(tagName) {
      /* FIXME: we probably don't need while() here, as the parser will
       *        either cleanly terminate or throw a ParseError anyway? */
      while (!this.stream.end()) {
        if (this.stream.eatWhile(/[A-Za-z\-]/)) {
          this._parseAttribute();
        }
        else if (this.stream.eatSpace()) {
          this.stream.makeToken();
        }
        else if (this.stream.peek() == '>' || this.stream.match("/>")) {
          if (this.stream.match("/>", true)) {
            if (!this._knownVoidHTMLElement(tagName))
              throw new ParseError("SELF_CLOSING_NON_VOID_ELEMENT", this, 
                                   tagName);
          } else
            this.stream.next();
          var end = this.stream.makeToken().interval.end;
          this.domBuilder.currentNode.parseInfo.openTag.end = end;

If the opening tag represents a void element, there will not be a closing element, so we tell our DOM builder that we're done.

          if (tagName && this._knownVoidHTMLElement(tagName))
            this.domBuilder.popElement();
          

If the opening tag represents a <style> element, we hand off parsing to our CSS parser.

          if (!this.stream.end() && tagName === "style") {
            var cssBlock = this.cssParser.parse();
            this.domBuilder.text(cssBlock.value, cssBlock.parseInfo);
          }

          return;
        } else
          throw new ParseError("UNTERMINATED_OPEN_TAG", this);
      }
    },

This helper function parses an HTML tag attribute. It expects the stream to be right after the end of an attribute name.

    _parseAttribute: function() {
      var nameTok = this.stream.makeToken();
      nameTok.value = nameTok.value.toLowerCase();
      this.stream.eatSpace();

If the character after the attribute name is a =, then we look for an attribute value; otherwise, this is a boolean attribute.

      if (this.stream.peek() == '=') {
        this.stream.next();

Currently, we only support quoted attribute values, even though the HTML5 standard allows them to sometimes go unquoted.

        this.stream.eatSpace();
        this.stream.makeToken();
        if (this.stream.next() != '"')
          throw new ParseError("UNQUOTED_ATTR_VALUE", this);
        this.stream.eatWhile(/[^"]/);
        if (this.stream.next() != '"')
          throw new ParseError("UNTERMINATED_ATTR_VALUE", this, nameTok);
        var valueTok = this.stream.makeToken();
        var unquotedValue = replaceEntityRefs(valueTok.value.slice(1, -1));
        this.domBuilder.attribute(nameTok.value, unquotedValue, {
          name: nameTok.interval,
          value: valueTok.interval
        });
      } else {
        this.stream.makeToken();
        this.domBuilder.attribute(nameTok.value, '', {
          name: nameTok.interval
        });
      }
    }
  };

The DOM Builder

The DOM builder is used to construct a DOM representation of the HTML/CSS being parsed. Each node contains a parseInfo expando property that contains information about the text extents of the original source code that the DOM element maps to.

The DOM builder is given a single document DOM object that will be used to create all necessary DOM nodes.

  function DOMBuilder(document) {
    // One fragment serves both as the root of everything that gets built and
    // as the initial insertion point.
    var root = document.createDocumentFragment();

    this.document = document;
    this.fragment = root;
    this.currentNode = root;
  }

  DOMBuilder.prototype = {

This method pushes a new element onto the DOM builder's stack. The element is appended to the currently active element and is then made the new currently active element.

    pushElement: function(tagName, parseInfo) {
      var node = this.document.createElement(tagName);
      node.parseInfo = parseInfo;
      this.currentNode.appendChild(node);
      this.currentNode = node;
    },

This method pops the current element off the DOM builder's stack, making its parent element the currently active element.

    popElement: function() {
      this.currentNode = this.currentNode.parentNode;
    },

This method appends an HTML comment node to the currently active element.

    comment: function(data, parseInfo) {
      var comment = this.document.createComment('');
      comment.nodeValue = data;
      comment.parseInfo = parseInfo;
      this.currentNode.appendChild(comment);
    },

This method appends an attribute to the currently active element.

    attribute: function(name, value, parseInfo) {
      var attrNode = this.document.createAttribute(name);
      attrNode.parseInfo = parseInfo;
      attrNode.nodeValue = value;
      this.currentNode.attributes.setNamedItem(attrNode);
    },

This method appends a text node to the currently active element.

    text: function(text, parseInfo) {
      var textNode = this.document.createTextNode(text);
      textNode.parseInfo = parseInfo;
      this.currentNode.appendChild(textNode);
    }
  };

Exported Symbols

Slowparse is the object that holds all exported symbols from this library.

  var Slowparse = {

We export our list of recognized HTML elements and CSS properties for clients to use if needed.

    HTML_ELEMENT_NAMES: HTMLParser.prototype.voidHtmlElements.concat(
                          HTMLParser.prototype.htmlElements.concat(
                            HTMLParser.prototype.obsoleteHtmlElements)),
    CSS_PROPERTY_NAMES: CSSParser.prototype.cssProperties,

We also export a few internal symbols for use by Slowparse's testing suite.

    replaceEntityRefs: replaceEntityRefs,
    Stream: Stream,

Slowparse.HTML() is the primary function we export. Given a DOM document object (or a DOMBuilder instance) and a string of HTML, we return an object with the following keys:

  • document is a DOM document fragment containing the DOM of the parsed HTML. If an error occurred while parsing, this document is incomplete, and represents what was built before the error was encountered.

  • error is a JSON-serializable object representing any error that occurred while parsing. If no errors occurred while parsing, its value is null. For a list of the types of errors that can be returned, see the error specification.

An array of error detector functions can also be passed as a third argument to this function. An error detector function takes the HTML and generated document fragment as arguments and returns an error object if an error is detected, or undefined otherwise. This can be used for further error checking on the parsed document.

    HTML: function(document, html, errorDetectors) {
      var stream = new Stream(html),
          domBuilder,
          parser,
          error = null;

      if (document.pushElement)
        domBuilder = document;
      else
        domBuilder = new DOMBuilder(document);
      parser = new HTMLParser(stream, domBuilder);

      try {
        parser.parse();
      } catch (e) {
        if (e.parseInfo) {
          error = e.parseInfo;
        } else
          throw e;
      }
      
      (errorDetectors || []).forEach(function(detector) {
        if (!error)
          error = detector(html, domBuilder.fragment) || null;
      });

      return {
        document: domBuilder.fragment,
        error: error
      };
    },

Slowparse.findError() just returns any error in the given HTML string, or null if the HTML contains no errors.

    findError: function(html, errorDetectors) {
      return this.HTML(document, html, errorDetectors).error;
    }
  };

  return Slowparse;
})();