sax
Advanced tools
Comparing version 0.1.5 to 0.2.0
733
lib/sax.js
// wrapper for non-node envs | ||
;(function (sax) { | ||
// Public API.
// `sax.parser()` is a factory for callers that prefer not to use `new`;
// `sax.SAXParser` exposes the constructor itself.
sax.parser = function (strict, opt) { return new SAXParser(strict, opt) }
sax.SAXParser = SAXParser
// The README documents `require("sax").createStream(strict, options)`, so the
// stream wrapper is exported as well. Both names are function declarations in
// this scope and are therefore already defined when these lines run.
sax.SAXStream = SAXStream
sax.createStream = createStream
@@ -16,3 +16,3 @@ // When we pass the MAX_BUFFER_LENGTH position, start checking for buffer overruns. | ||
// Set to Infinity to have unlimited buffers. | ||
// Default threshold (64 KiB) used by the buffer-overrun check.
sax.MAX_BUFFER_LENGTH = 64 * 1024
@@ -23,26 +23,27 @@ var buffers = [ | ||
"attribValue", "cdata" | ||
]; | ||
] | ||
/**
 * Parser state container.
 *
 * Also invoked as `SAXParser.call(parser, strict, opt)` by `end()` to reset an
 * existing parser in place, so it must stay a plain function that only
 * assigns fields on `this`.
 *
 * @param {*} strict   coerced to a boolean and stored as `this.strict`
 * @param {Object} [opt]  options object; defaults to `{}`. `opt.lowercasetags`
 *                        selects the case-folding method stored in `tagCase`.
 */
function SAXParser (strict, opt) {
  clearBuffers(this)
  this.q = this.c = ""
  this.bufferCheckPosition = sax.MAX_BUFFER_LENGTH
  this.opt = opt || {}
  // Name of the String method applied to tag names in non-strict mode.
  this.tagCase = this.opt.lowercasetags ? "toLowerCase" : "toUpperCase"
  this.tags = []
  this.closed = this.closedRoot = this.sawRoot = false
  this.tag = this.error = null
  this.strict = !!strict
  this.state = S.BEGIN
  // Per-parser entity table that inherits from the shared defaults, so
  // additions on one parser do not leak into sax.ENTITIES.
  this.ENTITIES = Object.create(sax.ENTITIES)

  // mostly just for error reporting
  this.position = this.line = this.column = 0
  emit(this, "onready")
}
function checkBufferLength (parser) { | ||
var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10), | ||
maxActual = 0; | ||
var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10) | ||
, maxActual = 0 | ||
for (var i = 0, l = buffers.length; i < l; i ++) { | ||
var len = parser[buffers[i]].length; | ||
var len = parser[buffers[i]].length | ||
if (len > maxAllowed) { | ||
@@ -63,12 +64,15 @@ // Text/cdata nodes can get big, and since they're buffered, | ||
} | ||
maxActual = Math.max(maxActual, len); | ||
maxActual = Math.max(maxActual, len) | ||
} | ||
// schedule the next check for the earliest possible buffer overrun. | ||
parser.bufferCheckPosition = (sax.MAX_BUFFER_LENGTH - maxActual) + parser.position; | ||
parser.bufferCheckPosition = (sax.MAX_BUFFER_LENGTH - maxActual) | ||
+ parser.position | ||
} | ||
/**
 * Reset every property named in the module-level `buffers` list to the
 * empty string on the given parser.
 */
function clearBuffers (parser) {
  for (var i = 0, l = buffers.length; i < l; i ++) {
    parser[buffers[i]] = ""
  }
}
SAXParser.prototype = { | ||
@@ -80,19 +84,94 @@ write : write, | ||
// Use node's Stream base class when it can be loaded; otherwise fall back
// to an empty constructor so the rest of the module still loads.
try {
  var Stream = require("stream").Stream
} catch (ex) {
  var Stream = function () {}
}

/** Factory for SAXStream; takes the same arguments as the parser. */
function createStream (strict, opt) {
  return new SAXStream(strict, opt)
}

/**
 * Stream wrapper around a SAXParser. Chunks written to the stream are fed
 * to the parser and re-emitted as "data" events; the parser's end and error
 * callbacks are forwarded as stream events.
 */
function SAXStream (strict, opt) {
  var me = this
  // Initialise the base class on this instance. (`me` must be assigned
  // before it is used here, otherwise the base constructor runs against
  // `undefined`.)
  Stream.apply(me)

  me._parser = new SAXParser(strict, opt)
  me.writable = true
  me.readable = true

  me._parser.onend = function () {
    me.emit("end")
  }

  me._parser.onerror = function (er) {
    me.emit("error", er)
  }
}

SAXStream.prototype = Object.create(Stream.prototype,
  { constructor: { value: SAXStream } })

/**
 * Feed a chunk to the parser, then pass the same chunk downstream.
 * Returns true so that pipe() never treats the stream as saturated.
 */
SAXStream.prototype.write = function (data) {
  this._parser.write(data.toString())
  // The event name is "data"; the chunk is its payload.
  this.emit("data", data)
  return true
}

/**
 * Optionally write a final chunk, then finish the document.
 * Writing `null` to the parser is its end-of-input signal; the parser's
 * onend callback (installed in the constructor) emits "end" on the stream.
 */
SAXStream.prototype.end = function (chunk) {
  if (chunk && chunk.length) this._parser.write(chunk.toString())
  this._parser.write(null)
}

// Parser callbacks that are lazily bridged to stream events by on().
var streamWraps =
  [ "opentag"
  , "closetag"
  , "text"
  , "attribute"
  , "error"
  , "doctype"
  , "processinginstruction"
  , "sgmldeclaration"
  , "comment"
  , "opencdata"
  , "cdata"
  , "closecdata"
  , "ready"
  ]

/**
 * Register a listener. The first time a parser-level event from
 * `streamWraps` is subscribed to (and the parser has no handler for it yet),
 * install a parser callback that re-emits it on the stream.
 */
SAXStream.prototype.on = function (ev, handler) {
  var me = this
  if (!me._parser["on"+ev] && streamWraps.indexOf(ev) !== -1) {
    me._parser["on"+ev] = function () {
      var args = Array.prototype.slice.call(arguments)
      args.unshift(ev)
      me.emit.apply(me, args)
    }
  }
  return Stream.prototype.on.call(me, ev, handler)
}
// character classes and tokens | ||
// Each table is a string used as a set of characters: membership is tested
// with indexOf (see is/not), so the order of characters is irrelevant.
var whitespace = "\r\n\t "
  // this really needs to be replaced with character classes.
  // XML allows all manner of ridiculous numbers and digits.
  , number = "0123456789"
  , letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  // (Letter | "_" | ":")
  , nameStart = letter+"_:"
  , nameBody = nameStart+number+"-."
  , quote = "'\""
  , entity = number+letter+"#"
  , CDATA = "[CDATA["
  , DOCTYPE = "DOCTYPE"
// True when character `c` occurs in the character-set string `charclass`.
function is (charclass, c) { return charclass.indexOf(c) !== -1 }
// Negation of is().
function not (charclass, c) { return !is(charclass, c) }
var S = 0; | ||
var S = 0 | ||
sax.STATE = | ||
@@ -142,310 +221,320 @@ { BEGIN : S++ | ||
for (var S in sax.STATE) sax.STATE[sax.STATE[S]] = S; | ||
for (var S in sax.STATE) sax.STATE[sax.STATE[S]] = S | ||
// shorthand | ||
S = sax.STATE; | ||
S = sax.STATE | ||
sax.EVENTS = [ // for discoverability.
  "text", "processinginstruction", "sgmldeclaration",
  "doctype", "comment", "attribute", "opentag", "closetag",
  "opencdata", "cdata", "closecdata", "error", "end", "ready" ]
/**
 * Invoke the handler stored on `parser` under `event` (e.g. "onopentag")
 * with `data`, if one is installed. Does nothing when no handler is set.
 */
function emit (parser, event, data) {
  if (typeof parser[event] === "function") parser[event](data)
}
/**
 * Emit a node event, first flushing any pending text so that the text
 * event is delivered before the node that follows it.
 */
function emitNode (parser, nodeType, data) {
  if (parser.textNode) closeText(parser)
  emit(parser, nodeType, data)
}
/**
 * Flush the pending text buffer: apply the trim/normalize options, emit
 * "ontext" if anything is left, and clear the buffer.
 */
function closeText (parser) {
  parser.textNode = textopts(parser.opt, parser.textNode)
  if (parser.textNode) emit(parser, "ontext", parser.textNode)
  parser.textNode = ""
}
/**
 * Apply the text-related parser options to a string.
 *
 * @param {Object} opt   `opt.trim` strips leading/trailing whitespace;
 *                       `opt.normalize` collapses each whitespace run to
 *                       a single space. Trimming is applied first.
 * @param {string} text
 * @returns {string}
 */
function textopts (opt, text) {
  if (opt.trim) text = text.trim()
  if (opt.normalize) text = text.replace(/\s+/g, " ")
  return text
}
/**
 * Record and report a parse error.
 *
 * Flushes pending text, builds an Error whose message is `er` followed by
 * the parser's current line, column and character, stores it on
 * `parser.error`, and emits "onerror" with it.
 *
 * @returns the parser
 */
function error (parser, er) {
  closeText(parser)
  var message = er +
    "\nLine: "+parser.line+
    "\nColumn: "+parser.column+
    "\nChar: "+parser.c
  var err = new Error(message)
  parser.error = err
  emit(parser, "onerror", err)
  return parser
}
/**
 * Finish the current document.
 *
 * Reports "Unexpected end" when the parser is not in the TEXT state,
 * flushes pending text, marks the parser closed, emits "onend", and then
 * re-runs the SAXParser constructor on the same object to reset it.
 *
 * @returns the parser
 */
function end (parser) {
  if (parser.state !== S.TEXT) error(parser, "Unexpected end")
  closeText(parser)
  parser.c = ""
  parser.closed = true
  emit(parser, "onend")
  // Reset in place, keeping the same strictness and options.
  SAXParser.call(parser, parser.strict, parser.opt)
  return parser
}
// Report `message` as an error only when the parser is in strict mode;
// in non-strict mode the problem is silently tolerated.
function strictFail (parser, message) {
  if (parser.strict) error(parser, message)
}
/**
 * Start a new tag record from the accumulated tag name. In non-strict mode
 * the name is first case-folded with the method named by `parser.tagCase`.
 */
function newTag (parser) {
  if (!parser.strict) parser.tagName = parser.tagName[parser.tagCase]()
  parser.tag = { name : parser.tagName, attributes : {} }
}
/**
 * Commit the tag under construction: push it on the open-tag stack and
 * emit "onopentag".
 *
 * @param {boolean} [selfClosing]  when truthy, the current tag, tag name and
 *        state are left untouched (the caller follows up with closeTag,
 *        which needs the name); otherwise the parser returns to TEXT.
 */
function openTag (parser, selfClosing) {
  parser.sawRoot = true
  parser.tags.push(parser.tag)
  emitNode(parser, "onopentag", parser.tag)
  if (!selfClosing) {
    parser.tag = null
    parser.tagName = ""
    parser.state = S.TEXT
  }
  parser.attribName = parser.attribValue = ""
}
/**
 * Handle a completed closing tag whose name is in `parser.tagName`.
 *
 * - Empty name (`</>`): strict-mode error; the literal text is kept and
 *   the parser returns to TEXT.
 * - Name matches an open tag: every tag above it on the stack, and the tag
 *   itself, is popped and reported through "onclosetag". In strict mode
 *   each skipped (mismatched) tag is reported as an error first.
 * - Name matches nothing: the literal closing tag is kept as text and the
 *   parser returns to TEXT, mirroring the empty-name case. Without this the
 *   parser would stay in the CLOSE_TAG state and swallow following input.
 */
function closeTag (parser) {
  if (!parser.tagName) {
    strictFail(parser, "Weird empty close tag.")
    parser.textNode += "</>"
    parser.state = S.TEXT
    return
  }
  // first make sure that the closing tag actually exists.
  // <a><b></c></b></a> will close everything, otherwise.
  var t = parser.tags.length
  if (!parser.strict) parser.tagName = parser.tagName[parser.tagCase]()
  var closeTo = parser.tagName
  while (t --) {
    var close = parser.tags[t]
    if (close.name !== closeTo) {
      // fail the first time in strict mode
      strictFail(parser, "Unexpected close tag")
    } else break
  }
  // didn't find it. we already failed for strict, so keep the tag as text
  // and resume normal text parsing.
  if (t < 0) {
    parser.textNode += "</" + parser.tagName + ">"
    parser.state = S.TEXT
    return
  }
  // Pop down to and including the matching tag at index t.
  var s = parser.tags.length
  while (s-- > t) {
    parser.tag = parser.tags.pop()
    parser.tagName = parser.tag.name
    emitNode(parser, "onclosetag", parser.tagName)
  }
  if (t === 0) parser.closedRoot = true
  parser.tagName = parser.attribValue = parser.attribName = ""
  parser.tag = null
  parser.state = S.TEXT
}
/**
 * Resolve the entity name accumulated in `parser.entity` (without the
 * surrounding "&" and ";") to its replacement text.
 *
 * Lookup order:
 *  1. named entities in `parser.ENTITIES` (matched case-insensitively);
 *  2. numeric character references: "#" + decimal digits, or "#x" + hex
 *     digits. Leading zeros are accepted.
 *
 * Anything else is a strict-mode error and is returned verbatim as
 * "&name;" so that no input is lost.
 *
 * @returns {string}
 */
function parseEntity (parser) {
  var entity = parser.entity.toLowerCase()

  // Ignore members inherited from Object.prototype (e.g. "constructor"),
  // which would otherwise be returned as if they were entity values.
  var named = parser.ENTITIES[entity]
  if (named && named !== Object.prototype[entity]) return named

  var num = NaN
  if (entity.charAt(0) === "#") {
    var isHex = entity.charAt(1) === "x"
    var digits = entity.slice(isHex ? 2 : 1)
    // Validate the digits explicitly: parseInt alone accepts trailing
    // garbage, and a NaN result must never reach String.fromCharCode.
    if ((isHex ? /^[0-9a-f]+$/ : /^[0-9]+$/).test(digits)) {
      num = parseInt(digits, isHex ? 16 : 10)
    }
  }

  // 0x10FFFF is the highest Unicode code point.
  if (isNaN(num) || num > 0x10FFFF) {
    strictFail(parser, "Invalid character entity")
    return "&"+parser.entity + ";"
  }

  // String.fromCharCode truncates to 16 bits, so code points beyond the
  // Basic Multilingual Plane are encoded as a surrogate pair by hand.
  if (num > 0xFFFF) {
    num -= 0x10000
    return String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF))
  }
  return String.fromCharCode(num)
}
function write (chunk) { | ||
var parser = this; | ||
if (this.error) throw this.error; | ||
var parser = this | ||
if (this.error) throw this.error | ||
if (parser.closed) return error(parser, | ||
"Cannot write after close. Assign an onready handler."); | ||
if (chunk === null) return end(parser); | ||
"Cannot write after close. Assign an onready handler.") | ||
if (chunk === null) return end(parser) | ||
var i = 0, c = "" | ||
while (parser.c = c = chunk.charAt(i++)) { | ||
parser.position ++; | ||
parser.position ++ | ||
if (c === "\n") { | ||
parser.line ++; | ||
parser.column = 0; | ||
} else parser.column ++; | ||
parser.line ++ | ||
parser.column = 0 | ||
} else parser.column ++ | ||
switch (parser.state) { | ||
case S.BEGIN: | ||
if (c === "<") parser.state = S.OPEN_WAKA; | ||
if (c === "<") parser.state = S.OPEN_WAKA | ||
else if (not(whitespace,c)) { | ||
// have to process this as a text node. | ||
// weird, but happens. | ||
strictFail(parser, "Non-whitespace before first tag."); | ||
parser.textNode = c; | ||
state = S.TEXT; | ||
strictFail(parser, "Non-whitespace before first tag.") | ||
parser.textNode = c | ||
state = S.TEXT | ||
} | ||
continue; | ||
continue | ||
case S.TEXT: | ||
if (parser.sawRoot && !parser.closedRoot) { | ||
var starti = i-1; | ||
var starti = i-1 | ||
while (c && c!=="<" && c!=="&") { | ||
c = chunk.charAt(i++); | ||
c = chunk.charAt(i++) | ||
if (c) { | ||
parser.position ++; | ||
parser.position ++ | ||
if (c === "\n") { | ||
parser.line ++; | ||
parser.column = 0; | ||
} else parser.column ++; | ||
parser.line ++ | ||
parser.column = 0 | ||
} else parser.column ++ | ||
} | ||
} | ||
parser.textNode += chunk.substring(starti, i-1); | ||
parser.textNode += chunk.substring(starti, i-1) | ||
} | ||
if (c === "<") parser.state = S.OPEN_WAKA; | ||
if (c === "<") parser.state = S.OPEN_WAKA | ||
else { | ||
if (not(whitespace, c) && (!parser.sawRoot || parser.closedRoot)) | ||
strictFail("Text data outside of root node."); | ||
if (c === "&") parser.state = S.TEXT_ENTITY; | ||
else parser.textNode += c; | ||
strictFail("Text data outside of root node.") | ||
if (c === "&") parser.state = S.TEXT_ENTITY | ||
else parser.textNode += c | ||
} | ||
continue; | ||
continue | ||
case S.OPEN_WAKA: | ||
// either a /, ?, !, or text is coming next. | ||
if (c === "!") { | ||
parser.state = S.SGML_DECL; | ||
parser.sgmlDecl = ""; | ||
parser.state = S.SGML_DECL | ||
parser.sgmlDecl = "" | ||
} else if (is(whitespace, c)) { | ||
// wait for it... | ||
} else if (is(nameStart,c)) { | ||
parser.state = S.OPEN_TAG; | ||
parser.tagName = c; | ||
parser.state = S.OPEN_TAG | ||
parser.tagName = c | ||
} else if (c === "/") { | ||
parser.state = S.CLOSE_TAG; | ||
parser.tagName = ""; | ||
parser.state = S.CLOSE_TAG | ||
parser.tagName = "" | ||
} else if (c === "?") { | ||
parser.state = S.PROC_INST; | ||
parser.procInstName = parser.procInstBody = ""; | ||
parser.state = S.PROC_INST | ||
parser.procInstName = parser.procInstBody = "" | ||
} else { | ||
strictFail(parser, "Unencoded <"); | ||
parser.textNode += "<" + c; | ||
parser.state = S.TEXT; | ||
strictFail(parser, "Unencoded <") | ||
parser.textNode += "<" + c | ||
parser.state = S.TEXT | ||
} | ||
continue; | ||
continue | ||
case S.SGML_DECL: | ||
if ((parser.sgmlDecl+c).toUpperCase() === CDATA) { | ||
emitNode(parser, "onopencdata"); | ||
parser.state = S.CDATA; | ||
parser.sgmlDecl = ""; | ||
parser.cdata = ""; | ||
emitNode(parser, "onopencdata") | ||
parser.state = S.CDATA | ||
parser.sgmlDecl = "" | ||
parser.cdata = "" | ||
} else if (parser.sgmlDecl+c === "--") { | ||
parser.state = S.COMMENT; | ||
parser.comment = ""; | ||
parser.sgmlDecl = ""; | ||
parser.state = S.COMMENT | ||
parser.comment = "" | ||
parser.sgmlDecl = "" | ||
} else if ((parser.sgmlDecl+c).toUpperCase() === DOCTYPE) { | ||
parser.state = S.DOCTYPE; | ||
parser.state = S.DOCTYPE | ||
if (parser.doctype || parser.sawRoot) strictFail(parser, | ||
"Inappropriately located doctype declaration"); | ||
parser.doctype = ""; | ||
parser.sgmlDecl = ""; | ||
"Inappropriately located doctype declaration") | ||
parser.doctype = "" | ||
parser.sgmlDecl = "" | ||
} else if (c === ">") { | ||
emitNode(parser, "onsgmldeclaration", parser.sgmlDecl); | ||
parser.sgmlDecl = ""; | ||
parser.state = S.TEXT; | ||
emitNode(parser, "onsgmldeclaration", parser.sgmlDecl) | ||
parser.sgmlDecl = "" | ||
parser.state = S.TEXT | ||
} else if (is(quote, c)) { | ||
parser.state = S.SGML_DECL_QUOTED; | ||
parser.sgmlDecl += c; | ||
} else parser.sgmlDecl += c; | ||
continue; | ||
parser.state = S.SGML_DECL_QUOTED | ||
parser.sgmlDecl += c | ||
} else parser.sgmlDecl += c | ||
continue | ||
case S.SGML_DECL_QUOTED: | ||
if (c === parser.q) { | ||
parser.state = S.SGML_DECL; | ||
parser.q = ""; | ||
parser.state = S.SGML_DECL | ||
parser.q = "" | ||
} | ||
parser.sgmlDecl += c; | ||
continue; | ||
parser.sgmlDecl += c | ||
continue | ||
case S.DOCTYPE: | ||
if (c === ">") { | ||
parser.state = S.TEXT; | ||
emitNode(parser, "ondoctype", parser.doctype); | ||
parser.doctype = true; // just remember that we saw it. | ||
parser.state = S.TEXT | ||
emitNode(parser, "ondoctype", parser.doctype) | ||
parser.doctype = true // just remember that we saw it. | ||
} else { | ||
parser.doctype += c; | ||
if (c === "[") parser.state = S.DOCTYPE_DTD; | ||
parser.doctype += c | ||
if (c === "[") parser.state = S.DOCTYPE_DTD | ||
else if (is(quote, c)) { | ||
parser.state = S.DOCTYPE_QUOTED; | ||
parser.q = c; | ||
parser.state = S.DOCTYPE_QUOTED | ||
parser.q = c | ||
} | ||
} | ||
continue; | ||
continue | ||
case S.DOCTYPE_QUOTED: | ||
parser.doctype += c; | ||
parser.doctype += c | ||
if (c === parser.q) { | ||
parser.q = ""; | ||
parser.state = S.DOCTYPE; | ||
parser.q = "" | ||
parser.state = S.DOCTYPE | ||
} | ||
continue; | ||
continue | ||
case S.DOCTYPE_DTD: | ||
parser.doctype += c; | ||
if (c === "]") parser.state = S.DOCTYPE; | ||
parser.doctype += c | ||
if (c === "]") parser.state = S.DOCTYPE | ||
else if (is(quote,c)) { | ||
parser.state = S.DOCTYPE_DTD_QUOTED; | ||
parser.q = c; | ||
parser.state = S.DOCTYPE_DTD_QUOTED | ||
parser.q = c | ||
} | ||
continue; | ||
continue | ||
case S.DOCTYPE_DTD_QUOTED: | ||
parser.doctype += c; | ||
parser.doctype += c | ||
if (c === parser.q) { | ||
parser.state = S.DOCTYPE_DTD; | ||
parser.q = ""; | ||
parser.state = S.DOCTYPE_DTD | ||
parser.q = "" | ||
} | ||
continue; | ||
continue | ||
case S.COMMENT: | ||
if (c === "-") parser.state = S.COMMENT_ENDING; | ||
else parser.comment += c; | ||
continue; | ||
if (c === "-") parser.state = S.COMMENT_ENDING | ||
else parser.comment += c | ||
continue | ||
case S.COMMENT_ENDING: | ||
if (c === "-") { | ||
parser.state = S.COMMENT_ENDED; | ||
parser.comment = textopts(parser.opt, parser.comment); | ||
if (parser.comment) emitNode(parser, "oncomment", parser.comment); | ||
parser.comment = ""; | ||
parser.state = S.COMMENT_ENDED | ||
parser.comment = textopts(parser.opt, parser.comment) | ||
if (parser.comment) emitNode(parser, "oncomment", parser.comment) | ||
parser.comment = "" | ||
} else { | ||
strictFail(parser, "Invalid comment"); | ||
parser.comment += "-" + c; | ||
strictFail(parser, "Invalid comment") | ||
parser.comment += "-" + c | ||
} | ||
continue; | ||
continue | ||
case S.COMMENT_ENDED: | ||
if (c !== ">") strictFail(parser, "Malformed comment"); | ||
else parser.state = S.TEXT; | ||
continue; | ||
if (c !== ">") strictFail(parser, "Malformed comment") | ||
else parser.state = S.TEXT | ||
continue | ||
case S.CDATA: | ||
if (c === "]") parser.state = S.CDATA_ENDING; | ||
else parser.cdata += c; | ||
continue; | ||
if (c === "]") parser.state = S.CDATA_ENDING | ||
else parser.cdata += c | ||
continue | ||
case S.CDATA_ENDING: | ||
if (c === "]") parser.state = S.CDATA_ENDING_2; | ||
if (c === "]") parser.state = S.CDATA_ENDING_2 | ||
else { | ||
parser.cdata += "]" + c; | ||
parser.state = S.CDATA; | ||
parser.cdata += "]" + c | ||
parser.state = S.CDATA | ||
} | ||
continue; | ||
continue | ||
case S.CDATA_ENDING_2: | ||
if (c === ">") { | ||
if (parser.cdata) emitNode(parser, "oncdata", parser.cdata); | ||
emitNode(parser, "onclosecdata"); | ||
parser.cdata = ""; | ||
parser.state = S.TEXT; | ||
if (parser.cdata) emitNode(parser, "oncdata", parser.cdata) | ||
emitNode(parser, "onclosecdata") | ||
parser.cdata = "" | ||
parser.state = S.TEXT | ||
} else if (c === "]") { | ||
parser.cdata += "]" | ||
} else { | ||
parser.cdata += "]]" + c; | ||
parser.state = S.CDATA; | ||
parser.cdata += "]]" + c | ||
parser.state = S.CDATA | ||
} | ||
continue; | ||
continue | ||
case S.PROC_INST: | ||
if (c === "?") parser.state = S.PROC_INST_ENDING; | ||
else if (is(whitespace, c)) parser.state = S.PROC_INST_BODY; | ||
else parser.procInstName += c; | ||
continue; | ||
if (c === "?") parser.state = S.PROC_INST_ENDING | ||
else if (is(whitespace, c)) parser.state = S.PROC_INST_BODY | ||
else parser.procInstName += c | ||
continue | ||
case S.PROC_INST_BODY: | ||
if (!parser.procInstBody && is(whitespace, c)) continue; | ||
else if (c === "?") parser.state = S.PROC_INST_ENDING; | ||
if (!parser.procInstBody && is(whitespace, c)) continue | ||
else if (c === "?") parser.state = S.PROC_INST_ENDING | ||
else if (is(quote, c)) { | ||
parser.state = S.PROC_INST_QUOTED; | ||
parser.q = c; | ||
parser.procInstBody += c; | ||
} else parser.procInstBody += c; | ||
continue; | ||
parser.state = S.PROC_INST_QUOTED | ||
parser.q = c | ||
parser.procInstBody += c | ||
} else parser.procInstBody += c | ||
continue | ||
case S.PROC_INST_ENDING: | ||
@@ -456,131 +545,132 @@ if (c === ">") { | ||
body : parser.procInstBody | ||
}); | ||
parser.procInstName = parser.procInstBody = ""; | ||
parser.state = S.TEXT; | ||
}) | ||
parser.procInstName = parser.procInstBody = "" | ||
parser.state = S.TEXT | ||
} else { | ||
parser.procInstBody += "?" + c; | ||
parser.state = S.PROC_INST_BODY; | ||
parser.procInstBody += "?" + c | ||
parser.state = S.PROC_INST_BODY | ||
} | ||
continue; | ||
continue | ||
case S.PROC_INST_QUOTED: | ||
parser.procInstBody += c; | ||
parser.procInstBody += c | ||
if (c === parser.q) { | ||
parser.state = S.PROC_INST_BODY; | ||
parser.q = ""; | ||
parser.state = S.PROC_INST_BODY | ||
parser.q = "" | ||
} | ||
continue; | ||
continue | ||
case S.OPEN_TAG: | ||
if (is(nameBody, c)) parser.tagName += c; | ||
if (is(nameBody, c)) parser.tagName += c | ||
else { | ||
newTag(parser); | ||
if (c === ">") openTag(parser); | ||
else if (c === "/") parser.state = S.OPEN_TAG_SLASH; | ||
newTag(parser) | ||
if (c === ">") openTag(parser) | ||
else if (c === "/") parser.state = S.OPEN_TAG_SLASH | ||
else { | ||
if (not(whitespace, c)) strictFail( | ||
parser, "Invalid character in tag name"); | ||
parser.state = S.ATTRIB; | ||
parser, "Invalid character in tag name") | ||
parser.state = S.ATTRIB | ||
} | ||
} | ||
continue; | ||
continue | ||
case S.OPEN_TAG_SLASH: | ||
if (c === ">") { | ||
openTag(parser, true); | ||
closeTag(parser); | ||
openTag(parser, true) | ||
closeTag(parser) | ||
} else { | ||
strictFail(parser, "Forward-slash in opening tag not followed by >"); | ||
parser.state = S.ATTRIB; | ||
strictFail(parser, "Forward-slash in opening tag not followed by >") | ||
parser.state = S.ATTRIB | ||
} | ||
continue; | ||
continue | ||
case S.ATTRIB: | ||
// haven't read the attribute name yet. | ||
if (is(whitespace, c)) continue; | ||
else if (c === ">") openTag(parser); | ||
else if (c === "/") parser.state = S.OPEN_TAG_SLASH; | ||
if (is(whitespace, c)) continue | ||
else if (c === ">") openTag(parser) | ||
else if (c === "/") parser.state = S.OPEN_TAG_SLASH | ||
else if (is(nameStart, c)) { | ||
parser.attribName = c; | ||
parser.attribValue = ""; | ||
parser.state = S.ATTRIB_NAME; | ||
} else strictFail(parser, "Invalid attribute name"); | ||
continue; | ||
parser.attribName = c | ||
parser.attribValue = "" | ||
parser.state = S.ATTRIB_NAME | ||
} else strictFail(parser, "Invalid attribute name") | ||
continue | ||
case S.ATTRIB_NAME: | ||
if (c === "=") parser.state = S.ATTRIB_VALUE; | ||
else if (is(whitespace, c)) parser.state = S.ATTRIB_NAME_SAW_WHITE; | ||
else if (is(nameBody, c)) parser.attribName += c; | ||
else strictFail(parser, "Invalid attribute name"); | ||
continue; | ||
if (c === "=") parser.state = S.ATTRIB_VALUE | ||
else if (is(whitespace, c)) parser.state = S.ATTRIB_NAME_SAW_WHITE | ||
else if (is(nameBody, c)) parser.attribName += c | ||
else strictFail(parser, "Invalid attribute name") | ||
continue | ||
case S.ATTRIB_NAME_SAW_WHITE: | ||
if (c === "=") parser.state = S.ATTRIB_VALUE; | ||
else if (is(whitespace, c)) continue; | ||
if (c === "=") parser.state = S.ATTRIB_VALUE | ||
else if (is(whitespace, c)) continue | ||
else { | ||
strictFail(parser, "Attribute without value"); | ||
parser.tag.attributes[parser.attribName] = ""; | ||
parser.attribValue = ""; | ||
emitNode(parser, "onattribute", { name : parser.attribName, value : "" }); | ||
parser.attribName = ""; | ||
if (c === ">") openTag(parser); | ||
strictFail(parser, "Attribute without value") | ||
parser.tag.attributes[parser.attribName] = "" | ||
parser.attribValue = "" | ||
emitNode(parser, "onattribute", | ||
{ name : parser.attribName, value : "" }) | ||
parser.attribName = "" | ||
if (c === ">") openTag(parser) | ||
else if (is(nameStart, c)) { | ||
parser.attribName = c; | ||
parser.state = S.ATTRIB_NAME; | ||
parser.attribName = c | ||
parser.state = S.ATTRIB_NAME | ||
} else { | ||
strictFail(parser, "Invalid attribute name"); | ||
parser.state = S.ATTRIB; | ||
strictFail(parser, "Invalid attribute name") | ||
parser.state = S.ATTRIB | ||
} | ||
} | ||
continue; | ||
continue | ||
case S.ATTRIB_VALUE: | ||
if (is(whitespace, c)) continue; | ||
if (is(whitespace, c)) continue | ||
else if (is(quote, c)) { | ||
parser.q = c; | ||
parser.state = S.ATTRIB_VALUE_QUOTED; | ||
parser.q = c | ||
parser.state = S.ATTRIB_VALUE_QUOTED | ||
} else { | ||
strictFail(parser, "Unquoted attribute value"); | ||
parser.state = S.ATTRIB_VALUE_UNQUOTED; | ||
parser.attribValue = c; | ||
strictFail(parser, "Unquoted attribute value") | ||
parser.state = S.ATTRIB_VALUE_UNQUOTED | ||
parser.attribValue = c | ||
} | ||
continue; | ||
continue | ||
case S.ATTRIB_VALUE_QUOTED: | ||
if (c !== parser.q) { | ||
if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_Q; | ||
else parser.attribValue += c; | ||
continue; | ||
if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_Q | ||
else parser.attribValue += c | ||
continue | ||
} | ||
parser.tag.attributes[parser.attribName] = parser.attribValue; | ||
parser.tag.attributes[parser.attribName] = parser.attribValue | ||
emitNode(parser, "onattribute", { | ||
name:parser.attribName, value:parser.attribValue}); | ||
parser.attribName = parser.attribValue = ""; | ||
parser.q = ""; | ||
parser.state = S.ATTRIB; | ||
continue; | ||
name:parser.attribName, value:parser.attribValue}) | ||
parser.attribName = parser.attribValue = "" | ||
parser.q = "" | ||
parser.state = S.ATTRIB | ||
continue | ||
case S.ATTRIB_VALUE_UNQUOTED: | ||
if (not(whitespace+">",c)) { | ||
if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_U; | ||
else parser.attribValue += c; | ||
continue; | ||
if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_U | ||
else parser.attribValue += c | ||
continue | ||
} | ||
emitNode(parser, "onattribute", { | ||
name:parser.attribName, value:parser.attribValue}); | ||
parser.attribName = parser.attribValue = ""; | ||
if (c === ">") openTag(parser); | ||
else parser.state = S.ATTRIB; | ||
continue; | ||
emitNode(parser, "onattribute", | ||
{ name: parser.attribName, value: parser.attribValue}) | ||
parser.attribName = parser.attribValue = "" | ||
if (c === ">") openTag(parser) | ||
else parser.state = S.ATTRIB | ||
continue | ||
case S.CLOSE_TAG: | ||
if (!parser.tagName) { | ||
if (is(whitespace, c)) continue; | ||
if (is(whitespace, c)) continue | ||
else if (not(nameStart, c)) strictFail(parser, | ||
"Invalid tagname in closing tag."); | ||
else parser.tagName = c; | ||
"Invalid tagname in closing tag.") | ||
else parser.tagName = c | ||
} | ||
else if (c === ">") closeTag(parser); | ||
else if (is(nameBody, c)) parser.tagName += c; | ||
else if (c === ">") closeTag(parser) | ||
else if (is(nameBody, c)) parser.tagName += c | ||
else { | ||
if (not(whitespace, c)) strictFail(parser, | ||
"Invalid tagname in closing tag"); | ||
parser.state = S.CLOSE_TAG_SAW_WHITE; | ||
"Invalid tagname in closing tag") | ||
parser.state = S.CLOSE_TAG_SAW_WHITE | ||
} | ||
continue; | ||
continue | ||
case S.CLOSE_TAG_SAW_WHITE: | ||
if (is(whitespace, c)) continue; | ||
if (c === ">") closeTag(parser); | ||
else strictFail("Invalid characters in closing tag"); | ||
continue; | ||
if (is(whitespace, c)) continue | ||
if (c === ">") closeTag(parser) | ||
else strictFail("Invalid characters in closing tag") | ||
continue | ||
case S.TEXT_ENTITY: | ||
@@ -591,27 +681,27 @@ case S.ATTRIB_VALUE_ENTITY_Q: | ||
case S.TEXT_ENTITY: | ||
var returnState = S.TEXT, buffer = "textNode"; | ||
break; | ||
var returnState = S.TEXT, buffer = "textNode" | ||
break | ||
case S.ATTRIB_VALUE_ENTITY_Q: | ||
var returnState = S.ATTRIB_VALUE_QUOTED, buffer = "attribValue"; | ||
break; | ||
var returnState = S.ATTRIB_VALUE_QUOTED, buffer = "attribValue" | ||
break | ||
case S.ATTRIB_VALUE_ENTITY_U: | ||
var returnState = S.ATTRIB_VALUE_UNQUOTED, buffer = "attribValue"; | ||
break; | ||
var returnState = S.ATTRIB_VALUE_UNQUOTED, buffer = "attribValue" | ||
break | ||
} | ||
if (c === ";") { | ||
parser[buffer] += parseEntity(parser); | ||
parser.entity = ""; | ||
parser.state = returnState; | ||
parser[buffer] += parseEntity(parser) | ||
parser.entity = "" | ||
parser.state = returnState | ||
} | ||
else if (is(entity, c)) parser.entity += c; | ||
else if (is(entity, c)) parser.entity += c | ||
else { | ||
strictFail("Invalid character entity"); | ||
parser[buffer] += "&" + parser.entity; | ||
parser.entity = ""; | ||
parser.state = returnState; | ||
strictFail("Invalid character entity") | ||
parser[buffer] += "&" + parser.entity | ||
parser.entity = "" | ||
parser.state = returnState | ||
} | ||
continue; | ||
continue | ||
default: | ||
throw new Error(parser, "Unknown state: " + parser.state); | ||
break; | ||
throw new Error(parser, "Unknown state: " + parser.state) | ||
break | ||
} | ||
@@ -621,8 +711,9 @@ } // while | ||
// if (parser.state === S.CDATA && parser.cdata) { | ||
// emitNode(parser, "oncdata", parser.cdata); | ||
// parser.cdata = ""; | ||
// emitNode(parser, "oncdata", parser.cdata) | ||
// parser.cdata = "" | ||
// } | ||
if (parser.position >= parser.bufferCheckPosition) checkBufferLength(parser); | ||
return parser; | ||
if (parser.position >= parser.bufferCheckPosition) checkBufferLength(parser) | ||
return parser | ||
} | ||
})(typeof exports === "undefined" ? sax = {} : exports) |
{ "name" : "sax" | ||
, "author" : "Isaac Z. Schlueter <i@izs.me>" | ||
, "version" : "0.1.5" | ||
, "version" : "0.2.0" | ||
, "main" : "lib/sax" | ||
@@ -5,0 +5,0 @@ , "license" : "MIT" |
@@ -60,2 +60,24 @@ # sax js | ||
// stream usage | ||
// takes the same options as the parser | ||
var saxStream = require("sax").createStream(strict, options) | ||
saxStream.on("error", function (e) { | ||
// unhandled errors will throw, since this is a proper node | ||
// event emitter. | ||
console.error("error!", e) | ||
// clear the error | ||
this._parser.error = null | ||
this._parser.resume() | ||
}) | ||
saxStream.on("opentag", function (node) { | ||
// same object as above | ||
}) | ||
// pipe is supported, and it's readable/writable | ||
// same chunks coming in also go out. | ||
fs.createReadStream("file.xml") | ||
.pipe(saxStream) | ||
.pipe(fs.createWriteStream("file-copy.xml"))
## Arguments | ||
@@ -109,2 +131,5 @@ | ||
When using the stream interface, assign handlers using the EventEmitter | ||
`on` function in the normal fashion. | ||
`error` - Indication that something bad happened. The error will be hanging out on | ||
@@ -149,8 +174,1 @@ `parser.error`, and must be deleted before parsing can continue. By listening to | ||
`ready` - Indication that the stream has reset, and is ready to be written to. | ||
## Todo | ||
Build an HTML parser on top of this, which follows the same parsing rules as web browsers. | ||
Make it fast by replacing the trampoline with a switch, and not buffering so much | ||
stuff. |
129892
1146
172