Comparing version 0.0.1 to 0.0.2
555
index.js
@@ -11,7 +11,7 @@ // Castor HTML/XML Parser | ||
(function(glob) { | ||
"use strict"; | ||
// Some grammar-related constants... | ||
// Parser internal states | ||
@@ -23,9 +23,10 @@ var STATE_UNINITIALISED = 0, | ||
STATE_WITHIN_XML_INSTRUCTION = 4, | ||
STATE_WITHIN_CDATA = 5, | ||
STATE_WITHIN_COMMENT = 6, | ||
STATE_EXPECTING_ELEMENT_NAME = 7, | ||
STATE_EXPECTING_ATTRIBUTE_NAME = 8, | ||
STATE_EXPECTING_ATTRIBUTE_VALUE = 9, | ||
STATE_EXPECTING_ELEMENT_CLOSE = 10; | ||
STATE_EXPECTING_CDATA = 5, | ||
STATE_WITHIN_CDATA = 6, | ||
STATE_WITHIN_COMMENT = 7, | ||
STATE_EXPECTING_ELEMENT_NAME = 8, | ||
STATE_EXPECTING_ATTRIBUTE_NAME = 9, | ||
STATE_EXPECTING_ATTRIBUTE_VALUE = 10, | ||
STATE_EXPECTING_ELEMENT_CLOSE = 11; | ||
// Nodes which should be considered implicitly self-closing | ||
@@ -36,3 +37,10 @@ // Taken from http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#void-elements | ||
]; | ||
// Nodes which should be considered 'raw text' or rawCDATA elements | ||
// (that is, only </ should trigger a parser state change. Combined because the internal distinction wasn't that useful.) | ||
// Taken from http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#raw-text-elements | ||
var rawTextElements = [ | ||
"script", "style", "textarea", "title" | ||
]; | ||
// Omission map (end tags) | ||
@@ -62,3 +70,3 @@ // Determines which tags will automatically be closed when another begins, and by which sibling tags they will be closed. | ||
} | ||
// Node type/class | ||
@@ -73,3 +81,3 @@ // Based on DOM, but doesn't implement _ANY_ of the DOM functions. | ||
this.textContent = nodeType !== 1 && typeof tagParameter === "string" ? tagParameter : ""; | ||
// Custom stuff for heuristics... | ||
@@ -79,14 +87,26 @@ this.weight = 0; | ||
}; | ||
// Helper function that implicitly empty nodes to determine whether they should be closed or not. | ||
// Helper function that determines whether the current node is 'raw text', that is, | ||
// parser state should only change when the characters '</' are encountered. | ||
// Reports whether `node` matches one of the tag names listed in rawTextElements. | ||
// `node` may be a Node instance (its tagName is used); anything else is used as-is | ||
// and is expected to be a tag name string. A falsy value never matches, so the result is false. | ||
function isRawTextNode(node) { | ||
// Normalise the argument: Node instances are swapped for their tagName. | ||
node = node instanceof Node ? node.tagName : node; | ||
for (var nodeTypeIndex = 0; nodeTypeIndex < rawTextElements.length; nodeTypeIndex ++) { | ||
// The !!node guard short-circuits before node.toLowerCase() when there is no tag name. | ||
// Both sides are lower-cased, so the comparison is case-insensitive. | ||
if (!!node && rawTextElements[nodeTypeIndex].toLowerCase() === node.toLowerCase()) return true; | ||
} | ||
// No entry matched (or there was no tag name to compare). | ||
return false; | ||
} | ||
// Helper function that checks whether current node is 'implicitly empty' (void) and should be automatically closed. | ||
function isVoidNode(node) { | ||
node = node instanceof Node ? node.tagName : node; | ||
for (var nodeTypeIndex = 0; nodeTypeIndex < voidElements.length; nodeTypeIndex ++) { | ||
if (voidElements[nodeTypeIndex].toLowerCase() === node.toLowerCase()) return true; | ||
if (!!node && voidElements[nodeTypeIndex].toLowerCase() === node.toLowerCase()) return true; | ||
} | ||
return false; | ||
} | ||
// Determines whether a tag should close and become a sibling of the current node, or become | ||
@@ -96,3 +116,3 @@ // a child of the current node. | ||
var closesNode = false; | ||
if (omissionMap[currentTagName]) { | ||
@@ -103,7 +123,7 @@ for (var testIndex = 0; testIndex < omissionMap[currentTagName].length; testIndex ++) { | ||
} | ||
// The current node doesn't exist in the map or isn't closed by the specified tag. | ||
return false; | ||
} | ||
// Define Castor itself... | ||
@@ -113,6 +133,6 @@ // State parameters hang off the main Castor object. | ||
// methods of Castor. Anything which can act independently of state is written separately. | ||
var Castor = function() { | ||
this.doctype = null; | ||
// Variables for managing parser state... | ||
@@ -131,3 +151,3 @@ this.tree = new Node(99,"document"); // '99' is just a node number I've made up for the document node. | ||
this.prevChar2 = ""; | ||
// These two are just for debugging. | ||
@@ -137,3 +157,3 @@ this.lineNo = 0; | ||
}; | ||
// Helper function for altering the internal state of the parser. | ||
@@ -144,4 +164,4 @@ Castor.prototype.setState = function(newState) { | ||
}; | ||
// Hides buffer implementation (in case native buffers are eventually used.) | ||
@@ -151,4 +171,4 @@ Castor.prototype.buffer = function(c) { | ||
}; | ||
// Retrieves the current token buffer | ||
@@ -158,4 +178,4 @@ Castor.prototype.getBuffer = function() { | ||
}; | ||
// Destroys anything in the buffer. | ||
@@ -165,4 +185,4 @@ Castor.prototype.clearBuffer = function() { | ||
}; | ||
// Creates a new text node at the current depth with the buffer contents, | ||
@@ -178,7 +198,7 @@ // then clears it. | ||
} | ||
this.clearBuffer(); | ||
} | ||
// Closes the current node, but also scans up the stack (if a closing tag name was provided) to ensure it's balanced. | ||
@@ -191,3 +211,3 @@ Castor.prototype.closeNode = function(closingTagName) { | ||
var tmpCurrentNode = this.currentNode, tmpTreeDepth = this.treeDepth; | ||
// Repeat while we haven't reached the top of the tree yet, or found the node we're looking to close. | ||
@@ -198,9 +218,10 @@ while (tmpTreeDepth && !nodeFound) { | ||
} | ||
if (tmpCurrentNode.parentNode) { | ||
tmpCurrentNode = tmpCurrentNode.parentNode; | ||
tmpTreeDepth --; | ||
} | ||
tmpTreeDepth --; | ||
} | ||
// If we actually found the node we were looking for... | ||
@@ -212,7 +233,7 @@ if (nodeFound) { | ||
} | ||
return false; | ||
} | ||
// Bailout on error. | ||
@@ -223,17 +244,15 @@ // Generates an error describing where the error was... | ||
} | ||
// The actual parser function. | ||
Castor.prototype.parse = function(sourceInput) { | ||
var self = this; | ||
var charIndex = 0; | ||
sourceInput = typeof sourceInput === "string" ? sourceInput : sourceInput.toString(); | ||
// Kick off the parser! | ||
while (charIndex < sourceInput.length) { | ||
// Fetch most recent three characters, push previous characters back on the stack... | ||
@@ -243,3 +262,6 @@ self.prevChar2 = self.prevChar; | ||
self.curChar = sourceInput.substr(charIndex,1); | ||
// Increment parse pointer. | ||
charIndex ++; | ||
if (self.curChar === "\n") { | ||
@@ -251,29 +273,29 @@ self.lineNo ++; | ||
} | ||
switch (self.state) { | ||
case STATE_UNINITIALISED: | ||
// Parser is just collecting text in uninitialised mode. | ||
if (self.curChar === "<") { | ||
// Open tag of some description | ||
// Could be an element, processing instruction, comment, | ||
// CDATA or DOCTYPE | ||
self.setState(STATE_EXPECTING_TAG); | ||
} else { | ||
// Character wasn't an instruction, so we'll save it to our token buffer. | ||
self.buffer(self.curChar); | ||
} | ||
break; | ||
case STATE_EXPECTING_TAG: | ||
if (self.curChar === "!") { | ||
// We've got a 'bang' tag on our hands. | ||
@@ -283,48 +305,71 @@ // Wait for the character after this to determine what we're dealing with... | ||
self.setState(STATE_EXPECTING_BANG_QUALIFIER); | ||
} else if (self.curChar === "?") { | ||
// We've got an XML processing instruction on our hands. | ||
// For now, we just ignore these. Not like I need 'em for this anyway. | ||
self.setState(STATE_WITHIN_XML_INSTRUCTION); | ||
} else if (self.curChar.match(/[a-z0-9]/i)) { | ||
// Looks like an element node. Flush current buffer to tree and get ready to handle element name... | ||
self.flushBuffer(); | ||
self.buffer(self.curChar); | ||
self.setState(STATE_EXPECTING_ELEMENT_NAME); | ||
// Check to see whether we're inside a raw text node | ||
// (raw text nodes should not have children, and any child nodes will be treated as text.) | ||
if (isRawTextNode(self.currentNode)) { | ||
// Just buffer the previous character '<' and the current character | ||
// And set the state back to STATE_UNINITIALISED | ||
self.buffer(self.prevChar); | ||
self.buffer(self.curChar); | ||
self.setState(STATE_UNINITIALISED); | ||
} else { | ||
// Looks like an element node. Flush current buffer to tree and get ready to handle element name... | ||
self.flushBuffer(); | ||
self.buffer(self.curChar); | ||
self.setState(STATE_EXPECTING_ELEMENT_NAME); | ||
} | ||
} else if (self.curChar === "/") { | ||
// Looks like a closing tag. | ||
self.setState(STATE_EXPECTING_ELEMENT_CLOSE); | ||
// Buffer any previous text if it exists... | ||
self.flushBuffer(); | ||
} else if (self.curChar.match(/\s/)) { | ||
// The character after the tag was whitespace. We assume the tag is text (unescaped < character!) | ||
// and buffer up the previous and current character before dropping back to 'uninitialised' state. | ||
self.buffer(self.prevChar); | ||
self.buffer(self.curChar); | ||
self.setState(STATE_UNINITIALISED); | ||
} else { | ||
// We weren't expecting this character here. Bail out! | ||
self.bailout(); | ||
// Our expectations were subverted, but we're currently in a raw text node. | ||
if (isRawTextNode(self.currentNode)) { | ||
// So just buffer the previous and current characters and set our state back to uninitialised. | ||
self.buffer(self.prevChar); | ||
self.buffer(self.curChar); | ||
self.setState(STATE_UNINITIALISED); | ||
} else { | ||
// We weren't expecting this character here. Bail out! | ||
self.bailout(); | ||
} | ||
} | ||
break; | ||
// 3 node types start with <!. Wait for next character to determine what this is. | ||
case STATE_EXPECTING_BANG_QUALIFIER: | ||
if (self.curChar === "D" || self.curChar === "d") { | ||
// This looks like a DOCTYPE! | ||
@@ -334,20 +379,25 @@ self.flushBuffer(); | ||
self.setState(STATE_WITHIN_DOCTYPE); | ||
} else if (self.curChar === "-") { | ||
// This looks like a comment! | ||
self.setState(STATE_WITHIN_COMMENT); | ||
} else if (self.curChar === "[") { | ||
// This looks like a CDATA section! | ||
self.setState(STATE_EXPECTING_CDATA); | ||
} else { | ||
// Uh... | ||
self.bailout(); | ||
} | ||
break; | ||
case STATE_WITHIN_DOCTYPE: | ||
if (self.curChar === ">") { | ||
// That's the end of our doctype! Save it and move on... | ||
@@ -357,3 +407,3 @@ self.doctype = self.getBuffer(); | ||
self.setState(STATE_UNINITIALISED); | ||
} else { | ||
@@ -363,31 +413,93 @@ // Do we even pay attention to doctypes? | ||
} | ||
break; | ||
case STATE_WITHIN_XML_INSTRUCTION: | ||
bailout("XML instructions not yet supported."); | ||
// For now, just an <strike>elaborate</strike> system designed to ignore XML instructions! | ||
if (self.curChar === ">" && self.prevChar === "?") { | ||
self.setState(STATE_UNINITIALISED); | ||
} | ||
break; | ||
case STATE_EXPECTING_CDATA: | ||
if (self.curChar === "[" && | ||
self.prevChar === "A" && | ||
self.prevChar2 === "T") { | ||
// That's the end of the <![CDATA[ beginning tag... | ||
self.setState(STATE_WITHIN_CDATA); | ||
} else if (self.curChar.match(/[CDATA]/i)) { | ||
// Ignore allowed characters in CDATA opening tag | ||
} else if (self.curChar === ">") { | ||
// Allow cancelling CDATA if tag wasn't opened properly... | ||
self.setState(STATE_UNINITIALISED); | ||
} else { | ||
// Hrm. That certainly knocked us for six. | ||
self.bailout(); | ||
} | ||
break; | ||
case STATE_WITHIN_CDATA: | ||
bailout("CDATA not yet supported."); | ||
if (self.curChar === ">" && | ||
self.prevChar === "]" && | ||
self.prevChar2 === "]") { | ||
// For the time being I'll just save CDATA nodes as text. | ||
self.flushBuffer(); | ||
// That's the end of our CDATA section! | ||
self.setState(STATE_UNINITIALISED); | ||
} else if (self.curChar === "]") { | ||
// Don't buffer this character yet. | ||
// However, we're only interested in sequences of /two/ square | ||
// brackets. So if prevChar2 is a bracket, buffer it. | ||
if (self.prevChar2 === "]") { | ||
self.buffer(self.prevChar2); | ||
} | ||
} else { | ||
// If the last character was one of those verboten square brackets, | ||
// we can finally be sure it wasn't part of a closing tag. buffer it. | ||
if (self.prevChar === "]") { | ||
self.buffer(self.prevChar); | ||
} | ||
// And buffer everything else. | ||
self.buffer(self.curChar); | ||
} | ||
break; | ||
case STATE_WITHIN_COMMENT: | ||
if (self.curChar === "-") { | ||
// Do nothing for now. We'll work out whether we're going to buffer this | ||
// as part of the comment later. | ||
} else if (self.curChar === ">") { | ||
if (self.prevChar === "-" && self.prevChar2 === "-") { | ||
// OK then - that's the end of the comment! | ||
// Create a new comment node with the contents of the buffer. | ||
if (self.getBuffer().length) { | ||
@@ -399,43 +511,46 @@ var newComment = new Node(8,self.getBuffer()); | ||
} | ||
// Revert to uninitialised state. | ||
self.setState(STATE_UNINITIALISED); | ||
} else { | ||
self.buffer(self.curChar); | ||
} | ||
} else { | ||
// Because we don't buffer "-" characters immediately (in case they're part of the closing token) | ||
// we wait until we know it isn't followed by another "-", and then we buffer it. | ||
if (self.prevChar === "-" && self.curChar !== "-" && !self.curChar.match(/\s/)) { | ||
self.buffer(self.prevChar); | ||
} | ||
self.buffer(self.curChar); | ||
} | ||
break; | ||
case STATE_EXPECTING_ELEMENT_NAME: | ||
if (self.curChar.match(/[a-z0-9]/i)) { | ||
// For the time being, we'll just treat namespaces as tag/attribute names. | ||
// TODO: split namespaces out and extend the node class to accommodate them. | ||
if (self.curChar.match(/[a-z0-9\-\:]/i)) { | ||
// Just a text character. Buffer up! | ||
self.buffer(self.curChar); | ||
} else { | ||
// Get the element name from the buffer... | ||
var elementName = self.getBuffer(); | ||
// If this tag implicitly closes the currently open node | ||
if (closesCurrentNode(elementName,self.currentNode.tagName)) { | ||
// Then close the currently open node first. | ||
self.closeNode() | ||
} | ||
// Create element node with the buffer as its tagName | ||
@@ -447,23 +562,23 @@ // Assign the current node as its parent... | ||
self.clearBuffer(); | ||
// Set as the current node | ||
self.currentNode = newElement; | ||
// Increase our tree-depth for tracking/debugging | ||
self.treeDepth ++; | ||
if (self.curChar === ">") { | ||
// If we're an attribute-less opening tag, just switch back to uninitialised state. | ||
self.setState(STATE_UNINITIALISED); | ||
} else if (self.curChar.match(/\s/)) { | ||
// If there's whitespace we must be expecting attributes! | ||
self.setState(STATE_EXPECTING_ATTRIBUTE_NAME); | ||
} else if (self.curChar === "/") { | ||
// We're a self-closing tag. | ||
self.setState(STATE_EXPECTING_ELEMENT_CLOSE); | ||
} else { | ||
@@ -474,11 +589,11 @@ // Error condition... | ||
} | ||
break; | ||
case STATE_EXPECTING_ATTRIBUTE_NAME: | ||
if (self.curChar === "/") { | ||
// Oh, it was just some whitespace before the end of a self-closing tag. | ||
self.setState(STATE_EXPECTING_ELEMENT_CLOSE); | ||
// Well, we save it to the node if there was anything in the buffer! | ||
@@ -490,5 +605,7 @@ if (self.getBuffer().length) { | ||
} | ||
} else if (self.curChar.match(/[a-z0-9\-]/i)) { | ||
// For the time being, we'll just treat namespaces as tag/attribute names. | ||
// TODO: split namespaces out and extend the node class to accommodate them. | ||
} else if (self.curChar.match(/[a-z0-9\-\:]/i)) { | ||
// looks like an attribute name to me! | ||
@@ -498,5 +615,5 @@ // But wait. If the previous character was whitespace, and the previous state was also | ||
// current node. Otherwise, just buffer away! | ||
if (self.prevChar.match(/\s/) && self.prevState === STATE_EXPECTING_ATTRIBUTE_NAME) { | ||
// Well, we save it to the node if there was anything in the buffer! | ||
@@ -508,31 +625,31 @@ if (self.getBuffer().length) { | ||
} | ||
} | ||
self.buffer(self.curChar); | ||
} else if (self.curChar.match(/\s/)) { | ||
// Ignore whitespace. | ||
} else if (self.curChar === "=") { | ||
// Looks like we're being primed for a value... | ||
self.setState(STATE_EXPECTING_ATTRIBUTE_VALUE); | ||
// Capture and clear the buffer... | ||
self.currentAttribute = self.getBuffer(); | ||
self.clearBuffer(); | ||
} else if (self.curChar === "'" || self.curChar === "\"") { | ||
// Some idiot just stuck a string delimiter directly after an attribute name without an equals sign. | ||
// Never mind, we can deal with that. | ||
if (self.getBuffer().length) { | ||
// Well handling it this way only makes sense if we've actually got something in the buffer | ||
// to use as an attribute name. Otherwise there's nothing to attach the attribute value to. | ||
self.currentDelimiter = self.curChar; | ||
self.setState(STATE_EXPECTING_ATTRIBUTE_VALUE); | ||
// Capture and clear the buffer... | ||
@@ -542,7 +659,7 @@ self.currentAttribute = self.getBuffer(); | ||
} | ||
} else if (self.curChar === ">") { | ||
// So the element was closed. | ||
// If there's anything in the buffer, consider it a boolean attribute. | ||
if (self.getBuffer().length) { | ||
@@ -553,22 +670,22 @@ // Save the buffer as a boolean attribute, and clear it. | ||
} | ||
if (isVoidNode(self.currentNode) && self.currentNode.parentNode) { | ||
self.currentNode = self.currentNode.parentNode; | ||
} | ||
// Return to uninitialised state. | ||
self.setState(STATE_UNINITIALISED); | ||
} | ||
break; | ||
case STATE_EXPECTING_ATTRIBUTE_VALUE: | ||
// Are we in a delimited string? | ||
if (self.currentDelimiter.length) { | ||
if (self.curChar === self.currentDelimiter) { | ||
self.currentDelimiter = ""; | ||
if (self.getBuffer().length) { | ||
@@ -579,13 +696,13 @@ // In that case, save the buffer to the node attributes and start looking for more! | ||
} | ||
// Oh, we've reached a matching delimiter? Well that's the end of that then. | ||
self.setState(STATE_EXPECTING_ATTRIBUTE_NAME); | ||
} else { | ||
self.buffer(self.curChar); | ||
} | ||
} else { | ||
if (self.curChar === "/" || self.curChar === ">") { | ||
// We hit the end of the tag. Branch according to whether we're still expecting another character or not. | ||
@@ -595,17 +712,17 @@ if (self.curChar === "/") { | ||
} else { | ||
// If this node is in our list of tag types to be implicitly closed, then close it. | ||
// Then switch back to STATE_UNINITIALISED. | ||
if (isVoidNode(self.currentNode)) { | ||
self.closeNode(); | ||
} | ||
self.setState(STATE_UNINITIALISED); | ||
} | ||
// Is there actually something in the buffer? | ||
if (self.getBuffer().length) { | ||
self.currentNode.attributes[self.currentAttribute] = self.getBuffer(); | ||
} else { | ||
@@ -615,7 +732,7 @@ // Couldn't extract a value? Treat this as a boolean attribute. | ||
} | ||
self.clearBuffer(); | ||
} else if (self.curChar === "'" || self.curChar === "\"") { | ||
// Hit a delimiter. If there's something in the buffer, consider the final delimiter | ||
@@ -628,11 +745,11 @@ // for a string with a missing first delimiter. | ||
self.setState(STATE_EXPECTING_ATTRIBUTE_NAME); | ||
} else { | ||
// Set the current string delimiter. | ||
// Nothing in the buffer, so no need to clear it! | ||
self.currentDelimiter = self.curChar; | ||
} | ||
} else if (self.curChar.match(/\s/)) { | ||
@@ -642,3 +759,3 @@ // Whitespace. If there's nothing in the buffer, we haven't gotten to the attribute value yet, | ||
// back to STATE_EXPECTING_ATTRIBUTE_NAME after saving the attribute value to the node. | ||
if (self.getBuffer().length) { | ||
@@ -651,34 +768,63 @@ self.currentNode.attributes[self.currentAttribute] = self.getBuffer(); | ||
} | ||
break; | ||
case STATE_EXPECTING_ELEMENT_CLOSE: | ||
if (self.curChar.match(/[a-z0-9]/i)) { | ||
// Dealing with a closing tag name? Just buffer it. | ||
self.buffer(self.curChar); | ||
} else if (self.curChar === ">") { | ||
if (isRawTextNode(self.currentNode)) { | ||
if (self.currentNode.tagName.toLowerCase() !== self.getBuffer().toLowerCase()) { | ||
// If we're in a raw text node now, and a closing tag arrives which isn't for us, | ||
// we totally ignore it, rather than trying hard to balance out the hierarchy. | ||
// Instead, treat as text and begin a new buffer! | ||
var tmpBuffer = self.getBuffer(); | ||
self.clearBuffer(); | ||
self.buffer("</" + tmpBuffer + ">"); | ||
self.setState(STATE_UNINITIALISED); | ||
// Oh, and break out of this before we actually do close the node! | ||
break; | ||
} | ||
} | ||
// Close the current node... | ||
// Pass in the tagName specified by the closing tag so we can ensure the tree is balanced... | ||
self.closeNode(self.getBuffer()); | ||
// ...And return to uninitialised state. | ||
self.setState(STATE_UNINITIALISED); | ||
// And clear the buffer. | ||
self.clearBuffer(); | ||
} else if (self.curChar.match(/\s/)) { | ||
// Ignore whitespace... | ||
} else { | ||
self.bailout(); | ||
// So our expectation was subverted. | ||
// Treat the closing tag as character data and revert to uninitialised state. | ||
if (self.getBuffer().length) { | ||
// Do a bit of juggling to put the '</' back in front of the buffered tag name | ||
var tmpBufferData = self.getBuffer(); | ||
self.clearBuffer(); | ||
self.buffer("</" + tmpBufferData); | ||
} | ||
// The buffer should be saved to a text node next time the parser state is altered. | ||
self.setState(STATE_UNINITIALISED); | ||
} | ||
break; | ||
default: | ||
@@ -689,18 +835,15 @@ // Something happened we weren't expecting. | ||
} | ||
// Increment parse pointer. | ||
charIndex ++; | ||
} | ||
// Return the finished parse tree to the calling function... | ||
return self.tree; | ||
}; | ||
function castorExport() { | ||
return new Castor(); | ||
} | ||
castorExport.Castor = Castor; | ||
(typeof module != "undefined" && module.exports) ? (module.exports = castorExport) : (typeof define != "undefined" ? (define("castor", [], function() { return castorExport; })) : (glob.castor = castorExport)); | ||
})(this); |
@@ -5,3 +5,3 @@ { | ||
"description": "Ultra-simple HTML/XML parser. Generates a DOM-like (but extremely simplified and not at all compliant) tree.", | ||
"version": "0.0.1", | ||
"version": "0.0.2", | ||
"homepage": "https://github.com/cgiffard/Castor", | ||
@@ -8,0 +8,0 @@ "repository": { |
456063
629