castor - npm Package Compare versions

Comparing version 0.0.1 to 0.0.2

555

index.js

		@@ -11,7 +11,7 @@ // Castor HTML/XML Parser
		(function(glob) {


		"use strict";


		// Some grammar-related constants...


		// Parser internal states
		@@ -23,9 +23,10 @@ var STATE_UNINITIALISED = 0,
		STATE_WITHIN_XML_INSTRUCTION = 4,
		STATE_WITHIN_CDATA = 5,
		STATE_WITHIN_COMMENT = 6,
		STATE_EXPECTING_ELEMENT_NAME = 7,
		STATE_EXPECTING_ATTRIBUTE_NAME = 8,
		STATE_EXPECTING_ATTRIBUTE_VALUE = 9,
		STATE_EXPECTING_ELEMENT_CLOSE = 10;

		STATE_EXPECTING_CDATA = 5,
		STATE_WITHIN_CDATA = 6,
		STATE_WITHIN_COMMENT = 7,
		STATE_EXPECTING_ELEMENT_NAME = 8,
		STATE_EXPECTING_ATTRIBUTE_NAME = 9,
		STATE_EXPECTING_ATTRIBUTE_VALUE = 10,
		STATE_EXPECTING_ELEMENT_CLOSE = 11;

		// Nodes which should be considered implicitly self-closing
		@@ -36,3 +37,10 @@ // Taken from http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#void-elements
		];


		// Nodes which should be considered 'raw text' or rawCDATA elements
		// (that is, only </ should trigger a parser state change. Combined because the internal distinction wasn't that useful.)
		// Taken from http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#raw-text-elements
		// Looked up by isRawTextNode(), which compares tag names case-insensitively.
		var rawTextElements = [
			"script",
			"style",
			"textarea",
			"title"
		];

		// Omission map (end tags)
		@@ -62,3 +70,3 @@ // Determines which tags will automatically be closed when another begins, and by which sibling tags they will be closed.
		}


		// Node type/class
		@@ -73,3 +81,3 @@ // Based on DOM, but doesn't implement _ANY_ of the DOM functions.
		this.textContent = nodeType !== 1 && typeof tagParameter === "string" ? tagParameter : "";


		// Custom stuff for heuristics...
		@@ -79,14 +87,26 @@ this.weight = 0;
		};

		// Helper function that checks implicitly empty nodes to determine whether they should be closed or not.

		// Helper function that reports whether a node is 'raw text', that is, whether
		// parser state should only change when the characters '</' are encountered.
		// Accepts either a Node instance (its tagName is used) or a tag name, and
		// returns true when that name matches an entry of rawTextElements, ignoring case.
		// A falsy name (undefined, null, empty string) always yields false.
		function isRawTextNode(node) {
			var candidateName = node instanceof Node ? node.tagName : node;

			return rawTextElements.some(function(rawTextName) {
				return !!candidateName && rawTextName.toLowerCase() === candidateName.toLowerCase();
			});
		}

		// Helper function that checks whether the current node is 'implicitly empty' (void)
		// and should be automatically closed.
		// Accepts either a Node instance (its tagName is used) or a tag name string, and
		// returns true when that name matches an entry of voidElements, ignoring case.
		function isVoidNode(node) {
			node = node instanceof Node ? node.tagName : node;

			// A missing or empty tag name can never be a void element. Checking this once,
			// up front, avoids calling toLowerCase() on undefined/null (which throws).
			if (!node) return false;

			// Lower-case the candidate once rather than on every loop iteration.
			var lowerCaseName = node.toLowerCase();

			for (var nodeTypeIndex = 0; nodeTypeIndex < voidElements.length; nodeTypeIndex ++) {
				if (voidElements[nodeTypeIndex].toLowerCase() === lowerCaseName) return true;
			}

			return false;
		}


		// Determines whether a tag should close and become a sibling of the current node, or become
		@@ -96,3 +116,3 @@ // a child of the current node.
		var closesNode = false;


		if (omissionMap[currentTagName]) {
		@@ -103,7 +123,7 @@ for (var testIndex = 0; testIndex < omissionMap[currentTagName].length; testIndex ++) {
		}


		// The current node doesn't exist in the map or isn't closed by the specified tag.
		return false;
		}


		// Define Castor itself...
		@@ -113,6 +133,6 @@ // State parameters hang off the main Castor object.
		// methods of Castor. Anything which can act independently of state is written separately.


		var Castor = function() {
		this.doctype = null;


		// Variables for managing parser state...
		@@ -131,3 +151,3 @@ this.tree = new Node(99,"document"); // '99' is just a node number I've made up for the document node.
		this.prevChar2 = "";


		// These two are just for debugging.
		@@ -137,3 +157,3 @@ this.lineNo = 0;
		};


		// Helper function for altering the internal state of the parser.
		@@ -144,4 +164,4 @@ Castor.prototype.setState = function(newState) {
		};




		// Hides buffer implementation (in case native buffers are eventually used.)
		@@ -151,4 +171,4 @@ Castor.prototype.buffer = function(c) {
		};




		// Retrieves the current token buffer
		@@ -158,4 +178,4 @@ Castor.prototype.getBuffer = function() {
		};




		// Destroys anything in the buffer.
		@@ -165,4 +185,4 @@ Castor.prototype.clearBuffer = function() {
		};




		// Creates a new text node at the current depth with the buffer contents,
		@@ -178,7 +198,7 @@ // then clears it.
		}


		this.clearBuffer();
		}




		// Closes the current node, but also scans up the stack (if a closing tag name was provided) to ensure it's balanced.
		@@ -191,3 +211,3 @@ Castor.prototype.closeNode = function(closingTagName) {
		var tmpCurrentNode = this.currentNode, tmpTreeDepth = this.treeDepth;


		// Repeat while we haven't reached the top of the tree yet, or found the node we're looking to close.
		@@ -198,9 +218,10 @@ while (tmpTreeDepth && !nodeFound) {
		}


		if (tmpCurrentNode.parentNode) {
		tmpCurrentNode = tmpCurrentNode.parentNode;
		tmpTreeDepth --;
		}

		tmpTreeDepth --;
		}


		// If we actually found the node we were looking for...
		@@ -212,7 +233,7 @@ if (nodeFound) {
		}


		return false;
		}




		// Bailout on error.
		@@ -223,17 +244,15 @@ // Generates an error describing where the error was...
		}






		// The actual parser function.


		Castor.prototype.parse = function(sourceInput) {
		var self = this;
		var charIndex = 0;


		sourceInput = typeof sourceInput === "string" ? sourceInput : sourceInput.toString();


		// Kick off the parser!

		while (charIndex < sourceInput.length) {

		// Fetch most recent three characters, push previous characters back on the stack...
		@@ -243,3 +262,6 @@ self.prevChar2 = self.prevChar;
		self.curChar = sourceInput.substr(charIndex,1);


		// Increment parse pointer.
		charIndex ++;

		if (self.curChar === "\n") {
		@@ -251,29 +273,29 @@ self.lineNo ++;
		}


		switch (self.state) {
		case STATE_UNINITIALISED:


		// Parser is just collecting text in uninitialised mode.
		if (self.curChar === "<") {


		// Open tag of some description
		// Could be an element, processing instruction, comment,
		// CDATA or DOCTYPE


		self.setState(STATE_EXPECTING_TAG);


		} else {


		// Character wasn't an instruction, so we'll save it to our token buffer.


		self.buffer(self.curChar);
		}




		break;


		case STATE_EXPECTING_TAG:


		if (self.curChar === "!") {


		// We've got a 'bang' tag on our hands.
		@@ -283,48 +305,71 @@ // Wait for the character after this to determine what we're dealing with...
		self.setState(STATE_EXPECTING_BANG_QUALIFIER);


		} else if (self.curChar === "?") {


		// We've got an XML processing instruction on our hands.
		// For now, we just ignore these. Not like I need 'em for this anyway.


		self.setState(STATE_WITHIN_XML_INSTRUCTION);


		} else if (self.curChar.match(/[a-z0-9]/i)) {

		// Looks like an element node. Flush current buffer to tree and get ready to handle element name...
		self.flushBuffer();
		self.buffer(self.curChar);
		self.setState(STATE_EXPECTING_ELEMENT_NAME);


		// Check to see whether we're inside a raw text node
		// (raw text nodes should not have children, and any child nodes will be treated as text.)
		if (isRawTextNode(self.currentNode)) {

		// Just buffer the previous character '<' and the current character
		// And set the state back to STATE_UNINITIALISED
		self.buffer(self.prevChar);
		self.buffer(self.curChar);
		self.setState(STATE_UNINITIALISED);

		} else {
		// Looks like an element node. Flush current buffer to tree and get ready to handle element name...
		self.flushBuffer();
		self.buffer(self.curChar);
		self.setState(STATE_EXPECTING_ELEMENT_NAME);
		}

		} else if (self.curChar === "/") {


		// Looks like a closing tag.
		self.setState(STATE_EXPECTING_ELEMENT_CLOSE);


		// Buffer any previous text if it exists...
		self.flushBuffer();


		} else if (self.curChar.match(/\s/)) {


		// The character after the tag was whitespace. We assume the tag is text (unescaped < character!)
		// and buffer up the previous and current character before dropping back to 'uninitialised' state.


		self.buffer(self.prevChar);
		self.buffer(self.curChar);
		self.setState(STATE_UNINITIALISED);


		} else {

		// We weren't expecting this character here. Bail out!
		self.bailout();

		// Our expectations were subverted, but we're currently in a raw text node.
		if (isRawTextNode(self.currentNode)) {

		// So just buffer the previous and current characters and set our state back to uninitialised.
		self.buffer(self.prevChar);
		self.buffer(self.curChar);
		self.setState(STATE_UNINITIALISED);

		} else {

		// We weren't expecting this character here. Bail out!
		self.bailout();
		}
		}


		break;




		// 3 node types start with <!. Wait for next character to determine what this is.
		case STATE_EXPECTING_BANG_QUALIFIER:


		if (self.curChar === "D" \|\| self.curChar === "d") {


		// This looks like a DOCTYPE!
		@@ -334,20 +379,25 @@ self.flushBuffer();
		self.setState(STATE_WITHIN_DOCTYPE);


		} else if (self.curChar === "-") {


		// This looks like a comment!
		self.setState(STATE_WITHIN_COMMENT);


		} else if (self.curChar === "[") {

		// This looks like a CDATA section!
		self.setState(STATE_EXPECTING_CDATA);

		} else {


		// Uh...
		self.bailout();
		}


		break;


		case STATE_WITHIN_DOCTYPE:


		if (self.curChar === ">") {


		// That's the end of our doctype! Save it and move on...
		@@ -357,3 +407,3 @@ self.doctype = self.getBuffer();
		self.setState(STATE_UNINITIALISED);


		} else {
		@@ -363,31 +413,93 @@ // Do we even pay attention to doctypes?
		}


		break;


		case STATE_WITHIN_XML_INSTRUCTION:

		bailout("XML instructions not yet supported.");


		// For now, just an <strike>elaborate</strike> system designed to ignore XML instructions!

		if (self.curChar === ">" && self.prevChar === "?") {
		self.setState(STATE_UNINITIALISED);
		}

		break;


		case STATE_EXPECTING_CDATA:

		if (self.curChar === "[" &&
		self.prevChar === "A" &&
		self.prevChar2 === "T") {

		// That's the end of the <![CDATA[ beginning tag...
		self.setState(STATE_WITHIN_CDATA);

		} else if (self.curChar.match(/[CDATA]/i)) {

		// Ignore allowed characters in CDATA opening tag

		} else if (self.curChar === ">") {

		// Allow cancelling CDATA if tag wasn't opened properly...
		self.setState(STATE_UNINITIALISED);

		} else {

		// Hrm. That certainly knocked us for six.
		self.bailout();

		}

		break;

		case STATE_WITHIN_CDATA:

		bailout("CDATA not yet supported.");


		if (self.curChar === ">" &&
		self.prevChar === "]" &&
		self.prevChar2 === "]") {

		// For the time being I'll just save CDATA nodes as text.
		self.flushBuffer();

		// That's the end of our CDATA section!
		self.setState(STATE_UNINITIALISED);

		} else if (self.curChar === "]") {

		// Don't buffer this character yet.
		// However, we're only interested in sequences of /two/ square
		// brackets. So if prevChar2 is a bracket, buffer it.

		if (self.prevChar2 === "]") {
		self.buffer(self.prevChar2);
		}

		} else {

		// If the last character was one of those verboten square brackets,
		// we can finally be sure it wasn't part of a closing tag. buffer it.
		if (self.prevChar === "]") {
		self.buffer(self.prevChar);
		}

		// And buffer everything else.
		self.buffer(self.curChar);

		}

		break;


		case STATE_WITHIN_COMMENT:


		if (self.curChar === "-") {


		// Do nothing for now. We'll work out whether we're going to buffer this
		// as part of the comment later.


		} else if (self.curChar === ">") {


		if (self.prevChar === "-" && self.prevChar2 === "-") {


		// OK then - that's the end of the comment!
		// Create a new comment node with the contents of the buffer.


		if (self.getBuffer().length) {
		@@ -399,43 +511,46 @@ var newComment = new Node(8,self.getBuffer());
		}


		// Revert to uninitialised state.
		self.setState(STATE_UNINITIALISED);


		} else {
		self.buffer(self.curChar);
		}


		} else {


		// Because we don't buffer "-" characters immediately (in case they're part of the closing token)
		// we wait until we know it isn't followed by another "-", and then we buffer it.


		if (self.prevChar === "-" && self.curChar !== "-" && !self.curChar.match(/\s/)) {
		self.buffer(self.prevChar);
		}


		self.buffer(self.curChar);
		}


		break;


		case STATE_EXPECTING_ELEMENT_NAME:

		if (self.curChar.match(/[a-z0-9]/i)) {

		// For the time being, we'll just treat namespaces as tag/attribute names.
		// TODO: split namespaces out and extend the node class to accommodate them.
		if (self.curChar.match(/[a-z0-9\-\:]/i)) {

		// Just a text character. Buffer up!
		self.buffer(self.curChar);


		} else {


		// Get the element name from the buffer...
		var elementName = self.getBuffer();


		// If this tag implicitly closes the currently open node
		if (closesCurrentNode(elementName,self.currentNode.tagName)) {


		// Then close the currently open node first.
		self.closeNode()
		}




		// Create element node with the buffer as its tagName
		@@ -447,23 +562,23 @@ // Assign the current node as its parent...
		self.clearBuffer();


		// Set as the current node
		self.currentNode = newElement;


		// Increase our tree-depth for tracking/debugging
		self.treeDepth ++;


		if (self.curChar === ">") {
		// If we're an attribute-less opening tag, just switch back to uninitialised state.
		self.setState(STATE_UNINITIALISED);


		} else if (self.curChar.match(/\s/)) {
		// If there's whitespace we must be expecting attributes!


		self.setState(STATE_EXPECTING_ATTRIBUTE_NAME);


		} else if (self.curChar === "/") {
		// We're a self-closing tag.


		self.setState(STATE_EXPECTING_ELEMENT_CLOSE);


		} else {
		@@ -474,11 +589,11 @@ // Error condition...
		}


		break;


		case STATE_EXPECTING_ATTRIBUTE_NAME:


		if (self.curChar === "/") {
		// Oh, it was just some whitespace before the end of a self-closing tag.
		self.setState(STATE_EXPECTING_ELEMENT_CLOSE);


		// Well, we save it to the node if there was anything in the buffer!
		@@ -490,5 +605,7 @@ if (self.getBuffer().length) {
		}

		} else if (self.curChar.match(/[a-z0-9\-]/i)) {


		// For the time being, we'll just treat namespaces as tag/attribute names.
		// TODO: split namespaces out and extend the node class to accommodate them.
		} else if (self.curChar.match(/[a-z0-9\-\:]/i)) {

		// looks like an attribute name to me!
		@@ -498,5 +615,5 @@ // But wait. If the previous character was whitespace, and the previous state was also
		// current node. Otherwise, just buffer away!


		if (self.prevChar.match(/\s/) && self.prevState === STATE_EXPECTING_ATTRIBUTE_NAME) {


		// Well, we save it to the node if there was anything in the buffer!
		@@ -508,31 +625,31 @@ if (self.getBuffer().length) {
		}


		}


		self.buffer(self.curChar);


		} else if (self.curChar.match(/\s/)) {
		// Ignore whitespace.


		} else if (self.curChar === "=") {


		// Looks like we're being primed for a value...
		self.setState(STATE_EXPECTING_ATTRIBUTE_VALUE);


		// Capture and clear the buffer...
		self.currentAttribute = self.getBuffer();
		self.clearBuffer();


		} else if (self.curChar === "'" \|\| self.curChar === "\"") {


		// Some idiot just stuck a string delimiter directly after an attribute name without an equals sign.
		// Never mind, we can deal with that.


		if (self.getBuffer().length) {
		// Well handling it this way only makes sense if we've actually got something in the buffer
		// to use as an attribute name. Otherwise there's nothing to attach the attribute value to.


		self.currentDelimiter = self.curChar;
		self.setState(STATE_EXPECTING_ATTRIBUTE_VALUE);


		// Capture and clear the buffer...
		@@ -542,7 +659,7 @@ self.currentAttribute = self.getBuffer();
		}


		} else if (self.curChar === ">") {
		// So the element was closed.
		// If there's anything in the buffer, consider it a boolean attribute.


		if (self.getBuffer().length) {
		@@ -553,22 +670,22 @@ // Save the buffer as a boolean attribute, and clear it.
		}


		if (isVoidNode(self.currentNode) && self.currentNode.parentNode) {
		self.currentNode = self.currentNode.parentNode;
		}


		// Return to uninitialised state.
		self.setState(STATE_UNINITIALISED);
		}


		break;


		case STATE_EXPECTING_ATTRIBUTE_VALUE:


		// Are we in a delimited string?


		if (self.currentDelimiter.length) {
		if (self.curChar === self.currentDelimiter) {


		self.currentDelimiter = "";


		if (self.getBuffer().length) {
		@@ -579,13 +696,13 @@ // In that case, save the buffer to the node attributes and start looking for more!
		}


		// Oh, we've reached a matching delimiter? Well that's the end of that then.
		self.setState(STATE_EXPECTING_ATTRIBUTE_NAME);


		} else {
		self.buffer(self.curChar);
		}


		} else {
		if (self.curChar === "/" \|\| self.curChar === ">") {


		// We hit the end of the tag. Branch according to whether we're still expecting another character or not.
		@@ -595,17 +712,17 @@ if (self.curChar === "/") {
		} else {


		// If this node is in our list of tag types to be implicitly closed, then close it.
		// Then switch back to STATE_UNINITIALISED.


		if (isVoidNode(self.currentNode)) {
		self.closeNode();
		}


		self.setState(STATE_UNINITIALISED);
		}


		// Is there actually something in the buffer?
		if (self.getBuffer().length) {
		self.currentNode.attributes[self.currentAttribute] = self.getBuffer();


		} else {
		@@ -615,7 +732,7 @@ // Couldn't extract a value? Treat this as a boolean attribute.
		}


		self.clearBuffer();


		} else if (self.curChar === "'" \|\| self.curChar === "\"") {


		// Hit a delimiter. If there's something in the buffer, consider the final delimiter
		@@ -628,11 +745,11 @@ // for a string with a missing first delimiter.
		self.setState(STATE_EXPECTING_ATTRIBUTE_NAME);


		} else {


		// Set the current string delimiter.
		// Nothing in the buffer, so no need to clear it!
		self.currentDelimiter = self.curChar;


		}


		} else if (self.curChar.match(/\s/)) {
		@@ -642,3 +759,3 @@ // Whitespace. If there's nothing in the buffer, we haven't gotten to the attribute value yet,
		// back to STATE_EXPECTING_ATTRIBUTE_NAME after saving the attribute value to the node.


		if (self.getBuffer().length) {
		@@ -651,34 +768,63 @@ self.currentNode.attributes[self.currentAttribute] = self.getBuffer();
		}


		break;


		case STATE_EXPECTING_ELEMENT_CLOSE:


		if (self.curChar.match(/[a-z0-9]/i)) {


		// Dealing with a closing tag name? Just buffer it.
		self.buffer(self.curChar);


		} else if (self.curChar === ">") {


		if (isRawTextNode(self.currentNode)) {
		if (self.currentNode.tagName.toLowerCase() !== self.getBuffer().toLowerCase()) {
		// If we're in a raw text node now, and a closing tag arrives which isn't for us,
		// we totally ignore it, rather than trying hard to balance out the hierarchy.

		// Instead, treat as text and begin a new buffer!
		var tmpBuffer = self.getBuffer();
		self.clearBuffer();
		self.buffer("</" + tmpBuffer + ">");
		self.setState(STATE_UNINITIALISED);

		// Oh, and break out of this before we actually do close the node!
		break;
		}
		}

		// Close the current node...
		// Pass in the tagName specified by the closing tag so we can ensure the tree is balanced...
		self.closeNode(self.getBuffer());


		// ...And return to uninitialised state.
		self.setState(STATE_UNINITIALISED);


		// And clear the buffer.
		self.clearBuffer();


		} else if (self.curChar.match(/\s/)) {


		// Ignore whitespace...


		} else {
		self.bailout();

		// So our expectation was subverted.
		// Treat the closing tag as character data and revert to uninitialised state.

		if (self.getBuffer().length) {

		// Do a bit of juggling to put the '</' prefix back in front of the buffered tag name.
		var tmpBufferData = self.getBuffer();
		self.clearBuffer();
		self.buffer("</" + tmpBufferData);
		}

		// The buffer should be saved to a text node next time the parser state is altered.
		self.setState(STATE_UNINITIALISED);
		}


		break;


		default:
		@@ -689,18 +835,15 @@ // Something happened we weren't expecting.
		}

		// Increment parse pointer.
		charIndex ++;
		}


		// Return the finished parse tree to the calling function...
		return self.tree;
		};


		// Factory function: each call constructs and returns a new Castor instance.
		function castorExport() {
			var parserInstance = new Castor();
			return parserInstance;
		}


		// Expose the constructor on the factory so callers can reach Castor directly.
		castorExport.Castor = Castor;

		// Publish the factory through whichever mechanism is available: module.exports
		// when a `module` with exports exists, otherwise a define() function if one is
		// present, otherwise as the `castor` property of the object passed in as `glob`.
		if (typeof module !== "undefined" && module.exports) {
			module.exports = castorExport;
		} else if (typeof define !== "undefined") {
			define("castor", [], function() { return castorExport; });
		} else {
			glob.castor = castorExport;
		}
		})(this);

package.json

		@@ -5,3 +5,3 @@ {
		"description": "Ultra-simple HTML/XML parser. Generates a DOM-like (but extremely simplified and not at all compliant) tree.",
		"version": "0.0.1",
		"version": "0.0.2",
		"homepage": "https://github.com/cgiffard/Castor",
		@@ -8,0 +8,0 @@ "repository": {

castor - npm Package Compare versions

Improved metrics