New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign inDemoInstall
Socket

castor

Package Overview
Dependencies
Maintainers
1
Versions
28
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

castor - npm Package Compare versions

Comparing version 0.0.1 to 0.0.2

555

index.js

@@ -11,7 +11,7 @@ // Castor HTML/XML Parser

(function(glob) {
"use strict";
// Some grammar-related constants...
// Parser internal states

@@ -23,9 +23,10 @@ var STATE_UNINITIALISED = 0,

STATE_WITHIN_XML_INSTRUCTION = 4,
STATE_WITHIN_CDATA = 5,
STATE_WITHIN_COMMENT = 6,
STATE_EXPECTING_ELEMENT_NAME = 7,
STATE_EXPECTING_ATTRIBUTE_NAME = 8,
STATE_EXPECTING_ATTRIBUTE_VALUE = 9,
STATE_EXPECTING_ELEMENT_CLOSE = 10;
STATE_EXPECTING_CDATA = 5,
STATE_WITHIN_CDATA = 6,
STATE_WITHIN_COMMENT = 7,
STATE_EXPECTING_ELEMENT_NAME = 8,
STATE_EXPECTING_ATTRIBUTE_NAME = 9,
STATE_EXPECTING_ATTRIBUTE_VALUE = 10,
STATE_EXPECTING_ELEMENT_CLOSE = 11;
// Nodes which should be considered implicitly self-closing

@@ -36,3 +37,10 @@ // Taken from http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#void-elements

];
// Nodes which should be considered 'raw text' or rawCDATA elements
// (that is, only </ should trigger a parser state change. Combined because the internal distinction wasn't that useful.)
// Taken from http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#raw-text-elements
// Tag names are listed in lower case; isRawTextNode compares against them case-insensitively.
var rawTextElements = [
"script", "style", "textarea", "title"
];
// Omission map (end tags)

@@ -62,3 +70,3 @@ // Determines which tags will automatically be closed when another begins, and by which sibling tags they will be closed.

}
// Node type/class

@@ -73,3 +81,3 @@ // Based on DOM, but doesn't implement _ANY_ of the DOM functions.

this.textContent = nodeType !== 1 && typeof tagParameter === "string" ? tagParameter : "";
// Custom stuff for heuristics...

@@ -79,14 +87,26 @@ this.weight = 0;

};
// Helper function that implicitly empty nodes to determine whether they should be closed or not.
// Reports whether a node (or a bare tag name) is a 'raw text' element, that is,
// one inside which only the characters '</' should change the parser state.
//   node - either a Node instance (its tagName is used) or a tag name string.
// Returns true when the tag name matches an entry in rawTextElements (compared
// case-insensitively), and false otherwise - including when no tag name is available.
function isRawTextNode(node) {
    var tagName = node instanceof Node ? node.tagName : node;

    // A missing or empty tag name can never match anything in the list.
    if (!tagName) {
        return false;
    }

    return rawTextElements.some(function(rawTextName) {
        return rawTextName.toLowerCase() === tagName.toLowerCase();
    });
}
// Helper function that checks whether the current node is 'implicitly empty' (void)
// and should be automatically closed.
//   node - either a Node instance (its tagName is used) or a tag name string.
// Returns true when the tag name matches an entry in voidElements (compared
// case-insensitively), and false otherwise - including when no tag name is available.
function isVoidNode(node) {
    var tagName = node instanceof Node ? node.tagName : node;

    // Nodes without a tag name (text, comment, document) are never void.
    // Checking this up front avoids calling toLowerCase() on undefined/null/"".
    if (!tagName) {
        return false;
    }

    // Lower-case the candidate once rather than on every iteration.
    var loweredTagName = tagName.toLowerCase();
    for (var nodeTypeIndex = 0; nodeTypeIndex < voidElements.length; nodeTypeIndex ++) {
        if (voidElements[nodeTypeIndex].toLowerCase() === loweredTagName) return true;
    }

    return false;
}
// Determines whether a tag should close and become a sibling of the current node, or become

@@ -96,3 +116,3 @@ // a child of the current node.

var closesNode = false;
if (omissionMap[currentTagName]) {

@@ -103,7 +123,7 @@ for (var testIndex = 0; testIndex < omissionMap[currentTagName].length; testIndex ++) {

}
// The current node doesn't exist in the map or isn't closed by the specified tag.
return false;
}
// Define Castor itself...

@@ -113,6 +133,6 @@ // State parameters hang off the main Castor object.

// methods of Castor. Anything which can act independently of state is written separately.
var Castor = function() {
this.doctype = null;
// Variables for managing parser state...

@@ -131,3 +151,3 @@ this.tree = new Node(99,"document"); // '99' is just a node number I've made up for the document node.

this.prevChar2 = "";
// These two are just for debugging.

@@ -137,3 +157,3 @@ this.lineNo = 0;

};
// Helper function for altering the internal state of the parser.

@@ -144,4 +164,4 @@ Castor.prototype.setState = function(newState) {

};
// Hides buffer implementation (in case native buffers are eventually used.)

@@ -151,4 +171,4 @@ Castor.prototype.buffer = function(c) {

};
// Retrieves the current token buffer

@@ -158,4 +178,4 @@ Castor.prototype.getBuffer = function() {

};
// Destroys anything in the buffer.

@@ -165,4 +185,4 @@ Castor.prototype.clearBuffer = function() {

};
// Creates a new text node at the current depth with the buffer contents,

@@ -178,7 +198,7 @@ // then clears it.

}
this.clearBuffer();
}
// Closes the current node, but also scans up the stack (if a closing tag name was provided) to ensure it's balanced.

@@ -191,3 +211,3 @@ Castor.prototype.closeNode = function(closingTagName) {

var tmpCurrentNode = this.currentNode, tmpTreeDepth = this.treeDepth;
// Repeat while we haven't reached the top of the tree yet, or found the node we're looking to close.

@@ -198,9 +218,10 @@ while (tmpTreeDepth && !nodeFound) {

}
if (tmpCurrentNode.parentNode) {
tmpCurrentNode = tmpCurrentNode.parentNode;
tmpTreeDepth --;
}
tmpTreeDepth --;
}
// If we actually found the node we were looking for...

@@ -212,7 +233,7 @@ if (nodeFound) {

}
return false;
}
// Bailout on error.

@@ -223,17 +244,15 @@ // Generates an error describing where the error was...

}
// The actual parser function.
Castor.prototype.parse = function(sourceInput) {
var self = this;
var charIndex = 0;
sourceInput = typeof sourceInput === "string" ? sourceInput : sourceInput.toString();
// Kick off the parser!
while (charIndex < sourceInput.length) {
// Fetch most recent three characters, push previous characters back on the stack...

@@ -243,3 +262,6 @@ self.prevChar2 = self.prevChar;

self.curChar = sourceInput.substr(charIndex,1);
// Increment parse pointer.
charIndex ++;
if (self.curChar === "\n") {

@@ -251,29 +273,29 @@ self.lineNo ++;

}
switch (self.state) {
case STATE_UNINITIALISED:
// Parser is just collecting text in uninitialised mode.
if (self.curChar === "<") {
// Open tag of some description
// Could be an element, processing instruction, comment,
// CDATA or DOCTYPE
self.setState(STATE_EXPECTING_TAG);
} else {
// Character wasn't an instruction, so we'll save it to our token buffer.
self.buffer(self.curChar);
}
break;
case STATE_EXPECTING_TAG:
if (self.curChar === "!") {
// We've got a 'bang' tag on our hands.

@@ -283,48 +305,71 @@ // Wait for the character after this to determine what we're dealing with...

self.setState(STATE_EXPECTING_BANG_QUALIFIER);
} else if (self.curChar === "?") {
// We've got an XML processing instruction on our hands.
// For now, we just ignore these. Not like I need 'em for this anyway.
self.setState(STATE_WITHIN_XML_INSTRUCTION);
} else if (self.curChar.match(/[a-z0-9]/i)) {
// Looks like an element node. Flush current buffer to tree and get ready to handle element name...
self.flushBuffer();
self.buffer(self.curChar);
self.setState(STATE_EXPECTING_ELEMENT_NAME);
// Check to see whether we're inside a raw text node
// (raw text nodes should not have children, and any child nodes will be treated as text.)
if (isRawTextNode(self.currentNode)) {
// Just buffer the previous character '<' and the current character
// And set the state back to STATE_UNINITIALISED
self.buffer(self.prevChar);
self.buffer(self.curChar);
self.setState(STATE_UNINITIALISED);
} else {
// Looks like an element node. Flush current buffer to tree and get ready to handle element name...
self.flushBuffer();
self.buffer(self.curChar);
self.setState(STATE_EXPECTING_ELEMENT_NAME);
}
} else if (self.curChar === "/") {
// Looks like a closing tag.
self.setState(STATE_EXPECTING_ELEMENT_CLOSE);
// Buffer any previous text if it exists...
self.flushBuffer();
} else if (self.curChar.match(/\s/)) {
// The character after the tag was whitespace. We assume the tag is text (unescaped &lt; character!)
// and buffer up the previous and current character before dropping back to 'uninitialised' state.
self.buffer(self.prevChar);
self.buffer(self.curChar);
self.setState(STATE_UNINITIALISED);
} else {
// We weren't expecting this character here. Bail out!
self.bailout();
// Our expectations were subverted, but we're currently in a raw text node.
if (isRawTextNode(self.currentNode)) {
// So just buffer the previous and current characters and set our state back to uninitialised.
self.buffer(self.prevChar);
self.buffer(self.curChar);
self.setState(STATE_UNINITIALISED);
} else {
// We weren't expecting this character here. Bail out!
self.bailout();
}
}
break;
// 3 node types start with <!. Wait for next character to determine what this is.
case STATE_EXPECTING_BANG_QUALIFIER:
if (self.curChar === "D" || self.curChar === "d") {
// This looks like a DOCTYPE!

@@ -334,20 +379,25 @@ self.flushBuffer();

self.setState(STATE_WITHIN_DOCTYPE);
} else if (self.curChar === "-") {
// This looks like a comment!
self.setState(STATE_WITHIN_COMMENT);
} else if (self.curChar === "[") {
// This looks like a CDATA section!
self.setState(STATE_EXPECTING_CDATA);
} else {
// Uh...
self.bailout();
}
break;
case STATE_WITHIN_DOCTYPE:
if (self.curChar === ">") {
// That's the end of our doctype! Save it and move on...

@@ -357,3 +407,3 @@ self.doctype = self.getBuffer();

self.setState(STATE_UNINITIALISED);
} else {

@@ -363,31 +413,93 @@ // Do we even pay attention to doctypes?

}
break;
case STATE_WITHIN_XML_INSTRUCTION:
bailout("XML instructions not yet supported.");
// For now, just an <strike>elaborate</strike> system designed to ignore XML instructions!
if (self.curChar === ">" && self.prevChar === "?") {
self.setState(STATE_UNINITIALISED);
}
break;
case STATE_EXPECTING_CDATA:
if (self.curChar === "[" &&
self.prevChar === "A" &&
self.prevChar2 === "T") {
// That's the end of the <![CDATA[ beginning tag...
self.setState(STATE_WITHIN_CDATA);
} else if (self.curChar.match(/[CDATA]/i)) {
// Ignore allowed characters in CDATA opening tag
} else if (self.curChar === ">") {
// Allow cancelling CDATA if tag wasn't opened properly...
self.setState(STATE_UNINITIALISED);
} else {
// Hrm. That certainly knocked us for six.
self.bailout();
}
break;
case STATE_WITHIN_CDATA:
bailout("CDATA not yet supported.");
if (self.curChar === ">" &&
self.prevChar === "]" &&
self.prevChar2 === "]") {
// For the time being I'll just save CDATA nodes as text.
self.flushBuffer();
// That's the end of our CDATA section!
self.setState(STATE_UNINITIALISED);
} else if (self.curChar === "]") {
// Don't buffer this character yet.
// However, we're only interested in sequences of /two/ square
// brackets. So if prevChar2 is a bracket, buffer it.
if (self.prevChar2 === "]") {
self.buffer(self.prevChar2);
}
} else {
// If the last character was one of those verboten square brackets,
// we can finally be sure it wasn't part of a closing tag. buffer it.
if (self.prevChar === "]") {
self.buffer(self.prevChar);
}
// And buffer everything else.
self.buffer(self.curChar);
}
break;
case STATE_WITHIN_COMMENT:
if (self.curChar === "-") {
// Do nothing for now. We'll work out whether we're going to buffer this
// as part of the comment later.
} else if (self.curChar === ">") {
if (self.prevChar === "-" && self.prevChar2 === "-") {
// OK then - that's the end of the comment!
// Create a new comment node with the contents of the buffer.
if (self.getBuffer().length) {

@@ -399,43 +511,46 @@ var newComment = new Node(8,self.getBuffer());

}
// Revert to uninitialised state.
self.setState(STATE_UNINITIALISED);
} else {
self.buffer(self.curChar);
}
} else {
// Because we don't buffer "-" characters immediately (in case they're part of the closing token)
// we wait until we know it isn't followed by another "-", and then we buffer it.
if (self.prevChar === "-" && self.curChar !== "-" && !self.curChar.match(/\s/)) {
self.buffer(self.prevChar);
}
self.buffer(self.curChar);
}
break;
case STATE_EXPECTING_ELEMENT_NAME:
if (self.curChar.match(/[a-z0-9]/i)) {
// For the time being, we'll just treat namespaces as tag/attribute names.
// TODO: split namespaces out and extend the node class to accommodate them.
if (self.curChar.match(/[a-z0-9\-\:]/i)) {
// Just a text character. Buffer up!
self.buffer(self.curChar);
} else {
// Get the element name from the buffer...
var elementName = self.getBuffer();
// If this tag implicitly closes the currently open node
if (closesCurrentNode(elementName,self.currentNode.tagName)) {
// Then close the currently open node first.
self.closeNode()
}
// Create element node with the buffer as its tagName

@@ -447,23 +562,23 @@ // Assign the current node as its parent...

self.clearBuffer();
// Set as the current node
self.currentNode = newElement;
// Increase our tree-depth for tracking/debugging
self.treeDepth ++;
if (self.curChar === ">") {
// If we're an attribute-less opening tag, just switch back to uninitialised state.
self.setState(STATE_UNINITIALISED);
} else if (self.curChar.match(/\s/)) {
// If there's whitespace we must be expecting attributes!
self.setState(STATE_EXPECTING_ATTRIBUTE_NAME);
} else if (self.curChar === "/") {
// We're a self-closing tag.
self.setState(STATE_EXPECTING_ELEMENT_CLOSE);
} else {

@@ -474,11 +589,11 @@ // Error condition...

}
break;
case STATE_EXPECTING_ATTRIBUTE_NAME:
if (self.curChar === "/") {
// Oh, it was just some whitespace before the end of a self-closing tag.
self.setState(STATE_EXPECTING_ELEMENT_CLOSE);
// Well, we save it to the node if there was anything in the buffer!

@@ -490,5 +605,7 @@ if (self.getBuffer().length) {

}
} else if (self.curChar.match(/[a-z0-9\-]/i)) {
// For the time being, we'll just treat namespaces as tag/attribute names.
// TODO: split namespaces out and extend the node class to accommodate them.
} else if (self.curChar.match(/[a-z0-9\-\:]/i)) {
// looks like an attribute name to me!

@@ -498,5 +615,5 @@ // But wait. If the previous character was whitespace, and the previous state was also

// current node. Otherwise, just buffer away!
if (self.prevChar.match(/\s/) && self.prevState === STATE_EXPECTING_ATTRIBUTE_NAME) {
// Well, we save it to the node if there was anything in the buffer!

@@ -508,31 +625,31 @@ if (self.getBuffer().length) {

}
}
self.buffer(self.curChar);
} else if (self.curChar.match(/\s/)) {
// Ignore whitespace.
} else if (self.curChar === "=") {
// Looks like we're being primed for a value...
self.setState(STATE_EXPECTING_ATTRIBUTE_VALUE);
// Capture and clear the buffer...
self.currentAttribute = self.getBuffer();
self.clearBuffer();
} else if (self.curChar === "'" || self.curChar === "\"") {
// Some idiot just stuck a string delimiter directly after an attribute name without an equals sign.
// Never mind, we can deal with that.
if (self.getBuffer().length) {
// Well handling it this way only makes sense if we've actually got something in the buffer
// to use as an attribute name. Otherwise there's nothing to attach the attribute value to.
self.currentDelimiter = self.curChar;
self.setState(STATE_EXPECTING_ATTRIBUTE_VALUE);
// Capture and clear the buffer...

@@ -542,7 +659,7 @@ self.currentAttribute = self.getBuffer();

}
} else if (self.curChar === ">") {
// So the element was closed.
// If there's anything in the buffer, consider it a boolean attribute.
if (self.getBuffer().length) {

@@ -553,22 +670,22 @@ // Save the buffer as a boolean attribute, and clear it.

}
if (isVoidNode(self.currentNode) && self.currentNode.parentNode) {
self.currentNode = self.currentNode.parentNode;
}
// Return to uninitialised state.
self.setState(STATE_UNINITIALISED);
}
break;
case STATE_EXPECTING_ATTRIBUTE_VALUE:
// Are we in a delimited string?
if (self.currentDelimiter.length) {
if (self.curChar === self.currentDelimiter) {
self.currentDelimiter = "";
if (self.getBuffer().length) {

@@ -579,13 +696,13 @@ // In that case, save the buffer to the node attributes and start looking for more!

}
// Oh, we've reached a matching delimiter? Well that's the end of that then.
self.setState(STATE_EXPECTING_ATTRIBUTE_NAME);
} else {
self.buffer(self.curChar);
}
} else {
if (self.curChar === "/" || self.curChar === ">") {
// We hit the end of the tag. Branch according to whether we're still expecting another character or not.

@@ -595,17 +712,17 @@ if (self.curChar === "/") {

} else {
// If this node is in our list of tag types to be implicitly closed, then close it.
// Then switch back to STATE_UNINITIALISED.
if (isVoidNode(self.currentNode)) {
self.closeNode();
}
self.setState(STATE_UNINITIALISED);
}
// Is there actually something in the buffer?
if (self.getBuffer().length) {
self.currentNode.attributes[self.currentAttribute] = self.getBuffer();
} else {

@@ -615,7 +732,7 @@ // Couldn't extract a value? Treat this as a boolean attribute.

}
self.clearBuffer();
} else if (self.curChar === "'" || self.curChar === "\"") {
// Hit a delimiter. If there's something in the buffer, consider the final delimiter

@@ -628,11 +745,11 @@ // for a string with a missing first delimiter.

self.setState(STATE_EXPECTING_ATTRIBUTE_NAME);
} else {
// Set the current string delimiter.
// Nothing in the buffer, so no need to clear it!
self.currentDelimiter = self.curChar;
}
} else if (self.curChar.match(/\s/)) {

@@ -642,3 +759,3 @@ // Whitespace. If there's nothing in the buffer, we haven't gotten to the attribute value yet,

// back to STATE_EXPECTING_ATTRIBUTE_NAME after saving the attribute value to the node.
if (self.getBuffer().length) {

@@ -651,34 +768,63 @@ self.currentNode.attributes[self.currentAttribute] = self.getBuffer();

}
break;
case STATE_EXPECTING_ELEMENT_CLOSE:
if (self.curChar.match(/[a-z0-9]/i)) {
// Dealing with a closing tag name? Just buffer it.
self.buffer(self.curChar);
} else if (self.curChar === ">") {
if (isRawTextNode(self.currentNode)) {
if (self.currentNode.tagName.toLowerCase() !== self.getBuffer().toLowerCase()) {
// If we're in a raw text node now, and a closing tag arrives which isn't for us,
// we totally ignore it, rather than trying hard to balance out the hierarchy.
// Instead, treat as text and begin a new buffer!
var tmpBuffer = self.getBuffer();
self.clearBuffer();
self.buffer("</" + tmpBuffer + ">");
self.setState(STATE_UNINITIALISED);
// Oh, and break out of this before we actually do close the node!
break;
}
}
// Close the current node...
// Pass in the tagName specified by the closing tag so we can ensure the tree is balanced...
self.closeNode(self.getBuffer());
// ...And return to uninitialised state.
self.setState(STATE_UNINITIALISED);
// And clear the buffer.
self.clearBuffer();
} else if (self.curChar.match(/\s/)) {
// Ignore whitespace...
} else {
self.bailout();
// So our expectation was subverted.
// Treat the closing tag as character data and revert to uninitialised state.
if (self.getBuffer().length) {
// Do a bit of juggling to put the '</' prefix back in front of the buffered tag name.
var tmpBufferData = self.getBuffer();
self.clearBuffer();
self.buffer("</" + tmpBufferData);
}
// The buffer should be saved to a text node next time the parser state is altered.
self.setState(STATE_UNINITIALISED);
}
break;
default:

@@ -689,18 +835,15 @@ // Something happened we weren't expecting.

}
// Increment parse pointer.
charIndex ++;
}
// Return the finished parse tree to the calling function...
return self.tree;
};
// Factory exposed as the module's export: every call builds and returns a
// brand-new Castor parser instance, so callers never share parser state.
function castorExport() {
    var parser = new Castor();
    return parser;
}
castorExport.Castor = Castor;
(typeof module != "undefined" && module.exports) ? (module.exports = castorExport) : (typeof define != "undefined" ? (define("castor", [], function() { return castorExport; })) : (glob.castor = castorExport));
})(this);

@@ -5,3 +5,3 @@ {

"description": "Ultra-simple HTML/XML parser. Generates a DOM-like (but extremely simplified and not at all compliant) tree.",
"version": "0.0.1",
"version": "0.0.2",
"homepage": "https://github.com/cgiffard/Castor",

@@ -8,0 +8,0 @@ "repository": {

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc