node-readability - npm Package Compare versions

Comparing version 0.4.0 to 0.9.0

examples/index.html

examples/result.html

package.json

		{
		"name": "node-readability",
		"version": "0.4.0",
		"version": "0.9.0",
		"author": "Zihua Li",
		@@ -5,0 +5,0 @@ "description": "Turning any web page into a clean view.",

README.md

		@@ -9,4 +9,6 @@ # Readability
		1. Optimized for more websites.
		2. Support encodings such as GBK and GB2312.
		3. Converts relative urls to absolute for images and links automatically(Thank [Guillermo Baigorria](https://github.com/gbaygon) & [Tom Sutton](https://github.com/tomsutton1984)).
		2. Supporting HTML5 tags(`article`, `section`) and Microdata API.
		3. Focusing on both accuracy and performance. 4x times faster than arc90's version.
		3. Supporting encodings such as GBK and GB2312.
		4. Converting relative urls to absolute for images and links automatically(Thank [Guillermo Baigorria](https://github.com/gbaygon) & [Tom Sutton](https://github.com/tomsutton1984)).

		@@ -36,13 +38,13 @@ ## Example
		read('http://howtonode.org/really-simple-file-uploads', function(err, article, meta) {
		// The main body of the page.
		// Main Article
		console.log(article.content);
		// The title of the page.
		// Title
		console.log(article.title);

		// The raw HTML code of the page
		// HTML Source Code
		console.log(article.html);
		// The document object of the page
		// DOM
		console.log(article.document);

		// The response object from request lib

		// Response Object from Request Lib
		console.log(meta);
		@@ -49,0 +51,0 @@ });

190

src/helpers.js

		@@ -5,3 +5,3 @@ var url = require("url");
		var regexps = {
		unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
		unlikelyCandidatesRe: /combx|modal|lightbox|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor|social|teaserlist|time|tweet|twitter/i,
		okMaybeItsACandidateRe: /and|article|body|column|main/i,
		@@ -20,4 +20,4 @@ positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i,
		var dbg;
		exports.debug = function (debug) {
		dbg = (debug) ? console.log : function () {};
		exports.debug = function(debug) {
		dbg = (debug) ? console.log : function() {};
		};
		@@ -37,3 +37,3 @@
		**/
		var prepDocument = module.exports.prepDocument = function (document) {
		var prepDocument = module.exports.prepDocument = function(document) {
		var frames = document.getElementsByTagName('frame');
		@@ -44,3 +44,3 @@ if (frames.length > 0) {

		Array.prototype.slice.call(frames, 0).forEach(function (frame) {
		Array.prototype.slice.call(frames, 0).forEach(function(frame) {
		var frameSize = frame.offsetWidth + frame.offsetHeight;
		@@ -72,16 +72,6 @@ var canAccessFrame = false;

		// remove all scripts that are not readability
		var scripts = document.getElementsByTagName('script');
		for (var i = 0; i < scripts.length; ++i) {
		scripts[i].parentNode.removeChild(scripts[i]);
		}
		// remove all stylesheets
		for (var k = 0; k < document.styleSheets.length; k++) {
		document.styleSheets[k].disabled = true;
		}

		// turn all double br's into p's
		// note, this is pretty costly as far as processing goes. Maybe optimize later.
		document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '</p><p>').replace(regexps.replaceFontsRe, '<$1span>')
		}
		// document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '</p><p>').replace(regexps.replaceFontsRe, '<$1span>');
		};

		@@ -94,3 +84,3 @@ /***
		**/
		var grabArticle = module.exports.grabArticle = function (document, preserveUnlikelyCandidates) {
		var grabArticle = module.exports.grabArticle = function(document, preserveUnlikelyCandidates) {
		/**
		@@ -118,3 +108,3 @@ * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
		// Turn all divs that don't have children block level elements into p's
		if (!continueFlag && node.tagName === "DIV") {
		if (!continueFlag && node.tagName === 'DIV') {
		if (node.innerHTML.search(regexps.divToPElementsRe) === -1) {
		@@ -127,3 +117,3 @@ dbg("Altering div to p");
		// EXPERIMENTAL
		node.childNodes._toArray().forEach(function (childNode) {
		node.childNodes._toArray().forEach(function(childNode) {
		if (childNode.nodeType == 3 /*TEXT_NODE*/ ) {
		@@ -177,3 +167,2 @@ // use span instead of p. Need more tests.
		// Add points for any commas within this paragraph */
		// support Chinese commas.
		contentScore += innerText.replace('，', ',').split(',').length;
		@@ -195,3 +184,3 @@
		var topCandidate = null;
		candidates.forEach(function (candidate) {
		candidates.forEach(function(candidate) {
		/**
		@@ -212,8 +201,10 @@ * Scale the final candidates score based on link density. Good content should have a
		**/
		if (topCandidate === null || topCandidate.tagName === "BODY") {
		if (topCandidate === null || topCandidate.tagName === 'BODY') {
		// With no top candidate, bail out if no body tag exists as last resort.
		if (!document.body) return new Error("No body tag was found.");
		topCandidate = document.createElement("DIV");
		if (!document.body) {
		return new Error('No body tag was found.');
		}
		topCandidate = document.createElement('DIV');
		topCandidate.innerHTML = document.body.innerHTML;
		document.body.innerHTML = "";
		document.body.innerHTML = '';
		document.body.appendChild(topCandidate);
		@@ -228,4 +219,4 @@ initializeNode(topCandidate);
		**/
		var articleContent = document.createElement("DIV");
		articleContent.id = "readability-content";
		var articleContent = document.createElement('DIV');
		articleContent.id = 'readability-content';
		var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
		@@ -237,4 +228,4 @@ var siblingNodes = topCandidate.parentNode.childNodes;

		dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
		dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
		dbg('Looking at sibling node: ' + siblingNode + ' (' + siblingNode.className + ':' + siblingNode.id + ')' + ((typeof siblingNode.readability != 'undefined') ? (' with score ' + siblingNode.readability.contentScore) : ''));
		dbg('Sibling has score ' + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));

		@@ -249,3 +240,3 @@ if (siblingNode === topCandidate) {

		if (siblingNode.nodeName == "P") {
		if (siblingNode.nodeName == 'P') {
		var linkDensity = getLinkDensity(siblingNode);
		@@ -257,3 +248,3 @@ var nodeContent = getInnerText(siblingNode);
		append = true;
		} else if (nodeLength < 80 && linkDensity == 0 && nodeContent.search(/\.( |$)/) !== -1) {
		} else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) {
		append = true;
		@@ -264,3 +255,3 @@ }
		if (append) {
		dbg("Appending node: " + siblingNode)
		dbg("Appending node: " + siblingNode);

		@@ -288,3 +279,3 @@ /* Append sibling and subtract from our list because it removes the node when you append to another node */
		**/
		function cleanStyles (e) {
		function cleanStyles(e) {
		if (!e) return;
		@@ -316,3 +307,3 @@
		**/
		function killBreaks (e) {
		function killBreaks(e) {
		e.innerHTML = e.innerHTML.replace(regexps.killBreaksRe, '<br />');
		@@ -329,3 +320,3 @@ }
		**/
		getInnerText = exports.getInnerText = function (e, normalizeSpaces) {
		getInnerText = exports.getInnerText = function(e, normalizeSpaces) {
		var textContent = "";
		@@ -348,3 +339,3 @@
		**/
		function getCharCount (e, s) {
		function getCharCount(e, s) {
		s = s || ",";
		@@ -357,7 +348,7 @@ return getInnerText(e).split(s).length;
		* This is the amount of text that is inside a link divided by the total text in the node.
		*
		*
		* @param Element
		* @return number (float)
		**/
		function getLinkDensity (e) {
		function getLinkDensity(e) {
		var links = e.getElementsByTagName("a");
		@@ -370,3 +361,3 @@
		// hack for <h2><a href="#menu"></a></h2> / <h2><a></a></h2>
		if(!href || (href.length > 0 && href[0] === '#')) continue;
		if (!href || (href.length > 0 && href[0] === '#')) continue;
		linkLength += getInnerText(links[i]).length;
		@@ -378,3 +369,3 @@ }
		/**
		* Get an elements class/id weight. Uses regular expressions to tell if this
		* Get an elements class/id weight. Uses regular expressions to tell if this
		* element looks good or bad.
		@@ -385,7 +376,7 @@ *
		**/
		function getClassWeight (e) {
		function getClassWeight(e) {
		var weight = 0;

		/* Look for a special classname */
		if (e.className != "") {
		if (e.className !== '') {
		if (e.className.search(regexps.negativeRe) !== -1) weight -= 25;
		@@ -397,3 +388,3 @@
		/* Look for a special ID */
		if (typeof (e.id) == 'string' && e.id != "") {
		if (typeof(e.id) == 'string' && e.id != "") {
		if (e.id.search(regexps.negativeRe) !== -1) weight -= 25;
		@@ -415,3 +406,3 @@
		**/
		function clean (e, tag) {
		function clean(e, tag) {
		var targetList = e.getElementsByTagName(tag);
		@@ -425,4 +416,4 @@ var isEmbed = (tag == 'object' || tag == 'embed');
		var validRule = false;
		for(var i = 0; i < cleanRules.length; i++) {
		if(cleanRules[i](targetList[y], tag) === true) {
		for (var i = 0; i < cleanRules.length; i++) {
		if (cleanRules[i](targetList[y], tag) === true) {
		validRule = true;
		@@ -433,3 +424,3 @@ break;

		if(validRule) {
		if (validRule) {
		continue;
		@@ -441,3 +432,3 @@ }
		if (isEmbed) {
		if(targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
		if (targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
		continue;
		@@ -524,7 +515,8 @@ }
		**/
		function fixLinks (e) {
		function fixLinks(e) {
		if (!e.ownerDocument.originalURL) {
		return;
		}
		function fixLink(link){

		function fixLink(link) {
		var fixed = url.resolve(e.ownerDocument.originalURL, link);
		@@ -538,3 +530,3 @@ return fixed;
		var src = imgs[i].getAttribute('src');
		if(src) {
		if (src) {
		imgs[i].setAttribute('src', fixLink(src));
		@@ -547,3 +539,3 @@ }
		var href = as[i].getAttribute('href');
		if(href) {
		if (href) {
		as[i].setAttribute('href', fixLink(href));
		@@ -560,3 +552,3 @@ }
		**/
		function cleanHeaders (e) {
		function cleanHeaders(e) {
		for (var headerIndex = 1; headerIndex < 7; headerIndex++) {
		@@ -579,3 +571,3 @@ var headers = e.getElementsByTagName('h' + headerIndex);

		function cleanSingleHeader (e) {
		function cleanSingleHeader(e) {
		for (var headerIndex = 1; headerIndex < 7; headerIndex++) {
		@@ -592,3 +584,3 @@ var headers = e.getElementsByTagName('h' + headerIndex);

		function prepArticle (articleContent) {
		function prepArticle(articleContent) {
		cleanStyles(articleContent);
		@@ -598,5 +590,7 @@ killBreaks(articleContent);
		/* Clean out junk from the article content */
		clean(articleContent, "form");
		clean(articleContent, "object");
		clean(articleContent, "h1");
		clean(articleContent, 'form');
		clean(articleContent, 'object');
		if (articleContent.getElementsByTagName('h1').length === 1) {
		clean(articleContent, 'h1');
		}
		/**
		@@ -606,3 +600,3 @@ * If there is only one h2, they are probably using it
		***/
		if (articleContent.getElementsByTagName('h2').length == 1) clean(articleContent, "h2");
		if (articleContent.getElementsByTagName('h2').length === 1) clean(articleContent, "h2");

		@@ -648,41 +642,55 @@ clean(articleContent, "iframe");
		**/
		function initializeNode (node) {
		node.readability = {
		"contentScore": 0
		};
		function initializeNode(node) {
		node.readability = { contentScore: 0 };

		switch (node.tagName) {
		case 'DIV':
		node.readability.contentScore += 5;
		break;
		case 'ARTICLE':
		node.readability.contentScore += 10;
		break;

		case 'PRE':
		case 'TD':
		case 'BLOCKQUOTE':
		node.readability.contentScore += 3;
		break;
		case 'SECTION':
		node.readability.contentScore += 8;
		break;

		case 'ADDRESS':
		case 'OL':
		case 'UL':
		case 'DL':
		case 'DD':
		case 'DT':
		case 'LI':
		case 'FORM':
		node.readability.contentScore -= 3;
		break;
		case 'DIV':
		node.readability.contentScore += 5;
		break;

		case 'H1':
		case 'H2':
		case 'H3':
		case 'H4':
		case 'H5':
		case 'H6':
		case 'TH':
		node.readability.contentScore -= 5;
		break;
		case 'PRE':
		case 'TD':
		case 'BLOCKQUOTE':
		node.readability.contentScore += 3;
		break;

		case 'ADDRESS':
		case 'OL':
		case 'UL':
		case 'DL':
		case 'DD':
		case 'DT':
		case 'LI':
		case 'FORM':
		node.readability.contentScore -= 3;
		break;

		case 'H1':
		case 'H2':
		case 'H3':
		case 'H4':
		case 'H5':
		case 'H6':
		case 'TH':
		node.readability.contentScore -= 5;
		break;
		}

		if (node.attributes.itemscope) {
		node.readability.contentScore += 5;
		if (node.attributes.itemtype &&
		/blog|post|article/i.test(node.getAttribute('itemtype'))) {
		node.readability.contentScore += 30;
		}
		}

		node.readability.contentScore += getClassWeight(node);
		}

src/readability.js

		@@ -7,3 +7,3 @@ var jsdom = require('jsdom');

		exports.debug = function (debug) {
		exports.debug = function(debug) {
		helpers.debug(debug);
		@@ -43,3 +43,3 @@ };

		Readability.prototype.getContent = function (notDeprecated) {
		Readability.prototype.getContent = function(notDeprecated) {
		if (!notDeprecated) {
		@@ -64,3 +64,3 @@ console.warn('The method `getContent()` is deprecated, using `content` property instead.');

		Readability.prototype.getTitle = function (notDeprecated) {
		Readability.prototype.getTitle = function(notDeprecated) {
		if (!notDeprecated) {
		@@ -78,3 +78,3 @@ console.warn('The method `getTitle()` is deprecated, using `title` property instead.');
		var self = this;
		commonSeparatingCharacters.forEach(function (char) {
		commonSeparatingCharacters.forEach(function(char) {
		var tmpArray = title.split(char);
		@@ -94,3 +94,3 @@ if (tmpArray.length > 1) {

		Readability.prototype.getDocument = function (notDeprecated) {
		Readability.prototype.getDocument = function(notDeprecated) {
		if (!notDeprecated) {
		@@ -102,3 +102,3 @@ console.warn('The method `getDocument()` is deprecated, using `document` property instead.');

		Readability.prototype.getHTML = function (notDeprecated) {
		Readability.prototype.getHTML = function(notDeprecated) {
		if (!notDeprecated) {
		@@ -110,3 +110,3 @@ console.warn('The method `getHTML()` is deprecated, using `html` property instead.');

		function _findHTMLCharset(htmlbuffer){
		function _findHTMLCharset(htmlbuffer) {

		@@ -116,9 +116,9 @@ var body = htmlbuffer.toString("ascii"),

		if(meta = body.match(/<meta\s+http-equiv=["']content-type["'][^>]*?>/i)){
		if (meta = body.match(/<meta\s+http-equiv=["']content-type["'][^>]*?>/i)) {
		input = meta[0];
		}

		if(input){
		if (input) {
		charset = input.match(/charset\s?=\s?([a-zA-Z\-0-9]*);?/);
		if(charset){
		if (charset) {
		charset = (charset[1] || "").trim().toLowerCase();
		@@ -128,3 +128,3 @@ }

		if(!charset && (meta = body.match(/<meta\s+charset=["'](.*?)["']/i))){
		if (!charset && (meta = body.match(/<meta\s+charset=["'](.*?)["']/i))) {
		charset = (meta[1] || "").trim().toLowerCase();
		@@ -136,4 +136,4 @@ }

		function _parseContentType(str){
		if(!str){
		function _parseContentType(str) {
		if (!str) {
		return {};
		@@ -145,6 +145,6 @@ }

		for(var i=0, len = parts.length; i<len; i++){
		for (var i = 0, len = parts.length; i < len; i++) {
		chparts = parts[i].split("=");
		if(chparts.length>1){
		if(chparts[0].trim().toLowerCase() == "charset"){
		if (chparts.length > 1) {
		if (chparts[0].trim().toLowerCase() == "charset") {
		charset = chparts[1];
		@@ -156,5 +156,5 @@ }
		return {
		mimeType: (mimeType || "").trim().toLowerCase(),
		mimeType: (mimeType || "").trim().toLowerCase(),
		charset: (charset || "UTF-8").trim().toLowerCase() // defaults to UTF-8
		}
		};
		}
		@@ -174,3 +174,3 @@
		request(html, options, function(err, res, buffer) {
		if(err) {
		if (err) {
		return callback(err);
		@@ -181,9 +181,9 @@ }

		if(content_type.mimeType == "text/html"){
		if (content_type.mimeType == "text/html") {
		content_type.charset = _findHTMLCharset(buffer) || content_type.charset;
		}

		content_type.charset = (overrideEncoding || content_type.charset || "utf-8").trim().toLowerCase();
		content_type.charset = (overrideEncoding || content_type.charset || "utf-8").trim().toLowerCase();

		if(!content_type.charset.match(/^utf-?8$/i)){
		if (!content_type.charset.match(/^utf-?8$/i)) {
		buffer = encodinglib.convert(buffer, "UTF-8", content_type.charset);
		@@ -207,4 +207,4 @@ }
		html: body,
		done: function (errors, window) {
		if(meta) {
		done: function(errors, window) {
		if (meta) {
		window.document.originalURL = meta.request.uri.href;
		@@ -211,0 +211,0 @@ } else {

node-readability - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics