@@ -1,9 +0,13 @@
		var readability = require('../src/readability');
		var readability = require('../src/readability')
		, fs = require('fs')

		// uncoment the following line to print the debug info to console.
		// readability.debug(true);

		readability.debug(true);

		readability.read('http://howtonode.org/really-simple-file-uploads', function(err, read) {
		console.log(read.getContent());
		console.log(read.getTitle());
		});
		readability.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html',
		function(err, read) {
		var dom = read.getDocument();
		var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.getTitle()+'</h1>'+read.getContent()+'</body></html>';
		console.log(html);
		});

package.json

		{
		"name": "node-readability",
		"version": "0.0.2",
		"version": "0.0.3",
		"author": "Zihua Li",
		@@ -5,0 +5,0 @@ "description": "Turn any web page into a clean view",

README.md

		# node-readability

		Turn any web page into a clean view. It's based on arc90's readability project.
		Turn any web page into a clean view. This module is based on arc90's readability project.

		@@ -10,3 +10,2 @@ ## Install
		## Requirements
		* [node.js](http://nodejs.org/)
		* [jsdom](https://github.com/tmpvar/jsdom)
		@@ -23,3 +22,3 @@ * [fetch](https://github.com/andris9/fetch)
		* options is an optional options object
		* callback is the callback to run - `callback(error, read)`
		* callback is the callback to run - `callback(error, article)`

		@@ -30,3 +29,2 @@ Example

		// source file is iso-8859-15 but it is converted to utf-8 automatically
		readability.read('http://howtonode.org/really-simple-file-uploads', function(err, article) {
		@@ -36,6 +34,8 @@ console.log(article.getArticleContent());

		NB If the file has been marked with charset other than utf-8, it is converted automatically. Charsets such as GBK, GB2312 is also supported via icon.
		NB If the file has been marked with charset other than utf-8, it is converted automatically. Charsets such as GBK, GB2312 is also supported via [iconv](https://github.com/bnoordhuis/node-iconv).

		## Options

		node-readability support all the options that [fetch](https://github.com/andris9/fetch) support.

		Possible option values
		@@ -61,4 +61,2 @@

		Readability support lazy evaluation by passing `readResult` to the callback function.

		### getContent()
		@@ -82,4 +80,4 @@

		* Support more readability features
		* Performance optimization
		* Support more readability features
		* Performance optimization

		@@ -86,0 +84,0 @@ ## License

591

src/readability.js

		var jsdom = require('jsdom'),
		fetchUrl = require("fetch").fetchUrl
		fetchUrl = require("fetch").fetchUrl,
		helpers = require('./helpers')

		var dbg;
		exports.debug = function (debug) {
		dbg = (debug) ? console.log : function () {}
		helpers.debug(debug);
		};
		@@ -11,21 +11,2 @@

		function trim(string) {
		return string.replace(/(^\s)(\s$)/g, '');
		}

		// All of the regular expressions in use within readability.
		var regexps = {
		unlikelyCandidatesRe: /combx\|comment\|disqus\|foot\|header\|menu\|meta\|nav\|rss\|shoutbox\|sidebar\|sponsor/i,
		okMaybeItsACandidateRe: /and\|article\|body\|column\|main/i,
		positiveRe: /article\|body\|content\|entry\|hentry\|page\|pagination\|post\|text/i,
		negativeRe: /combx\|comment\|contact\|foot\|footer\|footnote\|link\|media\|meta\|promo\|related\|scroll\|shoutbox\|sponsor\|utility\|tags\|widget/i,
		divToPElementsRe: /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i,
		replaceBrsRe: /(<br[^>]>[ \n\r\t]){2,}/gi,
		replaceFontsRe: /<(\/?)font[^>]*>/gi,
		trimRe: /^\s+\|\s+$/g,
		normalizeRe: /\s{2,}/g,
		killBreaksRe: /(<br\s\/?>(\s\| ?)){1,}/g,
		videoRe: /http:\/\/(www\.)?(youtube\|vimeo\|youku\|tudou\|56\|yinyuetai)\.com/i
		};

		function Readablity(document, options) {
		@@ -40,3 +21,3 @@ this._document = document;

		prepDocument(this._document);
		helpers.prepDocument(this._document);
		this.cache = {
		@@ -52,7 +33,7 @@ 'body': this._document.body.innerHTML

		articleContent = grabArticle(this._document);
		if (getInnerText(articleContent, false) === "") {
		articleContent = helpers.grabArticle(this._document);
		if (helpers.getInnerText(articleContent, false) === "") {
		this._document.body.innerHTML = this.cache['body'];
		articleContent = grabArticle(this._document, true);
		if (getInnerText(articleContent, false) === "") {
		articleContent = helpers.grabArticle(this._document, true);
		if (helpers.getInnerText(articleContent, false) === "") {
		return this.cache['article-content'] = false;
		@@ -72,8 +53,10 @@ }
		var betterTitle;
		var commonSeparatingCharacters = ['\|', '_', '-', '«'];
		var commonSeparatingCharacters = [' \| ', ' _ ', ' - ', '«', '»', '—'];

		var self = this;
		commonSeparatingCharacters.forEach(function (char) {
		var tmpArray = title.split(char);
		if (tmpArray.length > 1) {
		if (betterTitle) return this.cache['article-title'] = title;
		betterTitle = trim(tmpArray[0]);
		if (betterTitle) return self.cache['article-title'] = title;
		betterTitle = tmpArray[0].trim();
		}
		@@ -97,550 +80,2 @@ });


		/**
		* Prepare the HTML document for readability to scrape it.
		* This includes things like stripping javascript, CSS, and handling terrible markup.
		*
		* @return void
		**/
		function prepDocument(document) {
		var frames = document.getElementsByTagName('frame');
		if (frames.length > 0) {
		var bestFrame = null;
		var bestFrameSize = 0;

		frames.forEach(function (frame) {
		var frameSize = frame.offsetWidth + frame.offsetHeight;
		var canAccessFrame = false;
		try {
		frame.contentWindow.document.body;
		canAccessFrame = true;
		} catch (e) {}

		if (canAccessFrame && frameSize > bestFrameSize) {
		bestFrame = frame;
		bestFrameSize = frameSize;
		}
		});

		if (bestFrame) {
		var newBody = document.createElement('body');
		newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
		newBody.style.overflow = 'scroll';
		document.body = newBody;

		var frameset = document.getElementsByTagName('frameset')[0];
		if (frameset) {
		frameset.parentNode.removeChild(frameset);
		}
		}
		}

		// remove all scripts that are not readability
		var scripts = document.getElementsByTagName('script');
		for (var i = 0; i < scripts.length; ++i) {
		scripts[i].parentNode.removeChild(scripts[i]);
		}
		// remove all stylesheets
		for (var k = 0; k < document.styleSheets.length; k++) {
		document.styleSheets[k].disabled = true;
		}

		// turn all double br's into p's
		// note, this is pretty costly as far as processing goes. Maybe optimize later.
		document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '</p><p>').replace(regexps.replaceFontsRe, '<$1span>')
		}

		/***
		* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
		* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
		*
		* @return Element
		**/
		function grabArticle(document, preserveUnlikelyCandidates) {
		/**
		* First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
		* into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
		*
		* Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
		* TODO: Shouldn't this be a reverse traversal?
		**/
		var nodes = document.getElementsByTagName('*');
		for (var i = 0; i < nodes.length; ++i) {
		var node = nodes[i];
		// Remove unlikely candidates */
		var continueFlag = false;
		if (!preserveUnlikelyCandidates) {
		var unlikelyMatchString = node.className + node.id;
		if (unlikelyMatchString.search(regexps.unlikelyCandidatesRe) !== -1 && unlikelyMatchString.search(regexps.okMaybeItsACandidateRe) == -1 && node.tagName !== "BODY") {
		dbg("Removing unlikely candidate - " + unlikelyMatchString);
		node.parentNode.removeChild(node);
		continueFlag = true;
		}
		}

		// Turn all divs that don't have children block level elements into p's
		if (!continueFlag && node.tagName === "DIV") {
		if (node.innerHTML.search(regexps.divToPElementsRe) === -1) {
		dbg("Altering div to p");
		var newNode = document.createElement('p');
		newNode.innerHTML = node.innerHTML;
		node.parentNode.replaceChild(newNode, node);
		} else {
		// EXPERIMENTAL
		node.childNodes.forEach(function (childNode) {
		if (childNode.nodeType == 3 /TEXT_NODE/ ) {
		dbg("replacing text node with a p tag with the same content.");
		var p = document.createElement('p');
		p.innerHTML = childNode.nodeValue;
		p.style.display = 'inline';
		p.className = 'readability-styled';
		childNode.parentNode.replaceChild(p, childNode);
		}
		});
		}
		}
		}

		/**
		* Loop through all paragraphs, and assign a score to them based on how content-y they look.
		* Then add their score to their parent node.
		*
		* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
		**/
		var allParagraphs = document.getElementsByTagName("p");
		var candidates = [];

		for (var i = 0; i < allParagraphs.length; ++i) {
		var paragraph = allParagraphs[i];
		var parentNode = paragraph.parentNode;
		var grandParentNode = parentNode.parentNode;
		var innerText = getInnerText(paragraph);

		// If this paragraph is less than 25 characters, don't even count it.
		if (innerText.length >= 25) {

		// Initialize readability data for the parent.
		if (typeof parentNode.readability == 'undefined') {
		initializeNode(parentNode);
		candidates.push(parentNode);
		}

		// Initialize readability data for the grandparent.
		if (typeof grandParentNode.readability == 'undefined') {
		initializeNode(grandParentNode);
		candidates.push(grandParentNode);
		}

		var contentScore = 0;

		// Add a point for the paragraph itself as a base. */
		++contentScore;

		// Add points for any commas within this paragraph */
		contentScore += innerText.split(',').length;

		// For every 100 characters in this paragraph, add another point. Up to 3 points. */
		contentScore += Math.min(Math.floor(innerText.length / 100), 3);

		// Add the score to the parent. The grandparent gets half. */
		parentNode.readability.contentScore += contentScore;
		grandParentNode.readability.contentScore += contentScore / 2;
		}

		}

		/**
		* After we've calculated scores, loop through all of the possible candidate nodes we found
		* and find the one with the highest score.
		**/
		var topCandidate = null;
		candidates.forEach(function (candidate) {
		/**
		* Scale the final candidates score based on link density. Good content should have a
		* relatively small link density (5% or less) and be mostly unaffected by this operation.
		**/
		candidate.readability.contentScore = candidate.readability.contentScore * (1 - getLinkDensity(candidate));

		dbg('Candidate: ' + candidate + " (" + candidate.className + ":" + candidate.id + ") with score " + candidate.readability.contentScore);

		if (!topCandidate \|\| candidate.readability.contentScore > topCandidate.readability.contentScore) topCandidate = candidate;
		});

		/**
		* If we still have no top candidate, just use the body as a last resort.
		* We also have to copy the body node so it is something we can modify.
		**/
		if (topCandidate === null \|\| topCandidate.tagName === "BODY") {
		topCandidate = document.createElement("DIV");
		topCandidate.innerHTML = document.body.innerHTML;
		document.body.innerHTML = "";
		document.body.appendChild(topCandidate);
		initializeNode(topCandidate);
		}


		/**
		* Now that we have the top candidate, look through its siblings for content that might also be related.
		* Things like preambles, content split by ads that we removed, etc.
		**/
		var articleContent = document.createElement("DIV");
		articleContent.id = "readability-content";
		var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
		var siblingNodes = topCandidate.parentNode.childNodes;
		for (var i = 0, il = siblingNodes.length; i < il; i++) {
		var siblingNode = siblingNodes[i];
		var append = false;

		dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
		dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));

		if (siblingNode === topCandidate) {
		append = true;
		}

		if (typeof siblingNode.readability != 'undefined' && siblingNode.readability.contentScore >= siblingScoreThreshold) {
		append = true;
		}

		if (siblingNode.nodeName == "P") {
		var linkDensity = getLinkDensity(siblingNode);
		var nodeContent = getInnerText(siblingNode);
		var nodeLength = nodeContent.length;

		if (nodeLength > 80 && linkDensity < 0.25) {
		append = true;
		} else if (nodeLength < 80 && linkDensity == 0 && nodeContent.search(/\.( \|$)/) !== -1) {
		append = true;
		}
		}

		if (append) {
		dbg("Appending node: " + siblingNode)

		/* Append sibling and subtract from our list because it removes the node when you append to another node */
		articleContent.appendChild(siblingNode);
		i--;
		il--;
		}
		}

		/**
		* So we have all of the content that we need. Now we clean it up for presentation.
		**/
		prepArticle(articleContent);

		return articleContent;
		};

		/**
		* Remove the style attribute on every e and under.
		*
		* @param Element
		* @return void
		**/
		function cleanStyles(e) {
		if (!e) return;


		// Remove any root styles, if we're able.
		if (typeof e.removeAttribute == 'function' && e.className != 'readability-styled') e.removeAttribute('style');

		// Go until there are no more child nodes
		var cur = e.firstChild;
		while (cur) {
		if (cur.nodeType == 1) {
		// Remove style attribute(s) :
		if (cur.className != "readability-styled") {
		cur.removeAttribute("style");
		}
		cleanStyles(cur);
		}
		cur = cur.nextSibling;
		}
		}

		/**
		* Remove extraneous break tags from a node.
		*
		* @param Element
		* @return void
		**/
		function killBreaks(e) {
		e.innerHTML = e.innerHTML.replace(regexps.killBreaksRe, '<br />');
		}


		/**
		* Get the inner text of a node - cross browser compatibly.
		* This also strips out any excess whitespace to be found.
		*
		* @param Element
		* @return string
		**/
		function getInnerText(e, normalizeSpaces) {
		var textContent = "";

		normalizeSpaces = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces;

		textContent = e.textContent.replace(regexps.trimRe, "");

		if (normalizeSpaces) return textContent.replace(regexps.normalizeRe, " ");
		else return textContent;
		}

		/**
		* Get the number of times a string s appears in the node e.
		*
		* @param Element
		* @param string - what to split on. Default is ","
		* @return number (integer)
		**/
		function getCharCount(e, s) {
		s = s \|\| ",";
		return getInnerText(e).split(s).length;
		}

		/**
		* Get the density of links as a percentage of the content
		* This is the amount of text that is inside a link divided by the total text in the node.
		*
		* @param Element
		* @return number (float)
		**/
		function getLinkDensity(e) {
		var links = e.getElementsByTagName("a");
		var textLength = getInnerText(e).length;
		var linkLength = 0;
		for (var i = 0, il = links.length; i < il; i++) {
		linkLength += getInnerText(links[i]).length;
		}

		return linkLength / textLength;
		}

		/**
		* Get an elements class/id weight. Uses regular expressions to tell if this
		* element looks good or bad.
		*
		* @param Element
		* @return number (Integer)
		**/
		function getClassWeight(e) {
		var weight = 0;

		/* Look for a special classname */
		if (e.className != "") {
		if (e.className.search(regexps.negativeRe) !== -1) weight -= 25;

		if (e.className.search(regexps.positiveRe) !== -1) weight += 25;
		}

		/* Look for a special ID */
		if (typeof (e.id) == 'string' && e.id != "") {
		if (e.id.search(regexps.negativeRe) !== -1) weight -= 25;

		if (e.id.search(regexps.positiveRe) !== -1) weight += 25;
		}

		return weight;
		}

		/**
		* Clean a node of all elements of type "tag".
		* (Unless it's a youtube/vimeo video. People love movies.)
		*
		* @param Element
		* @param string tag to clean
		* @return void
		**/
		function clean(e, tag) {
		var targetList = e.getElementsByTagName(tag);
		var isEmbed = (tag == 'object' \|\| tag == 'embed');

		for (var y = targetList.length - 1; y >= 0; y--) {
		/* Allow youtube and vimeo videos through as people usually want to see those. */
		if (isEmbed && targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
		continue;
		}

		targetList[y].parentNode.removeChild(targetList[y]);
		}
		}

		/**
		* Clean an element of all tags of type "tag" if they look fishy.
		* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
		*
		* @return void
		**/
		function cleanConditionally(e, tag) {
		var tagsList = e.getElementsByTagName(tag);
		var curTagsLength = tagsList.length;

		/**
		* Gather counts for other typical elements embedded within.
		* Traverse backwards so we can remove nodes at the same time without effecting the traversal.
		*
		* TODO: Consider taking into account original contentScore here.
		**/
		for (var i = curTagsLength - 1; i >= 0; i--) {
		var weight = getClassWeight(tagsList[i]);

		dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability != 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));

		if (weight < 0) {
		tagsList[i].parentNode.removeChild(tagsList[i]);
		} else if (getCharCount(tagsList[i], ',') < 10) {
		/**
		* If there are not very many commas, and the number of
		* non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
		**/

		var p = tagsList[i].getElementsByTagName("p").length;
		var img = tagsList[i].getElementsByTagName("img").length;
		var li = tagsList[i].getElementsByTagName("li").length - 100;
		var input = tagsList[i].getElementsByTagName("input").length;

		var embedCount = 0;
		var embeds = tagsList[i].getElementsByTagName("embed");
		for (var ei = 0, il = embeds.length; ei < il; ei++) {
		if (embeds[ei].src.search(regexps.videoRe) == -1) {
		embedCount++;
		}
		}

		var linkDensity = getLinkDensity(tagsList[i]);
		var contentLength = getInnerText(tagsList[i]).length;
		var toRemove = false;

		if (img > p) {
		toRemove = true;
		} else if (li > p && tag != "ul" && tag != "ol") {
		toRemove = true;
		} else if (input > Math.floor(p / 3)) {
		toRemove = true;
		} else if (contentLength < 25 && (img == 0 \|\| img > 2)) {
		toRemove = true;
		} else if (weight < 25 && linkDensity > .2) {
		toRemove = true;
		} else if (weight >= 25 && linkDensity > .5) {
		toRemove = true;
		} else if ((embedCount == 1 && contentLength < 75) \|\| embedCount > 1) {
		toRemove = true;
		}

		if (toRemove) {
		tagsList[i].parentNode.removeChild(tagsList[i]);
		}
		}
		}
		}

		/**
		* Clean out spurious headers from an Element. Checks things like classnames and link density.
		*
		* @param Element
		* @return void
		**/
		function cleanHeaders(e) {
		for (var headerIndex = 1; headerIndex < 7; headerIndex++) {
		var headers = e.getElementsByTagName('h' + headerIndex);
		for (var i = headers.length - 1; i >= 0; i--) {
		if (getClassWeight(headers[i]) < 0 \|\| getLinkDensity(headers[i]) > 0.33) {
		headers[i].parentNode.removeChild(headers[i]);
		}
		}
		}
		}

		function prepArticle(articleContent) {
		cleanStyles(articleContent);
		killBreaks(articleContent);

		/* Clean out junk from the article content */
		clean(articleContent, "form");
		clean(articleContent, "object");
		clean(articleContent, "h1");
		/**
		* If there is only one h2, they are probably using it
		* as a header and not a subheader, so remove it since we already have a header.
		***/
		if (articleContent.getElementsByTagName('h2').length == 1) clean(articleContent, "h2");
		clean(articleContent, "iframe");

		cleanHeaders(articleContent);

		/* Do these last as the previous stuff may have removed junk that will affect these */
		cleanConditionally(articleContent, "table");
		cleanConditionally(articleContent, "ul");
		cleanConditionally(articleContent, "div");

		/* Remove extra paragraphs */
		var articleParagraphs = articleContent.getElementsByTagName('p');
		for (i = articleParagraphs.length - 1; i >= 0; i--) {
		var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
		var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
		var objectCount = articleParagraphs[i].getElementsByTagName('object').length;

		if (imgCount == 0 && embedCount == 0 && objectCount == 0 && getInnerText(articleParagraphs[i], false) == '') {
		articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
		}
		}

		try {
		articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]>\s<p/gi, '<p');
		} catch (e) {
		dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.");
		}

		}

		/**
		* Initialize a node with the readability object. Also checks the
		* className/id for special names to add to its score.
		*
		* @param Element
		* @return void
		**/
		function initializeNode(node) {
		node.readability = {
		"contentScore": 0
		};

		switch (node.tagName) {
		case 'DIV':
		node.readability.contentScore += 5;
		break;

		case 'PRE':
		case 'TD':
		case 'BLOCKQUOTE':
		node.readability.contentScore += 3;
		break;

		case 'ADDRESS':
		case 'OL':
		case 'UL':
		case 'DL':
		case 'DD':
		case 'DT':
		case 'LI':
		case 'FORM':
		node.readability.contentScore -= 3;
		break;

		case 'H1':
		case 'H2':
		case 'H3':
		case 'H4':
		case 'H5':
		case 'H6':
		case 'TH':
		node.readability.contentScore -= 5;
		break;
		}

		node.readability.contentScore += getClassWeight(node);
		}

		function read(html, options, callback) {
		@@ -647,0 +82,0 @@ if (typeof options === 'function') {

node-readability - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics