Socket
Socket
Sign inDemoInstall

node-readability

Package Overview
Dependencies
Maintainers
1
Versions
27
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

node-readability - npm Package Compare versions

Comparing version 0.0.2 to 0.0.3

src/helpers.js

16

examples/simple.js

@@ -1,9 +0,13 @@

var readability = require('../src/readability');
var readability = require('../src/readability')
, fs = require('fs')
// uncomment the following line to print the debug info to the console.
// readability.debug(true);
readability.debug(true);
readability.read('http://howtonode.org/really-simple-file-uploads', function(err, read) {
console.log(read.getContent());
console.log(read.getTitle());
});
readability.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html',
function(err, read) {
var dom = read.getDocument();
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.getTitle()+'</h1>'+read.getContent()+'</body></html>';
console.log(html);
});
{
"name": "node-readability",
"version": "0.0.2",
"version": "0.0.3",
"author": "Zihua Li",

@@ -5,0 +5,0 @@ "description": "Turn any web page into a clean view",

# node-readability
Turn any web page into a clean view. It's based on arc90's readability project.
Turn any web page into a clean view. This module is based on arc90's readability project.

@@ -10,3 +10,2 @@ ## Install

## Requirements
* [node.js](http://nodejs.org/)
* [jsdom](https://github.com/tmpvar/jsdom)

@@ -23,3 +22,3 @@ * [fetch](https://github.com/andris9/fetch)

* **options** is an optional options object
* **callback** is the callback to run - `callback(error, read)`
* **callback** is the callback to run - `callback(error, article)`

@@ -30,3 +29,2 @@ Example

// source file is iso-8859-15 but it is converted to utf-8 automatically
readability.read('http://howtonode.org/really-simple-file-uploads', function(err, article) {

@@ -36,6 +34,8 @@ console.log(article.getArticleContent());

**NB** If the file has been marked with charset other than utf-8, it is converted automatically. Charsets such as GBK, GB2312 is also supported via icon.
**NB** If the file has been marked with a charset other than utf-8, it is converted automatically. Charsets such as GBK and GB2312 are also supported via [iconv](https://github.com/bnoordhuis/node-iconv).
## Options
node-readability supports all the options that [fetch](https://github.com/andris9/fetch) supports.
Possible option values

@@ -61,4 +61,2 @@

Readability supports lazy evaluation by passing `readResult` to the callback function.
### getContent()

@@ -82,4 +80,4 @@

* Support more readability features
* Performance optimization
* Support more readability features
* Performance optimization

@@ -86,0 +84,0 @@ ## License

var jsdom = require('jsdom'),
fetchUrl = require("fetch").fetchUrl
fetchUrl = require("fetch").fetchUrl,
helpers = require('./helpers')
var dbg;
exports.debug = function (debug) {
dbg = (debug) ? console.log : function () {}
helpers.debug(debug);
};

@@ -11,21 +11,2 @@

/**
 * Strip leading and trailing whitespace from a string.
 *
 * The two anchored alternatives must be separate branches: a single
 * pattern that requires leading whitespace to be immediately followed by
 * trailing whitespace only matches strings that are entirely blank.
 *
 * @param string the text to trim
 * @return string the text without surrounding whitespace
 **/
function trim(string) {
    return string.replace(/^\s+|\s+$/g, '');
}
// All of the regular expressions in use within readability.
var regexps = {
    // class/id fragments that mark a node for removal in grabArticle ...
    unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
    // ... unless the same class/id string also matches one of these.
    okMaybeItsACandidateRe: /and|article|body|column|main/i,
    // class/id fragments worth +25 / -25 in getClassWeight.
    positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i,
    negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|utility|tags|widget/i,
    // opening tags whose presence inside a div stops grabArticle from turning that div into a p.
    divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    // two or more consecutive <br> tags; prepDocument replaces them with '</p><p>'.
    replaceBrsRe: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
    // opening/closing <font> tags; prepDocument rewrites them as <span>.
    replaceFontsRe: /<(\/?)font[^>]*>/gi,
    // leading/trailing whitespace and whitespace runs, used by getInnerText.
    trimRe: /^\s+|\s+$/g,
    normalizeRe: /\s{2,}/g,
    // one or more <br> tags with trailing whitespace/&nbsp;, collapsed by killBreaks.
    killBreaksRe: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
    // video hosts whose embeds are kept by clean/cleanConditionally.
    // Accepts both http:// and https:// so secure embeds are not treated as junk.
    videoRe: /https?:\/\/(www\.)?(youtube|vimeo|youku|tudou|56|yinyuetai)\.com/i
};
function Readablity(document, options) {

@@ -40,3 +21,3 @@ this._document = document;

prepDocument(this._document);
helpers.prepDocument(this._document);
this.cache = {

@@ -52,7 +33,7 @@ 'body': this._document.body.innerHTML

articleContent = grabArticle(this._document);
if (getInnerText(articleContent, false) === "") {
articleContent = helpers.grabArticle(this._document);
if (helpers.getInnerText(articleContent, false) === "") {
this._document.body.innerHTML = this.cache['body'];
articleContent = grabArticle(this._document, true);
if (getInnerText(articleContent, false) === "") {
articleContent = helpers.grabArticle(this._document, true);
if (helpers.getInnerText(articleContent, false) === "") {
return this.cache['article-content'] = false;

@@ -72,8 +53,10 @@ }

var betterTitle;
var commonSeparatingCharacters = ['|', '_', '-', '«'];
var commonSeparatingCharacters = [' | ', ' _ ', ' - ', '«', '»', '—'];
var self = this;
commonSeparatingCharacters.forEach(function (char) {
var tmpArray = title.split(char);
if (tmpArray.length > 1) {
if (betterTitle) return this.cache['article-title'] = title;
betterTitle = trim(tmpArray[0]);
if (betterTitle) return self.cache['article-title'] = title;
betterTitle = tmpArray[0].trim();
}

@@ -97,550 +80,2 @@ });

/**
 * Prepare the HTML document for readability to scrape it.
 * This includes things like stripping javascript, CSS, and handling terrible markup.
 *
 * Steps, in order:
 *   1. If the page uses frames, replace the body with the content of the
 *      largest accessible frame and drop the frameset.
 *   2. Remove every script element.
 *   3. Disable every stylesheet.
 *   4. Rewrite runs of <br> as paragraph breaks and <font> tags as <span>.
 *
 * @param document the DOM document to clean up; it is modified in place
 * @return void
 **/
function prepDocument(document) {
    var frames = document.getElementsByTagName('frame');
    if (frames.length > 0) {
        var bestFrame = null;
        var bestFrameSize = 0;
        // The result of getElementsByTagName is array-like but not guaranteed
        // to expose forEach itself, so borrow it from Array.prototype.
        Array.prototype.forEach.call(frames, function (frame) {
            var frameSize = frame.offsetWidth + frame.offsetHeight;
            var canAccessFrame = false;
            try {
                // Touching the frame's body throws when the frame is not accessible.
                frame.contentWindow.document.body;
                canAccessFrame = true;
            } catch (e) {}
            if (canAccessFrame && frameSize > bestFrameSize) {
                bestFrame = frame;
                bestFrameSize = frameSize;
            }
        });
        if (bestFrame) {
            var newBody = document.createElement('body');
            newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
            newBody.style.overflow = 'scroll';
            document.body = newBody;
            var frameset = document.getElementsByTagName('frameset')[0];
            if (frameset) {
                frameset.parentNode.removeChild(frameset);
            }
        }
    }
    // remove all scripts that are not readability
    // Walk backwards: the collection may be live, in which case removing
    // entry i while counting upwards would skip the element that slides
    // into its place and leave every other script behind.
    var scripts = document.getElementsByTagName('script');
    for (var i = scripts.length - 1; i >= 0; --i) {
        scripts[i].parentNode.removeChild(scripts[i]);
    }
    // remove all stylesheets
    for (var k = 0; k < document.styleSheets.length; k++) {
        document.styleSheets[k].disabled = true;
    }
    // turn all double br's into p's
    // note, this is pretty costly as far as processing goes. Maybe optimize later.
    document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '</p><p>').replace(regexps.replaceFontsRe, '<$1span>');
}
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* The passed document is modified along the way: nodes are removed, divs are replaced by
* paragraphs, and the selected nodes are moved out of the document into the returned container.
*
* @param document the DOM document to scan
* @param preserveUnlikelyCandidates when truthy, the "unlikely candidate" removal pass is skipped
* @return Element a new DIV with id "readability-content", already passed through prepArticle
**/
function grabArticle(document, preserveUnlikelyCandidates) {
/**
* First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
* into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
*
* Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
* TODO: Shouldn't this be a reverse traversal?
**/
// NOTE(review): nodes are removed/replaced while `nodes` is walked by ascending index. If the
// collection is live, the element following a removed node would be skipped - confirm against
// the DOM implementation in use (this is what the TODO above is asking).
var nodes = document.getElementsByTagName('*');
for (var i = 0; i < nodes.length; ++i) {
var node = nodes[i];
// Remove unlikely candidates */
// continueFlag marks a node that was just removed so the div handling below is skipped for it.
var continueFlag = false;
if (!preserveUnlikelyCandidates) {
// class name and id are concatenated into a single string and matched together.
var unlikelyMatchString = node.className + node.id;
if (unlikelyMatchString.search(regexps.unlikelyCandidatesRe) !== -1 && unlikelyMatchString.search(regexps.okMaybeItsACandidateRe) == -1 && node.tagName !== "BODY") {
dbg("Removing unlikely candidate - " + unlikelyMatchString);
node.parentNode.removeChild(node);
continueFlag = true;
}
}
// Turn all divs that don't have children block level elements into p's
if (!continueFlag && node.tagName === "DIV") {
if (node.innerHTML.search(regexps.divToPElementsRe) === -1) {
dbg("Altering div to p");
var newNode = document.createElement('p');
newNode.innerHTML = node.innerHTML;
node.parentNode.replaceChild(newNode, node);
} else {
// EXPERIMENTAL
// Wrap bare text nodes of a block-containing div in inline <p> elements. The
// 'readability-styled' class is what cleanStyles checks before stripping style attributes.
node.childNodes.forEach(function (childNode) {
if (childNode.nodeType == 3 /*TEXT_NODE*/ ) {
dbg("replacing text node with a p tag with the same content.");
var p = document.createElement('p');
p.innerHTML = childNode.nodeValue;
p.style.display = 'inline';
p.className = 'readability-styled';
childNode.parentNode.replaceChild(p, childNode);
}
});
}
}
}
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var allParagraphs = document.getElementsByTagName("p");
var candidates = [];
for (var i = 0; i < allParagraphs.length; ++i) {
var paragraph = allParagraphs[i];
var parentNode = paragraph.parentNode;
var grandParentNode = parentNode.parentNode;
var innerText = getInnerText(paragraph);
// If this paragraph is less than 25 characters, don't even count it.
if (innerText.length >= 25) {
// Initialize readability data for the parent.
if (typeof parentNode.readability == 'undefined') {
initializeNode(parentNode);
candidates.push(parentNode);
}
// Initialize readability data for the grandparent.
if (typeof grandParentNode.readability == 'undefined') {
initializeNode(grandParentNode);
candidates.push(grandParentNode);
}
var contentScore = 0;
// Add a point for the paragraph itself as a base. */
++contentScore;
// Add points for any commas within this paragraph */
// split(',').length is the comma count plus one, so a comma-free paragraph still adds 1 here.
contentScore += innerText.split(',').length;
// For every 100 characters in this paragraph, add another point. Up to 3 points. */
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
// Add the score to the parent. The grandparent gets half. */
parentNode.readability.contentScore += contentScore;
grandParentNode.readability.contentScore += contentScore / 2;
}
}
/**
* After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score.
**/
var topCandidate = null;
candidates.forEach(function (candidate) {
/**
* Scale the final candidates score based on link density. Good content should have a
* relatively small link density (5% or less) and be mostly unaffected by this operation.
**/
candidate.readability.contentScore = candidate.readability.contentScore * (1 - getLinkDensity(candidate));
dbg('Candidate: ' + candidate + " (" + candidate.className + ":" + candidate.id + ") with score " + candidate.readability.contentScore);
// Strict '>' means the earliest candidate wins a tie.
if (!topCandidate || candidate.readability.contentScore > topCandidate.readability.contentScore) topCandidate = candidate;
});
/**
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
**/
if (topCandidate === null || topCandidate.tagName === "BODY") {
topCandidate = document.createElement("DIV");
topCandidate.innerHTML = document.body.innerHTML;
document.body.innerHTML = "";
document.body.appendChild(topCandidate);
initializeNode(topCandidate);
}
/**
* Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc.
**/
var articleContent = document.createElement("DIV");
articleContent.id = "readability-content";
// A sibling qualifies by score when it reaches 20% of the top score, but never less than 10.
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes = topCandidate.parentNode.childNodes;
for (var i = 0, il = siblingNodes.length; i < il; i++) {
var siblingNode = siblingNodes[i];
var append = false;
dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
if (siblingNode === topCandidate) {
append = true;
}
if (typeof siblingNode.readability != 'undefined' && siblingNode.readability.contentScore >= siblingScoreThreshold) {
append = true;
}
if (siblingNode.nodeName == "P") {
var linkDensity = getLinkDensity(siblingNode);
var nodeContent = getInnerText(siblingNode);
var nodeLength = nodeContent.length;
// Longer paragraphs are kept when under 25% link text; shorter ones only when they
// have no link text at all and contain a period followed by a space or the end of the text.
// A paragraph of exactly 80 characters matches neither branch.
if (nodeLength > 80 && linkDensity < 0.25) {
append = true;
} else if (nodeLength < 80 && linkDensity == 0 && nodeContent.search(/\.( |$)/) !== -1) {
append = true;
}
}
if (append) {
dbg("Appending node: " + siblingNode)
/* Append sibling and subtract from our list because it removes the node when you append to another node */
articleContent.appendChild(siblingNode);
i--;
il--;
}
}
/**
* So we have all of the content that we need. Now we clean it up for presentation.
**/
prepArticle(articleContent);
return articleContent;
};
/**
 * Strip the inline `style` attribute from a node and from every element
 * beneath it. Anything whose class name is exactly 'readability-styled'
 * keeps its own style attribute, but its descendants are still visited.
 *
 * @param Element e root of the subtree to clean; falsy values are ignored
 * @return void
 **/
function cleanStyles(e) {
    if (!e) return;

    // The root may be something without removeAttribute, so probe first.
    var rootIsStylable = typeof e.removeAttribute == 'function' && e.className != 'readability-styled';
    if (rootIsStylable) e.removeAttribute('style');

    // Visit each direct child; only element nodes (nodeType 1) are processed.
    for (var child = e.firstChild; child; child = child.nextSibling) {
        if (child.nodeType != 1) continue;
        if (child.className != "readability-styled") {
            child.removeAttribute("style");
        }
        cleanStyles(child);
    }
}
/**
 * Collapse every run of break tags (with any whitespace or &nbsp; that
 * trails them) inside a node into a single '<br />'.
 *
 * @param Element e node whose innerHTML is rewritten in place
 * @return void
 **/
function killBreaks(e) {
    var markup = e.innerHTML;
    var collapsed = markup.replace(regexps.killBreaksRe, '<br />');
    e.innerHTML = collapsed;
}
/**
 * Read the text content of a node with surrounding whitespace removed.
 * Unless told otherwise, runs of two or more whitespace characters inside
 * the text are also collapsed to a single space.
 *
 * @param Element e node to read `textContent` from
 * @param boolean normalizeSpaces collapse inner whitespace runs; treated as true only when omitted
 * @return string
 **/
function getInnerText(e, normalizeSpaces) {
    // Only a missing argument turns normalization on; any passed value is used as given.
    var collapseRuns = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces;
    var trimmed = e.textContent.replace(regexps.trimRe, "");
    return collapseRuns ? trimmed.replace(regexps.normalizeRe, " ") : trimmed;
}
/**
 * Get the number of times a string s appears in the node e.
 *
 * Splitting on s yields one more piece than there are separators, so the
 * piece count is reduced by one to give the actual number of occurrences
 * (0 when s does not appear at all).
 *
 * @param Element e node whose normalized inner text is searched
 * @param string s what to count. Default is ","
 * @return number (integer)
 **/
function getCharCount(e, s) {
    s = s || ",";
    return getInnerText(e).split(s).length - 1;
}
/**
 * Get the density of links as a percentage of the content
 * This is the amount of text that is inside a link divided by the total text in the node.
 *
 * A node with no text at all has a density of 0; without this guard the
 * division would be 0 / 0 and produce NaN, which silently defeats every
 * numeric comparison the callers make on the result.
 *
 * @param Element e node to measure
 * @return number (float)
 **/
function getLinkDensity(e) {
    var textLength = getInnerText(e).length;
    if (textLength === 0) return 0;
    var links = e.getElementsByTagName("a");
    var linkLength = 0;
    for (var i = 0, il = links.length; i < il; i++) {
        linkLength += getInnerText(links[i]).length;
    }
    return linkLength / textLength;
}
/**
 * Score an element by its class name and id. Each of the two values
 * contributes -25 when it matches the negative pattern and +25 when it
 * matches the positive pattern (both can apply to the same value).
 *
 * @param Element e element whose className and id are inspected
 * @return number (Integer) the combined weight
 **/
function getClassWeight(e) {
    // Weight contributed by a single attribute value.
    function scoreName(name) {
        var delta = 0;
        if (name.search(regexps.negativeRe) !== -1) delta -= 25;
        if (name.search(regexps.positiveRe) !== -1) delta += 25;
        return delta;
    }

    /* Class name counts whenever it is not the empty string */
    var fromClass = (e.className != "") ? scoreName(e.className) : 0;

    /* The id counts only when it is a non-empty string */
    var idIsUsable = typeof (e.id) == 'string' && e.id != "";
    var fromId = idIsUsable ? scoreName(e.id) : 0;

    return fromClass + fromId;
}
/**
 * Remove every descendant of `e` with the given tag name.
 * For 'object' and 'embed' tags, elements whose markup mentions one of the
 * known video hosts are left in place.
 *
 * @param Element e root element to search under
 * @param string tag tag name to remove
 * @return void
 **/
function clean(e, tag) {
    var matches = e.getElementsByTagName(tag);
    var keepVideos = (tag == 'object' || tag == 'embed');

    // Count down so removing an entry never disturbs the ones still to visit.
    var idx = matches.length;
    while (idx-- > 0) {
        var candidate = matches[idx];
        var isVideo = keepVideos && candidate.innerHTML.search(regexps.videoRe) !== -1;
        if (!isVideo) {
            candidate.parentNode.removeChild(candidate);
        }
    }
}
/**
 * Remove descendants of `e` with tag name `tag` when they look like
 * non-content. An element is dropped outright when its class/id weight is
 * negative. Otherwise, if its comma count is below 10, it is dropped when
 * any of the heuristics below fires (image/paragraph ratio, list items,
 * inputs, very short text, link density, non-video embeds).
 *
 * TODO: Consider taking into account original contentScore here.
 *
 * @param Element e root element to search under
 * @param string tag tag name of the elements to examine
 * @return void
 **/
function cleanConditionally(e, tag) {
    var matches = e.getElementsByTagName(tag);

    // Traverse backwards so nodes can be removed without affecting the traversal.
    var idx = matches.length;
    while (idx-- > 0) {
        var target = matches[idx];
        var weight = getClassWeight(target);
        var scoreNote = (typeof target.readability != 'undefined') ? (" with score " + target.readability.contentScore) : '';
        dbg("Cleaning Conditionally " + target + " (" + target.className + ":" + target.id + ")" + scoreNote);

        if (weight < 0) {
            target.parentNode.removeChild(target);
            continue;
        }
        // Elements with plenty of commas are assumed to be prose and left alone.
        if (!(getCharCount(target, ',') < 10)) {
            continue;
        }

        var p = target.getElementsByTagName("p").length;
        var img = target.getElementsByTagName("img").length;
        // The -100 offset means list items only outnumber paragraphs in very long lists.
        var li = target.getElementsByTagName("li").length - 100;
        var input = target.getElementsByTagName("input").length;

        // Count embeds whose src is not one of the known video hosts.
        var embeds = target.getElementsByTagName("embed");
        var embedCount = 0;
        for (var k = 0; k < embeds.length; k++) {
            if (embeds[k].src.search(regexps.videoRe) == -1) {
                embedCount += 1;
            }
        }

        var linkDensity = getLinkDensity(target);
        var contentLength = getInnerText(target).length;

        var looksFishy =
            img > p ||
            (li > p && tag != "ul" && tag != "ol") ||
            input > Math.floor(p / 3) ||
            (contentLength < 25 && (img == 0 || img > 2)) ||
            (weight < 25 && linkDensity > .2) ||
            (weight >= 25 && linkDensity > .5) ||
            (embedCount == 1 && contentLength < 75) ||
            embedCount > 1;

        if (looksFishy) {
            target.parentNode.removeChild(target);
        }
    }
}
/**
 * Drop headings (h1 through h6) under an element when they look spurious:
 * either their class/id weight is negative or more than a third of their
 * text is link text.
 *
 * @param Element e root element to search under
 * @return void
 **/
function cleanHeaders(e) {
    ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].forEach(function (tagName) {
        var found = e.getElementsByTagName(tagName);
        // Count down so removals do not shift entries still to be checked.
        var pos = found.length;
        while (pos-- > 0) {
            var heading = found[pos];
            var looksSpurious = getClassWeight(heading) < 0 || getLinkDensity(heading) > 0.33;
            if (looksSpurious) {
                heading.parentNode.removeChild(heading);
            }
        }
    });
}
/**
 * Clean up the extracted article container for presentation: strip inline
 * styles and redundant breaks, remove forms/objects/iframes and unwanted
 * headings, conditionally remove suspicious tables, lists and divs, and
 * finally drop paragraphs that hold neither text nor media.
 *
 * @param Element articleContent the container to clean; modified in place
 * @return void
 **/
function prepArticle(articleContent) {
    cleanStyles(articleContent);
    killBreaks(articleContent);
    /* Clean out junk from the article content */
    clean(articleContent, "form");
    clean(articleContent, "object");
    clean(articleContent, "h1");
    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     ***/
    if (articleContent.getElementsByTagName('h2').length == 1) clean(articleContent, "h2");
    clean(articleContent, "iframe");
    cleanHeaders(articleContent);
    /* Do these last as the previous stuff may have removed junk that will affect these */
    cleanConditionally(articleContent, "table");
    cleanConditionally(articleContent, "ul");
    cleanConditionally(articleContent, "div");
    /* Remove extra paragraphs */
    var articleParagraphs = articleContent.getElementsByTagName('p');
    // The counter is declared locally: assigning to an undeclared name creates
    // a global in sloppy mode and throws a ReferenceError in strict mode.
    for (var i = articleParagraphs.length - 1; i >= 0; i--) {
        var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
        var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
        var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
        if (imgCount == 0 && embedCount == 0 && objectCount == 0 && getInnerText(articleParagraphs[i], false) == '') {
            articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
        }
    }
    // Best effort: drop a <br> that sits directly before a paragraph; a failure
    // to rewrite innerHTML is logged and otherwise ignored.
    try {
        articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
    } catch (e) {
        dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.");
    }
}
/**
 * Attach a fresh `readability` record to a node. Its contentScore starts
 * from a fixed adjustment for the node's tag name (0 for tags not listed)
 * and then has the node's class/id weight added.
 *
 * @param Element node the node to initialize; any existing record is replaced
 * @return void
 **/
function initializeNode(node) {
    // Starting score per (upper-case) tag name.
    var tagAdjustments = {
        DIV: 5,
        PRE: 3, TD: 3, BLOCKQUOTE: 3,
        ADDRESS: -3, OL: -3, UL: -3, DL: -3, DD: -3, DT: -3, LI: -3, FORM: -3,
        H1: -5, H2: -5, H3: -5, H4: -5, H5: -5, H6: -5, TH: -5
    };

    node.readability = {
        "contentScore": 0
    };

    // Only exact string matches on the table's own keys count.
    var tag = node.tagName;
    var isListed = typeof tag === 'string' && Object.prototype.hasOwnProperty.call(tagAdjustments, tag);
    if (isListed) {
        node.readability.contentScore += tagAdjustments[tag];
    }

    node.readability.contentScore += getClassWeight(node);
}
function read(html, options, callback) {

@@ -647,0 +82,0 @@ if (typeof options === 'function') {

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc