node-readability
Advanced tools
Comparing version 0.4.0 to 0.9.0
{ | ||
"name": "node-readability", | ||
"version": "0.4.0", | ||
"version": "0.9.0", | ||
"author": "Zihua Li", | ||
@@ -5,0 +5,0 @@ "description": "Turning any web page into a clean view.", |
@@ -9,4 +9,6 @@ # Readability | ||
1. Optimized for more websites. | ||
2. Support encodings such as GBK and GB2312. | ||
3. Converts relative urls to absolute for images and links automatically(Thank [Guillermo Baigorria](https://github.com/gbaygon) & [Tom Sutton](https://github.com/tomsutton1984)). | ||
2. Supporting HTML5 tags(`article`, `section`) and Microdata API. | ||
3. Focusing on both accuracy and performance. 4x faster than arc90's version. | ||
3. Supporting encodings such as GBK and GB2312. | ||
4. Converting relative urls to absolute for images and links automatically(Thank [Guillermo Baigorria](https://github.com/gbaygon) & [Tom Sutton](https://github.com/tomsutton1984)). | ||
@@ -36,13 +38,13 @@ ## Example | ||
read('http://howtonode.org/really-simple-file-uploads', function(err, article, meta) { | ||
// The main body of the page. | ||
// Main Article | ||
console.log(article.content); | ||
// The title of the page. | ||
// Title | ||
console.log(article.title); | ||
// The raw HTML code of the page | ||
// HTML Source Code | ||
console.log(article.html); | ||
// The document object of the page | ||
// DOM | ||
console.log(article.document); | ||
// The response object from request lib | ||
// Response Object from Request Lib | ||
console.log(meta); | ||
@@ -49,0 +51,0 @@ }); |
@@ -5,3 +5,3 @@ var url = require("url"); | ||
var regexps = { | ||
unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i, | ||
unlikelyCandidatesRe: /combx|modal|lightbox|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor|social|teaserlist|time|tweet|twitter/i, | ||
okMaybeItsACandidateRe: /and|article|body|column|main/i, | ||
@@ -20,4 +20,4 @@ positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i, | ||
var dbg; | ||
exports.debug = function (debug) { | ||
dbg = (debug) ? console.log : function () {}; | ||
exports.debug = function(debug) { | ||
dbg = (debug) ? console.log : function() {}; | ||
}; | ||
@@ -37,3 +37,3 @@ | ||
**/ | ||
var prepDocument = module.exports.prepDocument = function (document) { | ||
var prepDocument = module.exports.prepDocument = function(document) { | ||
var frames = document.getElementsByTagName('frame'); | ||
@@ -44,3 +44,3 @@ if (frames.length > 0) { | ||
Array.prototype.slice.call(frames, 0).forEach(function (frame) { | ||
Array.prototype.slice.call(frames, 0).forEach(function(frame) { | ||
var frameSize = frame.offsetWidth + frame.offsetHeight; | ||
@@ -72,16 +72,6 @@ var canAccessFrame = false; | ||
// remove all scripts that are not readability | ||
var scripts = document.getElementsByTagName('script'); | ||
for (var i = 0; i < scripts.length; ++i) { | ||
scripts[i].parentNode.removeChild(scripts[i]); | ||
} | ||
// remove all stylesheets | ||
for (var k = 0; k < document.styleSheets.length; k++) { | ||
document.styleSheets[k].disabled = true; | ||
} | ||
// turn all double br's into p's | ||
// note, this is pretty costly as far as processing goes. Maybe optimize later. | ||
document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '</p><p>').replace(regexps.replaceFontsRe, '<$1span>') | ||
} | ||
// document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '</p><p>').replace(regexps.replaceFontsRe, '<$1span>'); | ||
}; | ||
@@ -94,3 +84,3 @@ /*** | ||
**/ | ||
var grabArticle = module.exports.grabArticle = function (document, preserveUnlikelyCandidates) { | ||
var grabArticle = module.exports.grabArticle = function(document, preserveUnlikelyCandidates) { | ||
/** | ||
@@ -118,3 +108,3 @@ * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs | ||
// Turn all divs that don't have children block level elements into p's | ||
if (!continueFlag && node.tagName === "DIV") { | ||
if (!continueFlag && node.tagName === 'DIV') { | ||
if (node.innerHTML.search(regexps.divToPElementsRe) === -1) { | ||
@@ -127,3 +117,3 @@ dbg("Altering div to p"); | ||
// EXPERIMENTAL | ||
node.childNodes._toArray().forEach(function (childNode) { | ||
node.childNodes._toArray().forEach(function(childNode) { | ||
if (childNode.nodeType == 3 /*TEXT_NODE*/ ) { | ||
@@ -177,3 +167,2 @@ // use span instead of p. Need more tests. | ||
// Add points for any commas within this paragraph */ | ||
// support Chinese commas. | ||
contentScore += innerText.replace(',', ',').split(',').length; | ||
@@ -195,3 +184,3 @@ | ||
var topCandidate = null; | ||
candidates.forEach(function (candidate) { | ||
candidates.forEach(function(candidate) { | ||
/** | ||
@@ -212,8 +201,10 @@ * Scale the final candidates score based on link density. Good content should have a | ||
**/ | ||
if (topCandidate === null || topCandidate.tagName === "BODY") { | ||
if (topCandidate === null || topCandidate.tagName === 'BODY') { | ||
// With no top candidate, bail out if no body tag exists as last resort. | ||
if (!document.body) return new Error("No body tag was found."); | ||
topCandidate = document.createElement("DIV"); | ||
if (!document.body) { | ||
return new Error('No body tag was found.'); | ||
} | ||
topCandidate = document.createElement('DIV'); | ||
topCandidate.innerHTML = document.body.innerHTML; | ||
document.body.innerHTML = ""; | ||
document.body.innerHTML = ''; | ||
document.body.appendChild(topCandidate); | ||
@@ -228,4 +219,4 @@ initializeNode(topCandidate); | ||
**/ | ||
var articleContent = document.createElement("DIV"); | ||
articleContent.id = "readability-content"; | ||
var articleContent = document.createElement('DIV'); | ||
articleContent.id = 'readability-content'; | ||
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); | ||
@@ -237,4 +228,4 @@ var siblingNodes = topCandidate.parentNode.childNodes; | ||
dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); | ||
dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); | ||
dbg('Looking at sibling node: ' + siblingNode + ' (' + siblingNode.className + ':' + siblingNode.id + ')' + ((typeof siblingNode.readability != 'undefined') ? (' with score ' + siblingNode.readability.contentScore) : '')); | ||
dbg('Sibling has score ' + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); | ||
@@ -249,3 +240,3 @@ if (siblingNode === topCandidate) { | ||
if (siblingNode.nodeName == "P") { | ||
if (siblingNode.nodeName == 'P') { | ||
var linkDensity = getLinkDensity(siblingNode); | ||
@@ -257,3 +248,3 @@ var nodeContent = getInnerText(siblingNode); | ||
append = true; | ||
} else if (nodeLength < 80 && linkDensity == 0 && nodeContent.search(/\.( |$)/) !== -1) { | ||
} else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) { | ||
append = true; | ||
@@ -264,3 +255,3 @@ } | ||
if (append) { | ||
dbg("Appending node: " + siblingNode) | ||
dbg("Appending node: " + siblingNode); | ||
@@ -288,3 +279,3 @@ /* Append sibling and subtract from our list because it removes the node when you append to another node */ | ||
**/ | ||
function cleanStyles (e) { | ||
function cleanStyles(e) { | ||
if (!e) return; | ||
@@ -316,3 +307,3 @@ | ||
**/ | ||
function killBreaks (e) { | ||
function killBreaks(e) { | ||
e.innerHTML = e.innerHTML.replace(regexps.killBreaksRe, '<br />'); | ||
@@ -329,3 +320,3 @@ } | ||
**/ | ||
getInnerText = exports.getInnerText = function (e, normalizeSpaces) { | ||
getInnerText = exports.getInnerText = function(e, normalizeSpaces) { | ||
var textContent = ""; | ||
@@ -348,3 +339,3 @@ | ||
**/ | ||
function getCharCount (e, s) { | ||
function getCharCount(e, s) { | ||
s = s || ","; | ||
@@ -357,7 +348,7 @@ return getInnerText(e).split(s).length; | ||
* This is the amount of text that is inside a link divided by the total text in the node. | ||
* | ||
* | ||
* @param Element | ||
* @return number (float) | ||
**/ | ||
function getLinkDensity (e) { | ||
function getLinkDensity(e) { | ||
var links = e.getElementsByTagName("a"); | ||
@@ -370,3 +361,3 @@ | ||
// hack for <h2><a href="#menu"></a></h2> / <h2><a></a></h2> | ||
if(!href || (href.length > 0 && href[0] === '#')) continue; | ||
if (!href || (href.length > 0 && href[0] === '#')) continue; | ||
linkLength += getInnerText(links[i]).length; | ||
@@ -378,3 +369,3 @@ } | ||
/** | ||
* Get an elements class/id weight. Uses regular expressions to tell if this | ||
* Get an elements class/id weight. Uses regular expressions to tell if this | ||
* element looks good or bad. | ||
@@ -385,7 +376,7 @@ * | ||
**/ | ||
function getClassWeight (e) { | ||
function getClassWeight(e) { | ||
var weight = 0; | ||
/* Look for a special classname */ | ||
if (e.className != "") { | ||
if (e.className !== '') { | ||
if (e.className.search(regexps.negativeRe) !== -1) weight -= 25; | ||
@@ -397,3 +388,3 @@ | ||
/* Look for a special ID */ | ||
if (typeof (e.id) == 'string' && e.id != "") { | ||
if (typeof(e.id) == 'string' && e.id != "") { | ||
if (e.id.search(regexps.negativeRe) !== -1) weight -= 25; | ||
@@ -415,3 +406,3 @@ | ||
**/ | ||
function clean (e, tag) { | ||
function clean(e, tag) { | ||
var targetList = e.getElementsByTagName(tag); | ||
@@ -425,4 +416,4 @@ var isEmbed = (tag == 'object' || tag == 'embed'); | ||
var validRule = false; | ||
for(var i = 0; i < cleanRules.length; i++) { | ||
if(cleanRules[i](targetList[y], tag) === true) { | ||
for (var i = 0; i < cleanRules.length; i++) { | ||
if (cleanRules[i](targetList[y], tag) === true) { | ||
validRule = true; | ||
@@ -433,3 +424,3 @@ break; | ||
if(validRule) { | ||
if (validRule) { | ||
continue; | ||
@@ -441,3 +432,3 @@ } | ||
if (isEmbed) { | ||
if(targetList[y].innerHTML.search(regexps.videoRe) !== -1) { | ||
if (targetList[y].innerHTML.search(regexps.videoRe) !== -1) { | ||
continue; | ||
@@ -524,7 +515,8 @@ } | ||
**/ | ||
function fixLinks (e) { | ||
function fixLinks(e) { | ||
if (!e.ownerDocument.originalURL) { | ||
return; | ||
} | ||
function fixLink(link){ | ||
function fixLink(link) { | ||
var fixed = url.resolve(e.ownerDocument.originalURL, link); | ||
@@ -538,3 +530,3 @@ return fixed; | ||
var src = imgs[i].getAttribute('src'); | ||
if(src) { | ||
if (src) { | ||
imgs[i].setAttribute('src', fixLink(src)); | ||
@@ -547,3 +539,3 @@ } | ||
var href = as[i].getAttribute('href'); | ||
if(href) { | ||
if (href) { | ||
as[i].setAttribute('href', fixLink(href)); | ||
@@ -560,3 +552,3 @@ } | ||
**/ | ||
function cleanHeaders (e) { | ||
function cleanHeaders(e) { | ||
for (var headerIndex = 1; headerIndex < 7; headerIndex++) { | ||
@@ -579,3 +571,3 @@ var headers = e.getElementsByTagName('h' + headerIndex); | ||
function cleanSingleHeader (e) { | ||
function cleanSingleHeader(e) { | ||
for (var headerIndex = 1; headerIndex < 7; headerIndex++) { | ||
@@ -592,3 +584,3 @@ var headers = e.getElementsByTagName('h' + headerIndex); | ||
function prepArticle (articleContent) { | ||
function prepArticle(articleContent) { | ||
cleanStyles(articleContent); | ||
@@ -598,5 +590,7 @@ killBreaks(articleContent); | ||
/* Clean out junk from the article content */ | ||
clean(articleContent, "form"); | ||
clean(articleContent, "object"); | ||
clean(articleContent, "h1"); | ||
clean(articleContent, 'form'); | ||
clean(articleContent, 'object'); | ||
if (articleContent.getElementsByTagName('h1').length === 1) { | ||
clean(articleContent, 'h1'); | ||
} | ||
/** | ||
@@ -606,3 +600,3 @@ * If there is only one h2, they are probably using it | ||
***/ | ||
if (articleContent.getElementsByTagName('h2').length == 1) clean(articleContent, "h2"); | ||
if (articleContent.getElementsByTagName('h2').length === 1) clean(articleContent, "h2"); | ||
@@ -648,41 +642,55 @@ clean(articleContent, "iframe"); | ||
**/ | ||
function initializeNode (node) { | ||
node.readability = { | ||
"contentScore": 0 | ||
}; | ||
function initializeNode(node) { | ||
node.readability = { contentScore: 0 }; | ||
switch (node.tagName) { | ||
case 'DIV': | ||
node.readability.contentScore += 5; | ||
break; | ||
case 'ARTICLE': | ||
node.readability.contentScore += 10; | ||
break; | ||
case 'PRE': | ||
case 'TD': | ||
case 'BLOCKQUOTE': | ||
node.readability.contentScore += 3; | ||
break; | ||
case 'SECTION': | ||
node.readability.contentScore += 8; | ||
break; | ||
case 'ADDRESS': | ||
case 'OL': | ||
case 'UL': | ||
case 'DL': | ||
case 'DD': | ||
case 'DT': | ||
case 'LI': | ||
case 'FORM': | ||
node.readability.contentScore -= 3; | ||
break; | ||
case 'DIV': | ||
node.readability.contentScore += 5; | ||
break; | ||
case 'H1': | ||
case 'H2': | ||
case 'H3': | ||
case 'H4': | ||
case 'H5': | ||
case 'H6': | ||
case 'TH': | ||
node.readability.contentScore -= 5; | ||
break; | ||
case 'PRE': | ||
case 'TD': | ||
case 'BLOCKQUOTE': | ||
node.readability.contentScore += 3; | ||
break; | ||
case 'ADDRESS': | ||
case 'OL': | ||
case 'UL': | ||
case 'DL': | ||
case 'DD': | ||
case 'DT': | ||
case 'LI': | ||
case 'FORM': | ||
node.readability.contentScore -= 3; | ||
break; | ||
case 'H1': | ||
case 'H2': | ||
case 'H3': | ||
case 'H4': | ||
case 'H5': | ||
case 'H6': | ||
case 'TH': | ||
node.readability.contentScore -= 5; | ||
break; | ||
} | ||
if (node.attributes.itemscope) { | ||
node.readability.contentScore += 5; | ||
if (node.attributes.itemtype && | ||
/blog|post|article/i.test(node.getAttribute('itemtype'))) { | ||
node.readability.contentScore += 30; | ||
} | ||
} | ||
node.readability.contentScore += getClassWeight(node); | ||
} |
@@ -7,3 +7,3 @@ var jsdom = require('jsdom'); | ||
exports.debug = function (debug) { | ||
exports.debug = function(debug) { | ||
helpers.debug(debug); | ||
@@ -43,3 +43,3 @@ }; | ||
Readability.prototype.getContent = function (notDeprecated) { | ||
Readability.prototype.getContent = function(notDeprecated) { | ||
if (!notDeprecated) { | ||
@@ -64,3 +64,3 @@ console.warn('The method `getContent()` is deprecated, using `content` property instead.'); | ||
Readability.prototype.getTitle = function (notDeprecated) { | ||
Readability.prototype.getTitle = function(notDeprecated) { | ||
if (!notDeprecated) { | ||
@@ -78,3 +78,3 @@ console.warn('The method `getTitle()` is deprecated, using `title` property instead.'); | ||
var self = this; | ||
commonSeparatingCharacters.forEach(function (char) { | ||
commonSeparatingCharacters.forEach(function(char) { | ||
var tmpArray = title.split(char); | ||
@@ -94,3 +94,3 @@ if (tmpArray.length > 1) { | ||
Readability.prototype.getDocument = function (notDeprecated) { | ||
Readability.prototype.getDocument = function(notDeprecated) { | ||
if (!notDeprecated) { | ||
@@ -102,3 +102,3 @@ console.warn('The method `getDocument()` is deprecated, using `document` property instead.'); | ||
Readability.prototype.getHTML = function (notDeprecated) { | ||
Readability.prototype.getHTML = function(notDeprecated) { | ||
if (!notDeprecated) { | ||
@@ -110,3 +110,3 @@ console.warn('The method `getHTML()` is deprecated, using `html` property instead.'); | ||
function _findHTMLCharset(htmlbuffer){ | ||
function _findHTMLCharset(htmlbuffer) { | ||
@@ -116,9 +116,9 @@ var body = htmlbuffer.toString("ascii"), | ||
if(meta = body.match(/<meta\s+http-equiv=["']content-type["'][^>]*?>/i)){ | ||
if (meta = body.match(/<meta\s+http-equiv=["']content-type["'][^>]*?>/i)) { | ||
input = meta[0]; | ||
} | ||
if(input){ | ||
if (input) { | ||
charset = input.match(/charset\s?=\s?([a-zA-Z\-0-9]*);?/); | ||
if(charset){ | ||
if (charset) { | ||
charset = (charset[1] || "").trim().toLowerCase(); | ||
@@ -128,3 +128,3 @@ } | ||
if(!charset && (meta = body.match(/<meta\s+charset=["'](.*?)["']/i))){ | ||
if (!charset && (meta = body.match(/<meta\s+charset=["'](.*?)["']/i))) { | ||
charset = (meta[1] || "").trim().toLowerCase(); | ||
@@ -136,4 +136,4 @@ } | ||
function _parseContentType(str){ | ||
if(!str){ | ||
function _parseContentType(str) { | ||
if (!str) { | ||
return {}; | ||
@@ -145,6 +145,6 @@ } | ||
for(var i=0, len = parts.length; i<len; i++){ | ||
for (var i = 0, len = parts.length; i < len; i++) { | ||
chparts = parts[i].split("="); | ||
if(chparts.length>1){ | ||
if(chparts[0].trim().toLowerCase() == "charset"){ | ||
if (chparts.length > 1) { | ||
if (chparts[0].trim().toLowerCase() == "charset") { | ||
charset = chparts[1]; | ||
@@ -156,5 +156,5 @@ } | ||
return { | ||
mimeType: (mimeType || "").trim().toLowerCase(), | ||
mimeType: (mimeType || "").trim().toLowerCase(), | ||
charset: (charset || "UTF-8").trim().toLowerCase() // defaults to UTF-8 | ||
} | ||
}; | ||
} | ||
@@ -174,3 +174,3 @@ | ||
request(html, options, function(err, res, buffer) { | ||
if(err) { | ||
if (err) { | ||
return callback(err); | ||
@@ -181,9 +181,9 @@ } | ||
if(content_type.mimeType == "text/html"){ | ||
if (content_type.mimeType == "text/html") { | ||
content_type.charset = _findHTMLCharset(buffer) || content_type.charset; | ||
} | ||
content_type.charset = (overrideEncoding || content_type.charset || "utf-8").trim().toLowerCase(); | ||
content_type.charset = (overrideEncoding || content_type.charset || "utf-8").trim().toLowerCase(); | ||
if(!content_type.charset.match(/^utf-?8$/i)){ | ||
if (!content_type.charset.match(/^utf-?8$/i)) { | ||
buffer = encodinglib.convert(buffer, "UTF-8", content_type.charset); | ||
@@ -207,4 +207,4 @@ } | ||
html: body, | ||
done: function (errors, window) { | ||
if(meta) { | ||
done: function(errors, window) { | ||
if (meta) { | ||
window.document.originalURL = meta.request.uri.href; | ||
@@ -211,0 +211,0 @@ } else { |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
3552331
16
1249
106