node-readability
Advanced tools
Comparing version 0.0.2 to 0.0.3
@@ -1,9 +0,13 @@ | ||
var readability = require('../src/readability'); | ||
var readability = require('../src/readability') | ||
, fs = require('fs') | ||
// uncomment the following line to print the debug info to console. | ||
// readability.debug(true); | ||
readability.debug(true); | ||
readability.read('http://howtonode.org/really-simple-file-uploads', function(err, read) { | ||
console.log(read.getContent()); | ||
console.log(read.getTitle()); | ||
}); | ||
readability.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html', | ||
function(err, read) { | ||
var dom = read.getDocument(); | ||
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.getTitle()+'</h1>'+read.getContent()+'</body></html>'; | ||
console.log(html); | ||
}); |
{ | ||
"name": "node-readability", | ||
"version": "0.0.2", | ||
"version": "0.0.3", | ||
"author": "Zihua Li", | ||
@@ -5,0 +5,0 @@ "description": "Turn any web page into a clean view", |
# node-readability | ||
Turn any web page into a clean view. It's based on arc90's readability project. | ||
Turn any web page into a clean view. This module is based on arc90's readability project. | ||
@@ -10,3 +10,2 @@ ## Install | ||
## Requirements | ||
* [node.js](http://nodejs.org/) | ||
* [jsdom](https://github.com/tmpvar/jsdom) | ||
@@ -23,3 +22,3 @@ * [fetch](https://github.com/andris9/fetch) | ||
* **options** is an optional options object | ||
* **callback** is the callback to run - `callback(error, read)` | ||
* **callback** is the callback to run - `callback(error, article)` | ||
@@ -30,3 +29,2 @@ Example | ||
// source file is iso-8859-15 but it is converted to utf-8 automatically | ||
readability.read('http://howtonode.org/really-simple-file-uploads', function(err, article) { | ||
@@ -36,6 +34,8 @@ console.log(article.getArticleContent()); | ||
**NB** If the file has been marked with charset other than utf-8, it is converted automatically. Charsets such as GBK, GB2312 is also supported via icon. | ||
**NB** If the file has been marked with a charset other than utf-8, it is converted automatically. Charsets such as GBK, GB2312 are also supported via [iconv](https://github.com/bnoordhuis/node-iconv). | ||
## Options | ||
node-readability supports all the options that [fetch](https://github.com/andris9/fetch) supports. | ||
Possible option values | ||
@@ -61,4 +61,2 @@ | ||
Readability supports lazy evaluation by passing `readResult` to the callback function. | ||
### getContent() | ||
@@ -82,4 +80,4 @@ | ||
* Support more readability features | ||
* Performance optimization | ||
* Support more readability features | ||
* Performance optimization | ||
@@ -86,0 +84,0 @@ ## License |
var jsdom = require('jsdom'), | ||
fetchUrl = require("fetch").fetchUrl | ||
fetchUrl = require("fetch").fetchUrl, | ||
helpers = require('./helpers') | ||
var dbg; | ||
exports.debug = function (debug) { | ||
dbg = (debug) ? console.log : function () {} | ||
helpers.debug(debug); | ||
}; | ||
@@ -11,21 +11,2 @@ | ||
/**
 * Strip leading and trailing whitespace from a string.
 *
 * @param {string} string
 * @return {string} the trimmed string
 */
function trim(string) {
  // Bug fix: the previous pattern /(^\s*)(\s*$)/g required the two anchored
  // groups to be adjacent, so it only matched strings that were entirely
  // whitespace — "  x  " came back unchanged. Match each edge independently.
  return string.replace(/^\s+|\s+$/g, '');
}
// All of the regular expressions in use within readability.
var regexps = {
  // class/id substrings that flag a node as probable boilerplate (comments, nav, ads, ...).
  unlikelyCandidatesRe: /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
  // ...unless one of these content-ish words also appears in the class/id.
  okMaybeItsACandidateRe: /and|article|body|column|main/i,
  // class/id hints that raise a node's class weight (+25 in getClassWeight).
  positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i,
  // class/id hints that lower a node's class weight (-25 in getClassWeight).
  negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|utility|tags|widget/i,
  // block-level tags; a <div> whose innerHTML matches none of these is demoted to a <p>.
  divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  // two or more consecutive <br>s (with interleaved whitespace) — treated as a paragraph break.
  replaceBrsRe: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
  // <font> open/close tags, rewritten to <span> (capture keeps the optional slash).
  replaceFontsRe: /<(\/?)font[^>]*>/gi,
  // leading or trailing whitespace.
  trimRe: /^\s+|\s+$/g,
  // runs of two-plus whitespace characters, collapsed to a single space.
  normalizeRe: /\s{2,}/g,
  // one or more <br>s plus trailing whitespace, collapsed to a single <br />.
  // NOTE(review): the "( ?)" alternative looks like a decoded "&nbsp;?" from the
  // original source — confirm against upstream arc90 readability.
  killBreaksRe: /(<br\s*\/?>(\s| ?)*){1,}/g,
  // video-host URLs whose <object>/<embed> nodes should be preserved.
  videoRe: /http:\/\/(www\.)?(youtube|vimeo|youku|tudou|56|yinyuetai)\.com/i
};
function Readablity(document, options) { | ||
@@ -40,3 +21,3 @@ this._document = document; | ||
prepDocument(this._document); | ||
helpers.prepDocument(this._document); | ||
this.cache = { | ||
@@ -52,7 +33,7 @@ 'body': this._document.body.innerHTML | ||
articleContent = grabArticle(this._document); | ||
if (getInnerText(articleContent, false) === "") { | ||
articleContent = helpers.grabArticle(this._document); | ||
if (helpers.getInnerText(articleContent, false) === "") { | ||
this._document.body.innerHTML = this.cache['body']; | ||
articleContent = grabArticle(this._document, true); | ||
if (getInnerText(articleContent, false) === "") { | ||
articleContent = helpers.grabArticle(this._document, true); | ||
if (helpers.getInnerText(articleContent, false) === "") { | ||
return this.cache['article-content'] = false; | ||
@@ -72,8 +53,10 @@ } | ||
var betterTitle; | ||
var commonSeparatingCharacters = ['|', '_', '-', '«']; | ||
var commonSeparatingCharacters = [' | ', ' _ ', ' - ', '«', '»', '—']; | ||
var self = this; | ||
commonSeparatingCharacters.forEach(function (char) { | ||
var tmpArray = title.split(char); | ||
if (tmpArray.length > 1) { | ||
if (betterTitle) return this.cache['article-title'] = title; | ||
betterTitle = trim(tmpArray[0]); | ||
if (betterTitle) return self.cache['article-title'] = title; | ||
betterTitle = tmpArray[0].trim(); | ||
} | ||
@@ -97,550 +80,2 @@ }); | ||
/**
 * Prepare the HTML document for readability to scrape it.
 * This includes things like stripping javascript, CSS, and handling terrible markup.
 *
 * @param {Document} document
 * @return void
 **/
function prepDocument(document) {
  // Frameset pages: promote the content of the largest accessible frame to be
  // the document body so the rest of the pipeline has something to work on.
  var frames = document.getElementsByTagName('frame');
  if (frames.length > 0) {
    var bestFrame = null;
    var bestFrameSize = 0;
    // NOTE(review): relies on the frame collection exposing forEach — standard
    // HTMLCollection has no forEach, so this presumably depends on jsdom's
    // array-like return value; confirm against the jsdom version in use.
    frames.forEach(function (frame) {
      var frameSize = frame.offsetWidth + frame.offsetHeight;
      var canAccessFrame = false;
      try {
        // Touching the body throws for frames we are not allowed to read.
        frame.contentWindow.document.body;
        canAccessFrame = true;
      } catch (e) {}
      if (canAccessFrame && frameSize > bestFrameSize) {
        bestFrame = frame;
        bestFrameSize = frameSize;
      }
    });
    if (bestFrame) {
      var newBody = document.createElement('body');
      newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
      newBody.style.overflow = 'scroll';
      document.body = newBody;
      var frameset = document.getElementsByTagName('frameset')[0];
      if (frameset) {
        frameset.parentNode.removeChild(frameset);
      }
    }
  }
  // Remove all scripts.
  // Bug fix: getElementsByTagName returns a live collection in standard DOM,
  // so the previous forward loop (++i with removeChild) skipped every other
  // script. Iterating backwards is correct for both live and static lists.
  var scripts = document.getElementsByTagName('script');
  for (var i = scripts.length - 1; i >= 0; --i) {
    scripts[i].parentNode.removeChild(scripts[i]);
  }
  // Disable all stylesheets rather than removing the nodes.
  for (var k = 0; k < document.styleSheets.length; k++) {
    document.styleSheets[k].disabled = true;
  }
  // Turn all double br's into p's, and <font> tags into <span>s.
  // Note, this is pretty costly as far as processing goes. Maybe optimize later.
  document.body.innerHTML = document.body.innerHTML.replace(regexps.replaceBrsRe, '</p><p>').replace(regexps.replaceFontsRe, '<$1span>');
}
/***
 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 *
 * @param Document document - the (already prepped) document to mine
 * @param boolean preserveUnlikelyCandidates - when truthy, skip the boilerplate purge
 *        (used by the caller as a second, more permissive pass when the first pass found nothing)
 * @return Element
 **/
function grabArticle(document, preserveUnlikelyCandidates) {
  /**
   * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
   * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
   *
   * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
   * TODO: Shouldn't this be a reverse traversal?
   **/
  var nodes = document.getElementsByTagName('*');
  // NOTE(review): if this collection is live (standard DOM behaviour), the
  // removeChild/replaceChild calls below shift it while i advances, so some
  // nodes can be skipped — confirm against the jsdom version in use.
  for (var i = 0; i < nodes.length; ++i) {
    var node = nodes[i];
    // Remove unlikely candidates */
    // continueFlag emulates `continue`: once a node has been removed or
    // rewritten it must not be processed further in this iteration.
    var continueFlag = false;
    if (!preserveUnlikelyCandidates) {
      // Boilerplate test runs over class name and id concatenated.
      var unlikelyMatchString = node.className + node.id;
      if (unlikelyMatchString.search(regexps.unlikelyCandidatesRe) !== -1 && unlikelyMatchString.search(regexps.okMaybeItsACandidateRe) == -1 && node.tagName !== "BODY") {
        dbg("Removing unlikely candidate - " + unlikelyMatchString);
        node.parentNode.removeChild(node);
        continueFlag = true;
      }
    }
    // Turn all divs that don't have children block level elements into p's
    if (!continueFlag && node.tagName === "DIV") {
      if (node.innerHTML.search(regexps.divToPElementsRe) === -1) {
        dbg("Altering div to p");
        var newNode = document.createElement('p');
        newNode.innerHTML = node.innerHTML;
        node.parentNode.replaceChild(newNode, node);
      } else {
        // EXPERIMENTAL
        // NOTE(review): relies on childNodes exposing forEach (jsdom-specific
        // for the era; standard NodeList.forEach came much later) — confirm.
        node.childNodes.forEach(function (childNode) {
          if (childNode.nodeType == 3 /*TEXT_NODE*/ ) {
            dbg("replacing text node with a p tag with the same content.");
            // Wrap bare text in an inline, style-protected <p> so the text
            // survives later scoring/cleanup passes.
            var p = document.createElement('p');
            p.innerHTML = childNode.nodeValue;
            p.style.display = 'inline';
            p.className = 'readability-styled';
            childNode.parentNode.replaceChild(p, childNode);
          }
        });
      }
    }
  }
  /**
   * Loop through all paragraphs, and assign a score to them based on how content-y they look.
   * Then add their score to their parent node.
   *
   * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
   **/
  var allParagraphs = document.getElementsByTagName("p");
  var candidates = [];
  for (var i = 0; i < allParagraphs.length; ++i) {
    var paragraph = allParagraphs[i];
    var parentNode = paragraph.parentNode;
    var grandParentNode = parentNode.parentNode;
    var innerText = getInnerText(paragraph);
    // If this paragraph is less than 25 characters, don't even count it.
    if (innerText.length >= 25) {
      // Initialize readability data for the parent.
      if (typeof parentNode.readability == 'undefined') {
        initializeNode(parentNode);
        candidates.push(parentNode);
      }
      // Initialize readability data for the grandparent.
      if (typeof grandParentNode.readability == 'undefined') {
        initializeNode(grandParentNode);
        candidates.push(grandParentNode);
      }
      var contentScore = 0;
      // Add a point for the paragraph itself as a base. */
      ++contentScore;
      // Add points for any commas within this paragraph */
      contentScore += innerText.split(',').length;
      // For every 100 characters in this paragraph, add another point. Up to 3 points. */
      contentScore += Math.min(Math.floor(innerText.length / 100), 3);
      // Add the score to the parent. The grandparent gets half. */
      parentNode.readability.contentScore += contentScore;
      grandParentNode.readability.contentScore += contentScore / 2;
    }
  }
  /**
   * After we've calculated scores, loop through all of the possible candidate nodes we found
   * and find the one with the highest score.
   **/
  var topCandidate = null;
  candidates.forEach(function (candidate) {
    /**
     * Scale the final candidates score based on link density. Good content should have a
     * relatively small link density (5% or less) and be mostly unaffected by this operation.
     **/
    candidate.readability.contentScore = candidate.readability.contentScore * (1 - getLinkDensity(candidate));
    dbg('Candidate: ' + candidate + " (" + candidate.className + ":" + candidate.id + ") with score " + candidate.readability.contentScore);
    if (!topCandidate || candidate.readability.contentScore > topCandidate.readability.contentScore) topCandidate = candidate;
  });
  /**
   * If we still have no top candidate, just use the body as a last resort.
   * We also have to copy the body node so it is something we can modify.
   **/
  if (topCandidate === null || topCandidate.tagName === "BODY") {
    topCandidate = document.createElement("DIV");
    topCandidate.innerHTML = document.body.innerHTML;
    document.body.innerHTML = "";
    document.body.appendChild(topCandidate);
    initializeNode(topCandidate);
  }
  /**
   * Now that we have the top candidate, look through its siblings for content that might also be related.
   * Things like preambles, content split by ads that we removed, etc.
   **/
  var articleContent = document.createElement("DIV");
  articleContent.id = "readability-content";
  // Siblings must score at least 20% of the winner (floor of 10) to be kept.
  var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
  var siblingNodes = topCandidate.parentNode.childNodes;
  for (var i = 0, il = siblingNodes.length; i < il; i++) {
    var siblingNode = siblingNodes[i];
    var append = false;
    dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
    dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
    if (siblingNode === topCandidate) {
      append = true;
    }
    if (typeof siblingNode.readability != 'undefined' && siblingNode.readability.contentScore >= siblingScoreThreshold) {
      append = true;
    }
    // Low-scoring <p> siblings still make the cut if they read like prose:
    // long with few links, or short, link-free, and ending in a sentence.
    if (siblingNode.nodeName == "P") {
      var linkDensity = getLinkDensity(siblingNode);
      var nodeContent = getInnerText(siblingNode);
      var nodeLength = nodeContent.length;
      if (nodeLength > 80 && linkDensity < 0.25) {
        append = true;
      } else if (nodeLength < 80 && linkDensity == 0 && nodeContent.search(/\.( |$)/) !== -1) {
        append = true;
      }
    }
    if (append) {
      dbg("Appending node: " + siblingNode)
      /* Append sibling and subtract from our list because it removes the node when you append to another node */
      articleContent.appendChild(siblingNode);
      i--;
      il--;
    }
  }
  /**
   * So we have all of the content that we need. Now we clean it up for presentation.
   **/
  prepArticle(articleContent);
  return articleContent;
};
/**
 * Recursively strip the style attribute from an element and every element in
 * its subtree, leaving nodes flagged "readability-styled" untouched (those
 * carry styling that readability itself added on purpose).
 *
 * @param Element e - root of the subtree to clean (may be null)
 * @return void
 **/
function cleanStyles(e) {
  if (!e) return;
  // Clear the style on this node itself, when the node supports it.
  var canClean = typeof e.removeAttribute == 'function' && e.className != 'readability-styled';
  if (canClean) e.removeAttribute('style');
  // Walk the direct children; recurse into each element child's subtree.
  for (var child = e.firstChild; child; child = child.nextSibling) {
    if (child.nodeType != 1) continue; // only element nodes carry style attributes
    if (child.className != "readability-styled") {
      child.removeAttribute("style");
    }
    cleanStyles(child);
  }
}
/**
 * Collapse runs of consecutive <br> tags (plus any interleaved whitespace)
 * inside a node's markup down to a single <br />.
 *
 * @param Element node
 * @return void
 **/
function killBreaks(node) {
  var collapsed = node.innerHTML.replace(regexps.killBreaksRe, '<br />');
  node.innerHTML = collapsed;
}
/**
 * Get the inner text of a node - cross browser compatibly.
 * This also strips out any excess whitespace to be found.
 *
 * @param Element e
 * @param boolean normalizeSpaces - collapse internal whitespace runs to a
 *        single space; defaults to true when omitted
 * @return string
 **/
function getInnerText(e, normalizeSpaces) {
  // Default to collapsing whitespace unless the caller opted out explicitly.
  if (typeof normalizeSpaces == 'undefined') normalizeSpaces = true;
  var text = e.textContent.replace(regexps.trimRe, "");
  return normalizeSpaces ? text.replace(regexps.normalizeRe, " ") : text;
}
/**
 * Get the number of times a string s appears in the node e.
 * (Counted as split-segments, so the result is occurrences + 1.)
 *
 * @param Element e
 * @param string s - what to split on. Default is ","
 * @return number (integer)
 **/
function getCharCount(e, s) {
  var separator = s || ",";
  return getInnerText(e).split(separator).length;
}
/**
 * Get the density of links as a percentage of the content
 * This is the amount of text that is inside a link divided by the total text in the node.
 *
 * @param Element e
 * @return number (float) in [0, 1]
 **/
function getLinkDensity(e) {
  var links = e.getElementsByTagName("a");
  var textLength = getInnerText(e).length;
  // Bug fix: a node with no visible text previously produced 0/0 = NaN here,
  // and that NaN then poisoned every contentScore computed from it
  // (score * (1 - NaN) === NaN). Treat an empty node as link-free.
  if (textLength === 0) return 0;
  var linkLength = 0;
  for (var i = 0, il = links.length; i < il; i++) {
    linkLength += getInnerText(links[i]).length;
  }
  return linkLength / textLength;
}
/**
 * Get an elements class/id weight. Uses regular expressions to tell if this
 * element looks good or bad.
 *
 * @param Element e
 * @return number (Integer) - each of className/id contributes -25 for a
 *         negative match and +25 for a positive match
 **/
function getClassWeight(e) {
  var weight = 0;
  // Score one attribute value: penalize boilerplate-ish words, reward content-ish ones.
  var score = function (value) {
    if (value.search(regexps.negativeRe) !== -1) weight -= 25;
    if (value.search(regexps.positiveRe) !== -1) weight += 25;
  };
  /* Look for a special classname */
  if (e.className != "") score(e.className);
  /* Look for a special ID */
  if (typeof (e.id) == 'string' && e.id != "") score(e.id);
  return weight;
}
/**
 * Clean a node of all elements of type "tag".
 * (Unless it's a youtube/vimeo video. People love movies.)
 *
 * @param Element e - container to clean
 * @param string tag - tag to clean
 * @return void
 **/
function clean(e, tag) {
  var isEmbed = tag == 'object' || tag == 'embed';
  var targets = e.getElementsByTagName(tag);
  // Walk backwards so removals cannot disturb the (possibly live) collection.
  for (var idx = targets.length - 1; idx >= 0; idx--) {
    var el = targets[idx];
    /* Allow youtube and vimeo videos through as people usually want to see those. */
    if (isEmbed && el.innerHTML.search(regexps.videoRe) !== -1) {
      continue;
    }
    el.parentNode.removeChild(el);
  }
}
/**
 * Clean an element of all tags of type "tag" if they look fishy.
 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
 *
 * @param Element e - container to scan
 * @param string tag - tag name to consider for removal (e.g. "table", "ul", "div")
 * @return void
 **/
function cleanConditionally(e, tag) {
  var tagsList = e.getElementsByTagName(tag);
  var curTagsLength = tagsList.length;
  /**
   * Gather counts for other typical elements embedded within.
   * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
   *
   * TODO: Consider taking into account original contentScore here.
   **/
  for (var i = curTagsLength - 1; i >= 0; i--) {
    var weight = getClassWeight(tagsList[i]);
    dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability != 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
    if (weight < 0) {
      // A negative class/id weight alone is enough to drop the node.
      tagsList[i].parentNode.removeChild(tagsList[i]);
    } else if (getCharCount(tagsList[i], ',') < 10) {
      /**
       * If there are not very many commas, and the number of
       * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
       **/
      var p = tagsList[i].getElementsByTagName("p").length;
      var img = tagsList[i].getElementsByTagName("img").length;
      // NOTE(review): the -100 offset makes the li-vs-p test below pass only
      // for nodes with over ~100 list items; presumably inherited from arc90
      // readability's "length - 100" heuristic — confirm intent.
      var li = tagsList[i].getElementsByTagName("li").length - 100;
      var input = tagsList[i].getElementsByTagName("input").length;
      // Count embeds, but recognized video embeds don't count against the node.
      var embedCount = 0;
      var embeds = tagsList[i].getElementsByTagName("embed");
      for (var ei = 0, il = embeds.length; ei < il; ei++) {
        if (embeds[ei].src.search(regexps.videoRe) == -1) {
          embedCount++;
        }
      }
      var linkDensity = getLinkDensity(tagsList[i]);
      var contentLength = getInnerText(tagsList[i]).length;
      var toRemove = false;
      if (img > p) {
        // More images than paragraphs: probably a gallery/ad block.
        toRemove = true;
      } else if (li > p && tag != "ul" && tag != "ol") {
        // List-dominated, and we are not cleaning a list tag itself.
        toRemove = true;
      } else if (input > Math.floor(p / 3)) {
        // Form-heavy relative to its prose.
        toRemove = true;
      } else if (contentLength < 25 && (img == 0 || img > 2)) {
        // Nearly no text, and not a simple one-or-two-image figure.
        toRemove = true;
      } else if (weight < 25 && linkDensity > .2) {
        // Weakly-rated node with noticeable link density.
        toRemove = true;
      } else if (weight >= 25 && linkDensity > .5) {
        // Even well-rated nodes go if they are mostly links.
        toRemove = true;
      } else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) {
        // A lone embed with little text, or multiple embeds.
        toRemove = true;
      }
      if (toRemove) {
        tagsList[i].parentNode.removeChild(tagsList[i]);
      }
    }
  }
}
/**
 * Clean out spurious headers from an Element. Checks things like classnames and link density.
 *
 * @param Element e - container whose h1..h6 descendants are vetted
 * @return void
 **/
function cleanHeaders(e) {
  // Examine each heading level in turn.
  for (var level = 1; level < 7; level++) {
    var headers = e.getElementsByTagName('h' + level);
    // Backwards, so removals cannot disturb the (possibly live) collection.
    for (var i = headers.length - 1; i >= 0; i--) {
      var header = headers[i];
      // Drop headings with boilerplate-ish class/id or that are mostly links.
      var suspicious = getClassWeight(header) < 0 || getLinkDensity(header) > 0.33;
      if (suspicious) {
        header.parentNode.removeChild(header);
      }
    }
  }
}
/**
 * Final cleanup of the extracted article node before it is handed back:
 * strips inline styles, collapses breaks, removes junk elements, fishy
 * containers, and empty paragraphs.
 *
 * @param Element articleContent
 * @return void
 **/
function prepArticle(articleContent) {
  cleanStyles(articleContent);
  killBreaks(articleContent);
  /* Clean out junk from the article content */
  clean(articleContent, "form");
  clean(articleContent, "object");
  clean(articleContent, "h1");
  /**
   * If there is only one h2, they are probably using it
   * as a header and not a subheader, so remove it since we already have a header.
   ***/
  if (articleContent.getElementsByTagName('h2').length == 1) clean(articleContent, "h2");
  clean(articleContent, "iframe");
  cleanHeaders(articleContent);
  /* Do these last as the previous stuff may have removed junk that will affect these */
  cleanConditionally(articleContent, "table");
  cleanConditionally(articleContent, "ul");
  cleanConditionally(articleContent, "div");
  /* Remove extra paragraphs */
  var articleParagraphs = articleContent.getElementsByTagName('p');
  // Bug fix: `i` was previously assigned without `var`, leaking an implicit
  // global (and clobbering any other sloppy `i` in scope).
  for (var i = articleParagraphs.length - 1; i >= 0; i--) {
    var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
    var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
    var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
    // A paragraph with no media and no text is pure markup noise.
    if (imgCount == 0 && embedCount == 0 && objectCount == 0 && getInnerText(articleParagraphs[i], false) == '') {
      articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
    }
  }
  try {
    articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
  } catch (e) {
    dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.");
  }
}
/**
 * Initialize a node with the readability object. Also checks the
 * className/id for special names to add to its score.
 *
 * @param Element node
 * @return void
 **/
function initializeNode(node) {
  // Base score by element type (uppercase tag names, so there is no clash
  // with Object.prototype keys): containers that usually hold prose get a
  // bonus; list/definition/form/heading elements get a penalty.
  var tagScores = {
    DIV: 5,
    PRE: 3, TD: 3, BLOCKQUOTE: 3,
    ADDRESS: -3, OL: -3, UL: -3, DL: -3, DD: -3, DT: -3, LI: -3, FORM: -3,
    H1: -5, H2: -5, H3: -5, H4: -5, H5: -5, H6: -5, TH: -5
  };
  var base = tagScores[node.tagName] || 0;
  node.readability = {
    "contentScore": base + getClassWeight(node)
  };
}
function read(html, options, callback) { | ||
@@ -647,0 +82,0 @@ if (typeof options === 'function') { |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Filesystem access
Supply chain riskAccesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
25423
5
596
81
1
1