node-readability
Advanced tools
Comparing version 2.2.0 to 3.0.0
@@ -0,1 +1,10 @@ | ||
<a name="3.0.0"></a> | ||
## [3.0.0](https://github.com/luin/node-readability/compare/v2.2.0...v3.0.0) (2017-08-10) | ||
Thanks to [Harold Treen](https://github.com/haroldtreen) and other contributors for the awesome work! | ||
* Replace text node followed by br with a p tag | ||
* Remove lightbox from unlikelyCandidatesRe | ||
* Fix for psychology-today + test | ||
<a name="2.2.0"></a> | ||
@@ -2,0 +11,0 @@ ## [2.2.0](https://github.com/luin/node-readability/compare/v2.1.5...v2.2.0) (2016-03-11) |
{ | ||
"name": "node-readability", | ||
"version": "2.2.0", | ||
"version": "3.0.0", | ||
"author": "Zihua Li", | ||
@@ -26,3 +26,3 @@ "description": "Turning any web page into a clean view.", | ||
"encoding": "~0.1.7", | ||
"jsdom": "^6.3.0", | ||
"jsdom": "^9.12.0", | ||
"minimist": "^1.2.0", | ||
@@ -29,0 +29,0 @@ "request": "~2.40.0" |
@@ -22,3 +22,3 @@ # Readability | ||
Note that as of our 2.0.0 release, this module only works with Node.js >= 2.0. In the meantime you are still welcome to install a release in the 1.x series(by `npm install node-readability@1`) if you use an older Node.js version. | ||
Note that as of v2.0.0, this module only works with Node.js >= 2.0. In the meantime, you are still welcome to install a release in the 1.x series (by running `npm install node-readability@1`) if you use an older Node.js version. | ||
@@ -128,3 +128,3 @@ ## Usage | ||
This lib is using jsdom to parser HTML instead of cheerio because some data such as image size and element visibility isn't able to acquire when using cheerio, which will significantly affect the result. | ||
This library uses jsdom to parse HTML instead of cheerio because some data, such as image size and element visibility, cannot be obtained when using cheerio, and the lack of it would significantly affect the result. | ||
@@ -131,0 +131,0 @@ ## Contributors |
@@ -1,2 +0,2 @@ | ||
#!/usr/bin/env iojs | ||
#!/usr/bin/env node | ||
var read = require("./readability.js"); | ||
@@ -3,0 +3,0 @@ var argv = require("minimist")(process.argv.slice(2)); |
@@ -5,5 +5,5 @@ var url = require("url"); | ||
var regexps = { | ||
unlikelyCandidatesRe: /combx|modal|lightbox|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor|social|teaserlist|time|tweet|twitter/i, | ||
okMaybeItsACandidateRe: /and|article|body|column|main/i, | ||
positiveRe: /article|body|content|entry|hentry|page|pagination|post|text/i, | ||
unlikelyCandidatesRe: /combx|modal|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor|social|teaserlist|time|tweet|twitter/i, | ||
okMaybeItsACandidateRe: /and|article|body|column|main|story|entry|^post/im, | ||
positiveRe: /article|body|content|entry|hentry|page|pagination|post|section|chapter|description|main|blog|text/i, | ||
negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|utility|tags|widget/i, | ||
@@ -16,3 +16,4 @@ divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, | ||
killBreaksRe: /(<br\s*\/?>(\s| ?)*){1,}/g, | ||
videoRe: /http:\/\/(www\.)?(youtube|vimeo|youku|tudou|56|yinyuetai)\.com/i | ||
videoRe: /http:\/\/(www\.)?(youtube|vimeo|youku|tudou|56|yinyuetai)\.com/i, | ||
attributeRe: /blog|post|article/i | ||
}; | ||
@@ -69,2 +70,8 @@ | ||
} | ||
// Strip out all <script> tags, as they *should* be useless | ||
var scripts = document.getElementsByTagName('script'); | ||
[].forEach.call(scripts, function (node) { | ||
node.parentNode.removeChild(node); | ||
}); | ||
@@ -96,3 +103,3 @@ // turn all double br's into p's | ||
if (!preserveUnlikelyCandidates) { | ||
var unlikelyMatchString = node.className + node.id; | ||
var unlikelyMatchString = node.className + '\n' + node.id; | ||
if (unlikelyMatchString.search(regexps.unlikelyCandidatesRe) !== -1 && unlikelyMatchString.search(regexps.okMaybeItsACandidateRe) == -1 && node.tagName !== 'HTML' && node.tagName !== "BODY") { | ||
@@ -116,7 +123,16 @@ dbg("Removing unlikely candidate - " + unlikelyMatchString); | ||
if (childNode.nodeType == 3 /*TEXT_NODE*/ ) { | ||
// use span instead of p. Need more tests. | ||
dbg("replacing text node with a span tag with the same content."); | ||
var span = document.createElement('span'); | ||
span.innerHTML = childNode.nodeValue; | ||
childNode.parentNode.replaceChild(span, childNode); | ||
var nextSibling = childNode.nextSibling | ||
if (nextSibling && nextSibling.tagName == 'BR') { | ||
dbg("replacing text node followed by br with a p tag with the same content."); | ||
var p = document.createElement('p'); | ||
p.innerHTML = childNode.nodeValue; | ||
childNode.parentNode.removeChild(nextSibling) | ||
childNode.parentNode.replaceChild(p, childNode); | ||
} else { | ||
// use span instead of p. Need more tests. | ||
dbg("replacing text node with a span tag with the same content."); | ||
var span = document.createElement('span'); | ||
span.innerHTML = childNode.nodeValue; | ||
childNode.parentNode.replaceChild(span, childNode); | ||
} | ||
} | ||
@@ -307,3 +323,3 @@ }); | ||
**/ | ||
getInnerText = exports.getInnerText = function(e, normalizeSpaces) { | ||
var getInnerText = exports.getInnerText = function(e, normalizeSpaces) { | ||
var textContent = ""; | ||
@@ -658,3 +674,3 @@ | ||
if (node.attributes.itemtype && | ||
/blog|post|article/i.test(node.getAttribute('itemtype'))) { | ||
regexps.attributeRe.test(node.getAttribute('itemtype'))) { | ||
node.readability.contentScore += 30; | ||
@@ -661,0 +677,0 @@ } |
@@ -82,3 +82,3 @@ var jsdom = require('jsdom'); | ||
var title = this._document.title; | ||
var title = _findMetaTitle(this._document) || this._document.title; | ||
var betterTitle; | ||
@@ -142,2 +142,16 @@ var commonSeparatingCharacters = [' | ', ' _ ', ' - ', '«', '»', '—']; | ||
/**
 * Look for a page title declared in social-sharing <meta> tags.
 *
 * Walks every <meta> element in document order and returns the `content`
 * attribute of the first one that is either `property="og:title"` or
 * `name="twitter:title"` AND actually carries a non-blank value.
 *
 * @param {Document} document DOM document to search.
 * @return {string|null} The raw `content` value of the first usable title
 *     tag, or null when no such tag exists (callers fall back to
 *     `document.title` via `||`).
 */
function _findMetaTitle(document) {
  var metaTags = document.getElementsByTagName('meta');
  for (var i = 0; i < metaTags.length; i++) {
    var tag = metaTags[i];
    var isTitleTag = tag.getAttribute('property') === 'og:title' ||
      tag.getAttribute('name') === 'twitter:title';
    if (!isTitleTag) {
      continue;
    }
    var content = tag.getAttribute('content');
    // A matching tag whose content is missing or blank carries no title.
    // Keep scanning instead of returning it, so a later og:title /
    // twitter:title tag can still supply one.
    if (content && content.trim()) {
      return content;
    }
  }
  return null;
}
function _findHTMLCharset(htmlbuffer) { | ||
@@ -144,0 +158,0 @@ |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
0
3270951
12
853
+ Added abab@1.0.4 (transitive)
+ Added acorn@4.0.13 (transitive)
+ Added acorn-globals@3.1.0 (transitive)
+ Added array-equal@1.0.2 (transitive)
+ Added content-type-parser@1.0.2 (transitive)
+ Added html-encoding-sniffer@1.0.2 (transitive)
+ Added iconv-lite@0.4.24 (transitive)
+ Added jsdom@9.12.0 (transitive)
+ Added sax@1.4.1 (transitive)
+ Added webidl-conversions@3.0.1, 4.0.2 (transitive)
+ Added whatwg-encoding@1.0.5 (transitive)
+ Added whatwg-url@4.8.0 (transitive)
- Removed acorn@2.7.0 (transitive)
- Removed acorn-globals@1.0.9 (transitive)
- Removed browser-request@0.3.3 (transitive)
- Removed dom-serializer@0.2.2 (transitive)
- Removed domelementtype@1.3.1, 2.3.0 (transitive)
- Removed domhandler@2.4.2 (transitive)
- Removed domutils@1.7.0 (transitive)
- Removed entities@1.1.2, 2.2.0 (transitive)
- Removed htmlparser2@3.10.1 (transitive)
- Removed inherits@2.0.4 (transitive)
- Removed jsdom@6.5.1 (transitive)
- Removed readable-stream@3.6.2 (transitive)
- Removed string_decoder@1.3.0 (transitive)
- Removed util-deprecate@1.0.2 (transitive)
- Removed whatwg-url-compat@0.6.5 (transitive)
- Removed xmlhttprequest@1.8.0 (transitive)
- Removed xtend@4.0.2 (transitive)
Updated jsdom@^9.12.0