node-readability
Advanced tools
Comparing version 0.2.1 to 0.3.0
{ | ||
"name": "node-readability", | ||
"version": "0.2.1", | ||
"version": "0.3.0", | ||
"author": "Zihua Li", | ||
@@ -22,4 +22,5 @@ "description": "Turning any web page into a clean view.", | ||
"dependencies": { | ||
"fetch": "0.3.x", | ||
"jsdom": "0.8.x" | ||
"jsdom": "0.8.x", | ||
"request": "~2.31.0", | ||
"encoding": "~0.1.7" | ||
}, | ||
@@ -34,4 +35,5 @@ "engines": [ | ||
"mocha": "~1.8.2", | ||
"should": "~1.2.2" | ||
"should": "~2.1.1", | ||
"nock": "~0.27.1" | ||
} | ||
} |
@@ -46,22 +46,21 @@ # Readability | ||
node-readability will pass the options to [fetch](https://github.com/andris9/fetch) directly. | ||
node-readability will pass the options to [request](https://github.com/mikeal/request) directly. | ||
See request lib to view all available options. | ||
Possible option values | ||
* **maxRedirects** how many redirects allowed, defaults to 10 | ||
* **disableRedirects** set to true if redirects are not allowed, defaults to false | ||
* **headers** optional header fields, in the form of `{'Header-Field':'value'}` | ||
* **maxResponseLength** maximum allowed length for the file, the remainder is cut off. Defaults to `Infinity` | ||
* **method** defaults to GET | ||
* **payload** request body | ||
* **disableGzip** set to false, to disable content gzipping, needed for Node v0.5.9 which has buggy zlib | ||
* **cookies** an array of cookie definitions in the form of `['name=val']` | ||
* **cookieJar** for sharing cookies between requests, see below | ||
* **outputEncoding** | ||
* **disableDecoding** set to true to disable automatic charset decoding to utf-8 | ||
* **overrideCharset** set input encoding | ||
* **asyncDnsLoookup** use high performance asynchronous DNS resolution based on c-ares instead of a thread pool calling getaddrinfo(3) | ||
* **timeout** set a timeout in ms | ||
* **agent** pass-through http.request agent parameter | ||
node-readability has an additional option, `cleanRulers`, which lets you supply your own validation rules for tags. | ||
Each rule is a callback; when a rule returns `true` the tag is kept, otherwise the default cleaning applies. | ||
options.cleanRulers = [callback(obj, tagName)] | ||
``` | ||
read(url, { | ||
cleanRulers : [ | ||
function(obj, tag) { | ||
if(tag === 'object') { | ||
if(obj.getAttribute('class') === 'BrightcoveExperience') { | ||
return true; | ||
} | ||
} | ||
} | ||
] | ||
}, function(err, article, response) {}); | ||
``` | ||
## article object | ||
@@ -83,2 +82,6 @@ | ||
## meta object | ||
The response object from the request lib. It is useful if you need the final URL after all redirects or want to read the response headers. | ||
The document of the web page generated by jsdom. You can use it to access the DOM directly (for example, `article.document.getElementById('main')`). | ||
@@ -93,1 +96,5 @@ | ||
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0 | ||
[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/luin/node-readability/trend.png)](https://bitdeli.com/free "Bitdeli Badge") | ||
@@ -23,2 +23,8 @@ var url = require("url"); | ||
// User-supplied rule callbacks, each invoked as rule(element, tagName).
// An empty list means no user rule can veto the removal of an element.
var cleanRules = [];

/**
 * Replace the list of user clean rules.
 *
 * @param {Array<Function>} rules Callbacks invoked as rule(element, tagName);
 *   a rule returning exactly `true` keeps the element from being removed.
 *   Anything that is not an array is treated as "no rules", so the loop that
 *   reads `cleanRules.length` never runs against null/undefined.
 */
module.exports.setCleanRules = function(rules) {
    cleanRules = Array.isArray(rules) ? rules : [];
};
/** | ||
@@ -389,8 +395,26 @@ * Prepare the HTML document for readability to scrape it. | ||
for (var y = targetList.length - 1; y >= 0; y--) { | ||
/* Allow youtube and vimeo videos through as people usually want to see those. */ | ||
if (isEmbed && targetList[y].innerHTML.search(regexps.videoRe) !== -1) { | ||
//------- user clean handler ----------------- | ||
var validRule = false; | ||
for(var i = 0; i < cleanRules.length; i++) { | ||
if(cleanRules[i](targetList[y], tag) === true) { | ||
validRule = true; | ||
break; | ||
} | ||
} | ||
if(validRule) { | ||
continue; | ||
} | ||
//------- end user clean handler ----------------- | ||
/* Allow youtube and vimeo videos through as people usually want to see those. */ | ||
if (isEmbed) { | ||
if(targetList[y].innerHTML.search(regexps.videoRe) !== -1) { | ||
continue; | ||
} | ||
} | ||
targetList[y].parentNode.removeChild(targetList[y]); | ||
@@ -397,0 +421,0 @@ } |
var jsdom = require('jsdom'); | ||
var fetchUrl = require('fetch').fetchUrl; | ||
var request = require('request'); | ||
var helpers = require('./helpers'); | ||
var encodinglib = require("encoding"); | ||
exports.debug = function (debug) { | ||
@@ -11,3 +13,3 @@ helpers.debug(debug); | ||
function Readability(document) { | ||
function Readability(document, options) { | ||
this._document = document; | ||
@@ -18,2 +20,3 @@ this.iframeLoads = 0; | ||
this._articleContent = ''; | ||
helpers.setCleanRules(options.cleanRulers || []); | ||
@@ -103,2 +106,48 @@ this.cache = {}; | ||
/**
 * Look for a character-set declaration inside raw HTML bytes.
 *
 * Two declaration forms are recognised, in this order:
 *   1. <meta http-equiv="Content-Type" content="...; charset=NAME">
 *   2. <meta charset="NAME"> (quotes around NAME are optional)
 *
 * @param {Buffer} htmlbuffer Raw, not yet decoded, document bytes.
 * @returns {string|undefined} Lower-cased charset name, or undefined when the
 *   document declares none.
 */
function _findHTMLCharset(htmlbuffer){
    // Only ASCII-range markup is inspected, so an ASCII decode is sufficient here.
    var body = htmlbuffer.toString("ascii"),
        meta, match, charset;

    meta = body.match(/<meta\s+http-equiv=["']content-type["'][^>]*?>/i);
    if(meta){
        // Case-insensitive, and accepts "_", "." and ":" so that names such as
        // Shift_JIS are captured whole instead of being cut at the underscore.
        match = meta[0].match(/charset\s?=\s?([\w\-.:]*);?/i);
        if(match){
            charset = (match[1] || "").trim().toLowerCase();
        }
    }

    if(!charset){
        // HTML5 short form; the value may be quoted or bare.
        match = body.match(/<meta\s+charset\s*=\s*["']?\s*([^"'\s\/>]+)/i);
        if(match){
            charset = match[1].trim().toLowerCase();
        }
    }

    // Normalise every "not found" outcome to undefined.
    return charset || undefined;
}
/**
 * Split a Content-Type header value into its media type and charset.
 *
 * @param {string} [str] Header value, e.g. 'text/html; charset="utf-8"'.
 * @returns {{mimeType: string, charset: string}|{}} Lower-cased media type and
 *   charset (charset falls back to "utf-8" when the header names none), or an
 *   empty object when no header value was supplied.
 */
function _parseContentType(str){
    if(!str){
        return {};
    }

    var parts = str.split(";"),
        mimeType = parts.shift(),
        charset, chparts;

    for(var i = 0, len = parts.length; i < len; i++){
        chparts = parts[i].split("=");
        if(chparts.length > 1 && chparts[0].trim().toLowerCase() === "charset"){
            // Parameter values may be sent as quoted strings; drop the quotes so
            // the name can be compared and handed to the converter as-is.
            charset = chparts[1].trim().replace(/^["']|["']$/g, "");
        }
    }

    return {
        mimeType: (mimeType || "").trim().toLowerCase(),
        charset: (charset || "UTF-8").trim().toLowerCase() // defaults to UTF-8
    };
}
function read(html, options, callback) { | ||
@@ -110,4 +159,26 @@ if (typeof options === 'function') { | ||
var overrideEncoding = options.encoding; | ||
options.encoding = null; | ||
if (html.indexOf('<') === -1) { | ||
fetchUrl(html, options, jsdomParse); | ||
request(html, options, function(err, res, buffer) { | ||
if(err) { | ||
return callback(err); | ||
} | ||
var content_type = _parseContentType(res.headers['content-type']); | ||
if(content_type.mimeType == "text/html"){ | ||
content_type.charset = _findHTMLCharset(buffer) || content_type.charset; | ||
} | ||
content_type.charset = (overrideEncoding || content_type.charset || "utf-8").trim().toLowerCase(); | ||
if(!content_type.charset.match(/^utf-?8$/i)){ | ||
buffer = encodinglib.convert(buffer, "UTF-8", content_type.charset); | ||
} | ||
jsdomParse(null, res, buffer.toString()); | ||
}); | ||
} else { | ||
@@ -127,6 +198,12 @@ jsdomParse(null, null, html); | ||
done: function (errors, window) { | ||
window.document.originalURL = html; | ||
if(meta) { | ||
window.document.originalURL = meta.request.uri.href; | ||
} else { | ||
window.document.originalURL = null; | ||
} | ||
if (errors) return callback(errors); | ||
if (!window.document.body) return callback(new Error('No body tag was found.')); | ||
callback(null, new Readability(window.document, options)); | ||
// add meta information to callback | ||
callback(null, new Readability(window.document, options), meta); | ||
} | ||
@@ -133,0 +210,0 @@ }); |
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
161987
12
1236
98
3
3
2
+ Addedencoding@~0.1.7
+ Addedrequest@~2.31.0
+ Addedasn1@0.1.11(transitive)
+ Addedassert-plus@0.1.5(transitive)
+ Addedasync@0.9.2(transitive)
+ Addedaws-sign2@0.5.0(transitive)
+ Addedboom@0.4.2(transitive)
+ Addedcombined-stream@0.0.7(transitive)
+ Addedcryptiles@0.2.2(transitive)
+ Addedctype@0.5.3(transitive)
+ Addeddelayed-stream@0.0.5(transitive)
+ Addedforever-agent@0.5.2(transitive)
+ Addedform-data@0.1.4(transitive)
+ Addedhawk@1.0.0(transitive)
+ Addedhoek@0.9.1(transitive)
+ Addedhttp-signature@0.10.1(transitive)
+ Addedmime@1.2.11(transitive)
+ Addednode-uuid@1.4.8(transitive)
+ Addedoauth-sign@0.3.0(transitive)
+ Addedqs@0.6.6(transitive)
+ Addedrequest@2.31.0(transitive)
+ Addedsntp@0.2.4(transitive)
+ Addedtough-cookie@0.9.15(transitive)
+ Addedtunnel-agent@0.3.0(transitive)
- Removedfetch@0.3.x
- Removedajv@6.12.6(transitive)
- Removedasn1@0.2.6(transitive)
- Removedassert-plus@1.0.0(transitive)
- Removedasynckit@0.4.0(transitive)
- Removedaws-sign2@0.7.0(transitive)
- Removedaws4@1.13.2(transitive)
- Removedbcrypt-pbkdf@1.0.2(transitive)
- Removedcaseless@0.12.0(transitive)
- Removedcombined-stream@1.0.8(transitive)
- Removedcore-util-is@1.0.2(transitive)
- Removeddashdash@1.14.1(transitive)
- Removeddelayed-stream@1.0.0(transitive)
- Removedecc-jsbn@0.1.2(transitive)
- Removedextend@3.0.2(transitive)
- Removedextsprintf@1.3.0(transitive)
- Removedfast-deep-equal@3.1.3(transitive)
- Removedfast-json-stable-stringify@2.1.0(transitive)
- Removedfetch@0.3.6(transitive)
- Removedforever-agent@0.6.1(transitive)
- Removedform-data@2.3.3(transitive)
- Removedgetpass@0.1.7(transitive)
- Removedhar-schema@2.0.0(transitive)
- Removedhar-validator@5.1.5(transitive)
- Removedhttp-signature@1.2.0(transitive)
- Removedis-typedarray@1.0.0(transitive)
- Removedisstream@0.1.2(transitive)
- Removedjsbn@0.1.1(transitive)
- Removedjson-schema@0.4.0(transitive)
- Removedjson-schema-traverse@0.4.1(transitive)
- Removedjsprim@1.4.2(transitive)
- Removedmime-db@1.52.0(transitive)
- Removedmime-types@2.1.35(transitive)
- Removedoauth-sign@0.9.0(transitive)
- Removedperformance-now@2.1.0(transitive)
- Removedpsl@1.9.0(transitive)
- Removedqs@6.5.3(transitive)
- Removedrequest@2.88.2(transitive)
- Removedsshpk@1.18.0(transitive)
- Removedtough-cookie@2.5.0(transitive)
- Removedtunnel-agent@0.6.0(transitive)
- Removedtweetnacl@0.14.5(transitive)
- Removeduri-js@4.4.1(transitive)
- Removeduuid@3.4.0(transitive)
- Removedverror@1.10.0(transitive)