You're Invited:Meet the Socket Team at BlackHat and DEF CON in Las Vegas, Aug 4-6.RSVP
Socket
Book a DemoInstallSign in
Socket

node-readability

Package Overview
Dependencies
Maintainers
1
Versions
27
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

node-readability - npm Package Compare versions

Comparing version

to
0.3.0

test/charset.js

10

package.json
{
"name": "node-readability",
"version": "0.2.1",
"version": "0.3.0",
"author": "Zihua Li",

@@ -22,4 +22,5 @@ "description": "Turning any web page into a clean view.",

"dependencies": {
"fetch": "0.3.x",
"jsdom": "0.8.x"
"jsdom": "0.8.x",
"request": "~2.31.0",
"encoding": "~0.1.7"
},

@@ -34,4 +35,5 @@ "engines": [

"mocha": "~1.8.2",
"should": "~1.2.2"
"should": "~2.1.1",
"nock": "~0.27.1"
}
}

45

README.md

@@ -46,22 +46,21 @@ # Readability

node-readability will pass the options to [fetch](https://github.com/andris9/fetch) directly.
node-readability will pass the options to [request](https://github.com/mikeal/request) directly.
See request lib to view all available options.
Possible option values
* **maxRedirects** how many redirects allowed, defaults to 10
* **disableRedirects** set to true if redirects are not allowed, defaults to false
* **headers** optional header fields, in the form of `{'Header-Field':'value'}`
* **maxResponseLength** maximum allowed length for the file, the remainder is cut off. Defaults to `Infinity`
* **method** defaults to GET
* **payload** request body
* **disableGzip** set to false, to disable content gzipping, needed for Node v0.5.9 which has buggy zlib
* **cookies** an array of cookie definitions in the form of `['name=val']`
* **cookieJar** for sharing cookies between requests, see below
* **outputEncoding**
* **disableDecoding** set to true to disable automatic charset decoding to utf-8
* **overrideCharset** set input encoding
* **asyncDnsLoookup** use high performance asynchronous DNS resolution based on c-ares instead of a thread pool calling getaddrinfo(3)
* **timeout** set a timeout in ms
* **agent** pass-through http.request agent parameter
node-readability has an additional option, `cleanRulers`, which lets you supply your own validation rules for tags.
Each rule is a callback; if any rule returns `true`, the tag is kept, otherwise the default cleaning is applied to it.
options.cleanRulers = [callback(obj, tagName)]
```
read(url, {
cleanRulers : [
function(obj, tag) {
if(tag === 'object') {
if(obj.getAttribute('class') === 'BrightcoveExperience') {
return true;
}
}
}
]
}, function(err, article, response) {});
```
## article object

@@ -83,2 +82,6 @@

## meta object
The response object from the request lib. It is useful if you need the final URL after all redirects, or some of the response headers.
The document of the web page generated by jsdom. You can use it to access the DOM directly (for example, `article.document.getElementById('main')`).

@@ -93,1 +96,5 @@

This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/luin/node-readability/trend.png)](https://bitdeli.com/free "Bitdeli Badge")

@@ -23,2 +23,8 @@ var url = require("url");

// User-supplied clean rules. Each entry is expected to be a callback that is
// invoked as rule(element, tagName); a rule returning exactly `true` marks
// the element as one that must be kept.
var cleanRules = [];

/**
 * Replace the list of user-defined clean rules.
 *
 * @param {Function[]} rules  Callbacks to consult while cleaning. Any value
 *   that is not an array (undefined, null, ...) resets the list to empty so
 *   that code iterating over `cleanRules.length` never throws.
 */
module.exports.setCleanRules = function(rules) {
    cleanRules = Array.isArray(rules) ? rules : [];
};
/**

@@ -389,8 +395,26 @@ * Prepare the HTML document for readability to scrape it.

for (var y = targetList.length - 1; y >= 0; y--) {
/* Allow youtube and vimeo videos through as people usually want to see those. */
if (isEmbed && targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
//------- user clean handler -----------------
var validRule = false;
for(var i = 0; i < cleanRules.length; i++) {
if(cleanRules[i](targetList[y], tag) === true) {
validRule = true;
break;
}
}
if(validRule) {
continue;
}
//------- end user clean handler -----------------
/* Allow youtube and vimeo videos through as people usually want to see those. */
if (isEmbed) {
if(targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
continue;
}
}
targetList[y].parentNode.removeChild(targetList[y]);

@@ -397,0 +421,0 @@ }

var jsdom = require('jsdom');
var fetchUrl = require('fetch').fetchUrl;
var request = require('request');
var helpers = require('./helpers');
var encodinglib = require("encoding");
exports.debug = function (debug) {

@@ -11,3 +13,3 @@ helpers.debug(debug);

function Readability(document) {
function Readability(document, options) {
this._document = document;

@@ -18,2 +20,3 @@ this.iframeLoads = 0;

this._articleContent = '';
helpers.setCleanRules(options.cleanRulers || []);

@@ -103,2 +106,48 @@ this.cache = {};

/**
 * Sniff the character set declared in an HTML document's <meta> tags.
 *
 * Two declaration forms are recognised, in this order of priority:
 *   1. <meta http-equiv="Content-Type" content="text/html; charset=xxx">
 *      (attributes may appear in any order, names are case-insensitive)
 *   2. <meta charset="xxx">  (quoted or unquoted)
 *
 * @param {Buffer} htmlbuffer  Raw, not yet decoded, HTML bytes.
 * @returns {string|undefined} Lower-cased charset name, or undefined when the
 *   document does not declare one.
 */
function _findHTMLCharset(htmlbuffer){
    // The markup that carries the declaration is plain ASCII, so the buffer
    // can be scanned before its real encoding is known.
    var body = htmlbuffer.toString("ascii"),
        meta, charset;

    // Form 1: locate the content-type pragma wherever http-equiv sits in the tag.
    meta = body.match(/<meta\s[^>]*?http-equiv\s*=\s*["']?content-type["']?[^>]*>/i);
    if(meta){
        // Charset names may contain "_", "." and ":" (e.g. Shift_JIS).
        charset = meta[0].match(/charset\s*=\s*["']?\s*([\w.:\-]+)/i);
        charset = charset ? charset[1].toLowerCase() : undefined;
    }

    // Form 2: the short HTML5 declaration, used only when form 1 gave nothing.
    if(!charset){
        meta = body.match(/<meta\s+charset\s*=\s*["']?\s*([\w.:\-]+)/i);
        if(meta){
            charset = meta[1].toLowerCase();
        }
    }

    return charset;
}
/**
 * Split a Content-Type header value into its media type and charset.
 *
 * @param {string} [str]  Header value, e.g. 'text/html; charset="UTF-8"'.
 * @returns {{mimeType: string, charset: string}|{}}  Lower-cased media type
 *   and charset (charset defaults to "utf-8" when the header has none).
 *   An empty object is returned when no header value is given.
 */
function _parseContentType(str){
    if(!str){
        return {};
    }

    var parts = str.split(";"),
        mimeType = parts.shift(),
        charset, chparts, i, len;

    for(i = 0, len = parts.length; i < len; i++){
        chparts = parts[i].split("=");
        if(chparts.length > 1 && chparts[0].trim().toLowerCase() === "charset"){
            // Parameter values may be sent as a quoted string; drop the
            // surrounding quotes so the bare charset name is returned.
            charset = chparts[1].trim().replace(/^["']|["']$/g, "");
        }
    }

    return {
        mimeType: (mimeType || "").trim().toLowerCase(),
        charset: (charset || "UTF-8").trim().toLowerCase() // defaults to UTF-8
    };
}
function read(html, options, callback) {

@@ -110,4 +159,26 @@ if (typeof options === 'function') {

var overrideEncoding = options.encoding;
options.encoding = null;
if (html.indexOf('<') === -1) {
fetchUrl(html, options, jsdomParse);
request(html, options, function(err, res, buffer) {
if(err) {
return callback(err);
}
var content_type = _parseContentType(res.headers['content-type']);
if(content_type.mimeType == "text/html"){
content_type.charset = _findHTMLCharset(buffer) || content_type.charset;
}
content_type.charset = (overrideEncoding || content_type.charset || "utf-8").trim().toLowerCase();
if(!content_type.charset.match(/^utf-?8$/i)){
buffer = encodinglib.convert(buffer, "UTF-8", content_type.charset);
}
jsdomParse(null, res, buffer.toString());
});
} else {

@@ -127,6 +198,12 @@ jsdomParse(null, null, html);

done: function (errors, window) {
window.document.originalURL = html;
if(meta) {
window.document.originalURL = meta.request.uri.href;
} else {
window.document.originalURL = null;
}
if (errors) return callback(errors);
if (!window.document.body) return callback(new Error('No body tag was found.'));
callback(null, new Readability(window.document, options));
// add meta information to callback
callback(null, new Readability(window.document, options), meta);
}

@@ -133,0 +210,0 @@ });

Sorry, the diff of this file is not supported yet