Socket
Socket
Sign inDemoInstall

node-readability

Package Overview
Dependencies
Maintainers
1
Versions
27
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

node-readability - npm Package Compare versions

Comparing version 0.2.1 to 0.3.0

test/charset.js

10

package.json
{
"name": "node-readability",
"version": "0.2.1",
"version": "0.3.0",
"author": "Zihua Li",

@@ -22,4 +22,5 @@ "description": "Turning any web page into a clean view.",

"dependencies": {
"fetch": "0.3.x",
"jsdom": "0.8.x"
"jsdom": "0.8.x",
"request": "~2.31.0",
"encoding": "~0.1.7"
},

@@ -34,4 +35,5 @@ "engines": [

"mocha": "~1.8.2",
"should": "~1.2.2"
"should": "~2.1.1",
"nock": "~0.27.1"
}
}

45

README.md

@@ -46,22 +46,21 @@ # Readability

node-readability will pass the options to [fetch](https://github.com/andris9/fetch) directly.
node-readability will pass the options to [request](https://github.com/mikeal/request) directly.
See request lib to view all available options.
Possible option values
* **maxRedirects** how many redirects allowed, defaults to 10
* **disableRedirects** set to true if redirects are not allowed, defaults to false
* **headers** optional header fields, in the form of `{'Header-Field':'value'}`
* **maxResponseLength** maximum allowed length for the file, the remainder is cut off. Defaults to `Infinity`
* **method** defaults to GET
* **payload** request body
* **disableGzip** set to false, to disable content gzipping, needed for Node v0.5.9 which has buggy zlib
* **cookies** an array of cookie definitions in the form of `['name=val']`
* **cookieJar** for sharing cookies between requests, see below
* **outputEncoding**
* **disableDecoding** set to true to disable automatic charset decoding to utf-8
* **overrideCharset** set input encoding
* **asyncDnsLoookup** use high performance asynchronous DNS resolution based on c-ares instead of a thread pool calling getaddrinfo(3)
* **timeout** set a timeout in ms
* **agent** pass-through http.request agent parameter
node-readability has an additional option, `cleanRulers`, which lets you set your own validation rules for tags.
If a rule returns `true`, the tag is kept; otherwise the default cleaning applies.
options.cleanRulers = [callback(obj, tagName)]
```
read(url, {
cleanRulers : [
function(obj, tag) {
if(tag === 'object') {
if(obj.getAttribute('class') === 'BrightcoveExperience') {
return true;
}
}
}
]
}, function(err, article, response) {});
```
## article object

@@ -83,2 +82,6 @@

## meta object
The response object from the request lib. It is useful if you need the final URL after all redirects, or some of the response headers.
The document of the web page generated by jsdom. You can use it to access the DOM directly(for example, `article.document.getElementById('main')`).

@@ -93,1 +96,5 @@

This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/luin/node-readability/trend.png)](https://bitdeli.com/free "Bitdeli Badge")

@@ -23,2 +23,8 @@ var url = require("url");

var cleanRules = [];
/**
 * Replace the module-level `cleanRules` list with the given value.
 *
 * The value is stored as-is: it is neither copied nor validated here.
 *
 * @param {Array} rules - new value for `cleanRules`.
 *   NOTE(review): presumably an array of functions called as
 *   `rule(node, tagName)` — confirm against the code that iterates `cleanRules`.
 */
module.exports.setCleanRules = function(rules) {
cleanRules = rules;
};
/**

@@ -389,8 +395,26 @@ * Prepare the HTML document for readability to scrape it.

for (var y = targetList.length - 1; y >= 0; y--) {
/* Allow youtube and vimeo videos through as people usually want to see those. */
if (isEmbed && targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
//------- user clean handler -----------------
var validRule = false;
for(var i = 0; i < cleanRules.length; i++) {
if(cleanRules[i](targetList[y], tag) === true) {
validRule = true;
break;
}
}
if(validRule) {
continue;
}
//------- end user clean handler -----------------
/* Allow youtube and vimeo videos through as people usually want to see those. */
if (isEmbed) {
if(targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
continue;
}
}
targetList[y].parentNode.removeChild(targetList[y]);

@@ -397,0 +421,0 @@ }

var jsdom = require('jsdom');
var fetchUrl = require('fetch').fetchUrl;
var request = require('request');
var helpers = require('./helpers');
var encodinglib = require("encoding");
exports.debug = function (debug) {

@@ -11,3 +13,3 @@ helpers.debug(debug);

function Readability(document) {
function Readability(document, options) {
this._document = document;

@@ -18,2 +20,3 @@ this.iframeLoads = 0;

this._articleContent = '';
helpers.setCleanRules(options.cleanRulers || []);

@@ -103,2 +106,48 @@ this.cache = {};

/**
 * Sniff the character set declared inside an HTML document.
 *
 * Two declarations are recognised, in this order:
 *   1. `<meta http-equiv="Content-Type" content="...; charset=NAME">`
 *      (the `http-equiv` attribute may appear anywhere inside the tag);
 *   2. `<meta charset="NAME">` (quotes are optional, as HTML5 allows).
 *
 * @param {Buffer} htmlbuffer - raw, still undecoded, response body.
 * @returns {string|undefined} the lower-cased charset name, or `undefined`
 *   when the document does not declare one.
 */
function _findHTMLCharset(htmlbuffer){
    // Charset declarations are plain ASCII, so an ASCII decode is enough to
    // locate them regardless of the document's real encoding.
    var body = htmlbuffer.toString("ascii"),
        meta, charset;

    meta = body.match(/<meta\b[^>]*?\bhttp-equiv\s*=\s*["']?content-type["']?[^>]*>/i);
    if(meta){
        // Charset names may contain "_", ".", ":" and "+" (e.g. Shift_JIS),
        // and the "charset" keyword itself is case-insensitive.
        charset = meta[0].match(/charset\s*=\s*["']?\s*([\w\-.:+]+)/i);
        if(charset){
            charset = charset[1].toLowerCase();
        }
    }

    if(!charset){
        meta = body.match(/<meta\s+charset\s*=\s*["']?\s*([^"'\s\/>]+)/i);
        if(meta){
            charset = meta[1].toLowerCase();
        }
    }

    // Normalise every "not found" outcome (null, "") to undefined.
    return charset || undefined;
}
/**
 * Split a `Content-Type` header value into its MIME type and charset.
 *
 * @param {string} str - header value, e.g. `text/html; charset="UTF-8"`.
 * @returns {{mimeType: string, charset: string}|{}} both fields lower-cased;
 *   `charset` defaults to `"utf-8"` when the header does not carry one.
 *   An empty object is returned when `str` is empty or missing.
 */
function _parseContentType(str){
    if(!str){
        return {};
    }
    var parts = str.split(";"),
        mimeType = parts.shift(),
        charset, chparts;
    for(var i=0, len = parts.length; i<len; i++){
        chparts = parts[i].split("=");
        if(chparts.length>1){
            if(chparts[0].trim().toLowerCase() === "charset"){
                // Parameter values may be sent as a quoted-string
                // (charset="utf-8"); drop the quotes so the value is usable
                // as an encoding name.
                charset = chparts[1].trim().replace(/^["']|["']$/g, "");
            }
        }
    }
    return {
        mimeType: (mimeType || "").trim().toLowerCase(),
        charset: (charset || "UTF-8").trim().toLowerCase() // defaults to UTF-8
    };
}
function read(html, options, callback) {

@@ -110,4 +159,26 @@ if (typeof options === 'function') {

var overrideEncoding = options.encoding;
options.encoding = null;
if (html.indexOf('<') === -1) {
fetchUrl(html, options, jsdomParse);
request(html, options, function(err, res, buffer) {
if(err) {
return callback(err);
}
var content_type = _parseContentType(res.headers['content-type']);
if(content_type.mimeType == "text/html"){
content_type.charset = _findHTMLCharset(buffer) || content_type.charset;
}
content_type.charset = (overrideEncoding || content_type.charset || "utf-8").trim().toLowerCase();
if(!content_type.charset.match(/^utf-?8$/i)){
buffer = encodinglib.convert(buffer, "UTF-8", content_type.charset);
}
jsdomParse(null, res, buffer.toString());
});
} else {

@@ -127,6 +198,12 @@ jsdomParse(null, null, html);

done: function (errors, window) {
window.document.originalURL = html;
if(meta) {
window.document.originalURL = meta.request.uri.href;
} else {
window.document.originalURL = null;
}
if (errors) return callback(errors);
if (!window.document.body) return callback(new Error('No body tag was found.'));
callback(null, new Readability(window.document, options));
// add meta information to callback
callback(null, new Readability(window.document, options), meta);
}

@@ -133,0 +210,0 @@ });

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc