{
		"name": "node-readability",
		"version": "0.2.1",
		"version": "0.3.0",
		"author": "Zihua Li",
		@@ -22,4 +22,5 @@ "description": "Turning any web page into a clean view.",
		"dependencies": {
		"fetch": "0.3.x",
		"jsdom": "0.8.x"
		"jsdom": "0.8.x",
		"request": "~2.31.0",
		"encoding": "~0.1.7"
		},
		@@ -34,4 +35,5 @@ "engines": [
		"mocha": "~1.8.2",
		"should": "~1.2.2"
		"should": "~2.1.1",
		"nock": "~0.27.1"
		}
		}

README.md

		@@ -46,22 +46,21 @@ # Readability

		node-readability will pass the options to [fetch](https://github.com/andris9/fetch) directly.
		node-readability will pass the options to [request](https://github.com/mikeal/request) directly.
		See request lib to view all available options.

		Possible option values

		* maxRedirects how many redirects allowed, defaults to 10
		* disableRedirects set to true if redirects are not allowed, defaults to false
		* headers optional header fields, in the form of `{'Header-Field':'value'}`
		* maxResponseLength maximum allowed length for the file, the remainder is cut off. Defaults to `Infinity`
		* method defaults to GET
		* payload request body
		* disableGzip set to false, to disable content gzipping, needed for Node v0.5.9 which has buggy zlib
		* cookies an array of cookie definitions in the form of `['name=val']`
		* cookieJar for sharing cookies between requests, see below
		* outputEncoding
		* disableDecoding set to true to disable automatic charset decoding to utf-8
		* overrideCharset set input encoding
		* asyncDnsLoookup use high performance asynchronous DNS resolution based on c-ares instead of a thread pool calling getaddrinfo(3)
		* timeout set a timeout in ms
		* agent pass-through http.request agent parameter

		node-readability has an additional option, `cleanRulers`, which lets you set your own validation rules for tags.
		Each rule is a callback; if it returns `true` the tag is kept, otherwise the tag goes through the normal cleaning.
		options.cleanRulers = [callback(obj, tagName)]
		```
		read(url, {
		cleanRulers : [
		function(obj, tag) {
		if(tag === 'object') {
		if(obj.getAttribute('class') === 'BrightcoveExperience') {
		return true;
		}
		}
		}
		]
		}, function(err, article, response) {});
		```
		## article object
		@@ -83,2 +82,6 @@

		## meta object

		The response object from the request lib. It is useful if you need the final URL after all redirects, or want to read the response headers.

		The document of the web page generated by jsdom. You can use it to access the DOM directly(for example, `article.document.getElementById('main')`).
		@@ -93,1 +96,5 @@
		This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0


		[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/luin/node-readability/trend.png)](https://bitdeli.com/free "Bitdeli Badge")

src/helpers.js

		@@ -23,2 +23,8 @@ var url = require("url");

		// User-supplied rule callbacks consulted by the tag-cleaning loop.
		var cleanRules = [];

		/**
		 * Replace the list of user clean rules.
		 *
		 * Each rule is later called as rule(element, tagName); a strict `true`
		 * return keeps the element from being removed.
		 *
		 * @param {Array.<Function>} rules Rule callbacks. Anything that is not an
		 *     array is treated as "no rules".
		 */
		module.exports.setCleanRules = function(rules) {
			// The cleaning loop reads cleanRules.length unconditionally, so never
			// store a non-array here.
			cleanRules = Array.isArray(rules) ? rules : [];
		};

		/**
		@@ -389,8 +395,26 @@ * Prepare the HTML document for readability to scrape it.



		for (var y = targetList.length - 1; y >= 0; y--) {
		/* Allow youtube and vimeo videos through as people usually want to see those. */
		if (isEmbed && targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
		//------- user clean handler -----------------
		var validRule = false;
		for(var i = 0; i < cleanRules.length; i++) {
		if(cleanRules[i](targetList[y], tag) === true) {
		validRule = true;
		break;
		}
		}

		if(validRule) {
		continue;
		}
		//------- end user clean handler -----------------

		/* Allow youtube and vimeo videos through as people usually want to see those. */
		if (isEmbed) {
		if(targetList[y].innerHTML.search(regexps.videoRe) !== -1) {
		continue;
		}
		}

		targetList[y].parentNode.removeChild(targetList[y]);
		@@ -397,0 +421,0 @@ }

src/readability.js

		var jsdom = require('jsdom');
		var fetchUrl = require('fetch').fetchUrl;
		var request = require('request');
		var helpers = require('./helpers');
		var encodinglib = require("encoding");


		exports.debug = function (debug) {
		@@ -11,3 +13,3 @@ helpers.debug(debug);

		function Readability(document) {
		function Readability(document, options) {
		this._document = document;
		@@ -18,2 +20,3 @@ this.iframeLoads = 0;
		this._articleContent = '';
		helpers.setCleanRules(options.cleanRulers \|\| []);

		@@ -103,2 +106,48 @@ this.cache = {};

		/**
		 * Look for a charset declaration inside raw HTML bytes.
		 *
		 * A `<meta http-equiv="content-type" ...>` tag is checked first; when it
		 * yields nothing, `<meta charset="...">` is tried.
		 *
		 * @param {Buffer} htmlbuffer Document bytes; scanned after decoding as ASCII.
		 * @return {string|undefined} Trimmed, lower-cased charset name. Falsy
		 *     (undefined or an empty string) when no usable declaration is found.
		 */
		function _findHTMLCharset(htmlbuffer) {
			var body = htmlbuffer.toString("ascii");
			var charset;

			var httpEquivMeta = body.match(/<meta\s+http-equiv=["']content-type["'][^>]*?>/i);
			if (httpEquivMeta) {
				// Case-insensitive, and "_" is allowed so names such as Shift_JIS
				// are not truncated.
				var declared = httpEquivMeta[0].match(/charset\s?=\s?([a-zA-Z0-9_\-]*);?/i);
				if (declared) {
					charset = (declared[1] || "").trim().toLowerCase();
				}
			}

			if (!charset) {
				var charsetMeta = body.match(/<meta\s+charset=["'](.*?)["']/i);
				if (charsetMeta) {
					charset = (charsetMeta[1] || "").trim().toLowerCase();
				}
			}

			return charset;
		}

		/**
		 * Split a Content-Type header value into its MIME type and charset.
		 *
		 * @param {string} str Header value, e.g. `text/html; charset=utf-8`.
		 * @return {Object} `{}` when `str` is falsy; otherwise
		 *     `{mimeType, charset}`, both trimmed and lower-cased. `charset`
		 *     falls back to "utf-8" when the header does not carry one.
		 */
		function _parseContentType(str) {
			if (!str) {
				return {};
			}

			var parts = str.split(";");
			var mimeType = parts.shift();
			var charset;

			for (var i = 0, len = parts.length; i < len; i++) {
				var chparts = parts[i].split("=");
				if (chparts.length > 1 && chparts[0].trim().toLowerCase() === "charset") {
					// Parameter values may be quoted (charset="utf-8"); drop the quotes.
					charset = chparts[1].trim().replace(/^["']|["']$/g, "");
				}
			}

			return {
				mimeType: (mimeType || "").trim().toLowerCase(),
				charset: (charset || "UTF-8").trim().toLowerCase() // defaults to UTF-8
			};
		}

		function read(html, options, callback) {
		@@ -110,4 +159,26 @@ if (typeof options === 'function') {

		var overrideEncoding = options.encoding;

		options.encoding = null;

		if (html.indexOf('<') === -1) {
		fetchUrl(html, options, jsdomParse);
		request(html, options, function(err, res, buffer) {
		if(err) {
		return callback(err);
		}

		var content_type = _parseContentType(res.headers['content-type']);

		if(content_type.mimeType == "text/html"){
		content_type.charset = _findHTMLCharset(buffer) \|\| content_type.charset;
		}

		content_type.charset = (overrideEncoding \|\| content_type.charset \|\| "utf-8").trim().toLowerCase();

		if(!content_type.charset.match(/^utf-?8$/i)){
		buffer = encodinglib.convert(buffer, "UTF-8", content_type.charset);
		}

		jsdomParse(null, res, buffer.toString());
		});
		} else {
		@@ -127,6 +198,12 @@ jsdomParse(null, null, html);
		done: function (errors, window) {
		window.document.originalURL = html;
		if(meta) {
		window.document.originalURL = meta.request.uri.href;
		} else {
		window.document.originalURL = null;
		}

		if (errors) return callback(errors);
		if (!window.document.body) return callback(new Error('No body tag was found.'));
		callback(null, new Readability(window.document, options));
		// add meta information to callback
		callback(null, new Readability(window.document, options), meta);
		}
		@@ -133,0 +210,0 @@ });

History.md

test/readability.test.js

test/mocha.opts

Sorry, the diff of this file is not supported yet

node-readability - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes