read-art - npm Package Compare versions

Comparing version 0.4.4 to 0.4.5

examples/simple.js

		var read = require('../');
		var URI = require('urijs');

		read('http://www.cqn.com.cn/auto/news/73572.html', {
		//var uri = 'http://news.hexun.com/2016-01-14/181791008.html';
		var uri = 'http://www.kaixian.tv/gd/2016/0114/724631.html';
		//var uri = 'http://www.ntjoy.com/news/vod/xwsph/nttv1/csrl/2016/01/2016-01-14461427.html';
		read(uri, {
		timeout : 15000,
		output : {
		type : 'json',
		stripSpaces: true,
		break: true
		minTextLength: 0,
		minParagraphs: 0,
		selectors: {
		content: '.contxt'
		},
		minTextLength: 0,
		scoreRule: function(node){
		if (node.hasClass('w740')) {
		return 100;
		}
		return 0;
		}
		output: 'text'
		}, function(err, art, options, resp){
		@@ -18,0 +16,0 @@ if (err) {

HISTORY.md

		@@ -0,1 +1,5 @@
		# 2016/01/14
		- `minParagraphs` option.
		- Make images regexp extendable. #14@entertainyou

		# 2016/01/07
		@@ -2,0 +6,0 @@ - feature: threshold

index.js

		@@ -37,3 +37,4 @@ "use strict";
		minTextLength: 25,
		thresholdLinkDensity: 0.25
		thresholdLinkDensity: 0.25,
		minParagraphs: 3
		}, options);
		@@ -47,2 +48,6 @@

		if (!isFinite(options.minParagraphs)) {
		options.minParagraphs = 3;
		}

		// indicating uri is html or url.
		@@ -49,0 +54,0 @@ var isHTML = uri.match(/^\s*</);

lib/reader.js

		@@ -13,3 +13,2 @@ "use strict";
		videos : /(youtube\|vimeo\|youku\|tudou\|56\|letv\|iqiyi\|sohu\|sina\|163)\.(com\|com\.cn\|cn\|net)/i,
		images : /\.(gif\|jpe?g\|png)$/i,
		commas : /[,，.。;；?？、]/g
		@@ -22,3 +21,4 @@ },
		maybe : /and\|article\|body\|column\|main\|column/i,
		div2p : /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul)/i
		div2p : /<(a\|blockquote\|dl\|div\|img\|ol\|p\|pre\|table\|ul\|span\|font)/i,
		images : /\.(gif\|jpe?g\|png)$/i
		},
		@@ -54,3 +54,2 @@ tagsToSkip = '';
		var topCandidate = getTopCandidate($, cans);

		// 3nd. grab article
		@@ -411,3 +410,3 @@ if (topCandidate && topCandidate.length > 0) {
		}
		if (topCandidate.children('p').length < 3 && (parent = topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body') {
		if (topCandidate.children('p').length < options.minParagraphs && (parent = topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body') {
		// 1. topCandidate has not enough [P] children
		@@ -430,3 +429,2 @@ // 2. parent exist and not [BODY]
		textLen = text.length;

		if (tagName == 'p') {
		@@ -433,0 +431,0 @@ var linkDensity = getLinkDensity($, node);

package.json

		{
		"name": "read-art",
		"version": "0.4.4",
		"version": "0.4.5",
		"description": "Scrape/Crawl article from any site automatically. Make any web page readable, no matter Chinese or English.",
		@@ -33,3 +33,3 @@ "main": "index.js",
		"cheerio": "~0.19.0",
		"req-fast": "^0.2.13",
		"req-fast": "^0.2.14",
		"urijs": "~1.17.0",
		@@ -36,0 +36,0 @@ "entities": "~1.1.1"

README.md

		@@ -65,2 +65,3 @@ read-art [![NPM version](https://badge.fury.io/js/read-art.svg)](http://badge.fury.io/js/read-art) [![Build Status](https://travis-ci.org/Tjatse/node-readability.svg?branch=master)](https://travis-ci.org/Tjatse/node-readability)
		- minTextLength If the content is less than `[minTextLength]` characters, don't even count it, `25` by default.
		- minParagraphs The minimum number of `<p>` children the top candidate must have to be taken as the article, `3` by default, i.e.: if the `topCandidate` dom has at least `3` `<p>` children, `topCandidate` will be considered as the article dom, otherwise its parent will be used instead (unless that parent is `<body>`).
		- tidyAttrs Remove all the attributes on elements, `false` by default.
		@@ -310,2 +311,5 @@ - dom Will return the whole cheerio dom when this property is set to `true`, `false` by default, try to use `art.dom` to get the dom object in callback function.

		- `this.regexps.images([re], [override])`
		If the `images` regexp matches the `src` attribute of a node, the node is kept as a normal `img`; otherwise it is dropped. `[re]` is a regexp, e.g. `/\.(gif\|jpe?g\|png)$/i` matches an image whose `src` looks like `/path/to/foo.jpg`. If `[override]` is set to `true`, `readart.regexps.images` is replaced by `[re]`; otherwise `[re]` is appended to the original regexp.

		### Example
		@@ -312,0 +316,0 @@ ```javascript

362

test/custom_settings.js

		@@ -1,27 +0,30 @@
		var read = require('../'),
		chai = require('chai'),
		expect = chai.expect,
		should = chai.should();
		var read = require('../'),
		chai = require('chai'),
		expect = chai.expect,
		should = chai.should();

		var html = '';
		describe('custom settings', function(){
		before(function(){
		describe('custom settings', function () {
		before(function () {
		html = '<title>read-art</title>' +
		'<body>' +
		'<div class="dv1"><p class="p1">hi, dude, I am readability (<b>aka read-art</b>)<foot>foot</foot></p></div>' +
		'<div class="dv2"><div class="p2">hey, dude, I am readability too</div></div>' +
		'<div class="dv3"><span></span>hello, dude, I am readability too.</div>' +
		'</body>';
		'<body>' +
		'<div>' +
		'<div class="dv1"><p class="p1">hi, dude, I am readability (<b>aka read-art</b>)<foot>foot</foot></p></div>' +
		'<div class="dv2"><div class="p2">hey, dude, I am readability too</div></div>' +
		'<div class="dv3"><foo>hello</foo>, dude, I am readability too.</div>' +
		'</div>' +
		'</body>';
		});
		after(function(){
		after(function () {
		html = '';
		});
		describe('do not exist', function(){
		it('should works fine', function(done){
		describe('by default', function () {
		it('should skip nothing', function (done) {
		read(html, {
		output: 'text'
		}, function(err, art){
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hi');
		art.content.should.contain('foot');
		art.content.should.contain('aka read-art');
		done();
		@@ -31,5 +34,5 @@ });
		});
		describe('with appending', function(){
		it('skip <b>', function(done){
		read.use(function(){
		describe('skipTags', function () {
		it('skip <b>', function (done) {
		read.use(function () {
		this.skipTags('b,x,y,z');
		@@ -39,3 +42,3 @@ });
		output: 'text'
		}, function(err, art){
		}, function (err, art) {
		should.not.exist(err);
		@@ -46,3 +49,3 @@ expect(art).to.be.an('object');

		read.use(function(){
		read.use(function () {
		this.reset()
		@@ -53,15 +56,14 @@ });
		});

		it('regexps.positive', function(done){
		read.use(function(){
		this.regexps.positive(/dv2\|p2/);
		it('skip <b> but not skip <foot>', function (done) {
		read.use(function () {
		this.skipTags('b,x,y,z', true);
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');
		art.content.should.contain('foot');

		read.use(function(){
		read.use(function () {
		this.reset()
		@@ -72,181 +74,215 @@ });
		});
		});

		it('regexps.negative', function(done){
		read.use(function(){
		this.regexps.negative(/dv1\|p1/);
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');
		describe('regexps', function () {
		describe('append', function () {
		it('positive', function (done) {
		read.use(function () {
		this.regexps.positive(/dv2\|p2/);
		});
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');

		read.use(function(){
		this.reset()
		read.use(function () {
		this.reset()
		});
		done();
		});
		done();
		});
		});

		it('regexps.unlikely', function(done){
		read.use(function(){
		this.regexps.unlikely(/dv1\|p1/);
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');
		it('negative', function (done) {
		read.use(function () {
		this.regexps.negative(/dv1/);
		});
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');

		read.use(function(){
		this.reset()
		read.use(function () {
		this.reset()
		});
		done();
		});
		done();
		});
		});

		it('regexps.maybe', function(done){
		read.use(function(){
		this.regexps.unlikely(/dv1/);
		this.regexps.maybe(/p1/);
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hi');
		art.content.should.not.contain('hey');
		it('unlikely', function (done) {
		read.use(function () {
		this.regexps.unlikely(/dv1\|p1/);
		});
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');

		read.use(function(){
		this.reset()
		read.use(function () {
		this.reset()
		});
		done();
		});
		done();
		});
		});

		it('regexps.div2p', function(done){
		read.use(function(){
		this.regexps.div2p(/<span/);
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hello');
		it('maybe', function (done) {
		read.use(function () {
		this.regexps.unlikely(/dv1/);
		this.regexps.maybe(/dv1/);
		});

		read.use(function(){
		this.reset()
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hi');
		art.content.should.contain('hey');
		art.content.should.contain('hello');

		read.use(function () {
		this.reset()
		});
		done();
		});
		done();
		});
		});
		});

		describe('with overriding', function(){
		it('skip <b> but not skip <foot>', function(done){
		read.use(function(){
		this.skipTags('b,x,y,z', true);
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('foot');
		it('div2p', function (done) {
		read.use(function () {
		this.regexps.div2p(/<foo/);
		});
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hello');

		read.use(function(){
		this.reset()
		read.use(function () {
		this.reset()
		});
		done();
		});
		done();
		});
		});
		describe('override', function () {
		it('positive', function (done) {
		read.use(function () {
		this.regexps.positive(/dv2\|p2/, true);
		});
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');

		it('regexps.positive', function(done){
		read.use(function(){
		this.regexps.positive(/dv2\|p2/, true);
		read.use(function () {
		this.reset()
		});
		done();
		});
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');

		read.use(function(){
		this.reset()
		it('negative', function (done) {
		read.use(function () {
		this.regexps.negative(/dv1\|p1/, true);
		});
		done();
		});
		});
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');

		it('regexps.negative', function(done){
		read.use(function(){
		this.regexps.negative(/dv1\|p1/, true);
		read.use(function () {
		this.reset()
		});
		done();
		});
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');

		read.use(function(){
		this.reset()
		it('unlikely', function (done) {
		read.use(function () {
		this.regexps.unlikely(/dv1\|p1/, true);
		});
		done();
		});
		});
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');

		it('regexps.unlikely', function(done){
		read.use(function(){
		this.regexps.unlikely(/dv1\|p1/, true);
		read.use(function () {
		this.reset()
		});
		done();
		});
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hey');

		read.use(function(){
		this.reset()
		it('maybe', function (done) {
		read.use(function () {
		this.regexps.unlikely(/dv1\|p1/, true);
		this.regexps.maybe(/dv2\|p2/, true);
		});
		done();
		});
		});
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.not.contain('hi');
		art.content.should.contain('hey');
		art.content.should.contain('hello');

		it('regexps.maybe', function(done){
		read.use(function(){
		this.regexps.unlikely(/dv1/, true);
		this.regexps.maybe(/p1/, true);
		read.use(function () {
		this.reset()
		});
		done();
		});
		});
		read(html, {
		output: 'text'
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hi');
		art.content.should.not.contain('hey');

		read.use(function(){
		this.reset()
		it('div2p', function (done) {
		read.use(function () {
		this.regexps.div2p(/<(div\|foo)/, true);
		});
		done();
		read(html, {
		output: 'text'
		}, function (err, art) {
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hi');
		art.content.should.contain('hey');
		art.content.should.contain('hello');

		read.use(function () {
		this.reset()
		});
		done();
		});
		});
		});

		it('regexps.div2p', function(done){
		it('regexps.images', function(done){

		var article = '<title>title</title>' +
		'<body>' +
		'<p>Text hello <img src="image" /> world</p>' +
		'</body>';

		read.use(function(){
		this.regexps.div2p(/<span/, true);
		this.regexps.images(/.*/, true);
		});
		read(html, {
		output: 'text'
		read({
		html: article,
		uri: 'http://github.com',
		output: 'html',
		}, function(err, art){
		should.not.exist(err);
		expect(art).to.be.an('object');
		art.content.should.contain('hello');
		art.content.should.contain('<img src="http://github.com/image">');

		@@ -260,2 +296,2 @@ read.use(function(){
		});
		});
		});

test/option.js

		@@ -1,2 +0,1 @@

		var read = require('../'),
		@@ -9,4 +8,4 @@ chai = require('chai'),

		describe('different options',function(){
		before(function(){
		describe('different options', function () {
		before(function () {
		uri = 'http://www.bing.com';
		@@ -16,3 +15,3 @@ html = '<p>Hello, node-art</p>';
		});
		after(function(){
		after(function () {
		uri = null;
		@@ -22,5 +21,7 @@ html = null;
		});
		describe('have three arguments',function(){
		it('should detect two options',function(done){
		read(uri, { charset: charset }, function(err, art, options, resp){
		describe('have three arguments', function () {
		it('should detect two options', function (done) {
		read(uri, {
		charset: charset
		}, function (err, art, options, resp) {
		should.not.exist(err);
		@@ -35,5 +36,5 @@ options.uri.should.be.equal(uri);

		describe('have two arguments(string, function)',function(){
		it('should detect one options',function(done){
		read(uri, function(err, art, options){
		describe('have two arguments(string, function)', function () {
		it('should detect one options', function (done) {
		read(uri, function (err, art, options) {
		should.not.exist(err);
		@@ -47,5 +48,8 @@ options.uri.should.be.equal(uri);

		describe('have two arguments(object, function)',function(){
		it('should detect two options',function(done){
		read({ uri: uri, charset: charset }, function(err, art, options){
		describe('have two arguments(object, function)', function () {
		it('should detect two options', function (done) {
		read({
		uri: uri,
		charset: charset
		}, function (err, art, options) {
		should.not.exist(err);
		@@ -59,5 +63,8 @@ options.uri.should.be.equal(uri);

		describe('uri is passed in',function(){
		it('should detect uri in options',function(done){
		read({ uri: uri, charset: charset }, function(err, art, options){
		describe('uri is passed in', function () {
		it('should detect uri in options', function (done) {
		read({
		uri: uri,
		charset: charset
		}, function (err, art, options) {
		should.not.exist(err);
		@@ -70,5 +77,8 @@ options.uri.should.be.equal(uri);

		describe('uri is passed in, but treat as html',function(){
		it('should detect html automatically',function(done){
		read({ uri: html, charset: charset }, function(err, art, options){
		describe('uri is passed in, but treat as html', function () {
		it('should detect html automatically', function (done) {
		read({
		uri: html,
		charset: charset
		}, function (err, art, options) {
		should.not.exist(err);
		@@ -81,5 +91,8 @@ options.html.should.be.equal(html);

		describe('html is passed in',function(){
		it('should detect html in options',function(done){
		read({ uri: html, charset: charset }, function(err, art, options){
		describe('html is passed in', function () {
		it('should detect html in options', function (done) {
		read({
		uri: html,
		charset: charset
		}, function (err, art, options) {
		should.not.exist(err);
		@@ -91,2 +104,33 @@ options.html.should.be.equal(html);
		});
		});
		});

		describe('minParagraphs option', function () {
		before(function () {
		html = '<title>read-art</title><body><div><div><p>hi, dude, i am <a href="/Tjatse/read-art.git">readability</a>, aka read-art...</p></div><span>footer</span></div></body>'
		});
		describe('3 by default', function () {
		it('should find footer', function (done) {
		read({
		minTextLength: 0,
		html: html
		}, function (err, art, options, resp) {
		should.not.exist(err);
		art.content.should.contain('footer');
		done();
		});
		});
		});
		describe('0 by customized', function () {
		it('should find no footer', function (done) {
		read({
		minTextLength: 0,
		minParagraphs: 0,
		html: html
		}, function (err, art, options, resp) {
		should.not.exist(err);
		art.content.should.not.contain('footer');
		done();
		});
		});
		});
		});

		@@ -65,2 +65,3 @@ read-art [![NPM version](https://badge.fury.io/js/read-art.svg)](http://badge.fury.io/js/read-art) [![Build Status](https://travis-ci.org/Tjatse/node-readability.svg?branch=master)](https://travis-ci.org/Tjatse/node-readability)
		- minTextLength If the content is less than `[minTextLength]` characters, don't even count it, `25` by default.
		- minParagraphs The minimum number of `<p>` children the top candidate must have to be taken as the article, `3` by default, i.e.: if the `topCandidate` dom has at least `3` `<p>` children, `topCandidate` will be considered as the article dom, otherwise its parent will be used instead (unless that parent is `<body>`).
		- tidyAttrs Remove all the attributes on elements, `false` by default.
		@@ -310,2 +311,5 @@ - dom Will return the whole cheerio dom when this property is set to `true`, `false` by default, try to use `art.dom` to get the dom object in callback function.

		- `this.regexps.images([re], [override])`
		If the `images` regexp matches the `src` attribute of a node, the node is kept as a normal `img`; otherwise it is dropped. `[re]` is a regexp, e.g. `/\.(gif\|jpe?g\|png)$/i` matches an image whose `src` looks like `/path/to/foo.jpg`. If `[override]` is set to `true`, `readart.regexps.images` is replaced by `[re]`; otherwise `[re]` is appended to the original regexp.

		### Example
		@@ -312,0 +316,0 @@ ```javascript

read-art - npm Package Compare versions

Improved metrics

Dependency changes