Comparing version 0.4.4 to 0.4.5
var read = require('../'); | ||
var URI = require('urijs'); | ||
read('http://www.cqn.com.cn/auto/news/73572.html', { | ||
//var uri = 'http://news.hexun.com/2016-01-14/181791008.html'; | ||
var uri = 'http://www.kaixian.tv/gd/2016/0114/724631.html'; | ||
//var uri = 'http://www.ntjoy.com/news/vod/xwsph/nttv1/csrl/2016/01/2016-01-14461427.html'; | ||
read(uri, { | ||
timeout : 15000, | ||
output : { | ||
type : 'json', | ||
stripSpaces: true, | ||
break: true | ||
minTextLength: 0, | ||
minParagraphs: 0, | ||
selectors: { | ||
content: '.contxt' | ||
}, | ||
minTextLength: 0, | ||
scoreRule: function(node){ | ||
if (node.hasClass('w740')) { | ||
return 100; | ||
} | ||
return 0; | ||
} | ||
output: 'text' | ||
}, function(err, art, options, resp){ | ||
@@ -18,0 +16,0 @@ if (err) { |
@@ -0,1 +1,5 @@ | ||
# 2016/01/14 | ||
- `minParagraphs` option. | ||
- Make images regexp extendable. #14@entertainyou | ||
# 2016/01/07 | ||
@@ -2,0 +6,0 @@ - feature: threshold |
@@ -37,3 +37,4 @@ "use strict"; | ||
minTextLength: 25, | ||
thresholdLinkDensity: 0.25 | ||
thresholdLinkDensity: 0.25, | ||
minParagraphs: 3 | ||
}, options); | ||
@@ -47,2 +48,6 @@ | ||
if (!isFinite(options.minParagraphs)) { | ||
options.minParagraphs = 3; | ||
} | ||
// indicating uri is html or url. | ||
@@ -49,0 +54,0 @@ var isHTML = uri.match(/^\s*</); |
@@ -13,3 +13,2 @@ "use strict"; | ||
videos : /(youtube|vimeo|youku|tudou|56|letv|iqiyi|sohu|sina|163)\.(com|com\.cn|cn|net)/i, | ||
images : /\.(gif|jpe?g|png)$/i, | ||
commas : /[,,.。;;??、]/g | ||
@@ -22,3 +21,4 @@ }, | ||
maybe : /and|article|body|column|main|column/i, | ||
div2p : /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i | ||
div2p : /<(a|blockquote|dl|div|img|ol|p|pre|table|ul|span|font)/i, | ||
images : /\.(gif|jpe?g|png)$/i | ||
}, | ||
@@ -54,3 +54,2 @@ tagsToSkip = ''; | ||
var topCandidate = getTopCandidate($, cans); | ||
// 3rd. grab article | ||
@@ -411,3 +410,3 @@ if (topCandidate && topCandidate.length > 0) { | ||
} | ||
if (topCandidate.children('p').length < 3 && (parent = topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body') { | ||
if (topCandidate.children('p').length < options.minParagraphs && (parent = topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body') { | ||
// 1. topCandidate has not enough [P] children | ||
@@ -430,3 +429,2 @@ // 2. parent exist and not [BODY] | ||
textLen = text.length; | ||
if (tagName == 'p') { | ||
@@ -433,0 +431,0 @@ var linkDensity = getLinkDensity($, node); |
{ | ||
"name": "read-art", | ||
"version": "0.4.4", | ||
"version": "0.4.5", | ||
"description": "Scrape/Crawl article from any site automatically. Make any web page readable, no matter Chinese or English.", | ||
@@ -33,3 +33,3 @@ "main": "index.js", | ||
"cheerio": "~0.19.0", | ||
"req-fast": "^0.2.13", | ||
"req-fast": "^0.2.14", | ||
"urijs": "~1.17.0", | ||
@@ -36,0 +36,0 @@ "entities": "~1.1.1" |
@@ -65,2 +65,3 @@ read-art [](http://badge.fury.io/js/read-art) [](https://travis-ci.org/Tjatse/node-readability) | ||
- **minTextLength** If the content is less than `[minTextLength]` characters, don't even count it, `25` by default. | ||
- **minParagraphs** The minimum number of `<p>` children the top candidate must have to be taken as the article, `3` by default, i.e.: if the `topCandidate` DOM has at least `3` `<p>` children, `topCandidate` will be considered the article DOM; otherwise, its parent will be used instead (unless the parent is `<body>`). | ||
- **tidyAttrs** Remove all the attributes on elements, `false` by default. | ||
@@ -310,2 +311,5 @@ - **dom** Will return the whole cheerio dom when this property is set to `true`, `false` by default, try to use `art.dom` to get the dom object in callback function. | ||
- `this.regexps.images([re], [override])` | ||
If the `images` regexp matches the `src` attribute of a node, the node is kept as a normal `img`; otherwise it is dropped. `[re]` is a regexp, e.g. `/\.(gif|jpe?g|png)$/i` matches an image whose `src` looks like `/path/to/foo.jpg`. If `[override]` is set to `true`, `readart.regexps.images` will be replaced by `[re]`; otherwise `[re]` will be appended to the original. | ||
### Example | ||
@@ -312,0 +316,0 @@ ```javascript |
@@ -1,27 +0,30 @@ | ||
var read = require('../'), | ||
chai = require('chai'), | ||
expect = chai.expect, | ||
should = chai.should(); | ||
var read = require('../'), | ||
chai = require('chai'), | ||
expect = chai.expect, | ||
should = chai.should(); | ||
var html = ''; | ||
describe('custom settings', function(){ | ||
before(function(){ | ||
describe('custom settings', function () { | ||
before(function () { | ||
html = '<title>read-art</title>' + | ||
'<body>' + | ||
'<div class="dv1"><p class="p1">hi, dude, I am readability (<b>aka read-art</b>)<foot>foot</foot></p></div>' + | ||
'<div class="dv2"><div class="p2">hey, dude, I am readability too</div></div>' + | ||
'<div class="dv3"><span></span>hello, dude, I am readability too.</div>' + | ||
'</body>'; | ||
'<body>' + | ||
'<div>' + | ||
'<div class="dv1"><p class="p1">hi, dude, I am readability (<b>aka read-art</b>)<foot>foot</foot></p></div>' + | ||
'<div class="dv2"><div class="p2">hey, dude, I am readability too</div></div>' + | ||
'<div class="dv3"><foo>hello</foo>, dude, I am readability too.</div>' + | ||
'</div>' + | ||
'</body>'; | ||
}); | ||
after(function(){ | ||
after(function () { | ||
html = ''; | ||
}); | ||
describe('do not exist', function(){ | ||
it('should works fine', function(done){ | ||
describe('by default', function () { | ||
it('should skip nothing', function (done) { | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hi'); | ||
art.content.should.contain('foot'); | ||
art.content.should.contain('aka read-art'); | ||
done(); | ||
@@ -31,5 +34,5 @@ }); | ||
}); | ||
describe('with appending', function(){ | ||
it('skip <b>', function(done){ | ||
read.use(function(){ | ||
describe('skipTags', function () { | ||
it('skip <b>', function (done) { | ||
read.use(function () { | ||
this.skipTags('b,x,y,z'); | ||
@@ -39,3 +42,3 @@ }); | ||
output: 'text' | ||
}, function(err, art){ | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
@@ -46,3 +49,3 @@ expect(art).to.be.an('object'); | ||
read.use(function(){ | ||
read.use(function () { | ||
this.reset() | ||
@@ -53,15 +56,14 @@ }); | ||
}); | ||
it('regexps.positive', function(done){ | ||
read.use(function(){ | ||
this.regexps.positive(/dv2|p2/); | ||
it('skip <b> but not skip <foot>', function (done) { | ||
read.use(function () { | ||
this.skipTags('b,x,y,z', true); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
art.content.should.contain('foot'); | ||
read.use(function(){ | ||
read.use(function () { | ||
this.reset() | ||
@@ -72,181 +74,215 @@ }); | ||
}); | ||
}); | ||
it('regexps.negative', function(done){ | ||
read.use(function(){ | ||
this.regexps.negative(/dv1|p1/); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
describe('regexps', function () { | ||
describe('append', function () { | ||
it('positive', function (done) { | ||
read.use(function () { | ||
this.regexps.positive(/dv2|p2/); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
read.use(function(){ | ||
this.reset() | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
done(); | ||
}); | ||
}); | ||
it('regexps.unlikely', function(done){ | ||
read.use(function(){ | ||
this.regexps.unlikely(/dv1|p1/); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
it('negative', function (done) { | ||
read.use(function () { | ||
this.regexps.negative(/dv1/); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
read.use(function(){ | ||
this.reset() | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
done(); | ||
}); | ||
}); | ||
it('regexps.maybe', function(done){ | ||
read.use(function(){ | ||
this.regexps.unlikely(/dv1/); | ||
this.regexps.maybe(/p1/); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hi'); | ||
art.content.should.not.contain('hey'); | ||
it('unlikely', function (done) { | ||
read.use(function () { | ||
this.regexps.unlikely(/dv1|p1/); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
read.use(function(){ | ||
this.reset() | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
done(); | ||
}); | ||
}); | ||
it('regexps.div2p', function(done){ | ||
read.use(function(){ | ||
this.regexps.div2p(/<span/); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hello'); | ||
it('maybe', function (done) { | ||
read.use(function () { | ||
this.regexps.unlikely(/dv1/); | ||
this.regexps.maybe(/dv1/); | ||
}); | ||
read.use(function(){ | ||
this.reset() | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hi'); | ||
art.content.should.contain('hey'); | ||
art.content.should.contain('hello'); | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
done(); | ||
}); | ||
}); | ||
}); | ||
describe('with overriding', function(){ | ||
it('skip <b> but not skip <foot>', function(done){ | ||
read.use(function(){ | ||
this.skipTags('b,x,y,z', true); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('foot'); | ||
it('div2p', function (done) { | ||
read.use(function () { | ||
this.regexps.div2p(/<foo/); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hello'); | ||
read.use(function(){ | ||
this.reset() | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
done(); | ||
}); | ||
}); | ||
describe('override', function () { | ||
it('positive', function (done) { | ||
read.use(function () { | ||
this.regexps.positive(/dv2|p2/, true); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
it('regexps.positive', function(done){ | ||
read.use(function(){ | ||
this.regexps.positive(/dv2|p2/, true); | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
read.use(function(){ | ||
this.reset() | ||
it('negative', function (done) { | ||
read.use(function () { | ||
this.regexps.negative(/dv1|p1/, true); | ||
}); | ||
done(); | ||
}); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
it('regexps.negative', function(done){ | ||
read.use(function(){ | ||
this.regexps.negative(/dv1|p1/, true); | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
read.use(function(){ | ||
this.reset() | ||
it('unlikely', function (done) { | ||
read.use(function () { | ||
this.regexps.unlikely(/dv1|p1/, true); | ||
}); | ||
done(); | ||
}); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
it('regexps.unlikely', function(done){ | ||
read.use(function(){ | ||
this.regexps.unlikely(/dv1|p1/, true); | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hey'); | ||
read.use(function(){ | ||
this.reset() | ||
it('maybe', function (done) { | ||
read.use(function () { | ||
this.regexps.unlikely(/dv1|p1/, true); | ||
this.regexps.maybe(/dv2|p2/, true); | ||
}); | ||
done(); | ||
}); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.not.contain('hi'); | ||
art.content.should.contain('hey'); | ||
art.content.should.contain('hello'); | ||
it('regexps.maybe', function(done){ | ||
read.use(function(){ | ||
this.regexps.unlikely(/dv1/, true); | ||
this.regexps.maybe(/p1/, true); | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hi'); | ||
art.content.should.not.contain('hey'); | ||
read.use(function(){ | ||
this.reset() | ||
it('div2p', function (done) { | ||
read.use(function () { | ||
this.regexps.div2p(/<(div|foo)/, true); | ||
}); | ||
done(); | ||
read(html, { | ||
output: 'text' | ||
}, function (err, art) { | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hi'); | ||
art.content.should.contain('hey'); | ||
art.content.should.contain('hello'); | ||
read.use(function () { | ||
this.reset() | ||
}); | ||
done(); | ||
}); | ||
}); | ||
}); | ||
it('regexps.div2p', function(done){ | ||
it('regexps.images', function(done){ | ||
var article = '<title>title</title>' + | ||
'<body>' + | ||
'<p>Text hello <img src="image" /> world</p>' + | ||
'</body>'; | ||
read.use(function(){ | ||
this.regexps.div2p(/<span/, true); | ||
this.regexps.images(/.*/, true); | ||
}); | ||
read(html, { | ||
output: 'text' | ||
read({ | ||
html: article, | ||
uri: 'http://github.com', | ||
output: 'html', | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain('hello'); | ||
art.content.should.contain('<img src="http://github.com/image">'); | ||
@@ -260,2 +296,2 @@ read.use(function(){ | ||
}); | ||
}); | ||
}); |
@@ -1,2 +0,1 @@ | ||
var read = require('../'), | ||
@@ -9,4 +8,4 @@ chai = require('chai'), | ||
describe('different options',function(){ | ||
before(function(){ | ||
describe('different options', function () { | ||
before(function () { | ||
uri = 'http://www.bing.com'; | ||
@@ -16,3 +15,3 @@ html = '<p>Hello, node-art</p>'; | ||
}); | ||
after(function(){ | ||
after(function () { | ||
uri = null; | ||
@@ -22,5 +21,7 @@ html = null; | ||
}); | ||
describe('have three arguments',function(){ | ||
it('should detect two options',function(done){ | ||
read(uri, { charset: charset }, function(err, art, options, resp){ | ||
describe('have three arguments', function () { | ||
it('should detect two options', function (done) { | ||
read(uri, { | ||
charset: charset | ||
}, function (err, art, options, resp) { | ||
should.not.exist(err); | ||
@@ -35,5 +36,5 @@ options.uri.should.be.equal(uri); | ||
describe('have two arguments(string, function)',function(){ | ||
it('should detect one options',function(done){ | ||
read(uri, function(err, art, options){ | ||
describe('have two arguments(string, function)', function () { | ||
it('should detect one options', function (done) { | ||
read(uri, function (err, art, options) { | ||
should.not.exist(err); | ||
@@ -47,5 +48,8 @@ options.uri.should.be.equal(uri); | ||
describe('have two arguments(object, function)',function(){ | ||
it('should detect two options',function(done){ | ||
read({ uri: uri, charset: charset }, function(err, art, options){ | ||
describe('have two arguments(object, function)', function () { | ||
it('should detect two options', function (done) { | ||
read({ | ||
uri: uri, | ||
charset: charset | ||
}, function (err, art, options) { | ||
should.not.exist(err); | ||
@@ -59,5 +63,8 @@ options.uri.should.be.equal(uri); | ||
describe('uri is passed in',function(){ | ||
it('should detect uri in options',function(done){ | ||
read({ uri: uri, charset: charset }, function(err, art, options){ | ||
describe('uri is passed in', function () { | ||
it('should detect uri in options', function (done) { | ||
read({ | ||
uri: uri, | ||
charset: charset | ||
}, function (err, art, options) { | ||
should.not.exist(err); | ||
@@ -70,5 +77,8 @@ options.uri.should.be.equal(uri); | ||
describe('uri is passed in, but treat as html',function(){ | ||
it('should detect html automatically',function(done){ | ||
read({ uri: html, charset: charset }, function(err, art, options){ | ||
describe('uri is passed in, but treat as html', function () { | ||
it('should detect html automatically', function (done) { | ||
read({ | ||
uri: html, | ||
charset: charset | ||
}, function (err, art, options) { | ||
should.not.exist(err); | ||
@@ -81,5 +91,8 @@ options.html.should.be.equal(html); | ||
describe('html is passed in',function(){ | ||
it('should detect html in options',function(done){ | ||
read({ uri: html, charset: charset }, function(err, art, options){ | ||
describe('html is passed in', function () { | ||
it('should detect html in options', function (done) { | ||
read({ | ||
uri: html, | ||
charset: charset | ||
}, function (err, art, options) { | ||
should.not.exist(err); | ||
@@ -91,2 +104,33 @@ options.html.should.be.equal(html); | ||
}); | ||
}); | ||
}); | ||
describe('minParagraphs option', function () { | ||
before(function () { | ||
html = '<title>read-art</title><body><div><div><p>hi, dude, i am <a href="/Tjatse/read-art.git">readability</a>, aka read-art...</p></div><span>footer</span></div></body>' | ||
}); | ||
describe('3 by default', function () { | ||
it('should find footer', function (done) { | ||
read({ | ||
minTextLength: 0, | ||
html: html | ||
}, function (err, art, options, resp) { | ||
should.not.exist(err); | ||
art.content.should.contain('footer'); | ||
done(); | ||
}); | ||
}); | ||
}); | ||
describe('0 by customized', function () { | ||
it('should find no footer', function (done) { | ||
read({ | ||
minTextLength: 0, | ||
minParagraphs: 0, | ||
html: html | ||
}, function (err, art, options, resp) { | ||
should.not.exist(err); | ||
art.content.should.not.contain('footer'); | ||
done(); | ||
}); | ||
}); | ||
}); | ||
}); |
92284
2212
481
Updated req-fast@^0.2.14