Comparing version 0.4.3 to 0.4.4
var read = require('../'); | ||
read('http://news.sohu.com/20151228/n432833902.shtml', { | ||
read('http://www.cqn.com.cn/auto/news/73572.html', { | ||
timeout : 15000, | ||
output : { | ||
type : 'json', | ||
stripSpaces: true, | ||
break: true | ||
}, | ||
minTextLength: 0, | ||
scoreRule: function(node){ | ||
if (node.attr('itemprop') == 'articleBody') { | ||
if (node.hasClass('w740')) { | ||
return 100; | ||
@@ -9,0 +14,0 @@ } |
@@ -0,1 +1,8 @@ | ||
# 2016/01/07 | ||
- feature: threshold | ||
- fixed: #5 #10 | ||
# 2016/01/06 | ||
- `imgFallback` option @entertainyou | ||
# 2015/12/29 | ||
@@ -2,0 +9,0 @@ - fix scoreRule on grandparent node |
14
index.js
@@ -31,9 +31,17 @@ "use strict"; | ||
} | ||
options = util._extend({ | ||
killBreaks : true, | ||
killBreaks: true, | ||
lowerCaseTags: true, | ||
output : 'html', | ||
minTextLength: 25 | ||
output: 'html', | ||
minTextLength: 25, | ||
thresholdLinkDensity: 0.25 | ||
}, options); | ||
var density = options.thresholdLinkDensity; | ||
if (!isFinite(density) || (density > 1 || density < 0)) { | ||
density = 0.25; | ||
} | ||
options.thresholdLinkDensity = density; | ||
// indicating uri is html or url. | ||
@@ -40,0 +48,0 @@ var isHTML = uri.match(/^\s*</); |
@@ -395,3 +395,15 @@ "use strict"; | ||
siblingScoreThreshold = Math.max(10, topCandidate.data(scoreKey) * 0.2), | ||
thresholdScoreType = typeof options.thresholdScore, | ||
parent, siblings; | ||
if (thresholdScoreType != 'undefined') { | ||
if (thresholdScoreType == 'number' && isFinite(options.thresholdScore)) { | ||
siblingScoreThreshold = options.thresholdScore; | ||
} else if (thresholdScoreType == 'function') { | ||
var score = options.thresholdScore(topCandidate, scoreKey); | ||
if (isFinite(score)) { | ||
siblingScoreThreshold = score; | ||
} | ||
} | ||
} | ||
if (topCandidate.children('p').length < 3 && (parent = topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body') { | ||
@@ -418,9 +430,9 @@ // 1. topCandidate has not enough [P] children | ||
var linkDensity = getLinkDensity($, node); | ||
if (textLen > 80 && linkDensity < 0.25) { | ||
if (textLen > options.minTextLength && linkDensity < options.thresholdLinkDensity) { | ||
append = true; | ||
} else if ((textLen < 80 && linkDensity == 0) || text.search(regexps.stopwords) !== -1) { | ||
} else if ((textLen < options.minTextLength && linkDensity == 0) || text.search(regexps.stopwords) !== -1) { | ||
// end with .|。 commas must be a paragraph. | ||
append = true; | ||
} | ||
} else if ((tagName == 'span' || tagName == 'font') && textLen > 0) { | ||
} else if (regexps.div2p.test('<' + tagName) && textLen > 0) { | ||
append = true; | ||
@@ -450,3 +462,3 @@ } | ||
// fix links | ||
fixLink($, options.uri, article); | ||
fixLink($, options.uri, article, options); | ||
return article; | ||
@@ -456,2 +468,25 @@ } | ||
/**
 * Resolve a substitute image source from the user's `imgFallback` option.
 *
 * The type of `fb` selects the lookup strategy:
 * - boolean: `true` reads `node.data('src')`, then `node.attr('data-src')`;
 *   `false` disables the fallback.
 * - function: invoked with `node`; its return value is used.
 * - string: an attribute name. Names starting with `data-` (case-insensitive)
 *   are read through `node.data()` with the prefix removed, any other name
 *   through `node.attr()`.
 * - any other type: no fallback.
 *
 * The result is normalised because `fixLink` calls `.search()` on it: strings
 * pass through unchanged, finite numbers are stringified, everything else
 * (undefined, objects, booleans, NaN) becomes `null`.
 *
 * @param {Mixed} fb the `imgFallback` option value.
 * @param {Cheerio} node the image node being fixed.
 * @return {String|null} the fallback source, or `null` when none is available.
 */
function parseImgFallback(fb, node) {
  var value;
  switch (typeof fb) {
    case 'boolean':
      value = fb ? (node.data('src') || node.attr('data-src')) : null;
      break;
    case 'function':
      value = fb(node);
      break;
    case 'string':
      // `.data()` is keyed without the `data-` prefix, so strip it first.
      value = /^data-/i.test(fb) ? node.data(fb.slice(5)) : node.attr(fb);
      break;
    default:
      value = null;
  }

  if (typeof value === 'string') {
    return value;
  }
  // NOTE(review): `.data()` presumably may hand back coerced numbers for
  // numeric-looking attributes - confirm against the cheerio version in use.
  if (typeof value === 'number' && isFinite(value)) {
    return String(value);
  }
  return null;
}
/** | ||
* Fix link of image and video objects. | ||
@@ -461,4 +496,5 @@ * @param $ dom | ||
* @param article the article object. | ||
* @param options | ||
*/ | ||
function fixLink($, origin, article){ | ||
function fixLink($, origin, article, options){ | ||
if(!origin){ | ||
@@ -478,2 +514,5 @@ return; | ||
propVal = node.attr(propName = 'src'); | ||
if(!propVal && typeof options.imgFallback != 'undefined'){ | ||
propVal = parseImgFallback(options.imgFallback, node); | ||
} | ||
isValid = propVal && (propVal.search(regexps.images) !== -1); | ||
@@ -504,4 +543,5 @@ break; | ||
if (propURI.is('relative')) { | ||
node.attr(propName, propURI.absoluteTo(origin).href()); | ||
propVal = propURI.absoluteTo(origin).href(); | ||
} | ||
node.attr(propName, propVal); | ||
} | ||
@@ -546,3 +586,5 @@ }); | ||
if (grandParent && grandParent.length > 0) { | ||
scoreNode(grandParent, score / 2, cans, options.scoreRule); | ||
var dampedScore = score * (isFinite(options.damping) ? options.damping : (1 / 2)); | ||
dampedScore = isFinite(dampedScore) ? dampedScore : 0; | ||
scoreNode(grandParent, dampedScore, cans, options.scoreRule); | ||
} | ||
@@ -549,0 +591,0 @@ } |
{ | ||
"name": "read-art", | ||
"version": "0.4.3", | ||
"version": "0.4.4", | ||
"description": "Scrape/Crawl article from any site automatically. Make any web page readable, no matter Chinese or English.", | ||
@@ -33,3 +33,3 @@ "main": "index.js", | ||
"cheerio": "~0.19.0", | ||
"req-fast": "^0.2.9", | ||
"req-fast": "^0.2.13", | ||
"urijs": "~1.17.0", | ||
@@ -36,0 +36,0 @@ "entities": "~1.1.1" |
@@ -18,2 +18,4 @@ read-art [](http://badge.fury.io/js/read-art) [](https://travis-ci.org/Tjatse/node-readability) | ||
- [Extract Selectors](#selectors) | ||
- [Image Fallback](#imgfallback) | ||
- [Threshold](#threshold) | ||
- [Customize Settings](#cus_sets) | ||
@@ -62,12 +64,15 @@ - [Output](#output) | ||
- **output** The data type of article content, head over to [Output](#output) to get more information. | ||
- **killBreaks** A value indicating whether kill breaks, blanks, tab symbols(\r\t\n) into one `<br />` or not, `true` by default. | ||
- **killBreaks** A value indicating whether or not to collapse breaks, blanks and tab symbols (\r\t\n) into one `<br />`, `true` by default. | ||
- **minTextLength** If the content is less than `[minTextLength]` characters, don't even count it, `25` by default. | ||
- **tidyAttrs** Remove all the attributes on elements, `false` by default. | ||
- **dom** Will return the whole cheerio dom when this property is set to `true`, `false` by default, try to use `art.dom` to get the dom object in callback function. | ||
- **damping** The damping to calculate score of parent node, `1/2` by default. e.g.: the score of current document node is `20`, the score of parent will be `20 * damping`. | ||
- **scoreRule** Customize the score rules of each node, one argument will be passed into the callback function, [read more](#score_rule). | ||
- **selectors** Customize the data extract [selectors](#selectors). | ||
- **imgFallback** Customize the way to get source of image, [read more](#imgfallback). | ||
- **thresholdScore** A number/function that determines whether or not to drop the article content, [read more](#threshold_score). | ||
- **thresholdLinkDensity** A `0~1` decimal that determines whether or not to drop the article content, [read more](#threshold_linkdensity). | ||
- **options from [cheerio](https://github.com/cheeriojs/cheerio)** | ||
- **options from [req-fast](https://github.com/Tjatse/req-fast)** | ||
- **scoreRule** Customize the score rules of each node, one arguments will be passed into the callback function (head over to [Score Rule](#score_rule) to get more information): | ||
- **node** The [cheerio object](https://github.com/cheeriojs/cheerio#selectors). | ||
- **selectors** Customize the data extract [selectors](#selectors). | ||
* **callback** The callback to run - `callback(error, article, options, response)`, arguments are: | ||
* **callback** Fire after the article has been crawled - `callback(error, article, options, response)`, arguments are: | ||
- **error** `Error` object when exception has been caught. | ||
@@ -136,3 +141,4 @@ - **article** The article object, including: `article.title`, `article.content` and `article.html`. | ||
<a name="score_rule_eg" /> | ||
**node** The [cheerio object](https://github.com/cheeriojs/cheerio#selectors). | ||
### Example | ||
@@ -187,2 +193,72 @@ ```javascript | ||
<a name="imgfallback" /> | ||
## Image Fallback | ||
Should be one of following types: | ||
- **Boolean** Fallback to `img.src = (node.data('src') || node.attr('data-src'))` when set to `true`. | ||
- **String** Customize the attribute name, it will take `node.attr([imgFallback])` as `src` of `img`. | ||
- **Function** Give users maximum customizability and scalability of source attribute on `img`, e.g.: | ||
```javascript | ||
imgFallback: function(node){ | ||
return node.attr('base') + '/' + node.attr('rel-path'); | ||
} | ||
``` | ||
### Example | ||
```javascript | ||
read({ | ||
imgFallback: true | ||
}, function(err, art){}); | ||
read({ | ||
imgFallback: 'the-src-attr' | ||
}, function(err, art){}); | ||
read({ | ||
imgFallback: function(node){ | ||
return 'http://img-serv/' + node.attr('relative-path'); | ||
} | ||
}, function(err, art){}); | ||
``` | ||
<a name="threshold" /> | ||
## Threshold | ||
Customize the threshold of anchors and nodes' scores. | ||
<a name="threshold_score" /> | ||
### Score | ||
The `thresholdScore` is a threshold number used to decide whether or not to append children of the top candidate directly (skipping the deeper tag/text/link density checks). It should be one of the following types: | ||
- **Number** A finite number. | ||
- **Function** Calculate the threshold score yourself, two arguments are passed in: | ||
- *node* The top candidate (most likely the article DOM). | ||
- *scoreKey* The data key used to store the score, you can get the score by `node.data(scoreKey)`. | ||
After `read-art` has found the top candidate, it starts to analyze the children of the top candidate; if the score of the current child is greater than `thresholdScore`, the child will be appended to the article body directly. | ||
`Math.max(10, topCandidate.data(scoreKey) * 0.2)` by default. | ||
#### Example | ||
```javascript | ||
read({ | ||
thresholdScore: 20 | ||
}, function(err, art){}); | ||
read({ | ||
thresholdScore: function(node, scoreKey){ | ||
return Math.max(10, node.data(scoreKey) * 0.2); | ||
} | ||
}, function(err, art){}); | ||
``` | ||
<a name="threshold_linkdensity" /> | ||
### Link Density | ||
`thresholdLinkDensity` is used to identify whether the current child of the top candidate is a `navigator`, `ad` or `relative-list`, `0.25` by default, so if the text length of the anchors in the current child divided by the text length of the top candidate is greater than `thresholdLinkDensity`, the child will be discarded. | ||
#### Example | ||
```javascript | ||
read({ | ||
thresholdLinkDensity: 0.25 | ||
}, function(err, art){}); | ||
``` | ||
<a name="cus_sets" /> | ||
@@ -237,3 +313,2 @@ ## Customize Settings | ||
<a name="cus_sets_eg" /> | ||
### Example | ||
@@ -258,7 +333,6 @@ ```javascript | ||
- **stripSpaces** | ||
A value indicates whether strip the tab symbols (\r\n\t) or not, `false` by default. | ||
A value indicating whether or not to strip the tab symbols (\r\n\t), `false` by default. | ||
- **break** | ||
A value indicates whether split content into paragraphs by `<br />` (Only affects JSON output). | ||
A value indicating whether or not to split content into paragraphs by `<br />` (Only affects JSON output). | ||
<a name="output_text" /> | ||
### text | ||
@@ -283,3 +357,2 @@ Returns the inner text, e.g.: | ||
<a name="output_html" /> | ||
### html | ||
@@ -306,3 +379,2 @@ Returns the inner HTML, e.g.: | ||
<a name="output_json" /> | ||
### json | ||
@@ -338,3 +410,2 @@ Returns the restful result, e.g.: | ||
<a name="output_cheerio" /> | ||
### cheerio | ||
@@ -341,0 +412,0 @@ Returns the cheerio node, e.g.: |
@@ -35,2 +35,205 @@ var read = require('../'), | ||
describe('image fallback option',function(){ | ||
describe('as boolean', function() { | ||
describe('true', function() { | ||
it('fallback to relative data-src if imgFallback option is true',function(done){ | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" />, aka read-art...</p></div></body>', | ||
imgFallback: true | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
it('fallback to absolute data-src if imgFallback option is true',function(done){ | ||
read({ | ||
uri: 'http://example.com/', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="http://github.com/path/to/foo.png" />, aka read-art...</p></div></body>', | ||
imgFallback: true | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
it('not fallback to data-src if imgFallback option is true and src exists',function(done){ | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" src="/path/to/bar.png" />, aka read-art...</p></div></body>', | ||
imgFallback: true | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/bar.png"'); | ||
art.content.should.not.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
}); | ||
describe('false', function() { | ||
it('remove node if fallback does not work and neither src',function(done){ | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" />, aka read-art...</p></div></body>', | ||
imgFallback: false | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.not.contain('<img'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
}); | ||
}); | ||
describe('as string', function() { | ||
it('fallback to imgFallback attr (data-*)',function(done){ | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-foo="/path/to/foo.png" />, aka read-art...</p></div></body>', | ||
imgFallback: 'data-foo' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
it('fallback to imgFallback attr !(data-*)',function(done){ | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img foo-bar="/path/to/foo.png" />, aka read-art...</p></div></body>', | ||
imgFallback: 'foo-bar' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
it('not fallback to imgFallback attr if src exists',function(done){ | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img foo-bar="/path/to/foo.png" src="/path/to/bar.png" />, aka read-art...</p></div></body>', | ||
imgFallback: 'foo-bar' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/bar.png"'); | ||
art.content.should.not.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
it('remove node if fallback does not work and neither src',function(done){ | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" />, aka read-art...</p></div></body>', | ||
imgFallback: 'data-src1' | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.not.contain('<img'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
}); | ||
describe('as function', function () { | ||
it('fallback to imgFallback result attr', function(done) { | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-image-dir="/path/to/" thumbnail="foo.png" />, aka read-art...</p></div></body>', | ||
imgFallback: function (node) { | ||
arguments.should.have.length(1); | ||
return node.data('image-dir') + node.attr('thumbnail'); | ||
} | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
it('fallback to imgFallback result attr', function(done) { | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-image-dir="/path/to/" thumbnail="foo.png" />, aka read-art...</p></div></body>', | ||
imgFallback: function (node) { | ||
arguments.should.have.length(1); | ||
return node.data('image-dir') + node.attr('thumbnail'); | ||
} | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
it('not fallback to imgFallback result attr if src exists', function(done) { | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-image-dir="/path/to/" thumbnail="foo.png" src="/path/to/bar.png" />, aka read-art...</p></div></body>', | ||
imgFallback: function (node) { | ||
arguments.should.have.length(1); | ||
return node.data('image-dir') + node.attr('thumbnail'); | ||
}, | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/bar.png"'); | ||
art.content.should.not.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
it('fallback to imgFallback result attr if src exists', function(done) { | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-image-dir="/path/to/" thumbnail="foo.png" src="/path/to/bar.png" />, aka read-art...</p></div></body>', | ||
imgFallback: function (node) { | ||
arguments.should.have.length(1); | ||
return node.data('image-dir') + node.attr('thumbnail'); | ||
} | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.contain(' src="http://github.com/path/to/bar.png"'); | ||
art.content.should.not.contain(' src="http://github.com/path/to/foo.png"'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
it('remove node if fallback does not work and neither src',function(done){ | ||
read({ | ||
uri: 'http://github.com/Tjatse', | ||
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" />, aka read-art...</p></div></body>', | ||
imgFallback: function (node) { | ||
arguments.should.have.length(1); | ||
} | ||
}, function(err, art){ | ||
should.not.exist(err); | ||
expect(art).to.be.an('object'); | ||
art.content.should.not.contain('<img'); | ||
art.title.should.equal('read-art'); | ||
done(); | ||
}); | ||
}); | ||
}); | ||
}); | ||
}); |
88847
23
2131
477
Updatedreq-fast@^0.2.13