New Case Study:See how Anthropic automated 95% of dependency reviews with Socket.Learn More
Socket
Sign inDemoInstall
Socket

read-art

Package Overview
Dependencies
Maintainers
1
Versions
66
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

read-art - npm Package Compare versions

Comparing version 0.4.3 to 0.4.4

test/damping.js

9

examples/simple.js
var read = require('../');
read('http://news.sohu.com/20151228/n432833902.shtml', {
read('http://www.cqn.com.cn/auto/news/73572.html', {
timeout : 15000,
output : {
type : 'json',
stripSpaces: true,
break: true
},
minTextLength: 0,
scoreRule: function(node){
if (node.attr('itemprop') == 'articleBody') {
if (node.hasClass('w740')) {
return 100;

@@ -9,0 +14,0 @@ }

@@ -0,1 +1,8 @@

# 2016/01/07
- feature: threshold
- fixed: #5 #10
# 2016/01/06
- `imgFallback` option @entertainyou
# 2015/12/29

@@ -2,0 +9,0 @@ - fix scoreRule on grandparent node

@@ -31,9 +31,17 @@ "use strict";

}
options = util._extend({
killBreaks : true,
killBreaks: true,
lowerCaseTags: true,
output : 'html',
minTextLength: 25
output: 'html',
minTextLength: 25,
thresholdLinkDensity: 0.25
}, options);
var density = options.thresholdLinkDensity;
if (!isFinite(density) || (density > 1 || density < 0)) {
density = 0.25;
}
options.thresholdLinkDensity = density;
// indicating uri is html or url.

@@ -40,0 +48,0 @@ var isHTML = uri.match(/^\s*</);

@@ -395,3 +395,15 @@ "use strict";

siblingScoreThreshold = Math.max(10, topCandidate.data(scoreKey) * 0.2),
thresholdScoreType = typeof options.thresholdScore,
parent, siblings;
if (thresholdScoreType != 'undefined') {
if (thresholdScoreType == 'number' && isFinite(options.thresholdScore)) {
siblingScoreThreshold = options.thresholdScore;
} else if (thresholdScoreType == 'function') {
var score = options.thresholdScore(topCandidate, scoreKey);
if (isFinite(score)) {
siblingScoreThreshold = score;
}
}
}
if (topCandidate.children('p').length < 3 && (parent = topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body') {

@@ -418,9 +430,9 @@ // 1. topCandidate has not enough [P] children

var linkDensity = getLinkDensity($, node);
if (textLen > 80 && linkDensity < 0.25) {
if (textLen > options.minTextLength && linkDensity < options.thresholdLinkDensity) {
append = true;
} else if ((textLen < 80 && linkDensity == 0) || text.search(regexps.stopwords) !== -1) {
} else if ((textLen < options.minTextLength && linkDensity == 0) || text.search(regexps.stopwords) !== -1) {
// end with .|。 commas must be a paragraph.
append = true;
}
} else if ((tagName == 'span' || tagName == 'font') && textLen > 0) {
} else if (regexps.div2p.test('<' + tagName) && textLen > 0) {
append = true;

@@ -450,3 +462,3 @@ }

// fix links
fixLink($, options.uri, article);
fixLink($, options.uri, article, options);
return article;

@@ -456,2 +468,25 @@ }

/**
 * Resolve a replacement image source from the user-defined `imgFallback` option.
 *
 * Supported option types:
 *  - Boolean:  `true` reads `node.data('src')`, then `node.attr('data-src')`;
 *              `false` disables the fallback.
 *  - Function: called with the node; its return value is used.
 *  - String:   an attribute name. Names starting with `data-` are read through
 *              `node.data()` (without the prefix), others through `node.attr()`.
 *
 * The result is always normalized to a string or `null`: `node.data()` may hand
 * back coerced values (numbers, booleans, parsed objects) and a user callback may
 * return anything, while the caller runs string methods on the result.
 *
 * @param {Mixed} fb the `imgFallback` option value.
 * @param {Cheerio} node the image node being fixed.
 * @return {String|null} the fallback source, or `null` when none is available.
 */
function parseImgFallback(fb, node) {
  var value = null;
  switch (typeof fb) {
    case 'boolean':
      if (fb) {
        value = node.data('src') || node.attr('data-src');
      }
      break;
    case 'function':
      value = fb(node);
      break;
    case 'string':
      // `data-foo` is looked up as data key `foo`; anything else is a plain attribute.
      value = /^data-/i.test(fb) ? node.data(fb.slice(5)) : node.attr(fb);
      break;
    default:
      break;
  }
  switch (typeof value) {
    case 'string':
      return value;
    case 'number':
    case 'boolean':
      // Undo the coercion applied to data attributes so the caller gets text back.
      return String(value);
    default:
      // undefined, null, objects, functions: nothing usable as an image source.
      return null;
  }
}
/**
* Fix link of image and video objects.

@@ -461,4 +496,5 @@ * @param $ dom

* @param article the article object.
* @param options
*/
function fixLink($, origin, article){
function fixLink($, origin, article, options){
if(!origin){

@@ -478,2 +514,5 @@ return;

propVal = node.attr(propName = 'src');
if(!propVal && typeof options.imgFallback != 'undefined'){
propVal = parseImgFallback(options.imgFallback, node);
}
isValid = propVal && (propVal.search(regexps.images) !== -1);

@@ -504,4 +543,5 @@ break;

if (propURI.is('relative')) {
node.attr(propName, propURI.absoluteTo(origin).href());
propVal = propURI.absoluteTo(origin).href();
}
node.attr(propName, propVal);
}

@@ -546,3 +586,5 @@ });

if (grandParent && grandParent.length > 0) {
scoreNode(grandParent, score / 2, cans, options.scoreRule);
var dampedScore = score * (isFinite(options.damping) ? options.damping : (1 / 2));
dampedScore = isFinite(dampedScore) ? dampedScore : 0;
scoreNode(grandParent, dampedScore, cans, options.scoreRule);
}

@@ -549,0 +591,0 @@ }

{
"name": "read-art",
"version": "0.4.3",
"version": "0.4.4",
"description": "Scrape/Crawl article from any site automatically. Make any web page readable, no matter Chinese or English.",

@@ -33,3 +33,3 @@ "main": "index.js",

"cheerio": "~0.19.0",
"req-fast": "^0.2.9",
"req-fast": "^0.2.13",
"urijs": "~1.17.0",

@@ -36,0 +36,0 @@ "entities": "~1.1.1"

@@ -18,2 +18,4 @@ read-art [![NPM version](https://badge.fury.io/js/read-art.svg)](http://badge.fury.io/js/read-art) [![Build Status](https://travis-ci.org/Tjatse/node-readability.svg?branch=master)](https://travis-ci.org/Tjatse/node-readability)

- [Extract Selectors](#selectors)
- [Image Fallback](#imgfallback)
- [Threshold](#threshold)
- [Customize Settings](#cus_sets)

@@ -62,12 +64,15 @@ - [Output](#output)

- **output** The data type of article content, head over to [Output](#output) to get more information.
- **killBreaks** A value indicating whether kill breaks, blanks, tab symbols(\r\t\n) into one `<br />` or not, `true` by default.
- **killBreaks** A value indicating whether or not to collapse line breaks, blanks and tab symbols (\r\t\n) into a single `<br />`, `true` by default.
- **minTextLength** If the content is less than `[minTextLength]` characters, don't even count it, `25` by default.
- **tidyAttrs** Remove all the attributes on elements, `false` by default.
- **dom** Will return the whole cheerio dom when this property is set to `true`, `false` by default, try to use `art.dom` to get the dom object in callback function.
- **damping** The damping to calculate score of parent node, `1/2` by default. e.g.: the score of current document node is `20`, the score of parent will be `20 * damping`.
- **scoreRule** Customize the score rules of each node; one argument will be passed into the callback function, [read more](#score_rule).
- **selectors** Customize the data extract [selectors](#selectors).
- **imgFallback** Customize the way to get source of image, [read more](#imgfallback).
- **thresholdScore** A number or function that determines whether or not to drop the article content, [read more](#threshold_score).
- **thresholdLinkDensity** A decimal between `0` and `1` that determines whether or not to drop the article content, [read more](#threshold_linkdensity).
- **options from [cheerio](https://github.com/cheeriojs/cheerio)**
- **options from [req-fast](https://github.com/Tjatse/req-fast)**
- **scoreRule** Customize the score rules of each node, one arguments will be passed into the callback function (head over to [Score Rule](#score_rule) to get more information):
- **node** The [cheerio object](https://github.com/cheeriojs/cheerio#selectors).
- **selectors** Customize the data extract [selectors](#selectors).
* **callback** The callback to run - `callback(error, article, options, response)`, arguments are:
* **callback** Fired after the article has been crawled - `callback(error, article, options, response)`, arguments are:
- **error** `Error` object when exception has been caught.

@@ -136,3 +141,4 @@ - **article** The article object, including: `article.title`, `article.content` and `article.html`.

<a name="score_rule_eg" />
**node** The [cheerio object](https://github.com/cheeriojs/cheerio#selectors).
### Example

@@ -187,2 +193,72 @@ ```javascript

<a name="imgfallback" />
## Image Fallback
Should be one of the following types:
- **Boolean** Fallback to `img.src = (node.data('src') || node.attr('data-src'))` when set to `true`.
- **String** Customize the attribute name, it will take `node.attr([imgFallback])` as `src` of `img`.
- **Function** Give users maximum customizability and scalability of source attribute on `img`, e.g.:
```javascript
imgFallback: function(node){
return node.attr('base') + '/' + node.attr('rel-path');
}
```
### Example
```javascript
read({
imgFallback: true
}, function(err, art){});
read({
imgFallback: 'the-src-attr'
}, function(err, art){});
read({
imgFallback: function(node){
return 'http://img-serv/' + node.attr('relative-path');
}
}, function(err, art){});
```
<a name="threshold" />
## Threshold
Customize the threshold of anchors and nodes' scores.
<a name="threshold_score" />
### Score
The `thresholdScore` is a threshold number used to decide whether or not to discard children of the top candidate directly (skipping the deeper tag/text/link density checks). It should be one of the following types:
- **Number** A finite number.
- **Function** Calculate the threshold score yourself; two arguments are passed in:
- *node* The top candidate (most likely the article DOM).
- *scoreKey* The data key under which the score is stored; you can get the score with `node.data(scoreKey)`.
After `read-art` has found the top candidate, it starts to analyze the children of the top candidate; if the score of the current child is greater than `thresholdScore`, the child will be appended to the article body directly.
`Math.max(10, topCandidate.data(scoreKey) * 0.2)` by default.
#### Example
```javascript
read({
thresholdScore: 20
}, function(err, art){});
read({
thresholdScore: function(node, scoreKey){
return Math.max(10, node.data(scoreKey) * 0.2);
}
}, function(err, art){});
```
<a name="threshold_linkdensity" />
### Link Density
`thresholdLinkDensity` is used to identify whether the current child of the top candidate is a `navigator`, `ad` or `relative-list`, `0.25` by default. If the text length of the anchors in the current child divided by the text length of the top candidate is greater than `thresholdLinkDensity`, the child will be discarded.
#### Example
```javascript
read({
thresholdLinkDensity: 0.25
}, function(err, art){});
```
<a name="cus_sets" />

@@ -237,3 +313,2 @@ ## Customize Settings

<a name="cus_sets_eg" />
### Example

@@ -258,7 +333,6 @@ ```javascript

- **stripSpaces**
A value indicates whether strip the tab symbols (\r\n\t) or not, `false` by default.
A value indicating whether or not to strip the tab symbols (\r\n\t), `false` by default.
- **break**
A value indicates whether split content into paragraphs by `<br />` (Only affects JSON output).
A value indicating whether or not to split content into paragraphs by `<br />` (only affects JSON output).
<a name="output_text" />
### text

@@ -283,3 +357,2 @@ Returns the inner text, e.g.:

<a name="output_html" />
### html

@@ -306,3 +379,2 @@ Returns the inner HTML, e.g.:

<a name="output_json" />
### json

@@ -338,3 +410,2 @@ Returns the restful result, e.g.:

<a name="output_cheerio" />
### cheerio

@@ -341,0 +412,0 @@ Returns the cheerio node, e.g.:

@@ -35,2 +35,205 @@ var read = require('../'),

describe('image fallback option',function(){
describe('as boolean', function() {
describe('true', function() {
it('fallback to relative data-src if imgFallback option is true',function(done){
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" />, aka read-art...</p></div></body>',
imgFallback: true
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain(' src="http://github.com/path/to/foo.png"');
art.title.should.equal('read-art');
done();
});
});
it('fallback to absolute data-src if imgFallback option is true',function(done){
read({
uri: 'http://example.com/',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="http://github.com/path/to/foo.png" />, aka read-art...</p></div></body>',
imgFallback: true
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain(' src="http://github.com/path/to/foo.png"');
art.title.should.equal('read-art');
done();
});
});
it('not fallback to data-src if imgFallback option is true and src exists',function(done){
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" src="/path/to/bar.png" />, aka read-art...</p></div></body>',
imgFallback: true
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain(' src="http://github.com/path/to/bar.png"');
art.content.should.not.contain(' src="http://github.com/path/to/foo.png"');
art.title.should.equal('read-art');
done();
});
});
});
describe('false', function() {
it('remove node if fallback does not work and neither src',function(done){
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" />, aka read-art...</p></div></body>',
imgFallback: false
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.not.contain('<img');
art.title.should.equal('read-art');
done();
});
});
});
});
describe('as string', function() {
it('fallback to imgFallback attr (data-*)',function(done){
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-foo="/path/to/foo.png" />, aka read-art...</p></div></body>',
imgFallback: 'data-foo'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain(' src="http://github.com/path/to/foo.png"');
art.title.should.equal('read-art');
done();
});
});
it('fallback to imgFallback attr !(data-*)',function(done){
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img foo-bar="/path/to/foo.png" />, aka read-art...</p></div></body>',
imgFallback: 'foo-bar'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain(' src="http://github.com/path/to/foo.png"');
art.title.should.equal('read-art');
done();
});
});
it('not fallback to imgFallback attr if src exists',function(done){
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img foo-bar="/path/to/foo.png" src="/path/to/bar.png" />, aka read-art...</p></div></body>',
imgFallback: 'foo-bar'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain(' src="http://github.com/path/to/bar.png"');
art.content.should.not.contain(' src="http://github.com/path/to/foo.png"');
art.title.should.equal('read-art');
done();
});
});
it('remove node if fallback does not work and neither src',function(done){
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" />, aka read-art...</p></div></body>',
imgFallback: 'data-src1'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.not.contain('<img');
art.title.should.equal('read-art');
done();
});
});
});
describe('as function', function () {
it('fallback to imgFallback result attr', function(done) {
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-image-dir="/path/to/" thumbnail="foo.png" />, aka read-art...</p></div></body>',
imgFallback: function (node) {
arguments.should.have.length(1);
return node.data('image-dir') + node.attr('thumbnail');
}
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain(' src="http://github.com/path/to/foo.png"');
art.title.should.equal('read-art');
done();
});
});
// NOTE(review): this case has the same title, input HTML, callback and
// assertions as the test immediately before it, so it adds no extra
// coverage — consider removing it or changing it to cover a different input.
it('fallback to imgFallback result attr', function(done) {
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-image-dir="/path/to/" thumbnail="foo.png" />, aka read-art...</p></div></body>',
imgFallback: function (node) {
// The fallback callback receives exactly one argument: the image node.
arguments.should.have.length(1);
return node.data('image-dir') + node.attr('thumbnail');
}
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
// The relative path built by the callback is resolved against `uri`.
art.content.should.contain(' src="http://github.com/path/to/foo.png"');
art.title.should.equal('read-art');
done();
});
});
it('not fallback to imgFallback result attr if src exists', function(done) {
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-image-dir="/path/to/" thumbnail="foo.png" src="/path/to/bar.png" />, aka read-art...</p></div></body>',
imgFallback: function (node) {
arguments.should.have.length(1);
return node.data('image-dir') + node.attr('thumbnail');
},
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain(' src="http://github.com/path/to/bar.png"');
art.content.should.not.contain(' src="http://github.com/path/to/foo.png"');
art.title.should.equal('read-art');
done();
});
});
// The image already has a `src`, so the fallback must be skipped entirely:
// the callback is never invoked and the original `src` is kept (and made absolute).
it('does not invoke imgFallback function if src exists', function(done) {
  var fallbackCalled = false;
  read({
    uri: 'http://github.com/Tjatse',
    html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-image-dir="/path/to/" thumbnail="foo.png" src="/path/to/bar.png" />, aka read-art...</p></div></body>',
    imgFallback: function (node) {
      fallbackCalled = true;
      return node.data('image-dir') + node.attr('thumbnail');
    }
  }, function(err, art){
    should.not.exist(err);
    expect(art).to.be.an('object');
    fallbackCalled.should.equal(false);
    art.content.should.contain(' src="http://github.com/path/to/bar.png"');
    art.content.should.not.contain(' src="http://github.com/path/to/foo.png"');
    art.title.should.equal('read-art');
    done();
  });
});
it('remove node if fallback does not work and neither src',function(done){
read({
uri: 'http://github.com/Tjatse',
html: '<title>read-art</title><body><div><p>hi, dude, i am <img data-src="/path/to/foo.png" />, aka read-art...</p></div></body>',
imgFallback: function (node) {
arguments.should.have.length(1);
}
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.not.contain('<img');
art.title.should.equal('read-art');
done();
});
});
});
});
});
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc