New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign inDemoInstall
Socket

read-art

Package Overview
Dependencies
Maintainers
1
Versions
66
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

read-art - npm Package Compare versions

Comparing version 0.4.4 to 0.4.5

22

examples/simple.js
var read = require('../');
var URI = require('urijs');
read('http://www.cqn.com.cn/auto/news/73572.html', {
//var uri = 'http://news.hexun.com/2016-01-14/181791008.html';
var uri = 'http://www.kaixian.tv/gd/2016/0114/724631.html';
//var uri = 'http://www.ntjoy.com/news/vod/xwsph/nttv1/csrl/2016/01/2016-01-14461427.html';
read(uri, {
timeout : 15000,
output : {
type : 'json',
stripSpaces: true,
break: true
minTextLength: 0,
minParagraphs: 0,
selectors: {
content: '.contxt'
},
minTextLength: 0,
scoreRule: function(node){
if (node.hasClass('w740')) {
return 100;
}
return 0;
}
output: 'text'
}, function(err, art, options, resp){

@@ -18,0 +16,0 @@ if (err) {

@@ -0,1 +1,5 @@

# 2016/01/14
- `minParagraphs` option.
- Make images regexp extendable. #14@entertainyou
# 2016/01/07

@@ -2,0 +6,0 @@ - feature: threshold

@@ -37,3 +37,4 @@ "use strict";

minTextLength: 25,
thresholdLinkDensity: 0.25
thresholdLinkDensity: 0.25,
minParagraphs: 3
}, options);

@@ -47,2 +48,6 @@

if (!isFinite(options.minParagraphs)) {
options.minParagraphs = 3;
}
// indicating uri is html or url.

@@ -49,0 +54,0 @@ var isHTML = uri.match(/^\s*</);

@@ -13,3 +13,2 @@ "use strict";

videos : /(youtube|vimeo|youku|tudou|56|letv|iqiyi|sohu|sina|163)\.(com|com\.cn|cn|net)/i,
images : /\.(gif|jpe?g|png)$/i,
commas : /[,,.。;;??、]/g

@@ -22,3 +21,4 @@ },

maybe : /and|article|body|column|main|column/i,
div2p : /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i
div2p : /<(a|blockquote|dl|div|img|ol|p|pre|table|ul|span|font)/i,
images : /\.(gif|jpe?g|png)$/i
},

@@ -54,3 +54,2 @@ tagsToSkip = '';

var topCandidate = getTopCandidate($, cans);
// 3rd. grab article

@@ -411,3 +410,3 @@ if (topCandidate && topCandidate.length > 0) {

}
if (topCandidate.children('p').length < 3 && (parent = topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body') {
if (topCandidate.children('p').length < options.minParagraphs && (parent = topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body') {
// 1. topCandidate has not enough [P] children

@@ -430,3 +429,2 @@ // 2. parent exist and not [BODY]

textLen = text.length;
if (tagName == 'p') {

@@ -433,0 +431,0 @@ var linkDensity = getLinkDensity($, node);

{
"name": "read-art",
"version": "0.4.4",
"version": "0.4.5",
"description": "Scrape/Crawl article from any site automatically. Make any web page readable, no matter Chinese or English.",

@@ -33,3 +33,3 @@ "main": "index.js",

"cheerio": "~0.19.0",
"req-fast": "^0.2.13",
"req-fast": "^0.2.14",
"urijs": "~1.17.0",

@@ -36,0 +36,0 @@ "entities": "~1.1.1"

@@ -65,2 +65,3 @@ read-art [![NPM version](https://badge.fury.io/js/read-art.svg)](http://badge.fury.io/js/read-art) [![Build Status](https://travis-ci.org/Tjatse/node-readability.svg?branch=master)](https://travis-ci.org/Tjatse/node-readability)

- **minTextLength** If the content is less than `[minTextLength]` characters, don't even count it, `25` by default.
- **minParagraphs** The minimum number of `<p>` children the top candidate must have to be taken as the article, `3` by default. I.e.: if the `topCandidate` DOM has at least `3` `<p>` children, `topCandidate` is considered the article DOM; otherwise the parent of `topCandidate` is used instead (unless that parent is `<body>`).
- **tidyAttrs** Remove all the attributes on elements, `false` by default.

@@ -310,2 +311,5 @@ - **dom** Will return the whole cheerio dom when this property is set to `true`, `false` by default, try to use `art.dom` to get the dom object in callback function.

- `this.regexps.images([re], [override])`
If the `images` regexp matches the `src` attribute of a node, the node is kept as a normal `img`; otherwise it is dropped. `[re]` is a regexp, e.g. `/\.(gif|jpe?g|png)$/i` matches an image whose `src` looks like `/path/to/foo.jpg`. If `[override]` is set to `true`, `readart.regexps.images` is replaced by `[re]`; otherwise `[re]` is appended to the original.
### Example

@@ -312,0 +316,0 @@ ```javascript

@@ -1,27 +0,30 @@

var read = require('../'),
chai = require('chai'),
expect = chai.expect,
should = chai.should();
var read = require('../'),
chai = require('chai'),
expect = chai.expect,
should = chai.should();
var html = '';
describe('custom settings', function(){
before(function(){
describe('custom settings', function () {
before(function () {
html = '<title>read-art</title>' +
'<body>' +
'<div class="dv1"><p class="p1">hi, dude, I am readability (<b>aka read-art</b>)<foot>foot</foot></p></div>' +
'<div class="dv2"><div class="p2">hey, dude, I am readability too</div></div>' +
'<div class="dv3"><span></span>hello, dude, I am readability too.</div>' +
'</body>';
'<body>' +
'<div>' +
'<div class="dv1"><p class="p1">hi, dude, I am readability (<b>aka read-art</b>)<foot>foot</foot></p></div>' +
'<div class="dv2"><div class="p2">hey, dude, I am readability too</div></div>' +
'<div class="dv3"><foo>hello</foo>, dude, I am readability too.</div>' +
'</div>' +
'</body>';
});
after(function(){
after(function () {
html = '';
});
describe('do not exist', function(){
it('should works fine', function(done){
describe('by default', function () {
it('should skip nothing', function (done) {
read(html, {
output: 'text'
}, function(err, art){
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hi');
art.content.should.contain('foot');
art.content.should.contain('aka read-art');
done();

@@ -31,5 +34,5 @@ });

});
describe('with appending', function(){
it('skip <b>', function(done){
read.use(function(){
describe('skipTags', function () {
it('skip <b>', function (done) {
read.use(function () {
this.skipTags('b,x,y,z');

@@ -39,3 +42,3 @@ });

output: 'text'
}, function(err, art){
}, function (err, art) {
should.not.exist(err);

@@ -46,3 +49,3 @@ expect(art).to.be.an('object');

read.use(function(){
read.use(function () {
this.reset()

@@ -53,15 +56,14 @@ });

});
it('regexps.positive', function(done){
read.use(function(){
this.regexps.positive(/dv2|p2/);
it('skip <b> but not skip <foot>', function (done) {
read.use(function () {
this.skipTags('b,x,y,z', true);
});
read(html, {
output: 'text'
}, function(err, art){
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
art.content.should.contain('foot');
read.use(function(){
read.use(function () {
this.reset()

@@ -72,181 +74,215 @@ });

});
});
it('regexps.negative', function(done){
read.use(function(){
this.regexps.negative(/dv1|p1/);
});
read(html, {
output: 'text'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
describe('regexps', function () {
describe('append', function () {
it('positive', function (done) {
read.use(function () {
this.regexps.positive(/dv2|p2/);
});
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
read.use(function(){
this.reset()
read.use(function () {
this.reset()
});
done();
});
done();
});
});
it('regexps.unlikely', function(done){
read.use(function(){
this.regexps.unlikely(/dv1|p1/);
});
read(html, {
output: 'text'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
it('negative', function (done) {
read.use(function () {
this.regexps.negative(/dv1/);
});
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
read.use(function(){
this.reset()
read.use(function () {
this.reset()
});
done();
});
done();
});
});
it('regexps.maybe', function(done){
read.use(function(){
this.regexps.unlikely(/dv1/);
this.regexps.maybe(/p1/);
});
read(html, {
output: 'text'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hi');
art.content.should.not.contain('hey');
it('unlikely', function (done) {
read.use(function () {
this.regexps.unlikely(/dv1|p1/);
});
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
read.use(function(){
this.reset()
read.use(function () {
this.reset()
});
done();
});
done();
});
});
it('regexps.div2p', function(done){
read.use(function(){
this.regexps.div2p(/<span/);
});
read(html, {
output: 'text'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hello');
it('maybe', function (done) {
read.use(function () {
this.regexps.unlikely(/dv1/);
this.regexps.maybe(/dv1/);
});
read.use(function(){
this.reset()
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hi');
art.content.should.contain('hey');
art.content.should.contain('hello');
read.use(function () {
this.reset()
});
done();
});
done();
});
});
});
describe('with overriding', function(){
it('skip <b> but not skip <foot>', function(done){
read.use(function(){
this.skipTags('b,x,y,z', true);
});
read(html, {
output: 'text'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('foot');
it('div2p', function (done) {
read.use(function () {
this.regexps.div2p(/<foo/);
});
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hello');
read.use(function(){
this.reset()
read.use(function () {
this.reset()
});
done();
});
done();
});
});
describe('override', function () {
it('positive', function (done) {
read.use(function () {
this.regexps.positive(/dv2|p2/, true);
});
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
it('regexps.positive', function(done){
read.use(function(){
this.regexps.positive(/dv2|p2/, true);
read.use(function () {
this.reset()
});
done();
});
});
read(html, {
output: 'text'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
read.use(function(){
this.reset()
it('negative', function (done) {
read.use(function () {
this.regexps.negative(/dv1|p1/, true);
});
done();
});
});
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
it('regexps.negative', function(done){
read.use(function(){
this.regexps.negative(/dv1|p1/, true);
read.use(function () {
this.reset()
});
done();
});
});
read(html, {
output: 'text'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
read.use(function(){
this.reset()
it('unlikely', function (done) {
read.use(function () {
this.regexps.unlikely(/dv1|p1/, true);
});
done();
});
});
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
it('regexps.unlikely', function(done){
read.use(function(){
this.regexps.unlikely(/dv1|p1/, true);
read.use(function () {
this.reset()
});
done();
});
});
read(html, {
output: 'text'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hey');
read.use(function(){
this.reset()
it('maybe', function (done) {
read.use(function () {
this.regexps.unlikely(/dv1|p1/, true);
this.regexps.maybe(/dv2|p2/, true);
});
done();
});
});
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.not.contain('hi');
art.content.should.contain('hey');
art.content.should.contain('hello');
it('regexps.maybe', function(done){
read.use(function(){
this.regexps.unlikely(/dv1/, true);
this.regexps.maybe(/p1/, true);
read.use(function () {
this.reset()
});
done();
});
});
read(html, {
output: 'text'
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hi');
art.content.should.not.contain('hey');
read.use(function(){
this.reset()
it('div2p', function (done) {
read.use(function () {
this.regexps.div2p(/<(div|foo)/, true);
});
done();
read(html, {
output: 'text'
}, function (err, art) {
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hi');
art.content.should.contain('hey');
art.content.should.contain('hello');
read.use(function () {
this.reset()
});
done();
});
});
});
it('regexps.div2p', function(done){
it('regexps.images', function(done){
var article = '<title>title</title>' +
'<body>' +
'<p>Text hello <img src="image" /> world</p>' +
'</body>';
read.use(function(){
this.regexps.div2p(/<span/, true);
this.regexps.images(/.*/, true);
});
read(html, {
output: 'text'
read({
html: article,
uri: 'http://github.com',
output: 'html',
}, function(err, art){
should.not.exist(err);
expect(art).to.be.an('object');
art.content.should.contain('hello');
art.content.should.contain('<img src="http://github.com/image">');

@@ -260,2 +296,2 @@ read.use(function(){

});
});
});

@@ -1,2 +0,1 @@

var read = require('../'),

@@ -9,4 +8,4 @@ chai = require('chai'),

describe('different options',function(){
before(function(){
describe('different options', function () {
before(function () {
uri = 'http://www.bing.com';

@@ -16,3 +15,3 @@ html = '<p>Hello, node-art</p>';

});
after(function(){
after(function () {
uri = null;

@@ -22,5 +21,7 @@ html = null;

});
describe('have three arguments',function(){
it('should detect two options',function(done){
read(uri, { charset: charset }, function(err, art, options, resp){
describe('have three arguments', function () {
it('should detect two options', function (done) {
read(uri, {
charset: charset
}, function (err, art, options, resp) {
should.not.exist(err);

@@ -35,5 +36,5 @@ options.uri.should.be.equal(uri);

describe('have two arguments(string, function)',function(){
it('should detect one options',function(done){
read(uri, function(err, art, options){
describe('have two arguments(string, function)', function () {
it('should detect one options', function (done) {
read(uri, function (err, art, options) {
should.not.exist(err);

@@ -47,5 +48,8 @@ options.uri.should.be.equal(uri);

describe('have two arguments(object, function)',function(){
it('should detect two options',function(done){
read({ uri: uri, charset: charset }, function(err, art, options){
describe('have two arguments(object, function)', function () {
it('should detect two options', function (done) {
read({
uri: uri,
charset: charset
}, function (err, art, options) {
should.not.exist(err);

@@ -59,5 +63,8 @@ options.uri.should.be.equal(uri);

describe('uri is passed in',function(){
it('should detect uri in options',function(done){
read({ uri: uri, charset: charset }, function(err, art, options){
describe('uri is passed in', function () {
it('should detect uri in options', function (done) {
read({
uri: uri,
charset: charset
}, function (err, art, options) {
should.not.exist(err);

@@ -70,5 +77,8 @@ options.uri.should.be.equal(uri);

describe('uri is passed in, but treat as html',function(){
it('should detect html automatically',function(done){
read({ uri: html, charset: charset }, function(err, art, options){
describe('uri is passed in, but treat as html', function () {
it('should detect html automatically', function (done) {
read({
uri: html,
charset: charset
}, function (err, art, options) {
should.not.exist(err);

@@ -81,5 +91,8 @@ options.html.should.be.equal(html);

describe('html is passed in',function(){
it('should detect html in options',function(done){
read({ uri: html, charset: charset }, function(err, art, options){
describe('html is passed in', function () {
it('should detect html in options', function (done) {
read({
uri: html,
charset: charset
}, function (err, art, options) {
should.not.exist(err);

@@ -91,2 +104,33 @@ options.html.should.be.equal(html);

});
});
});
// Suite for the `minParagraphs` option.
// The fixture nests one <p> inside an inner <div>; the outer <div> also holds
// a sibling <span>footer</span>. Both cases pass `minTextLength: 0`.
describe('minParagraphs option', function () {
  before(function () {
    // `html` is declared outside this block; this hook only swaps in the fixture.
    html = '<title>read-art</title><body><div><div><p>hi, dude, i am <a href="/Tjatse/read-art.git">readability</a>, aka read-art...</p></div><span>footer</span></div></body>';
  });

  describe('3 by default', function () {
    it('should find footer', function (done) {
      // No `minParagraphs` given, so the library default applies.
      var settings = {
        minTextLength: 0,
        html: html
      };

      function onArticle(err, art, options, resp) {
        should.not.exist(err);
        // NOTE(review): presumably the single-<p> candidate falls below the
        // default threshold, so its parent (which contains the footer) is
        // taken as the article — confirm against the extractor.
        art.content.should.contain('footer');
        done();
      }

      read(settings, onArticle);
    });
  });

  describe('0 by customized', function () {
    it('should find no footer', function (done) {
      // A threshold of 0 is passed explicitly; the footer must not appear.
      var settings = {
        minTextLength: 0,
        minParagraphs: 0,
        html: html
      };

      function onArticle(err, art, options, resp) {
        should.not.exist(err);
        art.content.should.not.contain('footer');
        done();
      }

      read(settings, onArticle);
    });
  });
});
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc