Comparing version 0.0.6 to 0.0.7
var read = require('../'); | ||
read('http://auto.rednet.cn/html/tupian/20145/20145124642220.html', function(err, art){ | ||
read('http://www.tianjinwe.com/rollnews/201405/t20140512_8874908.html', { | ||
dataType: 'json' | ||
}, function(err, art){ | ||
if(err){ | ||
@@ -5,0 +7,0 @@ console.log('[ERROR]', err.message); |
16
index.js
@@ -27,6 +27,3 @@ // Copyright 2014 Tjatse | ||
* @param uri uri or html | ||
* @param options including: | ||
* cacheable: false as default. | ||
* killBreaks: true as default. | ||
* lowerCaseTags: true as default. | ||
* @param options see https://github.com/Tjatse/node-readability#options for the available options | ||
* @param callback callback invoked with two arguments: | ||
@@ -49,4 +46,5 @@ * 1: error | ||
} | ||
defineBoolean(options, 'killBreaks', true); | ||
defineBoolean(options, 'lowerCaseTags', true); | ||
defineOption(options, 'killBreaks', true); | ||
defineOption(options, 'lowerCaseTags', true); | ||
defineOption(options, 'dataType', 'html'); | ||
@@ -95,3 +93,3 @@ // indicating uri is html or url. | ||
o.html = o.html.replace(/(<br\s*\/?>(\s| ?)*){1,}/g,'<br />'); | ||
// remove formats like \r\t\n | ||
// remove whitespace control characters (\r, \t, \n) | ||
o.html = o.html.replace(/([\n\r\t]*){2,}/gi, ''); | ||
@@ -109,3 +107,3 @@ } | ||
/** | ||
* Define property of object to default boolean value. | ||
* Set a property of the options object to a default value if it is undefined. | ||
* @param options option object | ||
@@ -115,3 +113,3 @@ * @param k key | ||
*/ | ||
function defineBoolean(options, k, v){ | ||
function defineOption(options, k, v){ | ||
if(typeof options[k] == 'undefined'){ | ||
@@ -118,0 +116,0 @@ options[k] = v; |
@@ -62,4 +62,17 @@ // Copyright 2014 Tjatse | ||
// else read it by article reader. | ||
var node = read(this.$, this.options); | ||
var content = ((node && node.length > 0) ? node.html() : '' ); | ||
var node = read(this.$, this.options), | ||
content; | ||
if((node && node.length > 0)){ | ||
switch(this.options.dataType){ | ||
case 'text': | ||
content = node.text(); | ||
break; | ||
case 'html': | ||
default: | ||
content = node.html(); | ||
break; | ||
} | ||
}else{ | ||
content = ''; | ||
} | ||
// if cacheable, cache it. | ||
@@ -66,0 +79,0 @@ if(this.caches){ |
@@ -129,3 +129,2 @@ // Copyright 2014 Tjatse | ||
node.data(scoreKey, score); | ||
if(!topCandidate || score > topCandidate.data(scoreKey)){ | ||
@@ -148,3 +147,3 @@ topCandidate = node; | ||
parent, siblings; | ||
if((parent == topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body'){ | ||
if((parent = topCandidate.parent()) && parent.length > 0 && parent.get(0).name.toLowerCase() != 'body'){ | ||
siblings = parent.children(); | ||
@@ -158,2 +157,3 @@ }else{ | ||
append = false; | ||
if(node.is(topCandidate) || (node.data(scoreKey) || 0) > siblingScoreThreshold){ | ||
@@ -160,0 +160,0 @@ append = true; |
{ | ||
"name": "read-art", | ||
"version": "0.0.6", | ||
"version": "0.0.7", | ||
"description": "Scrape article from any page, automatically, make web page readability.", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
@@ -69,2 +69,5 @@ # read-art -- readability reference to Arc90's | ||
## Options | ||
### dataType | ||
The data type of the returned article content: `html` (default) or `text`. | ||
### cacheable | ||
@@ -74,3 +77,3 @@ A value indicating whether to cache the body and title. |
### killBreaks | ||
Kill breaks in the HTML, and convert them to simple `<br />`. | ||
Collapse breaks, blanks, and tab symbols (\r\t\n) into a single `<br />`. | ||
@@ -77,0 +80,0 @@ ###options from [cheerio](https://github.com/cheeriojs/cheerio) |
30484
740
156