node-readability
Advanced tools
Comparing version 0.1.1 to 0.2.0
@@ -1,12 +0,8 @@ | ||
var readability = require('../src/readability') | ||
var read = require('../src/readability'); | ||
// uncomment the following line to print the debug info to console. | ||
// readability.debug(true); | ||
readability.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html', | ||
read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html', | ||
function(err, read) { | ||
var dom = read.getDocument(); | ||
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.getTitle()+'</h1>'+read.getContent()+'</body></html>'; | ||
var dom = read.document; | ||
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.title+'</h1>'+read.content+'</body></html>'; | ||
console.log(html); | ||
}); |
{ | ||
"name": "node-readability", | ||
"version": "0.1.1", | ||
"version": "0.2.0", | ||
"author": "Zihua Li", | ||
@@ -5,0 +5,0 @@ "description": "Turning any web page into a clean view.", |
@@ -1,2 +0,2 @@ | ||
# node-readability | ||
# Readability | ||
@@ -7,2 +7,7 @@ Turn any web page into a clean view. This module is based on arc90's readability project. | ||
### Features | ||
1. Optimized for more websites. | ||
2. Support encodings such as GBK and GB2312. | ||
3. Converts relative URLs to absolute ones for images and links automatically (thanks to [Guillermo Baigorria](https://github.com/gbaygon) & [Tom Sutton](https://github.com/tomsutton1984)). | ||
## Install | ||
@@ -14,3 +19,3 @@ | ||
`readability.read(html [, options], callback)` | ||
`read(html [, options], callback)` | ||
@@ -25,13 +30,21 @@ Where | ||
var readability = require('node-readability'); | ||
var read = require('node-readability'); | ||
readability.read('http://howtonode.org/really-simple-file-uploads', function(err, article) { | ||
console.log(article.getContent()); | ||
read('http://howtonode.org/really-simple-file-uploads', function(err, article) { | ||
// The main body of the page. | ||
console.log(article.content); | ||
// The title of the page. | ||
console.log(article.title); | ||
// The raw HTML code of the page | ||
console.log(article.html); | ||
// The document object of the page | ||
console.log(article.document); | ||
}); | ||
**NB** If the file has been marked with charset other than utf-8, it is converted automatically. Charsets such as GBK, GB2312 is also supported via [iconv](https://github.com/bnoordhuis/node-iconv). | ||
**NB** If the page has been marked with a charset other than utf-8, it will be converted automatically. Charsets such as GBK and GB2312 are also supported. | ||
## Options | ||
node-readability support all the options that [fetch](https://github.com/andris9/fetch) support. | ||
node-readability will pass the options to [fetch](https://github.com/andris9/fetch) directly. | ||
@@ -56,24 +69,23 @@ Possible option values | ||
## article | ||
## article object | ||
### getContent() | ||
### content | ||
Return the article content of the web page. Return `false` if failed. | ||
The article content of the web page. The value is `false` if extraction failed. | ||
### getTitle() | ||
### title | ||
Return the article title of the web page. | ||
The article title of the web page. It may not be the same as the text in the `<title>` tag. | ||
### getHTML() | ||
### html | ||
Return the original html of the web page. | ||
The original html of the web page. | ||
### getDocument() | ||
### document | ||
Return the document of the web page generated by jsdom. | ||
The document of the web page generated by jsdom. You can use it to access the DOM directly (for example, `article.document.getElementById('main')`). | ||
## TODO | ||
## Contributors | ||
* Support more readability features | ||
* Performance optimization | ||
https://github.com/luin/node-readability/graphs/contributors | ||
@@ -80,0 +92,0 @@ ## License |
@@ -0,1 +1,3 @@ | ||
var url = require("url"); | ||
// All of the regular expressions in use within readability. | ||
@@ -465,3 +467,28 @@ var regexps = { | ||
/**
 * Converts relative urls to absolute for images and links.
 *
 * Every `src` of an <img> and every `href` of an <a> found under `e` is
 * resolved against `e.ownerDocument.originalURL` and written back in place.
 * Elements that do not carry the attribute are left untouched.
 *
 * @param {Element} e Root element whose descendants are rewritten.
 **/
function fixLinks (e) {
  var base = e.ownerDocument && e.ownerDocument.originalURL;

  // url.resolve() throws on a non-string base, so there is nothing to do
  // when the document was not tagged with its original URL.
  if (typeof base !== 'string') {
    return;
  }

  // Rewrites `attrName` on every `tagName` descendant of `e`.
  function fixAttribute (tagName, attrName) {
    var nodes = e.getElementsByTagName(tagName);
    for (var i = nodes.length - 1; i >= 0; --i) {
      var value = nodes[i].getAttribute(attrName);
      // A missing attribute (e.g. <a name="top"> or an <img> without src)
      // yields a non-string; url.resolve() would throw on it, and we must
      // not invent an attribute the page never had.
      if (typeof value !== 'string') {
        continue;
      }
      nodes[i].setAttribute(attrName, url.resolve(base, value));
    }
  }

  fixAttribute('img', 'src');
  fixAttribute('a', 'href');
}
/** | ||
* Clean out spurious headers from an Element. Checks things like classnames and link density. | ||
@@ -545,2 +572,3 @@ * | ||
fixLinks(articleContent); | ||
} | ||
@@ -547,0 +575,0 @@ |
@@ -24,5 +24,21 @@ var jsdom = require('jsdom'); | ||
}; | ||
this.__defineGetter__('content', function() { | ||
return this.getContent(true); | ||
}); | ||
this.__defineGetter__('title', function() { | ||
return this.getTitle(true); | ||
}); | ||
this.__defineGetter__('html', function() { | ||
return this.getHTML(true); | ||
}); | ||
this.__defineGetter__('document', function() { | ||
return this.getDocument(true); | ||
}); | ||
} | ||
Readability.prototype.getContent = function () { | ||
Readability.prototype.getContent = function (notDeprecated) { | ||
if (!notDeprecated) { | ||
console.warn('The method `getContent()` is deprecated, using `content` property instead.'); | ||
} | ||
if (typeof this.cache['article-content'] !== 'undefined') { | ||
@@ -44,3 +60,6 @@ return this.cache['article-content']; | ||
Readability.prototype.getTitle = function () { | ||
Readability.prototype.getTitle = function (notDeprecated) { | ||
if (!notDeprecated) { | ||
console.warn('The method `getTitle()` is deprecated, using `title` property instead.'); | ||
} | ||
if (typeof this.cache['article-title'] !== 'undefined') { | ||
@@ -70,7 +89,13 @@ return this.cache['article-title']; | ||
Readability.prototype.getDocument = function () { | ||
Readability.prototype.getDocument = function (notDeprecated) { | ||
if (!notDeprecated) { | ||
console.warn('The method `getDocument()` is deprecated, using `document` property instead.'); | ||
} | ||
return this._document; | ||
}; | ||
Readability.prototype.getHTML = function () { | ||
Readability.prototype.getHTML = function (notDeprecated) { | ||
if (!notDeprecated) { | ||
console.warn('The method `getHTML()` is deprecated, using `html` property instead.'); | ||
} | ||
return this._document.getElementsByTagName('html')[0].innerHTML; | ||
@@ -97,5 +122,7 @@ }; | ||
if (typeof body !== 'string') body = body.toString(); | ||
if (!body) return callback(new Error('Empty story body returned from URL')); | ||
jsdom.env({ | ||
html: body, | ||
done: function (errors, window) { | ||
window.document.originalURL = html; | ||
if (errors) return callback(errors); | ||
@@ -109,2 +136,6 @@ if (!window.document.body) return callback(new Error('No body tag was found.')); | ||
module.exports.read = read; | ||
module.exports = read; | ||
module.exports.read = function() { | ||
console.warn('`readability.read` is deprecated. Just use `var read = require("node-readability"); read(url...);`.'); | ||
return read.apply(this, arguments); | ||
}; |
@@ -1,5 +0,5 @@ | ||
var readability = require('../src/readability') | ||
, helpers = require('../src/helpers') | ||
, jsdom = require( 'jsdom' ) | ||
, noBody = '<html><head><title>hi</title></head>hi!</html>'; | ||
var read = require('../src/readability'); | ||
var helpers = require('../src/helpers'); | ||
var jsdom = require( 'jsdom' ); | ||
var noBody = '<html><head><title>hi</title></head>hi!</html>'; | ||
require('should'); | ||
@@ -9,6 +9,6 @@ | ||
it('should get document', function (done) { | ||
readability.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html', function(err, read) { | ||
read.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html', function(err, read) { | ||
if (err) return done(err); | ||
var dom = read.getDocument(); | ||
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.getTitle()+'</h1>'+read.getContent()+'</body></html>'; | ||
var dom = read.document; | ||
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.title+'</h1>'+read.content+'</body></html>'; | ||
html.should.include('<title>Dispatch From Angola: Faith-Based Slavery in a Louisiana Prison - COLORLINES</title>'); | ||
@@ -19,6 +19,6 @@ done(); | ||
it('should get document with frames', function (done) { | ||
readability.read('http://www.whitehouse.gov/', function(err, read) { | ||
read('http://www.whitehouse.gov/', function(err, read) { | ||
if (err) return done(err); | ||
var dom = read.getDocument(); | ||
read.getTitle().should.equal('The White House'); | ||
var dom = read.document; | ||
read.title.should.equal('The White House'); | ||
done(); | ||
@@ -28,3 +28,3 @@ }); | ||
it('should handle the html that missing body tag', function (done) { | ||
readability.read(noBody, function (err, read) { | ||
read(noBody, function (err, read) { | ||
err.message.should.equal('No body tag was found.'); | ||
@@ -31,0 +31,0 @@ done(); |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
685
91
29820
9