Socket
Socket
Sign inDemoInstall

node-readability

Package Overview
Dependencies
Maintainers
1
Versions
27
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

node-readability - npm Package Compare versions

Comparing version 0.1.1 to 0.2.0

12

examples/simple.js

@@ -1,12 +0,8 @@

var readability = require('../src/readability')
var read = require('../src/readability');
// Uncomment the following line to print the debug info to the console.
// readability.debug(true);
readability.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html',
read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html',
function(err, read) {
var dom = read.getDocument();
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.getTitle()+'</h1>'+read.getContent()+'</body></html>';
var dom = read.document;
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.title+'</h1>'+read.content+'</body></html>';
console.log(html);
});
{
"name": "node-readability",
"version": "0.1.1",
"version": "0.2.0",
"author": "Zihua Li",

@@ -5,0 +5,0 @@ "description": "Turning any web page into a clean view.",

@@ -1,2 +0,2 @@

# node-readability
# Readability

@@ -7,2 +7,7 @@ Turn any web page into a clean view. This module is based on arc90's readability project.

### Features
1. Optimized for more websites.
2. Support encodings such as GBK and GB2312.
3. Converts relative URLs to absolute for images and links automatically (thanks to [Guillermo Baigorria](https://github.com/gbaygon) & [Tom Sutton](https://github.com/tomsutton1984)).
## Install

@@ -14,3 +19,3 @@

`readability.read(html [, options], callback)`
`read(html [, options], callback)`

@@ -25,13 +30,21 @@ Where

var readability = require('node-readability');
var read = require('node-readability');
readability.read('http://howtonode.org/really-simple-file-uploads', function(err, article) {
console.log(article.getContent());
read('http://howtonode.org/really-simple-file-uploads', function(err, article) {
// The main body of the page.
console.log(article.content);
// The title of the page.
console.log(article.title);
// The raw HTML code of the page
console.log(article.html);
// The document object of the page
console.log(article.document);
});
**NB** If the file has been marked with charset other than utf-8, it is converted automatically. Charsets such as GBK, GB2312 is also supported via [iconv](https://github.com/bnoordhuis/node-iconv).
**NB** If the page has been marked with a charset other than utf-8, it will be converted automatically. Charsets such as GBK and GB2312 are also supported.
## Options
node-readability support all the options that [fetch](https://github.com/andris9/fetch) support.
node-readability will pass the options to [fetch](https://github.com/andris9/fetch) directly.

@@ -56,24 +69,23 @@ Possible option values

## article
## article object
### getContent()
### content
Return the article content of the web page. Return `false` if failed.
The article content of the web page. It is `false` if extraction failed.
### getTitle()
### title
Return the article title of the web page.
The article title of the web page. It may not be the same as the text in the `<title>` tag.
### getHTML()
### html
Return the original html of the web page.
The original html of the web page.
### getDocument()
### document
Return the document of the web page generated by jsdom.
The document of the web page generated by jsdom. You can use it to access the DOM directly (for example, `article.document.getElementById('main')`).
## TODO
## Contributors
* Support more readability features
* Performance optimization
https://github.com/luin/node-readability/graphs/contributors

@@ -80,0 +92,0 @@ ## License

@@ -0,1 +1,3 @@

var url = require("url");
// All of the regular expressions in use within readability.

@@ -465,3 +467,28 @@ var regexps = {

/**
 * Converts relative urls to absolute for images and links.
 *
 * Every `src` of an `<img>` and every `href` of an `<a>` found under `e` is
 * resolved against `e.ownerDocument.originalURL` and written back in place.
 *
 * Elements that do not carry the attribute at all (for example a named anchor
 * such as `<a name="top">`) are left untouched: `getAttribute` yields `null`
 * for them, `url.resolve` rejects non-string input, and resolving would in any
 * case add an attribute the page never had.
 *
 * @param {Element} e Root element whose descendant images and links are rewritten.
 **/
function fixLinks (e) {
  /**
   * Rewrites one attribute on every descendant element with the given tag.
   */
  function fixAttribute(tagName, attrName) {
    var nodes = e.getElementsByTagName(tagName);
    // Iterate from the end, as the original loops did.
    for (var i = nodes.length - 1; i >= 0; --i) {
      var value = nodes[i].getAttribute(attrName);
      // `== null` matches both null and undefined: the attribute is absent.
      if (value == null) {
        continue;
      }
      nodes[i].setAttribute(attrName, url.resolve(e.ownerDocument.originalURL, value));
    }
  }

  fixAttribute('img', 'src');
  fixAttribute('a', 'href');
}
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.

@@ -545,2 +572,3 @@ *

fixLinks(articleContent);
}

@@ -547,0 +575,0 @@

@@ -24,5 +24,21 @@ var jsdom = require('jsdom');

};
this.__defineGetter__('content', function() {
return this.getContent(true);
});
this.__defineGetter__('title', function() {
return this.getTitle(true);
});
this.__defineGetter__('html', function() {
return this.getHTML(true);
});
this.__defineGetter__('document', function() {
return this.getDocument(true);
});
}
Readability.prototype.getContent = function () {
Readability.prototype.getContent = function (notDeprecated) {
if (!notDeprecated) {
console.warn('The method `getContent()` is deprecated, using `content` property instead.');
}
if (typeof this.cache['article-content'] !== 'undefined') {

@@ -44,3 +60,6 @@ return this.cache['article-content'];

Readability.prototype.getTitle = function () {
Readability.prototype.getTitle = function (notDeprecated) {
if (!notDeprecated) {
console.warn('The method `getTitle()` is deprecated, using `title` property instead.');
}
if (typeof this.cache['article-title'] !== 'undefined') {

@@ -70,7 +89,13 @@ return this.cache['article-title'];

Readability.prototype.getDocument = function () {
/**
 * Deprecated accessor for the parsed document.
 *
 * @param {boolean} [notDeprecated] When truthy, the deprecation warning is
 *   suppressed (the `document` getter calls this method with `true`).
 * @returns The value stored in `this._document`.
 */
Readability.prototype.getDocument = function (notDeprecated) {
  // Quiet path: no warning, just hand back the stored document.
  if (notDeprecated) {
    return this._document;
  }
  console.warn('The method `getDocument()` is deprecated, using `document` property instead.');
  return this._document;
};
Readability.prototype.getHTML = function () {
Readability.prototype.getHTML = function (notDeprecated) {
if (!notDeprecated) {
console.warn('The method `getHTML()` is deprecated, using `html` property instead.');
}
return this._document.getElementsByTagName('html')[0].innerHTML;

@@ -97,5 +122,7 @@ };

if (typeof body !== 'string') body = body.toString();
if (!body) return callback(new Error('Empty story body returned from URL'));
jsdom.env({
html: body,
done: function (errors, window) {
window.document.originalURL = html;
if (errors) return callback(errors);

@@ -109,2 +136,6 @@ if (!window.document.body) return callback(new Error('No body tag was found.'));

module.exports.read = read;
module.exports = read;
// Deprecated alias: prints a deprecation warning, then forwards the receiver
// and all arguments to `read` unchanged and returns its result.
module.exports.read = function() {
  var receiver = this;
  var forwarded = arguments;
  console.warn('`readability.read` is deprecated. Just use `var read = require("node-readability"); read(url...);`.');
  return read.apply(receiver, forwarded);
};

@@ -1,5 +0,5 @@

var readability = require('../src/readability')
, helpers = require('../src/helpers')
, jsdom = require( 'jsdom' )
, noBody = '<html><head><title>hi</title></head>hi!</html>';
var read = require('../src/readability');
var helpers = require('../src/helpers');
var jsdom = require( 'jsdom' );
var noBody = '<html><head><title>hi</title></head>hi!</html>';
require('should');

@@ -9,6 +9,6 @@

it('should get document', function (done) {
readability.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html', function(err, read) {
read.read('http://colorlines.com/archives/2011/08/dispatch_from_angola_faith-based_slavery_in_a_louisiana_prison.html', function(err, read) {
if (err) return done(err);
var dom = read.getDocument();
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.getTitle()+'</h1>'+read.getContent()+'</body></html>';
var dom = read.document;
var html = '<html><head><meta charset="utf-8"><title>'+dom.title+'</title></head><body><h1>'+read.title+'</h1>'+read.content+'</body></html>';
html.should.include('<title>Dispatch From Angola: Faith-Based Slavery in a Louisiana Prison - COLORLINES</title>');

@@ -19,6 +19,6 @@ done();

it('should get document with frames', function (done) {
readability.read('http://www.whitehouse.gov/', function(err, read) {
read('http://www.whitehouse.gov/', function(err, read) {
if (err) return done(err);
var dom = read.getDocument();
read.getTitle().should.equal('The White House');
var dom = read.document;
read.title.should.equal('The White House');
done();

@@ -28,3 +28,3 @@ });

it('should handle the html that missing body tag', function (done) {
readability.read(noBody, function (err, read) {
read(noBody, function (err, read) {
err.message.should.equal('No body tag was found.');

@@ -31,0 +31,0 @@ done();

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc