Comparing version 0.1.6 to 0.1.7
10
index.js
@@ -18,3 +18,3 @@ // Copyright 2014 Tjatse | ||
var fetchUrl = require('fetch').fetchUrl, | ||
var req = require('req-fast'), | ||
cheerio = require('cheerio'), | ||
@@ -64,11 +64,11 @@ Article = require('./lib/article'); | ||
if (options.uri) { | ||
fetchUrl(uri, options, function(err, meta, body) { | ||
req(options, function(err, resp) { | ||
if (err) { | ||
return callback(err); | ||
} | ||
if(meta.status != 200){ | ||
return callback(new Error('STATUS: ' + meta.status)); | ||
if(resp.statusCode != 200){ | ||
return callback(new Error('STATUS: ' + resp.statusCode)); | ||
} | ||
parsingData.html = body.toString(); | ||
parsingData.html = resp.body.toString(); | ||
parse(parsingData); | ||
@@ -75,0 +75,0 @@ }); |
{ | ||
"name": "read-art", | ||
"version": "0.1.6", | ||
"version": "0.1.7", | ||
"description": "Scrape article from any page, automatically, make web page readability.", | ||
@@ -31,4 +31,4 @@ "main": "index.js", | ||
"cheerio": "~0.15.0", | ||
"fetch": "0.3.6", | ||
"URIjs": "1.14.0" | ||
"req-fast": "^0.1.6", | ||
"URIjs": "^1.14.0" | ||
}, | ||
@@ -35,0 +35,0 @@ "devDependencies": { |
146
README.md
@@ -10,2 +10,9 @@ # read-art | ||
## Features | ||
- Automatic Read Title & Body | ||
- Follow Redirects | ||
- Automatic Decoding Content Encodings(Avoid Messy Codes, Especially Chinese) | ||
- Gzip/Deflate Encoding(Automatic Decompress) | ||
- Proxy | ||
## Installation | ||
@@ -17,5 +24,2 @@ ```javascript | ||
## Usage | ||
see test or examples folder for a complete example | ||
## Read Article | ||
```javascript | ||
@@ -27,12 +31,17 @@ read(html/uri [, options], callback) | ||
Where | ||
* **html/uri** html or uri string. | ||
* **options** is an optional options object | ||
* **callback** is the callback to run - `callback(error, article, options)` | ||
* **html/uri** Html or Uri string. | ||
* **options** An optional options object, including: | ||
- **dataType** The data type of the article content, one of: html, text. See [Output](#output) for more details. | ||
- **killBreaks** A value indicating whether to collapse line breaks, blanks, and tab symbols (\r\t\n) into a single `<br />`; `true` by default. | ||
- **options from [cheerio](https://github.com/cheeriojs/cheerio)** | ||
- **options from [req-fast](https://github.com/Tjatse/req-fast)** | ||
* **callback** The callback to run - `callback(error, article, options)` | ||
Example | ||
scrape by uri? | ||
> See test or examples folder for a complete example | ||
Just try it | ||
```javascript | ||
var read = require('read-art'); | ||
read('http://google.com', { overrideCharset: 'utf8' }, function(err, art, options){ | ||
// read from google could be | ||
read('http://google.com', { charset: 'utf8' }, function(err, art, options){ | ||
if(err){ | ||
@@ -45,64 +54,21 @@ throw err; | ||
}); | ||
``` | ||
// or | ||
read({ uri: 'http://google.com', charset: 'utf8' }, function(err, art, options){ | ||
or | ||
```javascript | ||
read({ uri: 'http://google.com', overrideCharset: 'utf8' }, function(err, art, options){ | ||
... | ||
}); | ||
``` | ||
// what about html? | ||
read('<title>node-art</title><body><div><p>hello, read-art!</p></div></body>', { charset: 'utf8' }, function(err, art, options){ | ||
what about simple html? | ||
```javascript | ||
read('<title>node-art</title><body><div><p>hello, read-art!</p></div></body>', { overrideCharset: 'utf8' }, function(err, art, options){ | ||
... | ||
}); | ||
``` | ||
// of course could be | ||
read({ uri: '<title>node-art</title><body><div><p>hello, read-art!</p></div></body>', charset: 'utf8' }, function(err, art, options){ | ||
or | ||
```javascript | ||
read({ uri: '<title>node-art</title><body><div><p>hello, read-art!</p></div></body>', overrideCharset: 'utf8' }, function(err, art, options){ | ||
... | ||
}); | ||
``` | ||
**CAUTION:** Title must be wrapped in a `<title>` tag and content must be wrapped in a `<body>` tag. | ||
or | ||
```javascript | ||
read({ html: '<title>node-art</title><body><div><p>hello, read-art!</p></div></body>', overrideCharset: 'utf8' }, function(err, art, options){ | ||
... | ||
}); | ||
``` | ||
**CAUTION** title must be wrapped in a *title* tag and content must be wrapped in a *body* tag. | ||
## Options | ||
### dataType | ||
The data type of article content, including: html, text. see more @[Output](#output) | ||
### killBreaks | ||
Kill breaks, blanks, tab symbols(\r\t\n) into one <br />. | ||
###options from [cheerio](https://github.com/cheeriojs/cheerio) | ||
### xmlMode | ||
Indicates whether special tags (`<script>` and `<style>`) should get special treatment and if "empty" tags (eg. `<br>`) can have children. If false, the content of special tags will be text only. | ||
For feeds and other XML content (documents that don't consist of HTML), set this to true. Default: false. | ||
### lowerCaseTags | ||
If set to true, all tags will be lowercased. If xmlMode is disabled, this defaults to true. | ||
### normalizeWhitespace | ||
Returns the innerHTML with the leading, trailing, and repeating white spaces stripped. | ||
### options from [fetch](https://github.com/andris9/fetch) | ||
[Click Here To Redirect](https://github.com/andris9/fetch#options) | ||
## Output | ||
You can set different dataType to wrap the output | ||
### text | ||
Returns the inner text of article content. | ||
Example | ||
Returns the inner text of article content(strip html tags), e.g.: | ||
```javascript | ||
@@ -128,5 +94,3 @@ read('http://example.com', { | ||
### html | ||
Returns the inner HTML of article content. | ||
Example | ||
Returns the inner HTML of article content, e.g.: | ||
```javascript | ||
@@ -152,4 +116,3 @@ read('http://example.com', { | ||
### json | ||
Returns the restful result of article content. | ||
Example | ||
Returns the restful result of article content, e.g.: | ||
```javascript | ||
@@ -173,3 +136,3 @@ read('http://example.com', { | ||
``` | ||
the art.content will be an Array | ||
The art.content will be an Array such as: | ||
```json | ||
@@ -181,11 +144,11 @@ [ | ||
``` | ||
there only two type were supported now: *img* and *text* | ||
Only two types are supported for now: *img* and *text* | ||
As you see, the dataType could be defined in two way: | ||
As you see, the dataType could be defined in two ways: | ||
1. Simple String, should be one of *text*, *html* and *json*. | ||
2. Complex Object, including keys: | ||
2. Complex Object, including: | ||
- type: one of *text*, *html* and *json*; defaults to 'html'. | ||
- stripSpaces: a value indicating whether to strip tab symbols (\r\t\n); defaults to false. | ||
## Features | ||
## Powerful | ||
__ϟ Blazingly fast:__ | ||
@@ -195,15 +158,42 @@ read-art is based on cheerio(cheerio is about __8x__ faster than JSDOM), and the article marking strategy actualized by RegExp, it's supper fast and cost less memory. | ||
__❤ Hit the target:__ | ||
the bonus algorithm make spider or scraper more easier to grab the article title & content. | ||
The bonus algorithm makes it easier for a spider or scraper to grab the article title & content. | ||
__❁ Fetch:__ | ||
if you only wanna fetch html body by url, [fetch](https://github.com/andris9/fetch) is an amazing library, i've test it with [request](https://github.com/mikeal/request), it's really fast and cost less memory, reference to [Vadim's Issue](https://github.com/bndr/node-read/pull/15) | ||
and more important, **fetch** could avoid messy code in 99.9% conditions, event some pages using *utf8* in response headers, but *gb2312* in html head meta, we only need to use the overrideCharset option, e.g.: | ||
__↵ Fetch HTML:__ | ||
If you only want to fetch the HTML body from a server, [req-fast](https://github.com/Tjatse/req-fast) is amazing; it supports: | ||
- Follow Redirects | ||
- Automatic Decoding Content Encodings(Avoid Messy Codes, Especially Chinese) | ||
- Cookies | ||
- JSON Response Auto Handling | ||
- Gzip/Deflate Encoding(Automatic Decompress) | ||
- Proxy | ||
**refrain from the crazy messy codes** | ||
```javascript | ||
read('http://game.163.com/14/0506/10/9RI8M9AO00314SDA.html', { | ||
overrideCharset: 'gbk' | ||
charset: 'gbk' | ||
}, function(err, art){ | ||
... | ||
// ... | ||
}); | ||
``` | ||
to refrain from the crazy messy codes. | ||
**generate agent to simulate browsers** | ||
```javascript | ||
read('http://example.com', { | ||
agent: true // true as default | ||
}, function(err, art){ | ||
// ... | ||
}); | ||
``` | ||
**use proxy** | ||
```javascript | ||
read('http://example.com', { | ||
proxy: { | ||
host: 'http://myproxy.com/', | ||
port: 8081, | ||
proxyAuth: 'user:password' | ||
} | ||
}, function(err, art){ | ||
// ... | ||
}); | ||
``` | ||
and [more](https://github.com/Tjatse/req-fast) is amazing, it supports... | ||
@@ -210,0 +200,0 @@ ## Test |
@@ -10,3 +10,3 @@ var read = require('../'), | ||
read('http://game.163.com/14/0506/10/9RI8M9AO00314SDA.html', { | ||
overrideCharset: 'gbk' | ||
charset: 'gbk' | ||
}, function(err, art){ | ||
@@ -13,0 +13,0 @@ should.not.exist(err); |
39377
237
+ Added req-fast@^0.1.6
+ Added URIjs@1.16.1 (transitive)
+ Added iconv-lite@0.4.24 (transitive)
+ Added random-ua@0.0.6 (transitive)
+ Added req-fast@0.1.9 (transitive)
+ Added tunnel@0.0.3 (transitive)
- Removed fetch@0.3.6
- Removed URIjs@1.14.0 (transitive)
- Removed encoding@0.1.13 (transitive)
- Removed fetch@0.3.6 (transitive)
- Removed iconv-lite@0.6.3 (transitive)
Updated URIjs@^1.14.0