json-web-crawler
Advanced tools
Comparing version
@@ -6,12 +6,12 @@ const co = require('co'); | ||
const setting = { | ||
type: 'list', | ||
container: '#projects_list .project-card', | ||
pageNotFound: [{ | ||
elem: '.grey-frame-inner h1', | ||
get: 'text', | ||
equalTo: '404' | ||
check: ['equal', '404'] | ||
}], | ||
type: 'list', | ||
container: '#projects_list .project-card', | ||
// listOption: [ 'limit', 3 ], | ||
// listOption: [ 'range', 0, 6 ], | ||
listOption: [ 'ignore', 0, 2, -1 ], | ||
listOption: [ 'range', 0, 10 ], | ||
// listOption: [ 'ignore', 0, 2, -1 ], | ||
// listOption: [ 'focus', 3, -3 ], | ||
@@ -69,2 +69,3 @@ crawl: { | ||
console.log('Result:', result); | ||
console.log('Total:', result.length); | ||
}).catch(console.error); |
{ | ||
"name": "json-web-crawler", | ||
"version": "0.7.1", | ||
"version": "0.7.2", | ||
"description": "Crawl website by json", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
289
README.md
# Json Web Crawler | ||
[npm package](https://www.npmjs.com/package/json-web-crawler) | |
[Open Source badge](https://github.com/ellerbrock/open-source-badge/) | |
Use JSON to list all elements (with css 3 and jquery selector) that you want to crawl. | ||
@@ -9,2 +12,3 @@ Only run in node >= 6.0 | ||
## Usage | ||
```javascript | ||
@@ -23,93 +27,142 @@ npm i json-web-crawler --save | ||
### type | ||
The default type is `content`. | ||
- content: crawl specific $container to a single json. | ||
- list: crawl a list like Google search result into multi data. | ||
### container | ||
The DOM element that the crawler will focus on. If type is `list`, it will crawl each element matching the container selector. | |
### listOption | ||
Optional; enabled for the `list` type only. Use it when you don't want to crawl the whole list. **ALL INDEXES START FROM 0** | |
- `[ 'limit', 10 ]`: ten elements only (eq(0) ~ eq(9)). | ||
- `[ 'range', 6, 12 ]`: from eq(6) to eq(12 - 1). If the end is omitted, it will continue to the last one. | |
- `[ 'focus', 0, 3, 7, ... ]`: specific elements in list (eq(0), eq(3), eq(7), ...). You can use -1, -2 to count backward from the end. | |
- `[ 'ignore', 1, 2, 5 ]`: elements you want to ignore. You can use -1, -2 to count backward from the end. | |
### crawl | ||
`keyName: { options }` => `keyName: data` | ||
```javascript | ||
const settings = { | ||
// If any item in this checklist matches, a page not found error is returned. | |
pageNotFound: [{ | ||
elem: '.error-msg', | ||
get: 'text', | ||
check: ['equal', '404'] // No `process` here | ||
}], | ||
crawl: { | ||
image: { | ||
elem: 'img', | ||
get: 'src' | ||
} | ||
} | ||
// 'content' or 'list', default is content if not set | ||
// Content: crawl page to a single json. | ||
type: 'content', | ||
// DOM element that will focus on. | ||
container: '.container', | ||
// will become | ||
image: IMAGE_SRC_URL | ||
``` | ||
// or | ||
// List: crawl a list like Google search result into multi data. | ||
type: 'list', | ||
// DOM element that will loop to crawl | ||
container: 'li.search-result', | ||
#### options | ||
// If type is 'list', you can set these values below (Optional). | ||
// ================================================================= | ||
// Optional, use if you don't want to crawl the whole list. ** ALL START FROM 0 ** | |
listOption: ['limit', 10], // eq(0) ~ eq(9) | ||
// ['range', 6, 12], // eq(6) ~ eq(11)*, if without end, it will continue to the last one | ||
// ['focus', 0, 3, 7], // [eq(0), eq(3), eq(7)] | ||
// ['ignore', 1, 2, 5], // Elements you want to ignore it. You can use -1, -2 to count from backward. | ||
// ================================================================= | ||
- elem: element inside `container`. If empty or undefined, it will use `container` or `listElems` instead | ||
- noChild (boolean): remove all children elem under $(elem) | ||
- outOfContainer (boolean): If exist, It will use $('html').find(elem) | ||
- get: return type of element | ||
- `text` | ||
- `num` | ||
- `length`: $element.length | ||
- `attrName`: $element.attr('attrName') | ||
- `data-dataName`: $element.data('dataName') | |
- `data-dataName:X`: `X` is optional. | ||
- If data is an array, set `data-dataName:0` will return `$elem.data('dataAttribute')[0]`. | ||
- If data is an object, set `data-dataName:id` will return `$elem.data('dataAttribute')['id']`. | ||
- If X is not given, it will return the whole data. | |
- process: If you want to do something else after 'get' (string type only) | ||
crawl: { | ||
keyName: { | ||
elem: '.element1:eq(0)', // Must have, If empty or undefined, it will use container or listElems instead | ||
noChild: true, // Optional, remove all children elem under $(elem) | ||
outOfContainer: true, // Optional, If exist, It will use $('html').find() | ||
get: 'text', | ||
// 'num' | ||
// 'html' | ||
// 'length' // => $element.length | ||
// 'attrName' // => $elem.attr('attrName') | ||
// 'data-dataName' // => $elem.data('dataName') | |
// 'data-dataName:X' | ||
// X is optional, if data is an array, set 'data-dataName:0' will return $elem.data('dataAttribute')[0] | ||
// If data is an object, set 'data-dataName:id' will return $elem.data('dataAttribute')['id'] | ||
// If X not exist, it will return the whole data | ||
```javascript | ||
// You can use some simple functions that existed in lodash. | ||
process: [ | ||
['match', /regex here/, number], // => str.match(/regex here/)[number], return array if no number, but will cause other process won't work | ||
['split', ',', number], // => str.split(',')[number], return array if no number, but will cause other process won't work | ||
['replace', 'one', 'two'], | ||
['substring', 0, 3], | ||
['prepend', 'text'], // => 'text' + value | ||
['append', 'text'], // => value + 'text' | ||
['indexOf', 'text'] // => return number | ||
['INDENPENDENT_FUNCTION'], // like encodeURI, encodeURIComponent, unescape, etc... | ||
/** | ||
* Because lodash has `escape` & `unescape` functions with the same names but | |
* different behavior, the original `escape` & `unescape` functions are | |
* renamed to `encode` & `decode` instead. | |
*/ | ||
], | ||
// Optional, if you want to do something else after 'get' (string type only) | ||
// You can use some simple functions that exist in [lodash](https://lodash.com/docs). | |
process: [ | ||
['match', /regex here/, number], // => str.match(/regex here/)[number], return array if no number, but will cause other process won't work | ||
['split', ',', number], // => str.split(',')[number], return array if no number, but will cause other process won't work | ||
['replace', 'one', 'two'], | ||
['substring', 0, 3], | ||
['prepend', 'text'], // => 'text' + value | ||
['append', 'text'], // => value + 'text' | ||
['indexOf', 'text'] // => return number | ||
['INDENPENDENT_FUNCTION'], // like encodeURI, encodeURIComponent, unescape, etc... | ||
// Because lodash has `escape` & `unescape` functions with the same names but different behavior, | |
// the original `escape` & `unescape` functions are renamed to `encode` & `decode` instead. | |
], | ||
// Or you want to DIY, you can use function instead | ||
process(value) { | ||
// do something | ||
// Or you want to DIY, you can use function instead | ||
process(value) { | ||
// do something | ||
return newValue; | ||
} | ||
``` | ||
return newValue; | ||
}, | ||
default: '' // return default value when elem not found, null or undefined (`process` will be ignored) | ||
- collect: If the value you want is separated across several elements, use collect to find them all. | |
- elems: contain multi elements array. | ||
- loop (boolean): It will run all elems (like `li`) you want to get | ||
- combineWith: without this, collect will return array | ||
- default: return default value when elem not found, null or undefined (`process` will be ignored) | ||
### pageNotFound | ||
If any check in this list matches, it will return a page not found error. | |
- elem | ||
- get | ||
- check: like `process`, but only one step | ||
## Example | ||
### Content Type | ||
Steam Dota2 page in `demo`. | ||
```javascript | ||
const setting = { | ||
type: 'content', | ||
container: '#game_highlights .rightcol', | ||
crawl: { | ||
appId: { | ||
elem: '.glance_tags', | ||
get: 'data-appid' | ||
}, | ||
keyName2: { | ||
elem: 'table tbody', | ||
// If the value you want is separated across several elements, use collect to get all elems | |
appName: { | ||
outOfContainer: true, | ||
elem: '.apphub_AppName', | ||
get: 'text' | ||
}, | ||
image: { | ||
elem: '.game_header_image_full', | ||
get: 'src' | ||
}, | ||
reviews: { | ||
elem: '.game_review_summary:eq(0)', | ||
get: 'text', | ||
}, | ||
tags: { | ||
elem: '.glance_tags', | ||
collect: { | ||
elems: [{ | ||
elem: 'tr:nth-child(1)', | ||
get: 'text', | ||
elem: 'a.app_tag:eq(0)', | ||
get: 'text' | ||
}, { | ||
elem: 'tr:nth-child(2)', | ||
get: 'num', | ||
elem: 'a.app_tag:eq(1)', | ||
get: 'text' | ||
}, { | ||
get: 'href' // If no elem, the default is parent elem $(table tbody) | ||
elem: 'a.app_tag:eq(2)', | ||
get: 'text' | ||
}], | ||
// without this, collect will return array | ||
combineWith: ', ' | ||
} | ||
}, | ||
keyName3: { | ||
elem: 'table tbody tr', | ||
// It will run all tr elems you set | ||
allTags: { | ||
elem: '.glance_tags a.app_tag', | ||
collect: { | ||
@@ -120,2 +173,13 @@ loop: true, | ||
} | ||
}, | ||
description: { | ||
elem: '.game_description_snippet', | ||
get: 'text', | ||
process(value) { | ||
return value.split(', '); | ||
} | ||
}, | ||
releaseDate: { | ||
elem: '.release_date .date', | ||
get: 'text' | ||
} | ||
@@ -126,25 +190,62 @@ } | ||
### List Type | ||
## The MIT License (MIT) | ||
KickStarter popular list in `demo`. | ||
Copyright (c) 2016 Knovour Zheng | ||
```javascript | ||
const setting = { | ||
pageNotFound: [{ | ||
elem: '.grey-frame-inner h1', | ||
get: 'text', | ||
check: ['equal', '404'] | ||
}], | ||
type: 'list', | ||
container: '#projects_list .project-card', | ||
listOption: [ 'limit', 3 ], | ||
// listOption: [ 'range', 0, 10 ], | ||
// listOption: [ 'ignore', 0, 2, -1 ], | ||
// listOption: [ 'focus', 3, -3 ], | ||
crawl: { | ||
projectID: { | ||
get: 'data-pid', | ||
}, | ||
name: { | ||
elem: '.project-title', | ||
get: 'text', | ||
}, | ||
image: { | ||
elem: '.project-thumbnail img', | ||
get: 'src' | ||
}, | ||
link: { | ||
elem: '.project-title a', | ||
get: 'href', | ||
process: [ | ||
[ 'split', '?', 0 ], | ||
[ 'prepend', 'https://www.kickstarter.com' ] | ||
] | ||
}, | ||
description: { | ||
elem: '.project-blurb', | ||
get: 'text' | ||
}, | ||
funded: { | ||
elem: '.project-stats-value:eq(0)', | ||
get: 'text' | ||
}, | ||
percentPledged: { | ||
elem: '.project-percent-pledged', | ||
get: 'style', | ||
process: [ | ||
[ 'split', /:\s?/g, 1 ] | ||
] | ||
}, | ||
pledged: { | ||
elem: '.money.usd', | ||
get: 'num' | ||
} | ||
} | ||
}; | ||
``` | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
[Demo]: https://tonicdev.com/knovour/json-web-crawler-demo |
URL strings
Supply chain riskPackage contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
URL strings
Supply chain riskPackage contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
17836
4.63%362
0.28%248
68.71%