Comparing version 1.2.0 to 1.3.0
{ | ||
"name": "oh-scrap", | ||
"version": "1.2.0", | ||
"version": "1.3.0", | ||
"description": "Node Module skeleton", | ||
@@ -32,4 +32,4 @@ "main": "lib/index.js", | ||
"dependencies": { | ||
"async": "^2.6.0", | ||
"babel-polyfill": "^6.26.0", | ||
"bluebird": "^3.5.1", | ||
"cheerio": "^1.0.0-rc.2", | ||
@@ -36,0 +36,0 @@ "debug": "^3.1.0", |
242
src/index.js
@@ -1,25 +0,16 @@ | ||
/* eslint-disable class-methods-use-this */ | ||
import 'babel-polyfill'; | ||
import Promise from 'bluebird'; | ||
import cheerio from 'cheerio'; | ||
import { forever } from 'async'; | ||
import Debug from 'debug'; | ||
import EventEmitter from 'events'; | ||
import isUrl from 'is-url'; | ||
import { | ||
isArray, | ||
isPlainObject, | ||
isString, | ||
uniq, | ||
} from 'lodash'; | ||
import os from 'os'; | ||
import isValidPath from 'is-valid-path'; | ||
import URL from 'url'; | ||
import Engine from './engine'; | ||
import { crawl } from './crawl'; | ||
const debug = Debug('oh-scrap'); | ||
class OhScrap extends EventEmitter { | ||
constructor(concurrency = os.cpus().length, strict = false) { | ||
constructor(concurrency = os.cpus().length) { | ||
super(); | ||
@@ -33,3 +24,2 @@ | ||
}; | ||
this.strict = strict; | ||
} | ||
@@ -59,161 +49,3 @@ | ||
isUrl(url) { | ||
return isString(url) && isUrl(url); | ||
} | ||
isRelativeUrl(url) { | ||
return isString(url) && isValidPath(url); | ||
} | ||
getDataFromNode(node, attr = false) { | ||
if (attr) { | ||
return attr === 'HTML' ? node.parent().html() : node.attr(attr); | ||
} | ||
return node.text(); | ||
} | ||
loadContent(content = '', selector = '') { | ||
const parts = selector.split('@'); | ||
const xpath = parts[0]; | ||
const attr = parts[1]; | ||
const $ = cheerio.load(content); | ||
return { | ||
attr, | ||
element: $(xpath), | ||
xpath, | ||
$, | ||
}; | ||
} | ||
getSelectorMatches({ $, attr, element }) { | ||
const rawMatches = element | ||
.map((ind, node) => this.getDataFromNode($(node), attr)) | ||
.get(); | ||
const matches = uniq(rawMatches).filter(item => item.length > 0); | ||
debug(`filtered matches: ${rawMatches.length !== matches.length}`); | ||
return matches.length > 1 ? matches : matches[0]; | ||
} | ||
handleSelectorString(content = '', selector = '') { | ||
return new Promise((resolve, reject) => { | ||
const { | ||
attr, | ||
element, | ||
$, | ||
} = this.loadContent(content, selector); | ||
const count = element.length; | ||
debug(`handleSelectorString "${selector}" => ${count}`); | ||
if (count === 1) { | ||
return resolve(this.getDataFromNode(element, attr)); | ||
} | ||
if (count > 1) { | ||
return resolve(this.getSelectorMatches({ $, attr, element })); | ||
} | ||
return this.strict ? reject(new Error('no element found')) : resolve(false); | ||
}); | ||
} | ||
handleSelectorObject(content = '', selector) { | ||
return new Promise(async (resolve, reject) => { | ||
const newObject = {}; | ||
try { | ||
/* eslint-disable no-await-in-loop, no-restricted-syntax */ | ||
for (const [key, value] of Object.entries(selector)) { | ||
newObject[key] = await this.handleSelector(content, value); | ||
} | ||
/* eslint-enable no-await-in-loop, no-restricted-syntax */ | ||
} catch (e) { | ||
reject(e); | ||
return; | ||
} | ||
resolve(newObject); | ||
}); | ||
} | ||
async handleSelectorArray(content = '', [sourceSelector, targetSelector]) { | ||
debug(`handleSelectorArray: ${sourceSelector} => ${targetSelector}`); | ||
const result = await this.handleSelectorString(content, sourceSelector); | ||
if (this.isUrl(result) || this.isRelativeUrl(result)) { | ||
debug(`handleSelectorArray: result => url => ${result}`); | ||
return this.crawl(result, targetSelector); | ||
} | ||
if (isArray(result)) { | ||
debug(`handleSelectorArray: result => array => ${result.length}`, targetSelector); | ||
return Promise.map(result, source => this.crawl(source, targetSelector), { | ||
concurrency: this.concurrency, | ||
}); | ||
} | ||
if (this.strict) { | ||
return Promise.reject(new Error(`no result found: ${sourceSelector}`)); | ||
} | ||
return Promise.resolve(false); | ||
} | ||
handleSelector(content = '', selector) { | ||
debug('handleSelector', selector); | ||
return new Promise((resolve, reject) => { | ||
if (isString(selector)) { | ||
resolve(this.handleSelectorString(content, selector)); | ||
} else if (isPlainObject(selector)) { | ||
resolve(this.handleSelectorObject(content, selector)); | ||
} else if (isArray(selector)) { | ||
resolve(this.handleSelectorArray(content, selector)); | ||
} else { | ||
reject(new Error('selector type not valid')); | ||
} | ||
}); | ||
} | ||
setBaseUrl(source) { | ||
const { hostname, protocol } = URL.parse(source); | ||
this.baseUrl = `${protocol}//${hostname}`; | ||
debug(`setBaseUrl: ${this.baseUrl}`); | ||
return this.baseUrl; | ||
} | ||
async crawl(source, selector) { | ||
let content = source; | ||
if (this.isUrl(source)) { | ||
debug(`crawl absolute link: ${source}`); | ||
this.setBaseUrl(source); | ||
content = await this.engine.retrieveContent(source); | ||
} else if (this.isRelativeUrl(source)) { | ||
const link = URL.resolve(this.baseUrl, source); | ||
debug(`crawl relative link: ${link}`); | ||
content = await this.engine.retrieveContent(link); | ||
} | ||
return this.handleSelector(content, selector); | ||
} | ||
async until(getSource, selector, keepGoing = () => Promise.resolve(false)) { | ||
async until(getSource, selector, keepGoing = () => false) { | ||
let count = 0; | ||
@@ -225,27 +57,34 @@ | ||
/* eslint-disable no-await-in-loop, no-constant-condition */ | ||
while (true) { | ||
const source = getSource(count); | ||
let result; | ||
return new Promise((resolve) => { | ||
forever((next) => { | ||
const source = getSource(count); | ||
try { | ||
result = await this.crawl(source, selector); | ||
crawl({ | ||
engine: this.engine, | ||
selector, | ||
source, | ||
}, async (err, result) => { | ||
if (err) { | ||
next(err); | ||
return; | ||
} | ||
this.emit('data', { count, result, source }); | ||
this.emit('data', { count, result, source }); | ||
if (!await keepGoing(count, result)) { | ||
break; | ||
} | ||
const flag = await keepGoing({ count, result, source }); | ||
count += 1; | ||
} catch (e) { | ||
this.emit('error', e); | ||
break; | ||
} | ||
} | ||
/* eslint-enable no-await-in-loop */ | ||
if (flag) { | ||
count += 1; | ||
await this.teardown(); | ||
next(); | ||
} else { | ||
next(count); | ||
} | ||
}); | ||
}, async () => { | ||
await this.teardown(); | ||
return count; | ||
resolve(count); | ||
}); | ||
}); | ||
} | ||
@@ -258,8 +97,19 @@ | ||
return new Promise((resolve, reject) => { | ||
crawl({ | ||
concurrency: this.concurrency, | ||
engine: this.engine, | ||
selector, | ||
source, | ||
}, async (err, res) => { | ||
await this.teardown(); | ||
const result = await this.crawl(source, selector); | ||
if (err) { | ||
reject(err); | ||
return; | ||
} | ||
await this.teardown(); | ||
return result; | ||
resolve(res); | ||
}); | ||
}); | ||
} | ||
@@ -266,0 +116,0 @@ } |
@@ -69,8 +69,7 @@ import os from 'os'; | ||
describe('when passing concurrency and strict to the constructor', () => { | ||
describe('when passing concurrency to the constructor', () => { | ||
const CONCURRENCY = 2; | ||
const STRICT = true; | ||
beforeEach(() => { | ||
ohscrap = new OhScrap(CONCURRENCY, STRICT); | ||
ohscrap = new OhScrap(CONCURRENCY); | ||
}); | ||
@@ -80,13 +79,10 @@ | ||
expect(ohscrap.concurrency).to.equal(CONCURRENCY); | ||
expect(ohscrap.strict).to.equal(STRICT); | ||
}); | ||
}); | ||
describe('when NOT passing concurrency and strict to the constructor', () => { | ||
describe('when NOT passing concurrency to the constructor', () => { | ||
const CONCURRENCY = os.cpus().length; | ||
const STRICT = false; | ||
it('should use the default settings', () => { | ||
expect(ohscrap.concurrency).to.equal(CONCURRENCY); | ||
expect(ohscrap.strict).to.equal(STRICT); | ||
}); | ||
@@ -197,3 +193,3 @@ }); | ||
const getSource = count => `http://page${count + 1}.com/`; | ||
const keepGoing = (count, result) => { | ||
const keepGoing = ({ result }) => { | ||
const flag = isArray(result.items) && result.items.length > 0; | ||
@@ -215,4 +211,4 @@ | ||
it('should call emitStub 3 times', () => { | ||
expect(emitStub).to.be.calledThrice; | ||
it('should call emitStub 2 times', () => { | ||
expect(emitStub).to.be.calledTwice; | ||
}); | ||
@@ -219,0 +215,0 @@ |
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
143801
17
466
+ Added async@^2.6.0
+ Added async@2.6.4 (transitive)
- Removed bluebird@^3.5.1
- Removed bluebird@3.7.2 (transitive)