Comparing version 1.1.0 to 1.2.0
{ | ||
"name": "oh-scrap", | ||
"version": "1.1.0", | ||
"version": "1.2.0", | ||
"description": "Node Module skeleton", | ||
@@ -18,3 +18,3 @@ "main": "lib/index.js", | ||
"babel-cli": "^6.26.0", | ||
"babel-polyfill": "^6.26.0", | ||
"babel-plugin-transform-regenerator": "^6.26.0", | ||
"babel-preset-env": "^1.6.1", | ||
@@ -29,5 +29,7 @@ "babel-register": "^6.26.0", | ||
"sinon": "^4.1.4", | ||
"sinon-chai": "^2.14.0" | ||
"sinon-chai": "^2.14.0", | ||
"webpack": "^3.10.0" | ||
}, | ||
"dependencies": { | ||
"babel-polyfill": "^6.26.0", | ||
"bluebird": "^3.5.1", | ||
@@ -37,2 +39,3 @@ "cheerio": "^1.0.0-rc.2", | ||
"is-url": "^1.2.2", | ||
"is-valid-path": "^0.1.1", | ||
"lodash": "^4.17.4", | ||
@@ -39,0 +42,0 @@ "puppeteer": "^1.0.0" |
177
src/index.js
/* eslint-disable class-methods-use-this */ | ||
import puppeteer from 'puppeteer'; | ||
import 'babel-polyfill'; | ||
import Promise from 'bluebird'; | ||
import cheerio from 'cheerio'; | ||
import Debug from 'debug'; | ||
import EventEmitter from 'events'; | ||
import isUrl from 'is-url'; | ||
import cheerio from 'cheerio'; | ||
import Promise from 'bluebird'; | ||
import os from 'os'; | ||
import { | ||
@@ -11,43 +13,61 @@ isArray, | ||
isString, | ||
uniq, | ||
} from 'lodash'; | ||
import os from 'os'; | ||
import isValidPath from 'is-valid-path'; | ||
import URL from 'url'; | ||
import Debug from 'debug'; | ||
import Engine from './engine'; | ||
const debug = Debug('OhScrap'); | ||
const debug = Debug('oh-scrap'); | ||
export default class OhScrap { | ||
class OhScrap extends EventEmitter { | ||
constructor(concurrency = os.cpus().length, strict = false) { | ||
this.browser = null; | ||
super(); | ||
this.concurrency = concurrency; | ||
this.engine = new Engine(); | ||
this.metrics = { | ||
end: 0, | ||
start: 0, | ||
}; | ||
this.strict = strict; | ||
} | ||
async retrieveContent(url, selector = 'body') { | ||
const page = await this.browser.newPage(); | ||
async init() { | ||
debug('init'); | ||
await page.setViewport({ | ||
height: 720, | ||
isLandscape: true, | ||
width: 1280, | ||
}); | ||
this.metrics.start = Date.now(); | ||
await page.goto(url, { | ||
waitUntil: 'domcontentloaded', | ||
}); | ||
await this.engine.init(); | ||
await page.waitFor(selector); | ||
this.emit('start'); | ||
} | ||
const content = await page.evaluate((sel) => { | ||
const element = document.querySelector(sel); // eslint-disable-line no-undef | ||
async teardown() { | ||
this.metrics.end = Date.now(); | ||
return element ? element.innerHTML : null; | ||
}, selector); | ||
debug(`finished in ${(this.metrics.end - this.metrics.start) / 1000}s`); | ||
await page.close(); | ||
await this.engine.teardown(); | ||
return content; | ||
this.emit('end'); | ||
debug('teardown'); | ||
} | ||
isUrl(url) { | ||
return isString(url) && isUrl(url); | ||
} | ||
isRelativeUrl(url) { | ||
return isString(url) && isValidPath(url); | ||
} | ||
getDataFromNode(node, attr = false) { | ||
return attr ? node.attr(attr) : node.text(); | ||
if (attr) { | ||
return attr === 'HTML' ? node.parent().html() : node.attr(attr); | ||
} | ||
return node.text(); | ||
} | ||
@@ -70,2 +90,14 @@ | ||
getSelectorMatches({ $, attr, element }) { | ||
const rawMatches = element | ||
.map((ind, node) => this.getDataFromNode($(node), attr)) | ||
.get(); | ||
const matches = uniq(rawMatches).filter(item => item.length > 0); | ||
debug(`filtered matches: ${rawMatches.length !== matches.length}`); | ||
return matches.length > 1 ? matches : matches[0]; | ||
} | ||
handleSelectorString(content = '', selector = '') { | ||
@@ -88,6 +120,6 @@ return new Promise((resolve, reject) => { | ||
if (count > 1) { | ||
return resolve(element.map((ind, node) => this.getDataFromNode($(node), attr)).get()); | ||
return resolve(this.getSelectorMatches({ $, attr, element })); | ||
} | ||
return this.strict ? reject(new Error('no element found')) : resolve(); | ||
return this.strict ? reject(new Error('no element found')) : resolve(false); | ||
}); | ||
@@ -116,7 +148,9 @@ } | ||
async handleSelectorArray(content = '', [sourceSelector, targetSelector]) { | ||
debug(`handleSelectorArray: ${sourceSelector} => ${targetSelector}`); | ||
const result = await this.handleSelectorString(content, sourceSelector); | ||
debug('handleSelectorArray', sourceSelector, targetSelector, result); | ||
if (this.isUrl(result) || this.isRelativeUrl(result)) { | ||
debug(`handleSelectorArray: result => url => ${result}`); | ||
if (isUrl(result)) { | ||
return this.crawl(result, targetSelector); | ||
@@ -126,2 +160,4 @@ } | ||
if (isArray(result)) { | ||
debug(`handleSelectorArray: result => array => ${result.length}`, targetSelector); | ||
return Promise.map(result, source => this.crawl(source, targetSelector), { | ||
@@ -133,10 +169,10 @@ concurrency: this.concurrency, | ||
if (this.strict) { | ||
return Promise.reject(new Error(`no result found: ${sourceSelector} => ${targetSelector}`)); | ||
return Promise.reject(new Error(`no result found: ${sourceSelector}`)); | ||
} | ||
return Promise.resolve(); | ||
return Promise.resolve(false); | ||
} | ||
handleSelector(content = '', selector) { | ||
debug('handleSelector', selector, content); | ||
debug('handleSelector', selector); | ||
@@ -156,8 +192,27 @@ return new Promise((resolve, reject) => { | ||
setBaseUrl(source) { | ||
const { hostname, protocol } = URL.parse(source); | ||
this.baseUrl = `${protocol}//${hostname}`; | ||
debug(`setBaseUrl: ${this.baseUrl}`); | ||
return this.baseUrl; | ||
} | ||
async crawl(source, selector) { | ||
let content = source.toString(); | ||
let content = source; | ||
if (isUrl(source)) { | ||
debug(`crawl link: ${source}`); | ||
content = await this.retrieveContent(source); | ||
if (this.isUrl(source)) { | ||
debug(`crawl absolute link: ${source}`); | ||
this.setBaseUrl(source); | ||
content = await this.engine.retrieveContent(source); | ||
} else if (this.isRelativeUrl(source)) { | ||
const link = URL.resolve(this.baseUrl, source); | ||
debug(`crawl relative link: ${link}`); | ||
content = await this.engine.retrieveContent(link); | ||
} | ||
@@ -168,22 +223,50 @@ | ||
async until(getSource, selector, keepGoing = () => Promise.resolve(false)) { | ||
let count = 0; | ||
await this.init(); | ||
debug('started'); | ||
/* eslint-disable no-await-in-loop, no-constant-condition */ | ||
while (true) { | ||
const source = getSource(count); | ||
let result; | ||
try { | ||
result = await this.crawl(source, selector); | ||
this.emit('data', { count, result, source }); | ||
if (!await keepGoing(count, result)) { | ||
break; | ||
} | ||
count += 1; | ||
} catch (e) { | ||
this.emit('error', e); | ||
break; | ||
} | ||
} | ||
/* eslint-enable no-await-in-loop */ | ||
await this.teardown(); | ||
return count; | ||
} | ||
async start(source, selector) { | ||
const START = Date.now(); | ||
await this.init(); | ||
debug('started'); | ||
this.browser = await puppeteer.launch({ | ||
headless: true, | ||
ignoreHTTPSErrors: true, | ||
}); | ||
const result = await this.crawl(source, selector); | ||
await this.browser.close(); | ||
await this.teardown(); | ||
const END = Date.now(); | ||
debug(`finished in ${(END - START) / 1000}s`); | ||
return result; | ||
} | ||
} | ||
module.exports = OhScrap; |
import os from 'os'; | ||
import puppeteer from 'puppeteer'; | ||
import { isArray } from 'lodash'; | ||
import Engine from '../src/engine'; | ||
import OhScrap from '../src/index'; | ||
@@ -7,2 +9,3 @@ | ||
const PAGE_2_URL = 'http://page2.com/'; | ||
const PAGE_3_URL = 'http://page3.com/'; | ||
@@ -14,4 +17,4 @@ const PAGE_1 = ` | ||
<ul> | ||
<li class="item">item1</li> | ||
<li class="item">item2</li> | ||
<li class="item" data-test="test1">item1</li> | ||
<li class="item" data-test="test2">item2</li> | ||
</ul> | ||
@@ -32,2 +35,9 @@ </body> | ||
const PAGE_3 = ` | ||
<body> | ||
<h1>TITLE PAGE 3</h1> | ||
<ul>no items</ul> | ||
</body> | ||
`; | ||
describe('given an OhScrap class', () => { | ||
@@ -45,6 +55,8 @@ let ohscrap; | ||
retrieveContentStub = sandbox.stub(OhScrap.prototype, 'retrieveContent'); | ||
retrieveContentStub = sandbox.stub(Engine.prototype, 'retrieveContent'); | ||
retrieveContentStub.withArgs(PAGE_1_URL).resolves(PAGE_1); | ||
retrieveContentStub.resolves(PAGE_2); | ||
retrieveContentStub.withArgs(PAGE_2_URL).resolves(PAGE_2); | ||
retrieveContentStub.withArgs(PAGE_3_URL).resolves(PAGE_3); | ||
retrieveContentStub.rejects(new Error('wrong page')); | ||
}); | ||
@@ -96,2 +108,21 @@ | ||
describe('when the selector is an object', () => { | ||
describe('and it looks for attributes', () => { | ||
const selector = { | ||
title: 'h1', | ||
items: '.item@data-test', | ||
}; | ||
it('should return the same object structure populated with results', async () => { | ||
const result = await ohscrap.start(PAGE_1_URL, selector); | ||
expect(result).to.deep.equal({ | ||
title: 'TITLE PAGE 1', | ||
items: [ | ||
'test1', | ||
'test2', | ||
], | ||
}); | ||
}); | ||
}); | ||
describe('and it does NOT contain deep links', () => { | ||
@@ -158,2 +189,38 @@ const selector = { | ||
}); | ||
// Exercises OhScrap#until(): pages are requested in sequence and the loop is
// expected to stop once a page yields no list items.
describe('when invoking until()', () => {
const selector = {
items: 'ul li',
};
let totalCount;
let emitStub;
beforeEach(async () => {
// Maps the iteration count to a page URL: 0 -> page1, 1 -> page2, 2 -> page3.
const getSource = count => `http://page${count + 1}.com/`;
// Continue only while the crawl produced a non-empty array of items.
const keepGoing = (count, result) => {
const flag = isArray(result.items) && result.items.length > 0;
return Promise.resolve(flag);
};
// Listener stub used to count the 'data' events emitted by until().
emitStub = sandbox.stub();
ohscrap.on('data', emitStub);
totalCount = await ohscrap.until(getSource, selector, keepGoing);
});
it('should call retrieveContentStub 3 times', () => {
expect(retrieveContentStub).to.be.calledThrice;
});
it('should call emitStub 3 times', () => {
expect(emitStub).to.be.calledThrice;
});
// The count is only incremented after keepGoing resolves truthy, so three
// crawls with the last one stopping the loop give 2.
it('should return the total count', () => {
expect(totalCount).to.equal(2);
});
});
}); |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
143659
14
429
8
13
+ Added babel-polyfill@^6.26.0
+ Added is-valid-path@^0.1.1
+ Added babel-polyfill@6.26.0 (transitive)
+ Added babel-runtime@6.26.0 (transitive)
+ Added core-js@2.6.12 (transitive)
+ Added is-extglob@1.0.0 (transitive)
+ Added is-glob@2.0.1 (transitive)
+ Added is-invalid-path@0.1.0 (transitive)
+ Added is-valid-path@0.1.1 (transitive)
+ Added regenerator-runtime@0.10.5, 0.11.1 (transitive)