Socket
Socket
Sign in · Demo · Install

oh-scrap

Package Overview
Dependencies
Maintainers
1
Versions
9
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

oh-scrap - npm Package Compare versions

Comparing version 1.3.2 to 2.0.0

65

lib/crawl.js

@@ -24,3 +24,3 @@ 'use strict';

(0, _async.waterfall)([function (next) {
_async2.default.waterfall([function (next) {
return handleSelectorString((0, _lodash.merge)(args, { selector: sourceSelector }), next);

@@ -42,3 +42,3 @@ }, function (result, next) {

(0, _async.mapLimit)(result, concurrency, function (source, mapNext) {
_async2.default.mapLimit(result, concurrency, function (source, mapNext) {
var crawlArgs = (0, _lodash.merge)(args, {

@@ -74,2 +74,4 @@ selector: targetSelector,

var _async2 = _interopRequireDefault(_async);
var _debug = require('debug');

@@ -124,10 +126,11 @@

done(new Error('no element found'));
done();
}
function handleSelectorObject(args, done) {
var selector = args.selector;
var concurrency = args.concurrency,
selector = args.selector;
(0, _async.mapValuesLimit)(selector, 1, function (value, key, next) {
_async2.default.mapValuesLimit(selector, concurrency, function (value, key, next) {
return handleSelector((0, _lodash.merge)(args, { selector: value }), next);

@@ -156,3 +159,5 @@ }, done);

var engine = args.engine,
source = args.source;
retry = args.retry,
url = args.url,
waitForSelector = args.waitForSelector;

@@ -163,11 +168,11 @@ var context = (0, _lodash.merge)({}, args.context);

if ((0, _utils.isUrl)(source)) {
context.url = source;
context.baseUrl = (0, _utils.getBaseUrl)(source);
if ((0, _utils.isUrl)(url)) {
context.url = url;
context.baseUrl = (0, _utils.getBaseUrl)(url);
debug('crawl absolute link: ' + source);
debug('crawl absolute link: ' + url);
link = source;
} else if ((0, _utils.isRelativeUrl)(source)) {
link = _url2.default.resolve(context.baseUrl, source);
link = url;
} else if ((0, _utils.isRelativeUrl)(url)) {
link = _url2.default.resolve(context.baseUrl, url);

@@ -179,8 +184,32 @@ context.url = link;

engine.retrieveContent(link).catch(done).then(function (content) {
handleSelector((0, _lodash.merge)(args, {
content: content,
context: context
}), done);
var attempt = 0;
var interval = retry.interval,
times = retry.times;
_async2.default.retry({
interval: interval,
times: times
}, function (callback) {
debug('retrieveContent attempt ' + attempt + ' => ' + link);
attempt += 1;
engine.retrieveContent(link, waitForSelector).then(function (content) {
if (!(0, _lodash.isString)(content) || content.length < 100) {
callback(new Error('invalid content'));
return;
}
debug('content', content.length);
handleSelector((0, _lodash.merge)(args, {
content: content,
context: context
}), callback);
}, callback);
}, function (err, res) {
return done(null, res);
});
}

@@ -42,3 +42,3 @@ 'use strict';

var _ref = _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee(url) {
var selector = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'body';
var waitForSelector = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'body';
var page, content;

@@ -49,35 +49,37 @@ return regeneratorRuntime.wrap(function _callee$(_context) {

case 0:
_context.next = 2;
_context.prev = 0;
_context.next = 3;
return this.browser.newPage();
case 2:
case 3:
page = _context.sent;
_context.next = 5;
_context.next = 6;
return page.setViewport(this.viewportOptions);
case 5:
_context.next = 7;
case 6:
_context.next = 8;
return page.goto(url, this.gotoOptions);
case 7:
_context.next = 9;
return page.waitFor(selector);
case 8:
_context.next = 10;
return page.waitForSelector(waitForSelector);
case 9:
_context.next = 11;
return page.evaluate(function (sel) {
var element = document.querySelector(sel); // eslint-disable-line no-undef
case 10:
_context.next = 12;
return page.content();
return element ? element.innerHTML : null;
}, selector);
case 11:
case 12:
content = _context.sent;
_context.next = 14;
_context.next = 15;
return page.close();
case 14:
case 15:
return _context.abrupt('return', content);
case 15:
case 18:
_context.prev = 18;
_context.t0 = _context['catch'](0);
return _context.abrupt('return', _context.t0);
case 21:
case 'end':

@@ -87,3 +89,3 @@ return _context.stop();

}
}, _callee, this);
}, _callee, this, [[0, 18]]);
}));

@@ -105,10 +107,16 @@

case 0:
_context2.next = 2;
_context2.prev = 0;
_context2.next = 3;
return _puppeteer2.default.launch(this.launchOptions);
case 2:
case 3:
this.browser = _context2.sent;
return _context2.abrupt('return', this.browser);
case 4:
case 7:
_context2.prev = 7;
_context2.t0 = _context2['catch'](0);
return _context2.abrupt('return', _context2.t0);
case 10:
case 'end':

@@ -118,3 +126,3 @@ return _context2.stop();

}
}, _callee2, this);
}, _callee2, this, [[0, 7]]);
}));

@@ -121,0 +129,0 @@

@@ -44,2 +44,3 @@ 'use strict';

var concurrency = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : _os2.default.cpus().length;
var retry = arguments[1];

@@ -56,2 +57,6 @@ _classCallCheck(this, OhScrap);

};
_this.retry = retry || {
interval: 1500,
times: 5
};
return _this;

@@ -131,9 +136,7 @@ }

value: function () {
var _ref3 = _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee5(getSource, selector) {
var _ref3 = _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee5(args, done) {
var _this2 = this;
var keepGoing = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : function () {
return false;
};
var count;
var getUrl, selector, _args$keepGoing, keepGoing, waitForSelector, count;
return regeneratorRuntime.wrap(function _callee5$(_context5) {

@@ -143,86 +146,120 @@ while (1) {

case 0:
count = 0;
_context5.next = 3;
getUrl = args.getUrl, selector = args.selector, _args$keepGoing = args.keepGoing, keepGoing = _args$keepGoing === undefined ? function () {
return false;
} : _args$keepGoing, waitForSelector = args.waitForSelector;
_context5.prev = 1;
_context5.next = 4;
return this.init();
case 3:
case 4:
_context5.next = 10;
break;
debug('started');
case 6:
_context5.prev = 6;
_context5.t0 = _context5['catch'](1);
return _context5.abrupt('return', new Promise(function (resolve) {
(0, _async.forever)(function (next) {
var source = getSource(count);
done(_context5.t0);
return _context5.abrupt('return');
(0, _crawl.crawl)({
engine: _this2.engine,
selector: selector,
source: source
}, function () {
var _ref4 = _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee3(err, result) {
var flag;
return regeneratorRuntime.wrap(function _callee3$(_context3) {
while (1) {
switch (_context3.prev = _context3.next) {
case 0:
if (!err) {
_context3.next = 3;
break;
}
case 10:
next(err);
return _context3.abrupt('return');
debug('until');
case 3:
count = 0;
_this2.emit('data', { count: count, result: result, source: source });
_context3.next = 6;
return keepGoing({ count: count, result: result, source: source });
(0, _async.forever)(function (next) {
debug('count ' + count);
case 6:
flag = _context3.sent;
var url = getUrl(count);
_this2.start({
selector: selector,
url: url,
waitForSelector: waitForSelector
}, function () {
var _ref4 = _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee3(err, result) {
var flag;
return regeneratorRuntime.wrap(function _callee3$(_context3) {
while (1) {
switch (_context3.prev = _context3.next) {
case 0:
if (!err) {
_context3.next = 3;
break;
}
if (flag) {
count += 1;
next(err);
return _context3.abrupt('return');
next();
} else {
next(count);
}
case 3:
_context3.next = 5;
return keepGoing({ count: count, result: result, url: url });
case 8:
case 'end':
return _context3.stop();
}
case 5:
flag = _context3.sent;
debug('keepGoing', flag);
if (!(flag !== true)) {
_context3.next = 10;
break;
}
next(true);
return _context3.abrupt('return');
case 10:
_this2.emit('data', { count: count, result: result, url: url });
count += 1;
next();
case 13:
case 'end':
return _context3.stop();
}
}, _callee3, _this2);
}));
}
}, _callee3, _this2);
}));
return function (_x5, _x6) {
return _ref4.apply(this, arguments);
};
}());
}, _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee4() {
return regeneratorRuntime.wrap(function _callee4$(_context4) {
while (1) {
switch (_context4.prev = _context4.next) {
case 0:
_context4.next = 2;
return _this2.teardown();
return function (_x4, _x5) {
return _ref4.apply(this, arguments);
};
}());
}, _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee4() {
return regeneratorRuntime.wrap(function _callee4$(_context4) {
while (1) {
switch (_context4.prev = _context4.next) {
case 0:
_context4.prev = 0;
_context4.next = 3;
return _this2.teardown();
case 2:
case 3:
_context4.next = 9;
break;
resolve(count);
case 5:
_context4.prev = 5;
_context4.t0 = _context4['catch'](0);
case 3:
case 'end':
return _context4.stop();
}
done(_context4.t0);
return _context4.abrupt('return');
case 9:
done(null, count);
case 10:
case 'end':
return _context4.stop();
}
}, _callee4, _this2);
})));
}));
}
}, _callee4, _this2, [[0, 5]]);
})));
case 5:
case 13:
case 'end':

@@ -232,6 +269,6 @@ return _context5.stop();

}
}, _callee5, this);
}, _callee5, this, [[1, 6]]);
}));
function until(_x3, _x4) {
function until(_x2, _x3) {
return _ref3.apply(this, arguments);

@@ -245,5 +282,7 @@ }

value: function () {
var _ref6 = _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee7(source, selector) {
var _ref6 = _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee7(args, done) {
var _this3 = this;
var selector, url, _args$waitForSelector, waitForSelector;
return regeneratorRuntime.wrap(function _callee7$(_context7) {

@@ -253,52 +292,77 @@ while (1) {

case 0:
_context7.next = 2;
selector = args.selector, url = args.url, _args$waitForSelector = args.waitForSelector, waitForSelector = _args$waitForSelector === undefined ? 'body' : _args$waitForSelector;
_context7.prev = 1;
_context7.next = 4;
return this.init();
case 2:
case 4:
_context7.next = 10;
break;
case 6:
_context7.prev = 6;
_context7.t0 = _context7['catch'](1);
done(_context7.t0);
return _context7.abrupt('return');
case 10:
debug('started');
return _context7.abrupt('return', new Promise(function (resolve, reject) {
(0, _crawl.crawl)({
concurrency: _this3.concurrency,
engine: _this3.engine,
selector: selector,
source: source
}, function () {
var _ref7 = _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee6(err, res) {
return regeneratorRuntime.wrap(function _callee6$(_context6) {
while (1) {
switch (_context6.prev = _context6.next) {
case 0:
_context6.next = 2;
return _this3.teardown();
(0, _crawl.crawl)({
concurrency: this.concurrency,
engine: this.engine,
retry: this.retry,
selector: selector,
url: url,
waitForSelector: waitForSelector
}, function () {
var _ref7 = _asyncToGenerator( /*#__PURE__*/regeneratorRuntime.mark(function _callee6(err, res) {
return regeneratorRuntime.wrap(function _callee6$(_context6) {
while (1) {
switch (_context6.prev = _context6.next) {
case 0:
_context6.prev = 0;
_context6.next = 3;
return _this3.teardown();
case 2:
if (!err) {
_context6.next = 5;
break;
}
case 3:
_context6.next = 9;
break;
reject(err);
return _context6.abrupt('return');
case 5:
_context6.prev = 5;
_context6.t0 = _context6['catch'](0);
case 5:
done(_context6.t0);
return _context6.abrupt('return');
resolve(res);
case 9:
if (!err) {
_context6.next = 12;
break;
}
case 6:
case 'end':
return _context6.stop();
}
done(err);
return _context6.abrupt('return');
case 12:
done(null, res);
case 13:
case 'end':
return _context6.stop();
}
}, _callee6, _this3);
}));
}
}, _callee6, _this3, [[0, 5]]);
}));
return function (_x9, _x10) {
return _ref7.apply(this, arguments);
};
}());
}));
return function (_x8, _x9) {
return _ref7.apply(this, arguments);
};
}());
case 4:
case 12:
case 'end':

@@ -308,6 +372,6 @@ return _context7.stop();

}
}, _callee7, this);
}, _callee7, this, [[1, 6]]);
}));
function start(_x7, _x8) {
function start(_x6, _x7) {
return _ref6.apply(this, arguments);

@@ -314,0 +378,0 @@ }

{
"name": "oh-scrap",
"version": "1.3.2",
"version": "2.0.0",
"description": "Node Module skeleton",

@@ -5,0 +5,0 @@ "main": "lib/index.js",

/* eslint-disable no-use-before-define */
import { mapLimit, mapValuesLimit, waterfall } from 'async';
import async from 'async';
import Debug from 'debug';

@@ -41,11 +41,11 @@ import {

done(new Error('no element found'));
done();
}
export function handleSelectorObject(args, done) {
const { selector } = args;
const { concurrency, selector } = args;
mapValuesLimit(
async.mapValuesLimit(
selector,
1,
concurrency,
(value, key, next) => handleSelector(merge(args, { selector: value }), next),

@@ -62,3 +62,3 @@ done,

waterfall([
async.waterfall([
next => handleSelectorString(merge(args, { selector: sourceSelector }), next),

@@ -80,3 +80,3 @@ (result, next) => {

mapLimit(result, concurrency, (source, mapNext) => {
async.mapLimit(result, concurrency, (source, mapNext) => {
const crawlArgs = merge(args, {

@@ -111,3 +111,5 @@ selector: targetSelector,

export function crawl(args, done) {
const { engine, source } = args;
const {
engine, retry, url, waitForSelector,
} = args;
const context = merge({}, args.context);

@@ -117,11 +119,11 @@

if (isUrl(source)) {
context.url = source;
context.baseUrl = getBaseUrl(source);
if (isUrl(url)) {
context.url = url;
context.baseUrl = getBaseUrl(url);
debug(`crawl absolute link: ${source}`);
debug(`crawl absolute link: ${url}`);
link = source;
} else if (isRelativeUrl(source)) {
link = URL.resolve(context.baseUrl, source);
link = url;
} else if (isRelativeUrl(url)) {
link = URL.resolve(context.baseUrl, url);

@@ -133,10 +135,29 @@ context.url = link;

engine.retrieveContent(link)
.catch(done)
.then((content) => {
handleSelector(merge(args, {
content,
context,
}), done);
});
let attempt = 0;
const { interval, times } = retry;
async.retry({
interval,
times,
}, (callback) => {
debug(`retrieveContent attempt ${attempt} => ${link}`);
attempt += 1;
engine.retrieveContent(link, waitForSelector)
.then((content) => {
if (!isString(content) || content.length < 100) {
callback(new Error('invalid content'));
return;
}
debug('content', content.length);
handleSelector(merge(args, {
content,
context,
}), callback);
}, callback);
}, (err, res) => done(null, res));
}

@@ -20,26 +20,30 @@ import puppeteer from 'puppeteer';

async retrieveContent(url, selector = 'body') {
const page = await this.browser.newPage();
async retrieveContent(url, waitForSelector = 'body') {
try {
const page = await this.browser.newPage();
await page.setViewport(this.viewportOptions);
await page.setViewport(this.viewportOptions);
await page.goto(url, this.gotoOptions);
await page.goto(url, this.gotoOptions);
await page.waitFor(selector);
await page.waitForSelector(waitForSelector);
const content = await page.evaluate((sel) => {
const element = document.querySelector(sel); // eslint-disable-line no-undef
const content = await page.content();
return element ? element.innerHTML : null;
}, selector);
await page.close();
await page.close();
return content;
return content;
} catch (e) {
return e;
}
}
async init() {
this.browser = await puppeteer.launch(this.launchOptions);
try {
this.browser = await puppeteer.launch(this.launchOptions);
return this.browser;
return this.browser;
} catch (e) {
return e;
}
}

@@ -46,0 +50,0 @@

@@ -15,3 +15,3 @@ import 'babel-polyfill';

class OhScrap extends EventEmitter {
constructor(concurrency = os.cpus().length) {
constructor(concurrency = os.cpus().length, retry) {
super();

@@ -25,2 +25,6 @@

};
this.retry = retry || {
interval: 1500,
times: 5,
};
}

@@ -50,64 +54,94 @@

async until(getSource, selector, keepGoing = () => false) {
async until(args, done) {
const {
getUrl,
selector,
keepGoing = () => false,
waitForSelector,
} = args;
try {
await this.init();
} catch (e) {
done(e);
return;
}
debug('until');
let count = 0;
await this.init();
forever((next) => {
debug(`count ${count}`);
debug('started');
const url = getUrl(count);
return new Promise((resolve) => {
forever((next) => {
const source = getSource(count);
this.start({
selector,
url,
waitForSelector,
}, async (err, result) => {
if (err) {
next(err);
return;
}
crawl({
engine: this.engine,
selector,
source,
}, async (err, result) => {
if (err) {
next(err);
return;
}
const flag = await keepGoing({ count, result, url });
this.emit('data', { count, result, source });
debug('keepGoing', flag);
const flag = await keepGoing({ count, result, source });
if (flag !== true) {
next(true);
return;
}
if (flag) {
count += 1;
next();
} else {
next(count);
}
});
}, async () => {
this.emit('data', { count, result, url });
count += 1;
next();
});
}, async () => {
try {
await this.teardown();
} catch (e) {
done(e);
return;
}
resolve(count);
});
done(null, count);
});
}
async start(source, selector) {
await this.init();
async start(args, done) {
const { selector, url, waitForSelector = 'body' } = args;
try {
await this.init();
} catch (e) {
done(e);
return;
}
debug('started');
return new Promise((resolve, reject) => {
crawl({
concurrency: this.concurrency,
engine: this.engine,
selector,
source,
}, async (err, res) => {
crawl({
concurrency: this.concurrency,
engine: this.engine,
retry: this.retry,
selector,
url,
waitForSelector,
}, async (err, res) => {
try {
await this.teardown();
} catch (e) {
done(e);
return;
}
if (err) {
reject(err);
return;
}
if (err) {
done(err);
return;
}
resolve(res);
});
done(null, res);
});

@@ -114,0 +148,0 @@ }

@@ -37,2 +37,3 @@ import os from 'os';

<ul>no items</ul>
<p>something else to reach 100 characters</p>
</body>

@@ -93,6 +94,7 @@ `;

it('should return a string result', async () => {
const result = await ohscrap.start(PAGE_1_URL, selector);
expect(result).to.equal('TITLE PAGE 1');
it('should return a string result', (done) => {
ohscrap.start({ url: PAGE_1_URL, selector }, (err, result) => {
expect(result).to.equal('TITLE PAGE 1');
done();
});
});

@@ -108,11 +110,12 @@ });

it('should return the same object structure populated with results', async () => {
const result = await ohscrap.start(PAGE_1_URL, selector);
expect(result).to.deep.equal({
title: 'TITLE PAGE 1',
items: [
'test1',
'test2',
],
it('should return the same object structure populated with results', (done) => {
ohscrap.start({ url: PAGE_1_URL, selector }, (err, result) => {
expect(result).to.deep.equal({
title: 'TITLE PAGE 1',
items: [
'test1',
'test2',
],
});
done();
});

@@ -128,11 +131,12 @@ });

it('should return the same object structure populated with results', async () => {
const result = await ohscrap.start(PAGE_1_URL, selector);
expect(result).to.deep.equal({
title: 'TITLE PAGE 1',
items: [
'item1',
'item2',
],
it('should return the same object structure populated with results', (done) => {
ohscrap.start({ url: PAGE_1_URL, selector }, (err, result) => {
expect(result).to.deep.equal({
title: 'TITLE PAGE 1',
items: [
'item1',
'item2',
],
});
done();
});

@@ -142,3 +146,3 @@ });

describe('and it does contain deep links', () => {
describe.skip('and it does contain deep links', () => {
const selector = {

@@ -155,4 +159,7 @@ title: 'h1',

beforeEach(async () => {
result = await ohscrap.start(PAGE_1_URL, selector);
beforeEach((done) => {
ohscrap.start({ url: PAGE_1_URL, selector }, (err, data) => {
result = data;
done();
});
});

@@ -191,14 +198,13 @@

};
const getUrl = count => `http://page${count + 1}.com/`;
const keepGoing = ({ result }) => {
const flag = isArray(result.items) && result.items.length > 0;
return Promise.resolve(flag);
};
let emitStub;
let totalCount;
let emitStub;
beforeEach(async () => {
const getSource = count => `http://page${count + 1}.com/`;
const keepGoing = ({ result }) => {
const flag = isArray(result.items) && result.items.length > 0;
return Promise.resolve(flag);
};
beforeEach((done) => {
emitStub = sandbox.stub();

@@ -208,3 +214,6 @@

totalCount = await ohscrap.until(getSource, selector, keepGoing);
ohscrap.until({ getUrl, selector, keepGoing }, (err, res) => {
totalCount = res;
done();
});
});

@@ -211,0 +220,0 @@

Socket · Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc