Socket
Socket
Sign in · Demo · Install

oh-scrap

Package Overview
Dependencies
Maintainers
1
Versions
9
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

oh-scrap - npm Package Compare versions

Comparing version 1.1.0 to 1.2.0

src/engine.js

9

package.json
{
"name": "oh-scrap",
"version": "1.1.0",
"version": "1.2.0",
"description": "Node Module skeleton",

@@ -18,3 +18,3 @@ "main": "lib/index.js",

"babel-cli": "^6.26.0",
"babel-polyfill": "^6.26.0",
"babel-plugin-transform-regenerator": "^6.26.0",
"babel-preset-env": "^1.6.1",

@@ -29,5 +29,7 @@ "babel-register": "^6.26.0",

"sinon": "^4.1.4",
"sinon-chai": "^2.14.0"
"sinon-chai": "^2.14.0",
"webpack": "^3.10.0"
},
"dependencies": {
"babel-polyfill": "^6.26.0",
"bluebird": "^3.5.1",

@@ -37,2 +39,3 @@ "cheerio": "^1.0.0-rc.2",

"is-url": "^1.2.2",
"is-valid-path": "^0.1.1",
"lodash": "^4.17.4",

@@ -39,0 +42,0 @@ "puppeteer": "^1.0.0"

/* eslint-disable class-methods-use-this */
import puppeteer from 'puppeteer';
import 'babel-polyfill';
import Promise from 'bluebird';
import cheerio from 'cheerio';
import Debug from 'debug';
import EventEmitter from 'events';
import isUrl from 'is-url';
import cheerio from 'cheerio';
import Promise from 'bluebird';
import os from 'os';
import {

@@ -11,43 +13,61 @@ isArray,

isString,
uniq,
} from 'lodash';
import os from 'os';
import isValidPath from 'is-valid-path';
import URL from 'url';
import Debug from 'debug';
import Engine from './engine';
const debug = Debug('OhScrap');
const debug = Debug('oh-scrap');
export default class OhScrap {
class OhScrap extends EventEmitter {
constructor(concurrency = os.cpus().length, strict = false) {
this.browser = null;
super();
this.concurrency = concurrency;
this.engine = new Engine();
this.metrics = {
end: 0,
start: 0,
};
this.strict = strict;
}
async retrieveContent(url, selector = 'body') {
const page = await this.browser.newPage();
async init() {
debug('init');
await page.setViewport({
height: 720,
isLandscape: true,
width: 1280,
});
this.metrics.start = Date.now();
await page.goto(url, {
waitUntil: 'domcontentloaded',
});
await this.engine.init();
await page.waitFor(selector);
this.emit('start');
}
const content = await page.evaluate((sel) => {
const element = document.querySelector(sel); // eslint-disable-line no-undef
async teardown() {
this.metrics.end = Date.now();
return element ? element.innerHTML : null;
}, selector);
debug(`finished in ${(this.metrics.end - this.metrics.start) / 1000}s`);
await page.close();
await this.engine.teardown();
return content;
this.emit('end');
debug('teardown');
}
isUrl(url) {
return isString(url) && isUrl(url);
}
isRelativeUrl(url) {
return isString(url) && isValidPath(url);
}
getDataFromNode(node, attr = false) {
return attr ? node.attr(attr) : node.text();
if (attr) {
return attr === 'HTML' ? node.parent().html() : node.attr(attr);
}
return node.text();
}

@@ -70,2 +90,14 @@

getSelectorMatches({ $, attr, element }) {
const rawMatches = element
.map((ind, node) => this.getDataFromNode($(node), attr))
.get();
const matches = uniq(rawMatches).filter(item => item.length > 0);
debug(`filtered matches: ${rawMatches.length !== matches.length}`);
return matches.length > 1 ? matches : matches[0];
}
handleSelectorString(content = '', selector = '') {

@@ -88,6 +120,6 @@ return new Promise((resolve, reject) => {

if (count > 1) {
return resolve(element.map((ind, node) => this.getDataFromNode($(node), attr)).get());
return resolve(this.getSelectorMatches({ $, attr, element }));
}
return this.strict ? reject(new Error('no element found')) : resolve();
return this.strict ? reject(new Error('no element found')) : resolve(false);
});

@@ -116,7 +148,9 @@ }

async handleSelectorArray(content = '', [sourceSelector, targetSelector]) {
debug(`handleSelectorArray: ${sourceSelector} => ${targetSelector}`);
const result = await this.handleSelectorString(content, sourceSelector);
debug('handleSelectorArray', sourceSelector, targetSelector, result);
if (this.isUrl(result) || this.isRelativeUrl(result)) {
debug(`handleSelectorArray: result => url => ${result}`);
if (isUrl(result)) {
return this.crawl(result, targetSelector);

@@ -126,2 +160,4 @@ }

if (isArray(result)) {
debug(`handleSelectorArray: result => array => ${result.length}`, targetSelector);
return Promise.map(result, source => this.crawl(source, targetSelector), {

@@ -133,10 +169,10 @@ concurrency: this.concurrency,

if (this.strict) {
return Promise.reject(new Error(`no result found: ${sourceSelector} => ${targetSelector}`));
return Promise.reject(new Error(`no result found: ${sourceSelector}`));
}
return Promise.resolve();
return Promise.resolve(false);
}
handleSelector(content = '', selector) {
debug('handleSelector', selector, content);
debug('handleSelector', selector);

@@ -156,8 +192,27 @@ return new Promise((resolve, reject) => {

setBaseUrl(source) {
const { hostname, protocol } = URL.parse(source);
this.baseUrl = `${protocol}//${hostname}`;
debug(`setBaseUrl: ${this.baseUrl}`);
return this.baseUrl;
}
async crawl(source, selector) {
let content = source.toString();
let content = source;
if (isUrl(source)) {
debug(`crawl link: ${source}`);
content = await this.retrieveContent(source);
if (this.isUrl(source)) {
debug(`crawl absolute link: ${source}`);
this.setBaseUrl(source);
content = await this.engine.retrieveContent(source);
} else if (this.isRelativeUrl(source)) {
const link = URL.resolve(this.baseUrl, source);
debug(`crawl relative link: ${link}`);
content = await this.engine.retrieveContent(link);
}

@@ -168,22 +223,50 @@

async until(getSource, selector, keepGoing = () => Promise.resolve(false)) {
let count = 0;
await this.init();
debug('started');
/* eslint-disable no-await-in-loop, no-constant-condition */
while (true) {
const source = getSource(count);
let result;
try {
result = await this.crawl(source, selector);
this.emit('data', { count, result, source });
if (!await keepGoing(count, result)) {
break;
}
count += 1;
} catch (e) {
this.emit('error', e);
break;
}
}
/* eslint-enable no-await-in-loop */
await this.teardown();
return count;
}
async start(source, selector) {
const START = Date.now();
await this.init();
debug('started');
this.browser = await puppeteer.launch({
headless: true,
ignoreHTTPSErrors: true,
});
const result = await this.crawl(source, selector);
await this.browser.close();
await this.teardown();
const END = Date.now();
debug(`finished in ${(END - START) / 1000}s`);
return result;
}
}
module.exports = OhScrap;
import os from 'os';
import puppeteer from 'puppeteer';
import { isArray } from 'lodash';
import Engine from '../src/engine';
import OhScrap from '../src/index';

@@ -7,2 +9,3 @@

const PAGE_2_URL = 'http://page2.com/';
const PAGE_3_URL = 'http://page3.com/';

@@ -14,4 +17,4 @@ const PAGE_1 = `

<ul>
<li class="item">item1</li>
<li class="item">item2</li>
<li class="item" data-test="test1">item1</li>
<li class="item" data-test="test2">item2</li>
</ul>

@@ -32,2 +35,9 @@ </body>

// Fixture markup for the third page: its <ul> holds plain text and no <li>
// children, so a list-item selector finds nothing in it.
const PAGE_3 = `
<body>
<h1>TITLE PAGE 3</h1>
<ul>no items</ul>
</body>
`;
describe('given an OhScrap class', () => {

@@ -45,6 +55,8 @@ let ohscrap;

retrieveContentStub = sandbox.stub(OhScrap.prototype, 'retrieveContent');
retrieveContentStub = sandbox.stub(Engine.prototype, 'retrieveContent');
retrieveContentStub.withArgs(PAGE_1_URL).resolves(PAGE_1);
retrieveContentStub.resolves(PAGE_2);
retrieveContentStub.withArgs(PAGE_2_URL).resolves(PAGE_2);
retrieveContentStub.withArgs(PAGE_3_URL).resolves(PAGE_3);
retrieveContentStub.rejects(new Error('wrong page'));
});

@@ -96,2 +108,21 @@

describe('when the selector is an object', () => {
describe('and it looks for attributes', () => {
  // The expected `items` below are the `data-test` values of PAGE_1's list items.
  const selector = {
    title: 'h1',
    items: '.item@data-test',
  };

  it('should return the same object structure populated with results', async () => {
    const expected = {
      title: 'TITLE PAGE 1',
      items: ['test1', 'test2'],
    };

    const result = await ohscrap.start(PAGE_1_URL, selector);

    expect(result).to.deep.equal(expected);
  });
});
describe('and it does NOT contain deep links', () => {

@@ -158,2 +189,38 @@ const selector = {

});
describe('when invoking until()', () => {
  const selector = { items: 'ul li' };
  let emitStub;
  let totalCount;

  beforeEach(async () => {
    // Sources are derived from the zero-based counter: page1, page2, page3, ...
    const getSource = index => `http://page${index + 1}.com/`;

    // Continue only while the page just scraped produced a non-empty array of items.
    const keepGoing = (count, result) => {
      const { items } = result;
      return Promise.resolve(isArray(items) && items.length > 0);
    };

    emitStub = sandbox.stub();
    ohscrap.on('data', emitStub);
    totalCount = await ohscrap.until(getSource, selector, keepGoing);
  });

  it('should call retrieveContentStub 3 times', () => {
    expect(retrieveContentStub).to.be.calledThrice;
  });

  it('should call emitStub 3 times', () => {
    expect(emitStub).to.be.calledThrice;
  });

  it('should return the total count', () => {
    expect(totalCount).to.equal(2);
  });
});
});

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Socket · Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc