Socket
Socket
Sign in · Demo · Install

oh-scrap

Package Overview
Dependencies
Maintainers
1
Versions
9
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

oh-scrap - npm Package Compare versions

Comparing version 1.2.0 to 1.3.0

src/crawl.js

4

package.json
{
"name": "oh-scrap",
"version": "1.2.0",
"version": "1.3.0",
"description": "Node Module skeleton",

@@ -32,4 +32,4 @@ "main": "lib/index.js",

"dependencies": {
"async": "^2.6.0",
"babel-polyfill": "^6.26.0",
"bluebird": "^3.5.1",
"cheerio": "^1.0.0-rc.2",

@@ -36,0 +36,0 @@ "debug": "^3.1.0",

@@ -1,25 +0,16 @@

/* eslint-disable class-methods-use-this */
import 'babel-polyfill';
import Promise from 'bluebird';
import cheerio from 'cheerio';
import { forever } from 'async';
import Debug from 'debug';
import EventEmitter from 'events';
import isUrl from 'is-url';
import {
isArray,
isPlainObject,
isString,
uniq,
} from 'lodash';
import os from 'os';
import isValidPath from 'is-valid-path';
import URL from 'url';
import Engine from './engine';
import { crawl } from './crawl';
const debug = Debug('oh-scrap');
class OhScrap extends EventEmitter {
constructor(concurrency = os.cpus().length, strict = false) {
constructor(concurrency = os.cpus().length) {
super();

@@ -33,3 +24,2 @@

};
this.strict = strict;
}

@@ -59,161 +49,3 @@

isUrl(url) {
return isString(url) && isUrl(url);
}
isRelativeUrl(url) {
return isString(url) && isValidPath(url);
}
getDataFromNode(node, attr = false) {
if (attr) {
return attr === 'HTML' ? node.parent().html() : node.attr(attr);
}
return node.text();
}
loadContent(content = '', selector = '') {
const parts = selector.split('@');
const xpath = parts[0];
const attr = parts[1];
const $ = cheerio.load(content);
return {
attr,
element: $(xpath),
xpath,
$,
};
}
getSelectorMatches({ $, attr, element }) {
const rawMatches = element
.map((ind, node) => this.getDataFromNode($(node), attr))
.get();
const matches = uniq(rawMatches).filter(item => item.length > 0);
debug(`filtered matches: ${rawMatches.length !== matches.length}`);
return matches.length > 1 ? matches : matches[0];
}
handleSelectorString(content = '', selector = '') {
return new Promise((resolve, reject) => {
const {
attr,
element,
$,
} = this.loadContent(content, selector);
const count = element.length;
debug(`handleSelectorString "${selector}" => ${count}`);
if (count === 1) {
return resolve(this.getDataFromNode(element, attr));
}
if (count > 1) {
return resolve(this.getSelectorMatches({ $, attr, element }));
}
return this.strict ? reject(new Error('no element found')) : resolve(false);
});
}
handleSelectorObject(content = '', selector) {
return new Promise(async (resolve, reject) => {
const newObject = {};
try {
/* eslint-disable no-await-in-loop, no-restricted-syntax */
for (const [key, value] of Object.entries(selector)) {
newObject[key] = await this.handleSelector(content, value);
}
/* eslint-enable no-await-in-loop, no-restricted-syntax */
} catch (e) {
reject(e);
return;
}
resolve(newObject);
});
}
async handleSelectorArray(content = '', [sourceSelector, targetSelector]) {
debug(`handleSelectorArray: ${sourceSelector} => ${targetSelector}`);
const result = await this.handleSelectorString(content, sourceSelector);
if (this.isUrl(result) || this.isRelativeUrl(result)) {
debug(`handleSelectorArray: result => url => ${result}`);
return this.crawl(result, targetSelector);
}
if (isArray(result)) {
debug(`handleSelectorArray: result => array => ${result.length}`, targetSelector);
return Promise.map(result, source => this.crawl(source, targetSelector), {
concurrency: this.concurrency,
});
}
if (this.strict) {
return Promise.reject(new Error(`no result found: ${sourceSelector}`));
}
return Promise.resolve(false);
}
handleSelector(content = '', selector) {
debug('handleSelector', selector);
return new Promise((resolve, reject) => {
if (isString(selector)) {
resolve(this.handleSelectorString(content, selector));
} else if (isPlainObject(selector)) {
resolve(this.handleSelectorObject(content, selector));
} else if (isArray(selector)) {
resolve(this.handleSelectorArray(content, selector));
} else {
reject(new Error('selector type not valid'));
}
});
}
setBaseUrl(source) {
const { hostname, protocol } = URL.parse(source);
this.baseUrl = `${protocol}//${hostname}`;
debug(`setBaseUrl: ${this.baseUrl}`);
return this.baseUrl;
}
async crawl(source, selector) {
let content = source;
if (this.isUrl(source)) {
debug(`crawl absolute link: ${source}`);
this.setBaseUrl(source);
content = await this.engine.retrieveContent(source);
} else if (this.isRelativeUrl(source)) {
const link = URL.resolve(this.baseUrl, source);
debug(`crawl relative link: ${link}`);
content = await this.engine.retrieveContent(link);
}
return this.handleSelector(content, selector);
}
async until(getSource, selector, keepGoing = () => Promise.resolve(false)) {
async until(getSource, selector, keepGoing = () => false) {
let count = 0;

@@ -225,27 +57,34 @@

/* eslint-disable no-await-in-loop, no-constant-condition */
while (true) {
const source = getSource(count);
let result;
return new Promise((resolve) => {
forever((next) => {
const source = getSource(count);
try {
result = await this.crawl(source, selector);
crawl({
engine: this.engine,
selector,
source,
}, async (err, result) => {
if (err) {
next(err);
return;
}
this.emit('data', { count, result, source });
this.emit('data', { count, result, source });
if (!await keepGoing(count, result)) {
break;
}
const flag = await keepGoing({ count, result, source });
count += 1;
} catch (e) {
this.emit('error', e);
break;
}
}
/* eslint-enable no-await-in-loop */
if (flag) {
count += 1;
await this.teardown();
next();
} else {
next(count);
}
});
}, async () => {
await this.teardown();
return count;
resolve(count);
});
});
}

@@ -258,8 +97,19 @@

return new Promise((resolve, reject) => {
crawl({
concurrency: this.concurrency,
engine: this.engine,
selector,
source,
}, async (err, res) => {
await this.teardown();
const result = await this.crawl(source, selector);
if (err) {
reject(err);
return;
}
await this.teardown();
return result;
resolve(res);
});
});
}

@@ -266,0 +116,0 @@ }

@@ -69,8 +69,7 @@ import os from 'os';

describe('when passing concurrency and strict to the constructor', () => {
describe('when passing concurrency to the constructor', () => {
const CONCURRENCY = 2;
const STRICT = true;
beforeEach(() => {
ohscrap = new OhScrap(CONCURRENCY, STRICT);
ohscrap = new OhScrap(CONCURRENCY);
});

@@ -80,13 +79,10 @@

expect(ohscrap.concurrency).to.equal(CONCURRENCY);
expect(ohscrap.strict).to.equal(STRICT);
});
});
describe('when NOT passing concurrency and strict to the constructor', () => {
describe('when NOT passing concurrency to the constructor', () => {
const CONCURRENCY = os.cpus().length;
const STRICT = false;
it('should use the default settings', () => {
expect(ohscrap.concurrency).to.equal(CONCURRENCY);
expect(ohscrap.strict).to.equal(STRICT);
});

@@ -197,3 +193,3 @@ });

const getSource = count => `http://page${count + 1}.com/`;
const keepGoing = (count, result) => {
const keepGoing = ({ result }) => {
const flag = isArray(result.items) && result.items.length > 0;

@@ -215,4 +211,4 @@

it('should call emitStub 3 times', () => {
expect(emitStub).to.be.calledThrice;
it('should call emitStub 2 times', () => {
expect(emitStub).to.be.calledTwice;
});

@@ -219,0 +215,0 @@

Sorry, the diff of this file is not supported yet

Socket · Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc