Socket
Socket
Sign in | Demo | Install

crawler-ts

Package Overview
Dependencies
0
Maintainers
1
Versions
5
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 1.0.2 to 1.1.0

dist/cjs/generator.d.ts

40

dist/cjs/crawl.d.ts

@@ -6,32 +6,44 @@ declare type ValueOrPromise<T> = T | Promise<T>;

}
export interface PreParseProps<L, R> {
location: L;
response: R;
}
export interface AfterParseProps<L, R, P> extends PreParseProps<L, R> {
parsed: P;
}
/**
* @type {Location} The type of the locations to crawl, e.g. `URL` or `string` that represents a path.
* @type {Response} The type of the response at the location that is crawler, e.g. Cheerio object, file system `fs.Stats`.
* @type {Result} The intermediate result that can be parsed from the response and generated by the crawler.
* @type {L} The type of the locations to crawl, e.g. `URL` or `string` that represents a path.
* @type {R} The type of the response at the location that is crawler, e.g. Cheerio object, file system `fs.Stats`.
* @type {P} The intermediate parsed result that can be parsed from the response and generated by the crawler.
*/
export interface Config<Location, Response, Result> {
export interface Config<L, R, P> {
/**
* This function should return the response for the given location.
*/
requester(loc: Location): ValueOrPromise<Response | undefined>;
requester(location: L): ValueOrPromise<R | undefined>;
/**
* This function should return true if the crawler should parse the response, or false if not.
*/
shouldParse(loc: Location, response: Response): ValueOrPromise<boolean>;
shouldParse(props: PreParseProps<L, R>): ValueOrPromise<boolean>;
/**
* This function should parse the response and convert the response to the result type.
* This function should parse the response and convert the response to the parsed type.
*/
parser(loc: Location, response: Response): ValueOrPromise<Result | undefined>;
parser(props: PreParseProps<L, R>): ValueOrPromise<P | undefined>;
/**
* This function should return true if the crawler should yield the result, or false if not.
* This function should return true if the crawler should yield the parsed result, or false if not.
*/
shouldYield(result: Result): ValueOrPromise<boolean>;
shouldYield(props: AfterParseProps<L, R, P>): ValueOrPromise<boolean>;
/**
* This function should yield all the locations to follow in the given result.
* This function should yield all the locations to follow in the given parsed result.
*/
follower(result: Result): AsyncGenerator<Location>;
follower(props: AfterParseProps<L, R, P>): AsyncGenerator<L>;
/**
* This function should return true if the crawler should queue the location for crawling, or false if not.
*/
shouldQueue(loc: Location): ValueOrPromise<boolean>;
shouldQueue(props: {
location: L;
origin: L;
response: R;
parsed: P;
}): ValueOrPromise<boolean>;
/**

@@ -44,3 +56,3 @@ * The logger can be set to `console` to output debug information to the `console`.

}
export declare function crawl<Location, Response, Result>(config: Config<Location, Response, Result>): (start: Location) => AsyncGenerator<Result>;
export declare function crawl<L, R, P>(config: Config<L, R, P>): (start: L) => AsyncGenerator<AfterParseProps<L, R, P>>;
export {};

@@ -68,5 +68,5 @@ "use strict";

var requester = config.requester, shouldParse = config.shouldParse, parser = config.parser, shouldYield = config.shouldYield, follower = config.follower, shouldQueue = config.shouldQueue, logger = config.logger;
return function gen(loc) {
return function gen(location) {
return __asyncGenerator(this, arguments, function gen_1() {
var response, _a, result, _b, _c, next, e_1, e_2_1, e_3;
var response, _a, parsed, _b, _c, next, e_1, e_2_1, e_3;
var e_2, _d;

@@ -77,4 +77,4 @@ return __generator(this, function (_e) {

_e.trys.push([0, 28, , 29]);
logger === null || logger === void 0 ? void 0 : logger.info("Requesting " + loc);
return [4 /*yield*/, __await(requester(loc))];
logger === null || logger === void 0 ? void 0 : logger.info("Requesting " + location);
return [4 /*yield*/, __await(requester(location))];
case 1:

@@ -84,3 +84,3 @@ response = _e.sent();

if (!_a) return [3 /*break*/, 3];
return [4 /*yield*/, __await(shouldParse(loc, response))];
return [4 /*yield*/, __await(shouldParse({ location: location, response: response }))];
case 2:

@@ -91,14 +91,14 @@ _a = (_e.sent());

if (!_a) return [3 /*break*/, 27];
logger === null || logger === void 0 ? void 0 : logger.info("Parsing " + loc);
return [4 /*yield*/, __await(parser(loc, response))];
logger === null || logger === void 0 ? void 0 : logger.info("Parsing " + location);
return [4 /*yield*/, __await(parser({ location: location, response: response }))];
case 4:
result = _e.sent();
if (!!result) return [3 /*break*/, 6];
parsed = _e.sent();
if (!!parsed) return [3 /*break*/, 6];
return [4 /*yield*/, __await(void 0)];
case 5: return [2 /*return*/, _e.sent()];
case 6: return [4 /*yield*/, __await(shouldYield(result))];
case 6: return [4 /*yield*/, __await(shouldYield({ location: location, response: response, parsed: parsed }))];
case 7:
if (!_e.sent()) return [3 /*break*/, 10];
logger === null || logger === void 0 ? void 0 : logger.info("Yielding " + loc);
return [4 /*yield*/, __await(result)];
logger === null || logger === void 0 ? void 0 : logger.info("Yielding " + location);
return [4 /*yield*/, __await({ location: location, response: response, parsed: parsed })];
case 8: return [4 /*yield*/, _e.sent()];

@@ -110,3 +110,3 @@ case 9:

_e.trys.push([10, 21, 22, 27]);
_b = __asyncValues(follower(result));
_b = __asyncValues(follower({ location: location, response: response, parsed: parsed }));
_e.label = 11;

@@ -120,3 +120,3 @@ case 11: return [4 /*yield*/, __await(_b.next())];

_e.trys.push([13, 18, , 19]);
return [4 /*yield*/, __await(shouldQueue(next))];
return [4 /*yield*/, __await(shouldQueue({ location: next, origin: location, response: response, parsed: parsed }))];
case 14:

@@ -157,3 +157,3 @@ if (!_e.sent()) return [3 /*break*/, 17];

e_3 = _e.sent();
logger === null || logger === void 0 ? void 0 : logger.error("Cannot visit " + loc);
logger === null || logger === void 0 ? void 0 : logger.error("Cannot visit " + location);
logger === null || logger === void 0 ? void 0 : logger.error(e_3);

@@ -160,0 +160,0 @@ return [3 /*break*/, 29];

@@ -1,3 +0,5 @@

import { Logger } from "./crawl";
export declare type Filter<T> = (location: T) => boolean;
import { Logger } from './crawl';
export declare type Filter<T> = ({ location }: {
location: T;
}) => boolean;
export declare type ToString<T> = (location: T) => string;

@@ -4,0 +6,0 @@ export declare const toString: ToString<any>;

@@ -31,7 +31,8 @@ "use strict";

return function (allowedExtensions, logger) {
return function (location) {
return function (_a) {
var location = _a.location;
var converted = strFn(location);
var lastSlashIndex = Math.max(0, converted.lastIndexOf("/"));
var lastSlashIndex = Math.max(0, converted.lastIndexOf('/'));
var lastSlashPart = converted.substr(lastSlashIndex);
var lastDotIndex = lastSlashPart.lastIndexOf(".");
var lastDotIndex = lastSlashPart.lastIndexOf('.');
if (lastDotIndex !== -1) {

@@ -55,3 +56,4 @@ var extension = lastSlashPart.substr(lastDotIndex + 1);

return function (allowUrls, logger) {
return function (location) {
return function (_a) {
var location = _a.location;
for (var _i = 0, allowUrls_1 = allowUrls; _i < allowUrls_1.length; _i++) {

@@ -76,3 +78,4 @@ var allowUrl = allowUrls_1[_i];

return function (ignoredUrls, logger) {
return function (location) {
return function (_a) {
var location = _a.location;
for (var _i = 0, ignoredUrls_1 = ignoredUrls; _i < ignoredUrls_1.length; _i++) {

@@ -98,3 +101,4 @@ var ignoredUrl = ignoredUrls_1[_i];

var seen = [];
return function (location) {
return function (_a) {
var location = _a.location;
var key = strFn(location);

@@ -115,3 +119,4 @@ if (!key || seen.includes(key)) {

var shouldFollowCache = {};
return function cachedShouldFollow(location) {
return function cachedShouldFollow(_a) {
var location = _a.location;
var string = strFn(location);

@@ -121,3 +126,3 @@ if (shouldFollowCache.hasOwnProperty(string)) {

}
var shouldFollow = fn(location);
var shouldFollow = fn({ location: location });
shouldFollowCache[string] = shouldFollow;

@@ -124,0 +129,0 @@ return shouldFollow;

@@ -1,2 +0,2 @@

export * from "./crawl";
export * from "./filter";
export * from './crawl';
export * from './filter';

@@ -6,32 +6,44 @@ declare type ValueOrPromise<T> = T | Promise<T>;

}
export interface PreParseProps<L, R> {
location: L;
response: R;
}
export interface AfterParseProps<L, R, P> extends PreParseProps<L, R> {
parsed: P;
}
/**
* @type {Location} The type of the locations to crawl, e.g. `URL` or `string` that represents a path.
* @type {Response} The type of the response at the location that is crawler, e.g. Cheerio object, file system `fs.Stats`.
* @type {Result} The intermediate result that can be parsed from the response and generated by the crawler.
* @type {L} The type of the locations to crawl, e.g. `URL` or `string` that represents a path.
* @type {R} The type of the response at the location that is crawler, e.g. Cheerio object, file system `fs.Stats`.
* @type {P} The intermediate parsed result that can be parsed from the response and generated by the crawler.
*/
export interface Config<Location, Response, Result> {
export interface Config<L, R, P> {
/**
* This function should return the response for the given location.
*/
requester(loc: Location): ValueOrPromise<Response | undefined>;
requester(location: L): ValueOrPromise<R | undefined>;
/**
* This function should return true if the crawler should parse the response, or false if not.
*/
shouldParse(loc: Location, response: Response): ValueOrPromise<boolean>;
shouldParse(props: PreParseProps<L, R>): ValueOrPromise<boolean>;
/**
* This function should parse the response and convert the response to the result type.
* This function should parse the response and convert the response to the parsed type.
*/
parser(loc: Location, response: Response): ValueOrPromise<Result | undefined>;
parser(props: PreParseProps<L, R>): ValueOrPromise<P | undefined>;
/**
* This function should return true if the crawler should yield the result, or false if not.
* This function should return true if the crawler should yield the parsed result, or false if not.
*/
shouldYield(result: Result): ValueOrPromise<boolean>;
shouldYield(props: AfterParseProps<L, R, P>): ValueOrPromise<boolean>;
/**
* This function should yield all the locations to follow in the given result.
* This function should yield all the locations to follow in the given parsed result.
*/
follower(result: Result): AsyncGenerator<Location>;
follower(props: AfterParseProps<L, R, P>): AsyncGenerator<L>;
/**
* This function should return true if the crawler should queue the location for crawling, or false if not.
*/
shouldQueue(loc: Location): ValueOrPromise<boolean>;
shouldQueue(props: {
location: L;
origin: L;
response: R;
parsed: P;
}): ValueOrPromise<boolean>;
/**

@@ -44,3 +56,3 @@ * The logger can be set to `console` to output debug information to the `console`.

}
export declare function crawl<Location, Response, Result>(config: Config<Location, Response, Result>): (start: Location) => AsyncGenerator<Result>;
export declare function crawl<L, R, P>(config: Config<L, R, P>): (start: L) => AsyncGenerator<AfterParseProps<L, R, P>>;
export {};

@@ -26,24 +26,24 @@ var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); }

export function crawl(config) {
const { requester, shouldParse, parser, shouldYield, follower, shouldQueue, logger, } = config;
return function gen(loc) {
const { requester, shouldParse, parser, shouldYield, follower, shouldQueue, logger } = config;
return function gen(location) {
return __asyncGenerator(this, arguments, function* gen_1() {
var e_1, _a;
try {
logger === null || logger === void 0 ? void 0 : logger.info(`Requesting ${loc}`);
const response = yield __await(requester(loc));
if (response && (yield __await(shouldParse(loc, response)))) {
logger === null || logger === void 0 ? void 0 : logger.info(`Parsing ${loc}`);
const result = yield __await(parser(loc, response));
if (!result) {
logger === null || logger === void 0 ? void 0 : logger.info(`Requesting ${location}`);
const response = yield __await(requester(location));
if (response && (yield __await(shouldParse({ location, response })))) {
logger === null || logger === void 0 ? void 0 : logger.info(`Parsing ${location}`);
const parsed = yield __await(parser({ location, response }));
if (!parsed) {
return yield __await(void 0);
}
if (yield __await(shouldYield(result))) {
logger === null || logger === void 0 ? void 0 : logger.info(`Yielding ${loc}`);
yield yield __await(result);
if (yield __await(shouldYield({ location, response, parsed }))) {
logger === null || logger === void 0 ? void 0 : logger.info(`Yielding ${location}`);
yield yield __await({ location, response, parsed });
}
try {
for (var _b = __asyncValues(follower(result)), _c; _c = yield __await(_b.next()), !_c.done;) {
for (var _b = __asyncValues(follower({ location, response, parsed })), _c; _c = yield __await(_b.next()), !_c.done;) {
const next = _c.value;
try {
if (yield __await(shouldQueue(next))) {
if (yield __await(shouldQueue({ location: next, origin: location, response, parsed }))) {
logger === null || logger === void 0 ? void 0 : logger.info(`Queueing ${next}`);

@@ -69,3 +69,3 @@ yield __await(yield* __asyncDelegator(__asyncValues(gen(next))));

catch (e) {
logger === null || logger === void 0 ? void 0 : logger.error(`Cannot visit ${loc}`);
logger === null || logger === void 0 ? void 0 : logger.error(`Cannot visit ${location}`);
logger === null || logger === void 0 ? void 0 : logger.error(e);

@@ -72,0 +72,0 @@ }

@@ -1,3 +0,5 @@

import { Logger } from "./crawl";
export declare type Filter<T> = (location: T) => boolean;
import { Logger } from './crawl';
export declare type Filter<T> = ({ location }: {
location: T;
}) => boolean;
export declare type ToString<T> = (location: T) => string;

@@ -4,0 +6,0 @@ export declare const toString: ToString<any>;

@@ -19,7 +19,7 @@ export const toString = (value) => `${value}`;

export const allowExtensions = (strFn = toString) => (allowedExtensions, logger) => {
return (location) => {
return ({ location }) => {
const converted = strFn(location);
const lastSlashIndex = Math.max(0, converted.lastIndexOf("/"));
const lastSlashIndex = Math.max(0, converted.lastIndexOf('/'));
const lastSlashPart = converted.substr(lastSlashIndex);
const lastDotIndex = lastSlashPart.lastIndexOf(".");
const lastDotIndex = lastSlashPart.lastIndexOf('.');
if (lastDotIndex !== -1) {

@@ -39,3 +39,3 @@ const extension = lastSlashPart.substr(lastDotIndex + 1);

export const allowRegex = (strFn = toString) => (allowUrls, logger) => {
return (location) => {
return ({ location }) => {
for (const allowUrl of allowUrls) {

@@ -55,3 +55,3 @@ const converted = strFn(location);

export const ignoreRegex = (strFn = toString) => (ignoredUrls, logger) => {
return (location) => {
return ({ location }) => {
for (const ignoredUrl of ignoredUrls) {

@@ -72,3 +72,3 @@ const converted = strFn(location);

const seen = [];
return (location) => {
return ({ location }) => {
const key = strFn(location);

@@ -85,3 +85,3 @@ if (!key || seen.includes(key)) {

const shouldFollowCache = {};
return function cachedShouldFollow(location) {
return function cachedShouldFollow({ location }) {
const string = strFn(location);

@@ -91,3 +91,3 @@ if (shouldFollowCache.hasOwnProperty(string)) {

}
const shouldFollow = fn(location);
const shouldFollow = fn({ location });
shouldFollowCache[string] = shouldFollow;

@@ -94,0 +94,0 @@ return shouldFollow;

@@ -1,2 +0,2 @@

export * from "./crawl";
export * from "./filter";
export * from './crawl';
export * from './filter';

@@ -1,2 +0,2 @@

export * from "./crawl";
export * from "./filter";
export * from './crawl';
export * from './filter';
{
"name": "crawler-ts",
"version": "1.0.2",
"version": "1.1.0",
"author": {

@@ -5,0 +5,0 @@ "name": "Gillis Van Ginderacter",

@@ -8,32 +8,41 @@ type ValueOrPromise<T> = T | Promise<T>;

export interface PreParseProps<L, R> {
location: L;
response: R;
}
export interface AfterParseProps<L, R, P> extends PreParseProps<L, R> {
parsed: P;
}
/**
* @type {Location} The type of the locations to crawl, e.g. `URL` or `string` that represents a path.
* @type {Response} The type of the response at the location that is crawler, e.g. Cheerio object, file system `fs.Stats`.
* @type {Result} The intermediate result that can be parsed from the response and generated by the crawler.
* @type {L} The type of the locations to crawl, e.g. `URL` or `string` that represents a path.
* @type {R} The type of the response at the location that is crawler, e.g. Cheerio object, file system `fs.Stats`.
* @type {P} The intermediate parsed result that can be parsed from the response and generated by the crawler.
*/
export interface Config<Location, Response, Result> {
export interface Config<L, R, P> {
/**
* This function should return the response for the given location.
*/
requester(loc: Location): ValueOrPromise<Response | undefined>;
requester(location: L): ValueOrPromise<R | undefined>;
/**
* This function should return true if the crawler should parse the response, or false if not.
*/
shouldParse(loc: Location, response: Response): ValueOrPromise<boolean>;
shouldParse(props: PreParseProps<L, R>): ValueOrPromise<boolean>;
/**
* This function should parse the response and convert the response to the result type.
* This function should parse the response and convert the response to the parsed type.
*/
parser(loc: Location, response: Response): ValueOrPromise<Result | undefined>;
parser(props: PreParseProps<L, R>): ValueOrPromise<P | undefined>;
/**
* This function should return true if the crawler should yield the result, or false if not.
* This function should return true if the crawler should yield the parsed result, or false if not.
*/
shouldYield(result: Result): ValueOrPromise<boolean>;
shouldYield(props: AfterParseProps<L, R, P>): ValueOrPromise<boolean>;
/**
* This function should yield all the locations to follow in the given result.
* This function should yield all the locations to follow in the given parsed result.
*/
follower(result: Result): AsyncGenerator<Location>;
follower(props: AfterParseProps<L, R, P>): AsyncGenerator<L>;
/**
* This function should return true if the crawler should queue the location for crawling, or false if not.
*/
shouldQueue(loc: Location): ValueOrPromise<boolean>;
shouldQueue(props: { location: L; origin: L; response: R; parsed: P }): ValueOrPromise<boolean>;
/**

@@ -47,34 +56,25 @@ * The logger can be set to `console` to output debug information to the `console`.

export function crawl<Location, Response, Result>(
config: Config<Location, Response, Result>
): (start: Location) => AsyncGenerator<Result> {
const {
requester,
shouldParse,
parser,
shouldYield,
follower,
shouldQueue,
logger,
} = config;
export function crawl<L, R, P>(config: Config<L, R, P>): (start: L) => AsyncGenerator<AfterParseProps<L, R, P>> {
const { requester, shouldParse, parser, shouldYield, follower, shouldQueue, logger } = config;
return async function* gen(loc: Location): AsyncGenerator<Result> {
return async function* gen(location: L): AsyncGenerator<AfterParseProps<L, R, P>> {
try {
logger?.info(`Requesting ${loc}`);
const response = await requester(loc);
if (response && (await shouldParse(loc, response))) {
logger?.info(`Parsing ${loc}`);
const result = await parser(loc, response);
if (!result) {
logger?.info(`Requesting ${location}`);
const response = await requester(location);
if (response && (await shouldParse({ location, response }))) {
logger?.info(`Parsing ${location}`);
const parsed = await parser({ location, response });
if (!parsed) {
return;
}
if (await shouldYield(result)) {
logger?.info(`Yielding ${loc}`);
yield result;
if (await shouldYield({ location, response, parsed })) {
logger?.info(`Yielding ${location}`);
yield { location, response, parsed };
}
for await (const next of follower(result)) {
for await (const next of follower({ location, response, parsed })) {
try {
if (await shouldQueue(next)) {
if (await shouldQueue({ location: next, origin: location, response, parsed })) {
logger?.info(`Queueing ${next}`);

@@ -90,3 +90,3 @@ yield* gen(next);

} catch (e) {
logger?.error(`Cannot visit ${loc}`);
logger?.error(`Cannot visit ${location}`);
logger?.error(e);

@@ -93,0 +93,0 @@ }

@@ -1,4 +0,4 @@

import { Logger } from "./crawl";
import { Logger } from './crawl';
export type Filter<T> = (location: T) => boolean;
export type Filter<T> = ({ location }: { location: T }) => boolean;
export type ToString<T> = (location: T) => string;

@@ -9,5 +9,3 @@

export function chain<A>(...fns: Array<(a: A) => boolean>): (a: A) => boolean;
export function chain<A, B>(
...fns: Array<(a: A, b: B) => boolean>
): (a: A, b: B) => boolean;
export function chain<A, B>(...fns: Array<(a: A, b: B) => boolean>): (a: A, b: B) => boolean;

@@ -33,9 +31,9 @@ /**

allowedExtensions: string[],
logger?: Logger
logger?: Logger,
): Filter<T> => {
return (location: T): boolean => {
return ({ location }): boolean => {
const converted = strFn(location);
const lastSlashIndex = Math.max(0, converted.lastIndexOf("/"));
const lastSlashIndex = Math.max(0, converted.lastIndexOf('/'));
const lastSlashPart = converted.substr(lastSlashIndex);
const lastDotIndex = lastSlashPart.lastIndexOf(".");
const lastDotIndex = lastSlashPart.lastIndexOf('.');
if (lastDotIndex !== -1) {

@@ -55,7 +53,4 @@ const extension = lastSlashPart.substr(lastDotIndex + 1);

*/
export const allowRegex = <T>(strFn: ToString<T> = toString) => (
allowUrls: RegExp[],
logger?: Logger
): Filter<T> => {
return (location: T): boolean => {
export const allowRegex = <T>(strFn: ToString<T> = toString) => (allowUrls: RegExp[], logger?: Logger): Filter<T> => {
return ({ location }: { location: T }): boolean => {
for (const allowUrl of allowUrls) {

@@ -77,5 +72,5 @@ const converted = strFn(location);

ignoredUrls: RegExp[],
logger?: Logger
logger?: Logger,
): Filter<T> => {
return (location: T): boolean => {
return ({ location }: { location: T }): boolean => {
for (const ignoredUrl of ignoredUrls) {

@@ -95,7 +90,5 @@ const converted = strFn(location);

*/
export const ignoreDoubles = <T>(strFn: ToString<T> = toString) => (
logger?: Logger
): Filter<T> => {
export const ignoreDoubles = <T>(strFn: ToString<T> = toString) => (logger?: Logger): Filter<T> => {
const seen: string[] = [];
return (location: T): boolean => {
return ({ location }: { location: T }): boolean => {
const key = strFn(location);

@@ -111,7 +104,5 @@ if (!key || seen.includes(key)) {

export const cache = <T>(strFn: ToString<T> = toString) => (
fn: Filter<T>
): Filter<T> => {
export const cache = <T>(strFn: ToString<T> = toString) => (fn: Filter<T>): Filter<T> => {
const shouldFollowCache: { [key: string]: boolean } = {};
return function cachedShouldFollow(location: T): boolean {
return function cachedShouldFollow({ location }: { location: T }): boolean {
const string = strFn(location);

@@ -121,3 +112,3 @@ if (shouldFollowCache.hasOwnProperty(string)) {

}
const shouldFollow = fn(location);
const shouldFollow = fn({ location });
shouldFollowCache[string] = shouldFollow;

@@ -124,0 +115,0 @@ return shouldFollow;

@@ -1,2 +0,2 @@

export * from "./crawl";
export * from "./filter";
export * from './crawl';
export * from './filter';
Socket | Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc