Comparing version 2.0.1 to 2.0.2
@@ -9,3 +9,2 @@ /// <reference types="node" /> | ||
options: CrawlerOptions; | ||
globalOnlyOptions: string[]; | ||
seen: any; | ||
@@ -19,3 +18,2 @@ constructor(options?: CrawlerOptions); | ||
/** | ||
* | ||
* @param rateLimiterId | ||
@@ -34,3 +32,2 @@ * @param property | ||
/** | ||
* | ||
* @param options | ||
@@ -59,3 +56,2 @@ * @returns if there is a "callback" function in the options, return the result of the callback function. \ | ||
/** | ||
* | ||
* @param options | ||
@@ -62,0 +58,0 @@ * @description Add a request to the queue. |
@@ -12,5 +12,4 @@ import { EventEmitter } from "events"; | ||
// @todo: remove seenreq dependency | ||
process.env.NODE_ENV = process.env.NODE_ENV ?? process.argv[2]; | ||
// process.env.NODE_ENV = "debug"; | ||
logOptions.minLevel = process.env.NODE_ENV === "debug" ? 0 : 3; | ||
// process.env.NODE_ENV = process.env.NODE_ENV ?? process.argv[2]; | ||
logOptions.minLevel = process.env.NODE_ENV === "debug" ? 0 : process.env.NODE_ENV === "test" ? 7 : 3; | ||
const log = new Logger(logOptions); | ||
@@ -30,3 +29,5 @@ class Crawler extends EventEmitter { | ||
this.emit("schedule", options); | ||
this._limiters.getRateLimiter(options.rateLimiterId).submit(options.priority, (done, rateLimiterId) => { | ||
this._limiters | ||
.getRateLimiter(options.rateLimiterId) | ||
.submit(options.priority, (done, rateLimiterId) => { | ||
options.release = () => { | ||
@@ -44,12 +45,14 @@ done(); | ||
} | ||
else if (typeof options.uri === "function") { | ||
options.uri((uri) => { | ||
options.url = uri; | ||
this._execute(options); | ||
}); | ||
} | ||
else { | ||
options.url = options.url ?? options.uri; | ||
delete options.uri; | ||
this._execute(options); | ||
if (typeof options.url === "function") { | ||
options.url((url) => { | ||
options.url = url; | ||
this._execute(options); | ||
}); | ||
} | ||
else { | ||
delete options.uri; | ||
this._execute(options); | ||
} | ||
} | ||
@@ -116,3 +119,3 @@ }); | ||
if (options.retries && options.retries > 0) { | ||
log.warn(`${error} when fetching ${options.url} ${options.retries ? `(${options.retries} retries left)` : ""}`); | ||
log.warn(`${error} occurred on ${options.url}. ${options.retries ? `(${options.retries} retries left)` : ""}`); | ||
setTimeout(() => { | ||
@@ -125,7 +128,9 @@ options.retries--; | ||
else { | ||
log.error(`${error} when fetching ${options.url}. Request failed.`); | ||
log.error(`${error} occurred on ${options.url}. Request failed.`); | ||
if (options.callback && typeof options.callback === "function") { | ||
return options.callback(error, { options }, options.release); | ||
} | ||
throw error; | ||
else { | ||
throw error; | ||
} | ||
} | ||
@@ -163,3 +168,3 @@ } | ||
} | ||
if (options.jQuery === true) { | ||
if (options.jQuery === true && !options.isJson) { | ||
if (response.body === "" || !this._detectHtmlOnHeaders(response.headers)) { | ||
@@ -183,3 +188,2 @@ log.warn("response body is not HTML, skip injecting. Set jQuery to false to mute this warning."); | ||
/** | ||
* | ||
* @param options | ||
@@ -204,5 +208,2 @@ * @returns if there is a "callback" function in the options, return the result of the callback function. \ | ||
setDefaults(options, this.options); | ||
this.globalOnlyOptions.forEach(globalOnlyOption => { | ||
delete options[globalOnlyOption]; | ||
}); | ||
options.skipEventRequest = isBoolean(options.skipEventRequest) ? options.skipEventRequest : true; | ||
@@ -221,3 +222,2 @@ delete options.preRequest; | ||
/** | ||
* | ||
* @param options | ||
@@ -247,5 +247,2 @@ * @description Add a request to the queue. | ||
options.headers = { ...this.options.headers, ...options.headers }; | ||
this.globalOnlyOptions.forEach(globalOnlyOption => { | ||
delete options[globalOnlyOption]; | ||
}); | ||
if (!this.options.skipDuplicates) { | ||
@@ -287,2 +284,3 @@ this._schedule(options); | ||
isJson: false, | ||
silence: false, | ||
}; | ||
@@ -293,10 +291,5 @@ this.options = { ...defaultOptions, ...options }; | ||
} | ||
this.globalOnlyOptions = [ | ||
"maxConnections", | ||
"rateLimit", | ||
"priorityLevels", | ||
"skipDuplicates", | ||
"homogeneous", | ||
"userAgents", | ||
]; | ||
if (this.options.silence) { | ||
log.settings.minLevel = 7; | ||
} | ||
this._limiters = new Cluster({ | ||
@@ -328,3 +321,2 @@ maxConnections: this.options.maxConnections, | ||
/** | ||
* | ||
* @param rateLimiterId | ||
@@ -331,0 +323,0 @@ * @param property |
import { RequestConfig, RequestOptions } from "./types/crawler.js"; | ||
export declare const globalOnlyOptions: string[]; | ||
export declare const crawlerOnlyOptions: string[]; | ||
export declare const deprecatedOptions: string[]; | ||
export declare const getCharset: (headers: Record<string, unknown>) => null | string; | ||
@@ -3,0 +6,0 @@ export declare const getValidOptions: (options: RequestConfig) => RequestOptions; |
import { HttpProxyAgent, HttpsProxyAgent } from "hpagent"; | ||
import http2Wrapper from "http2-wrapper"; | ||
import { cleanObject, getType, isValidUrl } from "./lib/utils.js"; | ||
export const globalOnlyOptions = [ | ||
"maxConnections", | ||
"priorityLevels", | ||
"rateLimit", | ||
"skipDuplicates", | ||
"homogeneous", | ||
"userAgents", | ||
"silence", | ||
]; | ||
export const crawlerOnlyOptions = [ | ||
"rateLimiterId", | ||
"forceUTF8", | ||
"jQuery", | ||
"retryInterval", | ||
"priority", | ||
"proxy", | ||
"retries", | ||
"preRequest", | ||
"callback", | ||
"release", | ||
"isJson", | ||
"referer", | ||
"rejectUnauthorized", | ||
"userParams", | ||
].concat(globalOnlyOptions); | ||
export const deprecatedOptions = [ | ||
"uri", | ||
"qs", | ||
"strictSSL", | ||
"incomingEncoding", | ||
"gzip", | ||
"jar", | ||
"jsonReviver", | ||
"jsonReplacer", | ||
"skipEventRequest", | ||
]; | ||
export const getCharset = (headers) => { | ||
@@ -36,21 +72,2 @@ let charset = null; | ||
export const alignOptions = (options) => { | ||
const crawlerOnlyOptions = [ | ||
"rateLimiterId", | ||
"forceUTF8", | ||
"incomingEncoding", | ||
"jQuery", | ||
"retryInterval", | ||
"priority", | ||
"proxy", | ||
"retries", | ||
"preRequest", | ||
"callback", | ||
"release", | ||
"userAgents", | ||
"isJson", | ||
"referer", | ||
"rejectUnauthorized", | ||
"userParams", | ||
]; | ||
const deprecatedOptions = ["uri", "qs", "strictSSL", "gzip", "jar", "jsonReviver", "jsonReplacer", "skipEventRequest"].concat(crawlerOnlyOptions); | ||
const gotOptions = { | ||
@@ -100,6 +117,6 @@ ...options, | ||
options.encoding = options.incomingEncoding; | ||
delete options["incomingEncoding"]; | ||
gotOptions.responseType = "buffer"; | ||
Object.keys(gotOptions).forEach(key => { | ||
if (deprecatedOptions.includes(key)) { | ||
const invalidOptions = crawlerOnlyOptions.concat(deprecatedOptions); | ||
invalidOptions.forEach(key => { | ||
if (key in gotOptions) { | ||
delete gotOptions[key]; | ||
@@ -106,0 +123,0 @@ } |
@@ -42,2 +42,8 @@ export type GlobalOnlyOptions = { | ||
userAgents?: string | string[]; | ||
/** | ||
* Global Only option. | ||
* @default false | ||
* @description If true, the crawler will mute all warning and error messages. The request error will still be thrown. | ||
*/ | ||
silence?: boolean; | ||
}; | ||
@@ -44,0 +50,0 @@ export type RequestOptions = { |
{ | ||
"name": "crawler", | ||
"version": "2.0.1", | ||
"version": "2.0.2", | ||
"description": "Crawler is a ready-to-use web spider that works with proxies, asynchrony, rate limit, configurable request pools, jQuery, and HTTP/2 support.", | ||
@@ -12,4 +12,4 @@ "repository": { | ||
"build": "tsc", | ||
"test": "ava", | ||
"cover": "c8 ava" | ||
"test": "NODE_ENV=test ava", | ||
"cover": "NODE_ENV=test c8 ava" | ||
}, | ||
@@ -45,3 +45,2 @@ "engines": { | ||
"seenreq": "^3.0.0", | ||
"sinon": "^18.0.0", | ||
"tslog": "^4.9.3" | ||
@@ -52,3 +51,3 @@ }, | ||
"@types/got": "^9.6.12", | ||
"@types/node": "^20.14.7", | ||
"@types/node": "^20.14.8", | ||
"ava": "^6.1.3", | ||
@@ -59,6 +58,7 @@ "c8": "^10.1.2", | ||
"nock": "^13.5.4", | ||
"sinon": "^18.0.0", | ||
"tough-cookie": "^4.1.4", | ||
"tsx": "^4.15.7", | ||
"typescript": "^5.5.2", | ||
"typescript-eslint": "8.0.0-alpha.27" | ||
"typescript": "^5.4.5", | ||
"typescript-eslint": "8.0.0-alpha.30" | ||
}, | ||
@@ -74,3 +74,2 @@ "ava": { | ||
}, | ||
"failFast": true, | ||
"verbose": true | ||
@@ -77,0 +76,0 @@ }, |
@@ -431,2 +431,7 @@ <p align="center"> | ||
#### `silence` | ||
- **Type:** `boolean` | ||
- **Default** : false | ||
- If true, the crawler will mute all warning and error messages. The request error will still be reported. | ||
#### `maxConnections` | ||
@@ -433,0 +438,0 @@ |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
106695
7
1340
679
2
13
- Removed sinon@^18.0.0
- Removed @sinonjs/commons@3.0.1 (transitive)
- Removed @sinonjs/fake-timers@11.2.2, 13.0.5 (transitive)
- Removed @sinonjs/samsam@8.0.2 (transitive)
- Removed @sinonjs/text-encoding@0.7.3 (transitive)
- Removed diff@5.2.0 (transitive)
- Removed has-flag@4.0.0 (transitive)
- Removed just-extend@6.2.0 (transitive)
- Removed lodash.get@4.4.2 (transitive)
- Removed nise@6.1.1 (transitive)
- Removed path-to-regexp@8.2.0 (transitive)
- Removed sinon@18.0.1 (transitive)
- Removed supports-color@7.2.0 (transitive)
- Removed type-detect@4.0.8, 4.1.0 (transitive)