Comparing version
@@ -1,3 +0,841 @@ | ||
import Crawler from "./crawler.js"; | ||
export default Crawler; | ||
//# sourceMappingURL=index.js.map | ||
// src/crawler.ts | ||
import { EventEmitter } from "events"; | ||
// src/lib/queue.ts | ||
/**
 * Doubly linked list cell that carries a single queued value.
 * Both links start out as `null` until the node is spliced into a list.
 */
class Node {
    /**
     * @param {*} value Payload stored in this node.
     */
    constructor(value) {
        Object.assign(this, { value, next: null, prev: null });
    }
}
/**
 * Sentinel placed in front of the first real node of a list.
 * It only carries a forward link.
 */
class DummyHeadNode {
    constructor() {
        // Link to the first node; filled in by the owning list.
        this.next = null;
    }
}
/**
 * Sentinel placed behind the last real node of a list.
 * It only carries a backward link.
 */
class DummyTailNode {
    constructor() {
        // Link to the last node; filled in by the owning list.
        this.prev = null;
    }
}
/**
 * FIFO queue stored as a doubly linked list bracketed by a head and a tail
 * sentinel, so inserting and removing never special-case the empty list.
 */
class Queue {
    constructor() {
        const head = new DummyHeadNode();
        const tail = new DummyTailNode();
        head.next = tail;
        tail.prev = head;
        this._dummyHead = head;
        this._dummyTail = tail;
        this._length = 0;
    }

    /**
     * Appends a value at the back of the queue.
     * @param {*} value Value to store.
     * @return {number} Length of the queue after the insertion.
     */
    enqueue(value) {
        const tail = this._dummyTail;
        const last = tail.prev;
        const fresh = new Node(value);
        last.next = fresh;
        fresh.prev = last;
        fresh.next = tail;
        tail.prev = fresh;
        return ++this._length;
    }

    /**
     * Detaches the front element and hands back its value.
     * @return {*} The removed value, or `undefined` when the queue is empty.
     */
    dequeue() {
        if (this.isEmpty()) {
            return void 0;
        }
        const head = this._dummyHead;
        const removed = head.next;
        const successor = removed.next;
        head.next = successor;
        successor.prev = head;
        // Only the forward link of the detached node is cleared.
        removed.next = null;
        this._length -= 1;
        return removed.value;
    }

    /**
     * @return {boolean} `true` when no elements are stored.
     */
    isEmpty() {
        return this._length === 0;
    }

    /**
     * Peeks at the front element without removing it.
     * @return {*} The front value, or `undefined` when the queue is empty.
     */
    front() {
        return this.isEmpty() ? void 0 : this._dummyHead.next.value;
    }

    /**
     * Peeks at the back element without removing it.
     * @return {*} The back value, or `undefined` when the queue is empty.
     */
    back() {
        return this.isEmpty() ? void 0 : this._dummyTail.prev.value;
    }

    /**
     * @return {number} Number of stored elements.
     */
    get length() {
        return this._length;
    }

    /**
     * Alias of `length`.
     * @return {number} Number of stored elements.
     */
    get size() {
        return this._length;
    }
}
const queue_default = Queue;
// src/lib/multiPriorityQueue.ts | ||
/**
 * Queue with a fixed number of priority levels. Level 0 is served first;
 * within one level elements leave in insertion order.
 */
class multiPriorityQueue {
    /**
     * @param {number} priorities Number of priority levels; coerced to an
     *     integer and raised to at least 1.
     */
    constructor(priorities) {
        this._elements = [];
        const levels = Math.max(+priorities | 0, 1);
        for (let i = 0; i < levels; i += 1) {
            this._elements.push(new queue_default());
        }
        this._size = 0;
    }

    /**
     * @return {number} Total number of queued elements across all levels.
     */
    size() {
        if (this._size) return this._size;
        // The counter is zero: recount from the per-level queues.
        let totalSize = 0;
        for (const queue of this._elements) {
            totalSize += queue.length;
        }
        return this._size = totalSize;
    }

    /**
     * Adds a value at the given priority level.
     * @param {*} value Value to queue.
     * @param {number} [priority] Level index; falsy values mean level 0.
     * @throws {RangeError} When the level is outside `0 .. levels - 1`.
     */
    enqueue(value, priority) {
        const level = priority ? +priority | 0 : 0;
        const maxLevel = this._elements.length - 1;
        if (level < 0 || level > maxLevel) {
            // Report the offending level itself, not a clamped replacement.
            throw new RangeError(`Invalid priority: ${level} must be between 0 and ${maxLevel}`);
        }
        this._elements[level].enqueue(value);
        this._size++;
    }

    /**
     * Removes and returns the oldest element of the first non-empty level.
     * @return {*} The dequeued value.
     * @throws {ReferenceError} When every level is empty.
     */
    dequeue() {
        for (const queue of this._elements) {
            if (queue.length > 0) {
                this._size--;
                return queue.dequeue();
            }
        }
        throw new ReferenceError("multiPriorityQueue is empty");
    }
}
const multiPriorityQueue_default = multiPriorityQueue;
// src/lib/utils.ts | ||
/**
 * Returns the lower-cased `Object.prototype.toString` tag of a value,
 * e.g. "number", "array", "null", "object".
 * Uses the locale-independent `toLowerCase()` so tags do not vary with the
 * host locale.
 * @param {*} value Any value.
 * @return {string} Lower-cased type tag.
 */
const getType = (value) => Object.prototype.toString.call(value).slice(8, -1).toLowerCase();

/**
 * @param {*} value Any value.
 * @return {boolean} `true` for number values (primitive or boxed) other than NaN.
 */
const isNumber = (value) => getType(value) === "number" && !isNaN(value);

/**
 * @param {*} value Any value.
 * @return {boolean} `true` when the type tag is exactly "function".
 */
const isFunction = (value) => getType(value) === "function";

/**
 * @param {*} value Any value.
 * @return {boolean} `true` for boolean values (primitive or boxed).
 */
const isBoolean = (value) => getType(value) === "boolean";
/**
 * Fills in, in place, every key of `target` whose value is `undefined`
 * with the value found under the same key in `source`.
 * Enumerable inherited keys of `source` are visited as well.
 * @param {object} target Object to complete (mutated).
 * @param {object} source Object providing fallback values.
 * @return {object} The same `target` object.
 */
function setDefaults(target, source) {
    for (const key in source) {
        if (target[key] !== void 0) continue;
        target[key] = source[key];
    }
    return target;
}
/**
 * Tells whether the WHATWG `URL` constructor accepts the given input.
 * @param {*} url Candidate URL.
 * @return {boolean} `true` when `new URL(url)` does not throw.
 */
function isValidUrl(url) {
    let accepted = true;
    try {
        new URL(url);
    } catch (_e) {
        accepted = false;
    }
    return accepted;
}
/**
 * Flattens arbitrarily nested arrays into a single new array, keeping the
 * elements in depth-first order.
 *
 * Elements are pushed one at a time instead of being spread into `push()`,
 * so large nested arrays no longer hit the engine's argument-count limit.
 * @param {Array} array Possibly nested array.
 * @return {Array} New flat array.
 */
function flattenDeep(array) {
    const result = [];
    const visit = (items) => {
        items.forEach((element) => {
            if (Array.isArray(element)) {
                visit(element);
            } else {
                result.push(element);
            }
        });
    };
    visit(array);
    return result;
}
/**
 * Recursively removes, in place, every own enumerable key whose value is
 * `undefined` or `null`. Values whose type tag is "object" are cleaned first.
 * @param {object} obj Object to clean (mutated).
 * @return {object} The same `obj` instance.
 */
function cleanObject(obj) {
    for (const key of Object.keys(obj)) {
        if (getType(obj[key]) === "object") {
            obj[key] = cleanObject(obj[key]);
        }
        const value = obj[key];
        if (value === void 0 || value === null) {
            delete obj[key];
        }
    }
    return obj;
}
/**
 * Builds a new object holding the same values under lower-cased keys.
 * When two keys collide after lower-casing, the later one wins.
 * @param {object} obj Source object (not modified).
 * @return {object} New object with lower-cased keys.
 */
function lowerObjectKeys(obj) {
    return Object.keys(obj).reduce((lowered, key) => {
        lowered[key.toLowerCase()] = obj[key];
        return lowered;
    }, {});
}
// src/rateLimiter/rateLimiter.ts | ||
/**
 * Runs queued tasks while enforcing a maximum number of concurrently running
 * tasks and a minimum spacing (`rateLimit`, in milliseconds) between task
 * starts. Tasks wait in a multi-level priority queue; when a `cluster` is
 * supplied and the own queue is empty, tasks are taken from the cluster.
 */
class RateLimiter {
    /**
     * @param {object} config
     * @param {number} config.maxConnections Maximum number of running tasks.
     * @param {number} config.rateLimit Milliseconds between two task starts.
     * @param {number} [config.priorityLevels=1] Number of priority levels.
     * @param {number|string} [config.defaultPriority=0] Priority used when a
     *     task does not carry an integer priority; clamped to the valid range.
     * @param {object} [config.cluster] Object exposing `hasWaitingTasks()` and
     *     `dequeue()` used as a fallback task source.
     * @throws {Error} When one of the three numeric settings is not an integer.
     */
    constructor({ maxConnections, rateLimit, priorityLevels = 1, defaultPriority = 0, cluster }) {
        if (!Number.isInteger(maxConnections) || !Number.isInteger(rateLimit) || !Number.isInteger(priorityLevels)) {
            throw new Error("maxConnections, rateLimit and priorityLevels must be positive integers");
        }
        this.maxConnections = maxConnections;
        this.priorityLevels = priorityLevels;
        // Accept numeric strings such as "3"; every other non-integer input
        // falls back to the middle priority level.
        const requested = typeof defaultPriority === "string" && defaultPriority.trim() !== ""
            ? Number(defaultPriority)
            : defaultPriority;
        this.defaultPriority = Number.isInteger(requested)
            ? Math.min(Math.max(requested, 0), priorityLevels - 1)
            : Math.floor(priorityLevels / 2);
        this.nextRequestTime = Date.now();
        this._waitingTasks = new multiPriorityQueue_default(priorityLevels);
        this._cluster = cluster;
        this.rateLimit = rateLimit;
        this.runningSize = 0;
    }

    /**
     * @return {number} Number of tasks waiting in this limiter's own queue.
     */
    get waitingSize() {
        return this._waitingTasks.size();
    }

    /**
     * @return {boolean} `true` when this limiter, or its cluster if any,
     *     reports waiting tasks.
     */
    hasWaitingTasks() {
        return this.waitingSize > 0 || this._cluster !== void 0 && this._cluster.hasWaitingTasks();
    }

    /**
     * @param {number|string} id Identifier stored on the limiter.
     */
    setId(id) {
        this.id = id;
    }

    /**
     * Changes the spacing between task starts. A positive value also forces
     * `maxConnections` to 1.
     * @param {number} rateLimit Non-negative integer, in milliseconds.
     * @throws {Error} When the value is not a non-negative integer.
     */
    setRateLimit(rateLimit) {
        if (!Number.isInteger(rateLimit) || rateLimit < 0) {
            throw new Error("rateLimit must be non negative integers");
        }
        this.rateLimit = rateLimit;
        if (this.rateLimit > 0) this.maxConnections = 1;
    }

    /**
     * Queues a task and tries to start one.
     * @param {number|{priority?: number}} [options] Priority, or an object
     *     carrying it. Missing or non-integer priorities use the default.
     * @param {function(function(): void, *): void} task Called with a `done`
     *     callback and the id of the limiter the task was taken from.
     */
    submit(options, task) {
        const requested = typeof options === "number" ? options : options?.priority;
        const base = Number.isInteger(requested) ? requested : this.defaultPriority;
        // Clamp on both sides so an out-of-range priority never makes the queue throw.
        const priority = Math.min(Math.max(base, 0), this.priorityLevels - 1);
        this._waitingTasks.enqueue(task, priority);
        this._schedule();
    }

    /**
     * Starts at most one waiting task if a connection slot is free. The task
     * is started through `setTimeout` after the delay needed to honour
     * `rateLimit`; its `done` callback frees the slot and schedules again.
     */
    _schedule() {
        if (this.runningSize >= this.maxConnections || !this.hasWaitingTasks()) {
            return;
        }
        const job = this.dequeue();
        // The cluster may report nothing to hand out: leave the slot free.
        if (job === void 0) {
            return;
        }
        const { next, rateLimiterId } = job;
        ++this.runningSize;
        const delay = Math.max(this.nextRequestTime - Date.now(), 0);
        this.nextRequestTime = Date.now() + delay + this.rateLimit;
        setTimeout(() => {
            const done = () => {
                --this.runningSize;
                this._schedule();
            };
            next(done, rateLimiterId);
        }, delay);
    }

    /**
     * Takes the next task straight from this limiter's own queue.
     * @return {*} The dequeued task.
     */
    directDequeue() {
        return this._waitingTasks.dequeue();
    }

    /**
     * Takes the next task from the own queue or, if that is empty, from the
     * cluster.
     * @return {{next: *, rateLimiterId: *}|undefined} The task and the id of
     *     the limiter it came from (`undefined` for this limiter), or
     *     `undefined` when no task is available.
     */
    dequeue() {
        if (this.waitingSize) {
            return {
                next: this._waitingTasks.dequeue(),
                rateLimiterId: void 0
            };
        }
        return this._cluster?.dequeue();
    }
}
const rateLimiter_default = RateLimiter;
// src/rateLimiter/cluster.ts | ||
/**
 * Registry of rate limiters keyed by id. Limiters are created on demand with
 * the cluster-wide settings. When `homogeneous` is true, the cluster passes
 * itself to every limiter it creates as that limiter's `cluster` option.
 */
class Cluster {
    /**
     * @param {object} config
     * @param {number} config.maxConnections Passed to every created limiter.
     * @param {number} config.rateLimit Passed to every created limiter.
     * @param {number} config.priorityLevels Passed to every created limiter.
     * @param {number} config.defaultPriority Passed to every created limiter.
     * @param {boolean} [config.homogeneous] Whether limiters receive a
     *     reference to this cluster.
     */
    constructor({ maxConnections, rateLimit, priorityLevels, defaultPriority, homogeneous }) {
        // NOTE: a periodic cleanup of idle limiters (`startCleanup`) existed
        // only as commented-out code; `_interval` is never assigned elsewhere
        // in this class.
        this._interval = null;
        this.globalMaxConnections = maxConnections;
        this.globalRateLimit = rateLimit;
        this.globalpriorityLevels = priorityLevels;
        this.globalDefaultPriority = defaultPriority;
        this._homogeneous = homogeneous || false;
        this._rateLimiters = {};
    }

    /**
     * Returns the limiter registered under `id`, creating it first if needed.
     * Alternative to the old `Cluster.prototype.key`.
     * @param {number|string} [id=0] Limiter id.
     * @return {object} The rate limiter.
     */
    getRateLimiter(id) {
        const key = id ?? 0;
        if (!this._rateLimiters[key]) {
            const limiter = new rateLimiter_default({
                maxConnections: this.globalMaxConnections,
                rateLimit: this.globalRateLimit,
                priorityLevels: this.globalpriorityLevels,
                defaultPriority: this.globalDefaultPriority,
                cluster: this._homogeneous ? this : void 0
            });
            limiter.setId(key);
            this._rateLimiters[key] = limiter;
        }
        return this._rateLimiters[key];
    }

    /**
     * @param {number|string} [id=0] Limiter id; defaults like `getRateLimiter`.
     * @return {boolean} Whether a limiter is registered under `id`.
     */
    hasRateLimiter(id) {
        return !!this._rateLimiters[id ?? 0];
    }

    /**
     * @param {number|string} [id=0] Limiter id.
     * @return {boolean} Result of the `delete` operation.
     */
    deleteRateLimiter(id) {
        return delete this._rateLimiters[id ?? 0];
    }

    /**
     * @deprecated use waitingSize instead
     */
    get waitingClients() {
        return this.waitingSize;
    }

    /**
     * @return {number} Sum of `waitingSize` over all limiters.
     */
    get waitingSize() {
        return Object.values(this._rateLimiters).reduce(
            (waitingCount, rateLimiter) => waitingCount + rateLimiter.waitingSize,
            0
        );
    }

    /**
     * @deprecated use unfinishedSize instead
     */
    get unfinishedClients() {
        return this.unfinishedSize;
    }

    /**
     * @return {number} Sum of running and waiting tasks over all limiters.
     */
    get unfinishedSize() {
        return Object.values(this._rateLimiters).reduce(
            (unfinishedCount, rateLimiter) => unfinishedCount + rateLimiter.runningSize + rateLimiter.waitingSize,
            0
        );
    }

    /**
     * @return {boolean} `true` when at least one limiter has tasks waiting in
     *     its own queue.
     */
    hasWaitingTasks() {
        // Look at each limiter's own queue only. Asking a limiter's
        // hasWaitingTasks() would call back into this method when the limiter
        // holds a reference to this cluster, recursing without end once
        // every queue is empty.
        return Object.values(this._rateLimiters).some((rateLimiter) => rateLimiter.waitingSize > 0);
    }

    /**
     * Takes one task from the first limiter that has something waiting.
     * @return {{next: *, rateLimiterId: *}|undefined} The task together with
     *     the id of the limiter it was taken from, or `undefined` when every
     *     queue is empty.
     */
    dequeue() {
        for (const rateLimiter of Object.values(this._rateLimiters)) {
            if (rateLimiter.waitingSize) {
                return {
                    next: rateLimiter.directDequeue(),
                    rateLimiterId: rateLimiter.id
                };
            }
        }
        return void 0;
    }

    /**
     * @return {string} One "Id: …,running: …,waiting: …" entry per limiter,
     *     joined with ";".
     */
    get status() {
        // Iterate entries directly so non-numeric ids are reported too.
        return Object.entries(this._rateLimiters)
            .map(([id, rateLimiter]) => [
                `Id: ${id}`,
                `running: ${rateLimiter.runningSize}`,
                `waiting: ${rateLimiter.waitingSize}`
            ].join())
            .join(";");
    }

    /**
     * @return {boolean} `true` when no task is running or waiting.
     */
    get empty() {
        return this.unfinishedSize === 0;
    }
}
const cluster_default = Cluster;
// src/options.ts | ||
import { HttpProxyAgent, HttpsProxyAgent } from "hpagent"; | ||
import http2Wrapper from "http2-wrapper"; | ||
// Option names listed as global-only.
const globalOnlyOptions = [
    "maxConnections",
    "rateLimit",
    "priorityLevels",
    "skipDuplicates",
    "homogeneous",
    "userAgents",
    "silence"
];

// Crawler-specific option names, followed by all the global-only ones.
const crawlerOnlyOptions = [
    "rateLimiterId",
    "forceUTF8",
    "jQuery",
    "retryInterval",
    "priority",
    "proxy",
    "retries",
    "preRequest",
    "callback",
    "release",
    "isJson",
    "referer",
    "rejectUnauthorized",
    "userParams",
    ...globalOnlyOptions
];

// Option names listed as deprecated.
const deprecatedOptions = [
    "uri",
    "qs",
    "strictSSL",
    "incomingEncoding",
    "gzip",
    "jar",
    "jsonReviver",
    "jsonReplacer",
    "skipEventRequest",
    "logger",
    "debug",
    "time",
    "limiter",
    "gene"
];
/**
 * Extracts the charset named in a "content-type" header.
 * @param {object} headers Header map; only the "content-type" key is read.
 * @return {string|null} Lower-cased charset, or `null` when the header is
 *     absent or names no charset.
 */
function getCharset(headers) {
    const contentType = headers["content-type"];
    if (!contentType) {
        return null;
    }
    const found = contentType.match(/charset=['"]?([\w.-]+)/i);
    return found ? found[1].trim().toLowerCase() : null;
}
/**
 * Normalises the argument given to the crawler into an options object.
 *
 * - A string that is a valid URL becomes `{ url }`.
 * - Any other string is parsed as JSON; the parsed value must itself be a
 *   plain object (previously values such as `123` or `null` slipped through).
 * - A plain object (prototype `Object.prototype` or `null`) is returned as is.
 * @param {string|object} options Raw options.
 * @return {object} Options object.
 * @throws {TypeError} When the input cannot be turned into a plain object.
 */
const getValidOptions = (options) => {
    let candidate = options;
    if (getType(candidate) === "string") {
        if (isValidUrl(candidate)) return { url: candidate };
        try {
            candidate = JSON.parse(candidate);
        } catch (_err) {
            throw new TypeError(`Invalid options: ${JSON.stringify(options)}`);
        }
    }
    if (getType(candidate) === "object") {
        const prototype = Object.getPrototypeOf(candidate);
        if (prototype === Object.prototype || prototype === null) return candidate;
    }
    throw new TypeError(`Invalid options: ${JSON.stringify(options)}`);
};
/**
 * Translates crawler options into the options object handed to `got`.
 *
 * Legacy names (`uri`, `qs`, `gzip`, `gene`, `jsonReviver`, `jsonReplacer`,
 * `jar`, `strictSSL`) are mapped to their current equivalents, a proxy agent
 * is built when `proxy` is set, crawler-only and deprecated keys are removed,
 * `null`/`undefined` entries are stripped, and a `referer` header is derived
 * from the URL origin when none is given.
 *
 * Side effects kept from the original: `options.encoding` is filled from
 * `options.incomingEncoding` when unset, and the `referer` header is written
 * into the headers object that came with `options`.
 * @param {object} options Crawler request options.
 * @return {object} Options for `got`.
 */
const alignOptions = (options) => {
    const gotOptions = {
        ...options,
        url: options.url ?? options.uri,
        searchParams: options.searchParams ?? options.qs,
        decompress: options.decompress ?? options.gzip,
        userParams: options.userParams ?? options.gene,
        parseJson: options.parseJson ?? options.jsonReviver,
        stringifyJson: options.stringifyJson ?? options.jsonReplacer,
        cookieJar: options.cookieJar ?? options.jar,
        timeout: { request: options.timeout }
    };
    const sslConfig = options.rejectUnauthorized ?? options.strictSSL;
    if (sslConfig !== void 0) {
        // Copy instead of mutating: the `https` object may be shared with the
        // caller's (or the crawler-wide) options.
        gotOptions.https = { ...gotOptions.https, rejectUnauthorized: sslConfig };
    }
    if (options.http2 === true && options.proxy) {
        const { proxies: Http2Proxies } = http2Wrapper;
        const proxyOptions = { url: options.proxy };
        const http2Agent = options.proxy.startsWith("https")
            ? new Http2Proxies.Http2OverHttps({ proxyOptions })
            : new Http2Proxies.Http2OverHttp({ proxyOptions });
        gotOptions.agent = { http2: http2Agent };
    } else {
        // An explicitly supplied agent wins; otherwise build proxy agents on demand.
        gotOptions.agent = gotOptions.agent ?? (options.proxy ? {
            https: new HttpsProxyAgent({ proxy: options.proxy }),
            http: new HttpProxyAgent({ proxy: options.proxy })
        } : void 0);
    }
    if (options.encoding === void 0) options.encoding = options.incomingEncoding;
    gotOptions.responseType = "buffer";
    for (const key of crawlerOnlyOptions.concat(deprecatedOptions)) {
        if (key in gotOptions) {
            delete gotOptions[key];
        }
    }
    // Headers are re-attached after cleaning; default to an empty map so the
    // referer logic below cannot fail on a missing headers object.
    const headers = gotOptions.headers ?? {};
    cleanObject(gotOptions);
    gotOptions.headers = headers;
    if (!gotOptions.headers.referer) {
        if (options.referer) {
            gotOptions.headers.referer = options.referer;
        } else {
            // String() also covers URL instances and a missing url.
            const domain = String(gotOptions.url ?? "").match(/^(\w+):\/\/([^/]+)/);
            if (domain) gotOptions.headers.referer = domain[0];
        }
    }
    // Retries are handled by the crawler itself, so got's own are disabled.
    gotOptions.retry = { limit: 0 };
    return gotOptions;
};
// src/logger.ts | ||
import { Logger } from "tslog"; | ||
// Minimum log level for the NODE_ENV values known to this module.
const logLevelsByEnv = {
    "debug": 0,
    "production": 3,
    "test": 7
};

// Level used when NODE_ENV is unset or not listed above.
const DEFAULT_MIN_LOG_LEVEL = 3;

/**
 * Maps a NODE_ENV value to a minimum log level.
 * Unknown environments (for example "development") fall back to the default
 * level instead of producing `undefined`.
 * @param {string|undefined} env Value of `process.env.NODE_ENV`.
 * @return {number} Minimum log level.
 */
const resolveMinLevel = (env) => {
    if (env && Object.prototype.hasOwnProperty.call(logLevelsByEnv, env)) {
        return logLevelsByEnv[env];
    }
    return DEFAULT_MIN_LOG_LEVEL;
};

// Settings passed to the tslog `Logger` constructor.
const logOptions = {
    type: "pretty",
    name: "Crawler",
    hideLogPositionForProduction: true,
    prettyLogTemplate: "{{name}} {{logLevelName}} ",
    prettyLogStyles: {
        logLevelName: {
            SILLY: ["bold", "white"],
            TRACE: ["bold", "whiteBright"],
            DEBUG: ["bold", "green"],
            INFO: ["bold", "blue"],
            WARN: ["bold", "yellow"],
            ERROR: ["bold", "red"],
            FATAL: ["bold", "redBright"]
        },
        name: ["bold", "green"],
        dateIsoStr: "white",
        filePathWithLine: "white",
        nameWithDelimiterPrefix: ["white", "bold"],
        nameWithDelimiterSuffix: ["white", "bold"],
        errorName: ["bold", "bgRedBright", "whiteBright"],
        fileName: ["yellow"]
    },
    minLevel: 0
};
logOptions.minLevel = resolveMinLevel(process.env.NODE_ENV);

/**
 * @return {Logger} A new logger configured with `logOptions`.
 */
const getLogger = () => new Logger(logOptions);
// src/crawler.ts | ||
import { load } from "cheerio"; | ||
import seenreq from "seenreq"; | ||
import iconv from "iconv-lite"; | ||
// Logger shared by the crawler code in this module.
const log = getLogger();

// Cached default export of the "got" module; stays null until first use.
let gotInstance = null;

/**
 * Resolves the default export of "got", importing the module dynamically the
 * first time it is needed and reusing the cached value afterwards.
 * @return {Promise<*>} The default export of the "got" module.
 */
async function loadGot() {
    if (gotInstance) {
        return gotInstance;
    }
    const gotModule = await import("got");
    gotInstance = gotModule.default;
    return gotInstance;
}
/**
 * Web crawler: queues requests through per-id rate limiters, performs them
 * with `got`, post-processes the response (charset decoding, optional JSON
 * parsing, optional cheerio injection) and reports through callbacks.
 *
 * Events emitted by this class: "schedule", "limiterChange", "request",
 * "_release" and "drain".
 */
class Crawler extends EventEmitter {
    /**
     * @param {object} [options] Crawler-wide options merged over the built-in
     *     defaults; per-request options fall back to these.
     */
    constructor(options) {
        super();
        // Round-robin cursors for `userAgents` / `proxies` arrays.
        this._UAIndex = 0;
        this._proxyIndex = 0;

        /**
         * @param {object} headers Response headers.
         * @return {boolean} Whether "content-type" mentions xml or html.
         */
        this._detectHtmlOnHeaders = (headers) => {
            const contentType = headers["content-type"];
            if (/xml|html/i.test(contentType)) return true;
            return false;
        };

        /**
         * Hands a request to the rate limiter selected by
         * `options.rateLimiterId` and runs it once the limiter starts it.
         * @param {object} options Request options.
         */
        this._schedule = (options) => {
            this.emit("schedule", options);
            this._limiters.getRateLimiter(options.rateLimiterId).submit(options.priority, (done, rateLimiterId) => {
                options.release = () => {
                    done();
                    this.emit("_release");
                };
                options.callback = options.callback || options.release;
                // Compare against undefined so a task taken over from
                // limiter id 0 is reported as well.
                if (rateLimiterId !== void 0) {
                    this.emit("limiterChange", options, rateLimiterId);
                }
                if (options.html) {
                    // Inline HTML: skip the network and feed it to the handler.
                    options.url = options.url ?? "";
                    this._handler(null, options, { body: options.html, headers: { "content-type": "text/html" } });
                } else {
                    options.url = options.url ?? options.uri;
                    if (typeof options.url === "function") {
                        // The url option may be a function that delivers the
                        // actual URL through a callback.
                        options.url((url) => {
                            options.url = url;
                            this._execute(options);
                        });
                    } else {
                        delete options.uri;
                        this._execute(options);
                    }
                }
            });
        };

        /**
         * Prepares headers, user agent and proxy, then performs the request
         * (after `preRequest`, when provided) and passes the outcome to
         * `_handler`.
         * @param {object} options Request options (mutated).
         * @return {Promise<*>} Result of `_handler` when no `preRequest` is used.
         */
        this._execute = async (options) => {
            if (options.proxy) log.debug(`Using proxy: ${options.proxy}`);
            else if (options.proxies) log.debug(`Using proxies: ${options.proxies}`);
            options.headers = options.headers ?? {};
            options.headers = lowerObjectKeys(options.headers);
            if (options.forceUTF8 || options.isJson) options.encoding = "utf8";
            if (Array.isArray(options.userAgents)) {
                this._UAIndex = this._UAIndex % options.userAgents.length;
                options.headers["user-agent"] = options.userAgents[this._UAIndex];
                this._UAIndex++;
            } else {
                options.headers["user-agent"] = options.headers["user-agent"] ?? options.userAgents;
            }
            if (!options.proxy && Array.isArray(options.proxies)) {
                this._proxyIndex = this._proxyIndex % options.proxies.length;
                options.proxy = options.proxies[this._proxyIndex];
                this._proxyIndex++;
            }
            const request = async () => {
                if (options.skipEventRequest !== true) {
                    this.emit("request", options);
                }
                let response;
                try {
                    const got = await loadGot();
                    response = await got(alignOptions(options));
                } catch (error) {
                    log.debug(error);
                    return this._handler(error, options);
                }
                return this._handler(null, options, response);
            };
            if (isFunction(options.preRequest)) {
                try {
                    options.preRequest(options, async (err) => {
                        if (err) {
                            log.debug(err);
                            return this._handler(err, options);
                        }
                        return await request();
                    });
                } catch (err) {
                    log.error(err);
                    throw err;
                }
            } else {
                return await request();
            }
        };

        /**
         * Handles the outcome of a request: retries on error while retries
         * remain, otherwise decodes the body, optionally parses JSON or loads
         * cheerio, and invokes the callback.
         * @param {*} error Error of the request, or null.
         * @param {object} options Request options.
         * @param {object} [response] Response object on success.
         * @return {*} The callback's return value, or the response when no
         *     callback function is set.
         * @throws {*} The request error when it cannot be retried and no
         *     callback function is set.
         */
        this._handler = (error, options, response) => {
            if (error) {
                if (options.retries && options.retries > 0) {
                    log.warn(`${error} occurred on ${options.url}. ${options.retries ? `(${options.retries} retries left)` : ""}`);
                    setTimeout(() => {
                        options.retries--;
                        this._execute(options);
                    }, options.retryInterval);
                    return;
                } else {
                    log.error(`${error} occurred on ${options.url}. Request failed.`);
                    if (options.callback && typeof options.callback === "function") {
                        return options.callback(error, { options }, options.release);
                    } else {
                        throw error;
                    }
                }
            }
            if (!response.body) response.body = "";
            log.debug("Got " + (options.url || "html") + " (" + response.body.length + " bytes)...");
            response.options = options;
            response.charset = getCharset(response.headers);
            if (!response.charset) {
                // Fall back to a charset declaration found inside the body.
                const match = response.body.toString().match(/charset=['"]?([\w.-]+)/i);
                response.charset = match ? match[1].trim().toLowerCase() : null;
            }
            log.debug("Charset: " + response.charset);
            // `encoding: null` means: leave the body undecoded.
            if (options.encoding !== null) {
                options.encoding = options.encoding ?? response.charset ?? "utf8";
                try {
                    if (!Buffer.isBuffer(response.body)) response.body = Buffer.from(response.body);
                    response.body = iconv.decode(response.body, options.encoding);
                    response.body = response.body.toString();
                } catch (err) {
                    log.error(err);
                }
            }
            if (options.isJson) {
                try {
                    response.body = JSON.parse(response.body);
                } catch (_err) {
                    log.warn("JSON parsing failed, body is not JSON. Set isJson to false to mute this warning.");
                }
            }
            if (options.jQuery === true && !options.isJson) {
                if (response.body === "" || !this._detectHtmlOnHeaders(response.headers)) {
                    log.warn("response body is not HTML, skip injecting. Set jQuery to false to mute this warning.");
                } else {
                    try {
                        response.$ = load(response.body);
                    } catch (_err) {
                        log.warn("HTML detected failed. Set jQuery to false to mute this warning.");
                    }
                }
            }
            if (options.callback && typeof options.callback === "function") {
                return options.callback(null, response, options.release);
            }
            return response;
        };

        /**
         * @param options
         * @returns if there is a "callback" function in the options, return the result of the callback function. \
         * Otherwise, return a promise, which resolves when the request is successful and rejects when the request fails.
         * In the case of the promise, the resolved value will be the response object.
         * @description Send a request directly.
         * @example
         * ```js
         * const crawler = new Crawler();
         * crawler.send({
         *      url: "https://example.com",
         *      callback: (error, response, done) => { done(); }
         * });
         * await crawler.send("https://example.com");
         * ```
         */
        this.send = async (options) => {
            options = getValidOptions(options);
            options.retries = options.retries ?? 0;
            setDefaults(options, this.options);
            options.skipEventRequest = isBoolean(options.skipEventRequest) ? options.skipEventRequest : true;
            delete options.preRequest;
            return await this._execute(options);
        };

        /**
         * @deprecated
         * @description Old interface version. It is recommended to use `Crawler.send()` instead.
         * @see Crawler.send
         */
        this.direct = async (options) => {
            return await this.send(options);
        };

        /**
         * @param options
         * @description Add a request to the queue.
         * @example
         * ```js
         * const crawler = new Crawler();
         * crawler.add({
         *     url: "https://example.com",
         *     callback: (error, response, done) => { done(); }
         * });
         * ```
         */
        this.add = (options) => {
            let optionsArray = Array.isArray(options) ? options : [options];
            optionsArray = flattenDeep(optionsArray);
            optionsArray.forEach((options2) => {
                try {
                    options2 = getValidOptions(options2);
                } catch (err) {
                    // Invalid entries are skipped; the remaining ones are still queued.
                    log.warn(err);
                    return;
                }
                setDefaults(options2, this.options);
                options2.headers = { ...this.options.headers, ...options2.headers };
                if (!this.options.skipDuplicates) {
                    this._schedule(options2);
                    return;
                }
                this.seen.exists(options2, options2.seenreq).then((rst) => {
                    if (!rst) {
                        this._schedule(options2);
                    }
                }).catch((error) => log.error(error));
            });
        };

        /**
         * @deprecated
         * @description Old interface version. It is recommended to use `Crawler.add()` instead.
         * @see Crawler.add
         */
        this.queue = (options) => {
            return this.add(options);
        };

        const defaultOptions = {
            maxConnections: 10,
            rateLimit: 0,
            priorityLevels: 10,
            skipDuplicates: false,
            homogeneous: false,
            method: "GET",
            forceUTF8: false,
            jQuery: true,
            priority: 5,
            retries: 2,
            retryInterval: 3e3,
            timeout: 2e4,
            isJson: false,
            silence: false,
            rejectUnauthorized: false,
            // set to "true" in production environment.
            userAgents: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
        };
        this.options = { ...defaultOptions, ...options };
        // A positive rate limit forces a single connection.
        if (this.options.rateLimit > 0) {
            this.options.maxConnections = 1;
        }
        if (this.options.silence) {
            log.settings.minLevel = 7;
        }
        this._limiters = new cluster_default({
            maxConnections: this.options.maxConnections,
            rateLimit: this.options.rateLimit,
            priorityLevels: this.options.priorityLevels,
            defaultPriority: this.options.priority,
            homogeneous: this.options.homogeneous
        });
        this.seen = new seenreq(this.options.seenreq);
        this.seen.initialize().then(() => {
            log.debug("seenreq initialized");
        }).catch((error) => {
            log.error(error);
        });
        this.on("_release", () => {
            log.debug(`Queue size: ${this.queueSize}`);
            if (this._limiters.empty) this.emit("drain");
        });
    }

    /**
     * @return {number} Number of requests that are queued or currently
     *     running, summed over all rate limiters.
     */
    get queueSize() {
        return this._limiters.unfinishedSize;
    }

    /**
     * Changes a property of one rate limiter. Only "rateLimit" is handled.
     * @param {number} rateLimiterId Id of the limiter; must be a number.
     * @param {string} property Property name.
     * @param {*} value New value.
     */
    setLimiter(rateLimiterId, property, value) {
        if (!isNumber(rateLimiterId)) {
            log.error("rateLimiterId must be a number");
            return;
        }
        if (property === "rateLimit") {
            this._limiters.getRateLimiter(rateLimiterId).setRateLimit(value);
        }
    }
}
const crawler_default = Crawler;
// src/index.ts | ||
if (typeof module !== "undefined" && module.exports) { | ||
module.exports = crawler_default; | ||
} | ||
var index_default = crawler_default; | ||
export { | ||
index_default as default | ||
}; |
{ | ||
"name": "crawler", | ||
"version": "2.0.2", | ||
"version": "2.0.3-beta.1", | ||
"description": "Crawler is a ready-to-use web spider that works with proxies, asynchrony, rate limit, configurable request pools, jQuery, and HTTP/2 support.", | ||
@@ -9,5 +9,11 @@ "repository": { | ||
}, | ||
"exports": "./dist/index.js", | ||
"exports": { | ||
".": { | ||
"require": "./dist/index.js", | ||
"import": "./dist/index.mjs" | ||
} | ||
}, | ||
"scripts": { | ||
"build": "tsc", | ||
"build": "tsup src/index.ts --format cjs,esm --clean", | ||
"prepublishOnly": "npm run build", | ||
"test": "NODE_ENV=test ava", | ||
@@ -40,3 +46,3 @@ "cover": "NODE_ENV=test c8 ava" | ||
"cheerio": "1.0.0-rc.12", | ||
"got": "^14.4.1", | ||
"got": "^14.4.2", | ||
"hpagent": "^1.2.0", | ||
@@ -46,18 +52,19 @@ "http2-wrapper": "^2.2.1", | ||
"seenreq": "^3.0.0", | ||
"tslog": "^4.9.3" | ||
"tslog": "^4.9.3", | ||
"tsup": "^8.4.0" | ||
}, | ||
"devDependencies": { | ||
"@eslint/js": "^9.5.0", | ||
"@eslint/js": "^9.8.0", | ||
"@types/got": "^9.6.12", | ||
"@types/node": "^20.14.8", | ||
"@types/node": "^20.14.13", | ||
"ava": "^6.1.3", | ||
"c8": "^10.1.2", | ||
"eslint": "~9.4.0", | ||
"globals": "^15.6.0", | ||
"eslint": "^9.8.0", | ||
"globals": "^15.8.0", | ||
"nock": "^13.5.4", | ||
"sinon": "^18.0.0", | ||
"tough-cookie": "^4.1.4", | ||
"tsx": "^4.15.7", | ||
"typescript": "^5.4.5", | ||
"typescript-eslint": "8.0.0-alpha.30" | ||
"tsx": "^4.16.3", | ||
"typescript": "^5.5.4", | ||
"typescript-eslint": "^8.0.0" | ||
}, | ||
@@ -64,0 +71,0 @@ "ava": { |
@@ -9,3 +9,3 @@ <p align="center"> | ||
[](https://www.npmjs.com/package/crawler/v/2.0.0) | ||
[](https://www.npmjs.com/package/crawler/v/2.0.2) | ||
@@ -250,2 +250,3 @@ [](https://circleci.com/gh/bda-research/node-crawler/tree/master) | ||
- [Global only options](#global-only-options) | ||
- [`silence`](#silence) | ||
- [`maxConnections`](#maxconnections) | ||
@@ -525,3 +526,3 @@ - [`priorityLevels`](#prioritylevels) | ||
- **Type:** `number` | ||
- **Default** : 2000 | ||
- **Default** : 3000 | ||
- The number of milliseconds to wait before retrying. | ||
@@ -532,3 +533,3 @@ | ||
- **Type:** `number` | ||
- **Default** : 15000 | ||
- **Default** : 20000 | ||
- The number of milliseconds to wait before the request times out. | ||
@@ -535,0 +536,0 @@ |
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
No v1
Quality: Package is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
1695
26.49%680
0.15%76659
-28.15%8
14.29%5
-90.91%2
100%4
100%3
200%+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
Updated