Socket
Socket
Sign in · Demo · Install

nodejs-web-scraper

Package Overview
Dependencies
31
Maintainers
1
Versions
81
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 6.1.0 to 6.1.1

17

file_downloader/file_processor.js

@@ -7,4 +7,3 @@ const path = require('path');

constructor(config) {
// console.log(config)
// debugger;
this.originalFileName = config.fileName;

@@ -14,10 +13,8 @@ this.fileExtension = path.extname(this.originalFileName);

this.basePath = config.path[config.path.length - 1] === '/' ? config.path : config.path + '/';
// debugger;
this.initialFileNameExists = this.doesFileExist(this.basePath+this.originalFileName)
// console.log(this);
}
getAvailableFileName() {
// debugger;
getAvailableFileName() {
return this.createNewFileName(this.originalFileName);

@@ -39,3 +36,3 @@ }

if (!this.doesFileExist(this.basePath+fileName)) {
// console.log('new file name', newFileName)
return fileName;

@@ -51,8 +48,8 @@ }

doesFileExist(path) {
// debugger;
if (fs.existsSync(path)) {
// console.log(`file ${fileName} already exists!`);
return true;
}
// console.log(`file ${fileName} is being created for the first time`);
return false;

@@ -59,0 +56,0 @@

@@ -8,3 +8,3 @@ const sanitize = require('sanitize-filename');

const cleanUrl = removeQueryString(url)
// console.log('filenamecontentdisposition',fileNameFromContentDisposition)
const fileNameFromContentDisposition = getFileNameFromContentDisposition(headers['content-disposition'] || headers['Content-Disposition']);

@@ -14,3 +14,3 @@

// debugger;
//Second option

@@ -35,3 +35,3 @@ if (path.extname(cleanUrl)) {//First check if the url even has an extension

function getFileNameFromContentDisposition(contentDisposition) {
// debugger;
if (!contentDisposition || !contentDisposition.includes('filename=')) {

@@ -52,3 +52,2 @@ return "";

// console.log(contentType)
let extension = mime.extension(contentType)

@@ -66,3 +65,3 @@

function removeExtension(str) {
// debugger;
const arr = str.split('.');

@@ -82,3 +81,3 @@ if (arr.length == 1) {

function deduceFileNameFromUrl(url) {
// debugger;
const baseName = sanitize(path.basename(url));

@@ -85,0 +84,0 @@ return baseName;

@@ -1,4 +0,2 @@

counter = 0;
const {request} = require('../request/request.js');
// const sanitize = require('sanitize-filename');
const { request } = require('../request/request.js');
const path = require('path');

@@ -12,8 +10,8 @@ const FileProcessor = require('./file_processor');

const getFileNameFromResponse = require('./fileNameFromResponse')
// var mime = require('mime-types')
class FileDownloader {
constructor({ url, shouldBufferResponse = false, directory, cloneFiles, auth, timeout, headers, proxy }) {
constructor({ url, shouldBufferResponse = false, directory, cloneFiles, auth, timeout, headers, proxy }) {
this.url = url;

@@ -31,3 +29,3 @@ this.directory = directory;

async download() {
// debugger;
try {

@@ -45,3 +43,2 @@

})
this.response = response;

@@ -51,16 +48,13 @@ this.data = response.data;

} catch (error) {
throw error;
}
}
}
getFileNameData() {
const originalFileName = getFileNameFromResponse(this.url, this.response.headers);
const originalFileName = getFileNameFromResponse(this.url,this.response.headers);
let finalFileName;
let finalFileName;
const fileProcessor = new FileProcessor({ fileName: originalFileName, path: this.directory });

@@ -74,4 +68,2 @@ if (this.cloneFiles) {

const initialFileNameExists = fileProcessor.didInitialFileNameExist();
if (initialFileNameExists) counter++
// console.log('initialFileNameExists',counter)

@@ -88,8 +80,5 @@ return {//Return an object with both the "original"(deduced from the URL and headers) file name, and the final one

async save() {
// debugger;
try {
// debugger;
const { originalFileName, finalFileName, initialFileNameExists } = this.getFileNameData();
const { finalFileName } = this.getFileNameData();
if (this.shouldBufferResponse) {

@@ -103,6 +92,4 @@

}
}
catch (error) {
// debugger;
throw error

@@ -109,0 +96,0 @@ }

@@ -5,4 +5,2 @@ const CollectContent = require('./operations/CollectContent'),

Root = require('./operations/Root'),
// ScrollToBottom = require('./limitedSpa/ScrollToBottom'),
// ClickButton = require('./limitedSpa/ClickButton'),
Scraper = require('./Scraper.js');

@@ -16,4 +14,2 @@

CollectContent,
// ScrollToBottom,
// ClickButton
};

@@ -20,0 +16,0 @@

const Operation = require('./Operation')
var cheerio = require('cheerio');
// var cheerioAdv = require('cheerio-advanced-selectors')
// cheerio = cheerioAdv.wrap(cheerio)
// const fs = require('fs')
const { createElementList, getNodeContent } = require('../utils/cheerio')
// const { CustomResponse } = require('../request/request')

@@ -13,3 +9,3 @@ class CollectContent extends Operation {

*
* @param {string} querySelector cheerio-advanced-selectors selector
* @param {string} querySelector
* @param {Object} [config]

@@ -28,3 +24,2 @@ * @param {string} [config.name = 'Default CollectContent name']

this.querySelector = querySelector;
// this.validateOperationArguments();
if (typeof this.config.shouldTrim !== 'undefined') {//Checks if the user passed a "shouldTrim" property.

@@ -51,9 +46,2 @@ this.config.shouldTrim = this.config.shouldTrim;

async scrape({ html, url }) {
// this.scraper.log('colelcting content',url)
// debugger;
// if(this.config.name === 'videoLabel')
// debugger;
// const arr = url.split('/');
// const fileName = arr[arr.length-1]
// fs.writeFile(`${this.scraper.config.logPath}/${fileName}.html`,html,()=>{})

@@ -64,10 +52,6 @@ const parentAddress = url

this.config.contentType = this.config.contentType || 'text';
// !responseObjectFromParent && this.scraper.log('Empty response from content operation', responseObjectFromParent)
// debugger;
var $ = cheerio.load(html);
const elementList = await createElementList($, this.querySelector, { condition: this.config.condition, slice: this.config.slice });
// if(this.config.name === 'videoLabel')
// this.scraper.log(url,' Number of video elements: ',elementList.length)
if (this.config.getElementList) {

@@ -91,3 +75,2 @@ await this.config.getElementList(elementList, parentAddress);

if (this.config.getAllItems) {
// await this.config.afterScrape(currentWrapper);
await this.config.getAllItems(iterations, parentAddress);

@@ -94,0 +77,0 @@ }

const HttpOperation = require('./HttpOperation');
var cheerio = require('cheerio')
// var cheerioAdv = require('cheerio-advanced-selectors');
// cheerio = cheerioAdv.wrap(cheerio);
const fs = require('fs');

@@ -9,5 +7,3 @@ const { promisify } = require('util');

const Downloader = require('../file_downloader')
// const Downloader = require('nodejs-file-downloader')
const FileProcessor = require('../file_downloader/file_processor');
// const FileProcessor = require('nodejs-file-downloader/FileProcessor.js');
const crypto = require('crypto')

@@ -38,8 +34,6 @@ const { verifyDirectoryExists } = require('../utils/files')

constructor(querySelector, config = {}) {
// debugger;
super(config);
this.querySelector = querySelector;
// this.overridableProps = ['filePath', 'fileFlag', 'imageResponseType'];
this.overridableProps = ['filePath'];

@@ -76,3 +70,3 @@ for (let prop in config) {

*/
async scrape({html,url}) {
async scrape({ html, url }) {
// debugger;

@@ -94,4 +88,3 @@ if (!this.directoryVerified) {

}
// debugger;config.
const fileRefs = this.getFileRefs(url,elementList,baseUrlFromBaseTag)
const fileRefs = this.getFileRefs(url, elementList, baseUrlFromBaseTag)

@@ -117,3 +110,3 @@

*/
getFileRefs(url,elementList,baseUrlFromBaseTag){
getFileRefs(url, elementList, baseUrlFromBaseTag) {
const fileRefs = []

@@ -140,3 +133,3 @@ elementList.forEach((element) => {

const absoluteUrl = getAbsoluteUrl(baseUrlFromBaseTag || url, src);
fileRefs.push(absoluteUrl);
fileRefs.push(absoluteUrl);

@@ -180,6 +173,4 @@ })

const fileProcessor = new FileProcessor({fileName: `${fileName}.${extension}`, path: this.config.filePath || this.scraper.config.filePath });
const fileProcessor = new FileProcessor({ fileName: `${fileName}.${extension}`, path: this.config.filePath || this.scraper.config.filePath });
if (this.scraper.config.cloneFiles) {
// debugger;
// fileName = await fileProcessor.getAvailableFileName();
fileName = fileProcessor.getAvailableFileName();

@@ -193,3 +184,2 @@ } else {

// this.scraper.log('images:', this.scraper.state.downloadedFiles)

@@ -209,3 +199,3 @@ }

const options = {

@@ -218,6 +208,4 @@ url,

timeout: this.scraper.config.timeout,
// timeout: 150,
headers: this.scraper.config.headers,
proxy: this.scraper.config.proxy,
// useSynchronousMode:true

@@ -224,0 +212,0 @@ }

@@ -1,2 +0,1 @@

// const Operation = require('../Operation');//For jsdoc
const { request } = require('../../request/request.js');

@@ -7,6 +6,3 @@ const { stripTags } = require('../../utils/html');

const { getPaginationUrls } = require('../../utils/pagination');
// const { CustomResponse } = require('../../request/request')//For jsdoc
// const SPA_page = require('../../limitedSpa/SPA_Page');
// require('../typedef.js');

@@ -24,4 +20,2 @@ class PageHelper {

/**

@@ -34,3 +28,3 @@ *

async processOneIteration(href, shouldPaginate) {//Will process one scraping object, including a pagination object. Used by Root and OpenLinks.
// debugger;
if (shouldPaginate) {//If the scraping object is actually a pagination one, a different function is called.

@@ -48,8 +42,7 @@ return this.paginate(href);

var response = await this.getPage(href);
// debugger
await this.runAfterResponseHooks(response)
// debugger;
var dataFromChildren = await this.Operation.scrapeChildren(this.Operation.operations, {html:response.data,url:response.url})
var dataFromChildren = await this.Operation.scrapeChildren(this.Operation.operations, { html: response.data, url: response.url })
await this.runGetPageObjectHook(href, dataFromChildren)

@@ -60,4 +53,3 @@

catch (error) {
// this.Operation.scraper.log(error)
// debugger;
const errorString = `There was an error opening page ${href}, ${error}`;

@@ -128,3 +120,3 @@ iteration.error = errorString;

if (this.Operation.config.getPageHtml) {
// debugger;
await this.Operation.config.getPageHtml(resp.data, resp.url)

@@ -134,3 +126,3 @@ }

} catch (error) {
// debugger;
throw error;

@@ -155,14 +147,10 @@ }

if (this.Operation.config.getPageObject) {
// debugger;
const tree = {
}
const tree = {}
for (let child of dataFromChildren) {
// debugger;
// tree[child.name] = child.data
const func = getDictionaryKey(child.name);
tree[func(child.name, tree)] = child.data
}
await this.Operation.config.getPageObject(tree,address)
await this.Operation.config.getPageObject(tree, address)
}

@@ -182,3 +170,3 @@ }

if (this.Operation.config.getPageResponse) {//If a "getResponse" callback was provided, it will be called
// debugger;
if (typeof this.Operation.config.getPageResponse !== 'function')

@@ -185,0 +173,0 @@ throw "'getPageResponse' callback must be a function";

const Operation = require('./Operation');
var cheerio = require('cheerio')
// var cheerioAdv = require('cheerio-advanced-selectors');
// cheerio = cheerioAdv.wrap(cheerio);
const { createDelay } = require('../utils/delay');

@@ -17,4 +14,2 @@ const rpur = require('../utils/rpur')

// this.virtualOperations = [];//Will hold "virtual operations" performed by Puppeteer, which are out of the normal scraping flow.
if (this.condition) {

@@ -28,3 +23,2 @@ const type = typeof this.condition;

}
this.counter = 0;

@@ -46,15 +40,15 @@

async repeatPromiseUntilResolved(promiseFactory, href) {
// debugger;
const maxAttempts = this.scraper.config.maxRetries + 1;//Note that "maxRetries refers" to the number of retries, whereas
//"maxAttempts" is the overall number of iterations, therefore adding 1.
const shouldStop = (error) => {
// debugger;
const errorCode = error.response ? error.response.status : error
if (this.scraper.config.errorCodesToSkip.includes(errorCode)) {
// debugger;
const error = new Error();
error.message = `Skipping error ${errorCode}`;
// debugger;
error.code = errorCode;

@@ -69,3 +63,2 @@ return true

this.scraper.log(`Retrying failed promise...error: ${error}, 'href:' ${href}`);
// this.scraper.log('Retrying failed promise...error:', error);
const newRetries = retries + 1;

@@ -76,3 +69,2 @@ this.scraper.log(`Retreis ${newRetries}`)

// return await this.qyuFactory(() => this.repeatPromiseUntilResolved(promiseFactory, url));

@@ -121,3 +113,2 @@ return await rpur(promiseFactory, { maxAttempts, shouldStop, onError, timeout: 0 });

let currentSpacer = this.scraper.requestSpacer;
// this.scraper.requestSpacer = currentSpacer.then(() => Promise.delay(this.scraper.config.delay));
this.scraper.requestSpacer = currentSpacer.then(() => createDelay(this.scraper.config.delay));

@@ -131,3 +122,2 @@ await currentSpacer;

}
// Object.assign(HttpOperation.prototype,PageMixin)
module.exports = HttpOperation;

@@ -8,18 +8,15 @@

injectScraper: function (ScraperInstance) {//Override the original init function of Operation
this.scraper = ScraperInstance;
// debugger;
ScraperInstance.registerOperation(this);
for (let operation of this.operations) {
operation.injectScraper(ScraperInstance);
}
this.validateOperationArguments();
},
};
module.exports = CompositeInjectMixin;
injectScraper: function (ScraperInstance) {//Override the original init function of Operation
this.scraper = ScraperInstance;
ScraperInstance.registerOperation(this);
for (let operation of this.operations) {
operation.injectScraper(ScraperInstance);
}
this.validateOperationArguments();
},
};
module.exports = CompositeInjectMixin;

@@ -6,16 +6,13 @@

*/
const CompositeScrapeMixin = {
const CompositeScrapeMixin = {
scrapeChildren: async function (childOperations, {url,html}) {//Scrapes the child operations of this OpenLinks object.
scrapeChildren: async function (childOperations, { url, html }) {//Scrapes the child operations of this OpenLinks object.
// debugger;
const scrapedData = []
for (let operation of childOperations) {
// const dataFromChild = await operation.scrape(responseObjectFromParent);
const dataFromChild = await operation.scrape({url,html});
const dataFromChild = await operation.scrape({ url, html });
scrapedData.push(dataFromChild);
}
// responseObjectFromParent = null;
return scrapedData;

@@ -22,0 +19,0 @@ }

const HttpOperation = require('./HttpOperation');
const CompositeInjectMixin = require('./mixins/CompositeInjectMixin');
const CompositeScrapeMixin = require('./mixins/CompositeScrapeMixin');
// const Operation = require('./Operation')//For jsdoc
var cheerio = require('cheerio')
// var cheerioAdv = require('cheerio-advanced-selectors');
// cheerio = cheerioAdv.wrap(cheerio);
const { getBaseUrlFromBaseTag, createElementList } = require('../utils/cheerio');
const { getAbsoluteUrl } = require('../utils/url');
const PageHelper = require('./helpers/PageHelper');
// const SPA_PageHelper = require('./helpers/SPA_PageHelper');
// const { CustomResponse } = require('../request/request');//For jsdoc
const { mapPromisesWithLimitation } = require('../utils/concurrency');

@@ -45,6 +40,3 @@

super(config);
// this.pageHelper = new PageHelper(this);
this.pageHelper = null;
// this.compositeHelper = new CompositeHelper(this);
// this.virtualOperations = []
this.operations = [];//References to child operation objects.

@@ -67,3 +59,2 @@ this.querySelector = querySelector;

addOperation(Operation) {
// this._addOperation(Operation);
this.operations.push(Operation)

@@ -73,7 +64,3 @@ }

initPageHelper() {
if (!this.scraper.config.usePuppeteer) {
this.pageHelper = new PageHelper(this)
} else {
this.pageHelper = new SPA_PageHelper(this);
}
this.pageHelper = new PageHelper(this)
}

@@ -96,3 +83,3 @@

this.initPageHelper();
// debugger;
const refs = await this.createLinkList(html, url)

@@ -105,3 +92,3 @@

}
// debugger;
const shouldPaginate = this.config.pagination ? true : false;

@@ -111,3 +98,3 @@ const iterations = [];

await mapPromisesWithLimitation(refs, async (href) => {
// debugger;
const data = await this.pageHelper.processOneIteration(

@@ -133,5 +120,5 @@ this.transformHref(href),

async createLinkList(html, url) {
// debugger;
var $ = cheerio.load(html);
// debugger;
const elementList = await createElementList($, this.querySelector, {

@@ -138,0 +125,0 @@ condition: this.config.condition,

// const Scraper = require('../Scraper');//For jsdoc
/**
* Base class for all operations(not including limitedSpa).
* Base class for all operations
* Every Operation must implement its own scrape() method.

@@ -12,3 +11,2 @@ */

this.config = {}

@@ -24,4 +22,2 @@ if (objectConfig) {

// this.querySelector = querySelector;
// this.config.name = this.getOperationName(this.config.name);
this.scraper = null; //Scraper instance is passed later on.

@@ -31,7 +27,5 @@ this.data = []; //Holds all data collected by this operation, in the form of possibly multiple "ScrapingWrappers".

}
}
/**

@@ -46,7 +40,5 @@ * Being that all Operation objects are created independetly from the Scraper, a Scraper reference must be passed to them.

this.handleNewOperationCreation(this)
// debugger;
this.handleNewOperationCreation(this)
this.validateOperationArguments();//Implemented by all Operation objects
}

@@ -70,7 +62,4 @@

handleFailedScrapingIteration(errorString) {
// handleFailedScrapingIteration(error) {
this.scraper.log(errorString);
// scrapingAction.setError(errorString, errorCode)
this.scraper.reportFailedScrapingAction(errorString);
}

@@ -87,3 +76,2 @@

/**

@@ -93,4 +81,3 @@ * Get the entire data collected by this operation

*/
getData() {
// debugger;
getData() {
return this.data;

@@ -100,4 +87,2 @@ }

/**

@@ -110,8 +95,5 @@ * @return {string[]}

}
module.exports = Operation;

@@ -5,5 +5,3 @@ const HttpOperation = require('./HttpOperation');

const PageHelper = require('./helpers/PageHelper');
// const SPA_PageHelper = require('./helpers/SPA_PageHelper');
/**

@@ -30,4 +28,2 @@ *

this.operations = [];//References to child operation objects.
// this.virtualOperations = []
// this.pageHelper = new PageHelper(this);
this.pageHelper = null;

@@ -41,3 +37,2 @@ }

addOperation(Operation) {
// this._addOperation(Operation);
this.operations.push(Operation)

@@ -48,7 +43,3 @@ }

initPageHelper() {
if (!this.scraper.config.usePuppeteer) {
this.pageHelper = new PageHelper(this)
}else{
this.pageHelper = new SPA_PageHelper(this);
}
this.pageHelper = new PageHelper(this)
}

@@ -64,6 +55,6 @@

const shouldPaginate = this.config.pagination ? true : false;
// debugger;
const data = await this.pageHelper.processOneIteration(this.scraper.config.startUrl, shouldPaginate);
// debugger;
this.data = data

@@ -80,4 +71,3 @@ if (this.config.getPageData) {

*/
getErrors() {
// debugger;
getErrors() {
let errors = [...this.errors];

@@ -93,11 +83,5 @@

validateOperationArguments() {
// Intentionally a no-op: this implementation performs no argument checks.
}
}
Object.assign(Root.prototype, CompositeInjectMixin)

@@ -104,0 +88,0 @@ Object.assign(Root.prototype, CompositeScrapeMixin)

{
"name": "nodejs-web-scraper",
"version": "6.1.0",
"version": "6.1.1",
"description": "A web scraper for NodeJs",

@@ -5,0 +5,0 @@ "main": "index.js",

const fetch = require('node-fetch')
// const fetch = require('./fetch.js')
var HttpsProxyAgent = require('https-proxy-agent');
// import AbortController from 'abort-controller';
// const Signal = require('./signal.js')

@@ -13,3 +10,2 @@ function createInstance(config) {

function request(config) {
// throw 'error from request.request';
return createInstance(config).getFinalResponseObject();

@@ -20,12 +16,9 @@ }

// module.exports = class Request {
class Request {
constructor(config) {
this.originalResponse = null;//Original response object from fetch.
constructor(config) {
// debugger;
this.originalResponse = null;//Original response object from fetch.
// debugger;
// this.abortController = new AbortController()
const defaultConfig = {

@@ -35,3 +28,2 @@ method: 'GET',

headers: null,
// signal: new Signal(),
proxy: null,//Proxy string

@@ -42,6 +34,6 @@ responseType: 'text',//'text','json' or 'stream'. If 'stream' is chosen, the stream itself is returned.

}
// debugger;
if(config.headers){
if (config.headers) {
const isEmpty = Object.keys(config.headers).length === 0 && config.headers.constructor === Object;
if(isEmpty){
if (isEmpty) {
config.headers = null;

@@ -59,13 +51,11 @@ }

}
// debugger;
}
if (this.config.proxy) {
// this.config.agent = getAgent(this.config.proxy);
this.config.agent = new HttpsProxyAgent(this.config.proxy)
}
// debugger;
}

@@ -75,9 +65,8 @@

// controller.abort()
const url = config.url;
// debugger;
const response = await fetch(url, config);
// debugger;
this.originalResponse = response;
return response;

@@ -89,11 +78,8 @@ }

getRequestHeaders() {
// debugger;
// console.log(this)
return {
"Accept-Encoding": "gzip,deflate",
// 'User-Agent': "node-fetch/1.0 (+https://github.com/bitinn/node-fetch)",
'User-Agent': "node-fetch/1.0",
"Accept": "*/*",
...this.config.headers,
}

@@ -106,3 +92,2 @@ }

case 'text':
// debugger;
data = await fetchResponse.text();

@@ -113,3 +98,3 @@

case 'buffer':
// debugger;
data = await fetchResponse.buffer();

@@ -134,3 +119,3 @@

const requestHeaders = this.getRequestHeaders()
// debugger;
return new CustomResponse({

@@ -148,10 +133,5 @@ config: { ...this.config, headers: requestHeaders },

handleStatusCodes(customResponse) {
// debugger;
// const {status} = fetchResponse
// const response= this.createCustomResponseObjectFromFetchResponse(fetchResponse);
if (customResponse.status >= 400) {
const error = new CustomError({ code: customResponse.status, response: customResponse, message: `Server responded with ${customResponse.status}` })
// debugger;
throw error;
}

@@ -161,3 +141,3 @@ }

createCustomErrorFromFetchError(fetchError) {//Fetch errors are thrown only for network errors. There is no actual "response".
// debugger;
const error = new CustomError({ errno: fetchError.errno, message: fetchError.message })

@@ -169,8 +149,6 @@ return error;

async getFinalResponseObject() {
// throw 'error from request.getFinalResponseObject'
// console.log(this.config)
try {
var response = await this.performRequest(this.config);
} catch (fetchError) {//Network error has ocurred.
// debugger;
const error = this.createCustomErrorFromFetchError(fetchError)

@@ -189,3 +167,2 @@ throw error;

}

@@ -203,4 +180,2 @@ }

this.headers = headers
// this.aborted = false
// this.signal = new Signal();
}

@@ -213,6 +188,5 @@

class CustomError extends Error {
// debugger;
constructor({ code, response, message, errno }) {
super(message)
// this.config = config;//The config object of the failing request
this.errno = errno//Error constant. Will be set Only in the case of network errors.

@@ -224,6 +198,3 @@ this.code = code;//http code.Null if network error

// module.exports.default = request;
module.exports = {request, CustomResponse};
// module.exports.Request = Request;
module.exports = { request, CustomResponse };
// debugger;

@@ -7,5 +7,2 @@

const { deepSpread } = require('./utils/objects')
// const { Root } = require('./');//For jsdoc
// const PathQueue = require('./utils/PathQueue');
// const PuppeteerSimple = require('puppeteer-simple').default

@@ -16,4 +13,2 @@ /**

*/
class Scraper {

@@ -58,17 +53,5 @@

showConsoleLogs: true,
usePuppeteer: false,//Deprecated
puppeteerDebugMode: false,//For debugging
puppeteerConfig: {
// headless: false,
// args:[],
defaultViewport: null,
timeout: 40000,//40 seconds for full page load(network idle)
waitUntil: 'networkidle0',
...globalConfig.puppeteerConfig
},
onError: null //callback runs whenever any error occurs during scraping
}
// this.state = new State();
this.state = {
// existingUserFileDirectories: [],
failedScrapingIterations: [],

@@ -82,9 +65,5 @@ downloadedFiles: 0,

this.validateGlobalConfig(globalConfig);
deepSpread(this.config, globalConfig)
this.validateGlobalConfig(globalConfig);
deepSpread(this.config,globalConfig)
// debugger;
this.config.errorCodesToSkip = [404, 403, 400];

@@ -94,17 +73,7 @@

this.requestSpacer = Promise.resolve();
// debugger;
if (this.config.usePuppeteer) {
throw new Error('usePuppeteer is deprecated since version 5. If you need it, downgrade to version 4.2.2')
// debugger;
const puppeteerConfig = this.config.puppeteerConfig;
// debugger
// const { args,headless } = puppeteerConfig;
this.puppeteerSimple = new PuppeteerSimple({ ...puppeteerConfig})
this.isBrowserReady = this.puppeteerSimple.createBrowser();
}
// this.pathQueue = new PathQueue();
this.referenceToRoot = null;

@@ -126,8 +95,3 @@

getPuppeteerSimpleInstance() {
return this.puppeteerSimple;
}
validateGlobalConfig(conf) {

@@ -142,4 +106,2 @@ if (!conf || typeof conf !== 'object')

/**

@@ -155,5 +117,3 @@ * Starts the recursive scraping process

this.referenceToRoot = rootObject;
// debugger;
// rootObject.injectScraper(this)
// debugger;
rootObject.injectScraper(this)

@@ -173,3 +133,2 @@

}
// this.outPutErrors();
if (this.config.logPath) {

@@ -182,21 +141,4 @@ try {

}
// this.log('global.counter of alternative src ',global.counter)
this.log(`overall files: ${this.state.downloadedFiles}`)
if (this.config.usePuppeteer) {
// setTimeout(()=>{
if (!this.config.puppeteerDebugMode) {
try {
await this.puppeteerSimple.close()
} catch (error) {
this.log('Error shutting down puppeteer',error)
}
}
// },1000)
}
}

@@ -209,3 +151,2 @@

areThereRepeatableErrors() {
// debugger;
return this.state.failedScrapingIterations.length > 0;

@@ -221,3 +162,3 @@ }

this.state.failedScrapingIterations.push(errorString);
if(this.config.onError) this.config.onError(errorString);
if (this.config.onError) this.config.onError(errorString);
}

@@ -233,8 +174,5 @@

saveFile(data, fileName) {
// verifyDirectoryExists(this.config.logPath);
return new Promise(async (resolve, reject) => {
await verifyDirectoryExists(this.config.logPath);
// this.log('saving file')
// debugger;
fs.writeFile(path.join(this.config.logPath, `${fileName}.json`), JSON.stringify(data), (error) => {

@@ -247,3 +185,2 @@ if (error) {

}
});

@@ -259,3 +196,3 @@

async createLogs() {
// debugger;
for (let operation of this.state.registeredOperations) {

@@ -267,3 +204,2 @@ const fileName = operation.constructor.name === 'Root' ? 'log' : operation.config.name;

await this.createLog({ fileName: 'finalErrors', data: this.state.failedScrapingIterations })
// await this.createLog({ fileName: 'allErrors', data: this.referenceToRoot.getErrors() })
}

@@ -294,5 +230,5 @@

module.exports = Scraper;
// debugger;

@@ -7,6 +7,6 @@

*/
function getBaseUrlFromBaseTag($,baseSiteUrl) {
function getBaseUrlFromBaseTag($, baseSiteUrl) {
let baseMetaTag = $('base');
// debugger;
if (baseMetaTag.length == 0 || baseMetaTag.length > 1) {

@@ -38,4 +38,4 @@ baseMetaTag = null;

*/
function createNodeList($,querySelector,config={}) {//Gets a cheerio object and creates a nodelist.
const {slice}=config;
function createNodeList($, querySelector, config = {}) {//Gets a cheerio object and creates a nodelist.
const { slice } = config;
const nodeList = slice ? $(querySelector).slice(typeof slice === 'number' ? slice : slice[0], slice[1]) : $(querySelector);

@@ -55,5 +55,5 @@

*/
async function createElementList($,querySelector,config={}) {
const {condition,slice} = config
const nodeList = Array.from(createNodeList($,querySelector,{slice}));
async function createElementList($, querySelector, config = {}) {
const { condition, slice } = config
const nodeList = Array.from(createNodeList($, querySelector, { slice }));
const elementList = [];

@@ -84,4 +84,4 @@ for (let node of nodeList) {

*/
function getNodeContent(elem,config={}) {
const {contentType,shouldTrim} = config;
function getNodeContent(elem, config = {}) {
const { contentType, shouldTrim } = config;
const getText = () => shouldTrim ? elem.text().trim() : elem.text();//Will trim the string, if "shouldTrim" is true.

@@ -94,4 +94,3 @@ switch (contentType) {

default:
return getText();;
return getText();
}

@@ -101,3 +100,3 @@ }

module.exports ={
module.exports = {
getBaseUrlFromBaseTag,

@@ -104,0 +103,0 @@ createNodeList,

const fs = require('fs');
// const {promisify} = require('util');
// const access = promisify(fs.access);
// const mkdir = promisify(fs.mkdir);
function verifyDirectoryExists(path) {//Will make sure the target directory exists.
if (!fs.existsSync(path)) {
// console.log('creating dir:', path)
fs.mkdirSync(path,{recursive:true});
fs.mkdirSync(path, { recursive: true });
}
}
// async function verifyDirectoryExists(path){
// try {
// await access(path);
// } catch (error) {
// console.log('error from verify',error)
// try {
// await mkdir(path);
// } catch (error) {
// if(error.code !== 'EEXIST'){
// throw error;
// }
// // debugger
// }
// }
// }
function verifyDirectoryExistsAsync(path) {
function verifyDirectoryExistsAsync(path){
return new Promise((resolve, reject) => {
fs.access(path, (err) => {
if (err) {
fs.mkdir(path, { recursive: true }, (err) => {
resolve();
})
} else {
resolve();
}
})
})
return new Promise((resolve,reject)=>{
fs.access(path,(err)=>{
// debugger;
if(err){
fs.mkdir(path,{recursive:true},(err)=>{
// debugger;
resolve();
})
}else{
resolve();
}
})
})
}

@@ -53,0 +27,0 @@

@@ -8,8 +8,8 @@

function stripTags(html) {//Cleans the html string from script and style tags.
let clean;
clean = html.replace(/<\s*script[^>]*>[\s\S]*?(<\s*\/script[^>]*>|$)/ig, '');
clean = clean.replace(/<style[^>]*>[\s\S]*?(<\/style[^>]*>|$)/ig, '');
return clean;
let clean;
clean = html.replace(/<\s*script[^>]*>[\s\S]*?(<\s*\/script[^>]*>|$)/ig, '');
clean = clean.replace(/<style[^>]*>[\s\S]*?(<\/style[^>]*>|$)/ig, '');
return clean;
}

@@ -16,0 +16,0 @@

@@ -1,2 +0,1 @@

/**

@@ -10,3 +9,2 @@ * To prevent dictionary key-collision, get a number-appended key. Returns a higher order function, to preserve original key.

if (!dictionary[keyName]) {
// console.log('new file name', newFileName)
return keyName;

@@ -34,15 +32,14 @@ }

*/
function deepSpread(originalObject,secondaryObject){
// debugger
if(!originalObject)
function deepSpread(originalObject, secondaryObject) {
if (!originalObject)
originalObject = {}
for(let prop in secondaryObject){
if(typeof secondaryObject[prop] === 'object' && !Array.isArray(secondaryObject[prop])){
// debugger;
deepSpread(originalObject[prop],secondaryObject[prop]);
}else{
originalObject[prop] = secondaryObject[prop]
for (let prop in secondaryObject) {
if (typeof secondaryObject[prop] === 'object' && !Array.isArray(secondaryObject[prop])) {
deepSpread(originalObject[prop], secondaryObject[prop]);
} else {
originalObject[prop] = secondaryObject[prop]
}
}

@@ -49,0 +46,0 @@ }

@@ -8,6 +8,4 @@ /**

function getPaginationUrls(address, { numPages, begin, end, offset = 1, queryString, routingString }) {
// const numPages = pagination.numPages;
const firstPage = typeof begin !== 'undefined' ? begin : 1;
const lastPage = end || numPages;
// const offset = offset || 1;
const paginationUrls = []

@@ -14,0 +12,0 @@ for (let i = firstPage; i <= lastPage; i = i + offset) {

const {createDelay} = require('./delay')
/**

@@ -15,12 +14,7 @@ *

*/
// async function repeatPromiseUntilResolved(...args) {//Destructuring arguments in order to avoid having the "attempts" counter as part of the API.
module.exports = async function rpur(promiseFactory,config={}) {
// const promiseFactory = args[0]
// const config = args[1]
// const attempts = args[2] || 0
// debugger;
const attempts = arguments[2] || 0
// console.log(attempts)
// const {maxRetries} = config
// debugger;
const dummy = () => false;

@@ -32,18 +26,15 @@ const shouldStop = config.shouldStop || dummy;

try {
// console.log('Attempt number: ',attempts+1)
if (config.onAttempt) {
await config.onAttempt(attempts + 1)
}
// debugger;
const promise = promiseFactory();
const result = await promiseWithTimeout(promise, timeout);
// const result = await promiseFactory();
return result;
} catch (error) {
// debugger;
// console.log('Retrying failed promise');
const newAttempts = attempts + 1;
if (config.onError) {
// debugger;
await config.onError(error, newAttempts)

@@ -55,6 +46,2 @@ }

throw error;
// console.log('Attempts', newAttempts)
// if (newAttempts == maxAttempts) {//If it reached the maximum allowed number of retries, it throws an error.
// throw error;
// }

@@ -73,7 +60,6 @@ if (delay) {

function promiseWithTimeout(promise, time) {
// debugger;
return new Promise(async (resolve, reject) => {
if (time) {
var timeout = setTimeout(() => {
// console.log('timed out!')
reject(new Error('Promise timed out as defined in the config'))

@@ -80,0 +66,0 @@ }, time)

@@ -9,6 +9,5 @@

function getAbsoluteUrl(base, relative) {//Resolves "relative" against "base" and returns the absolute URL string.
    // The URL constructor performs the resolution. If the pair cannot be parsed
    // it throws, and that error is left to propagate to the caller.
    const resolved = new URL(relative, base);
    return resolved.href;
}

@@ -15,0 +14,0 @@

Socket · Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc