nodejs-web-scraper
Comparing version 6.1.0 to 6.1.1
file_downloader/file_processor.js

@@ -7,4 +7,3 @@ const path = require('path');
constructor(config) {
// console.log(config)
// debugger;
this.originalFileName = config.fileName;
@@ -14,10 +13,8 @@ this.fileExtension = path.extname(this.originalFileName);
this.basePath = config.path[config.path.length - 1] === '/' ? config.path : config.path + '/';
// debugger;
this.initialFileNameExists = this.doesFileExist(this.basePath+this.originalFileName)
// console.log(this);
}
getAvailableFileName() {
// debugger;
getAvailableFileName() {
return this.createNewFileName(this.originalFileName);
@@ -39,3 +36,3 @@ }
if (!this.doesFileExist(this.basePath+fileName)) {
// console.log('new file name', newFileName)
return fileName;
@@ -51,8 +48,8 @@ }
doesFileExist(path) {
// debugger;
if (fs.existsSync(path)) {
// console.log(`file ${fileName} already exists!`);
return true;
}
// console.log(`file ${fileName} is being created for the first time`);
return false;
@@ -59,0 +56,0 @@
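
FileProcessor's job is collision-free naming: when cloneFiles is enabled, an existing image.jpg becomes image1.jpg, image2.jpg, and so on. A minimal standalone sketch of that idea (the suffix loop is an assumption based on getAvailableFileName/createNewFileName above, not a copy of the library's internals):

    const fs = require('fs');
    const path = require('path');

    // Returns a name that does not collide with an existing file:
    // "image.jpg" -> "image1.jpg" -> "image2.jpg" ...
    function getAvailableFileName(dir, fileName) {
        const ext = path.extname(fileName);
        const base = path.basename(fileName, ext);
        let candidate = fileName;
        let counter = 0;
        while (fs.existsSync(path.join(dir, candidate))) {
            counter++;
            candidate = `${base}${counter}${ext}`;
        }
        return candidate;
    }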

file_downloader/fileNameFromResponse.js

@@ -8,3 +8,3 @@ const sanitize = require('sanitize-filename');
const cleanUrl = removeQueryString(url)
// console.log('filenamecontentdisposition',fileNameFromContentDisposition)
const fileNameFromContentDisposition = getFileNameFromContentDisposition(headers['content-disposition'] || headers['Content-Disposition']);
@@ -14,3 +14,3 @@
// debugger;
//Second option
@@ -35,3 +35,3 @@ if (path.extname(cleanUrl)) {//First check if the url even has an extension
function getFileNameFromContentDisposition(contentDisposition) {
// debugger;
if (!contentDisposition || !contentDisposition.includes('filename=')) {
@@ -52,3 +52,2 @@ return "";
// console.log(contentType)
let extension = mime.extension(contentType)
@@ -66,3 +65,3 @@
function removeExtension(str) {
// debugger;
const arr = str.split('.');
@@ -82,3 +81,3 @@ if (arr.length == 1) {
function deduceFileNameFromUrl(url) {
// debugger;
const baseName = sanitize(path.basename(url));
@@ -85,0 +84,0 @@ return baseName;
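
This module deduces a file name for a download: the Content-Disposition header wins, and the URL path (query string removed) is the fallback. A hedged sketch of that order (the regex and helper name are illustrative, not the library's exact code):

    function deduceFileName(url, headers = {}) {
        const cd = headers['content-disposition'] || headers['Content-Disposition'];
        if (cd && cd.includes('filename=')) {
            // e.g. 'attachment; filename="report.pdf"'
            const match = /filename="?([^";]+)"?/.exec(cd);
            if (match) return match[1];
        }
        // Fall back to the last URL path segment, ignoring the query string.
        return new URL(url).pathname.split('/').pop();
    }

    // deduceFileName('https://example.com/files/a.pdf?v=2') -> 'a.pdf'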

file_downloader/index.js

@@ -1,4 +0,2 @@
counter = 0;
const {request} = require('../request/request.js');
// const sanitize = require('sanitize-filename');
const { request } = require('../request/request.js');
const path = require('path');
@@ -12,8 +10,8 @@ const FileProcessor = require('./file_processor');
const getFileNameFromResponse = require('./fileNameFromResponse')
// var mime = require('mime-types')
class FileDownloader {
constructor({ url, shouldBufferResponse = false, directory, cloneFiles, auth, timeout, headers, proxy }) {
constructor({ url, shouldBufferResponse = false, directory, cloneFiles, auth, timeout, headers, proxy }) {
this.url = url;
@@ -31,3 +29,3 @@ this.directory = directory;
async download() {
// debugger;
try {
@@ -45,3 +43,2 @@
})
this.response = response;
@@ -51,16 +48,13 @@ this.data = response.data;
} catch (error) {
throw error;
}
}
}
getFileNameData() {
const originalFileName = getFileNameFromResponse(this.url, this.response.headers);
const originalFileName = getFileNameFromResponse(this.url,this.response.headers);
let finalFileName;
let finalFileName;
const fileProcessor = new FileProcessor({ fileName: originalFileName, path: this.directory });
@@ -74,4 +68,2 @@ if (this.cloneFiles) {
const initialFileNameExists = fileProcessor.didInitialFileNameExist();
if (initialFileNameExists) counter++
// console.log('initialFileNameExists',counter)
@@ -88,8 +80,5 @@ return {//Return an object with both the "original"(deduced from the URL and headers) file name, and the final one
async save() {
// debugger;
try {
// debugger;
const { originalFileName, finalFileName, initialFileNameExists } = this.getFileNameData();
const { finalFileName } = this.getFileNameData();
if (this.shouldBufferResponse) {
@@ -103,6 +92,4 @@
}
}
catch (error) {
// debugger;
throw error
@@ -109,0 +96,0 @@ }
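
Based on the constructor options and the download()/save() split visible above, using the downloader presumably looks like this (the URL, directory, and module path are placeholders):

    // Inside the library this module is required as '../file_downloader'.
    const FileDownloader = require('./file_downloader');

    async function fetchImage() {
        const downloader = new FileDownloader({
            url: 'https://example.com/image.jpg', // illustrative URL
            directory: './downloads',
            cloneFiles: true,              // keep duplicates as image1.jpg, image2.jpg...
            shouldBufferResponse: false,   // stream to disk instead of buffering
            timeout: 10000,
        });
        await downloader.download(); // performs the request
        await downloader.save();     // resolves the final file name and writes it
    }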

index.js

@@ -5,4 +5,2 @@ const CollectContent = require('./operations/CollectContent'),
Root = require('./operations/Root'),
// ScrollToBottom = require('./limitedSpa/ScrollToBottom'),
// ClickButton = require('./limitedSpa/ClickButton'),
Scraper = require('./Scraper.js');
@@ -16,4 +14,2 @@
CollectContent,
// ScrollToBottom,
// ClickButton
};
@@ -20,0 +16,0 @@
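
For orientation, the exported operations compose roughly like this (a minimal sketch assuming the library's documented baseSiteUrl/startUrl config; selectors and URLs are placeholders):

    const { Scraper, Root, OpenLinks, CollectContent } = require('nodejs-web-scraper');

    (async () => {
        const scraper = new Scraper({
            baseSiteUrl: 'https://example.com',
            startUrl: 'https://example.com/blog/',
        });
        const root = new Root();
        const posts = new OpenLinks('article a.title', { name: 'posts' });
        const headline = new CollectContent('h1', { name: 'headline' });

        root.addOperation(posts);     // follow each post link...
        posts.addOperation(headline); // ...and collect its <h1> text
        await scraper.scrape(root);
        console.log(root.getData());
    })();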

operations/CollectContent.js

const Operation = require('./Operation')
var cheerio = require('cheerio');
// var cheerioAdv = require('cheerio-advanced-selectors')
// cheerio = cheerioAdv.wrap(cheerio)
// const fs = require('fs')
const { createElementList, getNodeContent } = require('../utils/cheerio')
// const { CustomResponse } = require('../request/request')
@@ -13,3 +9,3 @@ class CollectContent extends Operation {
*
* @param {string} querySelector cheerio-advanced-selectors selector
* @param {string} querySelector
* @param {Object} [config]
@@ -28,3 +24,2 @@ * @param {string} [config.name = 'Default CollectContent name']
this.querySelector = querySelector;
// this.validateOperationArguments();
if (typeof this.config.shouldTrim !== 'undefined') {//Checks if the user passed a "shouldTrim" property.
@@ -51,9 +46,2 @@ this.config.shouldTrim = this.config.shouldTrim;
async scrape({ html, url }) {
// this.scraper.log('colelcting content',url)
// debugger;
// if(this.config.name === 'videoLabel')
// debugger;
// const arr = url.split('/');
// const fileName = arr[arr.length-1]
// fs.writeFile(`${this.scraper.config.logPath}/${fileName}.html`,html,()=>{})
@@ -64,10 +52,6 @@ const parentAddress = url
this.config.contentType = this.config.contentType || 'text';
// !responseObjectFromParent && this.scraper.log('Empty response from content operation', responseObjectFromParent)
// debugger;
var $ = cheerio.load(html);
const elementList = await createElementList($, this.querySelector, { condition: this.config.condition, slice: this.config.slice });
// if(this.config.name === 'videoLabel')
// this.scraper.log(url,' Number of video elements: ',elementList.length)
if (this.config.getElementList) {
@@ -91,3 +75,2 @@ await this.config.getElementList(elementList, parentAddress);
if (this.config.getAllItems) {
// await this.config.afterScrape(currentWrapper);
await this.config.getAllItems(iterations, parentAddress);
@@ -94,0 +77,0 @@ }
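
The config keys touched in this hunk (contentType, shouldTrim, getElementList, getAllItems, plus condition and slice) suggest a usage like the following sketch; the selector and callback bodies are illustrative:

    const description = new CollectContent('p.description', {
        name: 'description',
        contentType: 'text',   // 'text' (default) or 'html'
        shouldTrim: true,      // trim whitespace from each collected value
        getElementList: (elementList, pageAddress) => {
            // Called per page with the matched cheerio elements.
            console.log(`${elementList.length} matches on ${pageAddress}`);
        },
        getAllItems: (items, pageAddress) => {
            // Called once per page with everything collected there.
            console.log(items);
        },
    });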

operations/DownloadContent.js

const HttpOperation = require('./HttpOperation');
var cheerio = require('cheerio')
// var cheerioAdv = require('cheerio-advanced-selectors');
// cheerio = cheerioAdv.wrap(cheerio);
const fs = require('fs');
@@ -9,5 +7,3 @@ const { promisify } = require('util');
const Downloader = require('../file_downloader')
// const Downloader = require('nodejs-file-downloader')
const FileProcessor = require('../file_downloader/file_processor');
// const FileProcessor = require('nodejs-file-downloader/FileProcessor.js');
const crypto = require('crypto')
@@ -38,8 +34,6 @@ const { verifyDirectoryExists } = require('../utils/files')
constructor(querySelector, config = {}) {
// debugger;
super(config);
this.querySelector = querySelector;
// this.overridableProps = ['filePath', 'fileFlag', 'imageResponseType'];
this.overridableProps = ['filePath'];
@@ -76,3 +70,3 @@ for (let prop in config) {
*/
async scrape({html,url}) {
async scrape({ html, url }) {
// debugger;
@@ -94,4 +88,3 @@ if (!this.directoryVerified) {
}
// debugger;config.
const fileRefs = this.getFileRefs(url,elementList,baseUrlFromBaseTag)
const fileRefs = this.getFileRefs(url, elementList, baseUrlFromBaseTag)
@@ -117,3 +110,3 @@
*/
getFileRefs(url,elementList,baseUrlFromBaseTag){
getFileRefs(url, elementList, baseUrlFromBaseTag) {
const fileRefs = []
@@ -140,3 +133,3 @@ elementList.forEach((element) => {
const absoluteUrl = getAbsoluteUrl(baseUrlFromBaseTag || url, src);
fileRefs.push(absoluteUrl);
fileRefs.push(absoluteUrl);
@@ -180,6 +173,4 @@ })
const fileProcessor = new FileProcessor({fileName: `${fileName}.${extension}`, path: this.config.filePath || this.scraper.config.filePath });
const fileProcessor = new FileProcessor({ fileName: `${fileName}.${extension}`, path: this.config.filePath || this.scraper.config.filePath });
if (this.scraper.config.cloneFiles) {
// debugger;
// fileName = await fileProcessor.getAvailableFileName();
fileName = fileProcessor.getAvailableFileName();
@@ -193,3 +184,2 @@ } else {
// this.scraper.log('images:', this.scraper.state.downloadedFiles)
@@ -209,3 +199,3 @@ }
const options = {
@@ -218,6 +208,4 @@ url,
timeout: this.scraper.config.timeout,
// timeout: 150,
headers: this.scraper.config.headers,
proxy: this.scraper.config.proxy,
// useSynchronousMode:true
@@ -224,0 +212,0 @@ }

operations/helpers/PageHelper.js

@@ -1,2 +0,1 @@
// const Operation = require('../Operation');//For jsdoc
const { request } = require('../../request/request.js');
@@ -7,6 +6,3 @@ const { stripTags } = require('../../utils/html');
const { getPaginationUrls } = require('../../utils/pagination');
// const { CustomResponse } = require('../../request/request')//For jsdoc
// const SPA_page = require('../../limitedSpa/SPA_Page');
// require('../typedef.js');
@@ -24,4 +20,2 @@ class PageHelper {
/**
@@ -34,3 +28,3 @@ *
async processOneIteration(href, shouldPaginate) {//Will process one scraping object, including a pagination object. Used by Root and OpenLinks.
// debugger;
if (shouldPaginate) {//If the scraping object is actually a pagination one, a different function is called.
@@ -48,8 +42,7 @@ return this.paginate(href);
var response = await this.getPage(href);
// debugger
await this.runAfterResponseHooks(response)
// debugger;
var dataFromChildren = await this.Operation.scrapeChildren(this.Operation.operations, {html:response.data,url:response.url})
var dataFromChildren = await this.Operation.scrapeChildren(this.Operation.operations, { html: response.data, url: response.url })
await this.runGetPageObjectHook(href, dataFromChildren)
@@ -60,4 +53,3 @@
catch (error) {
// this.Operation.scraper.log(error)
// debugger;
const errorString = `There was an error opening page ${href}, ${error}`;
@@ -128,3 +120,3 @@ iteration.error = errorString;
if (this.Operation.config.getPageHtml) {
// debugger;
await this.Operation.config.getPageHtml(resp.data, resp.url)
@@ -134,3 +126,3 @@ }
} catch (error) {
// debugger;
throw error;
@@ -155,14 +147,10 @@ }
if (this.Operation.config.getPageObject) {
// debugger;
const tree = {
}
const tree = {}
for (let child of dataFromChildren) {
// debugger;
// tree[child.name] = child.data
const func = getDictionaryKey(child.name);
tree[func(child.name, tree)] = child.data
}
await this.Operation.config.getPageObject(tree,address)
await this.Operation.config.getPageObject(tree, address)
}
@@ -182,3 +170,3 @@ }
if (this.Operation.config.getPageResponse) {//If a "getResponse" callback was provided, it will be called
// debugger;
if (typeof this.Operation.config.getPageResponse !== 'function')
@@ -185,0 +173,0 @@ throw "'getPageResponse' callback must be a function";

operations/HttpOperation.js

const Operation = require('./Operation');
var cheerio = require('cheerio')
// var cheerioAdv = require('cheerio-advanced-selectors');
// cheerio = cheerioAdv.wrap(cheerio);
const { createDelay } = require('../utils/delay');
@@ -17,4 +14,2 @@ const rpur = require('../utils/rpur')
// this.virtualOperations = [];//Will hold "virtual operations" performed by Puppeteer, which are out of the normal scraping flow.
if (this.condition) {
@@ -28,3 +23,2 @@ const type = typeof this.condition;
}
this.counter = 0;
@@ -46,15 +40,15 @@
async repeatPromiseUntilResolved(promiseFactory, href) {
// debugger;
const maxAttempts = this.scraper.config.maxRetries + 1;//Note that "maxRetries refers" to the number of retries, whereas
//"maxAttempts" is the overall number of iterations, therefore adding 1.
const shouldStop = (error) => {
// debugger;
const errorCode = error.response ? error.response.status : error
if (this.scraper.config.errorCodesToSkip.includes(errorCode)) {
// debugger;
const error = new Error();
error.message = `Skipping error ${errorCode}`;
// debugger;
error.code = errorCode;
@@ -69,3 +63,2 @@ return true
this.scraper.log(`Retrying failed promise...error: ${error}, 'href:' ${href}`);
// this.scraper.log('Retrying failed promise...error:', error);
const newRetries = retries + 1;
@@ -76,3 +69,2 @@ this.scraper.log(`Retreis ${newRetries}`)
// return await this.qyuFactory(() => this.repeatPromiseUntilResolved(promiseFactory, url));
@@ -121,3 +113,2 @@ return await rpur(promiseFactory, { maxAttempts, shouldStop, onError, timeout: 0 });
let currentSpacer = this.scraper.requestSpacer;
// this.scraper.requestSpacer = currentSpacer.then(() => Promise.delay(this.scraper.config.delay));
this.scraper.requestSpacer = currentSpacer.then(() => createDelay(this.scraper.config.delay));
@@ -131,3 +122,2 @@ await currentSpacer;
}
// Object.assign(HttpOperation.prototype,PageMixin)
module.exports = HttpOperation;

operations/mixins/CompositeInjectMixin.js

@@ -8,18 +8,15 @@
injectScraper: function (ScraperInstance) {//Override the original init function of Operation
this.scraper = ScraperInstance;
// debugger;
ScraperInstance.registerOperation(this);
for (let operation of this.operations) {
operation.injectScraper(ScraperInstance);
}
this.validateOperationArguments();
},
};
module.exports = CompositeInjectMixin;
injectScraper: function (ScraperInstance) {//Override the original init function of Operation
this.scraper = ScraperInstance;
ScraperInstance.registerOperation(this);
for (let operation of this.operations) {
operation.injectScraper(ScraperInstance);
}
this.validateOperationArguments();
},
};
module.exports = CompositeInjectMixin;

operations/mixins/CompositeScrapeMixin.js

@@ -6,16 +6,13 @@
*/
const CompositeScrapeMixin = {
const CompositeScrapeMixin = {
scrapeChildren: async function (childOperations, {url,html}) {//Scrapes the child operations of this OpenLinks object.
scrapeChildren: async function (childOperations, { url, html }) {//Scrapes the child operations of this OpenLinks object.
// debugger;
const scrapedData = []
for (let operation of childOperations) {
// const dataFromChild = await operation.scrape(responseObjectFromParent);
const dataFromChild = await operation.scrape({url,html});
const dataFromChild = await operation.scrape({ url, html });
scrapedData.push(dataFromChild);
}
// responseObjectFromParent = null;
return scrapedData;
@@ -22,0 +19,0 @@ }

operations/OpenLinks.js

const HttpOperation = require('./HttpOperation');
const CompositeInjectMixin = require('./mixins/CompositeInjectMixin');
const CompositeScrapeMixin = require('./mixins/CompositeScrapeMixin');
// const Operation = require('./Operation')//For jsdoc
var cheerio = require('cheerio')
// var cheerioAdv = require('cheerio-advanced-selectors');
// cheerio = cheerioAdv.wrap(cheerio);
const { getBaseUrlFromBaseTag, createElementList } = require('../utils/cheerio');
const { getAbsoluteUrl } = require('../utils/url');
const PageHelper = require('./helpers/PageHelper');
// const SPA_PageHelper = require('./helpers/SPA_PageHelper');
// const { CustomResponse } = require('../request/request');//For jsdoc
const { mapPromisesWithLimitation } = require('../utils/concurrency');
@@ -45,6 +40,3 @@
super(config);
// this.pageHelper = new PageHelper(this);
this.pageHelper = null;
// this.compositeHelper = new CompositeHelper(this);
// this.virtualOperations = []
this.operations = [];//References to child operation objects.
@@ -67,3 +59,2 @@ this.querySelector = querySelector;
addOperation(Operation) {
// this._addOperation(Operation);
this.operations.push(Operation)
@@ -73,7 +64,3 @@ }
initPageHelper() {
if (!this.scraper.config.usePuppeteer) {
this.pageHelper = new PageHelper(this)
} else {
this.pageHelper = new SPA_PageHelper(this);
}
this.pageHelper = new PageHelper(this)
}
@@ -96,3 +83,3 @@
this.initPageHelper();
// debugger;
const refs = await this.createLinkList(html, url)
@@ -105,3 +92,3 @@
}
// debugger;
const shouldPaginate = this.config.pagination ? true : false;
@@ -111,3 +98,3 @@ const iterations = [];
await mapPromisesWithLimitation(refs, async (href) => {
// debugger;
const data = await this.pageHelper.processOneIteration(
@@ -133,5 +120,5 @@ this.transformHref(href),
async createLinkList(html, url) {
// debugger;
var $ = cheerio.load(html);
// debugger;
const elementList = await createElementList($, this.querySelector, {
@@ -138,0 +125,0 @@ condition: this.config.condition,

operations/Operation.js

// const Scraper = require('../Scraper');//For jsdoc
/**
* Base class for all operations(not including limitedSpa).
* Base class for all operations
* Every Operation must implement its own scrape() method.
@@ -12,3 +11,2 @@ */
this.config = {}
@@ -24,4 +22,2 @@ if (objectConfig) {
// this.querySelector = querySelector;
// this.config.name = this.getOperationName(this.config.name);
this.scraper = null; //Scraper instance is passed later on.
@@ -31,7 +27,5 @@ this.data = []; //Holds all data collected by this operation, in the form of possibly multiple "ScrapingWrappers".
}
}
/**
@@ -46,7 +40,5 @@ * Being that all Operation objects are created independetly from the Scraper, a Scraper reference must be passed to them.
this.handleNewOperationCreation(this)
// debugger;
this.handleNewOperationCreation(this)
this.validateOperationArguments();//Implemented by all Operation objects
}
@@ -70,7 +62,4 @@
handleFailedScrapingIteration(errorString) {
// handleFailedScrapingIteration(error) {
this.scraper.log(errorString);
// scrapingAction.setError(errorString, errorCode)
this.scraper.reportFailedScrapingAction(errorString);
}
@@ -87,3 +76,2 @@
/**
@@ -93,4 +81,3 @@ * Get the entire data collected by this operation
*/
getData() {
// debugger;
getData() {
return this.data;
@@ -100,4 +87,2 @@ }
/**
@@ -110,8 +95,5 @@ * @return {string[]}
}
module.exports = Operation;

operations/Root.js

@@ -5,5 +5,3 @@ const HttpOperation = require('./HttpOperation');
const PageHelper = require('./helpers/PageHelper');
// const SPA_PageHelper = require('./helpers/SPA_PageHelper');
/**
@@ -30,4 +28,2 @@ *
this.operations = [];//References to child operation objects.
// this.virtualOperations = []
// this.pageHelper = new PageHelper(this);
this.pageHelper = null;
@@ -41,3 +37,2 @@ }
addOperation(Operation) {
// this._addOperation(Operation);
this.operations.push(Operation)
@@ -48,7 +43,3 @@ }
initPageHelper() {
if (!this.scraper.config.usePuppeteer) {
this.pageHelper = new PageHelper(this)
}else{
this.pageHelper = new SPA_PageHelper(this);
}
this.pageHelper = new PageHelper(this)
}
@@ -64,6 +55,6 @@
const shouldPaginate = this.config.pagination ? true : false;
// debugger;
const data = await this.pageHelper.processOneIteration(this.scraper.config.startUrl, shouldPaginate);
// debugger;
this.data = data
@@ -80,4 +71,3 @@ if (this.config.getPageData) {
*/
getErrors() {
// debugger;
getErrors() {
let errors = [...this.errors];
@@ -93,11 +83,5 @@
validateOperationArguments() {
// return;
}
}
Object.assign(Root.prototype, CompositeInjectMixin)
@@ -104,0 +88,0 @@ Object.assign(Root.prototype, CompositeScrapeMixin)

package.json

{
"name": "nodejs-web-scraper",
"version": "6.1.0",
"version": "6.1.1",
"description": "A web scraper for NodeJs",
@@ -5,0 +5,0 @@ "main": "index.js",

request/request.js

const fetch = require('node-fetch')
// const fetch = require('./fetch.js')
var HttpsProxyAgent = require('https-proxy-agent');
// import AbortController from 'abort-controller';
// const Signal = require('./signal.js')
@@ -13,3 +10,2 @@ function createInstance(config) {
function request(config) {
// throw 'error from request.request';
return createInstance(config).getFinalResponseObject();
@@ -20,12 +16,9 @@ }
// module.exports = class Request {
class Request {
constructor(config) {
this.originalResponse = null;//Original response object from fetch.
constructor(config) {
// debugger;
this.originalResponse = null;//Original response object from fetch.
// debugger;
// this.abortController = new AbortController()
const defaultConfig = {
@@ -35,3 +28,2 @@ method: 'GET',
headers: null,
// signal: new Signal(),
proxy: null,//Proxy string
@@ -42,6 +34,6 @@ responseType: 'text',//'text','json' or 'stream'. If 'stream' is chosen, the stream itself is returned.
}
// debugger;
if(config.headers){
if (config.headers) {
const isEmpty = Object.keys(config.headers).length === 0 && config.headers.constructor === Object;
if(isEmpty){
if (isEmpty) {
config.headers = null;
@@ -59,13 +51,11 @@ }
}
// debugger;
}
if (this.config.proxy) {
// this.config.agent = getAgent(this.config.proxy);
this.config.agent = new HttpsProxyAgent(this.config.proxy)
}
// debugger;
}
@@ -75,9 +65,8 @@
// controller.abort()
const url = config.url;
// debugger;
const response = await fetch(url, config);
// debugger;
this.originalResponse = response;
return response;
@@ -89,11 +78,8 @@ }
getRequestHeaders() {
// debugger;
// console.log(this)
return {
"Accept-Encoding": "gzip,deflate",
// 'User-Agent': "node-fetch/1.0 (+https://github.com/bitinn/node-fetch)",
'User-Agent': "node-fetch/1.0",
"Accept": "*/*",
...this.config.headers,
}
@@ -106,3 +92,2 @@ }
case 'text':
// debugger;
data = await fetchResponse.text();
@@ -113,3 +98,3 @@
case 'buffer':
// debugger;
data = await fetchResponse.buffer();
@@ -134,3 +119,3 @@
const requestHeaders = this.getRequestHeaders()
// debugger;
return new CustomResponse({
@@ -148,10 +133,5 @@ config: { ...this.config, headers: requestHeaders },
handleStatusCodes(customResponse) {
// debugger;
// const {status} = fetchResponse
// const response= this.createCustomResponseObjectFromFetchResponse(fetchResponse);
if (customResponse.status >= 400) {
const error = new CustomError({ code: customResponse.status, response: customResponse, message: `Server responded with ${customResponse.status}` })
// debugger;
throw error;
}
@@ -161,3 +141,3 @@ }
createCustomErrorFromFetchError(fetchError) {//Fetch errors are thrown only for network errors. There is no actual "response".
// debugger;
const error = new CustomError({ errno: fetchError.errno, message: fetchError.message })
@@ -169,8 +149,6 @@ return error;
async getFinalResponseObject() {
// throw 'error from request.getFinalResponseObject'
// console.log(this.config)
try {
var response = await this.performRequest(this.config);
} catch (fetchError) {//Network error has ocurred.
// debugger;
const error = this.createCustomErrorFromFetchError(fetchError)
@@ -189,3 +167,2 @@ throw error;
}
@@ -203,4 +180,2 @@ }
this.headers = headers
// this.aborted = false
// this.signal = new Signal();
}
@@ -213,6 +188,5 @@
class CustomError extends Error {
// debugger;
constructor({ code, response, message, errno }) {
super(message)
// this.config = config;//The config object of the failing request
this.errno = errno//Error constant. Will be set Only in the case of network errors.
@@ -224,6 +198,3 @@ this.code = code;//http code.Null if network error
// module.exports.default = request;
module.exports = {request, CustomResponse};
// module.exports.Request = Request;
module.exports = { request, CustomResponse };
// debugger;
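
request() wraps node-fetch and resolves to a CustomResponse, throwing a CustomError for network failures and for statuses >= 400 (see handleStatusCodes above). Based on the defaultConfig keys in this file, a call presumably looks like:

    const { request } = require('./request/request.js');

    async function getPage() {
        const response = await request({
            url: 'https://example.com',  // illustrative
            method: 'GET',
            responseType: 'text',        // 'text', 'json' or 'stream' per the comment above
            timeout: 10000,
            proxy: null,                 // e.g. 'http://user:pass@host:port'
            headers: { 'User-Agent': 'my-scraper' },
        });
        // CustomResponse exposes at least data, headers and status (see above).
        console.log(response.status, response.data.length);
    }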

Scraper.js

@@ -7,5 +7,2 @@
const { deepSpread } = require('./utils/objects')
// const { Root } = require('./');//For jsdoc
// const PathQueue = require('./utils/PathQueue');
// const PuppeteerSimple = require('puppeteer-simple').default
@@ -16,4 +13,2 @@ /**
*/
class Scraper {
@@ -58,17 +53,5 @@
showConsoleLogs: true,
usePuppeteer: false,//Deprecated
puppeteerDebugMode: false,//For debugging
puppeteerConfig: {
// headless: false,
// args:[],
defaultViewport: null,
timeout: 40000,//40 seconds for full page load(network idle)
waitUntil: 'networkidle0',
...globalConfig.puppeteerConfig
},
onError: null //callback runs whenever any error occurs during scraping
}
// this.state = new State();
this.state = {
// existingUserFileDirectories: [],
failedScrapingIterations: [],
@@ -82,9 +65,5 @@ downloadedFiles: 0,
this.validateGlobalConfig(globalConfig);
deepSpread(this.config, globalConfig)
this.validateGlobalConfig(globalConfig);
deepSpread(this.config,globalConfig)
// debugger;
this.config.errorCodesToSkip = [404, 403, 400];
@@ -94,17 +73,7 @@
this.requestSpacer = Promise.resolve();
// debugger;
if (this.config.usePuppeteer) {
throw new Error('usePuppeteer is deprecated since version 5. If you need it, downgrade to version 4.2.2')
// debugger;
const puppeteerConfig = this.config.puppeteerConfig;
// debugger
// const { args,headless } = puppeteerConfig;
this.puppeteerSimple = new PuppeteerSimple({ ...puppeteerConfig})
this.isBrowserReady = this.puppeteerSimple.createBrowser();
}
// this.pathQueue = new PathQueue();
this.referenceToRoot = null;
@@ -126,8 +95,3 @@
getPuppeteerSimpleInstance() {
return this.puppeteerSimple;
}
validateGlobalConfig(conf) {
@@ -142,4 +106,2 @@ if (!conf || typeof conf !== 'object')
/**
@@ -155,5 +117,3 @@ * Starts the recursive scraping process
this.referenceToRoot = rootObject;
// debugger;
// rootObject.injectScraper(this)
// debugger;
rootObject.injectScraper(this)
@@ -173,3 +133,2 @@
}
// this.outPutErrors();
if (this.config.logPath) {
@@ -182,21 +141,4 @@ try {
}
// this.log('global.counter of alternative src ',global.counter)
this.log(`overall files: ${this.state.downloadedFiles}`)
if (this.config.usePuppeteer) {
// setTimeout(()=>{
if (!this.config.puppeteerDebugMode) {
try {
await this.puppeteerSimple.close()
} catch (error) {
this.log('Error shutting down puppeteer',error)
}
}
// },1000)
}
}
@@ -209,3 +151,2 @@
areThereRepeatableErrors() {
// debugger;
return this.state.failedScrapingIterations.length > 0;
@@ -221,3 +162,3 @@ }
this.state.failedScrapingIterations.push(errorString);
if(this.config.onError) this.config.onError(errorString);
if (this.config.onError) this.config.onError(errorString);
}
@@ -233,8 +174,5 @@
saveFile(data, fileName) {
// verifyDirectoryExists(this.config.logPath);
return new Promise(async (resolve, reject) => {
await verifyDirectoryExists(this.config.logPath);
// this.log('saving file')
// debugger;
fs.writeFile(path.join(this.config.logPath, `${fileName}.json`), JSON.stringify(data), (error) => {
@@ -247,3 +185,2 @@ if (error) {
}
});
@@ -259,3 +196,3 @@
async createLogs() {
// debugger;
for (let operation of this.state.registeredOperations) {
@@ -267,3 +204,2 @@ const fileName = operation.constructor.name === 'Root' ? 'log' : operation.config.name;
await this.createLog({ fileName: 'finalErrors', data: this.state.failedScrapingIterations })
// await this.createLog({ fileName: 'allErrors', data: this.referenceToRoot.getErrors() })
}
@@ -294,5 +230,5 @@
module.exports = Scraper;
// debugger;
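
The surviving Scraper options, including the new onError hook, would be wired up roughly like this (a sketch; baseSiteUrl comes from the library's docs rather than this diff, and the values are illustrative):

    const scraper = new Scraper({
        baseSiteUrl: 'https://example.com',
        startUrl: 'https://example.com',
        logPath: './logs',   // enables the JSON log files written by createLogs()
        cloneFiles: true,
        delay: 200,          // ms between requests (see requestSpacer)
        maxRetries: 3,       // 3 retries = 4 attempts overall
        showConsoleLogs: true,
        onError: (errorString) => {
            // Runs for every failed scraping iteration (see reportFailedScrapingAction).
            console.error(errorString);
        },
    });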

utils/cheerio.js

@@ -7,6 +7,6 @@
*/
function getBaseUrlFromBaseTag($,baseSiteUrl) {
function getBaseUrlFromBaseTag($, baseSiteUrl) {
let baseMetaTag = $('base');
// debugger;
if (baseMetaTag.length == 0 || baseMetaTag.length > 1) {
@@ -38,4 +38,4 @@ baseMetaTag = null;
*/
function createNodeList($,querySelector,config={}) {//Gets a cheerio object and creates a nodelist.
const {slice}=config;
function createNodeList($, querySelector, config = {}) {//Gets a cheerio object and creates a nodelist.
const { slice } = config;
const nodeList = slice ? $(querySelector).slice(typeof slice === 'number' ? slice : slice[0], slice[1]) : $(querySelector);
@@ -55,5 +55,5 @@
*/
async function createElementList($,querySelector,config={}) {
const {condition,slice} = config
const nodeList = Array.from(createNodeList($,querySelector,{slice}));
async function createElementList($, querySelector, config = {}) {
const { condition, slice } = config
const nodeList = Array.from(createNodeList($, querySelector, { slice }));
const elementList = [];
@@ -84,4 +84,4 @@ for (let node of nodeList) {
*/
function getNodeContent(elem,config={}) {
const {contentType,shouldTrim} = config;
function getNodeContent(elem, config = {}) {
const { contentType, shouldTrim } = config;
const getText = () => shouldTrim ? elem.text().trim() : elem.text();//Will trim the string, if "shouldTrim" is true.
@@ -94,4 +94,3 @@ switch (contentType) {
default:
return getText();;
return getText();
}
@@ -101,3 +100,3 @@ }
module.exports ={
module.exports = {
getBaseUrlFromBaseTag,
@@ -104,0 +103,0 @@ createNodeList,
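
createElementList and getNodeContent can also be used directly. A hedged sketch, assuming condition receives each cheerio-wrapped element as the operations above suggest:

    const cheerio = require('cheerio');
    const { createElementList, getNodeContent } = require('./utils/cheerio');

    async function collectPrices(html) {
        const $ = cheerio.load(html);
        const elements = await createElementList($, 'span.price', {
            slice: [0, 10],                                  // first ten matches
            condition: (elem) => elem.text().includes('$'),  // keep only matching nodes
        });
        return elements.map((elem) =>
            getNodeContent(elem, { contentType: 'text', shouldTrim: true }));
    }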

utils/files.js

const fs = require('fs');
// const {promisify} = require('util');
// const access = promisify(fs.access);
// const mkdir = promisify(fs.mkdir);
function verifyDirectoryExists(path) {//Will make sure the target directory exists.
if (!fs.existsSync(path)) {
// console.log('creating dir:', path)
fs.mkdirSync(path,{recursive:true});
fs.mkdirSync(path, { recursive: true });
}
}
// async function verifyDirectoryExists(path){
// try {
// await access(path);
// } catch (error) {
// console.log('error from verify',error)
// try {
// await mkdir(path);
// } catch (error) {
// if(error.code !== 'EEXIST'){
// throw error;
// }
// // debugger
// }
// }
// }
function verifyDirectoryExistsAsync(path) {
function verifyDirectoryExistsAsync(path){
return new Promise((resolve, reject) => {
fs.access(path, (err) => {
if (err) {
fs.mkdir(path, { recursive: true }, (err) => {
resolve();
})
} else {
resolve();
}
})
})
return new Promise((resolve,reject)=>{
fs.access(path,(err)=>{
// debugger;
if(err){
fs.mkdir(path,{recursive:true},(err)=>{
// debugger;
resolve();
})
}else{
resolve();
}
})
})
}
@@ -53,0 +27,0 @@

utils/html.js

@@ -8,8 +8,8 @@
function stripTags(html) {//Cleans the html string from script and style tags.
let clean;
clean = html.replace(/<\s*script[^>]*>[\s\S]*?(<\s*\/script[^>]*>|$)/ig, '');
clean = clean.replace(/<style[^>]*>[\s\S]*?(<\/style[^>]*>|$)/ig, '');
return clean;
let clean;
clean = html.replace(/<\s*script[^>]*>[\s\S]*?(<\s*\/script[^>]*>|$)/ig, '');
clean = clean.replace(/<style[^>]*>[\s\S]*?(<\/style[^>]*>|$)/ig, '');
return clean;
}
@@ -16,0 +16,0 @@
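
The two regexes drop script and style blocks, including unterminated ones (note the `|$` alternative), before links are extracted. For example:

    stripTags('<p>hi</p><script>alert(1)</script><style>p{}</style>');
    // -> '<p>hi</p>'
    stripTags('<a href="/x">x</a><script>var a = 1;'); // unterminated script
    // -> '<a href="/x">x</a>'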

utils/objects.js

@@ -1,2 +0,1 @@
/**
@@ -10,3 +9,2 @@ * To prevent dictionary key-collision, get a number-appended key. Returns a higher order function, to preserve original key.
if (!dictionary[keyName]) {
// console.log('new file name', newFileName)
return keyName;
@@ -34,15 +32,14 @@ }
*/
function deepSpread(originalObject,secondaryObject){
// debugger
if(!originalObject)
function deepSpread(originalObject, secondaryObject) {
if (!originalObject)
originalObject = {}
for(let prop in secondaryObject){
if(typeof secondaryObject[prop] === 'object' && !Array.isArray(secondaryObject[prop])){
// debugger;
deepSpread(originalObject[prop],secondaryObject[prop]);
}else{
originalObject[prop] = secondaryObject[prop]
for (let prop in secondaryObject) {
if (typeof secondaryObject[prop] === 'object' && !Array.isArray(secondaryObject[prop])) {
deepSpread(originalObject[prop], secondaryObject[prop]);
} else {
originalObject[prop] = secondaryObject[prop]
}
}
@@ -49,0 +46,0 @@ }
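
deepSpread merges the user's config into the defaults recursively, which is why nested objects are extended rather than replaced wholesale as Object.assign would do:

    const defaults = { timeout: 6000, headers: { Accept: '*/*' } };
    deepSpread(defaults, { headers: { 'User-Agent': 'bot' } });
    // defaults is now { timeout: 6000, headers: { Accept: '*/*', 'User-Agent': 'bot' } }
    // Object.assign would have dropped Accept by replacing headers entirely.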

utils/pagination.js

@@ -8,6 +8,4 @@ /**
function getPaginationUrls(address, { numPages, begin, end, offset = 1, queryString, routingString }) {
// const numPages = pagination.numPages;
const firstPage = typeof begin !== 'undefined' ? begin : 1;
const lastPage = end || numPages;
// const offset = offset || 1;
const paginationUrls = []
@@ -14,0 +12,0 @@ for (let i = firstPage; i <= lastPage; i = i + offset) {
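
getPaginationUrls expands a pagination config into concrete page URLs. The query-string form below follows from the parameters shown; the routingString shape is an assumption about the loop body, which is not visible in this hunk:

    getPaginationUrls('https://example.com/items', { queryString: 'page', begin: 1, end: 3 });
    // -> ['https://example.com/items?page=1',
    //     'https://example.com/items?page=2',
    //     'https://example.com/items?page=3']

    getPaginationUrls('https://example.com/items', { routingString: 'page', numPages: 2 });
    // -> something like ['https://example.com/items/page/1',
    //                    'https://example.com/items/page/2']  (assumed shape)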

utils/rpur.js

const {createDelay} = require('./delay')
/**
@@ -15,12 +14,7 @@ *
*/
// async function repeatPromiseUntilResolved(...args) {//Destructuring arguments in order to avoid having the "attempts" counter as part of the API.
module.exports = async function rpur(promiseFactory,config={}) {
// const promiseFactory = args[0]
// const config = args[1]
// const attempts = args[2] || 0
// debugger;
const attempts = arguments[2] || 0
// console.log(attempts)
// const {maxRetries} = config
// debugger;
const dummy = () => false;
@@ -32,18 +26,15 @@ const shouldStop = config.shouldStop || dummy;
try {
// console.log('Attempt number: ',attempts+1)
if (config.onAttempt) {
await config.onAttempt(attempts + 1)
}
// debugger;
const promise = promiseFactory();
const result = await promiseWithTimeout(promise, timeout);
// const result = await promiseFactory();
return result;
} catch (error) {
// debugger;
// console.log('Retrying failed promise');
const newAttempts = attempts + 1;
if (config.onError) {
// debugger;
await config.onError(error, newAttempts)
@@ -55,6 +46,2 @@ }
throw error;
// console.log('Attempts', newAttempts)
// if (newAttempts == maxAttempts) {//If it reached the maximum allowed number of retries, it throws an error.
// throw error;
// }
@@ -73,7 +60,6 @@ if (delay) {
function promiseWithTimeout(promise, time) {
// debugger;
return new Promise(async (resolve, reject) => {
if (time) {
var timeout = setTimeout(() => {
// console.log('timed out!')
reject(new Error('Promise timed out as defined in the config'))
@@ -80,0 +66,0 @@ }, time)
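
rpur ("repeat promise until resolved") retries a promise factory with an optional per-attempt timeout (promiseWithTimeout above) and a stop condition. A usage sketch based on the config keys visible in this file and in HttpOperation; the request call and values are illustrative:

    const rpur = require('./utils/rpur');
    const { request } = require('./request/request.js');

    async function fetchWithRetries() {
        return rpur(() => request({ url: 'https://example.com' }), {
            maxAttempts: 4,   // 3 retries plus the initial attempt
            timeout: 10000,   // per-attempt timeout, enforced by promiseWithTimeout
            delay: 500,       // wait between attempts
            onError: (error, attempt) => console.log(`attempt ${attempt} failed: ${error}`),
            shouldStop: (error) => error.code === 404, // stop retrying on a skippable code
        });
    }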

utils/url.js

@@ -9,6 +9,5 @@
function getAbsoluteUrl(base, relative) {//Handles the absolute URL.
// debugger;
const newUrl = new URL(relative, base).toString();
return newUrl;
}
@@ -15,0 +14,0 @@
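
getAbsoluteUrl is a thin wrapper over WHATWG URL resolution, so relative hrefs collected from a page resolve against the page (or <base>) URL:

    getAbsoluteUrl('https://example.com/blog/post/', '../img/logo.png');
    // -> 'https://example.com/blog/img/logo.png'
    getAbsoluteUrl('https://example.com/a', '//cdn.example.com/x.js');
    // -> 'https://cdn.example.com/x.js'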