nodejs-web-scraper
Advanced tools
Comparing version 5.0.1 to 5.1.0
@@ -12,5 +12,5 @@ const Operation = require('./Operation') | ||
/** | ||
* | ||
* @param {string} querySelector cheerio-advanced-selectors selector | ||
* @param {Object} [config] | ||
* | ||
* @param {string} querySelector cheerio-advanced-selectors selector | ||
* @param {Object} [config] | ||
* @param {string} [config.name = 'Default CollectContent name'] | ||
@@ -21,5 +21,5 @@ * @param {string} [config.contentType = 'text'] | ||
* @param {Function} [config.getElementList = null] Receives an elementList array | ||
* @param {Function} [config.getElementContent = null] Receives elementContentString and pageAddress | ||
* @param {Function} [config.getElementContent = null] Receives elementContentString, pageAddress, and element | ||
* @param {Function} [config.getAllItems = null] Receives all items collected from a specific page. Will run for each page. | ||
*/ | ||
@@ -46,5 +46,5 @@ constructor(querySelector, config) { | ||
/** | ||
* | ||
* @param {{url:string,html:string}} params | ||
* @return {Promise<{type:string,name:string,data:[]}>} | ||
* | ||
* @param {{url:string,html:string}} params | ||
* @return {Promise<{type:string,name:string,data:[]}>} | ||
*/ | ||
@@ -59,3 +59,3 @@ async scrape({ html, url }) { | ||
// fs.writeFile(`${this.scraper.config.logPath}/${fileName}.html`,html,()=>{}) | ||
const parentAddress = url | ||
@@ -82,3 +82,3 @@ | ||
if (this.config.getElementContent) { | ||
const contentFromCallback = await this.config.getElementContent(content, parentAddress) | ||
const contentFromCallback = await this.config.getElementContent(content, parentAddress, element) | ||
content = typeof contentFromCallback === 'string' ? contentFromCallback : content; | ||
@@ -85,0 +85,0 @@ } |
@@ -17,3 +17,3 @@ const HttpOperation = require('./HttpOperation'); | ||
/** | ||
* | ||
* | ||
* @mixes CompositeInjectMixin | ||
@@ -26,16 +26,17 @@ * @mixes CompositeScrapeMixin | ||
/** | ||
* | ||
* @param {string} querySelector cheerio-advanced-selectors selector | ||
* | ||
* @param {string} querySelector cheerio-advanced-selectors selector | ||
* @param {Object} [config] | ||
* @param {string} [config.name = 'Default OpenLinks name'] | ||
* @param {Object} [config.pagination = null] Look at the pagination API for more details. | ||
* @param {string} [config.name = 'Default OpenLinks name'] | ||
* @param {Object} [config.pagination = null] Look at the pagination API for more details. | ||
* @param {number[]} [config.slice = null] | ||
* @param {Function} [config.condition = null] Receives a Cheerio node. Use this hook to decide if this node should be included in the scraping. Return true or false | ||
* @param {Function} [config.getElementList = null] Receives an elementList array | ||
* @param {Function} [config.getPageData = null] | ||
* @param {Function} [config.getElementList = null] Receives an elementList array | ||
* @param {Function} [config.getPageData = null] | ||
* @param {Function} [config.getPageObject = null] Receives a dictionary of children, and an address argument | ||
* @param {Function} [config.getPageResponse = null] Receives an axiosResponse object | ||
* @param {Function} [config.getPageResponse = null] Receives an axiosResponse object | ||
* @param {Function} [config.getPageHtml = null] Receives htmlString and pageAddress | ||
* @param {Function} [config.getException = null] Listens to every exception. Receives the Error object. | ||
* | ||
* @param {Function} [config.getException = null] Listens to every exception. Receives the Error object. | ||
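* @param {(href: string) => string} [config.transformHref = undefined] Callback that receives each href before it is opened; the string it returns is used in place of the original href. If omitted or not a function, hrefs are used unchanged. | ||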
* | ||
*/ | ||
@@ -53,7 +54,14 @@ | ||
if (typeof config === 'object' && typeof config.transformHref === 'function') { | ||
this.transformHref = config.transformHref | ||
} else { | ||
this.transformHref = function (href) { | ||
return href | ||
} | ||
} | ||
} | ||
/** | ||
* | ||
* @param {Operation} Operation | ||
* | ||
* @param {Operation} Operation | ||
*/ | ||
@@ -68,3 +76,3 @@ addOperation(Operation) { | ||
this.pageHelper = new PageHelper(this) | ||
}else{ | ||
} else { | ||
this.pageHelper = new SPA_PageHelper(this); | ||
@@ -81,13 +89,12 @@ } | ||
/** | ||
* | ||
* @param {{url:string,html:string}} params | ||
* | ||
* @param {{url:string,html:string}} params | ||
* @return {Promise<{type:string,name:string,data:[]}>} | ||
*/ | ||
async scrape({url,html}) { | ||
async scrape({ url, html }) { | ||
if (!this.pageHelper) | ||
this.initPageHelper(); | ||
// debugger; | ||
const refs = await this.createLinkList(html,url) | ||
const refs = await this.createLinkList(html, url) | ||
@@ -105,3 +112,6 @@ const hasOpenLinksOperation = this.operations.filter(child => child.constructor.name === 'OpenLinks').length > 0;//Checks if the current page operation has any other page operations in it. If so, will force concurrency limitation. | ||
// debugger; | ||
const data = await this.pageHelper.processOneIteration(href, shouldPaginate) | ||
const data = await this.pageHelper.processOneIteration( | ||
this.transformHref(href), | ||
shouldPaginate | ||
) | ||
@@ -122,7 +132,10 @@ if (this.config.getPageData) | ||
async createLinkList(html,url) { | ||
async createLinkList(html, url) { | ||
// debugger; | ||
var $ = cheerio.load(html); | ||
// debugger; | ||
const elementList = await createElementList($, this.querySelector, { condition: this.config.condition, slice: this.config.slice }); | ||
const elementList = await createElementList($, this.querySelector, { | ||
condition: this.config.condition, | ||
slice: this.config.slice | ||
}); | ||
if (this.config.getElementList) { | ||
@@ -145,3 +158,2 @@ await this.config.getElementList(elementList); | ||
} | ||
@@ -152,2 +164,2 @@ | ||
module.exports = OpenLinks; | ||
module.exports = OpenLinks; |
{ | ||
"name": "nodejs-web-scraper", | ||
"version": "5.0.1", | ||
"version": "5.1.0", | ||
"description": "A web scraper for NodeJs", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
115765
2123