Socket
Socket
Sign in · Demo · Install

nodejs-web-scraper

Package Overview
Dependencies
32
Maintainers
1
Versions
81
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 5.0.1 to 5.1.0

20

operations/CollectContent.js

@@ -12,5 +12,5 @@ const Operation = require('./Operation')

/**
*
* @param {string} querySelector cheerio-advanced-selectors selector
* @param {Object} [config]
*
* @param {string} querySelector cheerio-advanced-selectors selector
* @param {Object} [config]
* @param {string} [config.name = 'Default CollectContent name']

@@ -21,5 +21,5 @@ * @param {string} [config.contentType = 'text']

* @param {Function} [config.getElementList = null] Receives an elementList array
* @param {Function} [config.getElementContent = null] Receives elementContentString and pageAddress
* @param {Function} [config.getElementContent = null] Receives elementContentString, pageAddress, and element
* @param {Function} [config.getAllItems = null] Receives all items collected from a specific page. Will run for each page.
*/

@@ -46,5 +46,5 @@ constructor(querySelector, config) {

/**
*
* @param {{url:string,html:string}} params
* @return {Promise<{type:string,name:string,data:[]}>}
*
* @param {{url:string,html:string}} params
* @return {Promise<{type:string,name:string,data:[]}>}
*/

@@ -59,3 +59,3 @@ async scrape({ html, url }) {

// fs.writeFile(`${this.scraper.config.logPath}/${fileName}.html`,html,()=>{})
const parentAddress = url

@@ -82,3 +82,3 @@

if (this.config.getElementContent) {
const contentFromCallback = await this.config.getElementContent(content, parentAddress)
const contentFromCallback = await this.config.getElementContent(content, parentAddress, element)
content = typeof contentFromCallback === 'string' ? contentFromCallback : content;

@@ -85,0 +85,0 @@ }

@@ -17,3 +17,3 @@ const HttpOperation = require('./HttpOperation');

/**
*
*
* @mixes CompositeInjectMixin

@@ -26,16 +26,17 @@ * @mixes CompositeScrapeMixin

/**
*
* @param {string} querySelector cheerio-advanced-selectors selector
*
* @param {string} querySelector cheerio-advanced-selectors selector
* @param {Object} [config]
* @param {string} [config.name = 'Default OpenLinks name']
* @param {Object} [config.pagination = null] Look at the pagination API for more details.
* @param {string} [config.name = 'Default OpenLinks name']
* @param {Object} [config.pagination = null] Look at the pagination API for more details.
* @param {number[]} [config.slice = null]
* @param {Function} [config.condition = null] Receives a Cheerio node. Use this hook to decide if this node should be included in the scraping. Return true or false
* @param {Function} [config.getElementList = null] Receives an elementList array
* @param {Function} [config.getPageData = null]
* @param {Function} [config.getElementList = null] Receives an elementList array
* @param {Function} [config.getPageData = null]
* @param {Function} [config.getPageObject = null] Receives a dictionary of children, and an address argument
* @param {Function} [config.getPageResponse = null] Receives an axiosResponse object
* @param {Function} [config.getPageResponse = null] Receives an axiosResponse object
* @param {Function} [config.getPageHtml = null] Receives htmlString and pageAddress
* @param {Function} [config.getException = null] Listens to every exception. Receives the Error object.
*
* @param {Function} [config.getException = null] Listens to every exception. Receives the Error object.
* @param {(href: string) => string} [config.transformHref = undefined] Callback that receives the href before it is opened.
*
*/

@@ -53,7 +54,14 @@

if (typeof config === 'object' && typeof config.transformHref === 'function') {
this.transformHref = config.transformHref
} else {
this.transformHref = function (href) {
return href
}
}
}
/**
*
* @param {Operation} Operation
*
* @param {Operation} Operation
*/

@@ -68,3 +76,3 @@ addOperation(Operation) {

this.pageHelper = new PageHelper(this)
}else{
} else {
this.pageHelper = new SPA_PageHelper(this);

@@ -81,13 +89,12 @@ }

/**
*
* @param {{url:string,html:string}} params
*
* @param {{url:string,html:string}} params
* @return {Promise<{type:string,name:string,data:[]}>}
*/
async scrape({url,html}) {
async scrape({ url, html }) {
if (!this.pageHelper)
this.initPageHelper();
// debugger;
const refs = await this.createLinkList(html,url)
const refs = await this.createLinkList(html, url)

@@ -105,3 +112,6 @@ const hasOpenLinksOperation = this.operations.filter(child => child.constructor.name === 'OpenLinks').length > 0;//Checks if the current page operation has any other page operations in it. If so, will force concurrency limitation.

// debugger;
const data = await this.pageHelper.processOneIteration(href, shouldPaginate)
const data = await this.pageHelper.processOneIteration(
this.transformHref(href),
shouldPaginate
)

@@ -122,7 +132,10 @@ if (this.config.getPageData)

async createLinkList(html,url) {
async createLinkList(html, url) {
// debugger;
var $ = cheerio.load(html);
// debugger;
const elementList = await createElementList($, this.querySelector, { condition: this.config.condition, slice: this.config.slice });
const elementList = await createElementList($, this.querySelector, {
condition: this.config.condition,
slice: this.config.slice
});
if (this.config.getElementList) {

@@ -145,3 +158,2 @@ await this.config.getElementList(elementList);

}

@@ -152,2 +164,2 @@

module.exports = OpenLinks;
module.exports = OpenLinks;
{
"name": "nodejs-web-scraper",
"version": "5.0.1",
"version": "5.1.0",
"description": "A web scraper for NodeJs",

@@ -5,0 +5,0 @@ "main": "index.js",

Socket · Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc