webcrawlerapi-js
Advanced tools
Comparing version
20
api.ts
@@ -25,3 +25,4 @@ import {CrawlRequest, Job, JobId, ScrapeRequest, ScrapeResponse} from "./model"; | ||
'Content-Type': 'application/json', | ||
'Authorization': `Bearer ${this.apiKey}` | ||
'Authorization': `Bearer ${this.apiKey}`, | ||
"User-Agent": "WebcrawlerAPI-NodeJS-Client" | ||
}, | ||
@@ -45,3 +46,3 @@ 'body': JSON.stringify(scrapeRequest), | ||
public async scrapeWithMeta(scrapeRequest: ScrapeRequest): Promise<ScrapeResponse> { | ||
public async scrapeWithMeta(scrapeRequest: ScrapeRequest, maxPollingRetries: number = MaxPullRetries): Promise<ScrapeResponse> { | ||
const url = `${this.basePath}/${this.apiVersion}/scrape`; | ||
@@ -65,3 +66,3 @@ | ||
let delayIntervalMs = initialPullDelayMs; | ||
for (let i = 0; i < MaxPullRetries; i++) { | ||
for (let i = 0; i < maxPollingRetries; i++) { | ||
await new Promise(resolve => setTimeout(resolve, delayIntervalMs)); | ||
@@ -79,7 +80,7 @@ const scrapeResult = await this.getScrapeResult(jobIdResponse.id); | ||
} | ||
throw new Error("Scraping took too long, please retry"); | ||
throw new Error("Scraping took too long, please retry or increase the number of polling retries"); | ||
} | ||
public async scrape(scrapeRequest: ScrapeRequest): Promise<ScrapeResponse> { | ||
const scrapeResult = await this.scrapeWithMeta(scrapeRequest); | ||
public async scrape(scrapeRequest: ScrapeRequest, maxPollingRetries: number = MaxPullRetries): Promise<any> { | ||
const scrapeResult = await this.scrapeWithMeta(scrapeRequest, maxPollingRetries); | ||
return scrapeResult.structured_data; | ||
@@ -94,2 +95,3 @@ } | ||
'Authorization': `Bearer ${this.apiKey}`, | ||
"User-Agent": "WebcrawlerAPI-NodeJS-Client" | ||
}, | ||
@@ -121,3 +123,4 @@ }; | ||
'Content-Type': 'application/json', | ||
'Authorization': `Bearer ${this.apiKey}` | ||
'Authorization': `Bearer ${this.apiKey}`, | ||
"User-Agent": "WebcrawlerAPI-NodeJS-Client" | ||
}, | ||
@@ -167,3 +170,4 @@ 'body': JSON.stringify(crawlRequest), | ||
'Content-Type': 'application/json', | ||
'Authorization': `Bearer ${this.apiKey}` | ||
'Authorization': `Bearer ${this.apiKey}`, | ||
"User-Agent": "WebcrawlerAPI-NodeJS-Client" | ||
} | ||
@@ -170,0 +174,0 @@ } |
@@ -152,3 +152,2 @@ "use strict"; | ||
const url = `${this.basePath}/${this.apiVersion}/job/${jobID}`; | ||
console.log(url); | ||
const requestOptions = { | ||
@@ -155,0 +154,0 @@ 'method': 'GET', |
{ | ||
"name": "webcrawlerapi-js", | ||
"version": "1.0.4", | ||
"version": "1.0.5", | ||
"description": "JS client for WebcrawlerAPI", | ||
@@ -20,3 +20,3 @@ "main": "./dist/index.js", | ||
}, | ||
"author": "Andrew <support@webcrawlerapi.com>", | ||
"author": "Andrew <sdk@webcrawlerapi.com>", | ||
"license": "MIT", | ||
@@ -23,0 +23,0 @@ "dependencies": {}, |
@@ -1,5 +0,11 @@ | ||
# JS client for WebcrawlerAPI scrapers | ||
# JS client for WebcrawlerAPI | ||
Official client for [WebcrawlerAPI](https://webcrawlerapi.com/) scrapers. | ||
Official client for [WebcrawlerAPI](https://webcrawlerapi.com/). | ||
WebcrawlerAPI allows you to extract data from any website with just a simple API call. | ||
## Preparation | ||
1. Register in the [dashboard](https://dash.webcrawlerapi.com/). | ||
2. Get an [Access Key](https://dash.webcrawlerapi.com/access). | ||
## Installation | ||
@@ -10,22 +16,33 @@ Install WebcrawlerAPI js package: | ||
## Preparation | ||
1. Register in the [dashboard](https://dash.webcrawlerapi.com/). | ||
2. Get an [Access Key](https://dash.webcrawlerapi.com/access). | ||
## Request example | ||
```javascript | ||
const webcrawlerapi = require('webcrawlerapi-js'); | ||
import webcrawlerapi from "webcrawlerapi-js"; | ||
async function main() { | ||
const client = new webcrawlerapi.WebcrawlerClient( | ||
"YOUR API KEY HERE" | ||
"YOUR API ACCESS KEY HERE", | ||
) | ||
const response = await client.scrape({ | ||
input: { | ||
"url": "https://www.funda.nl/detail/koop/heerhugowaard/huis-govert-flinckplantsoen-1/89968455/" | ||
}, | ||
crawler_id: "webcrawler/funda", | ||
}) | ||
console.log(response) | ||
// sync way - the promise will be resolved with all the data | ||
const syncJob = await client.crawl({ | ||
"items_limit": 10, | ||
"url": "https://stripe.com/", | ||
"scrape_type": "markdown" | ||
} | ||
) | ||
console.log(syncJob); | ||
// or async - get the job id and then poll the job status and get the data | ||
const jobWithId = await client.crawlAsync({ | ||
"items_limit": 10, | ||
"url": "https://stripe.com/", | ||
"scrape_type": "markdown" | ||
} | ||
) | ||
// fetch the current job state; call getJob again until the job is no longer in progress | ||
const jobId = jobWithId.id; | ||
let asyncJob = await client.getJob(jobId); | ||
console.log(asyncJob); | ||
} | ||
@@ -37,26 +54,34 @@ | ||
## Response example | ||
```json | ||
```javascript | ||
{ | ||
city: 'Heerhugowaard', | ||
price: 325000, | ||
images: [ 'https://cloud.funda.nl/valentina_media/191/215/183_2160.jpg' ], | ||
status: 'inonderhandeling', | ||
videos: [], | ||
address: 'Govert Flinckplantsoen 1', | ||
country: 'Nederland', | ||
province: 'Noord-Holland', | ||
plot_area: '183 m²', | ||
post_code: '1701NH', | ||
description: 'De woning is met liefde en zorg 53 jaar bewoond door...', | ||
living_area: 127, | ||
house_number: 1, | ||
energie_label: 'd', | ||
property_type: 'woonhuis', | ||
publication_date: '2024-05-28T00:00:00', | ||
number_of_bedrooms: 4, | ||
coordinates_latitude: 52.67685, | ||
year_of_construction: 1971, | ||
coordinates_longitude: 4.8560443, | ||
house_number_extension: '' | ||
id: '49c4942b-b7d9-4d62-94b5-b54a3016ac51', | ||
org_id: 'clxsnorta00075wuuqxgzzvxm', | ||
url: 'https://stripe.com/', | ||
scrape_type: 'markdown', | ||
whitelist_regexp: '', | ||
blacklist_regexp: '', | ||
allow_subdomains: false, | ||
items_limit: 10, | ||
created_at: '2024-12-28T21:36:04.417Z', | ||
finished_at: null, | ||
updated_at: '2024-12-28T21:36:04.383Z', | ||
webhook_url: '', | ||
status: 'in_progress', | ||
job_items: [ | ||
{ | ||
id: 'f26cefe1-09d1-4d4c-8b74-b65e075e230d', | ||
job_id: '49c4942a-b7d9-4d62-94b5-b54a3016ac51', | ||
original_url: 'https://stripe.com/', | ||
page_status_code: 0, | ||
status: 'new', | ||
title: '', | ||
last_error: '', | ||
created_at: '2024-12-28T21:36:04.468Z', | ||
updated_at: '2024-12-28T21:36:04.435Z', | ||
cost: 0, | ||
referred_url: '' | ||
} | ||
], | ||
recommended_pull_delay_ms: 5000 | ||
} | ||
``` |
23389
4.39%564
0.53%85
41.67%