webcrawlerapi-js
Comparing version 1.0.7 to 1.0.8
api.ts
@@ -90,4 +90,8 @@ import {CrawlRequest, Job, JobId, ScrapeRequest, ScrapeResponse} from "./model";
             'headers': {
                 'Content-Type': 'application/json',
                 'Authorization': `Bearer ${this.apiKey}`,
-                "User-Agent": "WebcrawlerAPI-NodeJS-Client"
+                "User-Agent": "WebcrawlerAPI-NodeJS-Client",
+                'Cache-Control': 'no-cache, no-store, must-revalidate',
+                'Pragma': 'no-cache',
+                'Expires': '0'
             },
@@ -120,3 +124,6 @@ };
                 'Authorization': `Bearer ${this.apiKey}`,
-                "User-Agent": "WebcrawlerAPI-NodeJS-Client"
+                "User-Agent": "WebcrawlerAPI-NodeJS-Client",
+                'Cache-Control': 'no-cache, no-store, must-revalidate',
+                'Pragma': 'no-cache',
+                'Expires': '0'
             },
@@ -135,4 +142,44 @@ 'body': JSON.stringify(crawlRequest),
         await new Promise(resolve => setTimeout(resolve, delayIntervalMs));
-        const job = await this.getJob(jobIdResponse.id);
+        const timestamp = new Date().getTime();
+        const job = await this.getJob(`${jobIdResponse.id}?t=${timestamp}`);
         if (job.status !== 'in_progress' && job.status !== 'new') {
+            // Transform each job item to include getContent method
+            job.job_items = job.job_items.map(item => ({
+                ...item,
+                getContent: async function(): Promise<string | null> {
+                    if (this.status !== 'done') {
+                        return null;
+                    }
+                    let contentUrl: string | undefined;
+                    switch (job.scrape_type) {
+                        case 'html':
+                            contentUrl = this.raw_content_url;
+                            break;
+                        case 'cleaned':
+                            contentUrl = this.cleaned_content_url;
+                            break;
+                        case 'markdown':
+                            contentUrl = this.markdown_content_url;
+                            break;
+                    }
+                    if (!contentUrl) {
+                        return null;
+                    }
+                    const response = await fetch(contentUrl, {
+                        headers: {
+                            'Accept-Encoding': 'gzip, deflate, br',
+                            'Accept': '*/*'
+                        }
+                    });
+                    if (!response.ok) {
+                        throw new Error(`Failed to fetch content: ${response.statusText}`);
+                    }
+                    return await response.text();
+                }
+            }));
             return job;
@@ -144,2 +191,3 @@ }
         }
+        throw new Error("Crawling took too long, please retry or increase the number of polling retries");
     }
@@ -169,3 +217,6 @@
                 'Authorization': `Bearer ${this.apiKey}`,
-                "User-Agent": "WebcrawlerAPI-NodeJS-Client"
+                "User-Agent": "WebcrawlerAPI-NodeJS-Client",
+                'Cache-Control': 'no-cache, no-store, must-revalidate',
+                'Pragma': 'no-cache',
+                'Expires': '0'
             }
@@ -172,0 +223,0 @@ }
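Taken together, the api.ts changes do two things: every request now carries no-cache headers, and the polling call appends a throwaway timestamp query parameter, so a cached response can no longer mask a job status change; once a job leaves the in_progress/new states, each job item is additionally decorated with a getContent() helper that picks the content URL matching the job's scrape_type and downloads it. A minimal sketch of the cache-busting half of the pattern, where fetchFresh is an illustrative name and not part of the library's public API:

```ts
// Minimal sketch of the cache-busting pattern shown in the diff above.
// fetchFresh is hypothetical; only the headers and the timestamp query
// parameter mirror what the client now sends.
async function fetchFresh(url: string, apiKey: string): Promise<unknown> {
    // A unique query parameter defeats URL-keyed caches...
    const busted = `${url}${url.includes("?") ? "&" : "?"}t=${Date.now()}`;
    // ...and the no-cache headers tell well-behaved proxies not to store the response.
    const response = await fetch(busted, {
        headers: {
            "Authorization": `Bearer ${apiKey}`,
            "Cache-Control": "no-cache, no-store, must-revalidate",
            "Pragma": "no-cache",
            "Expires": "0",
        },
    });
    if (!response.ok) {
        throw new Error(`Request failed: ${response.status} ${response.statusText}`);
    }
    return response.json();
}
```

The same edits appear again below in the compiled JavaScript output.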
@@ -92,4 +92,8 @@ "use strict"; | ||
'headers': { | ||
'Content-Type': 'application/json', | ||
'Authorization': `Bearer ${this.apiKey}`, | ||
"User-Agent": "WebcrawlerAPI-NodeJS-Client" | ||
"User-Agent": "WebcrawlerAPI-NodeJS-Client", | ||
'Cache-Control': 'no-cache, no-store, must-revalidate', | ||
'Pragma': 'no-cache', | ||
'Expires': '0' | ||
}, | ||
@@ -118,3 +122,6 @@ };
                 'Authorization': `Bearer ${this.apiKey}`,
-                "User-Agent": "WebcrawlerAPI-NodeJS-Client"
+                "User-Agent": "WebcrawlerAPI-NodeJS-Client",
+                'Cache-Control': 'no-cache, no-store, must-revalidate',
+                'Pragma': 'no-cache',
+                'Expires': '0'
             },
@@ -130,4 +137,38 @@ 'body': JSON.stringify(crawlRequest),
             yield new Promise(resolve => setTimeout(resolve, delayIntervalMs));
-            const job = yield this.getJob(jobIdResponse.id);
+            const timestamp = new Date().getTime();
+            const job = yield this.getJob(`${jobIdResponse.id}?t=${timestamp}`);
             if (job.status !== 'in_progress' && job.status !== 'new') {
+                // Transform each job item to include getContent method
+                job.job_items = job.job_items.map(item => (Object.assign(Object.assign({}, item), { getContent: function () {
+                    return __awaiter(this, void 0, void 0, function* () {
+                        if (this.status !== 'done') {
+                            return null;
+                        }
+                        let contentUrl;
+                        switch (job.scrape_type) {
+                            case 'html':
+                                contentUrl = this.raw_content_url;
+                                break;
+                            case 'cleaned':
+                                contentUrl = this.cleaned_content_url;
+                                break;
+                            case 'markdown':
+                                contentUrl = this.markdown_content_url;
+                                break;
+                        }
+                        if (!contentUrl) {
+                            return null;
+                        }
+                        const response = yield fetch(contentUrl, {
+                            headers: {
+                                'Accept-Encoding': 'gzip, deflate, br',
+                                'Accept': '*/*'
+                            }
+                        });
+                        if (!response.ok) {
+                            throw new Error(`Failed to fetch content: ${response.statusText}`);
+                        }
+                        return yield response.text();
+                    });
+                } })));
                 return job;
@@ -139,2 +180,3 @@ }
            }
+            throw new Error("Crawling took too long, please retry or increase the number of polling retries");
        });
@@ -164,3 +206,6 @@ }
                'Authorization': `Bearer ${this.apiKey}`,
-                "User-Agent": "WebcrawlerAPI-NodeJS-Client"
+                "User-Agent": "WebcrawlerAPI-NodeJS-Client",
+                'Cache-Control': 'no-cache, no-store, must-revalidate',
+                'Pragma': 'no-cache',
+                'Expires': '0'
            }
@@ -167,0 +212,0 @@ };
@@ -59,2 +59,5 @@ export interface ScrapeRequest {
     referred_url: string;
+    raw_content_url?: string;
+    cleaned_content_url?: string;
+    getContent(): Promise<string | null>;
 }
@@ -64,2 +64,5 @@ export interface ScrapeRequest {
     referred_url: string;
+    raw_content_url?: string;
+    cleaned_content_url?: string;
+    getContent(): Promise<string | null>;
 }
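The model change mirrors this on the type level: job items gain optional raw_content_url and cleaned_content_url fields plus a getContent() method that resolves to null when the item is not done or has no content URL for the job's scrape_type. A hedged sketch of how calling code might consume the new method; JobItemLike and collectContents are illustrative names, not types or helpers exported by the package:

```ts
// Models only the fields guaranteed by the model diff above.
interface JobItemLike {
    referred_url: string;
    raw_content_url?: string;
    cleaned_content_url?: string;
    getContent(): Promise<string | null>;
}

// Downloads the content of every finished item, skipping those for which
// getContent() resolves to null (item not 'done', or no matching URL).
async function collectContents(items: JobItemLike[]): Promise<string[]> {
    const contents: string[] = [];
    for (const item of items) {
        const content = await item.getContent();
        if (content !== null) {
            contents.push(content);
        }
    }
    return contents;
}
```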
 {
     "name": "webcrawlerapi-js",
-    "version": "1.0.7",
+    "version": "1.0.8",
     "description": "JS client for WecrawlerAPI",
@@ -5,0 +5,0 @@ "main": "./dist/index.js",
@@ -32,2 +32,7 @@ # JS client for WebcrawlerAPI
 )
+for (const item of syncJob.job_items) {
+    item.getContent().then((content) => {
+        console.log(content.slice(0, 100));
+    })
+}
 console.log(syncJob);
@@ -34,0 +39,0 @@
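The new README example fires getContent() on every item with .then and slices the result directly; since getContent() is typed as Promise<string | null>, an item that is not yet done would make content.slice throw. An equivalent sketch using await and a null guard, meant to run inside the same async context that obtained syncJob (nothing beyond the README snippet's names is assumed):

```ts
// Same loop as the README example, with a guard for the null case.
for (const item of syncJob.job_items) {
    const content = await item.getContent();
    if (content !== null) {
        console.log(content.slice(0, 100)); // first 100 characters only
    }
}
```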