@harvestapi/scraper
Advanced tools
Comparing version
@@ -20,2 +20,3 @@ import { ListingScraperOptions } from './types'; | ||
private scrapedItems; | ||
private paginationToken; | ||
private undefinedPagination; | ||
@@ -22,0 +23,0 @@ constructor(options: ListingScraperOptions<TItemShort, TItemDetail>); |
@@ -19,4 +19,5 @@ import { ApiItemResponse, ApiListResponse } from '../types'; | ||
export type ListingScraperOptions<TItemShot, TItemDetails> = ListingScraperConfig<TItemShot, TItemDetails> & { | ||
fetchList: ({ page }: { | ||
fetchList: (args: { | ||
page: number; | ||
paginationToken?: string | null; | ||
}) => Promise<ApiListResponse<TItemShot>>; | ||
@@ -23,0 +24,0 @@ fetchItem: ({ item, }: { |
@@ -815,2 +815,3 @@ 'use strict'; | ||
this.scrapedItems = {}; | ||
this.paginationToken = null; | ||
this.undefinedPagination = false; | ||
@@ -861,3 +862,3 @@ this.onItemScraped = async ({ item }) => { | ||
async scrapeStart() { | ||
var _a, _b, _c, _d, _e; | ||
var _a, _b, _c, _d, _e, _f; | ||
this.stats = { | ||
@@ -871,8 +872,11 @@ pages: 0, | ||
}; | ||
this.paginationToken = null; | ||
this.scrapePagesDone = false; | ||
const firstPage = await this.fetchPage({ page: 1 }); | ||
let totalPages = ((_a = firstPage === null || firstPage === void 0 ? void 0 : firstPage.pagination) === null || _a === void 0 ? void 0 : _a.totalPages) || 0; | ||
this.paginationToken = ((_b = firstPage === null || firstPage === void 0 ? void 0 : firstPage.pagination) === null || _b === void 0 ? void 0 : _b.paginationToken) || null; | ||
if (this.options.maxPages && totalPages > this.options.maxPages) { | ||
totalPages = this.options.maxPages; | ||
} | ||
if (!totalPages && ((_b = firstPage === null || firstPage === void 0 ? void 0 : firstPage.elements) === null || _b === void 0 ? void 0 : _b.length)) { | ||
if (!totalPages && ((_c = firstPage === null || firstPage === void 0 ? void 0 : firstPage.elements) === null || _c === void 0 ? void 0 : _c.length)) { | ||
totalPages = this.options.maxPages; | ||
@@ -884,5 +888,5 @@ this.undefinedPagination = true; | ||
} | ||
const concurrency = ((_c = this.options) === null || _c === void 0 ? void 0 : _c.overrideConcurrency) || ((_d = firstPage === null || firstPage === void 0 ? void 0 : firstPage.user) === null || _d === void 0 ? void 0 : _d.requestsConcurrency) || 1; | ||
const concurrency = ((_d = this.options) === null || _d === void 0 ? void 0 : _d.overrideConcurrency) || ((_e = firstPage === null || firstPage === void 0 ? void 0 : firstPage.user) === null || _e === void 0 ? void 0 : _e.requestsConcurrency) || 1; | ||
this.log(`Scraping ${this.options.entityName} with ${concurrency} concurrent ${concurrency === 1 ? 'worker' : 'workers'}... Total pages: ${totalPages}`); | ||
if (!((_e = firstPage === null || firstPage === void 0 ? void 0 : firstPage.elements) === null || _e === void 0 ? void 0 : _e.length)) { | ||
if (!((_f = firstPage === null || firstPage === void 0 ? void 0 : firstPage.elements) === null || _f === void 0 ? void 0 : _f.length)) { | ||
this.done = true; | ||
@@ -956,3 +960,5 @@ if (this.error) { | ||
this.log(`Scraping page ${page} of ${this.options.entityName}...`); | ||
const result = await this.options.fetchList({ page }).catch((error) => { | ||
const result = await this.options | ||
.fetchList({ page, paginationToken: this.paginationToken }) | ||
.catch((error) => { | ||
this.errorLog('Error fetching page', page, error); | ||
@@ -1105,2 +1111,5 @@ return null; | ||
} | ||
/**
 * Requests the comments of a post from the `linkedin/post-comments` API path
 * via `this.scraper.fetchApi`.
 *
 * @param params - Query parameters; passed through unchanged as `params`.
 * @returns The promise returned by `fetchApi` for this path.
 */
async getPostComments(params) { | |
return this.scraper.fetchApi({ path: 'linkedin/post-comments', params }); | |
} | |
async searchCompanyAssociatedProfiles(params) { | ||
@@ -1182,2 +1191,14 @@ return this.scraper.fetchApi({ | ||
} | ||
/**
 * Scrapes the comments of a post by constructing a `ListingScraper` whose
 * list fetcher is `getPostComments`, then returning the result of its
 * `scrapeStart()`.
 *
 * Takes a single object: `query` supplies the base parameters for every
 * `getPostComments` call, and all remaining properties are spread into the
 * `ListingScraper` options.
 */
async scrapePostComments({ query, ...options }) { | |
return new ListingScraper({ | |
// The per-call arguments are spread after `query`, so any key present in
// both takes the value from `fetchArgs`.
fetchList: (fetchArgs) => this.getPostComments({ ...query, ...fetchArgs }), | |
// No separate detail request is made: an item with an `id` is returned as
// its own element, and an item without one is reported as skipped.
fetchItem: async ({ item }) => (item === null || item === void 0 ? void 0 : item.id) | |
? { entityId: item === null || item === void 0 ? void 0 : item.id, element: item } | |
: { skipped: true }, | |
scrapeDetails: false, | |
entityName: 'post-comments', | |
// Caller options are spread over every property above, but `maxPages` is
// written afterwards, so a caller-supplied `maxPages` is always replaced by 100.
// NOTE(review): confirm this fixed cap is intentional.
...options, | |
maxPages: 100, | |
}).scrapeStart(); | |
} | |
async test() { | ||
@@ -1184,0 +1205,0 @@ return this.scraper.fetchApi({ path: 'linkedin/test' }); |
@@ -813,2 +813,3 @@ import { randomUUID } from 'crypto'; | ||
this.scrapedItems = {}; | ||
this.paginationToken = null; | ||
this.undefinedPagination = false; | ||
@@ -859,3 +860,3 @@ this.onItemScraped = async ({ item }) => { | ||
async scrapeStart() { | ||
var _a, _b, _c, _d, _e; | ||
var _a, _b, _c, _d, _e, _f; | ||
this.stats = { | ||
@@ -869,8 +870,11 @@ pages: 0, | ||
}; | ||
this.paginationToken = null; | ||
this.scrapePagesDone = false; | ||
const firstPage = await this.fetchPage({ page: 1 }); | ||
let totalPages = ((_a = firstPage === null || firstPage === void 0 ? void 0 : firstPage.pagination) === null || _a === void 0 ? void 0 : _a.totalPages) || 0; | ||
this.paginationToken = ((_b = firstPage === null || firstPage === void 0 ? void 0 : firstPage.pagination) === null || _b === void 0 ? void 0 : _b.paginationToken) || null; | ||
if (this.options.maxPages && totalPages > this.options.maxPages) { | ||
totalPages = this.options.maxPages; | ||
} | ||
if (!totalPages && ((_b = firstPage === null || firstPage === void 0 ? void 0 : firstPage.elements) === null || _b === void 0 ? void 0 : _b.length)) { | ||
if (!totalPages && ((_c = firstPage === null || firstPage === void 0 ? void 0 : firstPage.elements) === null || _c === void 0 ? void 0 : _c.length)) { | ||
totalPages = this.options.maxPages; | ||
@@ -882,5 +886,5 @@ this.undefinedPagination = true; | ||
} | ||
const concurrency = ((_c = this.options) === null || _c === void 0 ? void 0 : _c.overrideConcurrency) || ((_d = firstPage === null || firstPage === void 0 ? void 0 : firstPage.user) === null || _d === void 0 ? void 0 : _d.requestsConcurrency) || 1; | ||
const concurrency = ((_d = this.options) === null || _d === void 0 ? void 0 : _d.overrideConcurrency) || ((_e = firstPage === null || firstPage === void 0 ? void 0 : firstPage.user) === null || _e === void 0 ? void 0 : _e.requestsConcurrency) || 1; | ||
this.log(`Scraping ${this.options.entityName} with ${concurrency} concurrent ${concurrency === 1 ? 'worker' : 'workers'}... Total pages: ${totalPages}`); | ||
if (!((_e = firstPage === null || firstPage === void 0 ? void 0 : firstPage.elements) === null || _e === void 0 ? void 0 : _e.length)) { | ||
if (!((_f = firstPage === null || firstPage === void 0 ? void 0 : firstPage.elements) === null || _f === void 0 ? void 0 : _f.length)) { | ||
this.done = true; | ||
@@ -954,3 +958,5 @@ if (this.error) { | ||
this.log(`Scraping page ${page} of ${this.options.entityName}...`); | ||
const result = await this.options.fetchList({ page }).catch((error) => { | ||
const result = await this.options | ||
.fetchList({ page, paginationToken: this.paginationToken }) | ||
.catch((error) => { | ||
this.errorLog('Error fetching page', page, error); | ||
@@ -1103,2 +1109,5 @@ return null; | ||
} | ||
/**
 * Requests the comments of a post from the `linkedin/post-comments` API path
 * via `this.scraper.fetchApi`.
 *
 * @param params - Query parameters; passed through unchanged as `params`.
 * @returns The promise returned by `fetchApi` for this path.
 */
async getPostComments(params) { | |
return this.scraper.fetchApi({ path: 'linkedin/post-comments', params }); | |
} | |
async searchCompanyAssociatedProfiles(params) { | ||
@@ -1180,2 +1189,14 @@ return this.scraper.fetchApi({ | ||
} | ||
/**
 * Scrapes the comments of a post by constructing a `ListingScraper` whose
 * list fetcher is `getPostComments`, then returning the result of its
 * `scrapeStart()`.
 *
 * Takes a single object: `query` supplies the base parameters for every
 * `getPostComments` call, and all remaining properties are spread into the
 * `ListingScraper` options.
 */
async scrapePostComments({ query, ...options }) { | |
return new ListingScraper({ | |
// The per-call arguments are spread after `query`, so any key present in
// both takes the value from `fetchArgs`.
fetchList: (fetchArgs) => this.getPostComments({ ...query, ...fetchArgs }), | |
// No separate detail request is made: an item with an `id` is returned as
// its own element, and an item without one is reported as skipped.
fetchItem: async ({ item }) => (item === null || item === void 0 ? void 0 : item.id) | |
? { entityId: item === null || item === void 0 ? void 0 : item.id, element: item } | |
: { skipped: true }, | |
scrapeDetails: false, | |
entityName: 'post-comments', | |
// Caller options are spread over every property above, but `maxPages` is
// written afterwards, so a caller-supplied `maxPages` is always replaced by 100.
// NOTE(review): confirm this fixed cap is intentional.
...options, | |
maxPages: 100, | |
}).scrapeStart(); | |
} | |
async test() { | ||
@@ -1182,0 +1203,0 @@ return this.scraper.fetchApi({ path: 'linkedin/test' }); |
import { ScraperOptions } from '../base'; | ||
import { ApiItemResponse, ApiListResponse } from '../types'; | ||
import { Company, CompanyShort, GetLinkedinCompanyParams, GetLinkedinJobParams, GetLinkedinPostReactionsParams, GetLinkedInProfileParams, Job, JobShort, PostReaction, PostShort, Profile, ProfileShort, ScrapeLinkedinCompaniesParams, ScrapeLinkedinJobsParams, ScrapeLinkedinPostReactionsParams, ScrapeLinkedinPostsParams, ScrapeLinkedinProfilesParams, SearchLinkedinCompaniesParams, SearchLinkedInCompanyAssociatedProfilesParams, SearchLinkedinJobsParams, SearchLinkedinPostsParams, SearchLinkedInProfilesParams, SearchLinkedInProfilesParamsV2 } from './types'; | ||
import { Company, CompanyShort, GetLinkedinCompanyParams, GetLinkedinJobParams, GetLinkedinPostCommentsParams, GetLinkedinPostReactionsParams, GetLinkedInProfileParams, Job, JobShort, PostComment, PostReaction, PostShort, Profile, ProfileShort, ScrapeLinkedinCompaniesParams, ScrapeLinkedinJobsParams, ScrapeLinkedinPostCommentsParams, ScrapeLinkedinPostReactionsParams, ScrapeLinkedinPostsParams, ScrapeLinkedinProfilesParams, SearchLinkedinCompaniesParams, SearchLinkedInCompanyAssociatedProfilesParams, SearchLinkedinJobsParams, SearchLinkedinPostsParams, SearchLinkedInProfilesParams, SearchLinkedInProfilesParamsV2 } from './types'; | ||
export declare class LinkedinScraper { | ||
@@ -23,2 +23,3 @@ private options; | ||
getPostReactions(params: GetLinkedinPostReactionsParams): Promise<ApiListResponse<PostReaction>>; | ||
getPostComments(params: GetLinkedinPostCommentsParams): Promise<ApiListResponse<PostComment>>; | ||
searchCompanyAssociatedProfiles(params: SearchLinkedInCompanyAssociatedProfilesParams): Promise<ApiListResponse<ProfileShort>>; | ||
@@ -73,3 +74,11 @@ scrapeJobs({ query, ...options }: ScrapeLinkedinJobsParams): Promise<{ | ||
} | undefined>; | ||
scrapePostComments({ query, ...options }: ScrapeLinkedinPostCommentsParams): Promise<{ | ||
pages: number; | ||
pagesSuccess: number; | ||
items: number; | ||
itemsSuccess: number; | ||
requests: number; | ||
requestsStartTime: Date; | ||
} | undefined>; | ||
test(): Promise<any>; | ||
} |
@@ -97,2 +97,8 @@ import { ListingScraperConfig } from '../base'; | ||
} | ||
/**
 * Parameter object for a post-comments request; only `post` is required.
 *
 * NOTE(review): the accepted format of `post` (URL, numeric id, or other) and
 * how `page` and `paginationToken` interact are not visible here — confirm
 * against the API documentation.
 */
export interface GetLinkedinPostCommentsParams { | |
post: string | number; | |
page?: number; | |
paginationToken?: string | null; | |
sortBy?: 'date' | 'relevance'; | |
} | |
export type Profile = { | ||
@@ -464,2 +470,26 @@ id: string; | ||
}; | ||
/**
 * Shape of a single post comment, including the nested `actor` object and
 * its `picture`.
 *
 * NOTE(review): the units of `createdAtTimestamp` and `actor.picture.expiresAt`
 * (seconds vs. milliseconds) and the string format of `createdAt` are not
 * specified here — confirm against the API documentation.
 */
export type PostComment = { | |
id: string; | |
linkedinUrl: string; | |
commentary: string; | |
createdAt: string; | |
postId: string; | |
actor: { | |
id: string; | |
name: string; | |
linkedinUrl: string; | |
position: string; | |
pictureUrl: string; | |
picture: { | |
url: string; | |
width: number; | |
height: number; | |
expiresAt: number; | |
}; | |
}; | |
createdAtTimestamp: number; | |
pinned?: boolean | null; | |
contributed?: boolean | null; | |
edited?: boolean | null; | |
}; | |
export type ScrapeLinkedinJobsParams = { | ||
@@ -481,2 +511,5 @@ query: SearchLinkedinJobsParams; | ||
} & ListingScraperConfig<PostReaction, PostReaction>; | ||
/**
 * A required `query` of type `GetLinkedinPostCommentsParams` intersected with
 * `ListingScraperConfig<PostComment, PostComment>` (the same type is used for
 * both type arguments).
 */
export type ScrapeLinkedinPostCommentsParams = { | |
query: GetLinkedinPostCommentsParams; | |
} & ListingScraperConfig<PostComment, PostComment>; | |
export type ErrorResponse = { | ||
@@ -483,0 +516,0 @@ error: string; |
@@ -25,4 +25,5 @@ export interface BaseApiResponse { | ||
pageSize: number; | ||
paginationToken?: string | null; | ||
} | null; | ||
elements: TItem[]; | ||
}; |
{ | ||
"name": "@harvestapi/scraper", | ||
"version": "1.3.10", | ||
"version": "1.3.11", | ||
"main": "dist/index.cjs.js", | ||
@@ -5,0 +5,0 @@ "module": "dist/index.esm.js", |
152
README.md
@@ -232,2 +232,16 @@ # HarvestAPI scraping tools | ||
##### getPostComments() | ||
> **getPostComments**(`params`): `Promise`\<[`ApiListResponse`](#apilistresponsetitem)\<[`PostComment`](#postcomment)\>\> | ||
###### Parameters | ||
###### params | ||
[`GetLinkedinPostCommentsParams`](#getlinkedinpostcommentsparams) | ||
###### Returns | ||
`Promise`\<[`ApiListResponse`](#apilistresponsetitem)\<[`PostComment`](#postcomment)\>\> | ||
##### searchCompanyAssociatedProfiles() | ||
@@ -331,2 +345,16 @@ | ||
##### scrapePostComments() | ||
> **scrapePostComments**(`__namedParameters`): `Promise`\<`undefined` \| \{ `pages`: `number`; `pagesSuccess`: `number`; `items`: `number`; `itemsSuccess`: `number`; `requests`: `number`; `requestsStartTime`: `Date`; \}\> | ||
###### Parameters | ||
###### \_\_namedParameters | ||
[`ScrapeLinkedinPostCommentsParams`](#scrapelinkedinpostcommentsparams) | ||
###### Returns | ||
`Promise`\<`undefined` \| \{ `pages`: `number`; `pagesSuccess`: `number`; `items`: `number`; `itemsSuccess`: `number`; `requests`: `number`; `requestsStartTime`: `Date`; \}\> | ||
## Interfaces | ||
@@ -674,2 +702,24 @@ | ||
### GetLinkedinPostCommentsParams | ||
#### Properties | ||
##### post | ||
> **post**: `string` \| `number` | ||
##### page? | ||
> `optional` **page**: `number` | ||
##### paginationToken? | ||
> `optional` **paginationToken**: `null` \| `string` | ||
##### sortBy? | ||
> `optional` **sortBy**: `"date"` \| `"relevance"` | ||
*** | ||
### BaseApiResponse | ||
@@ -1815,2 +1865,90 @@ | ||
### PostComment | ||
> **PostComment**: `object` | ||
#### Type declaration | ||
##### id | ||
> **id**: `string` | ||
##### linkedinUrl | ||
> **linkedinUrl**: `string` | ||
##### commentary | ||
> **commentary**: `string` | ||
##### createdAt | ||
> **createdAt**: `string` | ||
##### postId | ||
> **postId**: `string` | ||
##### actor | ||
> **actor**: `object` | ||
###### actor.id | ||
> **id**: `string` | ||
###### actor.name | ||
> **name**: `string` | ||
###### actor.linkedinUrl | ||
> **linkedinUrl**: `string` | ||
###### actor.position | ||
> **position**: `string` | ||
###### actor.pictureUrl | ||
> **pictureUrl**: `string` | ||
###### actor.picture | ||
> **picture**: `object` | ||
###### actor.picture.url | ||
> **url**: `string` | ||
###### actor.picture.width | ||
> **width**: `number` | ||
###### actor.picture.height | ||
> **height**: `number` | ||
###### actor.picture.expiresAt | ||
> **expiresAt**: `number` | ||
##### createdAtTimestamp | ||
> **createdAtTimestamp**: `number` | ||
##### pinned? | ||
> `optional` **pinned**: `boolean` \| `null` | ||
##### contributed? | ||
> `optional` **contributed**: `boolean` \| `null` | ||
##### edited? | ||
> `optional` **edited**: `boolean` \| `null` | ||
*** | ||
### ScrapeLinkedinJobsParams | ||
@@ -1880,2 +2018,14 @@ | ||
### ScrapeLinkedinPostCommentsParams | ||
> **ScrapeLinkedinPostCommentsParams**: `object` & [`ListingScraperConfig`](#listingscraperconfigtitemshot-titemdetails)\<[`PostComment`](#postcomment), [`PostComment`](#postcomment)\> | ||
#### Type declaration | ||
##### query | ||
> **query**: [`GetLinkedinPostCommentsParams`](#getlinkedinpostcommentsparams) | ||
*** | ||
### ErrorResponse | ||
@@ -1925,3 +2075,3 @@ | ||
> **pagination**: \{ `totalPages`: `number`; `totalElements`: `number`; `pageNumber`: `number`; `previousElements`: `number`; `pageSize`: `number`; \} \| `null` | ||
> **pagination**: \{ `totalPages`: `number`; `totalElements`: `number`; `pageNumber`: `number`; `previousElements`: `number`; `pageSize`: `number`; `paginationToken`: `string` \| `null`; \} \| `null` | ||
@@ -1928,0 +2078,0 @@ ##### elements |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
313274
3.46%2894
3.1%2079
7.78%