js-imdb-scraper
Advanced tools
Comparing version
import fetch from 'node-fetch'; | ||
import cheerio from 'cheerio'; | ||
import {bufferCount, firstValueFrom, from, map, mergeMap} from "rxjs"; | ||
// Request headers passed as `headers` to the IMDb fetch calls in this module.
// NOTE(review): presumably a desktop-browser user agent is sent so IMDb serves
// its regular HTML markup — confirm.
const HEADER = {
  'user-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
};
@@ -11,3 +12,3 @@ | ||
// Turns a show name into a value that can be placed after `?q=` in a URL.
// Every space becomes '+' (not just the first one) and reserved characters
// such as '&', '#' and '+' are percent-encoded so they cannot break the query.
const convertToQueryString = (showName) => {
  return encodeURIComponent(showName).replace(/%20/g, '+');
};
@@ -18,13 +19,12 @@ | ||
// Fetches the IMDb "find" page for a show name.
// Resolves to the response body as text, or to null when the request throws
// (the error is logged to the console).
const getImdbSearchPage = async (showName) => {
  const showNameQuery = convertToQueryString(showName);
  try {
    const result = await fetch(`https://www.imdb.com/find?q=${showNameQuery}`, {
      headers: HEADER,
    });
    return await result.text();
  } catch (err) {
    console.error(err);
    return null;
  }
};
@@ -35,25 +35,25 @@ | ||
// Extracts TV-series entries from the HTML of an IMDb search page.
// Returns an array of { title, id, img } objects; rows that are not titles,
// not TV series, or that are individual episodes are left out.
// `img` is null when the row has no thumbnail.
const getImdbResults = (resultBody) => {
  const $ = cheerio.load(resultBody);
  return $('table.findList > tbody > tr')
    .map((index, element) => {
      const row = $(element);
      const url = row.children('td').find('a').attr('href');
      const title = row.text().trim();

      // A row without a link cannot be identified, so skip it instead of
      // throwing on `undefined`.
      if (typeof url !== 'string') {
        return undefined;
      }

      // Only return titles. Capturing the id with a pattern (rather than a
      // fixed-width slice) keeps ids with more than seven digits intact.
      const idMatch = url.match(/\/title\/(tt\d+)/);
      if (idMatch === null) {
        return undefined;
      }

      // Only return tv shows, and don't show individual episodes.
      if (!title.includes(' (TV Series)') || title.includes('(TV Episode)')) {
        return undefined;
      }

      const imgSrc = row.children('td.primary_photo').find('img').attr('src');
      return {
        title: title.replace(' (TV Series)', ''),
        id: idMatch[1],
        img: typeof imgSrc === 'string' ? getHighQualityImage(imgSrc) : null,
      };
    })
    .get();
};
@@ -64,3 +64,3 @@ | ||
// Rewrites an IMDb thumbnail URL: everything from the first '@.' onwards is
// replaced with a fixed '@._V1_UY268_CR8,0,182,268_AL_.jpg' suffix.
// If the URL contains no '@.', the suffix is appended to the whole URL.
// Returns null when no URL string is supplied (e.g. a row without an image).
// NOTE(review): the suffix presumably requests a 182x268 rendition — confirm.
const getHighQualityImage = (imgUrl) => {
  if (typeof imgUrl !== 'string') {
    return null;
  }
  const [base] = imgUrl.split('@.');
  return `${base}@._V1_UY268_CR8,0,182,268_AL_.jpg`;
};
@@ -71,22 +71,24 @@ | ||
// Collects the episode ratings of every season of a show.
// Resolves to an object keyed by season number whose values are arrays of
// { episode, rating }; seasons that yielded no ratings (or whose request
// failed) are omitted. Resolves to null when the show page cannot be
// fetched, lists no seasons, or an error is thrown.
const getAllRatings = async (imdbId) => {
  try {
    const page = await fetchShowImdbPage(imdbId);
    // fetchShowImdbPage resolves to null (and has already logged) on failure.
    if (page == null) {
      return null;
    }
    const $ = cheerio.load(page);

    // The number of <option> elements in the #bySeason selector is taken as
    // the number of seasons. NOTE(review): confirm against current IMDb markup.
    const seasonCount = $('#bySeason option').length;
    if (seasonCount === 0) {
      return null;
    }
    // Season numbers 1..seasonCount.
    const seasons = Array.from({ length: seasonCount }, (_, i) => i + 1);

    // Fetch all seasons concurrently, wait until every one has reported, then
    // fold the results into a single object keyed by season number.
    const source = from(seasons).pipe(
      mergeMap((season) => getSeasonRatings(imdbId, season)),
      bufferCount(seasons.length),
      map((results) =>
        results.reduce(
          (acc, cur) =>
            // getSeasonRatings resolves to null when its request fails; skip
            // that season instead of discarding all the others.
            cur != null && cur.ratings.length
              ? { ...acc, [cur.season]: cur.ratings }
              : acc,
          {}
        )
      )
    );
    return await firstValueFrom(source);
  } catch (err) {
    console.error(err);
    return null;
  }
};
@@ -96,26 +98,27 @@ | ||
// Returns the season number together with each episode and its rating
// { season, ratings: [{ episode, rating }] }
// Resolves to null when the request or parsing throws (the error is logged).
const getSeasonRatings = async (imdbId, season) => {
  try {
    const result = await fetch(
      `https://www.imdb.com/title/${imdbId}/episodes?season=${season}`,
      {
        // Same headers as the other IMDb requests in this module.
        headers: HEADER,
      }
    );
    const resultText = await result.text();
    const $ = cheerio.load(resultText);

    const ratings = $(
      'div.eplist > div > div.info > div.ipl-rating-widget > div.ipl-rating-star'
    )
      // cheerio's map passes (index, element); episodes are numbered by the
      // position of their rating widget, starting at 1.
      .map((index, element) => ({
        episode: index + 1,
        rating: $(element).children('span.ipl-rating-star__rating').text(),
      }))
      .get();

    return { season, ratings };
  } catch (err) {
    console.error(err);
    return null;
  }
};
@@ -126,15 +129,14 @@ | ||
// Fetches the IMDb episodes page of a show.
// Resolves to the response body as text, or to null when the request throws
// (the error is logged to the console).
const fetchShowImdbPage = async (imdbId) => {
  try {
    const result = await fetch(
      `https://www.imdb.com/title/${imdbId}/episodes`,
      {
        headers: HEADER,
      }
    );
    return await result.text();
  } catch (err) {
    console.error(err);
    return null;
  }
};
@@ -146,6 +148,5 @@ | ||
// Searches IMDb for a show name.
// Resolves to the array produced by getImdbResults, or to null when the
// search page could not be fetched.
const getSearchResults = async (show) => {
  const page = await getImdbSearchPage(show);
  if (page == null) {
    return null;
  }
  return getImdbResults(page);
};
@@ -164,6 +165,6 @@ | ||
// Public API of the module. `getNumSeasons` is defined outside the lines
// visible here.
export default {
  getSearchResults,
  getAllRatings,
  getSeasonRatings,
  getNumSeasons,
};
{ | ||
"name": "js-imdb-scraper", | ||
"version": "0.0.10", | ||
"description": "", | ||
"version": "0.1.1", | ||
"description": "Simple IMDB scraper", | ||
"main": "imdbScraper.js", | ||
@@ -9,5 +9,5 @@ "type": "module", | ||
"cheerio": "^1.0.0-rc.3", | ||
"node-fetch": "^2.6.0" | ||
"node-fetch": "^2.6.0", | ||
"rxjs": "^7.5.2" | ||
}, | ||
"devDependencies": {}, | ||
"scripts": { | ||
@@ -14,0 +14,0 @@ "start": "node imdbScraper.js", |
@@ -84,14 +84,17 @@ # js-imdb-scraper | ||
```js | ||
[ | ||
{ episode: 1, rating: '8.9' }, | ||
{ episode: 2, rating: '8.6' }, | ||
{ episode: 3, rating: '8.4' }, | ||
{ episode: 4, rating: '8.7' }, | ||
{ episode: 5, rating: '8.7' }, | ||
{ episode: 6, rating: '8.9' }, | ||
{ episode: 7, rating: '9.5' }, | ||
{ episode: 8, rating: '8.8' }, | ||
{ episode: 9, rating: '9.4' }, | ||
{ episode: 10, rating: '9.7' } | ||
] | ||
{ | ||
season: 1, | ||
ratings: [ | ||
{ episode: 1, rating: '8.9' }, | ||
{ episode: 2, rating: '8.6' }, | ||
{ episode: 3, rating: '8.4' }, | ||
{ episode: 4, rating: '8.7' }, | ||
{ episode: 5, rating: '8.7' }, | ||
{ episode: 6, rating: '8.9' }, | ||
{ episode: 7, rating: '9.5' }, | ||
{ episode: 8, rating: '8.8' }, | ||
{ episode: 9, rating: '9.4' }, | ||
{ episode: 10, rating: '9.7' } | ||
], | ||
} | ||
``` | ||
@@ -98,0 +101,0 @@ |
11891
6.56%146
0.69%181
1.69%3
50%+ Added
+ Added
+ Added