bandcamp-fetch
Advanced tools
Comparing version 0.1.0-a-20210203 to 0.1.0-a-20210210
@@ -262,2 +262,30 @@ const fetch = require('node-fetch'); | ||
/**
 * Fetches the Bandcamp Daily landing page (the URL returned by
 * `utils.getDailyUrl()` with no arguments) and hands its HTML to
 * `parser.parseArticleCategories()`.
 *
 * @returns {Promise<*>} Resolves with the parser's result for the fetched page.
 */
async function getArticleCategories() {
    const dailyUrl = utils.getDailyUrl();
    const html = await _fetchPage(dailyUrl);
    return parser.parseArticleCategories(html);
}
/**
 * Fetches a Bandcamp Daily article listing page and parses it.
 *
 * @param {Object} [params] Listing selection.
 * @param {string} [params.categoryUrl] URL of the category to list. When
 *   absent (null or undefined), the 'latest' path resolved against the
 *   Bandcamp Daily base URL is used.
 * @param {*} [params.page] Forwarded to `utils.getDailyUrl()`, which appends
 *   it to the URL as the `page` query value.
 * @param {Object} [options]
 * @param {*} [options.imageFormat] Passed through `_parseImageFormatArg()`
 *   and handed to the parser as `imageFormat`.
 * @returns {Promise<*>} Resolves with the result of `parser.parseArticleList()`.
 */
async function getArticleList(params = {}, options = {}) {
    // Work on a copy: filling in the default category must not leak into the
    // object the caller passed in (it may be reused for another request).
    const listParams = Object.assign({}, params);
    if (listParams.categoryUrl == undefined) {
        listParams.categoryUrl = utils.getUrl('latest', utils.getDailyUrl());
    }
    const opts = {
        imageFormat: await _parseImageFormatArg(options.imageFormat)
    };
    const html = await _fetchPage(utils.getDailyUrl(listParams));
    return parser.parseArticleList(html, opts);
}
/**
 * Fetches a single Bandcamp Daily article and parses it.
 *
 * @param {string} articleUrl URL of the article page to fetch.
 * @param {Object} [options]
 * @param {*} [options.albumImageFormat] Passed to `_parseImageFormatArg()`
 *   together with 9 (presumably the fallback format id - confirm against
 *   that helper).
 * @param {*} [options.artistImageFormat] Passed to `_parseImageFormatArg()`
 *   together with 21 (presumably the fallback format id - confirm).
 * @param {*} [options.includeRawData] Coerced to a boolean and forwarded to
 *   the parser.
 * @returns {Promise<*>} Resolves with the result of `parser.parseArticle()`.
 */
async function getArticle(articleUrl, options = {}) {
    const { baseUrl } = await _getImageConstants();
    const albumImageFormat = await _parseImageFormatArg(options.albumImageFormat, 9);
    const artistImageFormat = await _parseImageFormatArg(options.artistImageFormat, 21);
    const parseOptions = {
        imageBaseUrl: baseUrl,
        albumImageFormat,
        artistImageFormat,
        includeRawData: Boolean(options.includeRawData)
    };
    const html = await _fetchPage(articleUrl);
    return parser.parseArticle(html, parseOptions);
}
async function _fetchPage(url, json = false) { | ||
@@ -295,2 +323,5 @@ return _cache.getOrSet('page', url + (json ? ':json' : ':html'), () => { | ||
getShow, | ||
getArticleCategories, | ||
getArticleList, | ||
getArticle | ||
}; |
const cheerio = require('cheerio'); | ||
const {decode} = require('html-entities'); | ||
const utils = require('./utils.js'); | ||
const {URL} = require('url'); | ||
const {EOL} = require('os'); | ||
@@ -192,3 +192,3 @@ // https://github.com/masterT/bandcamp-scraper/blob/master/lib/htmlParser.js | ||
url: trackUrl, | ||
duration: track.item.duration_secs, | ||
duration: getAdditionalPropertyValue(track.item, 'duration_secs'), | ||
streamUrl: _getStreamUrl(track.item.url) | ||
@@ -237,3 +237,3 @@ }); | ||
releaseDate: extra.current.release_date, | ||
duration: basic.duration_secs, | ||
duration: getAdditionalPropertyValue(basic, 'duration_secs'), | ||
streamUrl: extra.trackinfo && extra.trackinfo[0] && extra.trackinfo[0].file && extra.trackinfo[0].file['mp3-128'] ? extra.trackinfo[0].file['mp3-128'] : null, | ||
@@ -294,2 +294,12 @@ artist: { | ||
/**
 * Looks up a named entry in an object's `additionalProperty` array and
 * returns that entry's `value`.
 *
 * @param {Object} o Object that may carry an `additionalProperty` array of
 *   `{name, value}` entries.
 * @param {string} propName Name of the entry to look for.
 * @returns {*} The `value` of the first entry whose `name` equals `propName`,
 *   or `undefined` when `additionalProperty` is not an array, no entry
 *   matches, or the entry has no value.
 */
function getAdditionalPropertyValue(o, propName) {
    const entries = o.additionalProperty;
    if (!Array.isArray(entries)) {
        return undefined;
    }
    const match = entries.find( entry => entry.name === propName );
    return match ? match.value : undefined;
}
function parseDiscography(html, opts) { | ||
@@ -708,2 +718,297 @@ const $ = cheerio.load(html); | ||
/**
 * Extracts the category tree from the Bandcamp Daily page HTML.
 *
 * Every <section> that is a direct child of `#daily-view-all` is parsed
 * recursively: nested <section> children become sub-sections, and links
 * found inside <div> children become categories.
 *
 * @param {string} html Page HTML.
 * @returns {Array<Object>} One entry per non-empty top-level section, each
 *   shaped `{name, title, sections?, categories?}`. `sections` and
 *   `categories` are only present when non-empty; sections with neither are
 *   left out altogether.
 */
function parseArticleCategories(html) {
    const $ = cheerio.load(html);
    const baseUrl = utils.getDailyUrl();

    // Appends a {url, name} entry to `target` for every link under `container`.
    const collectCategories = (container, target) => {
        container.find('a').each( (_, link) => {
            const anchor = $(link);
            const href = anchor.attr('href');
            const url = utils.isAbsoluteUrl(href) ? href : utils.getUrl(href, baseUrl);
            target.push({
                url,
                name: anchor.text()
            });
        });
    };

    // Parses one <section>; returns null when it yields nothing.
    const parseSection = (sectionEl) => {
        const heading = sectionEl.prev('h2');
        const subsections = [];
        const categories = [];

        sectionEl.children().each( (_, child) => {
            switch (child.tagName) {
                case 'section': {
                    const nested = parseSection($(child));
                    if (nested !== null) {
                        subsections.push(nested);
                    }
                    break;
                }
                case 'div':
                    collectCategories($(child), categories);
                    break;
                default:
                    break;
            }
        });

        if (subsections.length === 0 && categories.length === 0) {
            return null;
        }

        const parsed = {
            name: sectionEl.attr('class'),
            title: heading.length ? heading.text() : ''
        };
        if (subsections.length > 0) {
            parsed.sections = subsections;
        }
        if (categories.length > 0) {
            parsed.categories = categories;
        }
        return parsed;
    };

    const results = [];
    $('#daily-view-all').children('section').each( (_, sectionNode) => {
        const parsed = parseSection($(sectionNode));
        if (parsed !== null) {
            results.push(parsed);
        }
    });
    return results;
}
/**
 * Parses a Bandcamp Daily article listing page.
 *
 * @param {string} html Page HTML.
 * @param {Object} opts
 * @param {*} opts.imageFormat Forwarded to `utils.reformatImageUrl()` for
 *   each article image.
 * @returns {{articles: Array<Object>, total: number, start: number, end: number}}
 *   `articles` holds one `{url, title, date, imageUrl, category}` entry per
 *   `.list-article` element that has an `a.title` link. `total`, `start` and
 *   `end` come from the "<start> to <end> of <total>" text in `#num-results`
 *   and stay 0 when that text is absent or does not match.
 */
function parseArticleList(html, opts) {
    const $ = cheerio.load(html);
    const dailyUrl = utils.getDailyUrl();
    const results = {
        articles: [],
        total: 0,
        start: 0,
        end: 0
    };

    // Resolves a possibly-relative href against the Bandcamp Daily base URL.
    const toAbsoluteUrl = href => (utils.isAbsoluteUrl(href) ? href : utils.getUrl(href, dailyUrl));

    $('articles-list').each( (listIndex, list) => {
        $('.list-article', $(list)).each( (articleIndex, el) => {
            const article = $(el);

            // title and url
            // A cheerio selection is always truthy, so emptiness has to be
            // checked through its length. Entries without a title link are skipped.
            const titleLink = article.find('a.title');
            if (titleLink.length === 0) {
                return;
            }
            const title = titleLink.text();
            const url = toAbsoluteUrl(titleLink.attr('href'));

            const imageUrl = article.find('img').attr('src') || null;

            // category
            const infoText = article.find('.article-info-text');
            const infoTextCategoryLink = infoText.find('a.franchise');
            const infoTextMiddot = infoText.find('.middot');
            const categoryHref = infoTextCategoryLink.attr('href') || null;
            const category = {
                // Only resolve an href that actually exists; otherwise keep null.
                url: categoryHref !== null ? toAbsoluteUrl(categoryHref) : null,
                name: infoTextCategoryLink.text() || ''
            };

            // date
            // What is left of the info text once the category link and the
            // separator dot are removed is the date.
            infoTextCategoryLink.remove();
            infoTextMiddot.remove();
            const date = utils.stripLineBreaks(infoText.text()).trim();

            results.articles.push({
                url,
                title,
                date,
                imageUrl: utils.reformatImageUrl(imageUrl, opts.imageFormat),
                category
            });
        });
    });

    const resultsText = utils.stripLineBreaks($('#num-results').text()).trim();
    const rtm = resultsText.match(/(\d+)(?:\s*to\s*)(\d+)(?:\s*of\s*)(\d+)/);
    // match() returns null when the text is missing or has another shape.
    if (rtm) {
        results.total = parseInt(rtm[3], 10);
        results.start = parseInt(rtm[1], 10);
        results.end = parseInt(rtm[2], 10);
    }
    return results;
}
/**
 * Parses a Bandcamp Daily article page.
 *
 * Basic metadata is read from the page's JSON-LD script, embedded players
 * from the `data-player-infos` attribute of `#p-daily-article`, and the
 * article text from the paragraphs of `#p-daily-article article`.
 *
 * @param {string} html Page HTML.
 * @param {Object} opts
 * @param {string} opts.imageBaseUrl Prefix for generated image URLs.
 * @param {{id: *}} opts.albumImageFormat Its `id` is used in media item
 *   image URLs.
 * @param {{id: *}} opts.artistImageFormat Its `id` is used in artist image
 *   URLs.
 * @param {boolean} [opts.includeRawData] When truthy, the result gets a `raw`
 *   property holding the parsed JSON-LD, the parsed player infos and the
 *   article body HTML.
 * @returns {Object} Article with `title`, `description`, `url`, `imageUrl`,
 *   `date`, `category`, `genre` (null when the page has no genre link),
 *   `author`, `mediaItems` and `sections`. `sections` starts with the intro
 *   paragraphs (when there are any), followed by one section per embedded
 *   player.
 */
function parseArticle(html, opts) {
    const $ = cheerio.load(html);
    const dailyUrl = utils.getDailyUrl();

    const basic = JSON.parse($('script[type="application/ld+json"]').html());
    const players = JSON.parse(decode($('#p-daily-article').attr('data-player-infos')));

    // Resolves a possibly-relative href against the Bandcamp Daily base URL.
    const _toAbsoluteUrl = href => (utils.isAbsoluteUrl(href) ? href : utils.getUrl(href, dailyUrl));

    // Joins paragraph elements: HTML separated by one EOL, text by a blank line.
    const _joinParagraphs = (section, elements) => {
        elements.forEach( el => {
            const p = $(el);
            section.html += (section.html !== '' ? EOL : '') + p.html();
            section.text += (section.text !== '' ? EOL + EOL : '') + p.text();
        });
        return section;
    };

    const article = {
        title: basic.headline,
        description: basic.description,
        url: basic['@id'],
        imageUrl: basic.image,
        date: basic.datePublished,
        category: {
            name: basic.articleSection,
            url: null
        },
        genre: null,
        author: {
            name: basic.author.name,
            url: basic.author['@id']
        },
        mediaItems: [],
        sections: []
    };

    // get genre
    const genreLink = $('.genre a');
    if (genreLink.length > 0) {
        article.genre = {
            name: genreLink.text(),
            url: genreLink.attr('href')
        };
        const genreReadMoreLink = $('.moreingenre a');
        if (genreReadMoreLink.length > 0) {
            article.genre.readMoreUrl = _toAbsoluteUrl(genreReadMoreLink.attr('href'));
        }
    }

    // get category url
    const categoryLink = $('article-type a');
    if (categoryLink.length > 0) {
        article.category.url = _toAbsoluteUrl(categoryLink.attr('href'));
    }

    // get media items (albums and tracks featured in article)
    if (Array.isArray(players)) {
        players.forEach( player => {
            const mediaItem = {
                type: 'unknown',
                name: player.title,
                url: player.tralbum_url,
                imageUrl: '',
                featuredTrackPosition: player.featured_track_number,
                artist: {
                    name: player.band_name,
                    url: player.band_url,
                    imageUrl: '',
                    location: player.band_location
                },
                tracks: [],
                mediaItemRef: player.player_id
            };
            if (player.parent_tralbum_type === 'a') {
                mediaItem.type = 'album';
            }
            else if (player.parent_tralbum_type === 't') {
                mediaItem.type = 'track';
            }
            if (player.art_id) {
                mediaItem.imageUrl = opts.imageBaseUrl + '/img/a' + player.art_id + '_' + opts.albumImageFormat.id + '.jpg';
            }
            if (player.band_image_id) {
                mediaItem.artist.imageUrl = opts.imageBaseUrl + '/img/' + player.band_image_id + '_' + opts.artistImageFormat.id + '.jpg';
            }
            if (Array.isArray(player.tracklist)) {
                player.tracklist.forEach( trackInfo => {
                    mediaItem.tracks.push({
                        position: trackInfo.track_number,
                        name: trackInfo.track_title,
                        duration: trackInfo.audio_track_duration,
                        // A track without audio data gets a null stream URL
                        // instead of aborting the whole parse.
                        streamUrl: trackInfo.audio_url ? trackInfo.audio_url['mp3-128'] : null
                    });
                });
            }
            article.mediaItems.push(mediaItem);
        });
    }

    // Function that returns a section corresponding to a media item
    const _getSectionByPlayer = player => {
        const section = {
            heading: null,
            html: '',
            text: '',
            mediaItemRef: null
        };

        // Get heading
        // prevUntil() lists siblings nearest-first, so first() is the heading
        // closest above this player.
        const heading = player.prevUntil('bamplayer-art', 'h3, h2').first();
        if (heading.length > 0) {
            section.heading = {
                html: heading.html(),
                text: utils.stripTags(utils.brToNewLine(heading.html())).trim()
            };
        }

        // Get html and text
        const paragraphs = player.nextUntil('bamplayer-art, h3, h5, article-end', 'p');
        _joinParagraphs(section, paragraphs.get());

        // get mediaItemRef
        // The attribute may be absent or may not reference playerMap; in both
        // cases the reference stays null.
        const dataBind = player.attr('data-bind') || '';
        const playerIdMatch = dataBind.match(/playerMap\["(.+?)"]/);
        section.mediaItemRef = playerIdMatch ? playerIdMatch[1] : null;

        return section;
    };

    // Function that returns the introductory paragraph(s) of the article
    const _getIntroSection = articleBody => {
        const firstPlayer = articleBody.find('bamplayer-art').first();
        // prevAll() yields preceding siblings nearest-first (reverse document
        // order); flip them so the intro reads top to bottom.
        const paragraphs = firstPlayer.length > 0
            ? firstPlayer.prevAll('p').get().reverse()
            : articleBody.find('p').get();
        if (paragraphs.length === 0) {
            return null;
        }
        return _joinParagraphs({ html: '', text: '' }, paragraphs);
    };

    // sections
    const articleBody = $('#p-daily-article article');
    const sections = [];
    const introSection = _getIntroSection(articleBody);
    if (introSection) {
        sections.push(introSection);
    }
    articleBody.find('bamplayer-art').each( (i, player) => {
        sections.push(_getSectionByPlayer($(player)));
    });
    article.sections = sections;

    if (opts.includeRawData) {
        article.raw = {
            basic,
            mediaItems: players,
            body: articleBody.html()
        };
    }

    return article;
}
module.exports = { | ||
@@ -723,2 +1028,5 @@ parseDiscoverResults, | ||
parseShow, | ||
parseArticleCategories, | ||
parseArticleList, | ||
parseArticle | ||
}; |
@@ -135,2 +135,10 @@ const querystring = require('querystring'); | ||
/**
 * Builds a Bandcamp Daily URL.
 *
 * @param {Object} [params]
 * @param {string} [params.categoryUrl] URL to start from; defaults to
 *   'https://daily.bandcamp.com'.
 * @param {number|string} [params.page] When truthy, added to the URL as the
 *   `page` query value.
 * @returns {string} The resulting URL.
 */
function getDailyUrl(params = {}) {
    let url = params.categoryUrl || 'https://daily.bandcamp.com';
    if (params.page) {
        // A category URL may already carry a query string; extend it with '&'
        // rather than starting a second '?'.
        const separator = url.includes('?') ? '&' : '?';
        url += separator + 'page=' + encodeURIComponent(params.page);
    }
    return url;
}
module.exports = { | ||
@@ -152,3 +160,4 @@ getUrl, | ||
getShowIdFromUrl, | ||
getShowUrl | ||
getShowUrl, | ||
getDailyUrl | ||
}; |
{ | ||
"name": "bandcamp-fetch", | ||
"version": "0.1.0a-20210203", | ||
"version": "0.1.0a-20210210", | ||
"description": "JS library for scraping Bandcamp content", | ||
@@ -5,0 +5,0 @@ "main": "lib/index.js", |
@@ -185,2 +185,31 @@ # bandcamp-fetch | ||
### `getArticleCategories()` | ||
[**Example**](examples/getArticleCategories.js) ([output](examples/getArticleCategories_output.txt)) | ||
Fetches the list of Bandcamp Daily article categories. Categories are grouped into sections. | ||
### `getArticleList([params], [options])` | ||
[**Example**](examples/getArticleList.js) ([output](examples/getArticleList_output.txt)) | ||
Fetches the list of Bandcamp Daily articles under the category specified by `params.categoryUrl` (or all categories if not specified). | ||
- `params` (optional) | ||
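  - `categoryUrl`: URL of the category to list (defaults to the "latest" listing)
  - `page`: page number of the listing to fetch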
- `options` (optional) | ||
- imageFormat | ||
### `getArticle(articleUrl, [options])` | ||
[**Example**](examples/getArticle.js) ([output](examples/getArticle_output.txt)) | ||
Fetches the contents of the Bandcamp Daily article at `articleUrl`. | ||
- `articleUrl` | ||
- `options` (optional) | ||
- albumImageFormat | ||
- artistImageFormat | ||
- includeRawData | ||
## Caching | ||
@@ -187,0 +216,0 @@ |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Manifest confusion
Supply chain riskThis package has inconsistent metadata. This could be malicious or caused by an error when publishing the package.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Manifest confusion
Supply chain riskThis package has inconsistent metadata. This could be malicious or caused by an error when publishing the package.
Found 1 instance in 1 package
345120
41
1632
250