bandcamp-fetch - npm Package Compare versions

Comparing version 0.1.0-a-20210203 to 0.1.0-a-20210210

examples/getArticle_output.txt

examples/getArticle.js

examples/getArticleCategories_output.txt

examples/getArticleCategories.js

examples/getArticleList_output.txt

examples/getArticleList.js

lib/index.js

		@@ -262,2 +262,30 @@ const fetch = require('node-fetch');

		/**
		 * Fetches the Bandcamp Daily page returned by `utils.getDailyUrl()` and
		 * hands its HTML to `parser.parseArticleCategories()`.
		 *
		 * @returns {Promise<*>} Resolves with whatever `parser.parseArticleCategories()` returns.
		 */
		async function getArticleCategories() {
			const dailyPageHtml = await _fetchPage(utils.getDailyUrl());
			return parser.parseArticleCategories(dailyPageHtml);
		}

		/**
		 * Fetches a Bandcamp Daily article list page and parses it.
		 *
		 * @param {Object} [params] - Forwarded to `utils.getDailyUrl()`.
		 * @param {string} [params.categoryUrl] - Category page to list. When null or
		 *   undefined, the 'latest' URL resolved against the Daily base URL is used.
		 * @param {Object} [options]
		 * @param {*} [options.imageFormat] - Resolved through `_parseImageFormatArg()`.
		 * @returns {Promise<*>} Resolves with whatever `parser.parseArticleList()` returns.
		 */
		async function getArticleList(params = {}, options = {}) {
			// Work on a copy so the default category URL is never written back
			// into the object owned by the caller.
			const urlParams = Object.assign({}, params);
			if (urlParams.categoryUrl == null) {
				urlParams.categoryUrl = utils.getUrl('latest', utils.getDailyUrl());
			}
			const opts = {
				imageFormat: await _parseImageFormatArg(options.imageFormat)
			};
			const html = await _fetchPage(utils.getDailyUrl(urlParams));
			return parser.parseArticleList(html, opts);
		}

		/**
		 * Fetches the page at `articleUrl` and parses it with `parser.parseArticle()`.
		 *
		 * @param {string} articleUrl - URL handed to `_fetchPage()`.
		 * @param {Object} [options]
		 * @param {*} [options.albumImageFormat] - Resolved through `_parseImageFormatArg()`
		 *   with 9 as second argument (presumably the fallback format - confirm against that helper).
		 * @param {*} [options.artistImageFormat] - Resolved through `_parseImageFormatArg()`
		 *   with 21 as second argument (presumably the fallback format - confirm against that helper).
		 * @param {boolean} [options.includeRawData] - Coerced to a strict boolean before
		 *   being passed to the parser.
		 * @returns {Promise<*>} Resolves with whatever `parser.parseArticle()` returns.
		 */
		async function getArticle(articleUrl, options = {}) {
			const { baseUrl: imageBaseUrl } = await _getImageConstants();
			const albumImageFormat = await _parseImageFormatArg(options.albumImageFormat, 9);
			const artistImageFormat = await _parseImageFormatArg(options.artistImageFormat, 21);
			const parserOptions = {
				imageBaseUrl,
				albumImageFormat,
				artistImageFormat,
				includeRawData: Boolean(options.includeRawData)
			};
			const articleHtml = await _fetchPage(articleUrl);
			return parser.parseArticle(articleHtml, parserOptions);
		}

		async function _fetchPage(url, json = false) {
		@@ -295,2 +323,5 @@ return _cache.getOrSet('page', url + (json ? ':json' : ':html'), () => {
		getShow,
		getArticleCategories,
		getArticleList,
		getArticle
		};

314

lib/parser.js

		const cheerio = require('cheerio');
		const {decode} = require('html-entities');
		const utils = require('./utils.js');
		const {URL} = require('url');
		const {EOL} = require('os');

		@@ -192,3 +192,3 @@ // https://github.com/masterT/bandcamp-scraper/blob/master/lib/htmlParser.js
		url: trackUrl,
		duration: track.item.duration_secs,
		duration: getAdditionalPropertyValue(track.item, 'duration_secs'),
		streamUrl: _getStreamUrl(track.item.url)
		@@ -237,3 +237,3 @@ });
		releaseDate: extra.current.release_date,
		duration: basic.duration_secs,
		duration: getAdditionalPropertyValue(basic, 'duration_secs'),
		streamUrl: extra.trackinfo && extra.trackinfo[0] && extra.trackinfo[0].file && extra.trackinfo[0].file['mp3-128'] ? extra.trackinfo[0].file['mp3-128'] : null,
		@@ -294,2 +294,12 @@ artist: {

		/**
		 * Looks up a named entry in `o.additionalProperty` and returns its `value`.
		 *
		 * @param {Object} o - Object that may carry an `additionalProperty` array of
		 *   `{name, value}` entries.
		 * @param {string} propName - Name of the entry to look for (strict equality).
		 * @returns {*} The `value` of the first entry whose `name` matches, or
		 *   `undefined` when `additionalProperty` is not an array, no entry matches,
		 *   or the matching entry has no defined `value`.
		 */
		function getAdditionalPropertyValue(o, propName) {
			const entries = o.additionalProperty;
			if (!Array.isArray(entries)) {
				return undefined;
			}
			const match = entries.find( entry => entry.name === propName );
			return match && match.value !== undefined ? match.value : undefined;
		}

		function parseDiscography(html, opts) {
		@@ -708,2 +718,297 @@ const $ = cheerio.load(html);

		/**
		 * Parses the category index found under `#daily-view-all`.
		 *
		 * Each direct `<section>` child becomes an object with:
		 *   - `name`: the section's `class` attribute
		 *   - `title`: text of the immediately preceding `<h2>` sibling, or ''
		 *   - `sections`: nested `<section>` children, parsed recursively
		 *   - `categories`: `{url, name}` for every `<a>` inside direct `<div>` children
		 * Empty `sections` / `categories` keys are removed, and a section that ends
		 * up with neither is dropped altogether.
		 *
		 * @param {string} html - Page markup.
		 * @returns {Object[]} Parsed top-level sections.
		 */
		function parseArticleCategories(html) {
			const $ = cheerio.load(html);
			const baseUrl = utils.getDailyUrl();

			// Converts one <a> element into a category entry; relative hrefs are
			// resolved against the Daily base URL.
			const toCategory = (anchorEl) => {
				const anchor = $(anchorEl);
				const href = anchor.attr('href');
				return {
					url: utils.isAbsoluteUrl(href) ? href : utils.getUrl(href, baseUrl),
					name: anchor.text()
				};
			};

			// Recursively converts a <section> element; returns null for sections
			// that contain neither subsections nor categories.
			const parseSectionNode = (sectionNode) => {
				const heading = sectionNode.prev('h2');
				const parsed = {
					name: sectionNode.attr('class'),
					title: heading.length ? heading.text() : '',
					sections: [],
					categories: []
				};

				sectionNode.children().each( (index, child) => {
					if (child.tagName === 'section') {
						const nested = parseSectionNode($(child));
						if (nested !== null) {
							parsed.sections.push(nested);
						}
					}
					else if (child.tagName === 'div') {
						$(child).find('a').each( (linkIndex, anchorEl) => {
							parsed.categories.push(toCategory(anchorEl));
						});
					}
				});

				const hasSections = parsed.sections.length > 0;
				const hasCategories = parsed.categories.length > 0;
				if (!hasSections && !hasCategories) {
					return null;
				}
				if (!hasSections) {
					delete parsed.sections;
				}
				if (!hasCategories) {
					delete parsed.categories;
				}
				return parsed;
			};

			const results = [];
			$('#daily-view-all').children('section').each( (index, sectionEl) => {
				const parsed = parseSectionNode($(sectionEl));
				if (parsed !== null) {
					results.push(parsed);
				}
			});
			return results;
		}

		/**
		 * Parses a Bandcamp Daily article list page.
		 *
		 * @param {string} html - Page markup.
		 * @param {Object} opts
		 * @param {*} opts.imageFormat - Passed through to `utils.reformatImageUrl()`.
		 * @returns {{articles: Object[], total: number, start: number, end: number}}
		 *   `articles` holds `{url, title, date, imageUrl, category}` entries.
		 *   `total`, `start` and `end` come from the "X to Y of Z" text in
		 *   `#num-results` and stay 0 when that text is missing or does not match.
		 */
		function parseArticleList(html, opts) {
			const $ = cheerio.load(html);
			const dailyUrl = utils.getDailyUrl();
			const results = {
				articles: [],
				total: 0,
				start: 0,
				end: 0
			};

			$('articles-list').each( (i, list) => {
				$('.list-article', $(list)).each( (j, articleEl) => {
					const article = $(articleEl);

					// title and url
					const titleLink = article.find('a.title');
					// A cheerio selection is always truthy, so test its length to
					// find out whether the entry actually has a title link.
					if (titleLink.length === 0) {
						return;
					}
					const title = titleLink.text();
					let url = titleLink.attr('href');
					if (!utils.isAbsoluteUrl(url)) {
						url = utils.getUrl(url, dailyUrl);
					}

					const imageUrl = article.find('img').attr('src') || null;

					// category
					const infoText = article.find('.article-info-text');
					const infoTextCategoryLink = infoText.find('a.franchise');
					const infoTextMiddot = infoText.find('.middot');
					const category = {
						url: infoTextCategoryLink.attr('href') || null,
						name: infoTextCategoryLink.text() || ''
					};
					// Only resolve a url that exists; a missing category link stays null.
					if (category.url !== null && !utils.isAbsoluteUrl(category.url)) {
						category.url = utils.getUrl(category.url, dailyUrl);
					}

					// date: what remains of the info text once the category link
					// and the separator have been removed
					infoTextCategoryLink.remove();
					infoTextMiddot.remove();
					const date = utils.stripLineBreaks(infoText.text()).trim();

					results.articles.push({
						url,
						title,
						date,
						imageUrl: utils.reformatImageUrl(imageUrl, opts.imageFormat),
						category
					});
				});
			});

			const resultsText = utils.stripLineBreaks($('#num-results').text()).trim();
			const rtm = resultsText.match(/(\d+)(?:\sto\s)(\d+)(?:\sof\s)(\d+)/);
			// match() returns null when the text is absent or has another shape.
			if (rtm !== null) {
				results.total = parseInt(rtm[3], 10);
				results.start = parseInt(rtm[1], 10);
				results.end = parseInt(rtm[2], 10);
			}
			return results;
		}

		/**
		 * Parses a Bandcamp Daily article page.
		 *
		 * Data comes from three places in the markup:
		 *   - the `application/ld+json` script (headline, description, author, ...)
		 *   - the `data-player-infos` attribute of `#p-daily-article` (embedded players)
		 *   - the `<article>` body (intro paragraphs and one section per `bamplayer-art`)
		 *
		 * @param {string} html - Page markup.
		 * @param {Object} opts
		 * @param {string} opts.imageBaseUrl - Prefix used to build image URLs.
		 * @param {Object} opts.albumImageFormat - Its `id` is used in album art URLs.
		 * @param {Object} opts.artistImageFormat - Its `id` is used in artist image URLs.
		 * @param {boolean} [opts.includeRawData] - When truthy, the result gets a `raw`
		 *   property with the parsed JSON-LD, the player infos and the body HTML.
		 * @returns {Object} The article, with `mediaItems` (one per player) and
		 *   `sections` (optional intro section followed by one section per player).
		 * @throws {SyntaxError} When the JSON-LD script or the player infos are not valid JSON.
		 */
		function parseArticle(html, opts) {
			const $ = cheerio.load(html);
			const basic = JSON.parse($('script[type="application/ld+json"]').html());
			const players = JSON.parse(decode($('#p-daily-article').attr('data-player-infos')));

			const article = {
				title: basic.headline,
				description: basic.description,
				url: basic['@id'],
				imageUrl: basic.image,
				date: basic.datePublished,
				category: {
					name: basic.articleSection,
					url: null
				},
				genre: null,
				author: {
					name: basic.author.name,
					url: basic.author['@id']
				},
				mediaItems: [],
				// Replaced further down by the array of parsed sections.
				sections: []
			};

			// get genre
			const genreLink = $('.genre a');
			if (genreLink.length > 0) {
				article.genre = {
					name: genreLink.text(),
					url: genreLink.attr('href')
				};

				const genreReadMoreLink = $('.moreingenre a');
				if (genreReadMoreLink.length > 0) {
					article.genre.readMoreUrl = genreReadMoreLink.attr('href');
					if (!utils.isAbsoluteUrl(article.genre.readMoreUrl)) {
						article.genre.readMoreUrl = utils.getUrl(article.genre.readMoreUrl, utils.getDailyUrl());
					}
				}
			}

			// get category url
			const categoryLink = $('article-type a');
			if (categoryLink.length > 0) {
				article.category.url = categoryLink.attr('href');
				if (!utils.isAbsoluteUrl(article.category.url)) {
					article.category.url = utils.getUrl(article.category.url, utils.getDailyUrl());
				}
			}

			// get media items (albums and tracks featured in article)
			if (Array.isArray(players)) {
				players.forEach( player => {
					const mediaItem = {
						type: 'unknown',
						name: player.title,
						url: player.tralbum_url,
						imageUrl: '',
						featuredTrackPosition: player.featured_track_number,
						artist: {
							name: player.band_name,
							url: player.band_url,
							imageUrl: '',
							location: player.band_location
						},
						tracks: [],
						mediaItemRef: player.player_id
					};
					if (player.parent_tralbum_type === 'a') {
						mediaItem.type = 'album';
					}
					else if (player.parent_tralbum_type === 't') {
						mediaItem.type = 'track';
					}
					if (player.art_id) {
						mediaItem.imageUrl = opts.imageBaseUrl + '/img/a' + player.art_id + '_' + opts.albumImageFormat.id + '.jpg';
					}
					if (player.band_image_id) {
						mediaItem.artist.imageUrl = opts.imageBaseUrl + '/img/' + player.band_image_id + '_' + opts.artistImageFormat.id + '.jpg';
					}
					if (Array.isArray(player.tracklist)) {
						player.tracklist.forEach( trackInfo => {
							mediaItem.tracks.push({
								position: trackInfo.track_number,
								name: trackInfo.track_title,
								duration: trackInfo.audio_track_duration,
								// A track without audio_url must not abort the whole parse.
								streamUrl: (trackInfo.audio_url && trackInfo.audio_url['mp3-128']) || null
							});
						});
					}

					article.mediaItems.push(mediaItem);
				});
			}

			// Function that returns a section corresponding to a media item
			const _getSectionByPlayer = player => {
				const section = {
					heading: null,
					html: '',
					text: '',
					mediaItemRef: null
				};

				// Get heading
				const heading = player.prevUntil('bamplayer-art', 'h3, h2').first();
				if (heading.length > 0) {
					section.heading = {
						html: heading.html(),
						text: utils.stripTags(utils.brToNewLine(heading.html())).trim()
					};
				}

				// Get html and text
				const paragraphs = player.nextUntil('bamplayer-art, h3, h5, article-end', 'p');
				paragraphs.each( (i, p) => {
					p = $(p);
					section.html += (section.html !== '' ? EOL : '') + p.html();
					section.text += (section.text !== '' ? EOL + EOL : '') + p.text();
				});

				// get mediaItemRef; stays null when data-bind is missing or does
				// not reference an entry of playerMap
				const dataBind = player.attr('data-bind') || '';
				const playerIdMatch = dataBind.match(/playerMap\["(.+?)"]/);
				section.mediaItemRef = playerIdMatch ? playerIdMatch[1] : null;

				return section;
			};

			// Function that returns the introductory paragraph(s) of the article
			const _getIntroSection = articleBody => {
				const firstPlayer = articleBody.find('bamplayer-art').first();
				// NOTE(review): prevAll() may yield siblings closest-first (reverse
				// document order), which would reverse the intro - confirm.
				const paragraphs = firstPlayer.length > 0 ? firstPlayer.prevAll('p') : articleBody.find('p');
				if (paragraphs.length > 0) {
					const section = {
						html: '',
						text: ''
					};
					paragraphs.each( (i, p) => {
						p = $(p);
						section.html += (section.html !== '' ? EOL : '') + p.html();
						section.text += (section.text !== '' ? EOL + EOL : '') + p.text();
					});
					return section;
				}
				else {
					return null;
				}
			};

			// sections
			const articleBody = $('#p-daily-article article');
			const sections = [];
			const introSection = _getIntroSection(articleBody);
			if (introSection) {
				sections.push(introSection);
			}
			const bcplayers = articleBody.find('bamplayer-art');
			bcplayers.each( (i, player) => {
				sections.push(_getSectionByPlayer($(player)));
			});
			article.sections = sections;

			if (opts.includeRawData) {
				article.raw = {
					basic,
					mediaItems: players,
					body: articleBody.html()
				};
			}

			return article;
		}

		module.exports = {
		@@ -723,2 +1028,5 @@ parseDiscoverResults,
		parseShow,
		parseArticleCategories,
		parseArticleList,
		parseArticle
		};

lib/utils.js

		@@ -135,2 +135,10 @@ const querystring = require('querystring');

		/**
		 * Builds a Bandcamp Daily URL.
		 *
		 * @param {Object} [params]
		 * @param {string} [params.categoryUrl] - URL to start from; defaults to
		 *   'https://daily.bandcamp.com' when falsy.
		 * @param {number|string} [params.page] - When truthy, appended as the `page`
		 *   query parameter.
		 * @returns {string} The resulting URL.
		 */
		function getDailyUrl(params = {}) {
			let url = params.categoryUrl || 'https://daily.bandcamp.com';
			if (params.page) {
				// Extend an existing query string rather than starting a second one.
				const separator = url.includes('?') ? '&' : '?';
				url += separator + 'page=' + encodeURIComponent(params.page);
			}
			return url;
		}

		module.exports = {
		@@ -152,3 +160,4 @@ getUrl,
		getShowIdFromUrl,
		getShowUrl
		getShowUrl,
		getDailyUrl
		};

package.json

		{
		"name": "bandcamp-fetch",
		"version": "0.1.0a-20210203",
		"version": "0.1.0a-20210210",
		"description": "JS library for scraping Bandcamp content",
		@@ -5,0 +5,0 @@ "main": "lib/index.js",

README.md

		@@ -185,2 +185,31 @@ # bandcamp-fetch

		### `getArticleCategories()`

		[Example](examples/getArticleCategories.js) ([output](examples/getArticleCategories_output.txt))

		Fetches the list of Bandcamp Daily article categories. Categories are grouped into sections.

		### `getArticleList([params], [options])`

		[Example](examples/getArticleList.js) ([output](examples/getArticleList_output.txt))

		Fetches the list of Bandcamp Daily articles under the category specified by `params.categoryUrl` (or all categories if not specified).

		- `params` (optional)
		  - `categoryUrl`
		- `options` (optional)
		  - `imageFormat`

		### `getArticle(articleUrl, [options])`

		[Example](examples/getArticle.js) ([output](examples/getArticle_output.txt))

		Fetches the contents of the Bandcamp Daily article at `articleUrl`.

		- `articleUrl`
		- `options` (optional)
		  - `albumImageFormat`
		  - `artistImageFormat`
		  - `includeRawData`

		## Caching
		@@ -187,0 +216,0 @@

bandcamp-fetch - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics