@@ -0,130 +1,135 @@
		const Writable = require('stream').Writable
		const ebml = require('ebml')
		const ebmlBlock = require('ebml-block')
		const through = require('through2')
		const readElement = require('./lib/read-element')

		// track elements we care about
		const TRACK_ELEMENTS = ['TrackNumber', 'TrackType', 'Language', 'CodecID', 'CodecPrivate']

		const SUBTITLE_TYPES = ['S_TEXT/UTF8', 'S_TEXT/SSA', 'S_TEXT/ASS']
		const ASS_KEYS = ['readOrder', 'layer', 'style', 'name', 'marginL', 'marginR', 'marginV', 'effect', 'text']

		module.exports = function () {
		const subtitleTracks = new Map()
		const decoder = new ebml.Decoder()
		class MatroskaSubtitles extends Writable {
		constructor (prevInstance) {
		super()

		var timecodeScale = 1
		var currentTrack = null
		var currentSubtitleBlock = null
		var currentClusterTimecode = null

		var currentTrack
		var currentSubtitleBlock
		var currentClusterTimecode
		this.decoder = new ebml.Decoder()

		decoder.on('data', function (chunk) {
		// Segment Information //

		if (chunk[1].name === 'TimecodeScale') {
		timecodeScale = readData(chunk) / 1000000
		if (prevInstance instanceof MatroskaSubtitles) {
		prevInstance.end()
		// copy previous metadata
		this.subtitleTracks = prevInstance.subtitleTracks
		this.timecodeScale = prevInstance.timecodeScale
		this.decoder.on('data', _onClusterData)
		} else {
		this.subtitleTracks = new Map()
		this.timecodeScale = 1
		this.decoder.on('data', _onMetaData)
		}

		// Clusters //
		var self = this

		// TODO: assuming this is a Cluster `Timecode`
		if (chunk[1].name === 'Timecode') {
		currentClusterTimecode = readData(chunk)
		}
		function _onMetaData (chunk) {
		// Segment Information
		if (chunk[1].name === 'TimecodeScale') {
		self.timecodeScale = readElement(chunk[1]) / 1000000
		}

		// Tracks //
		// Tracks
		if (chunk[0] === 'start' && chunk[1].name === 'TrackEntry') {
		currentTrack = {}
		}

		if (chunk[0] === 'start' && chunk[1].name === 'TrackEntry') {
		currentTrack = {}
		}
		if (currentTrack && chunk[0] === 'tag') {
		// save info about track currently being scanned
		if (TRACK_ELEMENTS.includes(chunk[1].name)) {
		currentTrack[chunk[1].name] = readElement(chunk[1])
		}
		}

		if (chunk[0] === 'end' && chunk[1].name === 'TrackEntry') {
		// 0x11: Subtitle Track, S_TEXT/UTF8: SRT format
		if (currentTrack.TrackType === 0x11) {
		if (currentTrack.CodecID === 'S_TEXT/UTF8' \|\| currentTrack.CodecID === 'S_TEXT/ASS') {
		subtitleTracks.set(currentTrack.TrackNumber, currentTrack.CodecID)
		var info = {
		track: currentTrack.TrackNumber,
		language: currentTrack.Language,
		type: currentTrack.CodecID.substring(7)
		if (chunk[0] === 'end' && chunk[1].name === 'TrackEntry') {
		if (currentTrack.TrackType === 0x11) { // Subtitle Track
		if (SUBTITLE_TYPES.includes(currentTrack.CodecID)) {
		var track = {
		number: currentTrack.TrackNumber,
		language: currentTrack.Language,
		type: currentTrack.CodecID.substring(7).toLowerCase()
		}

		if (currentTrack.CodecPrivate) {
		// only SSA/ASS
		track.header = currentTrack.CodecPrivate.toString('utf8')
		}

		self.subtitleTracks.set(currentTrack.TrackNumber, track)
		}
		if (currentTrack.CodecPrivate) {
		// only SSA/ASS
		info.header = currentTrack.CodecPrivate.toString('utf8')
		}
		stream.push(['new', info])
		}
		currentTrack = null
		}
		currentTrack = null
		}

		if (currentTrack && chunk[0] === 'tag') {
		// save info about track currently being scanned
		if (TRACK_ELEMENTS.includes(chunk[1].name)) {
		currentTrack[chunk[1].name] = readData(chunk)
		if (chunk[0] === 'end' && chunk[1].name === 'Tracks') {
		self.decoder.removeListener('data', _onMetaData)

		if (self.subtitleTracks.size <= 0) return self.end()

		self.decoder.on('data', _onClusterData)
		self.emit('tracks', Array.from(self.subtitleTracks.values()))
		}
		}

		// Blocks //
		function _onClusterData (chunk) {
		// TODO: assuming this is a Cluster `Timecode`
		if (chunk[1].name === 'Timecode') {
		currentClusterTimecode = readElement(chunk[1])
		}

		if (chunk[1].name === 'Block') {
		var block = ebmlBlock(chunk[1].data)
		if (chunk[1].name === 'Block') {
		var block = ebmlBlock(chunk[1].data)

		if (subtitleTracks.has(block.trackNumber)) {
		var type = subtitleTracks.get(block.trackNumber)
		if (self.subtitleTracks.has(block.trackNumber)) {
		var type = self.subtitleTracks.get(block.trackNumber).type

		// TODO: would a subtitle track ever use lacing? We just take the first (only) frame.
		var subtitle = {
		text: block.frames[0].toString('utf8'),
		time: (block.timecode + currentClusterTimecode) * timecodeScale
		}
		var subtitle = {
		text: block.frames[0].toString('utf8'),
		time: (block.timecode + currentClusterTimecode) * self.timecodeScale
		}

		if (type === 'S_TEXT/ASS') {
		// extract ASS keys
		var values = subtitle.text.split(',')
		// ignore read-order
		for (var i = 1; i < 9; i++) {
		subtitle[ASS_KEYS[i]] = values[i]
		if (type === 'ass' \|\| type === 'ssa') {
		// extract SSA/ASS keys
		var values = subtitle.text.split(',')
		// ignore read-order, and skip layer if ssa
		var i = type === 'ssa' ? 2 : 1
		for (; i < 9; i++) {
		subtitle[ASS_KEYS[i]] = values[i]
		}
		// re-append any remaining text that was split off at commas
		for (i = 9; i < values.length; i++) {
		subtitle.text += ',' + values[i]
		}
		}
		// re-append any remaining text that was split off at commas
		for (i = 9; i < values.length; i++) {
		subtitle.text += ',' + values[i]
		}

		currentSubtitleBlock = [subtitle, block.trackNumber]
		}

		currentSubtitleBlock = [block.trackNumber, subtitle]
		}
		}

		// TODO: assuming `BlockDuration` exists and always comes after `Block`
		if (currentSubtitleBlock && chunk[1].name === 'BlockDuration') {
		currentSubtitleBlock[1].duration = readData(chunk) * timecodeScale
		// TODO: assuming `BlockDuration` exists and always comes after `Block`
		if (currentSubtitleBlock && chunk[1].name === 'BlockDuration') {
		currentSubtitleBlock[0].duration = readElement(chunk[1]) * self.timecodeScale

		stream.push(currentSubtitleBlock)
		self.emit('subtitle', ...currentSubtitleBlock)

		currentSubtitleBlock = null
		currentSubtitleBlock = null
		}
		}
		})
		}

		// create object stream
		var stream = through.obj(function write (chunk, _, callback) {
		decoder.write(chunk)
		callback()
		})

		return stream
		// Writable#_write implementation: forwards every incoming chunk to this
		// instance's EBML decoder. The second argument is ignored.
		_write (chunk, _, callback) {
		this.decoder.write(chunk)
		// signal completion immediately, without an error
		callback(null)
		}
		}

		/**
		 * Decode the payload of an EBML decoder event according to the element's type tag.
		 *
		 * @param {Array} chunk - decoder event; `chunk[1]` carries `type`, `data` (a Buffer)
		 *   and, for unsigned integers, `dataSize` (payload length in bytes)
		 * @returns {Buffer|string|number|undefined} the raw buffer for 'b', an ASCII string
		 *   for 's', a UTF-8 string for '8', a number for 'u', or `undefined` (after
		 *   logging to stderr) for any other type
		 */
		function readData (chunk) {
			const element = chunk[1]

			switch (element.type) {
				case 'b':
					return element.data
				case 's':
					return element.data.toString('ascii')
				case '8':
					return element.data.toString('utf8')
				case 'u': {
					const size = element.dataSize

					// Buffer#readUIntBE only accepts byte lengths 1..6; use it when possible
					if (size >= 1 && size <= 6) {
						return element.data.readUIntBE(0, size)
					}

					// EBML unsigned integers may be 0..8 octets wide. A zero-length element
					// decodes to 0; 7-8 byte values are folded manually (big-endian).
					// Results above Number.MAX_SAFE_INTEGER lose precision.
					let value = 0
					for (let offset = 0; offset < size; offset++) {
						value = value * 256 + element.data.readUInt8(offset)
					}
					return value
				}
				default:
					console.error('Unsupported data:', chunk)
					return undefined
			}
		}
		module.exports = MatroskaSubtitles

package.json

		{
		"name": "matroska-subtitles",
		"version": "1.1.2",
		"description": "Transform stream for parsing embedded .mkv subtitles.",
		"version": "2.0.0",
		"description": "Writable stream for parsing embedded .mkv subtitles.",
		"main": "index.js",
		"dependencies": {
		"ebml": "^2.2.0",
		"ebml-block": "^1.0.0",
		"through2": "^2.0.1"
		"ebml-block": "^1.0.0"
		},
		@@ -32,3 +31,6 @@ "devDependencies": {},
		},
		"homepage": "https://github.com/mathiasvr/matroska-subtitles#readme"
		"homepage": "https://github.com/mathiasvr/matroska-subtitles#readme",
		"directories": {
		"example": "examples"
		}
		}

106

README.md

		@@ -10,102 +10,62 @@ # matroska-subtitles [![npm][npm-img]][npm-url] [![dependencies][dep-img]][dep-url] [![license][lic-img]][lic-url]

		Transform stream for parsing embedded .mkv subtitles.
		Writable stream for parsing embedded .mkv subtitles.

		> Currently supports extraction of the .srt and .ass formats.
		Supported formats: `.srt`, `.ssa`, `.ass`.

		## install

		```
		```bash
		npm install matroska-subtitles
		```

		## documentation
		## example

		The `data` event of the stream will emit an array that determines the type of the data.
		When a new subtitle track is encountered, the track number, language, type and optionally a header are emitted:

		```
		data = [ 'new', { track: <track number>, language: <string>, type: <string>, header: <string> } ]
		```

		Subsequently a specific subtitle track will emit data of this form:
		```
		data = [ <track number>, { text: <string>, time: <ms>, duration: <ms> } ]
		```

		## examples

		### dump all subtitles

		```javascript
		const fs = require('fs')
		const matroskaSubtitles = require('matroska-subtitles')
		const MatroskaSubtitles = require('matroska-subtitles')

		var subs = matroskaSubtitles()
		var parser = new MatroskaSubtitles()

		subs.on('data', function (data) {
		console.log(data)
		// first an array of subtitle track information is emitted
		parser.once('tracks', function (tracks) {
		console.log(tracks)
		})

		fs.createReadStream('Sintel.2010.720p.mkv').pipe(subs)
		// afterwards each subtitle is emitted
		parser.on('subtitle', function (subtitle, trackNumber) {
		console.log('Track ' + trackNumber + ':', subtitle)
		})

		fs.createReadStream('Sintel.2010.720p.mkv').pipe(parser)
		```

		### group subtitle tracks
		### `tracks` event response format

		The following is an example of extracting subtitle tracks of an mkv:

		```javascript
		const fs = require('fs')
		const matroskaSubtitles = require('matroska-subtitles')

		var tracks = new Map()
		var subs = matroskaSubtitles()

		subs.on('data', function (data) {
		if (data[0] === 'new') {
		var key = data[1].track
		tracks.set(key, {
		language: data[1].language,
		subtitles: []
		})
		} else {
		var key = data[0]
		var subtitle = data[1]
		tracks.get(key).subtitles.push(subtitle)
		}
		})

		subs.on('end', function () {
		tracks.forEach((track) => console.log(track))
		})

		fs.createReadStream('Sintel.2010.720p.mkv').pipe(subs)
		[
		{ number: 3, language: 'eng', type: 'utf8' },
		{ number: 4, language: 'jpn', type: 'ass', header: '[Script Info]\r\n...' }
		]
		```

		> Note that this example doesn't take advantage of streaming, since the subtitles are only output once the stream has ended.
		> Note that the `language` may be `undefined` if the mkv track doesn't specify it.

		### response
		### `subtitle` event response format

		The response of this example would look like this:
		```javascript
		{ language: 'eng',
		subtitles:
		[ { text: 'This blade has a dark past.',
		time: 107250,
		duration: 1970 },
		{ text: 'It has shed much innocent blood.',
		time: 111800,
		duration: 4000 },
		{ text: 'You\'re a fool for traveling alone,\r\nso completely unprepared.',
		time: 118000,
		duration: 3450 } ] }
		{
		text: 'This blade has a dark past.',
		time: 107250, // ms
		duration: 1970 // ms
		}
		```

		> Note that the `language` might be `undefined` if the mkv track has not specified it.
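		> Subtitles from `.ssa`/`.ass` tracks also contain the format-specific fields `layer` (`.ass` only), `style`, `name`, `marginL`, `marginR`, `marginV` and `effect`.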

		## contributing
		## random access
		The parser must obtain the `tracks` metadata event before it can begin to emit subtitles.
		To read subtitles from a specific position in the stream,
		you can pass in a previous instance as parameter: `parser = new MatroskaSubtitles(parser)`
		after the `tracks` event and pipe from a given position. See `examples/random-access.js` for an example.

		This is still a work in progress.

		If you find a bug or have suggestions feel free to create an issue or a pull request!

		## see also
		@@ -112,0 +72,0 @@

matroska-subtitles - npm Package Compare versions

New alerts

Improved metrics

Worsened metrics

Dependency changes