transcript-parser - npm Package Compare versions

Comparing version 0.3.0 to 0.4.0

benchmark/benchmark.js

		@@ -10,2 +10,3 @@ 'use strict';
		const TranscriptParser = require('../app.js');
		const Readable = require('stream').Readable;

		@@ -18,3 +19,3 @@ const tp = new TranscriptParser();


		var s = new Readable();
		/***********************
		@@ -48,5 +49,10 @@ * Benchmarks
		console.log('Async Parse #3:', msg);
		}).catch(e => console.error(e));
		s.push(firstTranscript, 'utf8');
		s.push(null);
		console.log('Starting stream parse');
		return timePromise(() => Promise.fromCallback(cb => tp.parseStream(s, cb)) );
		}).then(msg => {
		console.log('Stream Parse #1:', msg);
		}).catch(e => console.error(e.stack));


		/***********************
		@@ -90,7 +96,7 @@ * Functions
		function readTranscript(name) {
		return fs.readFileAsync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'UTF-8'});
		return fs.readFileAsync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'utf8'});
		}

		function readTranscriptSync(name) {
		return fs.readFileSync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'UTF-8'});
		return fs.readFileSync(__dirname + '/transcripts/'+name+'.txt', {encoding: 'utf8'});
		}

105

lib/parser.js

		@@ -8,2 +8,3 @@ 'use strict';
		const Promise = require('bluebird');
		const byline = require('byline');

		@@ -21,12 +22,13 @@
		removeUnknownSpeakers: false,
		aliases: {}
		aliases: {},
		regex: {
		newLine: /\r?\n/,
		action: /\([A-Z\ ]+\)\ ?/,
		speaker: /^((?:\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?)?[A-Z\d\ \/,.\-]+?)(?:\ ?\[[A-z\ ]+\])? ?:\ ?/,
		timestamp: /\ ?\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/,
		annotation: /\[.+?\]\ ?/
		}
		};
		this.settings = _.assign(this.defaultSettings, options);
		this.regex = {
		newLine: /\r?\n/,
		action: /\([A-Z\ ]+\)\ ?/,
		speaker: /^((?:\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?)?[A-Z\d\ \/,.\-]+)(?: \[.+\])?:\ ?/,
		timestamp: /\[\d{1,2}:\d{1,2}:\d{1,2}\]\ ?/,
		annotation: /\[.+?\]\ ?/
		};
		this.regex = this.settings.regex;
		};
		@@ -45,10 +47,11 @@
		.filter(line => line.length > 0); //Remove blank lines
		lines = (this.settings.removeActions) ? lines.map(line => line.split(this.regex.action).join('')): lines;
		lines = this.settings.removeActions ? lines.map(line => removeAll(line, this.regex.action)): lines;
		if(this.settings.removeAnnotations) {
		//Remove annotations
		lines = lines.map(line => line.split(this.regex.annotation).join(''));
		lines = lines.map(line => removeAll(line, this.regex.annotation));
		} else if(this.settings.removeTimestamps) {
		//Remove timestamps
		lines = lines.map(line => line.split(this.regex.timestamp).join(''));
		lines = lines.map(line => removeAll(line, this.regex.timestamp));
		}
		lines = lines.filter(line => line.length > 0); //Remove newly blank lines

		@@ -67,4 +70,4 @@ //Output object
		if(lines[i].match(this.regex.speaker)) {
		//Regex match
		speaker = this.regex.speaker.exec(lines[i])[1];
		//Regex match - is speaker
		speaker = this.regex.speaker.exec(lines[i])[1].trim();
		//Remove the speaker from the line
		@@ -110,3 +113,3 @@ lines[i] = lines[i].replace(this.regex.speaker, '');
		if(this.settings.removeActions) {
		return Promise.map(lines, line => line.split(this.regex.action).join(''));
		return Promise.map(lines, line => removeAll(line, this.regex.action));
		}
		@@ -117,13 +120,18 @@ return Promise.resolve(lines);
		//Remove annotations
		return Promise.map(lines, line => line.split(this.regex.annotation).join(''));
		return Promise.map(lines, line => removeAll(line, this.regex.annotation));
		} else if(this.settings.removeTimestamps) {
		//Remove timestamps
		return Promise.map(lines, line => line.split(this.regex.timestamp).join(''));
		return Promise.map(lines, line => removeAll(line, this.regex.timestamp));
		}
		return Promise.resolve(lines);
		}).then(lines => {
		})
		.then(lines => {
		//Remove newly blank lines
		return Promise.filter(lines, line => line.length > 0);
		})
		.then(lines => {
		return Promise.each(lines, (line) => {
		if(line.match(this.regex.speaker)) {
		//Regex match
		speaker = this.regex.speaker.exec(line)[1];
		speaker = this.regex.speaker.exec(line)[1].trim();
		//Remove the speaker from the line
		@@ -258,1 +266,62 @@ line = line.replace(this.regex.speaker, '');
		};

		/**
		 * Parses a transcript from a readable stream, one line at a time.
		 *
		 * Each line is passed through `filterLine()`; lines that survive are
		 * attributed to the most recently seen speaker ('none' until one is seen).
		 *
		 * @param {stream.Readable} inputStream - Stream supplying the transcript text.
		 * @param {function(Error, Object)} cb - Called exactly once: with an error if
		 *   a stream emits 'error' or a line fails to process, otherwise with
		 *   `{speaker: {name: [lines]}, order: [names]}` once the stream has ended.
		 */
		proto.parseStream = function(inputStream, cb) {
		  const stream = byline.createStream(inputStream);
		  const output = {
		    //Object containing the speakers and their lines
		    speaker: {},
		    //List of the speakers, in order
		    order: []
		  };
		  var speaker = 'none';
		  var finished = false;

		  //Reports the result (or an error) to the callback at most once
		  const finish = (err) => {
		    if(finished) return;
		    finished = true;
		    if(err) return cb(err);
		    cb(null, output);
		  };

		  //Adds a single raw line to the output object
		  const processLine = (line) => {
		    var filteredLine = this.filterLine(line);
		    if(!filteredLine) return;

		    if(filteredLine.match(this.regex.speaker)) {
		      //Regex match - is speaker
		      speaker = this.regex.speaker.exec(filteredLine)[1].trim();
		      //Remove the speaker from the line
		      filteredLine = filteredLine.replace(this.regex.speaker, '');
		    }

		    //Skip the line if the speaker is unknown and unknown speakers are removed
		    if(speaker === 'none' && this.settings.removeUnknownSpeakers) return;

		    //If the speaker's key doesn't already exist, start a new list of lines
		    if(!(speaker in output.speaker)) {
		      output.speaker[speaker] = [];
		    }
		    //Add the text to the speaker's key and the speaker name to the order array
		    output.speaker[speaker].push(filteredLine);
		    output.order.push(speaker);
		  };

		  //Listen on both streams, since an error on the input stream
		  //may not be re-emitted by the line stream
		  inputStream.on('error', finish);
		  stream.on('error', finish);

		  //Only 'end' signals completion; a null from read() can also mean
		  //that the internal buffer is just temporarily empty
		  stream.on('end', () => finish());

		  stream.on('readable', () => {
		    var line;
		    try {
		      //Drain everything currently buffered
		      while(!finished && (line = stream.read()) !== null) {
		        processLine(line);
		      }
		    } catch(e) {
		      finish(e);
		    }
		  });
		};

		//Applies the removals enabled in the settings to a single line:
		//actions first, then either annotations or (only if annotations are kept) timestamps
		//Returns the remaining text, or null when nothing is left of the line
		proto.filterLine = function(line) {
		  //Non-string input (e.g. a Buffer) is converted to a string first
		  var text = (typeof line === 'string') ? line : line.toString();

		  if(this.settings.removeActions) {
		    text = removeAll(text, this.regex.action);
		  }

		  if(this.settings.removeAnnotations) {
		    text = removeAll(text, this.regex.annotation);
		  } else if(this.settings.removeTimestamps) {
		    text = removeAll(text, this.regex.timestamp);
		  }

		  return (text.length > 0) ? text : null;
		};

		//Strips every match of `regex` from `text` by splitting on the pattern
		//and gluing the remaining pieces back together
		function removeAll(text, regex) {
		  const fragments = text.split(regex);
		  var stripped = '';
		  for(var i = 0; i < fragments.length; i++) {
		    //Like Array#join, treat missing (null/undefined) pieces as empty
		    if(fragments[i] !== undefined && fragments[i] !== null) {
		      stripped += fragments[i];
		    }
		  }
		  return stripped;
		}

package.json

		{
		"name": "transcript-parser",
		"version": "0.3.0",
		"version": "0.4.0",
		"description": "Parses plaintext speech/debate/radio transcripts into JavaScript objects.",
		@@ -38,4 +38,5 @@ "main": "app.js",
		"bluebird": "^3.3.4",
		"byline": "^4.2.1",
		"lodash": "^4.9.0"
		}
		}

README.md

		@@ -18,5 +18,21 @@ transcript-parser

		//Do not use fs.readFileSync in production
		const output = tp.parseOneSync(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'}));
		console.log(output);
		//Synchronous example
		const parsed = tp.parseOneSync(fs.readFileSync('transcript.txt', {encoding: 'UTF-8'}));
		console.log(parsed);

		//Asynchronous example
		fs.readFile('transcript.txt', (err, data) => {
		if(err) return console.error('Error:', err);
		tp.parseOne(data, (err, parsed) => {
		if(err) return console.error('Error:', err);
		console.log(parsed);
		});
		});

		//Stream example
		const stream = fs.createReadStream('transcript.txt', 'utf8');
		tp.parseStream(stream, (err, parsed) => {
		if(err) return console.error('Error:', err);
		console.log(parsed);
		});

		@@ -52,2 +68,20 @@

		### .parseStream()

		The `parseStream()` method parses a [`Stream`](https://nodejs.org/api/stream.html) and returns an object representing it.

		This is the preferred method for parsing streams asynchronously as it doesn't have to load the entire transcript into memory (unlike `parseOne()`).

		#### Syntax

		`tp.parseStream(stream, callback)`

		##### Parameters

		- `stream`
		+ The `Readable` stream to read.
		- `callback(err, data)`
		+ A callback to be executed on function completion or error.


		### .parseOneSync()
		@@ -80,3 +114,3 @@
		- `callback(err, data)`
		+ A callback to be exectuted on function completion.
		+ A callback to be executed on function completion or error.

		@@ -115,3 +149,3 @@
		- `callback(err, resolved)`
		+ A callback to be executed on function completion.
		+ A callback to be executed on function completion or error.

test/parser.js

		@@ -21,2 +21,22 @@ 'use strict';

		//Tests for the stream-based parseStream method
		describe('#parseStream()', function() {
		  const tp = new TranscriptParser();

		  it('should parse a transcript correctly', function(done) {
		    //Holds the parsed output between promise steps. A local is used because
		    //arrow functions ignore the `this` set by Bluebird's .bind()
		    var result;

		    //Reading the sample first makes the test fail fast if the file is missing
		    readSample(1)
		    .then(() => {
		      const stream = fs.createReadStream(path.join(TEST_DIR, '1.txt'), 'utf8');
		      return Promise.fromCallback(cb => tp.parseStream(stream, cb));
		    }).then(parsed => {
		      result = parsed;
		      return readExpected(1);
		    }).then(expected => {
		      result.should.be.eql(JSON.parse(expected));
		      done();
		    })
		    .catch(e => done(e));
		  });
		});

		/*
		@@ -61,3 +81,3 @@ * For the synchronous parseOne method
		const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true});
		var result = parser.parseOneSync('[20:20:34] BERMAN: [2:1:41] The [first] name...');
		var result = parser.parseOneSync('[20:20:34] BERMAN [2:1:41] : The [first] name...');
		result.speaker.should.eql({
		@@ -146,3 +166,3 @@ 'BERMAN': [
		const parser = new TranscriptParser({removeAnnotations: false, removeTimestamps: true});
		parser.parseOne('[20:20:34] BERMAN: [2:1:41] The [first] name...',
		parser.parseOne('[20:20:34] BERMAN: The [first] name...',
		(err, result) => {
		@@ -345,7 +365,7 @@ if(err) return done(err);
		function readSample(sampleName) {
		return fs.readFileAsync(path.join(TEST_DIR, sampleName+'.txt'), {encoding: 'UTF-8'});
		return fs.readFileAsync(path.join(TEST_DIR, sampleName+'.txt'), {encoding: 'utf8'});
		}

		function readExpected(expectedName) {
		return fs.readFileAsync(path.join(EXPECTED_DIR, expectedName+'.txt'), {encoding: 'UTF-8'});
		return fs.readFileAsync(path.join(EXPECTED_DIR, expectedName+'.txt'), {encoding: 'utf8'});
		}

test/regex.js

		@@ -60,3 +60,3 @@ 'use strict';
		'[20:20:34] BERMAN: [2:1:41] The...'.split(regex.timestamp).join('')
		.should.equal('BERMAN: The...');
		.should.equal('BERMAN:The...');
		});
		@@ -63,0 +63,0 @@ });

test/expected/1.txt

Sorry, the diff of this file is too big to display

test/transcripts/1.txt

Sorry, the diff of this file is too big to display

transcript-parser - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes