---
		name: Bug report
		about: Create a report to help us improve this npm package

		---
		@@ -20,7 +19,8 @@
		Desktop (please complete the following information):
		- OS: [e.g. iOS]
		- Browser [e.g. chrome, safari]
		- Version [e.g. 22]

		- OS: (e.g. iOS)
		- Browser: (e.g. chrome, safari)
		- Version: (e.g. 22)

		Additional context
		Add any other context about the problem here.

lib/ColumnsParser.js

		@@ -10,3 +10,3 @@ /**

		function getColumnIndex(cols, x){
		function getColumnIndex(cols, x) {
		var bestDist = null;
		@@ -17,4 +17,3 @@ for (var i = 0; i < cols.length; ++i) {
		break;
		}
		else {
		} else {
		bestDist = dist;
		@@ -26,9 +25,9 @@ }

		function ColumnsParser(colNames){
		function ColumnsParser(colNames) {
		this.cols = [];
		var cols = this.cols,
		colNames = colNames.slice(), // clone (for parameter immutability)
		line = -1; // -1 = header
		colNames = colNames.slice(), // clone (for parameter immutability)
		line = -1; // -1 = header

		this.processItem = function(item){
		this.processItem = function(item) {
		if (line == -1) {
		@@ -50,9 +49,8 @@ // parse x-position of column headers
		}
		}
		else {
		} else {
		cols[getColumnIndex(cols, item.x)].items.push(item);
		}
		};
		};
		}

		module.exports = ColumnsParser;

lib/LOG.js

		@@ -9,5 +9,5 @@ /**

		var nullLog = function LOG(){};
		var nullLog = function LOG() {};

		var realLog = function LOG(){
		var realLog = function LOG() {
		for (var i in arguments)
		@@ -21,9 +21,9 @@ if (arguments[i] instanceof Object \|\| arguments[i] instanceof Array)

		module.exports = function(){
		module.exports = function() {
		LOG.apply(null, arguments);
		};

		module.exports.toggle = function(enabled){
		module.exports.toggle = function(enabled) {
		LOG = !enabled ? nullLog : realLog;
		return module.exports;
		};

lib/parseColumns.js

		@@ -11,16 +11,15 @@ /**

		module.exports = function(/* columns */){
		module.exports = function(/* columns */) {
		this.output = [];
		this.cols = Array.prototype.slice.apply(arguments);
		var colNames = this.cols,
		colX = [],
		rows = this.output,
		line = -1, // header
		lineY = null;
		function processItem(item){
		colX = [],
		rows = this.output,
		line = -1, // header
		lineY = null;
		function processItem(item) {
		if (line == -1) {
		// parse x-position of column headers
		var i = colNames.indexOf(item.text);
		if (i > -1)
		colX[i] = item.x;
		if (i > -1) colX[i] = item.x;
		if (colX.length == colNames.length) {
		@@ -30,8 +29,6 @@ LOG("table header:", colNames, colX);
		}
		}
		else {
		} else {
		if (lineY === null) {
		lineY = item.y;
		}
		else if (lineY != item.y) {
		} else if (lineY != item.y) {
		lineY = item.y;
		@@ -42,3 +39,3 @@ line++;
		var col = 0;
		for (var i=colX.length-1; i>=0; --i)
		for (var i = colX.length - 1; i >= 0; --i)
		if (item.x > colX[i]) {
		@@ -51,5 +48,5 @@ col = i;
		}
		};
		}
		processItem(this.currentItem); // apply on header's first item
		return processItem; // then the same function will be run on all following items, until another rule is triggered
		};

116

lib/parseTable.js

		@@ -10,13 +10,13 @@ /**

		function getTopPos(item){
		function getTopPos(item) {
		return item.y;
		}

		function getLeftPos(item){
		function getLeftPos(item) {
		return item.x;
		}

		function getText(item){
		function getText(item) {
		return item.text;
		};
		}

		@@ -29,23 +29,24 @@ /**
		**/
		function makeFloorClassifier(nbClusters, arr){
		function makeFloorClassifier(nbClusters, arr) {
		var min = Math.min.apply(Math, arr);
		var delta = Math.max.apply(Math, arr) - min;
		min -= (delta / nbClusters) / 2;
		return function classify(value){
		return Math.floor(nbClusters * (value - min) / delta);
		min -= delta / nbClusters / 2;
		return function classify(value) {
		return Math.floor((nbClusters * (value - min)) / delta);
		};
		}

		function makeColumnClassifier(header){
		var colX = [0].concat(header.map(getLeftPos)).sort(function(a,b){return a-b;});
		return function classify(item){
		for (var i=colX.length-1; i>-1; --i)
		if (getLeftPos(item) >= colX[i])
		return i;
		function makeColumnClassifier(header) {
		var colX = [0].concat(header.map(getLeftPos)).sort(function(a, b) {
		return a - b;
		});
		return function classify(item) {
		for (var i = colX.length - 1; i > -1; --i)
		if (getLeftPos(item) >= colX[i]) return i;
		};
		}

		function buildRowList(items, classifyRow){
		function buildRowList(items, classifyRow) {
		var rows = [];
		for (var i in items){
		for (var i in items) {
		var item = items[i];
		@@ -58,44 +59,55 @@ var row = classifyRow(getTopPos(item));

		function joinCellCollisions(separ){
		return function(cell){
		return (cell \|\| []).map(getText).join(separ).substr(0, 7);
		function joinCellCollisions(separ) {
		return function(cell) {
		return (cell \|\| [])
		.map(getText)
		.join(separ)
		.substr(0, 7);
		};
		}

		function fillTab(str){
		function fillTab(str) {
		return str.substr(0, 7);
		}

		function renderTable(table){
		return (table \|\| []).map(function(row){
		return (row \|\| []).map(fillTab).join("\t");
		}).join("\n");
		function renderTable(table) {
		return (table \|\| [])
		.map(function(row) {
		return (row \|\| []).map(fillTab).join("\t");
		})
		.join("\n");
		}

		function renderMatrix(matrix){
		return (matrix \|\| []).map(function(row){
		return (row \|\| []).map(joinCellCollisions("+")).join("\t");
		}).join("\n");
		function renderMatrix(matrix) {
		return (matrix \|\| [])
		.map(function(row) {
		return (row \|\| []).map(joinCellCollisions("+")).join("\t");
		})
		.join("\n");
		}

		function renderRows(rows){
		return (rows \|\| []).map(function(row, rowId){
		var cells = [ rowId + ":" ];
		for (var i in row)
		cells.push((Math.floor(row[i].x) + ":" + row[i].text).substr(0, 7));
		return cells.join("\t");
		}).join("\n");
		function renderRows(rows) {
		return (rows \|\| [])
		.map(function(row, rowId) {
		var cells = [rowId + ":"];
		for (var i in row)
		cells.push((Math.floor(row[i].x) + ":" + row[i].text).substr(0, 7));
		return cells.join("\t");
		})
		.join("\n");
		}

		function renderItems(items) {
		return items.map(function(i){
		return [i.y, i.x, i.text].join("\t");
		}).join("\n");
		return items
		.map(function(i) {
		return [i.y, i.x, i.text].join("\t");
		})
		.join("\n");
		}

		function buildMatrix(rows, classifyColumn){
		function buildMatrix(rows, classifyColumn) {
		var matrix = [];
		for (var y in rows){
		for (var y in rows) {
		var row = [];
		for (var x in rows[y]){
		for (var x in rows[y]) {
		var item = rows[y][x];
		@@ -110,6 +122,6 @@ var colN = classifyColumn(item);

		function detectCollisions(matrix){
		function detectCollisions(matrix) {
		var collisions = [];
		(matrix \|\| []).map(function(row, rowN){
		(row \|\| []).map(function(cellItems, colN){
		(matrix \|\| []).map(function(row, rowN) {
		(row \|\| []).map(function(cellItems, colN) {
		if (cellItems.length > 1)
		@@ -126,6 +138,5 @@ collisions.push({

		function makeAccumulator(nbRows, headerRow){

		function makeAccumulator(nbRows, headerRow) {
		var rule = this,
		items = [];
		items = [];

		@@ -137,10 +148,10 @@ rule.nbRows = nbRows \|\| 0;
		matrix: null
		};
		};

		function accumulate(item){
		function accumulate(item) {
		items.push(item);
		};
		}

		// when parsing is done: generate a clean table, from items.
		rule.whenDone(function(){
		rule.whenDone(function() {
		// classify items into rows
		@@ -153,7 +164,6 @@ var classifyRow = makeFloorClassifier(rule.nbRows, items.map(getTopPos));
		this.output.matrix = buildMatrix(this.output.rows, classifyColumn);

		});

		return accumulate; // then the same function will be run on all following items, until another rule is triggered
		};
		}

		@@ -160,0 +170,0 @@ module.exports = makeAccumulator;

lib/SequentialParser.js

		@@ -6,3 +6,3 @@ /**
		**/
		function SequentialParser(accumulators, callback){
		function SequentialParser(accumulators, callback) {
		var step = 0;
		@@ -12,8 +12,10 @@ var fields = {};
		fields: fields,
		addField: function(key, value){
		addField: function(key, value) {
		this.fields[key] = value;
		},
		parseItem: function(item){
		parseItem: function(item) {
		if (step >= accumulators.length) {
		return console.warn("warning: skipping item, because SequentialParser is done.");
		return console.warn(
		"warning: skipping item, because SequentialParser is done."
		);
		}
		@@ -24,10 +26,7 @@ var current = accumulators[step];
		++step;
		}
		else if (current.accumulator) {
		} else if (current.accumulator) {
		var doneAccumulating = current.accumulator(item, this);
		if (doneAccumulating)
		++step;
		}
		else // no action => skip item
		++step;
		if (doneAccumulating) ++step;
		} // no action => skip item
		else ++step;
		if (!item \|\| step >= accumulators.length) {
		@@ -34,0 +33,0 @@ callback && callback(null, this);

lib/TableParser.js

		@@ -9,23 +9,26 @@ /**

		function TableParser(){
		function TableParser() {
		this.rows = {};
		};
		}

		TableParser.prototype.processItem = function(item, col){
		var row = this.rows[""+item.y] = this.rows[""+item.y] \|\| {};
		TableParser.prototype.processItem = function(item, col) {
		var row = (this.rows["" + item.y] = this.rows["" + item.y] \|\| {});
		(row[col] = row[col] \|\| []).push(item);
		}
		};

		TableParser.prototype.processHeadingItem = function(item, col){
		this.processItem({
		y: 0,
		x: item.x,
		text: item.text
		}, col);
		}
		TableParser.prototype.processHeadingItem = function(item, col) {
		this.processItem(
		{
		y: 0,
		x: item.x,
		text: item.text
		},
		col
		);
		};

		// Rows

		function sortAsFloatValues(values){
		return values.slice().sort(function(a, b){
		function sortAsFloatValues(values) {
		return values.slice().sort(function(a, b) {
		return parseFloat(a) - parseFloat(b);
		@@ -35,40 +38,39 @@ });

		TableParser.prototype.getRows = function(){
		TableParser.prototype.getRows = function() {
		var rows = this.rows;
		var yValues = sortAsFloatValues(Object.keys(rows));
		return yValues.map(function(y){
		return yValues.map(function(y) {
		return rows["" + y];
		});
		}
		};

		function renderRows(rows){
		return (rows \|\| []).map(function(row, rowId){
		var cells = [];
		for (var i in row)
		for (var j in row[i])
		cells.push(row[i][j].x + ": " + row[i][j].text);
		return rowId + ":\t" + cells.join(", ");
		}).join("\n");
		function renderRows(rows) {
		return (rows \|\| [])
		.map(function(row, rowId) {
		var cells = [];
		for (var i in row)
		for (var j in row[i]) cells.push(row[i][j].x + ": " + row[i][j].text);
		return rowId + ":\t" + cells.join(", ");
		})
		.join("\n");
		}

		TableParser.prototype.renderRows = function(){
		TableParser.prototype.renderRows = function() {
		return renderRows(this.getRows());
		}
		};

		// Matrix

		function getSortedXValues(rows){
		function getSortedXValues(rows) {
		var xSet = {};
		for (var y in rows)
		for (var x in rows[y])
		xSet[x] = true;
		for (var y in rows) for (var x in rows[y]) xSet[x] = true;
		return sortAsFloatValues(Object.keys(xSet));
		}

		TableParser.prototype.getMatrix = function(){
		TableParser.prototype.getMatrix = function() {
		var rows = this.getRows();
		var xValues = getSortedXValues(rows);
		return rows.map(function(row, y){
		return rows.map(function(row, y) {
		var rowNew = [];
		for (var x in row){
		for (var x in row) {
		var items = row[x];
		@@ -80,24 +82,29 @@ var colN = xValues.indexOf(x);
		});
		}
		};

		function getText(item){
		function getText(item) {
		return item.text;
		};
		}

		function joinCellCollisions(separ){
		return function(cell){
		return (cell \|\| []).map(getText).join(separ).substr(0, 7);
		function joinCellCollisions(separ) {
		return function(cell) {
		return (cell \|\| [])
		.map(getText)
		.join(separ)
		.substr(0, 7);
		};
		}

		function renderMatrix(matrix){
		return (matrix \|\| []).map(function(row){
		return (row \|\| []).map(joinCellCollisions("+")).join("\t");
		}).join("\n");
		function renderMatrix(matrix) {
		return (matrix \|\| [])
		.map(function(row) {
		return (row \|\| []).map(joinCellCollisions("+")).join("\t");
		})
		.join("\n");
		}

		TableParser.prototype.renderMatrix = function(){
		TableParser.prototype.renderMatrix = function() {
		return renderMatrix(this.getMatrix());
		}
		};

		module.exports = TableParser;

package.json

		{
		"name": "pdfreader",
		"version": "1.0.0",
		"version": "1.0.2",
		"description": "Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.",
		"main": "index.js",
		"scripts": {
		"test:print:result": "echo ✅ Tests passed.",
		"prettier:print": "prettier --list-different \"./*/.js\" \"./*/.md\"",
		"prettier:check": "npm run -s prettier:print 1>&2; exit $(npm run -s prettier:print \| wc -l)",
		"prettier:fix": "prettier \"./*/.js\" \"./*/.md\" --write",
		"test:print:result": "echo ✅ All tests passed.",
		"test:diff:buffer": "node parseAsBuffer.js test/sample.pdf >test/test-buffer-snapshot.log; git --no-pager diff test/test-buffer-snapshot.log 1>&2; exit $(git --no-pager diff test/test-buffer-snapshot.log \| wc -l)",
		@@ -13,3 +16,4 @@ "test:diff:file": "node test/test.js >test/test-snapshot.log; git --no-pager diff test/test-snapshot.log 1>&2; exit $(git --no-pager diff test/test-snapshot.log \| wc -l)",
		"test:sample": "node test/test.js && node parseAsBuffer.js test/sample.pdf",
		"test": "npm run -s test:sample && npm run -s test:regression && npm run -s test:diff && npm run -s test:print:result"
		"test:functional": "npm run -s test:sample && npm run -s test:regression && npm run -s test:diff",
		"test": "npm run -s prettier:check && npm run -s test:functional && npm run -s test:print:result"
		},
		@@ -41,4 +45,10 @@ "repository": {
		"dependencies": {
		"pdf2json": "1.1.2"
		"pdf2json": "1.1.7"
		},
		"devDependencies": {
		"eslint-config-prettier": "^3.3.0",
		"eslint-plugin-prettier": "^3.0.0",
		"prettier": "1.15.3",
		"semantic-release": "^15.13.3"
		}
		}

parse.js

		var LOG = require("./lib/LOG.js").toggle(false);
		var PdfReader = require("./index.js").PdfReader;

		function printRawItems(filename, callback){
		new PdfReader().parseFileItems(filename, function(err, item){
		if (err)
		callback(err);
		else if (!item)
		callback();
		else if (item.file)
		console.log("file =", item.file.path);
		else if (item.page)
		console.log("page =", item.page);
		function printRawItems(filename, callback) {
		new PdfReader().parseFileItems(filename, function(err, item) {
		if (err) callback(err);
		else if (!item) callback();
		else if (item.file) console.log("file =", item.file.path);
		else if (item.page) console.log("page =", item.page);
		else if (item.x)
		console.log([item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join("\t"));
		else
		console.warn(item);
		console.log(
		[item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join(
		"\t"
		)
		);
		else console.warn(item);
		});
		@@ -24,8 +23,7 @@ }
		console.error("please provide the name of a PDF file");
		}
		else {
		} else {
		console.warn("printing raw items from file:", filename, "...");
		printRawItems(filename, function(){
		printRawItems(filename, function() {
		console.warn("done.");
		});
		}

parseAsBuffer.js

		var LOG = require("./lib/LOG.js").toggle(false);
		var PdfReader = require("./index.js").PdfReader;
		var fs = require('fs');
		var fs = require("fs");

		function printRawItems(pdfBuffer, callback){
		new PdfReader().parseBuffer(pdfBuffer, function(err, item){
		if (err)
		callback(err);
		else if (!item)
		callback();
		else if (item.file)
		console.log("file =", item.file.path);
		else if (item.page)
		console.log("page =", item.page);
		function printRawItems(pdfBuffer, callback) {
		new PdfReader().parseBuffer(pdfBuffer, function(err, item) {
		if (err) callback(err);
		else if (!item) callback();
		else if (item.file) console.log("file =", item.file.path);
		else if (item.page) console.log("page =", item.page);
		else if (item.x)
		console.log([item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join("\t"));
		else
		console.warn(item);
		console.log(
		[item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join(
		"\t"
		)
		);
		else console.warn(item);
		});
		@@ -25,10 +24,9 @@ }
		console.error("please provide the name of a PDF file");
		}
		else {
		} else {
		console.warn("printing raw items from file:", filename, "...");
		fs.readFile(filename, (err, pdfBuffer) => {
		printRawItems(pdfBuffer, function (){
		console.warn("done.");
		});
		printRawItems(pdfBuffer, function() {
		console.warn("done.");
		});
		});
		}

PdfReader.js

		@@ -5,3 +5,3 @@ /**
		* This content is released under the MIT License.
		*
		*
		* An item object can match one of the following objects:
		@@ -12,3 +12,3 @@ * - null, when the parsing is over, or an error occurred.
		* - {text:string, x:float, y:float, w:float, h:float...}, represents each text with its position.
		*
		*
		**/
		@@ -19,11 +19,11 @@

		function forEachItem(pdf, handler){
		function forEachItem(pdf, handler) {
		var pageNumber = 0;
		for (var p in pdf.data.Pages) {
		var page = pdf.data.Pages[p];
		for (var p in pdf.formImage.Pages) {
		var page = pdf.formImage.Pages[p];
		var number = ++pageNumber;
		handler(null, {
		page: number,
		width: pdf.data.Width,
		height:pdf.data.Pages[number-1].Height
		width: pdf.formImage.Width,
		height: pdf.formImage.Pages[number - 1].Height
		});
		@@ -39,3 +39,3 @@ for (var t in page.Texts) {

		function PdfReader(options){
		function PdfReader(options) {
		LOG("PdfReader"); // only displayed if LOG.js was first loaded with `true` as init parameter
		@@ -48,7 +48,7 @@ this.options = options \|\| {};
		**/
		PdfReader.prototype.parseFileItems = function(pdfFilePath, itemHandler){
		itemHandler(null, { file: { path: pdfFilePath }});
		PdfReader.prototype.parseFileItems = function(pdfFilePath, itemHandler) {
		itemHandler(null, { file: { path: pdfFilePath } });
		var pdfParser = new PFParser();
		pdfParser.on("pdfParser_dataError", itemHandler);
		pdfParser.on("pdfParser_dataReady", function (pdfData){
		pdfParser.on("pdfParser_dataReady", function(pdfData) {
		forEachItem(pdfData, itemHandler);
		@@ -63,7 +63,7 @@ });
		*/
		PdfReader.prototype.parseBuffer = function(pdfBuffer, itemHandler){
		itemHandler(null, { file: { buffer: pdfBuffer }});
		PdfReader.prototype.parseBuffer = function(pdfBuffer, itemHandler) {
		itemHandler(null, { file: { buffer: pdfBuffer } });
		var pdfParser = new PFParser();
		pdfParser.on("pdfParser_dataError", itemHandler);
		pdfParser.on("pdfParser_dataReady", function (pdfData){
		pdfParser.on("pdfParser_dataReady", function(pdfData) {
		forEachItem(pdfData, itemHandler);
		@@ -75,3 +75,2 @@ });


		module.exports = PdfReader;

149

README.md

		@@ -1,8 +0,6 @@
		[![CircleCI](https://circleci.com/gh/adrienjoly/npm-pdfreader.svg?style=svg)](https://circleci.com/gh/adrienjoly/npm-pdfreader)
		# pdfreader [![Continuous Integration](https://circleci.com/gh/adrienjoly/npm-pdfreader.svg?style=shield)](https://circleci.com/gh/adrienjoly/npm-pdfreader) [![Code Quality](https://api.codacy.com/project/badge/Grade/73d37dbb0ff84795acf65a55c5936d83)](https://www.codacy.com/app/adrien-joly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade)

		# pdfreader

		Read text and parse tables from PDF files.

		Supports tabular data with automatic column detection, and rule-based parsing.
		Supports tabular data with automatic column detection, and rule-based parsing.

		@@ -13,2 +11,9 @@ Dependencies: it is based on [pdf2json](https://www.npmjs.com/package/pdf2json), which itself relies on Mozilla's [pdf.js](https://github.com/mozilla/pdf.js/).

		Summary:

		- [Installation, tests and CLI usage](#installation-tests-and-cli-usage)
		- [Raw PDF reading](#raw-pdf-reading) (incl. examples)
		- [Rule-based data extraction](#rule-based-data-extraction)
		- [Troubleshooting & FAQ](#troubleshooting--faq)

		## Installation, tests and CLI usage
		@@ -23,37 +28,31 @@

		The PdfReader class reads a PDF file, and calls a function on each item found while parsing that file.
		This module exposes the `PdfReader` class, to be instantiated.

		An item object can match one of the following objects:
		Your instance has two methods for parsing a PDF. They return the same output and differ only in their input: `PdfReader.parseFileItems` (as below) takes a filename, and `PdfReader.parseBuffer` (see: "Raw PDF reading from a PDF already in memory (buffer)") takes PDF data that is already in memory, so that it does not have to be read from the filesystem.

		- `null`, when the parsing is over, or an error occurred.
		- `{file:{path:string}}`, when a PDF file is being opened.
		- `{page:integer, width:float, height:float}`, when a new page is being parsed, provides the page number, starting at 1.
		- `{text:string, x:float, y:float, w:float, h:float...}`, represents each text with its position.
		Whichever method you choose, it takes a callback, which gets called each time the instance finds what it considers to be a PDF item.

		Example:
		An item object can match one of the following objects:

		- `null`, when the parsing is over, or an error occurred.
		- File metadata, `{file:{path:string}}`, when a PDF file is being opened. This is always the first item.
		- Page metadata, `{page:integer, width:float, height:float}`, when a new page is being parsed, provides the page number, starting at 1. This basically acts as a carriage return for the coordinates of text items to be processed.
		- Text items, `{text:string, x:float, y:float, w:float, h:float...}`, which you can think of as simple objects with a text property, and floating-point 2D AABB (axis-aligned bounding box) coordinates on the page.

		It's up to your callback to process these items into a data structure of your choice, and also to handle any errors passed to it as its first argument.

		For example:

		```javascript
		new PdfReader().parseFileItems("sample.pdf", function(err, item){
		if (err)
		callback(err);
		else if (!item)
		callback();
		else if (item.text)
		console.log(item.text);
		new PdfReader().parseFileItems("sample.pdf", function(err, item) {
		if (err) callback(err);
		else if (!item) callback();
		else if (item.text) console.log(item.text);
		});
		```

		## Raw PDF reading from a PDF already in memory (buffer)
		### Raw PDF reading from a PDF already in memory (buffer)

		The PdfReader class reads a PDF file, and calls a function on each item found while parsing that file.
		As above, but reading from a buffer in memory rather than from a file referenced by path. For example:

		An item object can match one of the following objects:

		- `null`, when the parsing is over, or an error occurred.
		- `{file:{path:string}}`, when a PDF file is being opened.
		- `{page:integer}`, when a new page is being parsed, provides the page number, starting at 1.
		- `{text:string, x:float, y:float, w:float, h:float...}`, represents each text with its position.

		Example:

		```javascript
		@@ -63,9 +62,6 @@ var fs = require("fs");
		// pdfBuffer contains the file content
		new PdfReader().parseBuffer(pdfBuffer, function(err, item){
		if (err)
		callback(err);
		else if (!item)
		callback();
		else if (item.text)
		console.log(item.text);
		new PdfReader().parseBuffer(pdfBuffer, function(err, item) {
		if (err) callback(err);
		else if (!item) callback();
		else if (item.text) console.log(item.text);
		});
		@@ -75,3 +71,3 @@ });

		## Example: parsing lines of text from a PDF file
		### Example: parsing lines of text from a PDF file

		@@ -83,3 +79,3 @@ ![example cv resume parse convert pdf to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseRows.png)
		```js
		var pdfreader = require('pdfreader');
		var pdfreader = require("pdfreader");

		@@ -91,13 +87,15 @@ var rows = {}; // indexed by y-position
		.sort((y1, y2) => parseFloat(y1) - parseFloat(y2)) // sort float positions
		.forEach((y) => console.log((rows[y] \|\| []).join('')));
		.forEach(y => console.log((rows[y] \|\| []).join("")));
		}

		new pdfreader.PdfReader().parseFileItems('CV_ErhanYasar.pdf', function(err, item){
		new pdfreader.PdfReader().parseFileItems("CV_ErhanYasar.pdf", function(
		err,
		item
		) {
		if (!item \|\| item.page) {
		// end of file, or page
		printRows();
		console.log('PAGE:', item.page);
		console.log("PAGE:", item.page);
		rows = {}; // clear rows for next page
		}
		else if (item.text) {
		} else if (item.text) {
		// accumulate text items into rows object, per line
		@@ -111,3 +109,3 @@ (rows[item.y] = rows[item.y] \|\| []).push(item.text);

		## Example: parsing a table from a PDF file
		### Example: parsing a table from a PDF file

		@@ -119,29 +117,35 @@ ![example cv resume parse convert pdf table to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseTable.png)
		```js
		var pdfreader = require('pdfreader');
		var pdfreader = require("pdfreader");

		const nbCols = 2;
		const cellPadding = 40; // each cell is padded to fit 40 characters
		const columnQuantitizer = (item) => parseFloat(item.x) >= 20;
		const columnQuantitizer = item => parseFloat(item.x) >= 20;

		const padColumns = (array, nb) =>
		Array.apply(null, {length: nb}).map((val, i) => array[i] \|\| []);
		// .. because map() skips undefined elements
		Array.apply(null, { length: nb }).map((val, i) => array[i] \|\| []);
		// .. because map() skips undefined elements

		const mergeCells = (cells) => (cells \|\| [])
		.map((cell) => cell.text).join('') // merge cells
		.substr(0, cellPadding).padEnd(cellPadding, ' '); // padding
		const mergeCells = cells =>
		(cells \|\| [])
		.map(cell => cell.text)
		.join("") // merge cells
		.substr(0, cellPadding)
		.padEnd(cellPadding, " "); // padding

		const renderMatrix = (matrix) => (matrix \|\| [])
		.map((row, y) => padColumns(row, nbCols)
		.map(mergeCells)
		.join(' \| ')
		).join('\n');
		const renderMatrix = matrix =>
		(matrix \|\| [])
		.map((row, y) =>
		padColumns(row, nbCols)
		.map(mergeCells)
		.join(" \| ")
		)
		.join("\n");

		var table = new pdfreader.TableParser();

		new pdfreader.PdfReader().parseFileItems(filename, function(err, item){
		new pdfreader.PdfReader().parseFileItems(filename, function(err, item) {
		if (!item \|\| item.page) {
		// end of file, or page
		console.log(renderMatrix(table.getMatrix()));
		console.log('PAGE:', item.page);
		console.log("PAGE:", item.page);
		table = new pdfreader.TableParser(); // new/clear table for next page
		@@ -157,3 +161,2 @@ } else if (item.text) {


		## Rule-based data extraction
		@@ -169,8 +172,16 @@
		var processItem = Rule.makeItemProcessor([
		Rule.on(/^Hello \"(.*)\"$/).extractRegexpValues().then(displayValue),
		Rule.on(/^Value\:/).parseNextItemValue().then(displayValue),
		Rule.on(/^c1$/).parseTable(3).then(displayTable),
		Rule.on(/^Values\:/).accumulateAfterHeading().then(displayValue),
		Rule.on(/^Hello \"(.*)\"$/)
		.extractRegexpValues()
		.then(displayValue),
		Rule.on(/^Value\:/)
		.parseNextItemValue()
		.then(displayValue),
		Rule.on(/^c1$/)
		.parseTable(3)
		.then(displayTable),
		Rule.on(/^Values\:/)
		.accumulateAfterHeading()
		.then(displayValue)
		]);
		new PdfReader().parseFileItems("sample.pdf", function(err, item){
		new PdfReader().parseFileItems("sample.pdf", function(err, item) {
		processItem(item);
		@@ -186,3 +197,3 @@ });

		## Problem: when I use pdfreader from my express-based node.js app, I'm getting `Cannot read property 'userAgent' of undefined`.
		### `Cannot read property 'userAgent' of undefined` error from an express-based node.js app

		@@ -193,10 +204,10 @@ Dmitry found out that you may need to run these instructions before including the `pdfreader` module:
		global.navigator = {
		userAgent: 'node',
		}
		userAgent: "node"
		};

		window.navigator = {
		userAgent: 'node',
		}
		userAgent: "node"
		};
		```

		Source: [express - TypeError: Cannot read property 'userAgent' of undefined error on node.js app run - Stack Overflow](https://stackoverflow.com/questions/49208414/typeerror-cannot-read-property-useragent-of-undefined-error-on-node-js-app-ru)

Rule.js

		@@ -13,8 +13,8 @@ /**
		**/
		function Rule(regexp){
		function Rule(regexp) {
		this.regexp = regexp;
		var self = this;
		// proxy accumulators methods
		Object.keys(Rule.accumulators).forEach(function(name){
		self[name] = function(){
		Object.keys(Rule.accumulators).forEach(function(name) {
		self[name] = function() {
		LOG("building rule:", regexp, "->", name);
		@@ -30,11 +30,11 @@ self.methodName = name;
		// shortcut for defining Rule objects in a more concise manner
		Rule.on = function(regexp){
		Rule.on = function(regexp) {
		return new Rule(regexp);
		}
		};

		Rule.after = function(regexp){
		Rule.after = function(regexp) {
		var rule = new Rule(regexp);
		rule.skipCurrentItem = true;
		return rule;
		}
		};

		@@ -46,5 +46,5 @@ /**
		**/
		Rule.prototype.then = function(fct){
		Rule.prototype.then = function(fct) {
		var self = this;
		this.terminate = function(){
		this.terminate = function() {
		fct.call(self, self.output);
		@@ -56,3 +56,3 @@ };
		// private function that checks a PDF item against the Rule's regexp, and returns the corresponding accumulator.
		Rule.prototype.test = function(item){
		Rule.prototype.test = function(item) {
		if (this.regexp.test(item.text)) {
		@@ -62,3 +62,6 @@ // lazy init of accumulators: build and init the accumulator on first match
		if (!this.accumulatorImpl && this.accumulatorBuilder) {
		this.accumulatorImpl = this.accumulatorBuilder.apply(this, this.accumulatorParams);
		this.accumulatorImpl = this.accumulatorBuilder.apply(
		this,
		this.accumulatorParams
		);
		this.accumulatorImpl.methodName = this.methodName;
		@@ -72,9 +75,9 @@ this.accumulatorImpl.terminate = this.terminate;
		// intended to be run from accumulator, in order to process output before calling termination then() handler.
		Rule.prototype.whenDone = function(fct){
		Rule.prototype.whenDone = function(fct) {
		var self = this;
		var then = this.terminate;
		this.terminate = function(){
		this.terminate = function() {
		fct.call(self);
		then();
		}
		};
		};
		@@ -87,5 +90,5 @@
		**/
		Rule.makeItemProcessor = function(rules){
		Rule.makeItemProcessor = function(rules) {
		var currentAccumulator = null;
		function terminateAccumulator(){
		function terminateAccumulator() {
		var terminatePreviousAcc = (currentAccumulator \|\| {}).terminate;
		@@ -98,7 +101,7 @@ if (terminatePreviousAcc) {
		var applyRulesOnNextItem = true;
		return function(item){
		if (!item) // last item of the file => flush buffers
		return function(item) {
		if (!item)
		// last item of the file => flush buffers
		return terminateAccumulator();
		else if (!item.text)
		return;
		else if (!item.text) return;
		//LOG("ITEM:", item.text, "=> apply rules:", applyRulesOnNextItem);
		@@ -111,4 +114,3 @@ if (applyRulesOnNextItem)
		LOG("current accumulator:", accumulator.methodName);
		if (rules[r].skipCurrentItem)
		applyRulesOnNextItem = false;
		if (rules[r].skipCurrentItem) applyRulesOnNextItem = false;
		currentAccumulator = accumulator;
		@@ -119,9 +121,7 @@ delete rules[r];
		}
		else
		applyRulesOnNextItem = true;
		else applyRulesOnNextItem = true;
		// if reaching this point, the current item matches none of the rules => accumulating data on current accumulator
		if (currentAccumulator)
		applyRulesOnNextItem = !currentAccumulator(item);
		if (currentAccumulator) applyRulesOnNextItem = !currentAccumulator(item);
		};
		}
		};

		@@ -135,9 +135,11 @@ /**
		Rule.accumulators = {
		stopAccumulating: function(){ return function(){}; },
		stopAccumulating: function() {
		return function() {};
		}
		};

		// method for adding accumulators
		Rule.addAccumulator = function(methodName, methodBuilder){
		Rule.addAccumulator = function(methodName, methodBuilder) {
		Rule.accumulators[methodName] = methodBuilder;
		}
		};

		@@ -148,6 +150,6 @@ /**
		**/
		Rule.addAccumulator("extractRegexpValues", function(){
		Rule.addAccumulator("extractRegexpValues", function() {
		var matches = this.regexp.exec(this.currentItem.text);
		this.output = matches.slice(1);
		return function(){}; // following lines are not to be processed by this accumulator
		return function() {}; // following lines are not to be processed by this accumulator
		});
		@@ -158,11 +160,10 @@
		**/
		Rule.addAccumulator("parseNextItemValue", function(){
		Rule.addAccumulator("parseNextItemValue", function() {
		var self = this,
		done = false;
		return function (item){
		if (done)
		return;
		done = false;
		return function(item) {
		if (done) return;
		done = true;
		self.output = item.text;
		}
		};
		});
		@@ -173,5 +174,5 @@
		**/
		Rule.addAccumulator("accumulateAfterHeading", function(){
		var output = this.output = [];
		return function accumulate(item){
		Rule.addAccumulator("accumulateAfterHeading", function() {
		var output = (this.output = []);
		return function accumulate(item) {
		output.push(item.text);
		@@ -184,11 +185,9 @@ };
		**/
		Rule.addAccumulator("accumulateFromSameX", function(){
		var output = this.output = [],
		x = null;
		return function accumulate(item){
		if (x === null)
		x = item.x;
		if (x == item.x)
		output.push(item.text);
		}
		Rule.addAccumulator("accumulateFromSameX", function() {
		var output = (this.output = []),
		x = null;
		return function accumulate(item) {
		if (x === null) x = item.x;
		if (x == item.x) output.push(item.text);
		};
		});
		@@ -195,0 +194,0 @@

test/test.js

		@@ -10,10 +10,7 @@ var LOG = require("../lib/LOG.js").toggle(false);

		function printRawItems(callback){
		new PdfReader().parseFileItems(TESTFILE, function(err, item){
		if (err)
		callback(err);
		else if (!item)
		callback();
		else
		console.log(item);
		function printRawItems(callback) {
		new PdfReader().parseFileItems(TESTFILE, function(err, item) {
		if (err) callback(err);
		else if (!item) callback();
		else console.log(item);
		});
		@@ -24,24 +21,29 @@ }

		function parseData(callback){
		function displayValue(value){
		function parseData(callback) {
		function displayValue(value) {
		console.log("extracted value:", value);
		}
		function displayTable(table){
		for (var i=0; i<table.length; ++i)
		console.log(table[i].join("\t"));
		function displayTable(table) {
		for (var i = 0; i < table.length; ++i) console.log(table[i].join("\t"));
		}
		var rules = [
		Rule.on(/^Hello \"(.*)\"$/).extractRegexpValues().then(displayValue),
		Rule.on(/^Value\:/).parseNextItemValue().then(displayValue),
		Rule.on(/^c1$/).parseTable(3).then(displayTable),
		Rule.on(/^Values\:/).accumulateAfterHeading().then(displayValue),
		Rule.on(/^Hello \"(.*)\"$/)
		.extractRegexpValues()
		.then(displayValue),
		Rule.on(/^Value\:/)
		.parseNextItemValue()
		.then(displayValue),
		Rule.on(/^c1$/)
		.parseTable(3)
		.then(displayTable),
		Rule.on(/^Values\:/)
		.accumulateAfterHeading()
		.then(displayValue)
		];
		var processItem = Rule.makeItemProcessor(rules);
		new PdfReader().parseFileItems(TESTFILE, function(err, item){
		if (err)
		callback(err);
		new PdfReader().parseFileItems(TESTFILE, function(err, item) {
		if (err) callback(err);
		else {
		processItem(item);
		if (!item)
		callback(err, item);
		if (!item) callback(err, item);
		}
		@@ -54,7 +56,7 @@ });
		console.log("\ntest 1: raw items from sample.pdf\n");
		printRawItems(function(){
		printRawItems(function() {
		console.log("\ntest 2: parse values from sample.pdf\n");
		parseData(function(){
		parseData(function() {
		console.log("\ndone.\n");
		});
		});
		});

Scenario-4.1-RiskTables-FQA.pdf

yarn.lock

Sorry, the diff of this file is not supported yet

pdfreader - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes