Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

pdfreader

Package Overview
Dependencies
Maintainers
1
Versions
56
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

pdfreader - npm Package Compare versions

Comparing version 1.0.0 to 1.0.2

.circleci/config.yml

8

.github/ISSUE_TEMPLATE/bug-report.md
---
name: Bug report
about: Create a report to help us improve this npm package
---

@@ -20,7 +19,8 @@

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]
- OS: (e.g. iOS)
- Browser: (e.g. chrome, safari)
- Version: (e.g. 22)
**Additional context**
Add any other context about the problem here.

@@ -10,3 +10,3 @@ /**

function getColumnIndex(cols, x){
function getColumnIndex(cols, x) {
var bestDist = null;

@@ -17,4 +17,3 @@ for (var i = 0; i < cols.length; ++i) {

break;
}
else {
} else {
bestDist = dist;

@@ -26,9 +25,9 @@ }

function ColumnsParser(colNames){
function ColumnsParser(colNames) {
this.cols = [];
var cols = this.cols,
colNames = colNames.slice(), // clone (for parameter immutability)
line = -1; // -1 = header
colNames = colNames.slice(), // clone (for parameter immutability)
line = -1; // -1 = header
this.processItem = function(item){
this.processItem = function(item) {
if (line == -1) {

@@ -50,9 +49,8 @@ // parse x-position of column headers

}
}
else {
} else {
cols[getColumnIndex(cols, item.x)].items.push(item);
}
};
};
}
module.exports = ColumnsParser;

@@ -9,5 +9,5 @@ /**

var nullLog = function LOG(){};
var nullLog = function LOG() {};
var realLog = function LOG(){
var realLog = function LOG() {
for (var i in arguments)

@@ -21,9 +21,9 @@ if (arguments[i] instanceof Object || arguments[i] instanceof Array)

module.exports = function(){
module.exports = function() {
LOG.apply(null, arguments);
};
module.exports.toggle = function(enabled){
module.exports.toggle = function(enabled) {
LOG = !enabled ? nullLog : realLog;
return module.exports;
};

@@ -11,16 +11,15 @@ /**

module.exports = function(/* columns */){
module.exports = function(/* columns */) {
this.output = [];
this.cols = Array.prototype.slice.apply(arguments);
var colNames = this.cols,
colX = [],
rows = this.output,
line = -1, // header
lineY = null;
function processItem(item){
colX = [],
rows = this.output,
line = -1, // header
lineY = null;
function processItem(item) {
if (line == -1) {
// parse x-position of column headers
var i = colNames.indexOf(item.text);
if (i > -1)
colX[i] = item.x;
if (i > -1) colX[i] = item.x;
if (colX.length == colNames.length) {

@@ -30,8 +29,6 @@ LOG("table header:", colNames, colX);

}
}
else {
} else {
if (lineY === null) {
lineY = item.y;
}
else if (lineY != item.y) {
} else if (lineY != item.y) {
lineY = item.y;

@@ -42,3 +39,3 @@ line++;

var col = 0;
for (var i=colX.length-1; i>=0; --i)
for (var i = colX.length - 1; i >= 0; --i)
if (item.x > colX[i]) {

@@ -51,5 +48,5 @@ col = i;

}
};
}
processItem(this.currentItem); // apply on header's first item
return processItem; // then the same function will be run on all following items, until another rule is triggered
};

@@ -10,13 +10,13 @@ /**

function getTopPos(item){
function getTopPos(item) {
return item.y;
}
function getLeftPos(item){
function getLeftPos(item) {
return item.x;
}
function getText(item){
function getText(item) {
return item.text;
};
}

@@ -29,23 +29,24 @@ /**

**/
function makeFloorClassifier(nbClusters, arr){
function makeFloorClassifier(nbClusters, arr) {
var min = Math.min.apply(Math, arr);
var delta = Math.max.apply(Math, arr) - min;
min -= (delta / nbClusters) / 2;
return function classify(value){
return Math.floor(nbClusters * (value - min) / delta);
min -= delta / nbClusters / 2;
return function classify(value) {
return Math.floor((nbClusters * (value - min)) / delta);
};
}
function makeColumnClassifier(header){
var colX = [0].concat(header.map(getLeftPos)).sort(function(a,b){return a-b;});
return function classify(item){
for (var i=colX.length-1; i>-1; --i)
if (getLeftPos(item) >= colX[i])
return i;
function makeColumnClassifier(header) {
var colX = [0].concat(header.map(getLeftPos)).sort(function(a, b) {
return a - b;
});
return function classify(item) {
for (var i = colX.length - 1; i > -1; --i)
if (getLeftPos(item) >= colX[i]) return i;
};
}
function buildRowList(items, classifyRow){
function buildRowList(items, classifyRow) {
var rows = [];
for (var i in items){
for (var i in items) {
var item = items[i];

@@ -58,44 +59,55 @@ var row = classifyRow(getTopPos(item));

function joinCellCollisions(separ){
return function(cell){
return (cell || []).map(getText).join(separ).substr(0, 7);
function joinCellCollisions(separ) {
return function(cell) {
return (cell || [])
.map(getText)
.join(separ)
.substr(0, 7);
};
}
function fillTab(str){
function fillTab(str) {
return str.substr(0, 7);
}
function renderTable(table){
return (table || []).map(function(row){
return (row || []).map(fillTab).join("\t");
}).join("\n");
function renderTable(table) {
return (table || [])
.map(function(row) {
return (row || []).map(fillTab).join("\t");
})
.join("\n");
}
function renderMatrix(matrix){
return (matrix || []).map(function(row){
return (row || []).map(joinCellCollisions("+")).join("\t");
}).join("\n");
function renderMatrix(matrix) {
return (matrix || [])
.map(function(row) {
return (row || []).map(joinCellCollisions("+")).join("\t");
})
.join("\n");
}
function renderRows(rows){
return (rows || []).map(function(row, rowId){
var cells = [ rowId + ":" ];
for (var i in row)
cells.push((Math.floor(row[i].x) + ":" + row[i].text).substr(0, 7));
return cells.join("\t");
}).join("\n");
function renderRows(rows) {
return (rows || [])
.map(function(row, rowId) {
var cells = [rowId + ":"];
for (var i in row)
cells.push((Math.floor(row[i].x) + ":" + row[i].text).substr(0, 7));
return cells.join("\t");
})
.join("\n");
}
function renderItems(items) {
return items.map(function(i){
return [i.y, i.x, i.text].join("\t");
}).join("\n");
return items
.map(function(i) {
return [i.y, i.x, i.text].join("\t");
})
.join("\n");
}
function buildMatrix(rows, classifyColumn){
function buildMatrix(rows, classifyColumn) {
var matrix = [];
for (var y in rows){
for (var y in rows) {
var row = [];
for (var x in rows[y]){
for (var x in rows[y]) {
var item = rows[y][x];

@@ -110,6 +122,6 @@ var colN = classifyColumn(item);

function detectCollisions(matrix){
function detectCollisions(matrix) {
var collisions = [];
(matrix || []).map(function(row, rowN){
(row || []).map(function(cellItems, colN){
(matrix || []).map(function(row, rowN) {
(row || []).map(function(cellItems, colN) {
if (cellItems.length > 1)

@@ -126,6 +138,5 @@ collisions.push({

function makeAccumulator(nbRows, headerRow){
function makeAccumulator(nbRows, headerRow) {
var rule = this,
items = [];
items = [];

@@ -137,10 +148,10 @@ rule.nbRows = nbRows || 0;

matrix: null
};
};
function accumulate(item){
function accumulate(item) {
items.push(item);
};
}
// when parsing is done: generate a clean table, from items.
rule.whenDone(function(){
rule.whenDone(function() {
// classify items into rows

@@ -153,7 +164,6 @@ var classifyRow = makeFloorClassifier(rule.nbRows, items.map(getTopPos));

this.output.matrix = buildMatrix(this.output.rows, classifyColumn);
});
return accumulate; // then the same function will be run on all following items, until another rule is triggered
};
}

@@ -160,0 +170,0 @@ module.exports = makeAccumulator;

@@ -6,3 +6,3 @@ /**

**/
function SequentialParser(accumulators, callback){
function SequentialParser(accumulators, callback) {
var step = 0;

@@ -12,8 +12,10 @@ var fields = {};

fields: fields,
addField: function(key, value){
addField: function(key, value) {
this.fields[key] = value;
},
parseItem: function(item){
parseItem: function(item) {
if (step >= accumulators.length) {
return console.warn("warning: skipping item, because SequentialParser is done.");
return console.warn(
"warning: skipping item, because SequentialParser is done."
);
}

@@ -24,10 +26,7 @@ var current = accumulators[step];

++step;
}
else if (current.accumulator) {
} else if (current.accumulator) {
var doneAccumulating = current.accumulator(item, this);
if (doneAccumulating)
++step;
}
else // no action => skip item
++step;
if (doneAccumulating) ++step;
} // no action => skip item
else ++step;
if (!item || step >= accumulators.length) {

@@ -34,0 +33,0 @@ callback && callback(null, this);

@@ -9,23 +9,26 @@ /**

function TableParser(){
function TableParser() {
this.rows = {};
};
}
TableParser.prototype.processItem = function(item, col){
var row = this.rows[""+item.y] = this.rows[""+item.y] || {};
TableParser.prototype.processItem = function(item, col) {
var row = (this.rows["" + item.y] = this.rows["" + item.y] || {});
(row[col] = row[col] || []).push(item);
}
};
TableParser.prototype.processHeadingItem = function(item, col){
this.processItem({
y: 0,
x: item.x,
text: item.text
}, col);
}
TableParser.prototype.processHeadingItem = function(item, col) {
this.processItem(
{
y: 0,
x: item.x,
text: item.text
},
col
);
};
// Rows
function sortAsFloatValues(values){
return values.slice().sort(function(a, b){
function sortAsFloatValues(values) {
return values.slice().sort(function(a, b) {
return parseFloat(a) - parseFloat(b);

@@ -35,40 +38,39 @@ });

TableParser.prototype.getRows = function(){
TableParser.prototype.getRows = function() {
var rows = this.rows;
var yValues = sortAsFloatValues(Object.keys(rows));
return yValues.map(function(y){
return yValues.map(function(y) {
return rows["" + y];
});
}
};
function renderRows(rows){
return (rows || []).map(function(row, rowId){
var cells = [];
for (var i in row)
for (var j in row[i])
cells.push(row[i][j].x + ": " + row[i][j].text);
return rowId + ":\t" + cells.join(", ");
}).join("\n");
function renderRows(rows) {
return (rows || [])
.map(function(row, rowId) {
var cells = [];
for (var i in row)
for (var j in row[i]) cells.push(row[i][j].x + ": " + row[i][j].text);
return rowId + ":\t" + cells.join(", ");
})
.join("\n");
}
TableParser.prototype.renderRows = function(){
TableParser.prototype.renderRows = function() {
return renderRows(this.getRows());
}
};
// Matrix
function getSortedXValues(rows){
function getSortedXValues(rows) {
var xSet = {};
for (var y in rows)
for (var x in rows[y])
xSet[x] = true;
for (var y in rows) for (var x in rows[y]) xSet[x] = true;
return sortAsFloatValues(Object.keys(xSet));
}
TableParser.prototype.getMatrix = function(){
TableParser.prototype.getMatrix = function() {
var rows = this.getRows();
var xValues = getSortedXValues(rows);
return rows.map(function(row, y){
return rows.map(function(row, y) {
var rowNew = [];
for (var x in row){
for (var x in row) {
var items = row[x];

@@ -80,24 +82,29 @@ var colN = xValues.indexOf(x);

});
}
};
function getText(item){
function getText(item) {
return item.text;
};
}
function joinCellCollisions(separ){
return function(cell){
return (cell || []).map(getText).join(separ).substr(0, 7);
function joinCellCollisions(separ) {
return function(cell) {
return (cell || [])
.map(getText)
.join(separ)
.substr(0, 7);
};
}
function renderMatrix(matrix){
return (matrix || []).map(function(row){
return (row || []).map(joinCellCollisions("+")).join("\t");
}).join("\n");
function renderMatrix(matrix) {
return (matrix || [])
.map(function(row) {
return (row || []).map(joinCellCollisions("+")).join("\t");
})
.join("\n");
}
TableParser.prototype.renderMatrix = function(){
TableParser.prototype.renderMatrix = function() {
return renderMatrix(this.getMatrix());
}
};
module.exports = TableParser;
{
"name": "pdfreader",
"version": "1.0.0",
"version": "1.0.2",
"description": "Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.",
"main": "index.js",
"scripts": {
"test:print:result": "echo ✅ Tests passed.",
"prettier:print": "prettier --list-different \"./**/*.js\" \"./**/*.md\"",
"prettier:check": "npm run -s prettier:print 1>&2; exit $(npm run -s prettier:print | wc -l)",
"prettier:fix": "prettier \"./**/*.js\" \"./**/*.md\" --write",
"test:print:result": "echo ✅ All tests passed.",
"test:diff:buffer": "node parseAsBuffer.js test/sample.pdf >test/test-buffer-snapshot.log; git --no-pager diff test/test-buffer-snapshot.log 1>&2; exit $(git --no-pager diff test/test-buffer-snapshot.log | wc -l)",

@@ -13,3 +16,4 @@ "test:diff:file": "node test/test.js >test/test-snapshot.log; git --no-pager diff test/test-snapshot.log 1>&2; exit $(git --no-pager diff test/test-snapshot.log | wc -l)",

"test:sample": "node test/test.js && node parseAsBuffer.js test/sample.pdf",
"test": "npm run -s test:sample && npm run -s test:regression && npm run -s test:diff && npm run -s test:print:result"
"test:functional": "npm run -s test:sample && npm run -s test:regression && npm run -s test:diff",
"test": "npm run -s prettier:check && npm run -s test:functional && npm run -s test:print:result"
},

@@ -41,4 +45,10 @@ "repository": {

"dependencies": {
"pdf2json": "1.1.2"
"pdf2json": "1.1.7"
},
"devDependencies": {
"eslint-config-prettier": "^3.3.0",
"eslint-plugin-prettier": "^3.0.0",
"prettier": "1.15.3",
"semantic-release": "^15.13.3"
}
}
var LOG = require("./lib/LOG.js").toggle(false);
var PdfReader = require("./index.js").PdfReader;
function printRawItems(filename, callback){
new PdfReader().parseFileItems(filename, function(err, item){
if (err)
callback(err);
else if (!item)
callback();
else if (item.file)
console.log("file =", item.file.path);
else if (item.page)
console.log("page =", item.page);
function printRawItems(filename, callback) {
new PdfReader().parseFileItems(filename, function(err, item) {
if (err) callback(err);
else if (!item) callback();
else if (item.file) console.log("file =", item.file.path);
else if (item.page) console.log("page =", item.page);
else if (item.x)
console.log([item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join("\t"));
else
console.warn(item);
console.log(
[item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join(
"\t"
)
);
else console.warn(item);
});

@@ -24,8 +23,7 @@ }

console.error("please provide the name of a PDF file");
}
else {
} else {
console.warn("printing raw items from file:", filename, "...");
printRawItems(filename, function(){
printRawItems(filename, function() {
console.warn("done.");
});
}
var LOG = require("./lib/LOG.js").toggle(false);
var PdfReader = require("./index.js").PdfReader;
var fs = require('fs');
var fs = require("fs");
function printRawItems(pdfBuffer, callback){
new PdfReader().parseBuffer(pdfBuffer, function(err, item){
if (err)
callback(err);
else if (!item)
callback();
else if (item.file)
console.log("file =", item.file.path);
else if (item.page)
console.log("page =", item.page);
function printRawItems(pdfBuffer, callback) {
new PdfReader().parseBuffer(pdfBuffer, function(err, item) {
if (err) callback(err);
else if (!item) callback();
else if (item.file) console.log("file =", item.file.path);
else if (item.page) console.log("page =", item.page);
else if (item.x)
console.log([item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join("\t"));
else
console.warn(item);
console.log(
[item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join(
"\t"
)
);
else console.warn(item);
});

@@ -25,10 +24,9 @@ }

console.error("please provide the name of a PDF file");
}
else {
} else {
console.warn("printing raw items from file:", filename, "...");
fs.readFile(filename, (err, pdfBuffer) => {
printRawItems(pdfBuffer, function (){
console.warn("done.");
});
printRawItems(pdfBuffer, function() {
console.warn("done.");
});
});
}

@@ -5,3 +5,3 @@ /**

* This content is released under the MIT License.
*
*
* An item object can match one of the following objects:

@@ -12,3 +12,3 @@ * - null, when the parsing is over, or an error occured.

* - {text:string, x:float, y:float, w:float, h:float...}, represents each text with its position.
*
*
**/

@@ -19,11 +19,11 @@

function forEachItem(pdf, handler){
function forEachItem(pdf, handler) {
var pageNumber = 0;
for (var p in pdf.data.Pages) {
var page = pdf.data.Pages[p];
for (var p in pdf.formImage.Pages) {
var page = pdf.formImage.Pages[p];
var number = ++pageNumber;
handler(null, {
page: number,
width: pdf.data.Width,
height:pdf.data.Pages[number-1].Height
width: pdf.formImage.Width,
height: pdf.formImage.Pages[number - 1].Height
});

@@ -39,3 +39,3 @@ for (var t in page.Texts) {

function PdfReader(options){
function PdfReader(options) {
LOG("PdfReader"); // only displayed if LOG.js was first loaded with `true` as init parameter

@@ -48,7 +48,7 @@ this.options = options || {};

**/
PdfReader.prototype.parseFileItems = function(pdfFilePath, itemHandler){
itemHandler(null, { file: { path: pdfFilePath }});
PdfReader.prototype.parseFileItems = function(pdfFilePath, itemHandler) {
itemHandler(null, { file: { path: pdfFilePath } });
var pdfParser = new PFParser();
pdfParser.on("pdfParser_dataError", itemHandler);
pdfParser.on("pdfParser_dataReady", function (pdfData){
pdfParser.on("pdfParser_dataReady", function(pdfData) {
forEachItem(pdfData, itemHandler);

@@ -63,7 +63,7 @@ });

*/
PdfReader.prototype.parseBuffer = function(pdfBuffer, itemHandler){
itemHandler(null, { file: { buffer: pdfBuffer }});
PdfReader.prototype.parseBuffer = function(pdfBuffer, itemHandler) {
itemHandler(null, { file: { buffer: pdfBuffer } });
var pdfParser = new PFParser();
pdfParser.on("pdfParser_dataError", itemHandler);
pdfParser.on("pdfParser_dataReady", function (pdfData){
pdfParser.on("pdfParser_dataReady", function(pdfData) {
forEachItem(pdfData, itemHandler);

@@ -75,3 +75,2 @@ });

module.exports = PdfReader;

@@ -1,8 +0,6 @@

[![CircleCI](https://circleci.com/gh/adrienjoly/npm-pdfreader.svg?style=svg)](https://circleci.com/gh/adrienjoly/npm-pdfreader)
# pdfreader [![Continuous Integration](https://circleci.com/gh/adrienjoly/npm-pdfreader.svg?style=shield)](https://circleci.com/gh/adrienjoly/npm-pdfreader) [![Code Quality](https://api.codacy.com/project/badge/Grade/73d37dbb0ff84795acf65a55c5936d83)](https://www.codacy.com/app/adrien-joly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade)
# pdfreader
Read text and parse tables from PDF files.
Supports tabular data with automatic column detection, and rule-based parsing.
Supports **tabular data** with automatic column detection, and **rule-based parsing**.

@@ -13,2 +11,9 @@ Dependencies: it is based on [pdf2json](https://www.npmjs.com/package/pdf2json), which itself relies on Mozilla's [pdf.js](https://github.com/mozilla/pdf.js/).

Summary:
- [Installation, tests and CLI usage](#installation-tests-and-cli-usage)
- [Raw PDF reading](#raw-pdf-reading) (incl. examples)
- [Rule-based data extraction](#rule-based-data-extraction)
- [Troubleshooting & FAQ](#troubleshooting--faq)
## Installation, tests and CLI usage

@@ -23,37 +28,31 @@

The PdfReader class reads a PDF file, and calls a function on each item found while parsing that file.
This module exposes the `PdfReader` class, to be instantiated.
An item object can match one of the following objects:
Your instance has two methods for parsing a PDF. They return the same output and differ only in input: `PdfReader.parseFileItems` (as below) for a filename, and `PdfReader.parseBuffer` (see: "Raw PDF reading from a PDF already in memory (buffer)") from data that you don't want to reference from the filesystem.
- `null`, when the parsing is over, or an error occurred.
- `{file:{path:string}}`, when a PDF file is being opened.
- `{page:integer, width:float, height:float}`, when a new page is being parsed, provides the page number, starting at 1.
- `{text:string, x:float, y:float, w:float, h:float...}`, represents each text with its position.
Whichever method you choose, it asks for a callback, which gets called each time the instance finds what it denotes as a PDF item.
Example:
An item object can match one of the following objects:
- `null`, when the parsing is over, or an error occurred.
- File metadata, `{file:{path:string}}`, when a PDF file is being opened, and is always the first item.
- Page metadata, `{page:integer, width:float, height:float}`, when a new page is being parsed, provides the page number, starting at 1. This basically acts as a carriage return for the coordinates of text items to be processed.
- Text items, `{text:string, x:float, y:float, w:float, h:float...}`, which you can think of as simple objects with a text property, and floating 2D AABB coordinates on the page.
It's up to your callback to process these items into a data structure of your choice, and also to handle any errors passed to it as its first argument.
For example:
```javascript
new PdfReader().parseFileItems("sample.pdf", function(err, item){
if (err)
callback(err);
else if (!item)
callback();
else if (item.text)
console.log(item.text);
new PdfReader().parseFileItems("sample.pdf", function(err, item) {
if (err) callback(err);
else if (!item) callback();
else if (item.text) console.log(item.text);
});
```
## Raw PDF reading from a PDF already in memory (buffer)
### Raw PDF reading from a PDF already in memory (buffer)
The PdfReader class reads a PDF file, and calls a function on each item found while parsing that file.
As above, but reading from a buffer in memory rather than from a file referenced by path. For example:
An item object can match one of the following objects:
- `null`, when the parsing is over, or an error occurred.
- `{file:{path:string}}`, when a PDF file is being opened.
- `{page:integer}`, when a new page is being parsed, provides the page number, starting at 1.
- `{text:string, x:float, y:float, w:float, h:float...}`, represents each text with its position.
Example:
```javascript

@@ -63,9 +62,6 @@ var fs = require("fs");

// pdfBuffer contains the file content
new PdfReader().parseBuffer(pdfBuffer, function(err, item){
if (err)
callback(err);
else if (!item)
callback();
else if (item.text)
console.log(item.text);
new PdfReader().parseBuffer(pdfBuffer, function(err, item) {
if (err) callback(err);
else if (!item) callback();
else if (item.text) console.log(item.text);
});

@@ -75,3 +71,3 @@ });

## Example: parsing lines of text from a PDF file
### Example: parsing lines of text from a PDF file

@@ -83,3 +79,3 @@ ![example cv resume parse convert pdf to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseRows.png)

```js
var pdfreader = require('pdfreader');
var pdfreader = require("pdfreader");

@@ -91,13 +87,15 @@ var rows = {}; // indexed by y-position

.sort((y1, y2) => parseFloat(y1) - parseFloat(y2)) // sort float positions
.forEach((y) => console.log((rows[y] || []).join('')));
.forEach(y => console.log((rows[y] || []).join("")));
}
new pdfreader.PdfReader().parseFileItems('CV_ErhanYasar.pdf', function(err, item){
new pdfreader.PdfReader().parseFileItems("CV_ErhanYasar.pdf", function(
err,
item
) {
if (!item || item.page) {
// end of file, or page
printRows();
console.log('PAGE:', item.page);
console.log("PAGE:", item.page);
rows = {}; // clear rows for next page
}
else if (item.text) {
} else if (item.text) {
// accumulate text items into rows object, per line

@@ -111,3 +109,3 @@ (rows[item.y] = rows[item.y] || []).push(item.text);

## Example: parsing a table from a PDF file
### Example: parsing a table from a PDF file

@@ -119,29 +117,35 @@ ![example cv resume parse convert pdf table to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseTable.png)

```js
var pdfreader = require('pdfreader');
var pdfreader = require("pdfreader");
const nbCols = 2;
const cellPadding = 40; // each cell is padded to fit 40 characters
const columnQuantitizer = (item) => parseFloat(item.x) >= 20;
const columnQuantitizer = item => parseFloat(item.x) >= 20;
const padColumns = (array, nb) =>
Array.apply(null, {length: nb}).map((val, i) => array[i] || []);
// .. because map() skips undefined elements
Array.apply(null, { length: nb }).map((val, i) => array[i] || []);
// .. because map() skips undefined elements
const mergeCells = (cells) => (cells || [])
.map((cell) => cell.text).join('') // merge cells
.substr(0, cellPadding).padEnd(cellPadding, ' '); // padding
const mergeCells = cells =>
(cells || [])
.map(cell => cell.text)
.join("") // merge cells
.substr(0, cellPadding)
.padEnd(cellPadding, " "); // padding
const renderMatrix = (matrix) => (matrix || [])
.map((row, y) => padColumns(row, nbCols)
.map(mergeCells)
.join(' | ')
).join('\n');
const renderMatrix = matrix =>
(matrix || [])
.map((row, y) =>
padColumns(row, nbCols)
.map(mergeCells)
.join(" | ")
)
.join("\n");
var table = new pdfreader.TableParser();
new pdfreader.PdfReader().parseFileItems(filename, function(err, item){
new pdfreader.PdfReader().parseFileItems(filename, function(err, item) {
if (!item || item.page) {
// end of file, or page
console.log(renderMatrix(table.getMatrix()));
console.log('PAGE:', item.page);
console.log("PAGE:", item.page);
table = new pdfreader.TableParser(); // new/clear table for next page

@@ -157,3 +161,2 @@ } else if (item.text) {

## Rule-based data extraction

@@ -169,8 +172,16 @@

var processItem = Rule.makeItemProcessor([
Rule.on(/^Hello \"(.*)\"$/).extractRegexpValues().then(displayValue),
Rule.on(/^Value\:/).parseNextItemValue().then(displayValue),
Rule.on(/^c1$/).parseTable(3).then(displayTable),
Rule.on(/^Values\:/).accumulateAfterHeading().then(displayValue),
Rule.on(/^Hello \"(.*)\"$/)
.extractRegexpValues()
.then(displayValue),
Rule.on(/^Value\:/)
.parseNextItemValue()
.then(displayValue),
Rule.on(/^c1$/)
.parseTable(3)
.then(displayTable),
Rule.on(/^Values\:/)
.accumulateAfterHeading()
.then(displayValue)
]);
new PdfReader().parseFileItems("sample.pdf", function(err, item){
new PdfReader().parseFileItems("sample.pdf", function(err, item) {
processItem(item);

@@ -186,3 +197,3 @@ });

## Problem: when I use pdfreader from my express-based node.js app, I'm getting `Cannot read property 'userAgent' of undefined`.
### `Cannot read property 'userAgent' of undefined` error from an express-based node.js app

@@ -193,10 +204,10 @@ Dmitry found out that you may need to run these instructions before including the `pdfreader` module:

global.navigator = {
userAgent: 'node',
}
userAgent: "node"
};
window.navigator = {
userAgent: 'node',
}
userAgent: "node"
};
```
Source: [express - TypeError: Cannot read property 'userAgent' of undefined error on node.js app run - Stack Overflow](https://stackoverflow.com/questions/49208414/typeerror-cannot-read-property-useragent-of-undefined-error-on-node-js-app-ru)

@@ -13,8 +13,8 @@ /**

**/
function Rule(regexp){
function Rule(regexp) {
this.regexp = regexp;
var self = this;
// proxy accumulators methods
Object.keys(Rule.accumulators).forEach(function(name){
self[name] = function(){
Object.keys(Rule.accumulators).forEach(function(name) {
self[name] = function() {
LOG("building rule:", regexp, "->", name);

@@ -30,11 +30,11 @@ self.methodName = name;

// shortcut for defining Rule objects in a more concise manner
Rule.on = function(regexp){
Rule.on = function(regexp) {
return new Rule(regexp);
}
};
Rule.after = function(regexp){
Rule.after = function(regexp) {
var rule = new Rule(regexp);
rule.skipCurrentItem = true;
return rule;
}
};

@@ -46,5 +46,5 @@ /**

**/
Rule.prototype.then = function(fct){
Rule.prototype.then = function(fct) {
var self = this;
this.terminate = function(){
this.terminate = function() {
fct.call(self, self.output);

@@ -56,3 +56,3 @@ };

// private function that checks a PDF item against the Rule's regexp, and returns the corresponding accumulator.
Rule.prototype.test = function(item){
Rule.prototype.test = function(item) {
if (this.regexp.test(item.text)) {

@@ -62,3 +62,6 @@ // lazy init of accumulators: build and init the accumulator on first match

if (!this.accumulatorImpl && this.accumulatorBuilder) {
this.accumulatorImpl = this.accumulatorBuilder.apply(this, this.accumulatorParams);
this.accumulatorImpl = this.accumulatorBuilder.apply(
this,
this.accumulatorParams
);
this.accumulatorImpl.methodName = this.methodName;

@@ -72,9 +75,9 @@ this.accumulatorImpl.terminate = this.terminate;

// intended to be run from accumulator, in order to process output before calling termination then() handler.
Rule.prototype.whenDone = function(fct){
Rule.prototype.whenDone = function(fct) {
var self = this;
var then = this.terminate;
this.terminate = function(){
this.terminate = function() {
fct.call(self);
then();
}
};
};

@@ -87,5 +90,5 @@

**/
Rule.makeItemProcessor = function(rules){
Rule.makeItemProcessor = function(rules) {
var currentAccumulator = null;
function terminateAccumulator(){
function terminateAccumulator() {
var terminatePreviousAcc = (currentAccumulator || {}).terminate;

@@ -98,7 +101,7 @@ if (terminatePreviousAcc) {

var applyRulesOnNextItem = true;
return function(item){
if (!item) // last item of the file => flush buffers
return function(item) {
if (!item)
// last item of the file => flush buffers
return terminateAccumulator();
else if (!item.text)
return;
else if (!item.text) return;
//LOG("ITEM:", item.text, "=> apply rules:", applyRulesOnNextItem);

@@ -111,4 +114,3 @@ if (applyRulesOnNextItem)

LOG("current accumulator:", accumulator.methodName);
if (rules[r].skipCurrentItem)
applyRulesOnNextItem = false;
if (rules[r].skipCurrentItem) applyRulesOnNextItem = false;
currentAccumulator = accumulator;

@@ -119,9 +121,7 @@ delete rules[r];

}
else
applyRulesOnNextItem = true;
else applyRulesOnNextItem = true;
// if reaching this point, the current item matches none of the rules => accumulating data on current accumulator
if (currentAccumulator)
applyRulesOnNextItem = !currentAccumulator(item);
if (currentAccumulator) applyRulesOnNextItem = !currentAccumulator(item);
};
}
};

@@ -135,9 +135,11 @@ /**

Rule.accumulators = {
stopAccumulating: function(){ return function(){}; },
stopAccumulating: function() {
return function() {};
}
};
// method for adding accumulators
Rule.addAccumulator = function(methodName, methodBuilder){
Rule.addAccumulator = function(methodName, methodBuilder) {
Rule.accumulators[methodName] = methodBuilder;
}
};

@@ -148,6 +150,6 @@ /**

**/
Rule.addAccumulator("extractRegexpValues", function(){
Rule.addAccumulator("extractRegexpValues", function() {
var matches = this.regexp.exec(this.currentItem.text);
this.output = matches.slice(1);
return function(){}; // following lines are not to be processed by this accumulator
return function() {}; // following lines are not to be processed by this accumulator
});

@@ -158,11 +160,10 @@

**/
Rule.addAccumulator("parseNextItemValue", function(){
Rule.addAccumulator("parseNextItemValue", function() {
var self = this,
done = false;
return function (item){
if (done)
return;
done = false;
return function(item) {
if (done) return;
done = true;
self.output = item.text;
}
};
});

@@ -173,5 +174,5 @@

**/
Rule.addAccumulator("accumulateAfterHeading", function(){
var output = this.output = [];
return function accumulate(item){
Rule.addAccumulator("accumulateAfterHeading", function() {
var output = (this.output = []);
return function accumulate(item) {
output.push(item.text);

@@ -184,11 +185,9 @@ };

**/
Rule.addAccumulator("accumulateFromSameX", function(){
var output = this.output = [],
x = null;
return function accumulate(item){
if (x === null)
x = item.x;
if (x == item.x)
output.push(item.text);
}
Rule.addAccumulator("accumulateFromSameX", function() {
var output = (this.output = []),
x = null;
return function accumulate(item) {
if (x === null) x = item.x;
if (x == item.x) output.push(item.text);
};
});

@@ -195,0 +194,0 @@

@@ -10,10 +10,7 @@ var LOG = require("../lib/LOG.js").toggle(false);

function printRawItems(callback){
new PdfReader().parseFileItems(TESTFILE, function(err, item){
if (err)
callback(err);
else if (!item)
callback();
else
console.log(item);
function printRawItems(callback) {
new PdfReader().parseFileItems(TESTFILE, function(err, item) {
if (err) callback(err);
else if (!item) callback();
else console.log(item);
});

@@ -24,24 +21,29 @@ }

function parseData(callback){
function displayValue(value){
function parseData(callback) {
function displayValue(value) {
console.log("extracted value:", value);
}
function displayTable(table){
for (var i=0; i<table.length; ++i)
console.log(table[i].join("\t"));
function displayTable(table) {
for (var i = 0; i < table.length; ++i) console.log(table[i].join("\t"));
}
var rules = [
Rule.on(/^Hello \"(.*)\"$/).extractRegexpValues().then(displayValue),
Rule.on(/^Value\:/).parseNextItemValue().then(displayValue),
Rule.on(/^c1$/).parseTable(3).then(displayTable),
Rule.on(/^Values\:/).accumulateAfterHeading().then(displayValue),
Rule.on(/^Hello \"(.*)\"$/)
.extractRegexpValues()
.then(displayValue),
Rule.on(/^Value\:/)
.parseNextItemValue()
.then(displayValue),
Rule.on(/^c1$/)
.parseTable(3)
.then(displayTable),
Rule.on(/^Values\:/)
.accumulateAfterHeading()
.then(displayValue)
];
var processItem = Rule.makeItemProcessor(rules);
new PdfReader().parseFileItems(TESTFILE, function(err, item){
if (err)
callback(err);
new PdfReader().parseFileItems(TESTFILE, function(err, item) {
if (err) callback(err);
else {
processItem(item);
if (!item)
callback(err, item);
if (!item) callback(err, item);
}

@@ -54,7 +56,7 @@ });

console.log("\ntest 1: raw items from sample.pdf\n");
printRawItems(function(){
printRawItems(function() {
console.log("\ntest 2: parse values from sample.pdf\n");
parseData(function(){
parseData(function() {
console.log("\ndone.\n");
});
});
});

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc