pdfreader
Read text and parse tables from PDF files.
Supports tabular data with automatic column detection, and rule-based parsing.
This module is meant to be run using Node.js only. It does not work from a web browser.
Installation, tests and CLI usage
npm install pdfreader
cd node_modules/pdfreader
npm test
node parse.js test/sample.pdf
Raw PDF reading
The PdfReader class reads a PDF file, and calls a function on each item found while parsing that file.
An item object can match one of the following objects:
- null, when the parsing is over, or an error occurred.
- {file:{path:string}}, when a PDF file is being opened.
- {page:integer}, when a new page is being parsed, provides the page number, starting at 1.
- {text:string, x:float, y:float, w:float, h:float...}, represents each text with its position.
Example:
// Read "sample.pdf" item by item: hand errors and the end-of-parsing signal
// (a null item) over to `callback`, and print the text of every text item.
new PdfReader().parseFileItems("sample.pdf", (err, item) => {
  if (err) {
    callback(err);
    return;
  }
  if (!item) {
    callback();
    return;
  }
  if (item.text) {
    console.log(item.text);
  }
});
Example: parsing lines of text from a PDF file
Here is the code required to convert this PDF file into text:
var pdfreader = require('pdfreader');
// Text fragments collected so far, stored as arrays indexed by a `y` key.
let rows = {};

/**
 * Prints every collected row with console.log, in ascending numeric order
 * of its key. A row is printed as the concatenation of its text fragments.
 */
function printRows() {
  const sortedKeys = Object.keys(rows).sort(
    (y1, y2) => parseFloat(y1) - parseFloat(y2)
  );
  for (const y of sortedKeys) {
    const fragments = rows[y] || [];
    console.log(fragments.join(''));
  }
}
// Parse the PDF and print its text line by line, one page at a time.
// Items are grouped by their `y` value; the groups are printed and reset
// whenever a new page starts or the parsing is over.
new pdfreader.PdfReader().parseFileItems('CV_ErhanYasar.pdf', function (err, item) {
  if (err) {
    // Report the failure instead of silently ignoring it.
    console.error(err);
    return;
  }
  if (!item || item.page) {
    // End of parsing (null item) or start of a new page: flush pending rows.
    printRows();
    // `item` is null at the end of parsing, so only page items have a
    // page number to print; reading item.page on null would throw.
    if (item) {
      console.log('PAGE:', item.page);
    }
    rows = {};
  } else if (item.text) {
    // Append the text to the row stored under the item's `y` value.
    (rows[item.y] = rows[item.y] || []).push(item.text);
  }
});
Fork this example from parsing a CV/résumé.
Example: parsing a table from a PDF file
Here is the code required to convert this PDF file into a textual table:
var pdfreader = require('pdfreader');
// Number of columns rendered for every row of the table.
const nbCols = 2;
// Fixed width, in characters, of every rendered cell.
const cellPadding = 40;

/** Returns true when the item's `x` parses to a number >= 20, false otherwise. */
const columnQuantitizer = (item) => {
  const x = parseFloat(item.x);
  return x >= 20;
};

/** Returns an array of exactly `nb` entries: array[i] when truthy, else an empty array. */
const padColumns = (array, nb) =>
  Array.from({ length: nb }, (unused, i) => array[i] || []);

/**
 * Concatenates the `text` of every cell, then truncates or right-pads the
 * result with spaces so that it is exactly `cellPadding` characters long.
 */
const mergeCells = (cells) => {
  const joined = (cells || []).map((cell) => cell.text).join('');
  return joined.slice(0, cellPadding).padEnd(cellPadding, ' ');
};

/**
 * Renders a matrix (rows of columns of cells) as text: one line per row,
 * `nbCols` fixed-width columns per line, separated by ' | '.
 * A missing matrix renders as an empty string.
 */
const renderMatrix = (matrix) => {
  const lines = (matrix || []).map((row) =>
    padColumns(row, nbCols)
      .map((cells) => mergeCells(cells))
      .join(' | ')
  );
  return lines.join('\n');
};
// Collects the text items of the page currently being parsed.
let table = new pdfreader.TableParser();

// Parse the PDF and print one textual table per page: the collected matrix is
// rendered and the parser is replaced whenever a new page starts or the
// parsing is over.
new pdfreader.PdfReader().parseFileItems(filename, function (err, item) {
  if (err) {
    // Report the failure instead of silently ignoring it.
    console.error(err);
    return;
  }
  if (!item || item.page) {
    // End of parsing (null item) or start of a new page: print what we have.
    console.log(renderMatrix(table.getMatrix()));
    // `item` is null at the end of parsing, so only page items have a
    // page number to print; reading item.page on null would throw.
    if (item) {
      console.log('PAGE:', item.page);
    }
    table = new pdfreader.TableParser();
  } else if (item.text) {
    // NOTE(review): the second argument is presumably the column the item
    // belongs to — confirm against the TableParser documentation.
    table.processItem(item, columnQuantitizer(item));
  }
});
Fork this example from parsing a CV/résumé.
The Rule class can be used to define and process data extraction rules, while parsing a PDF document.
Rule instances expose "accumulators": methods that define the data extraction strategy to be used for each rule.
Example:
// Build a single item processor from a list of rules. Each rule is created
// from a regular expression with Rule.on(), is given an accumulator method
// (the data extraction strategy), and a function to call through then().
// NOTE(review): the meaning of the `3` passed to parseTable() is not visible
// here (presumably a number of columns) — confirm against the Rule class.
const processItem = Rule.makeItemProcessor([
  Rule.on(/^Hello \"(.*)\"$/).extractRegexpValues().then(displayValue),
  Rule.on(/^Value\:/).parseNextItemValue().then(displayValue),
  Rule.on(/^c1$/).parseTable(3).then(displayTable),
  Rule.on(/^Values\:/).accumulateAfterHeading().then(displayValue),
]);

// Feed every item found in "sample.pdf" to the rule processor.
new PdfReader().parseFileItems("sample.pdf", function (err, item) {
  if (err) {
    // Report parsing failures instead of silently dropping them.
    console.error(err);
  }
  // Every item is still forwarded, as before.
  processItem(item);
});