Comparing version 1.0.0 to 1.0.2
--- | ||
name: Bug report | ||
about: Create a report to help us improve this npm package | ||
--- | ||
@@ -20,7 +19,8 @@ | ||
**Desktop (please complete the following information):** | ||
- OS: [e.g. iOS] | ||
- Browser [e.g. chrome, safari] | ||
- Version [e.g. 22] | ||
- OS: (e.g. iOS) | ||
- Browser: (e.g. chrome, safari) | ||
- Version: (e.g. 22) | ||
**Additional context** | ||
Add any other context about the problem here. |
@@ -10,3 +10,3 @@ /** | ||
function getColumnIndex(cols, x){ | ||
function getColumnIndex(cols, x) { | ||
var bestDist = null; | ||
@@ -17,4 +17,3 @@ for (var i = 0; i < cols.length; ++i) { | ||
break; | ||
} | ||
else { | ||
} else { | ||
bestDist = dist; | ||
@@ -26,9 +25,9 @@ } | ||
function ColumnsParser(colNames){ | ||
function ColumnsParser(colNames) { | ||
this.cols = []; | ||
var cols = this.cols, | ||
colNames = colNames.slice(), // clone (for parameter immutability) | ||
line = -1; // -1 = header | ||
colNames = colNames.slice(), // clone (for parameter immutability) | ||
line = -1; // -1 = header | ||
this.processItem = function(item){ | ||
this.processItem = function(item) { | ||
if (line == -1) { | ||
@@ -50,9 +49,8 @@ // parse x-position of column headers | ||
} | ||
} | ||
else { | ||
} else { | ||
cols[getColumnIndex(cols, item.x)].items.push(item); | ||
} | ||
}; | ||
}; | ||
} | ||
module.exports = ColumnsParser; |
@@ -9,5 +9,5 @@ /** | ||
var nullLog = function LOG(){}; | ||
var nullLog = function LOG() {}; | ||
var realLog = function LOG(){ | ||
var realLog = function LOG() { | ||
for (var i in arguments) | ||
@@ -21,9 +21,9 @@ if (arguments[i] instanceof Object || arguments[i] instanceof Array) | ||
module.exports = function(){ | ||
module.exports = function() { | ||
LOG.apply(null, arguments); | ||
}; | ||
module.exports.toggle = function(enabled){ | ||
module.exports.toggle = function(enabled) { | ||
LOG = !enabled ? nullLog : realLog; | ||
return module.exports; | ||
}; |
@@ -11,16 +11,15 @@ /** | ||
module.exports = function(/* columns */){ | ||
module.exports = function(/* columns */) { | ||
this.output = []; | ||
this.cols = Array.prototype.slice.apply(arguments); | ||
var colNames = this.cols, | ||
colX = [], | ||
rows = this.output, | ||
line = -1, // header | ||
lineY = null; | ||
function processItem(item){ | ||
colX = [], | ||
rows = this.output, | ||
line = -1, // header | ||
lineY = null; | ||
function processItem(item) { | ||
if (line == -1) { | ||
// parse x-position of column headers | ||
var i = colNames.indexOf(item.text); | ||
if (i > -1) | ||
colX[i] = item.x; | ||
if (i > -1) colX[i] = item.x; | ||
if (colX.length == colNames.length) { | ||
@@ -30,8 +29,6 @@ LOG("table header:", colNames, colX); | ||
} | ||
} | ||
else { | ||
} else { | ||
if (lineY === null) { | ||
lineY = item.y; | ||
} | ||
else if (lineY != item.y) { | ||
} else if (lineY != item.y) { | ||
lineY = item.y; | ||
@@ -42,3 +39,3 @@ line++; | ||
var col = 0; | ||
for (var i=colX.length-1; i>=0; --i) | ||
for (var i = colX.length - 1; i >= 0; --i) | ||
if (item.x > colX[i]) { | ||
@@ -51,5 +48,5 @@ col = i; | ||
} | ||
}; | ||
} | ||
processItem(this.currentItem); // apply on header's first item | ||
return processItem; // then the same function will be run on all following items, until another rule is triggered | ||
}; |
@@ -10,13 +10,13 @@ /** | ||
function getTopPos(item){ | ||
function getTopPos(item) { | ||
return item.y; | ||
} | ||
function getLeftPos(item){ | ||
function getLeftPos(item) { | ||
return item.x; | ||
} | ||
function getText(item){ | ||
function getText(item) { | ||
return item.text; | ||
}; | ||
} | ||
@@ -29,23 +29,24 @@ /** | ||
**/ | ||
function makeFloorClassifier(nbClusters, arr){ | ||
function makeFloorClassifier(nbClusters, arr) { | ||
var min = Math.min.apply(Math, arr); | ||
var delta = Math.max.apply(Math, arr) - min; | ||
min -= (delta / nbClusters) / 2; | ||
return function classify(value){ | ||
return Math.floor(nbClusters * (value - min) / delta); | ||
min -= delta / nbClusters / 2; | ||
return function classify(value) { | ||
return Math.floor((nbClusters * (value - min)) / delta); | ||
}; | ||
} | ||
function makeColumnClassifier(header){ | ||
var colX = [0].concat(header.map(getLeftPos)).sort(function(a,b){return a-b;}); | ||
return function classify(item){ | ||
for (var i=colX.length-1; i>-1; --i) | ||
if (getLeftPos(item) >= colX[i]) | ||
return i; | ||
function makeColumnClassifier(header) { | ||
var colX = [0].concat(header.map(getLeftPos)).sort(function(a, b) { | ||
return a - b; | ||
}); | ||
return function classify(item) { | ||
for (var i = colX.length - 1; i > -1; --i) | ||
if (getLeftPos(item) >= colX[i]) return i; | ||
}; | ||
} | ||
function buildRowList(items, classifyRow){ | ||
function buildRowList(items, classifyRow) { | ||
var rows = []; | ||
for (var i in items){ | ||
for (var i in items) { | ||
var item = items[i]; | ||
@@ -58,44 +59,55 @@ var row = classifyRow(getTopPos(item)); | ||
function joinCellCollisions(separ){ | ||
return function(cell){ | ||
return (cell || []).map(getText).join(separ).substr(0, 7); | ||
function joinCellCollisions(separ) { | ||
return function(cell) { | ||
return (cell || []) | ||
.map(getText) | ||
.join(separ) | ||
.substr(0, 7); | ||
}; | ||
} | ||
function fillTab(str){ | ||
function fillTab(str) { | ||
return str.substr(0, 7); | ||
} | ||
function renderTable(table){ | ||
return (table || []).map(function(row){ | ||
return (row || []).map(fillTab).join("\t"); | ||
}).join("\n"); | ||
function renderTable(table) { | ||
return (table || []) | ||
.map(function(row) { | ||
return (row || []).map(fillTab).join("\t"); | ||
}) | ||
.join("\n"); | ||
} | ||
function renderMatrix(matrix){ | ||
return (matrix || []).map(function(row){ | ||
return (row || []).map(joinCellCollisions("+")).join("\t"); | ||
}).join("\n"); | ||
function renderMatrix(matrix) { | ||
return (matrix || []) | ||
.map(function(row) { | ||
return (row || []).map(joinCellCollisions("+")).join("\t"); | ||
}) | ||
.join("\n"); | ||
} | ||
function renderRows(rows){ | ||
return (rows || []).map(function(row, rowId){ | ||
var cells = [ rowId + ":" ]; | ||
for (var i in row) | ||
cells.push((Math.floor(row[i].x) + ":" + row[i].text).substr(0, 7)); | ||
return cells.join("\t"); | ||
}).join("\n"); | ||
function renderRows(rows) { | ||
return (rows || []) | ||
.map(function(row, rowId) { | ||
var cells = [rowId + ":"]; | ||
for (var i in row) | ||
cells.push((Math.floor(row[i].x) + ":" + row[i].text).substr(0, 7)); | ||
return cells.join("\t"); | ||
}) | ||
.join("\n"); | ||
} | ||
function renderItems(items) { | ||
return items.map(function(i){ | ||
return [i.y, i.x, i.text].join("\t"); | ||
}).join("\n"); | ||
return items | ||
.map(function(i) { | ||
return [i.y, i.x, i.text].join("\t"); | ||
}) | ||
.join("\n"); | ||
} | ||
function buildMatrix(rows, classifyColumn){ | ||
function buildMatrix(rows, classifyColumn) { | ||
var matrix = []; | ||
for (var y in rows){ | ||
for (var y in rows) { | ||
var row = []; | ||
for (var x in rows[y]){ | ||
for (var x in rows[y]) { | ||
var item = rows[y][x]; | ||
@@ -110,6 +122,6 @@ var colN = classifyColumn(item); | ||
function detectCollisions(matrix){ | ||
function detectCollisions(matrix) { | ||
var collisions = []; | ||
(matrix || []).map(function(row, rowN){ | ||
(row || []).map(function(cellItems, colN){ | ||
(matrix || []).map(function(row, rowN) { | ||
(row || []).map(function(cellItems, colN) { | ||
if (cellItems.length > 1) | ||
@@ -126,6 +138,5 @@ collisions.push({ | ||
function makeAccumulator(nbRows, headerRow){ | ||
function makeAccumulator(nbRows, headerRow) { | ||
var rule = this, | ||
items = []; | ||
items = []; | ||
@@ -137,10 +148,10 @@ rule.nbRows = nbRows || 0; | ||
matrix: null | ||
}; | ||
}; | ||
function accumulate(item){ | ||
function accumulate(item) { | ||
items.push(item); | ||
}; | ||
} | ||
// when parsing is done: generate a clean table, from items. | ||
rule.whenDone(function(){ | ||
rule.whenDone(function() { | ||
// classify items into rows | ||
@@ -153,7 +164,6 @@ var classifyRow = makeFloorClassifier(rule.nbRows, items.map(getTopPos)); | ||
this.output.matrix = buildMatrix(this.output.rows, classifyColumn); | ||
}); | ||
return accumulate; // then the same function will be run on all following items, until another rule is triggered | ||
}; | ||
} | ||
@@ -160,0 +170,0 @@ module.exports = makeAccumulator; |
@@ -6,3 +6,3 @@ /** | ||
**/ | ||
function SequentialParser(accumulators, callback){ | ||
function SequentialParser(accumulators, callback) { | ||
var step = 0; | ||
@@ -12,8 +12,10 @@ var fields = {}; | ||
fields: fields, | ||
addField: function(key, value){ | ||
addField: function(key, value) { | ||
this.fields[key] = value; | ||
}, | ||
parseItem: function(item){ | ||
parseItem: function(item) { | ||
if (step >= accumulators.length) { | ||
return console.warn("warning: skipping item, because SequentialParser is done."); | ||
return console.warn( | ||
"warning: skipping item, because SequentialParser is done." | ||
); | ||
} | ||
@@ -24,10 +26,7 @@ var current = accumulators[step]; | ||
++step; | ||
} | ||
else if (current.accumulator) { | ||
} else if (current.accumulator) { | ||
var doneAccumulating = current.accumulator(item, this); | ||
if (doneAccumulating) | ||
++step; | ||
} | ||
else // no action => skip item | ||
++step; | ||
if (doneAccumulating) ++step; | ||
} // no action => skip item | ||
else ++step; | ||
if (!item || step >= accumulators.length) { | ||
@@ -34,0 +33,0 @@ callback && callback(null, this); |
@@ -9,23 +9,26 @@ /** | ||
function TableParser(){ | ||
function TableParser() { | ||
this.rows = {}; | ||
}; | ||
} | ||
TableParser.prototype.processItem = function(item, col){ | ||
var row = this.rows[""+item.y] = this.rows[""+item.y] || {}; | ||
TableParser.prototype.processItem = function(item, col) { | ||
var row = (this.rows["" + item.y] = this.rows["" + item.y] || {}); | ||
(row[col] = row[col] || []).push(item); | ||
} | ||
}; | ||
TableParser.prototype.processHeadingItem = function(item, col){ | ||
this.processItem({ | ||
y: 0, | ||
x: item.x, | ||
text: item.text | ||
}, col); | ||
} | ||
TableParser.prototype.processHeadingItem = function(item, col) { | ||
this.processItem( | ||
{ | ||
y: 0, | ||
x: item.x, | ||
text: item.text | ||
}, | ||
col | ||
); | ||
}; | ||
// Rows | ||
function sortAsFloatValues(values){ | ||
return values.slice().sort(function(a, b){ | ||
function sortAsFloatValues(values) { | ||
return values.slice().sort(function(a, b) { | ||
return parseFloat(a) - parseFloat(b); | ||
@@ -35,40 +38,39 @@ }); | ||
TableParser.prototype.getRows = function(){ | ||
TableParser.prototype.getRows = function() { | ||
var rows = this.rows; | ||
var yValues = sortAsFloatValues(Object.keys(rows)); | ||
return yValues.map(function(y){ | ||
return yValues.map(function(y) { | ||
return rows["" + y]; | ||
}); | ||
} | ||
}; | ||
function renderRows(rows){ | ||
return (rows || []).map(function(row, rowId){ | ||
var cells = []; | ||
for (var i in row) | ||
for (var j in row[i]) | ||
cells.push(row[i][j].x + ": " + row[i][j].text); | ||
return rowId + ":\t" + cells.join(", "); | ||
}).join("\n"); | ||
function renderRows(rows) { | ||
return (rows || []) | ||
.map(function(row, rowId) { | ||
var cells = []; | ||
for (var i in row) | ||
for (var j in row[i]) cells.push(row[i][j].x + ": " + row[i][j].text); | ||
return rowId + ":\t" + cells.join(", "); | ||
}) | ||
.join("\n"); | ||
} | ||
TableParser.prototype.renderRows = function(){ | ||
TableParser.prototype.renderRows = function() { | ||
return renderRows(this.getRows()); | ||
} | ||
}; | ||
// Matrix | ||
function getSortedXValues(rows){ | ||
function getSortedXValues(rows) { | ||
var xSet = {}; | ||
for (var y in rows) | ||
for (var x in rows[y]) | ||
xSet[x] = true; | ||
for (var y in rows) for (var x in rows[y]) xSet[x] = true; | ||
return sortAsFloatValues(Object.keys(xSet)); | ||
} | ||
TableParser.prototype.getMatrix = function(){ | ||
TableParser.prototype.getMatrix = function() { | ||
var rows = this.getRows(); | ||
var xValues = getSortedXValues(rows); | ||
return rows.map(function(row, y){ | ||
return rows.map(function(row, y) { | ||
var rowNew = []; | ||
for (var x in row){ | ||
for (var x in row) { | ||
var items = row[x]; | ||
@@ -80,24 +82,29 @@ var colN = xValues.indexOf(x); | ||
}); | ||
} | ||
}; | ||
function getText(item){ | ||
function getText(item) { | ||
return item.text; | ||
}; | ||
} | ||
function joinCellCollisions(separ){ | ||
return function(cell){ | ||
return (cell || []).map(getText).join(separ).substr(0, 7); | ||
function joinCellCollisions(separ) { | ||
return function(cell) { | ||
return (cell || []) | ||
.map(getText) | ||
.join(separ) | ||
.substr(0, 7); | ||
}; | ||
} | ||
function renderMatrix(matrix){ | ||
return (matrix || []).map(function(row){ | ||
return (row || []).map(joinCellCollisions("+")).join("\t"); | ||
}).join("\n"); | ||
function renderMatrix(matrix) { | ||
return (matrix || []) | ||
.map(function(row) { | ||
return (row || []).map(joinCellCollisions("+")).join("\t"); | ||
}) | ||
.join("\n"); | ||
} | ||
TableParser.prototype.renderMatrix = function(){ | ||
TableParser.prototype.renderMatrix = function() { | ||
return renderMatrix(this.getMatrix()); | ||
} | ||
}; | ||
module.exports = TableParser; |
{ | ||
"name": "pdfreader", | ||
"version": "1.0.0", | ||
"version": "1.0.2", | ||
"description": "Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.", | ||
"main": "index.js", | ||
"scripts": { | ||
"test:print:result": "echo ✅ Tests passed.", | ||
"prettier:print": "prettier --list-different \"./**/*.js\" \"./**/*.md\"", | ||
"prettier:check": "npm run -s prettier:print 1>&2; exit $(npm run -s prettier:print | wc -l)", | ||
"prettier:fix": "prettier \"./**/*.js\" \"./**/*.md\" --write", | ||
"test:print:result": "echo ✅ All tests passed.", | ||
"test:diff:buffer": "node parseAsBuffer.js test/sample.pdf >test/test-buffer-snapshot.log; git --no-pager diff test/test-buffer-snapshot.log 1>&2; exit $(git --no-pager diff test/test-buffer-snapshot.log | wc -l)", | ||
@@ -13,3 +16,4 @@ "test:diff:file": "node test/test.js >test/test-snapshot.log; git --no-pager diff test/test-snapshot.log 1>&2; exit $(git --no-pager diff test/test-snapshot.log | wc -l)", | ||
"test:sample": "node test/test.js && node parseAsBuffer.js test/sample.pdf", | ||
"test": "npm run -s test:sample && npm run -s test:regression && npm run -s test:diff && npm run -s test:print:result" | ||
"test:functional": "npm run -s test:sample && npm run -s test:regression && npm run -s test:diff", | ||
"test": "npm run -s prettier:check && npm run -s test:functional && npm run -s test:print:result" | ||
}, | ||
@@ -41,4 +45,10 @@ "repository": { | ||
"dependencies": { | ||
"pdf2json": "1.1.2" | ||
"pdf2json": "1.1.7" | ||
}, | ||
"devDependencies": { | ||
"eslint-config-prettier": "^3.3.0", | ||
"eslint-plugin-prettier": "^3.0.0", | ||
"prettier": "1.15.3", | ||
"semantic-release": "^15.13.3" | ||
} | ||
} |
30
parse.js
var LOG = require("./lib/LOG.js").toggle(false); | ||
var PdfReader = require("./index.js").PdfReader; | ||
function printRawItems(filename, callback){ | ||
new PdfReader().parseFileItems(filename, function(err, item){ | ||
if (err) | ||
callback(err); | ||
else if (!item) | ||
callback(); | ||
else if (item.file) | ||
console.log("file =", item.file.path); | ||
else if (item.page) | ||
console.log("page =", item.page); | ||
function printRawItems(filename, callback) { | ||
new PdfReader().parseFileItems(filename, function(err, item) { | ||
if (err) callback(err); | ||
else if (!item) callback(); | ||
else if (item.file) console.log("file =", item.file.path); | ||
else if (item.page) console.log("page =", item.page); | ||
else if (item.x) | ||
console.log([item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join("\t")); | ||
else | ||
console.warn(item); | ||
console.log( | ||
[item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join( | ||
"\t" | ||
) | ||
); | ||
else console.warn(item); | ||
}); | ||
@@ -24,8 +23,7 @@ } | ||
console.error("please provide the name of a PDF file"); | ||
} | ||
else { | ||
} else { | ||
console.warn("printing raw items from file:", filename, "..."); | ||
printRawItems(filename, function(){ | ||
printRawItems(filename, function() { | ||
console.warn("done."); | ||
}); | ||
} |
var LOG = require("./lib/LOG.js").toggle(false); | ||
var PdfReader = require("./index.js").PdfReader; | ||
var fs = require('fs'); | ||
var fs = require("fs"); | ||
function printRawItems(pdfBuffer, callback){ | ||
new PdfReader().parseBuffer(pdfBuffer, function(err, item){ | ||
if (err) | ||
callback(err); | ||
else if (!item) | ||
callback(); | ||
else if (item.file) | ||
console.log("file =", item.file.path); | ||
else if (item.page) | ||
console.log("page =", item.page); | ||
function printRawItems(pdfBuffer, callback) { | ||
new PdfReader().parseBuffer(pdfBuffer, function(err, item) { | ||
if (err) callback(err); | ||
else if (!item) callback(); | ||
else if (item.file) console.log("file =", item.file.path); | ||
else if (item.page) console.log("page =", item.page); | ||
else if (item.x) | ||
console.log([item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join("\t")); | ||
else | ||
console.warn(item); | ||
console.log( | ||
[item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join( | ||
"\t" | ||
) | ||
); | ||
else console.warn(item); | ||
}); | ||
@@ -25,10 +24,9 @@ } | ||
console.error("please provide the name of a PDF file"); | ||
} | ||
else { | ||
} else { | ||
console.warn("printing raw items from file:", filename, "..."); | ||
fs.readFile(filename, (err, pdfBuffer) => { | ||
printRawItems(pdfBuffer, function (){ | ||
console.warn("done."); | ||
}); | ||
printRawItems(pdfBuffer, function() { | ||
console.warn("done."); | ||
}); | ||
}); | ||
} |
@@ -5,3 +5,3 @@ /** | ||
* This content is released under the MIT License. | ||
* | ||
* | ||
* An item object can match one of the following objects: | ||
@@ -12,3 +12,3 @@ * - null, when the parsing is over, or an error occurred. | ||
* - {text:string, x:float, y:float, w:float, h:float...}, represents each text with its position. | ||
* | ||
* | ||
**/ | ||
@@ -19,11 +19,11 @@ | ||
function forEachItem(pdf, handler){ | ||
function forEachItem(pdf, handler) { | ||
var pageNumber = 0; | ||
for (var p in pdf.data.Pages) { | ||
var page = pdf.data.Pages[p]; | ||
for (var p in pdf.formImage.Pages) { | ||
var page = pdf.formImage.Pages[p]; | ||
var number = ++pageNumber; | ||
handler(null, { | ||
page: number, | ||
width: pdf.data.Width, | ||
height:pdf.data.Pages[number-1].Height | ||
width: pdf.formImage.Width, | ||
height: pdf.formImage.Pages[number - 1].Height | ||
}); | ||
@@ -39,3 +39,3 @@ for (var t in page.Texts) { | ||
function PdfReader(options){ | ||
function PdfReader(options) { | ||
LOG("PdfReader"); // only displayed if LOG.js was first loaded with `true` as init parameter | ||
@@ -48,7 +48,7 @@ this.options = options || {}; | ||
**/ | ||
PdfReader.prototype.parseFileItems = function(pdfFilePath, itemHandler){ | ||
itemHandler(null, { file: { path: pdfFilePath }}); | ||
PdfReader.prototype.parseFileItems = function(pdfFilePath, itemHandler) { | ||
itemHandler(null, { file: { path: pdfFilePath } }); | ||
var pdfParser = new PFParser(); | ||
pdfParser.on("pdfParser_dataError", itemHandler); | ||
pdfParser.on("pdfParser_dataReady", function (pdfData){ | ||
pdfParser.on("pdfParser_dataReady", function(pdfData) { | ||
forEachItem(pdfData, itemHandler); | ||
@@ -63,7 +63,7 @@ }); | ||
*/ | ||
PdfReader.prototype.parseBuffer = function(pdfBuffer, itemHandler){ | ||
itemHandler(null, { file: { buffer: pdfBuffer }}); | ||
PdfReader.prototype.parseBuffer = function(pdfBuffer, itemHandler) { | ||
itemHandler(null, { file: { buffer: pdfBuffer } }); | ||
var pdfParser = new PFParser(); | ||
pdfParser.on("pdfParser_dataError", itemHandler); | ||
pdfParser.on("pdfParser_dataReady", function (pdfData){ | ||
pdfParser.on("pdfParser_dataReady", function(pdfData) { | ||
forEachItem(pdfData, itemHandler); | ||
@@ -75,3 +75,2 @@ }); | ||
module.exports = PdfReader; |
149
README.md
@@ -1,8 +0,6 @@ | ||
[![CircleCI](https://circleci.com/gh/adrienjoly/npm-pdfreader.svg?style=svg)](https://circleci.com/gh/adrienjoly/npm-pdfreader) | ||
# pdfreader [![Continuous Integration](https://circleci.com/gh/adrienjoly/npm-pdfreader.svg?style=shield)](https://circleci.com/gh/adrienjoly/npm-pdfreader) [![Code Quality](https://api.codacy.com/project/badge/Grade/73d37dbb0ff84795acf65a55c5936d83)](https://www.codacy.com/app/adrien-joly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade) | ||
# pdfreader | ||
Read text and parse tables from PDF files. | ||
Supports tabular data with automatic column detection, and rule-based parsing. | ||
Supports **tabular data** with automatic column detection, and **rule-based parsing**. | ||
@@ -13,2 +11,9 @@ Dependencies: it is based on [pdf2json](https://www.npmjs.com/package/pdf2json), which itself relies on Mozilla's [pdf.js](https://github.com/mozilla/pdf.js/). | ||
Summary: | ||
- [Installation, tests and CLI usage](#installation-tests-and-cli-usage) | ||
- [Raw PDF reading](#raw-pdf-reading) (incl. examples) | ||
- [Rule-based data extraction](#rule-based-data-extraction) | ||
- [Troubleshooting & FAQ](#troubleshooting--faq) | ||
## Installation, tests and CLI usage | ||
@@ -23,37 +28,31 @@ | ||
The PdfReader class reads a PDF file, and calls a function on each item found while parsing that file. | ||
This module exposes the `PdfReader` class, to be instantiated. | ||
An item object can match one of the following objects: | ||
Your instance has two methods for parsing a PDF. They return the same output and differ only in input: `PdfReader.parseFileItems` (as below) for a filename, and `PdfReader.parseBuffer` (see: "Raw PDF reading from a PDF already in memory (buffer)") from data that you don't want to reference from the filesystem. | ||
- `null`, when the parsing is over, or an error occured. | ||
- `{file:{path:string}}`, when a PDF file is being opened. | ||
- `{page:integer, width:float, height:float}`, when a new page is being parsed, provides the page number, starting at 1. | ||
- `{text:string, x:float, y:float, w:float, h:float...}`, represents each text with its position. | ||
Whichever method you choose, it asks for a callback, which gets called each time the instance finds what it denotes as a PDF item. | ||
Example: | ||
An item object can match one of the following objects: | ||
- `null`, when the parsing is over, or an error occurred. | ||
- File metadata, `{file:{path:string}}`, when a PDF file is being opened, and is always the first item. | ||
- Page metadata, `{page:integer, width:float, height:float}`, when a new page is being parsed, provides the page number, starting at 1. This basically acts as a carriage return for the coordinates of text items to be processed. | ||
- Text items, `{text:string, x:float, y:float, w:float, h:float...}`, which you can think of as simple objects with a text property, and floating 2D AABB coordinates on the page. | ||
It's up to your callback to process these items into a data structure of your choice, and also to handle any errors thrown to it. | ||
For example: | ||
```javascript | ||
new PdfReader().parseFileItems("sample.pdf", function(err, item){ | ||
if (err) | ||
callback(err); | ||
else if (!item) | ||
callback(); | ||
else if (item.text) | ||
console.log(item.text); | ||
new PdfReader().parseFileItems("sample.pdf", function(err, item) { | ||
if (err) callback(err); | ||
else if (!item) callback(); | ||
else if (item.text) console.log(item.text); | ||
}); | ||
``` | ||
## Raw PDF reading from a PDF already in memory (buffer) | ||
### Raw PDF reading from a PDF already in memory (buffer) | ||
The PdfReader class reads a PDF file, and calls a function on each item found while parsing that file. | ||
As above, but reading from a buffer in memory rather than from a file referenced by path. For example: | ||
An item object can match one of the following objects: | ||
- `null`, when the parsing is over, or an error occured. | ||
- `{file:{path:string}}`, when a PDF file is being opened. | ||
- `{page:integer}`, when a new page is being parsed, provides the page number, starting at 1. | ||
- `{text:string, x:float, y:float, w:float, h:float...}`, represents each text with its position. | ||
Example: | ||
```javascript | ||
@@ -63,9 +62,6 @@ var fs = require("fs"); | ||
// pdfBuffer contains the file content | ||
new PdfReader().parseBuffer(pdfBuffer, function(err, item){ | ||
if (err) | ||
callback(err); | ||
else if (!item) | ||
callback(); | ||
else if (item.text) | ||
console.log(item.text); | ||
new PdfReader().parseBuffer(pdfBuffer, function(err, item) { | ||
if (err) callback(err); | ||
else if (!item) callback(); | ||
else if (item.text) console.log(item.text); | ||
}); | ||
@@ -75,3 +71,3 @@ }); | ||
## Example: parsing lines of text from a PDF file | ||
### Example: parsing lines of text from a PDF file | ||
@@ -83,3 +79,3 @@ ![example cv resume parse convert pdf to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseRows.png) | ||
```js | ||
var pdfreader = require('pdfreader'); | ||
var pdfreader = require("pdfreader"); | ||
@@ -91,13 +87,15 @@ var rows = {}; // indexed by y-position | ||
.sort((y1, y2) => parseFloat(y1) - parseFloat(y2)) // sort float positions | ||
.forEach((y) => console.log((rows[y] || []).join(''))); | ||
.forEach(y => console.log((rows[y] || []).join(""))); | ||
} | ||
new pdfreader.PdfReader().parseFileItems('CV_ErhanYasar.pdf', function(err, item){ | ||
new pdfreader.PdfReader().parseFileItems("CV_ErhanYasar.pdf", function( | ||
err, | ||
item | ||
) { | ||
if (!item || item.page) { | ||
// end of file, or page | ||
printRows(); | ||
console.log('PAGE:', item.page); | ||
console.log("PAGE:", item.page); | ||
rows = {}; // clear rows for next page | ||
} | ||
else if (item.text) { | ||
} else if (item.text) { | ||
// accumulate text items into rows object, per line | ||
@@ -111,3 +109,3 @@ (rows[item.y] = rows[item.y] || []).push(item.text); | ||
## Example: parsing a table from a PDF file | ||
### Example: parsing a table from a PDF file | ||
@@ -119,29 +117,35 @@ ![example cv resume parse convert pdf table to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseTable.png) | ||
```js | ||
var pdfreader = require('pdfreader'); | ||
var pdfreader = require("pdfreader"); | ||
const nbCols = 2; | ||
const cellPadding = 40; // each cell is padded to fit 40 characters | ||
const columnQuantitizer = (item) => parseFloat(item.x) >= 20; | ||
const columnQuantitizer = item => parseFloat(item.x) >= 20; | ||
const padColumns = (array, nb) => | ||
Array.apply(null, {length: nb}).map((val, i) => array[i] || []); | ||
// .. because map() skips undefined elements | ||
Array.apply(null, { length: nb }).map((val, i) => array[i] || []); | ||
// .. because map() skips undefined elements | ||
const mergeCells = (cells) => (cells || []) | ||
.map((cell) => cell.text).join('') // merge cells | ||
.substr(0, cellPadding).padEnd(cellPadding, ' '); // padding | ||
const mergeCells = cells => | ||
(cells || []) | ||
.map(cell => cell.text) | ||
.join("") // merge cells | ||
.substr(0, cellPadding) | ||
.padEnd(cellPadding, " "); // padding | ||
const renderMatrix = (matrix) => (matrix || []) | ||
.map((row, y) => padColumns(row, nbCols) | ||
.map(mergeCells) | ||
.join(' | ') | ||
).join('\n'); | ||
const renderMatrix = matrix => | ||
(matrix || []) | ||
.map((row, y) => | ||
padColumns(row, nbCols) | ||
.map(mergeCells) | ||
.join(" | ") | ||
) | ||
.join("\n"); | ||
var table = new pdfreader.TableParser(); | ||
new pdfreader.PdfReader().parseFileItems(filename, function(err, item){ | ||
new pdfreader.PdfReader().parseFileItems(filename, function(err, item) { | ||
if (!item || item.page) { | ||
// end of file, or page | ||
console.log(renderMatrix(table.getMatrix())); | ||
console.log('PAGE:', item.page); | ||
console.log("PAGE:", item.page); | ||
table = new pdfreader.TableParser(); // new/clear table for next page | ||
@@ -157,3 +161,2 @@ } else if (item.text) { | ||
## Rule-based data extraction | ||
@@ -169,8 +172,16 @@ | ||
var processItem = Rule.makeItemProcessor([ | ||
Rule.on(/^Hello \"(.*)\"$/).extractRegexpValues().then(displayValue), | ||
Rule.on(/^Value\:/).parseNextItemValue().then(displayValue), | ||
Rule.on(/^c1$/).parseTable(3).then(displayTable), | ||
Rule.on(/^Values\:/).accumulateAfterHeading().then(displayValue), | ||
Rule.on(/^Hello \"(.*)\"$/) | ||
.extractRegexpValues() | ||
.then(displayValue), | ||
Rule.on(/^Value\:/) | ||
.parseNextItemValue() | ||
.then(displayValue), | ||
Rule.on(/^c1$/) | ||
.parseTable(3) | ||
.then(displayTable), | ||
Rule.on(/^Values\:/) | ||
.accumulateAfterHeading() | ||
.then(displayValue) | ||
]); | ||
new PdfReader().parseFileItems("sample.pdf", function(err, item){ | ||
new PdfReader().parseFileItems("sample.pdf", function(err, item) { | ||
processItem(item); | ||
@@ -186,3 +197,3 @@ }); | ||
## Problem: when I use pdfreader from my express-based node.js app, I'm getting `Cannot read property 'userAgent' of undefined`. | ||
### `Cannot read property 'userAgent' of undefined` error from an express-based node.js app | ||
@@ -193,10 +204,10 @@ Dmitry found out that you may need to run these instructions before including the `pdfreader` module: | ||
global.navigator = { | ||
userAgent: 'node', | ||
} | ||
userAgent: "node" | ||
}; | ||
window.navigator = { | ||
userAgent: 'node', | ||
} | ||
userAgent: "node" | ||
}; | ||
``` | ||
Source: [express - TypeError: Cannot read property 'userAgent' of undefined error on node.js app run - Stack Overflow](https://stackoverflow.com/questions/49208414/typeerror-cannot-read-property-useragent-of-undefined-error-on-node-js-app-ru) |
99
Rule.js
@@ -13,8 +13,8 @@ /** | ||
**/ | ||
function Rule(regexp){ | ||
function Rule(regexp) { | ||
this.regexp = regexp; | ||
var self = this; | ||
// proxy accumulators methods | ||
Object.keys(Rule.accumulators).forEach(function(name){ | ||
self[name] = function(){ | ||
Object.keys(Rule.accumulators).forEach(function(name) { | ||
self[name] = function() { | ||
LOG("building rule:", regexp, "->", name); | ||
@@ -30,11 +30,11 @@ self.methodName = name; | ||
// shortcut for defining Rule objects in a more concise manner | ||
Rule.on = function(regexp){ | ||
Rule.on = function(regexp) { | ||
return new Rule(regexp); | ||
} | ||
}; | ||
Rule.after = function(regexp){ | ||
Rule.after = function(regexp) { | ||
var rule = new Rule(regexp); | ||
rule.skipCurrentItem = true; | ||
return rule; | ||
} | ||
}; | ||
@@ -46,5 +46,5 @@ /** | ||
**/ | ||
Rule.prototype.then = function(fct){ | ||
Rule.prototype.then = function(fct) { | ||
var self = this; | ||
this.terminate = function(){ | ||
this.terminate = function() { | ||
fct.call(self, self.output); | ||
@@ -56,3 +56,3 @@ }; | ||
// private function that checks a PDF item against the Rule's regexp, and returns the corresponding accumulator. | ||
Rule.prototype.test = function(item){ | ||
Rule.prototype.test = function(item) { | ||
if (this.regexp.test(item.text)) { | ||
@@ -62,3 +62,6 @@ // lazy init of accumulators: build and init the accumulator on first match | ||
if (!this.accumulatorImpl && this.accumulatorBuilder) { | ||
this.accumulatorImpl = this.accumulatorBuilder.apply(this, this.accumulatorParams); | ||
this.accumulatorImpl = this.accumulatorBuilder.apply( | ||
this, | ||
this.accumulatorParams | ||
); | ||
this.accumulatorImpl.methodName = this.methodName; | ||
@@ -72,9 +75,9 @@ this.accumulatorImpl.terminate = this.terminate; | ||
// intended to be run from accumulator, in order to process output before calling termination then() handler. | ||
Rule.prototype.whenDone = function(fct){ | ||
Rule.prototype.whenDone = function(fct) { | ||
var self = this; | ||
var then = this.terminate; | ||
this.terminate = function(){ | ||
this.terminate = function() { | ||
fct.call(self); | ||
then(); | ||
} | ||
}; | ||
}; | ||
@@ -87,5 +90,5 @@ | ||
**/ | ||
Rule.makeItemProcessor = function(rules){ | ||
Rule.makeItemProcessor = function(rules) { | ||
var currentAccumulator = null; | ||
function terminateAccumulator(){ | ||
function terminateAccumulator() { | ||
var terminatePreviousAcc = (currentAccumulator || {}).terminate; | ||
@@ -98,7 +101,7 @@ if (terminatePreviousAcc) { | ||
var applyRulesOnNextItem = true; | ||
return function(item){ | ||
if (!item) // last item of the file => flush buffers | ||
return function(item) { | ||
if (!item) | ||
// last item of the file => flush buffers | ||
return terminateAccumulator(); | ||
else if (!item.text) | ||
return; | ||
else if (!item.text) return; | ||
//LOG("ITEM:", item.text, "=> apply rules:", applyRulesOnNextItem); | ||
@@ -111,4 +114,3 @@ if (applyRulesOnNextItem) | ||
LOG("current accumulator:", accumulator.methodName); | ||
if (rules[r].skipCurrentItem) | ||
applyRulesOnNextItem = false; | ||
if (rules[r].skipCurrentItem) applyRulesOnNextItem = false; | ||
currentAccumulator = accumulator; | ||
@@ -119,9 +121,7 @@ delete rules[r]; | ||
} | ||
else | ||
applyRulesOnNextItem = true; | ||
else applyRulesOnNextItem = true; | ||
// if reaching this point, the current item matches none of the rules => accumulating data on current accumulator | ||
if (currentAccumulator) | ||
applyRulesOnNextItem = !currentAccumulator(item); | ||
if (currentAccumulator) applyRulesOnNextItem = !currentAccumulator(item); | ||
}; | ||
} | ||
}; | ||
@@ -135,9 +135,11 @@ /** | ||
Rule.accumulators = { | ||
stopAccumulating: function(){ return function(){}; }, | ||
stopAccumulating: function() { | ||
return function() {}; | ||
} | ||
}; | ||
// method for adding accumulators | ||
Rule.addAccumulator = function(methodName, methodBuilder){ | ||
Rule.addAccumulator = function(methodName, methodBuilder) { | ||
Rule.accumulators[methodName] = methodBuilder; | ||
} | ||
}; | ||
@@ -148,6 +150,6 @@ /** | ||
**/ | ||
Rule.addAccumulator("extractRegexpValues", function(){ | ||
Rule.addAccumulator("extractRegexpValues", function() { | ||
var matches = this.regexp.exec(this.currentItem.text); | ||
this.output = matches.slice(1); | ||
return function(){}; // following lines are not to be processed by this accumulator | ||
return function() {}; // following lines are not to be processed by this accumulator | ||
}); | ||
@@ -158,11 +160,10 @@ | ||
**/ | ||
Rule.addAccumulator("parseNextItemValue", function(){ | ||
Rule.addAccumulator("parseNextItemValue", function() { | ||
var self = this, | ||
done = false; | ||
return function (item){ | ||
if (done) | ||
return; | ||
done = false; | ||
return function(item) { | ||
if (done) return; | ||
done = true; | ||
self.output = item.text; | ||
} | ||
}; | ||
}); | ||
@@ -173,5 +174,5 @@ | ||
**/ | ||
Rule.addAccumulator("accumulateAfterHeading", function(){ | ||
var output = this.output = []; | ||
return function accumulate(item){ | ||
Rule.addAccumulator("accumulateAfterHeading", function() { | ||
var output = (this.output = []); | ||
return function accumulate(item) { | ||
output.push(item.text); | ||
@@ -184,11 +185,9 @@ }; | ||
**/ | ||
Rule.addAccumulator("accumulateFromSameX", function(){ | ||
var output = this.output = [], | ||
x = null; | ||
return function accumulate(item){ | ||
if (x === null) | ||
x = item.x; | ||
if (x == item.x) | ||
output.push(item.text); | ||
} | ||
Rule.addAccumulator("accumulateFromSameX", function() { | ||
var output = (this.output = []), | ||
x = null; | ||
return function accumulate(item) { | ||
if (x === null) x = item.x; | ||
if (x == item.x) output.push(item.text); | ||
}; | ||
}); | ||
@@ -195,0 +194,0 @@ |
@@ -10,10 +10,7 @@ var LOG = require("../lib/LOG.js").toggle(false); | ||
function printRawItems(callback){ | ||
new PdfReader().parseFileItems(TESTFILE, function(err, item){ | ||
if (err) | ||
callback(err); | ||
else if (!item) | ||
callback(); | ||
else | ||
console.log(item); | ||
function printRawItems(callback) { | ||
new PdfReader().parseFileItems(TESTFILE, function(err, item) { | ||
if (err) callback(err); | ||
else if (!item) callback(); | ||
else console.log(item); | ||
}); | ||
@@ -24,24 +21,29 @@ } | ||
function parseData(callback){ | ||
function displayValue(value){ | ||
function parseData(callback) { | ||
function displayValue(value) { | ||
console.log("extracted value:", value); | ||
} | ||
function displayTable(table){ | ||
for (var i=0; i<table.length; ++i) | ||
console.log(table[i].join("\t")); | ||
function displayTable(table) { | ||
for (var i = 0; i < table.length; ++i) console.log(table[i].join("\t")); | ||
} | ||
var rules = [ | ||
Rule.on(/^Hello \"(.*)\"$/).extractRegexpValues().then(displayValue), | ||
Rule.on(/^Value\:/).parseNextItemValue().then(displayValue), | ||
Rule.on(/^c1$/).parseTable(3).then(displayTable), | ||
Rule.on(/^Values\:/).accumulateAfterHeading().then(displayValue), | ||
Rule.on(/^Hello \"(.*)\"$/) | ||
.extractRegexpValues() | ||
.then(displayValue), | ||
Rule.on(/^Value\:/) | ||
.parseNextItemValue() | ||
.then(displayValue), | ||
Rule.on(/^c1$/) | ||
.parseTable(3) | ||
.then(displayTable), | ||
Rule.on(/^Values\:/) | ||
.accumulateAfterHeading() | ||
.then(displayValue) | ||
]; | ||
var processItem = Rule.makeItemProcessor(rules); | ||
new PdfReader().parseFileItems(TESTFILE, function(err, item){ | ||
if (err) | ||
callback(err); | ||
new PdfReader().parseFileItems(TESTFILE, function(err, item) { | ||
if (err) callback(err); | ||
else { | ||
processItem(item); | ||
if (!item) | ||
callback(err, item); | ||
if (!item) callback(err, item); | ||
} | ||
@@ -54,7 +56,7 @@ }); | ||
console.log("\ntest 1: raw items from sample.pdf\n"); | ||
printRawItems(function(){ | ||
printRawItems(function() { | ||
console.log("\ntest 2: parse values from sample.pdf\n"); | ||
parseData(function(){ | ||
parseData(function() { | ||
console.log("\ndone.\n"); | ||
}); | ||
}); | ||
}); |
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
29
749
202
1435419
4
+ Added pdf2json@1.1.7 (transitive)
- Removed pdf2json@1.1.2 (transitive)
Updated pdf2json@1.1.7