Comparing version 1.3.1 to 1.4.0
@@ -68,2 +68,3 @@ /** | ||
/** @returns a 3-dimensional matrix: row -> column -> items_colliding_in_column -> item */ | ||
TableParser.prototype.getMatrix = function () { | ||
@@ -83,2 +84,15 @@ var rows = this.getRows(); | ||
/**
 * Flattens the matrix returned by getMatrix() into plain strings, one per cell.
 * For use with console.table().
 * @param {Object} [options]
 * @param {String} [options.collisionSeparator] separator to use when there are
 *   multiple values to join for a given column; any falsy value falls back to ""
 * @returns {String[][]} a 2-dimension matrix: row -> column -> value
 */
TableParser.prototype.getCleanMatrix = function ({ collisionSeparator } = {}) {
  // Resolve the separator once rather than once per cell.
  const separator = collisionSeparator || "";
  return this.getMatrix().map((rowColumns) =>
    rowColumns.map((items) => items.map((item) => item.text).join(separator))
  );
};
function getText(item) { | ||
@@ -85,0 +99,0 @@ return item.text; |
{ | ||
"name": "pdfreader", | ||
"version": "1.3.1", | ||
"version": "1.4.0", | ||
"description": "Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
161
README.md
@@ -1,2 +0,2 @@ | ||
# pdfreader ![Node CI](https://github.com/adrienjoly/npm-pdfreader/workflows/Node%20CI/badge.svg) [![Code Quality](https://api.codacy.com/project/badge/Grade/73d37dbb0ff84795acf65a55c5936d83)](https://www.codacy.com/app/adrien-joly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade) | ||
# pdfreader ![Node CI](https://github.com/adrienjoly/npm-pdfreader/workflows/Node%20CI/badge.svg) [![Code Quality](https://api.codacy.com/project/badge/Grade/73d37dbb0ff84795acf65a55c5936d83)](https://app.codacy.com/gh/adrienjoly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade) | ||
@@ -57,4 +57,17 @@ Read text and parse tables from PDF files. | ||
### Raw PDF reading from a PDF already in memory (buffer) | ||
### Parsing a password-protected PDF file | ||
```javascript | ||
// The password is passed as an option to the PdfReader constructor.
new PdfReader({ password: "YOUR_PASSWORD" }).parseFileItems(
  "test/sample-with-password.pdf",
  function (err, item) {
    if (err) console.error(err);
    else if (!item) console.warn("end of file"); // a falsy item marks the end of the file
    else if (item.text) console.log(item.text);
  }
);
``` | ||
### Raw PDF reading from a PDF buffer | ||
As above, but reading from a buffer in memory rather than from a file referenced by path. For example: | ||
@@ -76,151 +89,17 @@ | ||
### Example: reading from a buffer of an online PDF | ||
### Other examples of use | ||
```javascript | ||
/**
 * Downloads the resource at `url` with `https.get` and buffers the whole
 * response body in memory. Expects `https` to be in scope.
 * @param {string|URL} url address of the resource to download
 * @returns {Promise<Buffer>} resolves with the concatenated response chunks;
 *   rejects if the request or the response stream emits an error
 */
const get = (url) =>
  new Promise((resolve, reject) =>
    https
      .get(url, (res) => {
        const data = [];
        res
          .on("data", (chunk) => data.push(chunk))
          .on("end", () => resolve(Buffer.concat(data)))
          // A failure while streaming the body must settle the promise too.
          .on("error", reject);
      })
      .on("error", reject)
  );
/**
 * Adds a text item to the list of lines, merging it into the line that has
 * exactly the same `y` position if there is one.
 * @param {Array<{y: number, text: string}>} textLines lines collected so far (mutated in place)
 * @param {{y: number, text: string}} item text item to add; it is never modified
 */
function addTextToLines(textLines, item) {
  const existingLine = textLines.find(({ y }) => y === item.y);
  if (existingLine) {
    existingLine.text += " " + item.text;
  } else {
    // Store a copy: later merges append to `text`, and must not alter the
    // object that the caller handed in.
    textLines.push({ ...item });
  }
}
/**
 * Parses a PDF buffer and groups its text into lines, page by page.
 * Relies on `PdfReader` and `addTextToLines` being in scope.
 * @param {Buffer} buffer content of the PDF file
 * @returns {Promise<string[][]>} page -> line -> text; rejects with the first
 *   error reported by the parser
 */
const parseLinesPerPage = (buffer) =>
  new Promise((resolve, reject) => {
    const linesPerPage = [];
    let pageNumber = 0;
    new PdfReader().parseBuffer(buffer, (err, item) => {
      if (err) reject(err);
      else if (!item) {
        // A falsy item marks the end of the document: keep only the texts.
        resolve(linesPerPage.map((page) => page.map((line) => line.text)));
      } else if (item.page) {
        // Page numbers are 1-based, array indexes are 0-based.
        pageNumber = item.page - 1;
        linesPerPage[pageNumber] = [];
      } else if (item.text) {
        addTextToLines(linesPerPage[pageNumber], item);
      }
    });
  });
const url = new URL(
  "https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/master/test/sample.pdf"
);
// Download the PDF, parse it, then print its lines. The chain ends with a
// catch so that a failed download or parse is reported instead of becoming
// an unhandled rejection.
get(url)
  .then((buffer) => parseLinesPerPage(buffer))
  .then((linesPerPage) => console.log(linesPerPage))
  .catch((err) => console.error(err));
``` | ||
### Example: parsing lines of text from a PDF file | ||
![example cv resume parse convert pdf to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseRows.png) | ||
Here is the code required to convert this PDF file into text: | ||
```js | ||
const { PdfReader } = require("pdfreader"); | ||
let rows = {}; // indexed by y-position

/**
 * Prints the accumulated rows from the smallest y-position to the largest,
 * each row being the concatenation of its text fragments, then empties `rows`.
 */
function flushRows() {
  Object.keys(rows) // => array of y-positions (object keys are strings)
    .sort((y1, y2) => parseFloat(y1) - parseFloat(y2)) // compare as floats, not as strings
    .forEach((y) => console.log((rows[y] || []).join("")));
  rows = {}; // clear rows for next page
}
// Collect text items per line, and print them each time a page (or the file) ends.
new PdfReader().parseFileItems("test/sample.pdf", (err, item) => {
  if (err) {
    console.error({ err });
  } else if (!item) {
    // a falsy item marks the end of the file: print the rows of the last page
    flushRows();
    console.log("END OF FILE");
  } else if (item.page) {
    flushRows(); // print the rows of the previous page
    console.log("PAGE:", item.page);
  } else if (item.text) {
    // accumulate text items into rows object, per line
    (rows[item.y] = rows[item.y] || []).push(item.text);
  }
});
``` | ||
Fork this example from [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example). | ||
### Example: parsing a table from a PDF file | ||
![example cv resume parse convert pdf table to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseTable.png) | ||
Here is the code required to convert this PDF file into a textual table: | ||
Source code of the examples above: [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example). | ||
```js | ||
var pdfreader = require("pdfreader"); | ||
For more, see [Examples of use](https://github.com/adrienjoly/npm-pdfreader/discussions/categories/examples-of-use). | ||
const nbCols = 2;
const cellPadding = 40; // each cell is padded to fit 40 characters

/** Column selector for an item: true when its x position is >= 20, false otherwise. */
const columnQuantitizer = (item) => parseFloat(item.x) >= 20;

/**
 * Returns an array of exactly `nb` columns, in which missing columns are
 * replaced by empty arrays.
 * Array.from() is used because map() skips the undefined elements (holes)
 * of a sparse array.
 */
const padColumns = (array, nb) =>
  Array.from({ length: nb }, (_, i) => array[i] || []);

/**
 * Joins the texts of the items of a cell, then truncates and pads the result
 * so that it is exactly `cellPadding` characters long.
 */
const mergeCells = (cells) =>
  (cells || [])
    .map((cell) => cell.text)
    .join("") // merge cells
    .slice(0, cellPadding) // truncate
    .padEnd(cellPadding, " "); // padding

/** Renders a matrix (row -> column -> items) as text: one line per row, columns separated by " | ". */
const renderMatrix = (matrix) =>
  (matrix || [])
    .map((row) => padColumns(row, nbCols).map(mergeCells).join(" | "))
    .join("\n");
// Accumulates the items of the current page; replaced by a new parser for each page.
let table = new pdfreader.TableParser();
new pdfreader.PdfReader().parseFileItems(filename, function (err, item) {
  if (err) {
    console.error(err);
  } else if (!item || item.page) {
    // end of file, or page: print the table accumulated so far
    console.log(renderMatrix(table.getMatrix()));
    // item is falsy at the end of the file: there is no page number to print
    if (item) console.log("PAGE:", item.page);
    table = new pdfreader.TableParser(); // new/clear table for next page
  } else if (item.text) {
    // accumulate text items into the table, in the column picked by columnQuantitizer
    table.processItem(item, columnQuantitizer(item));
  }
});
``` | ||
Fork this example from [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example). | ||
## Example: opening a PDF file with a password | ||
```javascript | ||
// The password is passed as an option to the PdfReader constructor.
new PdfReader({ password: "YOUR_PASSWORD" }).parseFileItems(
  "test/sample-with-password.pdf",
  function (err, item) {
    if (err) console.error(err);
    else if (!item) console.warn("end of file"); // a falsy item marks the end of the file
    else if (item.text) console.log(item.text);
  }
);
``` | ||
## Rule-based data extraction | ||
The Rule class can be used to define and process data extraction rules, while parsing a PDF document. | ||
The `Rule` class can be used to define and process data extraction rules, while parsing a PDF document. | ||
Rule instances expose "accumulators": methods that define the data extraction strategy to be used for each rule. | ||
`Rule` instances expose "accumulators": methods that define the data extraction strategy to be used for each rule. | ||
@@ -227,0 +106,0 @@ Example: |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
713
35130
146