Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

pdfreader

Package Overview
Dependencies
Maintainers
1
Versions
56
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

pdfreader - npm Package Compare versions

Comparing version 1.3.1 to 1.4.0

14

lib/TableParser.js

@@ -68,2 +68,3 @@ /**

/** @returns a 3-dimension matrix: row -> column -> items_colliding_in_column -> item */
TableParser.prototype.getMatrix = function () {

@@ -83,2 +84,15 @@ var rows = this.getRows();

/**
 * Builds a plain-text version of the matrix, for use with console.table().
 * Each item of a cell is reduced to its `text`, and the texts found in a same
 * cell are joined into a single string.
 * @param {Object} [options]
 * @param {String} [options.collisionSeparator] separator inserted between the values
 *   that collide in a given column; a missing or falsy value means "" (no separator)
 * @returns a 2-dimension matrix: row -> column -> value
 */
TableParser.prototype.getCleanMatrix = function ({ collisionSeparator } = {}) {
  const separator = collisionSeparator || "";
  const toCellValue = (cellItems) => {
    const texts = cellItems.map((entry) => entry.text);
    return texts.join(separator);
  };
  const matrix = this.getMatrix();
  return matrix.map((row) => row.map((cellItems) => toCellValue(cellItems)));
};
function getText(item) {

@@ -85,0 +99,0 @@ return item.text;

2

package.json
{
"name": "pdfreader",
"version": "1.3.1",
"version": "1.4.0",
"description": "Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.",

@@ -5,0 +5,0 @@ "main": "index.js",

@@ -1,2 +0,2 @@

# pdfreader ![Node CI](https://github.com/adrienjoly/npm-pdfreader/workflows/Node%20CI/badge.svg) [![Code Quality](https://api.codacy.com/project/badge/Grade/73d37dbb0ff84795acf65a55c5936d83)](https://www.codacy.com/app/adrien-joly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade)
# pdfreader ![Node CI](https://github.com/adrienjoly/npm-pdfreader/workflows/Node%20CI/badge.svg) [![Code Quality](https://api.codacy.com/project/badge/Grade/73d37dbb0ff84795acf65a55c5936d83)](https://app.codacy.com/gh/adrienjoly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade)

@@ -57,4 +57,17 @@ Read text and parse tables from PDF files.

### Raw PDF reading from a PDF already in memory (buffer)
### Parsing a password-protected PDF file
```javascript
// Parses a password-protected PDF file: the password is passed to the PdfReader constructor.
new PdfReader({ password: "YOUR_PASSWORD" }).parseFileItems(
  "test/sample-with-password.pdf",
  (err, item) => {
    if (err) {
      console.error(err);
      return;
    }
    if (!item) {
      // a call without item is reported as the end of the file
      console.warn("end of file");
      return;
    }
    if (item.text) {
      console.log(item.text);
    }
  }
);
```
### Raw PDF reading from a PDF buffer
As above, but reading from a buffer in memory rather than from a file referenced by path. For example:

@@ -76,151 +89,17 @@

### Example: reading from a buffer of an online PDF
### Other examples of use
```javascript
// Requests `url` with https.get() and resolves with the whole response body,
// concatenated into a single Buffer. Errors emitted by the request reject the promise.
const get = (url) =>
  new Promise((resolve, reject) => {
    const request = https.get(url, (response) => {
      const chunks = [];
      response.on("data", (chunk) => {
        chunks.push(chunk);
      });
      response.on("end", () => {
        resolve(Buffer.concat(chunks));
      });
    });
    request.on("error", reject);
  });
/**
 * Adds a text item to the lines collected so far.
 * When a line with the same `y` already exists, the item's text is appended to
 * that line (after a space); otherwise the item itself becomes a new line.
 * @param {Array} textLines lines collected so far (modified in place)
 * @param {Object} item text item, read through its `y` and `text` properties
 */
function addTextToLines(textLines, item) {
  const sameLine = textLines.find((line) => line.y === item.y);
  if (!sameLine) {
    textLines.push(item);
    return;
  }
  sameLine.text = sameLine.text + " " + item.text;
}
// Parses a PDF buffer and resolves with an array of pages, each page being an
// array of lines of text. Rejects with the error reported by the parser, if any.
const parseLinesPerPage = (buffer) =>
  new Promise((resolve, reject) => {
    const pages = [];
    let currentPage = 0;
    const reader = new PdfReader();
    reader.parseBuffer(buffer, (err, item) => {
      if (err) {
        reject(err);
        return;
      }
      if (!item) {
        // no item: parsing is over, keep only the text of each line
        const textPerPage = pages.map((lines) => lines.map((line) => line.text));
        resolve(textPerPage);
        return;
      }
      if (item.page) {
        // page numbers are turned into 0-based indexes
        currentPage = item.page - 1;
        pages[currentPage] = [];
        return;
      }
      if (item.text) {
        addTextToLines(pages[currentPage], item);
      }
    });
  });
// Download the sample PDF, parse it, then print its lines of text, page by page.
const url = new URL(
  "https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/master/test/sample.pdf"
);
// The chain is not stored: it resolves to undefined once the lines are printed.
get(url)
  .then((pdfBuffer) => parseLinesPerPage(pdfBuffer))
  .then((linesPerPage) => console.log(linesPerPage))
  // report download/parsing failures instead of leaving the rejection unhandled
  .catch((err) => console.error(err));
```
### Example: parsing lines of text from a PDF file
![example cv resume parse convert pdf to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseRows.png)
Here is the code required to convert this PDF file into text:
```js
const { PdfReader } = require("pdfreader");
let rows = {}; // text items of the current page, indexed by y-position

// Prints the accumulated rows in ascending y order, then clears them.
function flushRows() {
  const sortedPositions = Object.keys(rows) // => array of y-positions (as strings)
    .sort((a, b) => parseFloat(a) - parseFloat(b)); // compare them as floats
  for (const position of sortedPositions) {
    const cells = rows[position] || [];
    console.log(cells.join(""));
  }
  rows = {}; // start from an empty set of rows for the next page
}

new PdfReader().parseFileItems("test/sample.pdf", (err, item) => {
  if (err) {
    console.error({ err });
    return;
  }
  if (!item) {
    // no item: print what is left, then announce the end
    flushRows();
    console.log("END OF FILE");
    return;
  }
  if (item.page) {
    flushRows(); // print the rows of the previous page
    console.log("PAGE:", item.page);
    return;
  }
  if (item.text) {
    // accumulate text items into the rows object, per line
    if (!rows[item.y]) {
      rows[item.y] = [];
    }
    rows[item.y].push(item.text);
  }
});
```
Fork this example from [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).
### Example: parsing a table from a PDF file
![example cv resume parse convert pdf table to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseTable.png)
Here is the code required to convert this PDF file into a textual table:
Source code of the examples above: [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).
```js
var pdfreader = require("pdfreader");
For more, see [Examples of use](https://github.com/adrienjoly/npm-pdfreader/discussions/categories/examples-of-use).
const nbCols = 2; // number of columns rendered for each row
const cellPadding = 40; // each cell is cut/padded to fit 40 characters

// Returns true when the item's x position is at least 20, false otherwise.
const columnQuantitizer = (item) => parseFloat(item.x) >= 20;

// Returns exactly `nb` columns, using an empty array for every missing one.
// Array.from() visits every index, whereas map() would skip the holes of a sparse array.
const padColumns = (array, nb) =>
  Array.from({ length: nb }, (unused, index) => array[index] || []);

// Joins the texts of a cell, then cuts and pads the result to `cellPadding` characters.
const mergeCells = (cells) => {
  const merged = (cells || []).map((cell) => cell.text).join(""); // merge cells
  return merged.slice(0, cellPadding).padEnd(cellPadding, " "); // padding
};

// Renders the matrix as text: one line per row, columns separated by " | ".
const renderMatrix = (matrix) => {
  const lines = (matrix || []).map((row) =>
    padColumns(row, nbCols)
      .map((cells) => mergeCells(cells))
      .join(" | ")
  );
  return lines.join("\n");
};
// Accumulates the text items of each page into a TableParser, and prints the
// rendered table every time a page ends (new page, or end of file).
var table = new pdfreader.TableParser();
new pdfreader.PdfReader().parseFileItems(filename, function (err, item) {
  if (err) {
    // report parsing errors instead of ignoring them
    console.error(err);
  } else if (!item || item.page) {
    // end of file, or page
    console.log(renderMatrix(table.getMatrix()));
    // at the end of the file there is no item: reading item.page would throw
    if (item) console.log("PAGE:", item.page);
    table = new pdfreader.TableParser(); // new/clear table for next page
  } else if (item.text) {
    // accumulate text items into the table, in the column given by columnQuantitizer
    table.processItem(item, columnQuantitizer(item));
  }
});
```
Fork this example from [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).
## Example: opening a PDF file with a password
```javascript
// Opens a PDF file with a password: the password is passed to the PdfReader constructor.
new PdfReader({ password: "YOUR_PASSWORD" }).parseFileItems(
  "test/sample-with-password.pdf",
  (err, item) => {
    if (err) {
      console.error(err);
      return;
    }
    if (!item) {
      // a call without item is reported as the end of the file
      console.warn("end of file");
      return;
    }
    if (item.text) {
      console.log(item.text);
    }
  }
);
```
## Rule-based data extraction
The Rule class can be used to define and process data extraction rules, while parsing a PDF document.
The `Rule` class can be used to define and process data extraction rules, while parsing a PDF document.
Rule instances expose "accumulators": methods that define the data extraction strategy to be used for each rule.
`Rule` instances expose "accumulators": methods that define the data extraction strategy to be used for each rule.

@@ -227,0 +106,0 @@ Example:

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc