Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign inDemoInstall
Socket

pdfreader

Package Overview
Dependencies
Maintainers
1
Versions
56
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

pdfreader - npm Package Compare versions

Comparing version 0.1.5 to 0.2.0

2

package.json
{
"name": "pdfreader",
"version": "0.1.5",
"version": "0.2.0",
"description": "Utility for simplifying the development of scripted / rule-based parsing of PDF files, including tabular data (tables, with automatic column detection).",

@@ -5,0 +5,0 @@ "main": "index.js",

@@ -38,4 +38,80 @@ # pdfreader

Another example of use: [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example)
## Example: parsing lines of text from a PDF file
![example cv resume parse convert pdf to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseRows.png)
Here is the code required to convert this PDF file into text:
```js
var pdfreader = require('pdfreader');
// Text items of the current page, grouped by line: maps each y-position to
// the array of text fragments found at that position.
let rows = {};

/**
 * Prints the accumulated rows to the console: one line of output per
 * y-position, ordered from the smallest y-position to the largest.
 */
function printRows() {
  const yPositions = Object.keys(rows); // object keys are strings, e.g. "12.5"
  // Compare as floats: a plain string sort would put "100" before "2".
  yPositions.sort((a, b) => parseFloat(a) - parseFloat(b));
  for (const y of yPositions) {
    const fragments = rows[y] || [];
    console.log(fragments.join(''));
  }
}
// Parse the PDF file; the callback is invoked once per item found in the file,
// and with a falsy item at the end of the file.
new pdfreader.PdfReader().parseFileItems('CV_ErhanYasar.pdf', function(err, item){
  if (err) {
    // presumably set when the file could not be read or parsed -- report and stop
    console.error('error:', err);
    return;
  }
  if (!item) {
    // end of file: print the rows of the last page.
    // (There is no item here, so item.page must not be read.)
    printRows();
    return;
  }
  if (item.page) {
    // new page: print the rows accumulated so far, then start over
    printRows();
    console.log('PAGE:', item.page);
    rows = {}; // clear rows for next page
  } else if (item.text) {
    // accumulate text items into rows object, per line
    (rows[item.y] = rows[item.y] || []).push(item.text);
  }
});
```
Fork this example from [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).
## Example: parsing a table from a PDF file
![example cv resume parse convert pdf table to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseTable.png)
Here is the code required to convert this PDF file into a textual table:
```js
var pdfreader = require('pdfreader');
const nbCols = 2; // number of columns rendered per row
const cellPadding = 40; // each cell is padded to fit 40 characters

// Classifies an item by its x-position: false when x < 20, true otherwise.
// The result is passed as the column argument of TableParser.processItem().
const columnQuantitizer = (item) => parseFloat(item.x) >= 20;

// Returns an array of exactly `nb` columns taken from `array`, where every
// missing (or falsy) column is replaced by an empty list of cells.
// Array.from() is used because map() alone would skip the holes of a sparse row.
const padColumns = (array, nb) =>
  Array.from({ length: nb }, (val, i) => array[i] || []);

// Concatenates the text of all the cells of a column, then truncates / pads
// the result so that it is exactly `cellPadding` characters long.
const mergeCells = (cells) => (cells || [])
  .map((cell) => cell.text).join('') // merge cells
  .slice(0, cellPadding).padEnd(cellPadding, ' '); // padding

// Renders a matrix (array of rows, each row being an array of columns, each
// column being an array of cells with a `text` property) as text: one line per
// row, with columns separated by ' | '. A falsy matrix renders as ''.
const renderMatrix = (matrix) => (matrix || [])
  .map((row) => padColumns(row, nbCols)
    .map(mergeCells)
    .join(' | ')
  ).join('\n');
// Table holding the text items of the current page.
let table = new pdfreader.TableParser();

// NOTE: `filename` is not defined in this snippet -- set it to the path of the
// PDF file to parse before running the example.
new pdfreader.PdfReader().parseFileItems(filename, function(err, item){
  if (err) {
    // presumably set when the file could not be read or parsed -- report and stop
    console.error('error:', err);
    return;
  }
  if (!item || item.page) {
    // end of file, or page: print the table accumulated so far
    console.log(renderMatrix(table.getMatrix()));
    if (item) {
      // only a page item carries a page number; at end of file there is no
      // item, so item.page must not be read
      console.log('PAGE:', item.page);
      table = new pdfreader.TableParser(); // new/clear table for next page
    }
  } else if (item.text) {
    // add the text item to the table, in the column given by columnQuantitizer
    table.processItem(item, columnQuantitizer(item));
  }
});
```
Fork this example from [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).
## Rule-based data extraction

@@ -42,0 +118,0 @@

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc