Comparing version 0.1.5 to 0.1.6
{ | ||
"name": "pdf2json", | ||
"version": "0.1.5", | ||
"version": "0.1.6", | ||
"description": "A PDF file parser that converts PDF binaries to text based JSON, powered by a fork of PDF.JS", | ||
@@ -5,0 +5,0 @@ "keywords": [ |
17
pdf.js
@@ -197,6 +197,2 @@ | ||
// Diagnostic helper: logs the runtime type of PDFJS.getDocument via nodeUtil._logN.
// Takes no arguments and returns nothing.
cls.prototype.checkType = function() { | ||
nodeUtil._logN.call(this, "typeof(PDFJS.getDocument) == " + typeof(PDFJS.getDocument)); | ||
}; | ||
cls.prototype.parsePDFData = function(arrayBuffer) { | ||
@@ -209,3 +205,2 @@ var parameters = {password: '', data: arrayBuffer}; | ||
function getDocumentCallback(pdfDocument) { | ||
nodeUtil._logN.call(self, "getDocumentCallback(" + typeof pdfDocument + ")"); | ||
self.load(pdfDocument, 1); | ||
@@ -236,3 +231,3 @@ }, | ||
nodeUtil._logN.call(self, "load: pagesCount = " + pagesCount); | ||
nodeUtil._logN.call(self, "PDF loaded. pagesCount = " + pagesCount); | ||
@@ -260,3 +255,3 @@ pagesPromise.then(function(promisedPages) { | ||
cls.prototype.parsePage = function(promisedPages, id, scale) { | ||
nodeUtil._logN.call(this, "parsePage:" + id); | ||
nodeUtil._logN.call(this, "start to parse page:" + (id+1)); | ||
var self = this; | ||
@@ -283,2 +278,3 @@ var pdfPage = promisedPages[id]; | ||
if (id == self.pdfDocument.numPages - 1) { | ||
nodeUtil._logN.call(self, "complete parsing page:" + (id+1)); | ||
self.emit("pdfjs_parseDataReady", {Pages:self.pages, Width: self.pageWidth}); | ||
@@ -294,2 +290,9 @@ } | ||
// Tears down this instance: removes every listener registered on it and
// clears the pdfDocument and formImage references by setting them to null.
cls.prototype.destroy = function() { | ||
this.removeAllListeners(); | ||
this.pdfDocument = null; | ||
this.formImage = null; | ||
}; | ||
return cls; | ||
@@ -296,0 +299,0 @@ })(); |
@@ -187,3 +187,3 @@ 'use strict'; | ||
this.currentFont.processText(p, str, maxWidth, color, fontSize, this.canvas); | ||
this.currentFont.processText(p, text, maxWidth, color, fontSize, this.canvas); | ||
}; | ||
@@ -190,0 +190,0 @@ |
@@ -99,2 +99,12 @@ var nodeUtil = require("util"), | ||
this.fontObj = fontObj; | ||
var typeName = (fontObj.name || fontObj.fallbackName).toLowerCase(); | ||
if (this.fontObj.isSymbolicFont) { | ||
if (typeName.indexOf("arial") > 0) | ||
this.fontObj.isSymbolicFont = false; //lots of Arial-based font is detected as symbol in VA forms (301, 76-c, etc.) reset the flag for now | ||
} | ||
else { | ||
if (typeName.indexOf("symbol") > 0) | ||
this.fontObj.isSymbolicFont = true; //text pdf: va_ind_760c | ||
} | ||
this.fontSize = 1; | ||
@@ -105,4 +115,2 @@ | ||
this.italic = false; | ||
this.faceName = null; | ||
this.faceSubName = null; | ||
@@ -122,2 +130,6 @@ this.fontStyleId = -1; | ||
var typeName = fontObj.name || fontObj.fallbackName; | ||
if (!this.bold) { | ||
this.bold = typeName.toLowerCase().indexOf("bold") >= 0; | ||
} | ||
var nameArray = typeName.split('+'); | ||
@@ -130,3 +142,2 @@ if (_.isArray(nameArray) && nameArray.length > 1) { | ||
this.bold = _boldSubNames.indexOf(subName) >= 0; | ||
this.faceSubName = subName; | ||
} | ||
@@ -136,3 +147,2 @@ typeName = typeName[0]; | ||
} | ||
this.faceName = typeName; | ||
@@ -188,2 +198,14 @@ if (fontObj.isSerifFont) { | ||
if (retVal === -1) { | ||
_.each(_kFontStyles, function(element, index, list){ | ||
if (retVal === -1) { | ||
if (element[0] === fsa[0] ) { | ||
if (element[1] >= fsa[1]) { | ||
retVal = index; | ||
} | ||
} | ||
} | ||
}); | ||
} | ||
if (retVal === -1) { | ||
retVal = 2; | ||
@@ -196,16 +218,20 @@ } | ||
var _processSymbolicFont = function(str) { | ||
var retVal = str; | ||
if (!str || str.length !== 1) | ||
return retVal; | ||
if (!this.fontObj.isSymbolicFont) | ||
return str; | ||
return retVal; | ||
if (!str || str.length !== 2) | ||
return str; | ||
var retVal = "G"; | ||
switch(str.charCodeAt(1)) { | ||
case 99: retVal = 'C'; break; //up triangle | ||
case 97: retVal = 'G'; break; //right triangle | ||
case 20: retVal = 'M'; break; //check mark | ||
switch(str.charCodeAt(0)) { | ||
case 99: retVal = '\u25b2'; break; //up triangle | ||
case 97: retVal = '\u25b6'; break; //right triangle | ||
case 20: retVal = '\u2713'; break; //check mark | ||
case 70: retVal = '\u007D'; break; //right curly bracket | ||
case 118: retVal = '\u2022'; break; //Bullet dot | ||
case 106: retVal = ''; break; //VA 301: string j character by the checkbox, hide it for now | ||
default: | ||
retVal = ""; | ||
nodeUtil._logN.call(this, "Default - SymbolicFont - (" + this.fontObj.name + ") : " + str.charCodeAt(1) + " => " + retVal); | ||
nodeUtil._logN.call(this, "Default - SymbolicFont - (" + this.fontObj.name + ") : " + | ||
str.charCodeAt(0) + "::" + str.charCodeAt(1) + " => " + retVal + " length = " + str.length); | ||
} | ||
@@ -223,2 +249,6 @@ | ||
if (text == "C" || text == "G") { //prevent symbolic encoding from the client | ||
text = " " + text + " "; | ||
} | ||
var oneText = {x: PDFUnit.toFormX(p.x) - 0.25, | ||
@@ -235,3 +265,16 @@ y: PDFUnit.toFormY(p.y) - 0.75, | ||
targetData.Texts.push(oneText); | ||
// var lastIdx = targetData.Texts.length - 1; | ||
// if (lastIdx >= 0) { | ||
// if (text[0] != text[0].toUpperCase()) { | ||
// var lastTextBlock = targetData.Texts[lastIdx]; | ||
// if (lastTextBlock.y == oneText.y && lastTextBlock.R[0].S == oneText.R[0].S) { | ||
// nodeUtil._logN.call(this, "Merged text: " + text); | ||
// lastTextBlock.R[0].T += text; //add to last text run when style is the same and in the same line. Test form: VA_IND_760c.pdf | ||
// oneText = null; | ||
// } | ||
// } | ||
// } | ||
// | ||
// if (oneText != null) | ||
targetData.Texts.push(oneText); | ||
}; | ||
@@ -238,0 +281,0 @@ |
@@ -752,11 +752,13 @@ /* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ | ||
var x = 0; | ||
//MQZ. 11/16/2012: added helper function | ||
str = ""; | ||
var alphaExp = /^[0-9a-zA-Z| !@#$%^&*()]+$/; | ||
var isAlphanumeric = function(char) { | ||
return char.match(alphaExp); | ||
var useGlyph = (str[0] != glyphs[0].fontChar); | ||
if (useGlyph) | ||
str = ""; | ||
var alphaExp = /[^ -~]/; // /^[0-9a-zA-Z| !@#$%^&*():\/\\_,.]+$/; // | ||
var isAlphanumeric = function (char) { | ||
return (!alphaExp.test(char));// char.match(alphaExp); // | ||
}; | ||
for (var i = 0; i < glyphsLength; ++i) { | ||
var x = 0; | ||
for (var i = 0; i < glyphsLength; ++i) { | ||
var glyph = glyphs[i]; | ||
@@ -768,11 +770,7 @@ if (glyph === null) { | ||
} | ||
//MQZ. Disable character-based text rendering | ||
// var character = glyph.fontChar; | ||
var character = glyph.fontChar; | ||
var charWidth = glyph.width * fontSize * 0.001 + | ||
Util.sign(current.fontMatrix[0]) * charSpacing; | ||
if (!glyph.disabled) { | ||
//MQZ. Disable character-based text rendering | ||
if (isAlphanumeric(glyph.fontChar)) | ||
str += glyph.fontChar; | ||
// if (!glyph.disabled) { | ||
// var scaledX = x / fontSizeScale; | ||
@@ -797,11 +795,21 @@ // switch (textRenderingMode) { | ||
// } | ||
// } | ||
if (useGlyph) { | ||
if (isAlphanumeric(character) && (textRenderingMode != TextRenderingMode.INVISIBLE)) | ||
str += character; | ||
else if (glyph.unicode) { | ||
str += glyph.unicode; | ||
} | ||
else { | ||
str += " "; | ||
// console.log("non-printable:\\u" + character.charCodeAt(0).toString(16) + "; unicode = " + glyph.unicode); | ||
} | ||
} | ||
} | ||
x += charWidth; | ||
var glyphUnicode = glyph.unicode === ' ' ? '\u00A0' : glyph.unicode; | ||
if (glyphUnicode in NormalizedUnicodes) | ||
glyphUnicode = NormalizedUnicodes[glyphUnicode]; | ||
canvasWidth += charWidth; | ||
//MQZ. Nov.27.2012: comment out uesless unicode conversion | ||
// var glyphUnicode = glyph.unicode === ' ' ? '\u00A0' : glyph.unicode; | ||
// if (glyphUnicode in NormalizedUnicodes) | ||
// glyphUnicode = NormalizedUnicodes[glyphUnicode]; | ||
canvasWidth += charWidth; | ||
} | ||
@@ -823,3 +831,3 @@ current.x += x * textHScale2; | ||
ctx.fillText(str, 0, 0, canvasWidth, fontSize * scale); | ||
// ctx.strokeText(str, 0, 0, canvasWidth); | ||
// ctx.strokeText(str, 0, 0, canvasWidth); | ||
break; | ||
@@ -870,15 +878,18 @@ case TextRenderingMode.INVISIBLE: | ||
var sText = ""; | ||
for (var i = 0; i < arrLength; ++i) { | ||
var e = arr[i]; | ||
if (isNum(e)) { | ||
var spacingLength = -e * 0.001 * fontSize * textHScale; | ||
current.x += spacingLength; | ||
if (textSelection) | ||
canvasWidth += spacingLength; | ||
//MQZ. Nov.28.2012 Disable character based rendering, make it a string | ||
// var spacingLength = -e * 0.001 * fontSize * textHScale; | ||
// current.x += spacingLength; | ||
// | ||
// if (textSelection) | ||
// canvasWidth += spacingLength; | ||
} else if (isString(e)) { | ||
var shownCanvasWidth = this.showText(e, true); | ||
if (textSelection) | ||
canvasWidth += shownCanvasWidth; | ||
// var shownCanvasWidth = this.showText(e, true); | ||
// | ||
// if (textSelection) | ||
// canvasWidth += shownCanvasWidth; | ||
sText += e; | ||
} else { | ||
@@ -889,2 +900,7 @@ error('TJ array element ' + e + ' is not string or num'); | ||
//MQZ. Nov.28.2012 Disable character based rendering, make it a string | ||
var shownCanvasWidth = this.showText(sText, true); | ||
if (textSelection) | ||
canvasWidth += shownCanvasWidth; | ||
if (textSelection) { | ||
@@ -891,0 +907,0 @@ geom.canvasWidth = canvasWidth; |
@@ -109,4 +109,4 @@ var nodeUtil = require("util"), | ||
if (idx < 0) { | ||
nodeUtil.log("Reaplcing new color (" + color + ") with color (4) = " + kColors[4]); | ||
idx = 4; | ||
idx = 7; | ||
nodeUtil.log("Reaplcing new color (" + color + ") with color (" + idx + ") = " + kColors[idx]); | ||
} | ||
@@ -113,0 +113,0 @@ return idx; |
@@ -17,7 +17,7 @@ var nodeUtil = require("util"), | ||
var PFParser = (function () { | ||
var PDFParser = (function () { | ||
'use strict'; | ||
// private static | ||
var _nextId = 1; | ||
var _name = 'PFParser'; | ||
var _name = 'PDFParser'; | ||
@@ -124,3 +124,6 @@ var _binBuffer = {}; | ||
this.data = null; | ||
this.PDFJS.destroy(); | ||
this.PDFJS = null; | ||
this.parsePropCount = 0; | ||
@@ -132,3 +135,3 @@ }; | ||
module.exports = PFParser; | ||
module.exports = PDFParser; | ||
@@ -207,10 +207,3 @@ Introduction | ||
Run Unit Test | ||
===== | ||
The test suite for PDF2JSON is built with Vows.js; it parses 3 PDF files under the 'test/data' directory in parallel and has 12 test cases that must pass. | ||
node test/index.js | ||
Interactive Forms Elements | ||
@@ -395,8 +388,32 @@ ===== | ||
Known Issues | ||
=== | ||
This pdf2json module's output does not map 100% of the PDF definitions: some gaps are due to the limited time I currently have, while others result from the 'dictionary' concept used for the output. Even with these known issues and unsupported features in the current implementation, releasing it lets me contribute back to the open source community with the most important features implemented, while leaving room for improvement in the future. All unsupported features listed below can be resolved technically one way or another, if your use case really requires them: | ||
* Embedded content: | ||
* All embedded content is ignored; the current implementation focuses on static content and interactive forms. Unsupported PDF embedded content includes 'Images', 'Fonts' and other dynamic content; | ||
* Text and Form Styles: | ||
* Text and form element styles have only partial support. This means that when you have a client side renderer (say an HTML5 canvas or SVG renderer), the PDF content may not look exactly the same as how Acrobat renders it. The reason is that we've used a "style dictionary" in order to reduce the payload size over the wire, while the "style dictionary" doesn't have all styles defined. This sort of partial support can be resolved by extending those 'style dictionaries'. Primary text style issues include: | ||
* Font face: limited to the font families defined in the style dictionary | ||
* Font size: limited to 6, 8, 10, 12, 14 and 18, the sizes defined in the style dictionary; fonts of any other size are mapped to the closest size. For example, when a PDF defines a 7px font, the size will be mapped to 8px in the output; | ||
* Color: both font colors and fill colors are limited to the entries in the color dictionary | ||
* Style combinations: when a combination of styles (size, face, bold, italic) is not supported, the closest entry will be selected in the output; | ||
* Text positioning and spacing: | ||
* Since embedded fonts and font styles are only honored if they are defined in the style dictionary, when they are not, the final output may have noticeable word positioning and spacing issues. | ||
* User input data in form element: | ||
* For interactive form elements, their types, positions, sizes, limited styles and control data are all parsed and served in the output, but user-entered data is not parsed: which radio button is selected, which checkbox is checked, the text in a text input box, etc. should be handled in the client as part of user data, so that the parsed PDF data can be treated as template data. | ||
Run Unit Test | ||
===== | ||
The test suite for PDF2JSON is built with Vows.js; it parses 3 PDF files under the 'test/data' directory in parallel and has 12 test cases that must pass. | ||
node test/index.js | ||
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
29272952
26450
418