Comparing version 0.1.10 to 0.1.11
{ | ||
"name": "pdf2json", | ||
"version": "0.1.10", | ||
"version": "0.1.11", | ||
"description": "A PDF file parser that converts PDF binaries to text based JSON, powered by a fork of PDF.JS", | ||
@@ -57,4 +57,4 @@ "keywords": [ | ||
"readme": "", | ||
"_id": "pdf2json@0.1.10", | ||
"_id": "pdf2json@0.1.11", | ||
"_from": "pdf2json" | ||
} |
@@ -186,31 +186,2 @@ var nodeUtil = require("util"), | ||
if (retVal === -1) { | ||
_.each(_kFontStyles, function(element, index, list){ | ||
if (retVal === -1) { | ||
if (element[0] === fsa[0] && | ||
element[2] === fsa[2] && element[3] === fsa[3]) { | ||
if (element[1] >= fsa[1]) { | ||
retVal = index; | ||
} | ||
} | ||
} | ||
}); | ||
} | ||
if (retVal === -1) { | ||
_.each(_kFontStyles, function(element, index, list){ | ||
if (retVal === -1) { | ||
if (element[0] === fsa[0] ) { | ||
if (element[1] >= fsa[1]) { | ||
retVal = index; | ||
} | ||
} | ||
} | ||
}); | ||
} | ||
if (retVal === -1) { | ||
retVal = 2; | ||
} | ||
return retVal; | ||
@@ -253,2 +224,6 @@ }; | ||
// when this.fontStyleId === -1, it means the text style doesn't match any entry in the dictionary | ||
// adding TS to better describe text style [fontFaceId, fontSize, 1/0 for bold, 1/0 for italic]; | ||
var TS = [this.faceIdx, this.fontSize, this.bold?1:0, this.italic?1:0]; | ||
var oneText = {x: PDFUnit.toFormX(p.x) - 0.25, | ||
@@ -261,20 +236,8 @@ y: PDFUnit.toFormY(p.y) - 0.75, | ||
T: this.flash_encode(text), | ||
S: this.fontStyleId | ||
S: this.fontStyleId, | ||
TS: TS | ||
}] | ||
}; | ||
// var lastIdx = targetData.Texts.length - 1; | ||
// if (lastIdx >= 0) { | ||
// if (text[0] != text[0].toUpperCase()) { | ||
// var lastTextBlock = targetData.Texts[lastIdx]; | ||
// if (lastTextBlock.y == oneText.y && lastTextBlock.R[0].S == oneText.R[0].S) { | ||
// nodeUtil._logN.call(this, "Merged text: " + text); | ||
// lastTextBlock.R[0].T += text; //add to last text run when style is the same and in the same line. Test form: VA_IND_760c.pdf | ||
// oneText = null; | ||
// } | ||
// } | ||
// } | ||
// | ||
// if (oneText != null) | ||
targetData.Texts.push(oneText); | ||
targetData.Texts.push(oneText); | ||
@@ -281,0 +244,0 @@ // nodeUtil._logN.call(this, text + ":" + this.fontStyleId + ":" + this.typeName); |
@@ -301,23 +301,5 @@ /* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ | ||
if (fnName !== 'dependency') { | ||
//MQZ.Jan.22. trying to merge kerning text | ||
var goOn = true; | ||
if (fnName == 'showSpacedText') { | ||
if (this.lastSTParam.length > 0) { | ||
this.lastSTParam[0] = this.lastSTParam[0].concat(argsArray[i][0].slice(0)); | ||
console.log("merged:" + JSON.stringify(this.lastSTParam[0])); | ||
this[fnName].apply(this, this.lastSTParam); | ||
this.lastSTParam = []; | ||
goOn = false; | ||
} | ||
else if ( i + 3 < fnArray.length) { | ||
if (fnArray[i+1] === 'setCharSpacing' && fnArray[i+2] === 'setWordSpacing' && | ||
fnArray[i+3] === 'showSpacedText') { | ||
this.lastSTParam = argsArray[i].slice(0); //clone it | ||
goOn = false; | ||
} | ||
} | ||
} | ||
if (goOn) { | ||
this[fnName].apply(this, argsArray[i]); | ||
} | ||
//MQZ Feb.19.2013. Trying to fix text positioning | ||
// console.log(fnName + JSON.stringify(argsArray[i])); | ||
this[fnName].apply(this, argsArray[i]); | ||
} else { | ||
@@ -756,2 +738,9 @@ var deps = argsArray[i]; | ||
ctx.save(); | ||
//MQZ. Feb.20.2013. Adjust text positioning based on wordSpacing | ||
if (str.indexOf(' ') === 0) { | ||
var ns = str.replace(/^\s+/g, ''); | ||
current.x += (str.length - ns.length) * this.current.wordSpacing * fontSize * textHScale; | ||
} | ||
this.applyTextTransforms(); | ||
@@ -840,14 +829,15 @@ | ||
//MQZ. 10/23/2012 Enable string rendering | ||
var curFontSize = fontSize * scale * textHScale + 3; | ||
switch (textRenderingMode) { | ||
case TextRenderingMode.FILL: | ||
case TextRenderingMode.FILL_ADD_TO_PATH: | ||
ctx.fillText(str, 0, 0, canvasWidth, fontSize * scale); | ||
ctx.fillText(str, 0, 0, canvasWidth, curFontSize); | ||
break; | ||
case TextRenderingMode.STROKE: | ||
case TextRenderingMode.STROKE_ADD_TO_PATH: | ||
ctx.strokeText(str, 0, 0, canvasWidth, fontSize * scale); | ||
ctx.strokeText(str, 0, 0, canvasWidth, curFontSize); | ||
break; | ||
case TextRenderingMode.FILL_STROKE: | ||
case TextRenderingMode.FILL_STROKE_ADD_TO_PATH: | ||
ctx.fillText(str, 0, 0, canvasWidth, fontSize * scale); | ||
ctx.fillText(str, 0, 0, canvasWidth, curFontSize); | ||
// ctx.strokeText(str, 0, 0, canvasWidth); | ||
@@ -899,6 +889,7 @@ break; | ||
// console.log("In spacedText:" + JSON.stringify(arr) + ";this.current = " + this.current.x + "," + this.current.y + ";"); | ||
var charWidth = {min:0.220 * fontSize * textHScale, max: 0.35 * fontSize * textHScale}; | ||
//MQZ. Dec.28. Adjust text positioning | ||
// console.log("In spacedText:" + JSON.stringify(arr) + ";this.current = " + this.current.x + "," + this.current.y + ";"); | ||
var sText = ""; | ||
var spacingLength = 0; | ||
var spaceWidthKerning = font.spaceWidth * 0.001 * fontSize * textHScale; | ||
for (var i = 0; i < arrLength; ++i) { | ||
@@ -910,19 +901,16 @@ var e = arr[i]; | ||
if (spacingLength > 0) { | ||
var spTextWidth = 0; | ||
if (spacingLength >= charWidth.max) { | ||
spTextWidth = this.showText(sText, true); | ||
if (!sText) { | ||
current.x += spacingLength; | ||
} | ||
else if (spacingLength >= spaceWidthKerning) { | ||
var spTextWidth = this.showText(sText, true); | ||
sText = ""; | ||
current.x += spacingLength; | ||
if (textSelection) | ||
canvasWidth += spacingLength + spTextWidth; | ||
} | ||
else if (spacingLength >= charWidth.min) { | ||
sText += " ";//converting -220 to -350 kerning to be a space character | ||
} | ||
else { | ||
// console.log("ignored kerning of " + e + " in #" + i + " of " + JSON.stringify(arr)); | ||
} | ||
} | ||
else if (i > 0) | ||
current.x -= spacingLength; | ||
else if (!sText) | ||
current.x += spacingLength; | ||
@@ -934,3 +922,11 @@ } else if (isString(e)) { | ||
// canvasWidth += shownCanvasWidth; | ||
sText += e; | ||
if (!sText && e.indexOf(' ') === 0) { | ||
var ns = e.replace(/^\s+/g, ''); | ||
current.x += (e.length - ns.length) * this.current.wordSpacing * fontSize * textHScale; | ||
if (!!ns) | ||
sText += ns; | ||
} | ||
else | ||
sText += e; | ||
} else { | ||
@@ -937,0 +933,0 @@ error('TJ array element ' + e + ' is not string or num'); |
Introduction | ||
==== | ||
A server side PDF parser Node.js module that converts PDF binaries to JavaScript objects, which can be easily serialized to | ||
PDF2JSON module is ported from client side PDF.JS to Node.JS, it also extends PDF.JS library with interactive form elements and text content parsing. | ||
JSON when running in node.js based web service or web app. | ||
The goal is to enable server side PDF parsing with interactive form elements when wrapped in a web service; it also enables parsing PDF to a local JSON file when used in a command line tool. | ||
@@ -409,2 +409,36 @@ Install: | ||
Text Style data without Style Dictionary | ||
===== | ||
v0.1.11 added text style information in addition to the style dictionary. As we discussed earlier, the idea of the style dictionary is to keep the parsing result payload compact, but I found that the limited dictionary entries for font (face, size) and style (bold, italic) cannot cover the majority of text content in PDFs. Because some styles are matched with the closest dictionary entry, the client rendering will have misaligned, gapped or overlapped text. To solve this problem, pdf2json v0.1.11 extends the dictionary approach: all previous dictionary entries stay the same, but the parsing result will not try to match the closest style entry; instead, the exact text style will always be returned in a TS field. | ||
When the actual text style doesn't match any pre-defined style dictionary entry, the text style ID (S field) will be set to -1. The actual text style will be set in a new field (TS) with or without a matched style dictionary entry ID. This means that if your client renderer works with pdf2json v0.1.11 and later, the style dictionary ID can be ignored. Otherwise, a previous client renderer can still work with the style dictionary ID. | ||
The new TS field is an Array with the following format: | ||
* First element in TS Array is Font Face ID (integer) | ||
* Second element is Font Size (px) | ||
* Third is 1 when font weight is bold, otherwise 0 | ||
* Fourth is 1 when font style is italic, otherwise 0 | ||
For example, the following is a text block data in the parsing result: | ||
{ | ||
x: 7.11, | ||
y: 2.47, | ||
w: 1.6, | ||
clr: 0, | ||
A: "left", | ||
R: [ | ||
{ | ||
T: "Modesty%20PDF%20Parser%20NodeJS", | ||
S: -1, | ||
TS: [0, 15, 1, 0] | ||
} | ||
] | ||
}, | ||
The text is "Modesty PDF Parser NodeJS", text style dictionary entry ID is -1 (S field, meaning no match), and its Font Face ID is 0 (TS[0], "QuickType,Arial,Helvetica,sans-serif"), Font Size is 15px (TS[1]), Font weight is bold (TS[2]) and font style is normal (TS[3]). | ||
Notes | ||
@@ -411,0 +445,0 @@ ===== |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
29281484
502
26529