You're Invited:Meet the Socket Team at BlackHat and DEF CON in Las Vegas, Aug 4-6.RSVP
Socket
Book a DemoInstallSign in
Socket

officeparser

Package Overview
Dependencies
Maintainers
1
Versions
45
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

officeparser - npm Package Compare versions

Comparing version

to
4.1.2

85

officeParser.js

@@ -129,2 +129,3 @@ #!/usr/bin/env node

const slidesRegex = /ppt\/slides\/slide\d+.xml/g;
const slideNumberRegex = /lide(\d+)\.xml/;

@@ -138,2 +139,13 @@ /** The decompress location which contains the filename in it */

.then(files => {
// Order the extracted files by slide number; for equal slide numbers, paths containing 'notes' come after those that do not.
files.sort((a, b) => {
    /** Slide number captured from a path by slideNumberRegex, or Infinity when nothing numeric is captured. */
    const slideNumberOf = ({ path }) => {
        const parsed = parseInt(path.match(slideNumberRegex)?.at(1), 10);
        return isNaN(parsed) ? Infinity : parsed;
    };
    const bySlideNumber = slideNumberOf(a) - slideNumberOf(b);
    // Equal numbers give 0 and Infinity - Infinity gives NaN; both are falsy, so the notes flag breaks the tie.
    if (bySlideNumber)
        return bySlideNumber;
    return Number(a.path.includes('notes')) - Number(b.path.includes('notes'));
});
// Verify that at least the slides xml files exist in the extracted files list.

@@ -223,6 +235,9 @@ if (files.length == 0 || !files.map(file => file.path).some(filename => filename.match(slidesRegex)))

// Structure of xmlContent of an excel file is a bit complex.
// We have a sharedStrings.xml file which has strings inside t tags
// We usually have a sharedStrings.xml file which has strings inside t tags
// However, this file is not required to be present. It is sometimes absent if the file has no shared string indices represented in v nodes.
// Each sheet has an individual sheet xml file which has numbers in v tags (probably value) inside c tags (probably cell)
// Each value of v tag is to be used as it is if the "t" attribute (probably type) of c tag is not "s" (probably shared string)
// If the "t" attribute of c tag is "s", then we use the value to select value from sharedStrings array with the value as its index.
// However, if the "t" attribute of c tag is "inlineStr", strings can be inline inside "is"(probably inside String) > "t".
// We extract either the inline string, or use the v value as it is, or use the v value as an index to get text from shared strings.
// Drawing files contain all text for each drawing and have text nodes in a:t and paragraph nodes in a:p.

@@ -234,4 +249,28 @@ // ******************************************************************************************************

/** Find text nodes with t tags in sharedStrings xml file */
const sharedStringsXmlTNodesList = parseString(xmlContentFilesObject.sharedStringsFile).getElementsByTagName("t");
/**
 * Check whether the given c node is a valid inline string node.
 *
 * The node qualifies only when its tag name is "c" (case-insensitive), its "t" attribute
 * is "inlineStr", it contains exactly one "is" element, that element contains exactly one
 * "t" element, and the first child node of that "t" element has a non-empty nodeValue.
 *
 * @param {Element} cNode Node to inspect; must expose tagName, getAttribute and getElementsByTagName.
 * @returns {boolean} true when an inline string can be read from the node, false otherwise.
 */
function isValidInlineStringCNode(cNode) {
    // Initial check to see if the passed node is a c node flagged as an inline string.
    if (cNode.tagName.toLowerCase() !== 'c' || cNode.getAttribute("t") !== 'inlineStr')
        return false;
    const childNodesNamedIs = cNode.getElementsByTagName('is');
    if (childNodesNamedIs.length !== 1)
        return false;
    const childNodesNamedT = childNodesNamedIs[0].getElementsByTagName('t');
    if (childNodesNamedT.length !== 1)
        return false;
    // A "t" element without any child node yields undefined here; coerce so the predicate always returns a real boolean.
    const textNode = childNodesNamedT[0].childNodes[0];
    return Boolean(textNode) && textNode.nodeValue !== '';
}
/**
 * Check whether the given c node has a valid v node.
 *
 * A v node is valid when it exists, has a first child node, and that child's nodeValue
 * is not the empty string.
 *
 * @param {Element} cNode Node to inspect; must expose getElementsByTagName.
 * @returns {boolean} true when a non-empty value can be read from the first v node, false otherwise.
 */
function hasValidVNodeInCNode(cNode) {
    // Query the subtree once; optional chaining covers a c node with no v node at all.
    const valueTextNode = cNode.getElementsByTagName("v")[0]?.childNodes[0];
    return Boolean(valueTextNode) && valueTextNode.nodeValue !== '';
}
/** Text nodes with t tags from the sharedStrings xml file. When the sharedStringsFile is not present, an empty array is used instead. */
const sharedStringsXmlTNodesList = xmlContentFilesObject.sharedStringsFile == undefined
    ? []
    : parseString(xmlContentFilesObject.sharedStringsFile).getElementsByTagName("t");
/** Create shared string array. This will be used as a map to get strings from within sheet files. */

@@ -245,21 +284,29 @@ const sharedStrings = Array.from(sharedStringsXmlTNodesList)

const sheetsXmlCNodesList = parseString(sheetXmlContent).getElementsByTagName("c");
// Traverse through the nodes list and fill responseText with either the number value in its v node or find a mapped string from sharedStrings.
// Traverse through the nodes list and fill responseText with either the number value in its v node or find a mapped string from sharedStrings or an inline string.
responseText.push(
Array.from(sheetsXmlCNodesList)
// Filter c nodes than do not have any valid v nodes
.filter(cNode => cNode.getElementsByTagName("v")[0]
&& cNode.getElementsByTagName("v")[0].childNodes[0]
&& cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue)
// Filter out invalid c nodes
.filter(cNode => isValidInlineStringCNode(cNode) || hasValidVNodeInCNode(cNode))
.map(cNode => {
/** Flag whether this node's value represents a string index */
const isString = cNode.getAttribute("t") == "s";
/** Find value nodes represented by v tags */
const value = cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue;
// Validate text
if (isString && value >= sharedStrings.length)
throw ERRORMSG.fileCorrupted(filepath);
// Processing if this is a valid inline string c node.
if (isValidInlineStringCNode(cNode))
return cNode.getElementsByTagName('is')[0].getElementsByTagName('t')[0].childNodes[0].nodeValue;
return isString
? sharedStrings[value]
: value;
// Processing if this c node has a valid v node.
if (hasValidVNodeInCNode(cNode)) {
/** Flag whether this node's value represents an index in the shared string array */
const isIndexInSharedStrings = cNode.getAttribute("t") == "s";
/** Find value nodes represented by v tags */
const value = cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue;
// Validate text
if (isIndexInSharedStrings && value >= sharedStrings.length)
throw ERRORMSG.fileCorrupted(filepath);
return isIndexInSharedStrings
? sharedStrings[value]
: value;
}
// TODO: Add debug asserts for if we reach here which would mean we are filtering more items than we are processing.
// Not the case now but it could happen and it is better to be safe.
return '';
})

@@ -642,3 +689,3 @@ // Join each cell text within a sheet with a space.

// Run this library on CLI
if ((process.argv[0].split('/').pop() == "node" || process.argv[0].split('/').pop() == "npx") && (process.argv[1].split('/').pop() == "officeParser.js" || process.argv[1].split('/').pop() == "officeparser")) {
if ((process.argv[0].split('/').pop() == "node" || process.argv[0].split('/').pop() == "npx") && (process.argv[1].split('/').pop() == "officeParser.js" || process.argv[1].split('/').pop().toLowerCase() == "officeparser")) {
if (process.argv.length == 2) {

@@ -645,0 +692,0 @@ // continue

{
"name": "officeparser",
"version": "4.1.1",
"version": "4.1.2",
"description": "A Node.js library to parse text out of any office file. Currently supports docx, pptx, xlsx, odt, odp, ods, pdf files.",

@@ -5,0 +5,0 @@ "main": "officeParser.js",

SocketSocket SOC 2 Logo

Product

About

Packages

Stay in touch

Get open source security insights delivered straight into your inbox.

  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc

U.S. Patent No. 12,346,443 & 12,314,394. Other pending.