@@ -129,2 +129,3 @@ #!/usr/bin/env node
		const slidesRegex = /ppt\/slides\/slide\d+.xml/g;
		const slideNumberRegex = /lide(\d+)\.xml/;

		@@ -138,2 +139,13 @@ /** The decompress location which contains the filename in it */
		.then(files => {
		// Sort files by slide number and their notes (if any).
		// Paths without a slide number are ordered after every numbered path.
		files.sort((a, b) => {
		// Slide number captured from each path. When the path does not match slideNumberRegex,
		// the optional chain yields undefined and parseInt returns NaN.
		const matchedANumber = parseInt(a.path.match(slideNumberRegex)?.at(1), 10);
		const matchedBNumber = parseInt(b.path.match(slideNumberRegex)?.at(1), 10);

		// Replace NaN with Infinity so that an unnumbered path compares greater than any numbered one.
		const aNumber = isNaN(matchedANumber) ? Infinity : matchedANumber;
		const bNumber = isNaN(matchedBNumber) ? Infinity : matchedBNumber;

		// Primary key: slide number. When the difference is falsy (0 for equal numbers, or NaN when
		// both sides are Infinity) the tie-break runs: a path that does not contain 'notes' is placed
		// before a path that does.
		return aNumber - bNumber \|\| Number(a.path.includes('notes')) - Number(b.path.includes('notes'));
		});

		// Verify that at least the slides xml files exist in the extracted files list.
		@@ -223,6 +235,9 @@ if (files.length == 0 \|\| !files.map(file => file.path).some(filename => filename.match(slidesRegex)))
		// Structure of xmlContent of an excel file is a bit complex.
		// We have a sharedStrings.xml file which has strings inside t tags
		// We usually have a sharedStrings.xml file which has strings inside t tags
		// However, this file is not required to be present. It is sometimes absent if the file has no shared string indices represented in v nodes.
		// Each sheet has an individual sheet xml file which has numbers in v tags (probably value) inside c tags (probably cell)
		// Each value of v tag is to be used as it is if the "t" attribute (probably type) of c tag is not "s" (probably shared string)
		// If the "t" attribute of c tag is "s", then we use the value to select value from sharedStrings array with the value as its index.
		// However, if the "t" attribute of c tag is "inlineStr", strings can be inline inside "is"(probably inside String) > "t".
		// We extract either the inline strings or use the value to get numbers or text from shared strings.
		// Drawing files contain all text for each drawing and have text nodes in a:t and paragraph nodes in a:p.
		@@ -234,4 +249,28 @@ // ******************************************************************************************************

		/** Find text nodes with t tags in sharedStrings xml file */
		const sharedStringsXmlTNodesList = parseString(xmlContentFilesObject.sharedStringsFile).getElementsByTagName("t");
		/**
		 * Check whether the given c node is a valid inline string node.
		 *
		 * A node qualifies only when all of the following hold:
		 *  - its tag name is `c` (compared case-insensitively),
		 *  - its "t" attribute is exactly "inlineStr",
		 *  - it contains exactly one `is` element,
		 *  - that `is` element contains exactly one `t` element,
		 *  - the first child of that `t` element has a non-empty, non-null nodeValue.
		 *
		 * @param {Element} cNode Node to inspect.
		 * @returns {boolean} true when the inline string text can be read from the node, false otherwise.
		 */
		function isValidInlineStringCNode(cNode) {
			// Initial check to see if the passed node is a cNode
			if (cNode.tagName.toLowerCase() !== 'c')
				return false;
			if (cNode.getAttribute("t") !== 'inlineStr')
				return false;
			const childNodesNamedIs = cNode.getElementsByTagName('is');
			if (childNodesNamedIs.length !== 1)
				return false;
			const childNodesNamedT = childNodesNamedIs[0].getElementsByTagName('t');
			if (childNodesNamedT.length !== 1)
				return false;
			const textNode = childNodesNamedT[0].childNodes[0];
			// Always answer with a real boolean. A missing first child, or one whose nodeValue is
			// null (i.e. not a text-carrying node), is treated as "no text" so that null never
			// reaches the extracted output.
			return textNode != null && textNode.nodeValue != null && textNode.nodeValue !== '';
		}

		/**
		 * Check whether the given c node has a valid v node.
		 *
		 * The node is valid when its first `v` element exists and the first child of that element
		 * has a non-empty, non-null nodeValue.
		 *
		 * @param {Element} cNode Node to inspect.
		 * @returns {boolean} true when a value can be read from the first v node, false otherwise.
		 */
		function hasValidVNodeInCNode(cNode) {
			// Look the v element up once instead of repeating the same query for every check.
			const firstVNode = cNode.getElementsByTagName("v")[0];
			if (firstVNode == null)
				return false;
			const valueNode = firstVNode.childNodes[0];
			// Always answer with a real boolean; a null nodeValue counts as "no value".
			return valueNode != null && valueNode.nodeValue != null && valueNode.nodeValue !== '';
		}

		/** Find text nodes with t tags in sharedStrings xml file. If the sharedStringsFile is not present, we return an empty array. */
		const sharedStringsXmlTNodesList = xmlContentFilesObject.sharedStringsFile != undefined ? parseString(xmlContentFilesObject.sharedStringsFile).getElementsByTagName("t")
		: [];
		/** Create shared string array. This will be used as a map to get strings from within sheet files. */
		@@ -245,21 +284,29 @@ const sharedStrings = Array.from(sharedStringsXmlTNodesList)
		const sheetsXmlCNodesList = parseString(sheetXmlContent).getElementsByTagName("c");
		// Traverse through the nodes list and fill responseText with either the number value in its v node or find a mapped string from sharedStrings.
		// Traverse through the nodes list and fill responseText with either the number value in its v node or find a mapped string from sharedStrings or an inline string.
		responseText.push(
		Array.from(sheetsXmlCNodesList)
		// Filter out c nodes that do not have any valid v nodes
		.filter(cNode => cNode.getElementsByTagName("v")[0]
		&& cNode.getElementsByTagName("v")[0].childNodes[0]
		&& cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue)
		// Filter out invalid c nodes
		.filter(cNode => isValidInlineStringCNode(cNode) \|\| hasValidVNodeInCNode(cNode))
		.map(cNode => {
		/** Flag whether this node's value represents a string index */
		const isString = cNode.getAttribute("t") == "s";
		/** Find value nodes represented by v tags */
		const value = cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue;
		// Validate text
		if (isString && value >= sharedStrings.length)
		throw ERRORMSG.fileCorrupted(filepath);
		// Processing if this is a valid inline string c node.
		if (isValidInlineStringCNode(cNode))
		return cNode.getElementsByTagName('is')[0].getElementsByTagName('t')[0].childNodes[0].nodeValue;

		return isString
		? sharedStrings[value]
		: value;
		// Processing if this c node has a valid v node.
		if (hasValidVNodeInCNode(cNode)) {
		/** Flag whether this node's value represents an index in the shared string array */
		const isIndexInSharedStrings = cNode.getAttribute("t") == "s";
		/** Find value nodes represented by v tags */
		const value = cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue;
		// Validate text
		if (isIndexInSharedStrings && value >= sharedStrings.length)
		throw ERRORMSG.fileCorrupted(filepath);

		return isIndexInSharedStrings
		? sharedStrings[value]
		: value;
		}
		// TODO: Add debug asserts for if we reach here which would mean we are filtering more items than we are processing.
		// Not the case now but it could happen and it is better to be safe.
		return '';
		})
		@@ -642,3 +689,3 @@ // Join each cell text within a sheet with a space.
		// Run this library on CLI
		if ((process.argv[0].split('/').pop() == "node" \|\| process.argv[0].split('/').pop() == "npx") && (process.argv[1].split('/').pop() == "officeParser.js" \|\| process.argv[1].split('/').pop() == "officeparser")) {
		if ((process.argv[0].split('/').pop() == "node" \|\| process.argv[0].split('/').pop() == "npx") && (process.argv[1].split('/').pop() == "officeParser.js" \|\| process.argv[1].split('/').pop().toLowerCase() == "officeparser")) {
		if (process.argv.length == 2) {
		@@ -645,0 +692,0 @@ // continue

package.json

		{
		"name": "officeparser",
		"version": "4.1.1",
		"version": "4.1.2",
		"description": "A Node.js library to parse text out of any office file. Currently supports docx, pptx, xlsx, odt, odp, ods, pdf files.",
		@@ -5,0 +5,0 @@ "main": "officeParser.js",

officeparser - npm Package Compare versions

Improved metrics