Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign inDemoInstall
Socket

tm-content-parser

Package Overview
Dependencies
Maintainers
10
Versions
21
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

tm-content-parser - npm Package Compare versions

Comparing version 1.0.15 to 1.0.18

53

index.js

@@ -203,2 +203,3 @@ "use strict";

replaceEntityTags,
removeNewlinesFromLists,
replaceNewLineCharacters,

@@ -211,13 +212,51 @@ formatHtml

// Build a 2-D array shaped like the table: one inner array per <tr>, one
// number per <td> found inside it. Each number is the length of the cheerio
// `.has("b")` selection for that single cell, so it acts as a flag (non-zero
// when the cell contains a <b>, 0 otherwise) rather than a tally of <b> tags.
// Rows without any <td> produce an empty inner array.
//
// $ - cheerio-style selector function for the loaded document.
const tableCountBolds = ($) => {
  const boldFlagsForRow = (tr) =>
    $(tr)
      .find("td")
      .toArray()
      .map((cell) => $(cell).has("b").length);

  return $("tr").toArray().map(boldFlagsForRow);
};
// Return the share of truthy cells in a 2-D table as a fraction between 0 and 1
// (despite the name, the value is not scaled to 0-100).
//
// table - array of row arrays; a cell counts as "bold" when its value is truthy.
//
// A table with no cells at all (no rows, or only empty rows) returns 0 instead
// of the NaN that a 0/0 division would produce.
const tableBoldPercent = (table) => {
  const cells = table.flat();
  if (cells.length === 0) {
    return 0;
  }
  const boldCount = cells.filter(Boolean).length;
  return boldCount / cells.length;
};
// Convert the document's table markup into a 2-D array of strings: one inner
// array per <tr>, holding the text of each <td> found inside that row. Rows
// without any <td> yield an empty inner array.
//
// $ - cheerio-style selector function for the loaded document.
const tableToArray = ($) => {
  const rowTexts = [];
  for (const tr of $("tr").toArray()) {
    const cellTexts = $(tr)
      .find("td")
      .toArray()
      .map((cell) => $(cell).text());
    rowTexts.push(cellTexts);
  }
  return rowTexts;
};
// Gather the text of every <th> element in the document, in the order the
// selector returns them, as a flat array of strings (empty when there is no <th>).
//
// $ - cheerio-style selector function for the loaded document.
const tableGetHeaders = ($) => {
  const headers = [];
  for (const th of $("th").toArray()) {
    headers.push($(th).text());
  }
  return headers;
};
const parseTable = text => {
const $ = cheerio.load(text, {decodeEntities: false});
const tableToArray = $("tr").toArray().map( (tr) => {
const tds = $(tr).find("td");
return tds.map((idx, td) => $(td).text()).toArray();
});
const filtered = tableToArray.filter(arr => arr.length); // remove empty inner arrays []
const rows = tableToArray($).filter(arr => arr.length); // remove empty inner arrays []
let headers = tableGetHeaders($);
if (!headers.length) { // no <th>. check bold rules
let boldTable = tableCountBolds($);
boldTable = boldTable.filter(arr => arr.length);
const firstRowAllBold = (boldTable) => !!boldTable[0].length && boldTable[0].filter(x => x).length === boldTable[0].length;
const boldPercent = tableBoldPercent(boldTable);
const tableAllBold = Math.abs(1.0 - boldPercent) < 0.001;
// if first row is all bold AND table is not all with bold then use first row as header
if (!tableAllBold && firstRowAllBold(boldTable)) {
headers = rows.shift();
}
}
const static_content = {
type: 'table',
body: JSON.stringify(filtered)
body: {
headers,
rows
},
};
return {

@@ -248,2 +287,4 @@ type: 'static',

// Strip whatever sits between a closing </li> and the next opening <li>, as long
// as that gap is one or more characters and contains no "<". Only a bare "<li>"
// opener is matched; the gap is usually newlines/indentation.
const removeNewlinesFromLists = (html) =>
  R.replace(/\/li>([^<]+)<li>/g, '\/li><li>', html);
// Turn every newline character in the given text into an HTML <br> tag.
const replaceNewLineCharacters = (text) => R.replace(/\n/g, '<br>', text);

@@ -250,0 +291,0 @@

2

package.json
{
"name": "tm-content-parser",
"version": "1.0.15",
"version": "1.0.18",
"description": "Trinity Mirror Content Type Parser",

@@ -5,0 +5,0 @@ "main": "index.js",

@@ -37,2 +37,8 @@ # tm-content-parser

## Release Log
1.0.11 - not parsing &amp; as this broke urls querystrings.
1.0.18 - use newer node version
1.0.17 - pulled AB-700 branch
1.0.15 - extract <th> from tables
1.0.15 - parse html tables, produce new type: table
1.0.13 - remove text between closing <li> and next open <li> element
1.0.12 - AB-564 add span element to whitelist
1.0.11 - not parsing &amp; as this broke urls querystrings.

@@ -107,2 +107,32 @@ "use strict";

// Table-driven check of removeNewlinesFromLists: each case pairs an HTML input
// with the expected output, in which the text between a closing </li> and the
// next <li> is removed while whitespace elsewhere is left alone.
describe('.removeNewlinesFromLists', () => {
  // The function is module-private, so it is fetched through the module's __get__ accessor.
  const removeNewlinesFromLists = contentTypeParser.__get__("removeNewlinesFromLists");

  it('should remove anything between closing </li> and the next opening <li> tag', () => {
    const cases = [
      {
        source: '<p>This is a paragraph with \n new \n lines \n <ul><li>Coffee</li>\n<li>Tea</li>\n<li>Milk</li></ul></p>',
        result: '<p>This is a paragraph with \n new \n lines \n <ul><li>Coffee</li><li>Tea</li><li>Milk</li></ul></p>'
      },
      {
        source: "<ul> \n <li>People aged 50 to 59;</li> \n <li>Unpaid carers who are 16 and over;</li> \n <li>Those aged 16 and over who are household contacts of immunosuppressed individuals.</li> \n</ul>",
        result: "<ul> \n <li>People aged 50 to 59;</li><li>Unpaid carers who are 16 and over;</li><li>Those aged 16 and over who are household contacts of immunosuppressed individuals.</li> \n</ul>",
      },
      {
        source: "<ul> \n <li>Thursday 9 December to Sunday 12 December</li> \n <li>Thursday 16 December to Sunday 19 December</li> \n</ul>",
        result: "<ul> \n <li>Thursday 9 December to Sunday 12 December</li><li>Thursday 16 December to Sunday 19 December</li> \n</ul>",
      },
      {
        source: "<ul> \n <li>The whites of your eyes or your skin turning yellow</li> \n <li>Loss of appetite or losing weight without trying to</li> \n <li>Feeling fatigued</li> \n <li>High temperature, or feeling hot or shivery</li> \n <li>Darker urine and paler stool than usual</li> \n <li>Itchy skin</li> \n</ul>",
        result: "<ul> \n <li>The whites of your eyes or your skin turning yellow</li><li>Loss of appetite or losing weight without trying to</li><li>Feeling fatigued</li><li>High temperature, or feeling hot or shivery</li><li>Darker urine and paler stool than usual</li><li>Itchy skin</li> \n</ul>",
      },
    ];

    // Run the cases in order; the first mismatch fails the spec.
    cases.forEach(({ source, result }) => {
      removeNewlinesFromLists(source).should.eql(result);
    });
  });
});
describe('.replaceNewLineCharacters', () => {

@@ -144,5 +174,35 @@

describe('test table parser', () => {
const table = `
const parseTable = contentTypeParser.__get__("parseTable");
it('should parse table without <th>', () => {
const table = `
<table>
<tr>
</tr>
<tr>
<td>Alfreds Futterkiste</td>
<td><b>Maria Anders</b></td>
<td>Germany</td>
</tr>
<tr>
<td>Centro comercial Moctezuma</td>
<td>Francisco Chang</td>
<td><b>Mexico</b></td>
</tr>
</table>`;
const parsed = parseTable(table);
parsed.type.should.eql("static");
parsed.static_content.type.should.eql("table");
parsed.static_content.body.rows.should.eql([
["Alfreds Futterkiste","Maria Anders","Germany"],
["Centro comercial Moctezuma","Francisco Chang","Mexico"]
]);
parsed.static_content.body.headers.should.eql([]);
});
it('should extract table header from <th>', () => {
const table = `
<table>
<tr>
<th>Company</th>

@@ -154,3 +214,3 @@ <th>Contact</th>

<td>Alfreds Futterkiste</td>
<td>Maria Anders</td>
<td><b>Maria Anders</b></td>
<td>Germany</td>

@@ -161,13 +221,68 @@ </tr>

<td>Francisco Chang</td>
<td>Mexico</td>
<td><b>Mexico</b></td>
</tr>
</table>`;
const parsed = parseTable(table);
parsed.type.should.eql("static");
parsed.static_content.type.should.eql("table");
parsed.static_content.body.rows.should.eql([
[ 'Alfreds Futterkiste', 'Maria Anders', 'Germany' ],
[ 'Centro comercial Moctezuma', 'Francisco Chang', 'Mexico' ]
]);
parsed.static_content.body.headers.should.eql([ 'Company', 'Contact', 'Country' ]);
});
const parseTable = contentTypeParser.__get__("parseTable");
it('should correctly parse table', () => {
it('should extract header from first row with all bold', () => {
const table = `
<table>
<tr>
<td><b>Company</b></td>
<td><b>Contact</b></td>
<td><b>Country</b></td>
</tr>
<tr>
<td>Alfreds Futterkiste</td>
<td><b>Maria Anders</b></td>
<td>Germany</td>
</tr>
<tr>
<td>Centro comercial Moctezuma</td>
<td>Francisco Chang</td>
<td><b>Mexico</b></td>
</tr>
</table>`;
const parsed = parseTable(table);
parsed.type.should.eql("static");
parsed.static_content.type.should.eql("table");
parsed.static_content.body.should.eql(`[["Alfreds Futterkiste","Maria Anders","Germany"],["Centro comercial Moctezuma","Francisco Chang","Mexico"]]`);
parsed.static_content.body.rows.should.eql([
[ 'Alfreds Futterkiste', 'Maria Anders', 'Germany' ],
[ 'Centro comercial Moctezuma', 'Francisco Chang', 'Mexico' ]
]);
parsed.static_content.body.headers.should.eql([ 'Company', 'Contact', 'Country' ]);
});
it('should not add hedaers when all table elements are bold', () => {
const table = `
<table>
<tr>
<td><b>Company</b></td>
<td><b>Contact</b></td>
<td><b>Country</b></td>
</tr>
<tr>
<td><b>Alfreds Futterkiste</td>
<td><b>Maria Anders</b></td>
<td><b>Germany</td>
</tr>
</table>`;
const parsed = parseTable(table);
parsed.type.should.eql("static");
parsed.static_content.type.should.eql("table");
parsed.static_content.body.rows.should.eql([
[ 'Company', 'Contact', 'Country' ],
[ 'Alfreds Futterkiste', 'Maria Anders', 'Germany' ]
]
);
parsed.static_content.body.headers.should.eql([]);
});
});

@@ -174,0 +289,0 @@

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc