tm-content-parser
Advanced tools
Comparing version 1.0.15 to 1.0.18
53
index.js
@@ -203,2 +203,3 @@ "use strict"; | ||
replaceEntityTags, | ||
removeNewlinesFromLists, | ||
replaceNewLineCharacters, | ||
@@ -211,13 +212,51 @@ formatHtml | ||
/**
 * Build a 2-D array, one inner array per <tr>, that flags which cells are bold.
 *
 * Each entry is 1 when the matching <td> contains a <b> descendant and 0
 * otherwise. A row with no <td> children yields an empty inner array.
 *
 * @param {Function} $ - cheerio instance loaded with the table markup.
 * @returns {number[][]} per-row arrays of 0/1 bold flags.
 */
const tableCountBolds = ($) => $("tr").toArray().map((tr) => {
  const tds = $(tr).find("td");
  // .has("b") narrows the single-cell selection to cells with a <b> inside,
  // so .length is a 0/1 flag rather than a count of <b> tags.
  return tds.map((idx, td) => $(td).has("b").length).toArray();
});
/**
 * Compute the fraction of cells that are flagged as bold.
 *
 * @param {Array<Array<number|boolean>>} table - 2-D array of per-cell flags;
 *   any truthy entry counts as a bold cell.
 * @returns {number} ratio between 0 and 1; 0 when the table has no cells.
 */
const tableBoldPercent = (table) => {
  const flat = table.flat();
  // Guard the empty table: 0 / 0 would otherwise produce NaN.
  if (flat.length === 0) {
    return 0;
  }
  const countBold = flat.filter(Boolean).length;
  return countBold / flat.length;
};
/**
 * Convert the table markup into a 2-D array of cell texts.
 *
 * One inner array is produced per <tr>, holding the text of each <td> found
 * under it. A row with no <td> (for example a row of <th> only) yields an
 * empty inner array; callers filter those out.
 *
 * @param {Function} $ - cheerio instance loaded with the table markup.
 * @returns {string[][]} cell texts grouped by row.
 */
const tableToArray = ($) => $("tr").toArray().map((tr) => {
  const tds = $(tr).find("td");
  // cheerio's .map passes (index, element), hence the unused first argument.
  return tds.map((idx, td) => $(td).text()).toArray();
});
/**
 * Collect the text of every <th> element into a flat array.
 *
 * @param {Function} $ - cheerio instance loaded with the table markup.
 * @returns {string[]} header texts in the order the selection returns them; empty when there is no <th>.
 */
const tableGetHeaders = ($) => $("th").toArray().map((th) => $(th).text());
const parseTable = text => { | ||
const $ = cheerio.load(text, {decodeEntities: false}); | ||
const tableToArray = $("tr").toArray().map( (tr) => { | ||
const tds = $(tr).find("td"); | ||
return tds.map((idx, td) => $(td).text()).toArray(); | ||
}); | ||
const filtered = tableToArray.filter(arr => arr.length); // remove empty inner arrays [] | ||
const rows = tableToArray($).filter(arr => arr.length); // remove empty inner arrays [] | ||
let headers = tableGetHeaders($); | ||
if (!headers.length) { // no <th>. check bold rules | ||
let boldTable = tableCountBolds($); | ||
boldTable = boldTable.filter(arr => arr.length); | ||
const firstRowAllBold = (boldTable) => !!boldTable[0].length && boldTable[0].filter(x => x).length === boldTable[0].length; | ||
const boldPercent = tableBoldPercent(boldTable); | ||
const tableAllBold = Math.abs(1.0 - boldPercent) < 0.001; | ||
// if first row is all bold AND table is not all with bold then use first row as header | ||
if (!tableAllBold && firstRowAllBold(boldTable)) { | ||
headers = rows.shift(); | ||
} | ||
} | ||
const static_content = { | ||
type: 'table', | ||
body: JSON.stringify(filtered) | ||
body: { | ||
headers, | ||
rows | ||
}, | ||
}; | ||
return { | ||
@@ -248,2 +287,4 @@ type: 'static', | ||
// Drop whatever sits between a closing </li> and the next opening <li>
// (typically whitespace and newlines). Only a bare "<li>" opener is matched,
// and the run between the two tags must be non-empty and contain no "<".
const removeNewlinesFromLists = R.replace(/\/li>([^<]+)<li>/g, '/li><li>');
// Turn every newline character in the input into an HTML <br> tag.
const replaceNewLineCharacters = R.replace(/\n/g, '<br>');
@@ -250,0 +291,0 @@ |
{ | ||
"name": "tm-content-parser", | ||
"version": "1.0.15", | ||
"version": "1.0.18", | ||
"description": "Trinity Mirror Content Type Parser", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
@@ -37,2 +37,8 @@ # tm-content-parser | ||
## Release Log | ||
1.0.11 - not parsing & as this broke URL query strings.
1.0.18 - use newer node version | ||
1.0.17 - pulled AB-700 branch | ||
1.0.15 - extract <th> from tables | ||
1.0.15 - parse html tables, produce new type: table | ||
1.0.13 remove text between closing <li> and next open <li> element | ||
1.0.12 AB-564 add span element to whitelist | ||
1.0.11 - not parsing & as this broke URL query strings.
@@ -107,2 +107,32 @@ "use strict"; | ||
// Checks that removeNewlinesFromLists strips whatever sits between a closing
// </li> and the next opening <li>, while leaving text outside the list items
// (including newlines before the first <li> and after the last </li>) untouched.
describe('.removeNewlinesFromLists', () => {
  const removeNewlinesFromLists = contentTypeParser.__get__("removeNewlinesFromLists");
  it('should remove anything between closing </li> and the next opening <li> tag', () => {
    // Each case pairs raw markup with the markup expected after clean-up.
    const tests = [
      {
        source: '<p>This is a paragraph with \n new \n lines \n <ul><li>Coffee</li>\n<li>Tea</li>\n<li>Milk</li></ul></p>',
        result: '<p>This is a paragraph with \n new \n lines \n <ul><li>Coffee</li><li>Tea</li><li>Milk</li></ul></p>'
      },
      {
        source: "<ul> \n <li>People aged 50 to 59;</li> \n <li>Unpaid carers who are 16 and over;</li> \n <li>Those aged 16 and over who are household contacts of immunosuppressed individuals.</li> \n</ul>",
        result: "<ul> \n <li>People aged 50 to 59;</li><li>Unpaid carers who are 16 and over;</li><li>Those aged 16 and over who are household contacts of immunosuppressed individuals.</li> \n</ul>",
      },
      {
        source: "<ul> \n <li>Thursday 9 December to Sunday 12 December</li> \n <li>Thursday 16 December to Sunday 19 December</li> \n</ul>",
        result: "<ul> \n <li>Thursday 9 December to Sunday 12 December</li><li>Thursday 16 December to Sunday 19 December</li> \n</ul>",
      },
      {
        source: "<ul> \n <li>The whites of your eyes or your skin turning yellow</li> \n <li>Loss of appetite or losing weight without trying to</li> \n <li>Feeling fatigued</li> \n <li>High temperature, or feeling hot or shivery</li> \n <li>Darker urine and paler stool than usual</li> \n <li>Itchy skin</li> \n</ul>",
        result: "<ul> \n <li>The whites of your eyes or your skin turning yellow</li><li>Loss of appetite or losing weight without trying to</li><li>Feeling fatigued</li><li>High temperature, or feeling hot or shivery</li><li>Darker urine and paler stool than usual</li><li>Itchy skin</li> \n</ul>",
      },
    ];
    for (const { source, result } of tests) {
      removeNewlinesFromLists(source).should.eql(result);
    }
  });
});
describe('.replaceNewLineCharacters', () => { | ||
@@ -144,5 +174,35 @@ | ||
describe('test table parser', () => { | ||
const table = ` | ||
const parseTable = contentTypeParser.__get__("parseTable"); | ||
it('should parse table without <th>', () => { | ||
const table = ` | ||
<table> | ||
<tr> | ||
</tr> | ||
<tr> | ||
<td>Alfreds Futterkiste</td> | ||
<td><b>Maria Anders</b></td> | ||
<td>Germany</td> | ||
</tr> | ||
<tr> | ||
<td>Centro comercial Moctezuma</td> | ||
<td>Francisco Chang</td> | ||
<td><b>Mexico</b></td> | ||
</tr> | ||
</table>`; | ||
const parsed = parseTable(table); | ||
parsed.type.should.eql("static"); | ||
parsed.static_content.type.should.eql("table"); | ||
parsed.static_content.body.rows.should.eql([ | ||
["Alfreds Futterkiste","Maria Anders","Germany"], | ||
["Centro comercial Moctezuma","Francisco Chang","Mexico"] | ||
]); | ||
parsed.static_content.body.headers.should.eql([]); | ||
}); | ||
it('should extract table header from <th>', () => { | ||
const table = ` | ||
<table> | ||
<tr> | ||
<th>Company</th> | ||
@@ -154,3 +214,3 @@ <th>Contact</th> | ||
<td>Alfreds Futterkiste</td> | ||
<td>Maria Anders</td> | ||
<td><b>Maria Anders</b></td> | ||
<td>Germany</td> | ||
@@ -161,13 +221,68 @@ </tr> | ||
<td>Francisco Chang</td> | ||
<td>Mexico</td> | ||
<td><b>Mexico</b></td> | ||
</tr> | ||
</table>`; | ||
const parsed = parseTable(table); | ||
parsed.type.should.eql("static"); | ||
parsed.static_content.type.should.eql("table"); | ||
parsed.static_content.body.rows.should.eql([ | ||
[ 'Alfreds Futterkiste', 'Maria Anders', 'Germany' ], | ||
[ 'Centro comercial Moctezuma', 'Francisco Chang', 'Mexico' ] | ||
]); | ||
parsed.static_content.body.headers.should.eql([ 'Company', 'Contact', 'Country' ]); | ||
}); | ||
const parseTable = contentTypeParser.__get__("parseTable"); | ||
it('should correctly parse table', () => { | ||
it('should extract header from first row with all bold', () => { | ||
const table = ` | ||
<table> | ||
<tr> | ||
<td><b>Company</b></td> | ||
<td><b>Contact</b></td> | ||
<td><b>Country</b></td> | ||
</tr> | ||
<tr> | ||
<td>Alfreds Futterkiste</td> | ||
<td><b>Maria Anders</b></td> | ||
<td>Germany</td> | ||
</tr> | ||
<tr> | ||
<td>Centro comercial Moctezuma</td> | ||
<td>Francisco Chang</td> | ||
<td><b>Mexico</b></td> | ||
</tr> | ||
</table>`; | ||
const parsed = parseTable(table); | ||
parsed.type.should.eql("static"); | ||
parsed.static_content.type.should.eql("table"); | ||
parsed.static_content.body.should.eql(`[["Alfreds Futterkiste","Maria Anders","Germany"],["Centro comercial Moctezuma","Francisco Chang","Mexico"]]`); | ||
parsed.static_content.body.rows.should.eql([ | ||
[ 'Alfreds Futterkiste', 'Maria Anders', 'Germany' ], | ||
[ 'Centro comercial Moctezuma', 'Francisco Chang', 'Mexico' ] | ||
]); | ||
parsed.static_content.body.headers.should.eql([ 'Company', 'Contact', 'Country' ]); | ||
}); | ||
it('should not add hedaers when all table elements are bold', () => { | ||
const table = ` | ||
<table> | ||
<tr> | ||
<td><b>Company</b></td> | ||
<td><b>Contact</b></td> | ||
<td><b>Country</b></td> | ||
</tr> | ||
<tr> | ||
<td><b>Alfreds Futterkiste</td> | ||
<td><b>Maria Anders</b></td> | ||
<td><b>Germany</td> | ||
</tr> | ||
</table>`; | ||
const parsed = parseTable(table); | ||
parsed.type.should.eql("static"); | ||
parsed.static_content.type.should.eql("table"); | ||
parsed.static_content.body.rows.should.eql([ | ||
[ 'Company', 'Contact', 'Country' ], | ||
[ 'Alfreds Futterkiste', 'Maria Anders', 'Germany' ] | ||
] | ||
); | ||
parsed.static_content.body.headers.should.eql([]); | ||
}); | ||
}); | ||
@@ -174,0 +289,0 @@ |
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
77619
1551
44
11