web-auto-extractor
Advanced tools
Comparing version 1.0.0-beta.2 to 1.0.0-beta.3
@@ -11,7 +11,10 @@ 'use strict'; | ||
var $html = (0, _utils.getCheerioObject)(html); | ||
var jsonldData = []; | ||
var jsonldData = {}; | ||
$html('script[type="application/ld+json"]').each(function (index, item) { | ||
try { | ||
jsonldData[index] = JSON.parse((0, _cheerio2.default)(item).text()); | ||
var parsedJSON = JSON.parse((0, _cheerio2.default)(item).text()); | ||
var type = parsedJSON['@type']; | ||
jsonldData[type] = jsonldData[type] || []; | ||
jsonldData[type].push(parsedJSON); | ||
} catch (e) { | ||
@@ -22,6 +25,3 @@ console.log('Error in jsonld parse - ' + e); | ||
return { | ||
data: jsonldData, | ||
unnormalizedData: null | ||
}; | ||
return jsonldData; | ||
}; | ||
@@ -28,0 +28,0 @@ |
@@ -18,24 +18,13 @@ 'use strict'; | ||
function getPropValue(tagName, attribs, TYPE, PROP) { | ||
var value = void 0, | ||
attr = void 0; | ||
if (attribs[TYPE]) { | ||
value = null; | ||
attr = null; | ||
return null; | ||
} else if (tagName === 'a' || tagName === 'link') { | ||
value = attribs.href.trim(); | ||
attr = 'href'; | ||
return attribs.href.trim(); | ||
} else if (attribs.content) { | ||
value = attribs.content.trim(); | ||
attr = 'content'; | ||
return attribs.content.trim(); | ||
} else if (attribs[PROP] === 'image' && attribs.src) { | ||
value = attribs.src.trim(); | ||
attr = 'src'; | ||
return attribs.src.trim(); | ||
} else { | ||
value = null; | ||
attr = '@text'; | ||
return null; | ||
} | ||
return { | ||
value: value, | ||
attr: attr | ||
}; | ||
} | ||
@@ -75,5 +64,4 @@ | ||
var tags = []; | ||
var props = []; | ||
var path = []; | ||
var topLevelScope = {}; | ||
var textForProp = null; | ||
@@ -84,4 +72,2 @@ var parser = new _htmlparser2.default.Parser({ | ||
var tag = false; | ||
var parentScope = void 0, | ||
scopeIndex = void 0; | ||
@@ -91,9 +77,6 @@ if (attribs[TYPE]) { | ||
var newScope = {}; | ||
parentScope = currentScope; | ||
currentScope[attribs[PROP]] = currentScope[attribs[PROP]] || []; | ||
scopeIndex = currentScope[attribs[PROP]].length; | ||
currentScope[attribs[PROP]].push(newScope); | ||
currentScope = newScope; | ||
} else { | ||
parentScope = topLevelScope; | ||
currentScope = {}; | ||
@@ -106,3 +89,2 @@ | ||
topLevelScope[type] = topLevelScope[type] || []; | ||
scopeIndex = topLevelScope[type].length; | ||
topLevelScope[type].push(currentScope); | ||
@@ -113,10 +95,4 @@ } | ||
if (currentScope) { | ||
var _getPropValue = getPropValue(tagName, attribs, TYPE, PROP); | ||
var value = getPropValue(tagName, attribs, TYPE, PROP); | ||
var value = _getPropValue.value; | ||
var attr = _getPropValue.attr; | ||
var selector = void 0, | ||
name = void 0; | ||
if (attribs[TYPE]) { | ||
@@ -131,37 +107,12 @@ var _getType2 = getType(attribs[TYPE]); | ||
currentScope['@type'] = _type; | ||
name = attribs[PROP] ? attribs[PROP] : _type; | ||
var parentSelector = parentScope['@selector'] ? parentScope['@selector'] + ' ' : ''; | ||
var selfSelector = attribs[PROP] ? '[' + PROP + '="' + attribs[PROP] + '"]' : '[' + TYPE + '="' + attribs[TYPE] + '"]'; | ||
currentScope['@selector'] = parentSelector + selfSelector + (':eq(' + scopeIndex + ')'); | ||
tag = TYPE; | ||
selector = { | ||
select: currentScope['@selector'], | ||
extract: { | ||
attr: attr | ||
} | ||
}; | ||
props.push({ | ||
name: name, | ||
value: value, | ||
selector: selector, | ||
path: path.concat(name, scopeIndex) | ||
}); | ||
path.push(name, scopeIndex); | ||
scopes.push(currentScope); | ||
} else if (attribs[PROP]) { | ||
selector = { | ||
select: currentScope['@selector'] + ' ' + ('[' + PROP + '="' + attribs[PROP] + '"]') + ':eq(0)', | ||
extract: { | ||
attr: attr | ||
} | ||
}; | ||
value = !value && selector ? $(selector.select).text().trim() : value; | ||
currentScope[attribs[PROP]] = value; | ||
name = attribs[PROP]; | ||
props.push({ | ||
name: name, | ||
value: value, | ||
selector: selector, | ||
path: path.concat(name) | ||
}); | ||
if (!value) { | ||
tag = PROP; | ||
currentScope[attribs[PROP]] = ''; | ||
textForProp = attribs[PROP]; | ||
} else { | ||
currentScope[attribs[PROP]] = value; | ||
} | ||
} | ||
@@ -171,8 +122,13 @@ } | ||
}, | ||
ontext: function ontext(text) { | ||
if (textForProp) { | ||
scopes[scopes.length - 1][textForProp] += text.trim(); | ||
} | ||
}, | ||
onclosetag: function onclosetag(tagname) { | ||
var tag = tags.pop(); | ||
if (tag) { | ||
if (tag === TYPE) { | ||
(function () { | ||
var scope = scopes.pop(); | ||
delete scope['@selector']; | ||
if (!scope['@context']) { | ||
@@ -186,5 +142,5 @@ delete scope['@context']; | ||
}); | ||
path.pop(); | ||
path.pop(); | ||
})(); | ||
} else if (tag === PROP) { | ||
textForProp = false; | ||
} | ||
@@ -196,3 +152,3 @@ }, | ||
onend: function onend() { | ||
resolve({ data: topLevelScope, unnormalizedData: props }); | ||
resolve(topLevelScope); | ||
} | ||
@@ -199,0 +155,0 @@ }); |
{ | ||
"name": "web-auto-extractor", | ||
"version": "1.0.0-beta.2", | ||
"version": "1.0.0-beta.3", | ||
"description": "Automatically extracts structured information from webpages", | ||
@@ -5,0 +5,0 @@ "main": "dist/index.js", |
17552
300