web-auto-extractor
Advanced tools
Comparing version 1.0.0-beta.5 to 1.0.0-beta.6
@@ -27,44 +27,15 @@ 'use strict'; | ||
function _asyncToGenerator(fn) { return function () { var gen = fn.apply(this, arguments); return new Promise(function (resolve, reject) { function step(key, arg) { try { var info = gen[key](arg); var value = info.value; } catch (error) { reject(error); return; } if (info.done) { resolve(value); } else { return Promise.resolve(value).then(function (value) { return step("next", value); }, function (err) { return step("throw", err); }); } } return step("next"); }); }; } | ||
exports.default = { | ||
parse: function parse(html, $html) { | ||
var _this = this; | ||
if (!($html && $html.prototype && $html.prototype.cheerio)) { | ||
$html = _cheerio2.default.load(html, { xmlMode: true }); | ||
} | ||
return _asyncToGenerator(regeneratorRuntime.mark(function _callee() { | ||
return regeneratorRuntime.wrap(function _callee$(_context) { | ||
while (1) { | ||
switch (_context.prev = _context.next) { | ||
case 0: | ||
if (!($html && $html.prototype && $html.prototype.cheerio)) { | ||
$html = _cheerio2.default.load(html, { xmlMode: true }); | ||
} | ||
_context.t0 = (0, _metatagParser2.default)($html); | ||
_context.next = 4; | ||
return (0, _microRdfaParser2.default)(html, 'micro', $html); | ||
case 4: | ||
_context.t1 = _context.sent; | ||
_context.next = 7; | ||
return (0, _microRdfaParser2.default)(html, 'rdfa', $html); | ||
case 7: | ||
_context.t2 = _context.sent; | ||
_context.t3 = (0, _jsonldParser2.default)($html); | ||
return _context.abrupt('return', { | ||
metaTags: _context.t0, | ||
microdata: _context.t1, | ||
rdfa: _context.t2, | ||
jsonld: _context.t3 | ||
}); | ||
case 10: | ||
case 'end': | ||
return _context.stop(); | ||
} | ||
} | ||
}, _callee, _this); | ||
}))(); | ||
return { | ||
metatags: (0, _metatagParser2.default)($html), | ||
microdata: (0, _microRdfaParser2.default)(html, 'micro', $html), | ||
rdfa: (0, _microRdfaParser2.default)(html, 'rdfa', $html), | ||
jsonld: (0, _jsonldParser2.default)($html) | ||
}; | ||
} | ||
}; |
@@ -13,14 +13,4 @@ 'use strict'; | ||
var normalize = function normalize(items) { | ||
return Object.keys(items).reduce(function (normalizedItems, itemName) { | ||
normalizedItems[itemName] = items[itemName].map(function (_ref) { | ||
var value = _ref.value; | ||
return value; | ||
}); | ||
return normalizedItems; | ||
}, {}); | ||
}; | ||
exports.default = function ($) { | ||
var parsedMetaItems = {}; | ||
var metatagsData = {}; | ||
$('meta').each(function (index, elem) { | ||
@@ -32,20 +22,8 @@ var nameKey = _lodash2.default.find(_lodash2.default.keys(elem.attribs), function (attr) { | ||
var value = elem.attribs['content']; | ||
if (!parsedMetaItems[name]) { | ||
parsedMetaItems[name] = []; | ||
if (!metatagsData[name]) { | ||
metatagsData[name] = []; | ||
} | ||
parsedMetaItems[name].push({ | ||
value: value, | ||
selector: { | ||
select: 'meta[' + nameKey + '="' + name + '"]:eq(' + parsedMetaItems[name].length + ')', | ||
extract: { | ||
attr: 'content' | ||
} | ||
} | ||
}); | ||
metatagsData[name].push(value); | ||
}); | ||
var data = normalize(parsedMetaItems); | ||
return { | ||
data: data, | ||
unnormalizedData: parsedMetaItems | ||
}; | ||
return metatagsData; | ||
}; |
@@ -54,96 +54,97 @@ 'use strict'; | ||
exports.default = function (html, specName, $) { | ||
return new Promise(function (resolve, reject) { | ||
var _getAttrNames = getAttrNames(specName); | ||
var createHandler = function createHandler(specName) { | ||
var scopes = []; | ||
var tags = []; | ||
var topLevelScope = {}; | ||
var textForProp = null; | ||
var TYPE = _getAttrNames.TYPE; | ||
var PROP = _getAttrNames.PROP; | ||
var _getAttrNames = getAttrNames(specName); | ||
var scopes = []; | ||
var tags = []; | ||
var topLevelScope = {}; | ||
var textForProp = null; | ||
var TYPE = _getAttrNames.TYPE; | ||
var PROP = _getAttrNames.PROP; | ||
var parser = new _htmlparser2.default.Parser({ | ||
onopentag: function onopentag(tagName, attribs) { | ||
var currentScope = scopes[scopes.length - 1]; | ||
var tag = false; | ||
if (attribs[TYPE]) { | ||
if (attribs[PROP] && currentScope) { | ||
var newScope = {}; | ||
currentScope[attribs[PROP]] = currentScope[attribs[PROP]] || []; | ||
currentScope[attribs[PROP]].push(newScope); | ||
currentScope = newScope; | ||
} else { | ||
currentScope = {}; | ||
var onopentag = function onopentag(tagName, attribs) { | ||
var currentScope = scopes[scopes.length - 1]; | ||
var tag = false; | ||
var _getType = getType(attribs[TYPE]); | ||
if (attribs[TYPE]) { | ||
if (attribs[PROP] && currentScope) { | ||
var newScope = {}; | ||
currentScope[attribs[PROP]] = currentScope[attribs[PROP]] || []; | ||
currentScope[attribs[PROP]].push(newScope); | ||
currentScope = newScope; | ||
} else { | ||
currentScope = {}; | ||
var type = _getType.type; | ||
var _getType = getType(attribs[TYPE]); | ||
topLevelScope[type] = topLevelScope[type] || []; | ||
topLevelScope[type].push(currentScope); | ||
} | ||
} | ||
var type = _getType.type; | ||
if (currentScope) { | ||
if (attribs[TYPE]) { | ||
var _getType2 = getType(attribs[TYPE]); | ||
topLevelScope[type] = topLevelScope[type] || []; | ||
topLevelScope[type].push(currentScope); | ||
} | ||
} | ||
var context = _getType2.context; | ||
var _type = _getType2.type; | ||
if (currentScope) { | ||
if (attribs[TYPE]) { | ||
var _getType2 = getType(attribs[TYPE]); | ||
var vocab = attribs.vocab; | ||
currentScope['@context'] = context || vocab; | ||
currentScope['@type'] = _type; | ||
tag = TYPE; | ||
scopes.push(currentScope); | ||
} else if (attribs[PROP]) { | ||
var value = getPropValue(tagName, attribs, TYPE, PROP); | ||
if (!value) { | ||
tag = PROP; | ||
currentScope[attribs[PROP]] = ''; | ||
textForProp = attribs[PROP]; | ||
} else { | ||
currentScope[attribs[PROP]] = value; | ||
} | ||
} | ||
} | ||
tags.push(tag); | ||
}, | ||
var context = _getType2.context; | ||
var _type = _getType2.type; | ||
ontext: function ontext(text) { | ||
if (textForProp) { | ||
scopes[scopes.length - 1][textForProp] += text.trim(); | ||
var vocab = attribs.vocab; | ||
currentScope['@context'] = context || vocab; | ||
currentScope['@type'] = _type; | ||
tag = TYPE; | ||
scopes.push(currentScope); | ||
} else if (attribs[PROP]) { | ||
var value = getPropValue(tagName, attribs, TYPE, PROP); | ||
if (!value) { | ||
tag = PROP; | ||
currentScope[attribs[PROP]] = ''; | ||
textForProp = attribs[PROP]; | ||
} else { | ||
currentScope[attribs[PROP]] = value; | ||
} | ||
}, | ||
onclosetag: function onclosetag(tagname) { | ||
var tag = tags.pop(); | ||
if (tag === TYPE) { | ||
(function () { | ||
var scope = scopes.pop(); | ||
if (!scope['@context']) { | ||
delete scope['@context']; | ||
} | ||
Object.keys(scope).forEach(function (key) { | ||
if (_lodash2.default.isArray(scope[key]) && scope[key].length === 1) { | ||
scope[key] = scope[key][0]; | ||
} | ||
}); | ||
})(); | ||
} else if (tag === PROP) { | ||
textForProp = false; | ||
} | ||
} | ||
tags.push(tag); | ||
}; | ||
var ontext = function ontext(text) { | ||
if (textForProp) { | ||
scopes[scopes.length - 1][textForProp] += text.trim(); | ||
} | ||
}; | ||
var onclosetag = function onclosetag(tagname) { | ||
var tag = tags.pop(); | ||
if (tag === TYPE) { | ||
(function () { | ||
var scope = scopes.pop(); | ||
if (!scope['@context']) { | ||
delete scope['@context']; | ||
} | ||
}, | ||
onerror: function onerror(err) { | ||
reject(err); | ||
}, | ||
onend: function onend() { | ||
resolve(topLevelScope); | ||
} | ||
}); | ||
parser.write(html); | ||
parser.done(); | ||
}); | ||
Object.keys(scope).forEach(function (key) { | ||
if (_lodash2.default.isArray(scope[key]) && scope[key].length === 1) { | ||
scope[key] = scope[key][0]; | ||
} | ||
}); | ||
})(); | ||
} else if (tag === PROP) { | ||
textForProp = false; | ||
} | ||
}; | ||
return { | ||
onopentag: onopentag, | ||
ontext: ontext, | ||
onclosetag: onclosetag, | ||
topLevelScope: topLevelScope | ||
}; | ||
}; | ||
exports.default = function (html, specName, $) { | ||
var handler = createHandler(specName); | ||
new _htmlparser2.default.Parser(handler).end(html); | ||
return handler.topLevelScope; | ||
}; |
{ | ||
"name": "web-auto-extractor", | ||
"version": "1.0.0-beta.5", | ||
"version": "1.0.0-beta.6", | ||
"description": "Automatically extracts structured information from webpages", | ||
@@ -5,0 +5,0 @@ "main": "dist/index.js", |
15235
252