Socket
Socket
Sign in · Demo · Install

web-auto-extractor

Package Overview
Dependencies
98
Maintainers
1
Versions
38
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 1.0.0-beta.5 to 1.0.0-beta.6

47

dist/index.js

@@ -27,44 +27,15 @@ 'use strict';

/**
 * Adapts a generator function so that calling it returns a Promise.
 *
 * The wrapper invokes `fn` with the caller's `this` and arguments to obtain
 * an iterator, then drives that iterator: every yielded value is passed
 * through `Promise.resolve`, its fulfilment is fed back in via `next`, and
 * its rejection is fed back in via `throw`. The returned Promise resolves
 * with the iterator's final value and rejects if stepping the iterator throws.
 *
 * @param {Function} fn - Function whose return value exposes `next`/`throw`.
 * @returns {Function} Wrapper that returns a Promise for the final value.
 */
function _asyncToGenerator(fn) {
  return function () {
    // Created outside the Promise, so a synchronous throw from `fn` itself
    // propagates to the caller instead of becoming a rejection.
    var iterator = fn.apply(this, arguments);

    return new Promise(function (resolve, reject) {
      function resumeWithValue(result) {
        return advance("next", result);
      }

      function resumeWithError(reason) {
        return advance("throw", reason);
      }

      function advance(method, input) {
        var state;
        var yielded;

        try {
          state = iterator[method](input);
          yielded = state.value;
        } catch (failure) {
          reject(failure);
          return undefined;
        }

        if (state.done) {
          resolve(yielded);
          return undefined;
        }

        return Promise.resolve(yielded).then(resumeWithValue, resumeWithError);
      }

      return advance("next");
    });
  };
}
exports.default = {
parse: function parse(html, $html) {
var _this = this;
if (!($html && $html.prototype && $html.prototype.cheerio)) {
$html = _cheerio2.default.load(html, { xmlMode: true });
}
return _asyncToGenerator(regeneratorRuntime.mark(function _callee() {
return regeneratorRuntime.wrap(function _callee$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
if (!($html && $html.prototype && $html.prototype.cheerio)) {
$html = _cheerio2.default.load(html, { xmlMode: true });
}
_context.t0 = (0, _metatagParser2.default)($html);
_context.next = 4;
return (0, _microRdfaParser2.default)(html, 'micro', $html);
case 4:
_context.t1 = _context.sent;
_context.next = 7;
return (0, _microRdfaParser2.default)(html, 'rdfa', $html);
case 7:
_context.t2 = _context.sent;
_context.t3 = (0, _jsonldParser2.default)($html);
return _context.abrupt('return', {
metaTags: _context.t0,
microdata: _context.t1,
rdfa: _context.t2,
jsonld: _context.t3
});
case 10:
case 'end':
return _context.stop();
}
}
}, _callee, _this);
}))();
return {
metatags: (0, _metatagParser2.default)($html),
microdata: (0, _microRdfaParser2.default)(html, 'micro', $html),
rdfa: (0, _microRdfaParser2.default)(html, 'rdfa', $html),
jsonld: (0, _jsonldParser2.default)($html)
};
}
};

@@ -13,14 +13,4 @@ 'use strict';

/**
 * Reduces every entry list in `items` to just its `value` fields.
 *
 * @param {Object} items - Map of name to an array of objects carrying `value`.
 * @returns {Object} New object with the same keys, each mapped to an array
 *   of the corresponding `value` properties in their original order.
 */
var normalize = function normalize(items) {
  var valuesByName = {};
  var names = Object.keys(items);

  for (var i = 0; i < names.length; i += 1) {
    var name = names[i];
    valuesByName[name] = items[name].map(function (entry) {
      return entry.value;
    });
  }

  return valuesByName;
};
exports.default = function ($) {
var parsedMetaItems = {};
var metatagsData = {};
$('meta').each(function (index, elem) {

@@ -32,20 +22,8 @@ var nameKey = _lodash2.default.find(_lodash2.default.keys(elem.attribs), function (attr) {

var value = elem.attribs['content'];
if (!parsedMetaItems[name]) {
parsedMetaItems[name] = [];
if (!metatagsData[name]) {
metatagsData[name] = [];
}
parsedMetaItems[name].push({
value: value,
selector: {
select: 'meta[' + nameKey + '="' + name + '"]:eq(' + parsedMetaItems[name].length + ')',
extract: {
attr: 'content'
}
}
});
metatagsData[name].push(value);
});
var data = normalize(parsedMetaItems);
return {
data: data,
unnormalizedData: parsedMetaItems
};
return metatagsData;
};

163

dist/parsers/micro-rdfa-parser.js

@@ -54,96 +54,97 @@ 'use strict';

exports.default = function (html, specName, $) {
return new Promise(function (resolve, reject) {
var _getAttrNames = getAttrNames(specName);
var createHandler = function createHandler(specName) {
var scopes = [];
var tags = [];
var topLevelScope = {};
var textForProp = null;
var TYPE = _getAttrNames.TYPE;
var PROP = _getAttrNames.PROP;
var _getAttrNames = getAttrNames(specName);
var scopes = [];
var tags = [];
var topLevelScope = {};
var textForProp = null;
var TYPE = _getAttrNames.TYPE;
var PROP = _getAttrNames.PROP;
var parser = new _htmlparser2.default.Parser({
onopentag: function onopentag(tagName, attribs) {
var currentScope = scopes[scopes.length - 1];
var tag = false;
if (attribs[TYPE]) {
if (attribs[PROP] && currentScope) {
var newScope = {};
currentScope[attribs[PROP]] = currentScope[attribs[PROP]] || [];
currentScope[attribs[PROP]].push(newScope);
currentScope = newScope;
} else {
currentScope = {};
var onopentag = function onopentag(tagName, attribs) {
var currentScope = scopes[scopes.length - 1];
var tag = false;
var _getType = getType(attribs[TYPE]);
if (attribs[TYPE]) {
if (attribs[PROP] && currentScope) {
var newScope = {};
currentScope[attribs[PROP]] = currentScope[attribs[PROP]] || [];
currentScope[attribs[PROP]].push(newScope);
currentScope = newScope;
} else {
currentScope = {};
var type = _getType.type;
var _getType = getType(attribs[TYPE]);
topLevelScope[type] = topLevelScope[type] || [];
topLevelScope[type].push(currentScope);
}
}
var type = _getType.type;
if (currentScope) {
if (attribs[TYPE]) {
var _getType2 = getType(attribs[TYPE]);
topLevelScope[type] = topLevelScope[type] || [];
topLevelScope[type].push(currentScope);
}
}
var context = _getType2.context;
var _type = _getType2.type;
if (currentScope) {
if (attribs[TYPE]) {
var _getType2 = getType(attribs[TYPE]);
var vocab = attribs.vocab;
currentScope['@context'] = context || vocab;
currentScope['@type'] = _type;
tag = TYPE;
scopes.push(currentScope);
} else if (attribs[PROP]) {
var value = getPropValue(tagName, attribs, TYPE, PROP);
if (!value) {
tag = PROP;
currentScope[attribs[PROP]] = '';
textForProp = attribs[PROP];
} else {
currentScope[attribs[PROP]] = value;
}
}
}
tags.push(tag);
},
var context = _getType2.context;
var _type = _getType2.type;
ontext: function ontext(text) {
if (textForProp) {
scopes[scopes.length - 1][textForProp] += text.trim();
var vocab = attribs.vocab;
currentScope['@context'] = context || vocab;
currentScope['@type'] = _type;
tag = TYPE;
scopes.push(currentScope);
} else if (attribs[PROP]) {
var value = getPropValue(tagName, attribs, TYPE, PROP);
if (!value) {
tag = PROP;
currentScope[attribs[PROP]] = '';
textForProp = attribs[PROP];
} else {
currentScope[attribs[PROP]] = value;
}
},
onclosetag: function onclosetag(tagname) {
var tag = tags.pop();
if (tag === TYPE) {
(function () {
var scope = scopes.pop();
if (!scope['@context']) {
delete scope['@context'];
}
Object.keys(scope).forEach(function (key) {
if (_lodash2.default.isArray(scope[key]) && scope[key].length === 1) {
scope[key] = scope[key][0];
}
});
})();
} else if (tag === PROP) {
textForProp = false;
}
}
tags.push(tag);
};
var ontext = function ontext(text) {
if (textForProp) {
scopes[scopes.length - 1][textForProp] += text.trim();
}
};
var onclosetag = function onclosetag(tagname) {
var tag = tags.pop();
if (tag === TYPE) {
(function () {
var scope = scopes.pop();
if (!scope['@context']) {
delete scope['@context'];
}
},
onerror: function onerror(err) {
reject(err);
},
onend: function onend() {
resolve(topLevelScope);
}
});
parser.write(html);
parser.done();
});
Object.keys(scope).forEach(function (key) {
if (_lodash2.default.isArray(scope[key]) && scope[key].length === 1) {
scope[key] = scope[key][0];
}
});
})();
} else if (tag === PROP) {
textForProp = false;
}
};
return {
onopentag: onopentag,
ontext: ontext,
onclosetag: onclosetag,
topLevelScope: topLevelScope
};
};
exports.default = function (html, specName, $) {
var handler = createHandler(specName);
new _htmlparser2.default.Parser(handler).end(html);
return handler.topLevelScope;
};
{
"name": "web-auto-extractor",
"version": "1.0.0-beta.5",
"version": "1.0.0-beta.6",
"description": "Automatically extracts structured information from webpages",

@@ -5,0 +5,0 @@ "main": "dist/index.js",

Socket · Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc