Socket
Socket
Sign inDemoInstall

web-auto-extractor

Package Overview
Dependencies
97
Maintainers
1
Versions
38
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.2.1 to 0.3.0

75

dist/index.js

@@ -25,73 +25,12 @@ 'use strict';

/**
 * Builds the query mixin that is merged into every parser result.
 *
 * The returned object expects to be combined (e.g. via Object.assign) with an
 * object exposing `unnormalizedData()`; `find` calls it through `this`.
 *
 * @returns {{find: function(string): Array}} object carrying the `find` method
 */
var WAEParserObject = function WAEParserObject() {
  // Null-prototype cache: a plain {} would report inherited members such as
  // "constructor" or "toString" as already cached and hand back a function
  // instead of a result list.
  var cache = Object.create(null);
  return {
    /**
     * Returns every item of `this.unnormalizedData()` whose `name` equals
     * `propName`. The list is computed once per property name and the same
     * array instance is returned on later calls.
     *
     * @param {string} propName property name to look up
     * @returns {Array} matching items, in key order; empty when none match
     */
    find: function find(propName) {
      if (!cache[propName]) {
        var items = this.unnormalizedData();
        // Assign only after the lookup succeeded so that a throwing
        // unnormalizedData() does not leave an empty list cached.
        cache[propName] = Object.keys(items).map(function (key) {
          return items[key];
        }).filter(function (item) {
          return item.name === propName;
        });
      }
      return cache[propName];
    }
  };
};
/**
 * Builds the set of parse methods for one document.
 *
 * The returned object does not hold the document itself: every method reads
 * `this.$html`, so the caller must merge a `$html` property into it. Each
 * format is parsed the first time it is requested and the resulting object
 * is reused on later calls.
 *
 * @returns {Object} object with parseMicrodata, parseRdfa, parseJsonld,
 *   parseMetaTags and parse methods
 */
var WAEObject = function WAEObject() {
  // One slot per format, filled on first use.
  var parsed = {};

  // Returns the stored result for `slot`, building it on the first request by
  // merging the parser output into a fresh WAEParserObject.
  function once(slot, runParser) {
    if (!parsed[slot]) {
      parsed[slot] = Object.assign(WAEParserObject(), runParser());
    }
    return parsed[slot];
  }

  return {
    // Microdata, via the shared micro/RDFa parser in 'micro' mode.
    parseMicrodata: function parseMicrodata() {
      var self = this;
      return once('micro', function () {
        return (0, _microRdfaParser2.default)(self.$html, 'micro');
      });
    },
    // RDFa, via the shared micro/RDFa parser in 'rdfa' mode.
    parseRdfa: function parseRdfa() {
      var self = this;
      return once('rdfa', function () {
        return (0, _microRdfaParser2.default)(self.$html, 'rdfa');
      });
    },
    // JSON-LD.
    parseJsonld: function parseJsonld() {
      var self = this;
      return once('jsonld', function () {
        return (0, _jsonldParser2.default)(self.$html);
      });
    },
    // Meta tags.
    parseMetaTags: function parseMetaTags() {
      var self = this;
      return once('meta', function () {
        return (0, _metatagParser2.default)(self.$html);
      });
    },
    // Runs all four parsers (meta, micro, rdfa, jsonld, in that order) and
    // collects the value of each one's data() call.
    parse: function parse() {
      var metaData = this.parseMetaTags().data();
      var microData = this.parseMicrodata().data();
      var rdfaData = this.parseRdfa().data();
      var jsonldData = this.parseJsonld().data();
      return {
        meta: metaData,
        micro: microData,
        rdfa: rdfaData,
        jsonld: jsonldData
      };
    }
  };
};
exports.default = {
init: function init(html) {
parse: function parse(html) {
var $html = _cheerio2.default.load(html, { xmlMode: true });
return Object.assign({}, WAEObject(), {
$html: $html
});
return {
metaTags: (0, _metatagParser2.default)($html),
microdata: (0, _microRdfaParser2.default)($html, 'micro'),
rdfa: (0, _microRdfaParser2.default)($html, 'rdfa'),
jsonld: (0, _jsonldParser2.default)($html)
};
}
};

8

dist/parsers/jsonld-parser.js

@@ -22,8 +22,4 @@ 'use strict';

return {
data: function data() {
return jsonldData;
},
unnormalizedData: function unnormalizedData() {
return jsonldData;
}
data: jsonldData,
unnormalizedData: null
};

@@ -30,0 +26,0 @@ };

@@ -51,16 +51,7 @@ 'use strict';

});
return function () {
var cachedData = null;
return {
data: function data() {
if (!cachedData) {
cachedData = normalize(parsedMetaItems);
}
return cachedData;
},
unnormalizedData: function unnormalizedData() {
return parsedMetaItems;
}
};
}();
var data = normalize(parsedMetaItems);
return {
data: data,
unnormalizedData: parsedMetaItems
};
};

@@ -174,17 +174,7 @@ 'use strict';

});
return function () {
var cachedData = null;
return {
data: function data() {
if (!cachedData) {
cachedData = normalize(items);
}
return cachedData;
},
unnormalizedData: function unnormalizedData() {
return items;
}
};
}();
var data = normalize(items);
return {
data: data,
unnormalizedData: items
};
};
{
"name": "web-auto-extractor",
"version": "0.2.1",
"version": "0.3.0",
"description": "Automatically extracts structured information from webpages",

@@ -5,0 +5,0 @@ "main": "dist/index.js",

@@ -15,73 +15,126 @@ # Web Auto Extractor

## Introduction
Parse any semantically structured HTML and query on it.
```js
import WAE from 'web-auto-extractor'
import request from 'request'
const pageUrl = 'http://southernafricatravel.com/'
request(pageUrl, function (error, response, body) {
let wae = WAE.init(body)
// console.log(wae.parse())
// If the page uses microdata
let waeMicrodata = wae.parseMicrodata()
// See API for more options
// console.log(waeMicrodata.data())
// You can query on the parsed result to look for properties marked up by the page
let images = waeMicrodata.find('telephone')
// console.log(images)
})
```
#### CommonJS import style
```js
var WAE = require('web-auto-extractor').default
//ES6: import WAE from 'web-auto-extractor'
var wae = WAE.parse(sampleHTML)
console.log(wae)
/*
OUTPUT
======
{
microdata: { data: {..}, unnormalizedData: {..} },
rdfa: { data: {..}, unnormalizedData: {..} },
jsonld: { data: {..}, unnormalizedData: null },
metaTags: { data: {..}, unnormalizedData: {..} }
}
*/
```
## Installation
### Installation
`npm install web-auto-extractor`
## API
### Initializing
You first need to load the HTML to get a WAEObject.
### Usage
#### Import
```js
const wae = WAE.init('<div itemtype="Product">...</div>')
> var WAE = require('web-auto-extractor').default
//ES6: import WAE from 'web-auto-extractor'
```
Each WAEObject comes with the following set of methods
### WAEObject Methods
*NOTE: The results of these functions are **cached**, so multiple calls to them shouldn't affect performance.*
Let's use this `sampleHTML` for our example:
```html
<div itemscope itemtype="http://schema.org/Product">
<span itemprop="brand">ACME</span>
<span itemprop="name">Executive Anvil</span>
<img itemprop="image" src="anvil_executive.jpg" alt="Executive Anvil logo" />
<span itemprop="description">Sleeker than ACME's Classic Anvil, the
Executive Anvil is perfect for the business traveler
looking for something to drop from a height.
</span>
Product #: <span itemprop="mpn">925872</span>
<span itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating">
<span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89
</span> reviews
</span>
#### .parse()
Finds all supported semantically structured information on the HTML in normalized format.
<span itemprop="offers" itemscope itemtype="http://schema.org/Offer">
Regular price: $179.99
<meta itemprop="priceCurrency" content="USD" />
$<span itemprop="price">119.99</span>
(Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05">
5 November!</time>)
Available from: <span itemprop="seller" itemscope itemtype="http://schema.org/Organization">
<span itemprop="name">Executive Objects</span>
</span>
Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned,
in excellent condition
<link itemprop="availability" href="http://schema.org/InStock"/>In stock! Order now!</span>
</span>
</div>
```
#### .parseMicrodata()
Finds all Microdata information on the page and returns it as a [WAEParserObject](#waeparserobject-attributes).
#### Parsing
```js
> var wae = WAE.parse(sampleHTML)
```
This returns an object with the following attributes, each of which is of the type [WAEParserObject](#waeparserobject-attributes).
#### .parseRdfa()
Finds all RDFa-Lite information on the page and returns it as a [WAEParserObject](#waeparserobject-attributes).
- microdata
- rdfa
- jsonld
- metaTags
#### .parseJsonld()
Finds all JSON-LD information on the page and returns it as a [WAEParserObject](#waeparserobject-attributes).
```js
// Since our sampleHTML uses microdata
> var parsedMicrodata = wae.microdata
```
#### .parseMetaTags()
Finds all meta tags information on the page and returns it as a [WAEParserObject](#waeparserobject-attributes).
##### WAEParserObject Attributes
### WAEParserObject Attributes
*NOTE: The results of these functions are **cached**, so multiple calls to them shouldn't affect performance.*
#### .data()
###### .data
Gets the normalized result of the parsed format.
#### .unnormalizedData()
Gets the unnormalized flattened result of the parsed format which includes meta information relating to the parsed properties.
```js
// Let's print this out for our example
> parsedMicrodata.data
```
OUTPUT:
```json
[
{
"@context": "http://schema.org/",
"@type": "Product",
"brand": "ACME",
"name": "Executive Anvil",
"image": "anvil_executive.jpg",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {
"@context": "http://schema.org/",
"@type": "AggregateRating",
"ratingValue": "4.4",
"reviewCount": "89"
},
"offers": {
"@context": "http://schema.org/",
"@type": "Offer",
"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "5 November!",
"seller": {
"@context": "http://schema.org/",
"@type": "Organization",
"name": "Executive Objects"
},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"
}
}
]
```
#### .find(propName)
Returns a list of elements from `.unnormalizedData()` that correspond to the property with the name `[propName]`.
###### .unnormalizedData
Gets the unnormalized flattened intermediate result of the parsed format which includes meta information relating to the parsed properties.
[See test cases](https://github.com/ind9/web-auto-extractor/blob/master/test/test.js) for more examples.
For more examples, [see this output here](https://github.com/ind9/web-auto-extractor/blob/master/test/resources/expectedResult.json) which uses [this HTML](https://github.com/ind9/web-auto-extractor/blob/master/test/resources/testPage.html).
var WAE = require('web-auto-extractor').default
//ES6: import WAE from 'web-auto-extractor'
var request = require('request')
var pageUrl = 'http://southernafricatravel.com/'
//var pageUrl = 'https://raw.githubusercontent.com/ind9/web-auto-extractor/master/test/resources/testPage.html'
request(pageUrl, function (error, response, body) {
var wae = WAE.init(body)
console.log(wae.parse())
// Useful operations for you to try. Refer API section in README for more.
// var waeMicrodata = wae.parseMicrodata()
// console.log(waeMicrodata.data())
//
// var images = waeMicrodata.find('telephone')
// console.log(images)
var wae = WAE.parse(body)
console.log(wae)
/*
OUTPUT
======
{
microdata: { .. },
rdfa: { .. },
jsonld: { .. },
metaTags: { .. }
}
*/
})
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc