Socket
Socket
Sign inDemoInstall

web-auto-extractor

Package Overview
Dependencies
98
Maintainers
1
Versions
38
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 1.0.0-beta.6 to 1.0.0

43

dist/index.js

@@ -7,2 +7,28 @@ 'use strict';

exports.default = function () {
var $html = null;
var loadCheerioObject = function loadCheerioObject(_$html) {
$html = _$html;
};
var parse = function parse(html, options) {
if (!($html && $html.prototype && $html.prototype.cheerio)) {
$html = _cheerio2.default.load(html, options);
}
return {
metatags: (0, _metatagParser2.default)($html),
microdata: (0, _microRdfaParser2.default)(html, 'micro'),
rdfa: (0, _microRdfaParser2.default)(html, 'rdfa'),
jsonld: (0, _jsonldParser2.default)($html)
};
};
return {
parse: parse,
loadCheerioObject: loadCheerioObject
};
};
require('babel-polyfill');

@@ -26,17 +52,2 @@

/**
 * Normalizes a required module so its default export is reachable as `.default`.
 *
 * @param {*} obj - The value returned by `require`.
 * @returns {*} `obj` itself when it is truthy and has a truthy `__esModule`
 *   property; otherwise a new object `{ default: obj }`.
 */
function _interopRequireDefault(obj) {
  if (obj && obj.__esModule) {
    return obj;
  }
  return { default: obj };
}
exports.default = {
parse: function parse(html, $html) {
if (!($html && $html.prototype && $html.prototype.cheerio)) {
$html = _cheerio2.default.load(html, { xmlMode: true });
}
return {
metatags: (0, _metatagParser2.default)($html),
microdata: (0, _microRdfaParser2.default)(html, 'micro', $html),
rdfa: (0, _microRdfaParser2.default)(html, 'rdfa', $html),
jsonld: (0, _jsonldParser2.default)($html)
};
}
};
/**
 * Normalizes a required module so its default export is reachable as `.default`.
 *
 * @param {*} obj - The value returned by `require`.
 * @returns {*} `obj` itself when it is truthy and has a truthy `__esModule`
 *   property; otherwise a new object `{ default: obj }`.
 */
function _interopRequireDefault(obj) {
  if (obj && obj.__esModule) {
    return obj;
  }
  return { default: obj };
}

@@ -145,3 +145,3 @@ 'use strict';

exports.default = function (html, specName, $) {
exports.default = function (html, specName) {
var handler = createHandler(specName);

@@ -148,0 +148,0 @@ new _htmlparser2.default.Parser(handler).end(html);

{
"name": "web-auto-extractor",
"version": "1.0.0-beta.6",
"version": "1.0.0",
"description": "Automatically extracts structured information from webpages",

@@ -5,0 +5,0 @@ "main": "dist/index.js",

# Web Auto Extractor
[![Build Status](https://travis-ci.org/ind9/web-auto-extractor.svg?branch=master)](https://travis-ci.org/ind9/web-auto-extractor)
Automatically extracts semantically structured information from any HTML webpage.
Parse semantically structured information from any HTML webpage.
Supported formats:-
- Formats that support Schema.org vocabularies:-
- Encodings that support [Schema.org](http://schema.org/) vocabularies:-
- Microdata
- RDFa-lite
- JSON-LD
- Miscellaneous meta tags
- Random Meta tags
Popularly, many websites mark up their webpages with Schema.org vocabularies for better SEO. This library helps you parse that information to JSON.
**[Demo](https://tonicdev.com/npm/web-auto-extractor)** it on tonicdev
## Installation
`npm install web-auto-extractor`
## Usage
```js
// IF CommonJS
var WAE = require('web-auto-extractor').default
//ES6: import WAE from 'web-auto-extractor'
var wae = WAE.parse(sampleHTML)
console.log(wae)
/*
OUTPUT
======
{
microdata: { data: {..}, unnormalizedData: {..} },
rdfa: { data: {..}, unnormalizedData: {..} },
jsonld: { data: {..}, unnormalizedData: null },
metatags: { data: {..}, unnormalizedData: {..} }
}
*/
```
// IF ES6
import WAE from 'web-auto-extractor'
### Installation
`npm install web-auto-extractor`
var parsed = WAE().parse(sampleHTML)
```
### Usage
Let's use the following text as the `sampleHTML` in our example. It uses Schema.org vocabularies to structure product information and is encoded in `microdata` format.
#### Import
```js
> var WAE = require('web-auto-extractor').default
//ES6: import WAE from 'web-auto-extractor'
```
Let's use this `sampleHTML` for our example
```html

@@ -76,65 +65,78 @@ <div itemscope itemtype="http://schema.org/Product">

#### Parsing
```js
> var wae = WAE.parse(sampleHTML)
```
This returns an object with the following attributes, each of which is of the type [WAEParserObject](#waeparserobject-attributes).
#### Result
- microdata
- rdfa
- jsonld
- metatags
Our `parsed` object should look like -
```js
// Since our sampleHTML uses microdata
> var parsedMicrodata = wae.microdata
```json
{
"microdata": {
"Product": [
{
"@context": "http://schema.org/",
"@type": "Product",
"brand": "ACME",
"name": "Executive Anvil",
"image": "anvil_executive.jpg",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {
"@context": "http://schema.org/",
"@type": "AggregateRating",
"ratingValue": "4.4",
"reviewCount": "89"
},
"offers": {
"@context": "http://schema.org/",
"@type": "Offer",
"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "5 November!",
"seller": {
"@context": "http://schema.org/",
"@type": "Organization",
"name": "Executive Objects"
},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"
}
}
]
},
"rdfa": {},
"jsonld": {},
"metatags": {
"priceCurrency": [
"USD",
"USD"
]
}
}
```
##### WAEParserObject Attributes
The `parsed` object includes four objects - `microdata`, `rdfa`, `jsonld` and `metatags`. Since the above HTML does not have any information encoded in `rdfa` and `jsonld`, those two objects are empty.
###### .data
Gets the normalized result of the parsed format.
## Caveat
```js
// Let's print this out for our example
> parsedMicrodata.data
This is less a caveat than a design choice: the parser is strict. If the HTML is not marked up correctly, the output may not match what you expect, which can make the parser look broken.
For example, take the following HTML snippet.
```html
<div itemscope itemtype="http://schema.org/Movie">
<h1 itemprop="name">Ghostbusters</h1>
<div itemprop="productionCompany" itemscope itemtype="http://schema.org/Organization">Black Rhino</div>
<div itemprop="countryOfOrigin" itemscope itemtype="http://schema.org/Country">
Country: <span itemprop="name" content="USA">United States</span><p>
</div>
</div>
```
OUTPUT:
```json
[
{
"@context": "http://schema.org/",
"@type": "Product",
"brand": "ACME",
"name": "Executive Anvil",
"image": "anvil_executive.jpg",
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.",
"mpn": "925872",
"aggregateRating": {
"@context": "http://schema.org/",
"@type": "AggregateRating",
"ratingValue": "4.4",
"reviewCount": "89"
},
"offers": {
"@context": "http://schema.org/",
"@type": "Offer",
"priceCurrency": "USD",
"price": "119.99",
"priceValidUntil": "5 November!",
"seller": {
"@context": "http://schema.org/",
"@type": "Organization",
"name": "Executive Objects"
},
"itemCondition": "http://schema.org/UsedCondition",
"availability": "http://schema.org/InStock"
}
}
]
```
###### .unnormalizedData
Gets the unnormalized flattened intermediate result of the parsed format which includes meta information relating to the parsed properties.
The problem here is that the `productionCompany` `itemprop`, whose `itemtype` is `Organization`, has no child `itemprop` (in this case, `name`).
For more examples, [See this output here](https://github.com/ind9/web-auto-extractor/blob/master/test/resources/expectedResult.json) which uses [this HTML](https://github.com/ind9/web-auto-extractor/blob/master/test/resources/testPage.html)
The parser assumes every `itemtype` contains an `itemprop`, or every `typeof` contains a `property` in case of `rdfa`. So the `"Black Rhino"` information is lost.
It would be nice to fix this by adding a `non-strict` mode for parsing this information. PRs are welcome.
## License
MIT

@@ -9,14 +9,9 @@ var WAE = require('web-auto-extractor').default

request(pageUrl, function (error, response, body) {
var wae = WAE.parse(body)
var wae = WAE()
var parsed = wae.parse(body)
console.log(parsed)
/*
OUTPUT
======
{
microdata: { .. },
rdfa: { .. },
jsonld: { .. },
metatags: { .. }
}
*/
})
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc