web-auto-extractor
Advanced tools
Comparing version 0.2.1 to 0.3.0
@@ -25,73 +25,12 @@ 'use strict'; | ||
/**
 * Creates the query helper that is mixed into a parser result.
 *
 * The returned object only provides `find`; it expects the object it is
 * merged into to expose an `unnormalizedData()` method, because `find`
 * reads the items to search through `this.unnormalizedData()`.
 *
 * @returns {{find: function(string): Array}} helper object with a `find` method.
 */
var WAEParserObject = function WAEParserObject() {
  // Null-prototype cache: with a plain `{}` a lookup such as
  // find('constructor') or find('toString') would hit Object.prototype and
  // return a function instead of a list, and find('__proto__') would
  // overwrite the cache's prototype.
  var result = Object.create(null);
  return {
    /**
     * Returns every unnormalized item whose `name` equals `propName`.
     * The list is computed once per property name and then reused.
     *
     * @param {string} propName - property name to look for.
     * @returns {Array} matching items (empty when nothing matches).
     */
    find: function find(propName) {
      if (!result[propName]) {
        var items = this.unnormalizedData();
        // Store the list only after the scan succeeded, so a throwing
        // unnormalizedData() does not leave an empty result cached.
        result[propName] = Object.keys(items).map(function (key) {
          return items[key];
        }).filter(function (item) {
          return item.name === propName;
        });
      }
      return result[propName];
    }
  };
};
/**
 * Creates the object exposing the per-format parse methods.
 *
 * Every `parse*` method reads the loaded document from `this.$html`, merges
 * the parser output into a fresh `WAEParserObject()` and keeps the merged
 * object in a closure variable, so each format is parsed at most once per
 * WAEObject instance.
 */
var WAEObject = function WAEObject() {
  // One lazily filled slot per supported format.
  var microResult;
  var rdfaResult;
  var jsonldResult;
  var metaResult;

  return {
    // Microdata, parsed on first use.
    parseMicrodata: function parseMicrodata() {
      if (microResult) {
        return microResult;
      }
      microResult = Object.assign(WAEParserObject(), (0, _microRdfaParser2.default)(this.$html, 'micro'));
      return microResult;
    },

    // RDFa, parsed on first use.
    parseRdfa: function parseRdfa() {
      if (rdfaResult) {
        return rdfaResult;
      }
      rdfaResult = Object.assign(WAEParserObject(), (0, _microRdfaParser2.default)(this.$html, 'rdfa'));
      return rdfaResult;
    },

    // JSON-LD, parsed on first use.
    parseJsonld: function parseJsonld() {
      if (jsonldResult) {
        return jsonldResult;
      }
      jsonldResult = Object.assign(WAEParserObject(), (0, _jsonldParser2.default)(this.$html));
      return jsonldResult;
    },

    // Meta tags, parsed on first use.
    parseMetaTags: function parseMetaTags() {
      if (metaResult) {
        return metaResult;
      }
      metaResult = Object.assign(WAEParserObject(), (0, _metatagParser2.default)(this.$html));
      return metaResult;
    },

    // Runs every parser and collects the `data()` of each, keyed by format.
    parse: function parse() {
      var metaData = this.parseMetaTags().data();
      var microData = this.parseMicrodata().data();
      var rdfaData = this.parseRdfa().data();
      var jsonldData = this.parseJsonld().data();
      return {
        meta: metaData,
        micro: microData,
        rdfa: rdfaData,
        jsonld: jsonldData
      };
    }
  };
};
exports.default = { | ||
init: function init(html) { | ||
parse: function parse(html) { | ||
var $html = _cheerio2.default.load(html, { xmlMode: true }); | ||
return Object.assign({}, WAEObject(), { | ||
$html: $html | ||
}); | ||
return { | ||
metaTags: (0, _metatagParser2.default)($html), | ||
microdata: (0, _microRdfaParser2.default)($html, 'micro'), | ||
rdfa: (0, _microRdfaParser2.default)($html, 'rdfa'), | ||
jsonld: (0, _jsonldParser2.default)($html) | ||
}; | ||
} | ||
}; |
@@ -22,8 +22,4 @@ 'use strict'; | ||
return { | ||
data: function data() { | ||
return jsonldData; | ||
}, | ||
unnormalizedData: function unnormalizedData() { | ||
return jsonldData; | ||
} | ||
data: jsonldData, | ||
unnormalizedData: null | ||
}; | ||
@@ -30,0 +26,0 @@ }; |
@@ -51,16 +51,7 @@ 'use strict'; | ||
}); | ||
return function () { | ||
var cachedData = null; | ||
return { | ||
data: function data() { | ||
if (!cachedData) { | ||
cachedData = normalize(parsedMetaItems); | ||
} | ||
return cachedData; | ||
}, | ||
unnormalizedData: function unnormalizedData() { | ||
return parsedMetaItems; | ||
} | ||
}; | ||
}(); | ||
var data = normalize(parsedMetaItems); | ||
return { | ||
data: data, | ||
unnormalizedData: parsedMetaItems | ||
}; | ||
}; |
@@ -174,17 +174,7 @@ 'use strict'; | ||
}); | ||
return function () { | ||
var cachedData = null; | ||
return { | ||
data: function data() { | ||
if (!cachedData) { | ||
cachedData = normalize(items); | ||
} | ||
return cachedData; | ||
}, | ||
unnormalizedData: function unnormalizedData() { | ||
return items; | ||
} | ||
}; | ||
}(); | ||
var data = normalize(items); | ||
return { | ||
data: data, | ||
unnormalizedData: items | ||
}; | ||
}; |
{ | ||
"name": "web-auto-extractor", | ||
"version": "0.2.1", | ||
"version": "0.3.0", | ||
"description": "Automatically extracts structured information from webpages", | ||
@@ -5,0 +5,0 @@ "main": "dist/index.js", |
155
README.md
@@ -15,73 +15,126 @@ # Web Auto Extractor | ||
## Introduction | ||
Parse any semantically structured HTML and query on it. | |
```js | ||
import WAE from 'web-auto-extractor' | ||
import request from 'request' | ||
const pageUrl = 'http://southernafricatravel.com/' | ||
request(pageUrl, function (error, response, body) { | ||
let wae = WAE.init(body) | ||
// console.log(wae.parse()) | ||
// If the page uses microdata | ||
let waeMicrodata = wae.parseMicrodata() | ||
// See API for more options | ||
// console.log(waeMicrodata.data()) | ||
// You can query on the parsed result to look for properties marked up by the page | ||
let images = waeMicrodata.find('telephone') | ||
// console.log(images) | ||
}) | ||
``` | ||
#### CommonJS import style | ||
```js | ||
var WAE = require('web-auto-extractor').default | ||
//ES6: import WAE from 'web-auto-extractor' | ||
var wae = WAE.parse(sampleHTML) | ||
console.log(wae) | ||
/* | ||
OUTPUT | ||
====== | ||
{ | ||
microdata: { data: {..}, unnormalizedData: {..} }, | ||
rdfa: { data: {..}, unnormalizedData: {..} }, | ||
jsonld: { data: {..}, unnormalizedData: null }, | |
metaTags: { data: {..}, unnormalizedData: {..} } | ||
} | ||
*/ | ||
``` | ||
## Installation | ||
### Installation | ||
`npm install web-auto-extractor` | ||
## API | ||
### Initializing | ||
You would first need to load in the HTML to get a WAEObject | ||
### Usage | ||
#### Import | ||
```js | ||
const wae = WAE.init('<div itemtype="Product">...</div>') | ||
> var WAE = require('web-auto-extractor').default | ||
//ES6: import WAE from 'web-auto-extractor' | ||
``` | ||
Each WAEObject comes with the following set of methods | ||
### WAEObject Methods | ||
*NOTE: The results of these functions are **cached**, so multiple calls to them shouldn't affect performance.* | |
Let's use this `sampleHTML` for our example | |
```html | ||
<div itemscope itemtype="http://schema.org/Product"> | ||
<span itemprop="brand">ACME</span> | ||
<span itemprop="name">Executive Anvil</span> | ||
<img itemprop="image" src="anvil_executive.jpg" alt="Executive Anvil logo" /> | ||
<span itemprop="description">Sleeker than ACME's Classic Anvil, the | ||
Executive Anvil is perfect for the business traveler | ||
looking for something to drop from a height. | ||
</span> | ||
Product #: <span itemprop="mpn">925872</span> | ||
<span itemprop="aggregateRating" itemscope itemtype="http://schema.org/AggregateRating"> | ||
<span itemprop="ratingValue">4.4</span> stars, based on <span itemprop="reviewCount">89 | ||
</span> reviews | ||
</span> | ||
#### .parse() | ||
Finds all supported semantically structured information in the HTML and returns it in normalized format. | |
<span itemprop="offers" itemscope itemtype="http://schema.org/Offer"> | ||
Regular price: $179.99 | ||
<meta itemprop="priceCurrency" content="USD" /> | ||
$<span itemprop="price">119.99</span> | ||
(Sale ends <time itemprop="priceValidUntil" datetime="2020-11-05"> | ||
5 November!</time>) | ||
Available from: <span itemprop="seller" itemscope itemtype="http://schema.org/Organization"> | ||
<span itemprop="name">Executive Objects</span> | ||
</span> | ||
Condition: <link itemprop="itemCondition" href="http://schema.org/UsedCondition"/>Previously owned, | ||
in excellent condition | ||
<link itemprop="availability" href="http://schema.org/InStock"/>In stock! Order now!</span> | ||
</span> | ||
</div> | ||
``` | ||
#### .parseMicrodata() | ||
Finds all Microdata information on the page and returns it as a [WAEParserObject](#waeparserobject-attributes). | ||
#### Parsing | ||
```js | ||
> var wae = WAE.parse(sampleHTML) | ||
``` | ||
This returns an object with the following attributes, each of which is of the type [WAEParserObject](#waeparserobject-attributes). | ||
#### .parseRdfa() | ||
Finds all RDFa-Lite information on the page and returns it as a [WAEParserObject](#waeparserobject-attributes). | ||
- microdata | ||
- rdfa | ||
- jsonld | ||
- metaTags | ||
#### .parseJsonld() | ||
Finds all JSON-LD information on the page and returns it as a [WAEParserObject](#waeparserobject-attributes). | ||
```js | ||
// Since our sampleHTML uses microdata | ||
> var parsedMicrodata = wae.microdata | ||
``` | ||
#### .parseMetaTags() | ||
Finds all meta tags information on the page and returns it as a [WAEParserObject](#waeparserobject-attributes). | ||
##### WAEParserObject Attributes | ||
### WAEParserObject Attributes | ||
*NOTE: The results of these functions are **cached**, so multiple calls to them shouldn't affect performance.* | |
#### .data() | ||
###### .data | ||
Gets the normalized result of the parsed format. | ||
#### .unnormalizedData() | ||
Gets the unnormalized flattened result of the parsed format which includes meta information relating to the parsed properties. | ||
```js | ||
// Let's print this out for our example | ||
> parsedMicrodata.data | ||
``` | ||
OUTPUT: | ||
```json | ||
[ | ||
{ | ||
"@context": "http://schema.org/", | ||
"@type": "Product", | ||
"brand": "ACME", | ||
"name": "Executive Anvil", | ||
"image": "anvil_executive.jpg", | ||
"description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", | ||
"mpn": "925872", | ||
"aggregateRating": { | ||
"@context": "http://schema.org/", | ||
"@type": "AggregateRating", | ||
"ratingValue": "4.4", | ||
"reviewCount": "89" | ||
}, | ||
"offers": { | ||
"@context": "http://schema.org/", | ||
"@type": "Offer", | ||
"priceCurrency": "USD", | ||
"price": "119.99", | ||
"priceValidUntil": "5 November!", | ||
"seller": { | ||
"@context": "http://schema.org/", | ||
"@type": "Organization", | ||
"name": "Executive Objects" | ||
}, | ||
"itemCondition": "http://schema.org/UsedCondition", | ||
"availability": "http://schema.org/InStock" | ||
} | ||
} | ||
] | ||
``` | ||
#### .find(propName) | ||
Returns a list of items from `.unnormalizedData()` whose `name` matches `propName`. | |
###### .unnormalizedData | ||
Gets the unnormalized flattened intermediate result of the parsed format which includes meta information relating to the parsed properties. | ||
[See test cases](https://github.com/ind9/web-auto-extractor/blob/master/test/test.js) for more examples. | ||
For more examples, [see this output here](https://github.com/ind9/web-auto-extractor/blob/master/test/resources/expectedResult.json) which uses [this HTML](https://github.com/ind9/web-auto-extractor/blob/master/test/resources/testPage.html) |
var WAE = require('web-auto-extractor').default | ||
//ES6: import WAE from 'web-auto-extractor' | ||
var request = require('request') | ||
var pageUrl = 'http://southernafricatravel.com/' | ||
//var pageUrl = 'https://raw.githubusercontent.com/ind9/web-auto-extractor/master/test/resources/testPage.html' | ||
request(pageUrl, function (error, response, body) { | ||
var wae = WAE.init(body) | ||
console.log(wae.parse()) | ||
// Useful operations for you to try. Refer API section in README for more. | ||
// var waeMicrodata = wae.parseMicrodata() | ||
// console.log(waeMicrodata.data()) | ||
// | ||
// var images = waeMicrodata.find('telephone') | ||
// console.log(images) | ||
var wae = WAE.parse(body) | ||
console.log(wae) | ||
/* | ||
OUTPUT | ||
====== | ||
{ | ||
microdata: { .. }, | ||
rdfa: { .. }, | ||
jsonld: { .. }, | ||
metaTags: { .. } | ||
} | ||
*/ | ||
}) |
140
17597
300