@extractus/feed-extractor
Advanced tools
Comparing version 7.0.4 to 7.0.5
@@ -65,3 +65,3 @@ { | ||
{ | ||
"max": 460, | ||
"max": 520, | ||
"skipBlankLines": true, | ||
@@ -74,3 +74,3 @@ "skipComments": false | ||
{ | ||
"max": 150, | ||
"max": 240, | ||
"skipBlankLines": true | ||
@@ -77,0 +77,0 @@ } |
{ | ||
"version": "7.0.4", | ||
"version": "7.0.5", | ||
"name": "@extractus/feed-extractor", | ||
@@ -4,0 +4,0 @@ "description": "To read and normalize RSS/ATOM/JSON feed data", |
@@ -117,2 +117,3 @@ # feed-extractor | ||
- [RSS Feed](https://www.rssboard.org/rss-specification) | ||
- [RDF Feed](https://web.resource.org/rss/1.0/spec) | ||
- [ATOM Feed](https://datatracker.ietf.org/doc/html/rfc4287) | ||
@@ -119,0 +120,0 @@ - [JSON Feed](https://www.jsonfeed.org/version/1.1/) |
@@ -6,6 +6,7 @@ // main.js | ||
import retrieve from './utils/retrieve.js' | ||
import { validate, xml2obj, isRSS, isAtom } from './utils/xmlparser.js' | ||
import { validate, xml2obj, isRSS, isAtom, isRdf } from './utils/xmlparser.js' | ||
import parseJsonFeed from './utils/parseJsonFeed.js' | ||
import parseRssFeed from './utils/parseRssFeed.js' | ||
import parseAtomFeed from './utils/parseAtomFeed.js' | ||
import parseRdfFeed from './utils/parseRdfFeed.js' | ||
@@ -46,2 +47,3 @@ const getopt = (options = {}) => { | ||
const data = xml2obj(xml, opts.xmlParserOptions) | ||
return isRSS(data) | ||
@@ -51,3 +53,5 @@ ? parseRssFeed(data, opts) | ||
? parseAtomFeed(data, opts) | ||
: null | ||
: isRdf(data) | ||
? parseRdfFeed(data, opts) | ||
: null | ||
} | ||
@@ -54,0 +58,0 @@ |
@@ -141,2 +141,26 @@ // main.test | ||
test('extract rdf feed from Slashdot with extraFields', async () => { | ||
const url = 'https://some-news-page.tld/atom' | ||
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8') | ||
const { baseUrl, path } = parseUrl(url) | ||
nock(baseUrl).get(path).reply(200, xml, { | ||
'Content-Type': 'application/xml', | ||
}) | ||
const result = await extract(url, { | ||
getExtraFeedFields: data => { | ||
return { | ||
subject: data['dc:subject'], | ||
} | ||
}, | ||
getExtraEntryFields: data => { | ||
return { | ||
author: data['dc:creator'], | ||
} | ||
}, | ||
}) | ||
expect(hasProperty(result, 'subject')).toBe(true) | ||
expect(hasProperty(result.entries[0], 'author')).toBe(true) | ||
expect(validateProps(result.entries[0])).toBe(true) | ||
}) | ||
test('extract atom feed which contains multi links', async () => { | ||
@@ -295,2 +319,18 @@ const url = 'https://some-news-page.tld/atom/multilinks' | ||
test('extract rdf feed from Slashdot without normalization', async () => { | ||
const url = 'https://some-news-page.tld/atom' | ||
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8') | ||
const { baseUrl, path } = parseUrl(url) | ||
nock(baseUrl).get(path).reply(200, xml, { | ||
'Content-Type': 'application/xml', | ||
}) | ||
const result = await extract(url, { | ||
normalization: false, | ||
}) | ||
expect(hasProperty(result.channel, 'syn:updateBase')).toBe(true) | ||
expect(hasProperty(result.channel, 'dc:rights')).toBe(true) | ||
expect(hasProperty(result, 'item')).toBe(true) | ||
expect(hasProperty(result.item[0], 'slash:department')).toBe(true) | ||
}) | ||
test('extract atom feed from Google', async () => { | ||
@@ -363,3 +403,3 @@ const url = 'https://some-news-page.tld/atom' | ||
describe('test extract with `baseUrl` option', () => { | ||
test('extract rss feed with xml', () => { | ||
test('extract rss feed from file', () => { | ||
const baseUrl = 'https://huggingface.co' | ||
@@ -382,3 +422,22 @@ const xml = readFileSync('test-data/rss-feed-miss-base-url.xml', 'utf8') | ||
test('extract rss feed with json', () => { | ||
test('extract rdf feed from file', () => { | ||
const baseUrl = 'https://slashdot.org' | ||
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8') | ||
const result = extractFromXml(xml, { baseUrl }) | ||
feedAttrs.forEach((k) => { | ||
expect(hasProperty(result, k)).toBe(true) | ||
}) | ||
entryAttrs.forEach((k) => { | ||
expect(hasProperty(result.entries[0], k)).toBe(true) | ||
}) | ||
expect(validateProps(result.entries[0])).toBe(true) | ||
expect(result.link).toBe(baseUrl + '/') | ||
const firstItemLink = result.entries[0].link | ||
expect(firstItemLink.startsWith('https://tech.slashdot.org/story/23/08/23/2238246/spacex-')).toBe(true) | ||
}) | ||
test('extract json feed from file', () => { | ||
const baseUrl = 'https://www.jsonfeed.org' | ||
@@ -385,0 +444,0 @@ const json = readFileSync('test-data/json-feed-miss-base-url.json', 'utf8') |
@@ -101,4 +101,6 @@ // parseAtomFeed.js | ||
const feedData = data.feed | ||
if (!normalization) { | ||
return flatten(data.feed, baseUrl) | ||
return flatten(feedData, baseUrl) | ||
} | ||
@@ -115,5 +117,5 @@ | ||
entry: item = [], | ||
} = data.feed | ||
} = feedData | ||
const extraFields = getExtraFeedFields(data.feed) | ||
const extraFields = getExtraFeedFields(feedData) | ||
@@ -120,0 +122,0 @@ const items = isArray(item) ? item : [item] |
@@ -106,4 +106,6 @@ // parseRssFeed.js | ||
const feedData = data.rss.channel | ||
if (!normalization) { | ||
return flatten(data.rss.channel, baseUrl) | ||
return flatten(feedData, baseUrl) | ||
} | ||
@@ -119,5 +121,5 @@ | ||
item = [], | ||
} = data.rss.channel | ||
} = feedData | ||
const extraFields = getExtraFeedFields(data.rss.channel) | ||
const extraFields = getExtraFeedFields(feedData) | ||
@@ -124,0 +126,0 @@ const items = isArray(item) ? item : [item] |
@@ -15,2 +15,6 @@ // utils / xmlparser | ||
export const isRdf = (data = {}) => { | ||
return hasProperty(data, 'rdf:RDF') && hasProperty(data['rdf:RDF'], 'channel') | ||
} | ||
export const validate = (xml) => { | ||
@@ -22,5 +26,5 @@ return (!isString(xml) || !xml.length) ? false : XMLValidator.validate(xml) === true | ||
const options = { | ||
attributeNamePrefix: '@_', | ||
ignoreAttributes: false, | ||
...extraOptions, | ||
ignoreAttributes: false, | ||
attributeNamePrefix: '@_', | ||
} | ||
@@ -27,0 +31,0 @@ const parser = new XMLParser(options) |
64966
26
1701
381