supercrawler
Advanced tools
Comparing version 0.7.0 to 0.8.0
@@ -28,2 +28,5 @@ var cheerio = require("cheerio"), | ||
return xmlBufProm.then(function (xmlBuf) { | ||
var locUrls, | ||
linkUrls; | ||
var $ = cheerio.load(xmlBuf, { | ||
@@ -33,11 +36,13 @@ xmlMode: true | ||
return $("sitemapindex > sitemap > loc, urlset > url > loc").map(function () { | ||
var $this; | ||
locUrls = $("sitemapindex > sitemap > loc, urlset > url > loc").map(function () { | ||
return $(this).text(); | ||
}).get(); | ||
$this = $(this); | ||
linkUrls = $("urlset > url > xhtml\\:link[href][rel=alternate]").map(function () { | ||
return $(this).attr("href"); | ||
}).get(); | ||
return $this.text(); | ||
}).get(); | ||
return locUrls.concat(linkUrls); | ||
}); | ||
}; | ||
}; |
{ | ||
"name": "supercrawler", | ||
"description": "A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.", | ||
"version": "0.7.0", | ||
"version": "0.8.0", | ||
"homepage": "https://github.com/brendonboshell/supercrawler", | ||
@@ -6,0 +6,0 @@ "author": "Brendon Boshell <brendonboshell@gmail.com>", |
@@ -304,2 +304,7 @@ # Node.js Web Crawler | ||
### 0.8.0 | ||
* [Changed] Sitemaps parser now extracts `<xhtml:link rel="alternate">` URLs, | ||
in addition to the `<loc>` URLs. | ||
### 0.7.0 | ||
@@ -309,3 +314,3 @@ | ||
a large list of URLs into the crawl queue. | ||
* [Updated] `DbUrlList` supports the bulk insert method. | ||
* [Changed] `DbUrlList` supports the bulk insert method. | ||
@@ -312,0 +317,0 @@ ### 0.6.1 |
@@ -61,2 +61,22 @@ var sitemapsParser = require("../../lib/handlers/sitemapsParser"), | ||
// Verifies that the href of a <url> entry's <xhtml:link rel="alternate">
// element is reported in addition to its <loc> URL, with the <loc> URL first.
it("discovers an alternate link", function (done) {
  // Use a local fixture: assigning to the shared `urlset` would change the
  // input seen by later tests that also read it (e.g. the .gz test below).
  var alternateUrlset = [
    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xhtml=\"http://www.w3.org/1999/xhtml\" >",
    "<url>",
    "<loc>https://example.com/home.html</loc>",
    "<xhtml:link rel=\"alternate\" hreflang=\"de\" href=\"https://example.com/home-de.html\" />",
    "</url>",
    "</urlset>"
  ].join("\n");

  // `new Buffer` is kept to match the other tests in this file.
  sp(new Buffer(alternateUrlset), "http://example.com/sitemap_index.xml").then(function (urls) {
    expect(urls).to.deep.equal([
      "https://example.com/home.html",
      "https://example.com/home-de.html"
    ]);
  }).then(function () {
    done();
  }, done); // report assertion failures/rejections instead of timing out
});
it("supports a .gz sitemap file", function (done) { | ||
@@ -63,0 +83,0 @@ Promise.promisify(zlib.gzip)(new Buffer(urlset)).then(function (buf) { |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
95962
2267
377