crawler-url-parser
A URL parser for crawling purposes.
Installation
npm install crawler-url-parser
Usage
Parse
const cup = require('crawler-url-parser');

// Parse an absolute URL and print every field of the result, one per line.
const parsed = cup.parse("http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");

// Fields are read in this exact order.
const parsedFields = [
  'url',
  'baseurl',
  'normalized',
  'host',
  'domain',
  'subdomain',
  'protocol',
  'path',
  'search',
  'querycount',
];

for (const field of parsedFields) {
  console.log(parsed[field]);
}
Parse with baseURL
const cup = require('crawler-url-parser');

// Parse a relative URL; the second argument is the base URL it is given with.
const relativeUrl = "../ddd?q1=query1&q2=query2";
const baseUrl = "http://question.stackoverflow.com/aaa/bbb/ccc/";
const parsed = cup.parse(relativeUrl, baseUrl);

// Print the same set of result fields as the single-argument example.
[
  'url',
  'baseurl',
  'normalized',
  'host',
  'domain',
  'subdomain',
  'protocol',
  'path',
  'search',
  'querycount',
].forEach((field) => {
  console.log(parsed[field]);
});
const cup = require('crawler-url-parser');

// Extract: pull the links out of an HTML string and print the `type` of each.
// NOTE(review): each anchor's text appears to name the type expected for that
// link relative to `pageUrl` — confirm against the library's documentation.
const htmlPieces = [
  '<html><body>',
  '<a href="http://best.question.stackoverflow.com">subdomain</a><br />',
  '<a href="http://faq.stackoverflow.com">subdomain</a><br />',
  '<a href="http://stackoverflow.com">updomain</a><br />',
  '<a href="http://www.google.com">external</a><br />',
  '<a href="http://www.facebook.com">external</a><br />',
  '<a href="http://question.stackoverflow.com/aaa/bbb/ccc">sublevel</a><br />',
  '<a href="http://question.stackoverflow.com/aaa/bbb/zzz">sublevel</a><br />',
  '<a href="http://question.stackoverflow.com/aaa/">uplevel</a><br />',
  '<a href="http://question.stackoverflow.com/aaa/ddd">samelevel</a><br />',
  '<a href="http://question.stackoverflow.com/aaa/eee">samelevel</a><br />',
  '<a href="http://question.stackoverflow.com/aaa/ddd/eee">internal</a><br />',
  '<a href="http://question.stackoverflow.com/zzz">internal</a><br />',
  '</body></html>',
];
// Pieces are joined with a single space, giving one single-line HTML string.
const html = htmlPieces.join(' ');

const pageUrl = "http://question.stackoverflow.com/aaa/bbb";
const extracted = cup.extract(html, pageUrl);

// One entry is expected per anchor above (12 in total); indexing directly
// means a missing entry fails loudly instead of being skipped.
const anchorCount = 12;
for (let index = 0; index < anchorCount; index += 1) {
  console.log(extracted[index].type);
}
Level
const cup = require('crawler-url-parser');

// Each pair is handed to gettype(first, second) in order and the returned
// value is printed.
const urlPairs = [
  ["sub.domain.com/aaa/bbb/", "sub.domain.com/aaa/bbb/ccc"],
  ["sub.domain.com/aaa/bbb/ccc/ddd", "sub.domain.com/aaa/bbb/ccc"],
  ["sub.domain.com/aaa/bbb/eee", "sub.domain.com/aaa/bbb/ccc"],
  ["sub.domain.com/aaa/bbb/eee", "sub.anotherdomain.com/aaa/bbb/ccc"],
];

for (const [first, second] of urlPairs) {
  console.log(cup.gettype(first, second));
}
Test
mocha
or npm test
The test suite contains more than 200 unit test cases.
Check the test folder and QUICKSTART.js for additional usage examples.