# crawler-url-parser

A URL parser for crawling purposes.
## Installation

```shell
npm install crawler-url-parser
```
## Usage

### Parse
```js
const cup = require('crawler-url-parser');

// Resolve a relative link against the URL of the page it was found on.
let url = cup.parse("../ddd", "http://question.stackoverflow.com/aaa/bbb/ccc/");
console.log(url.normalized); // resolved absolute URL, e.g. http://question.stackoverflow.com/aaa/bbb/ddd
console.log(url.host);       // question.stackoverflow.com
console.log(url.domain);     // registered domain, e.g. stackoverflow.com
console.log(url.subdomain);  // e.g. question
console.log(url.protocol);   // e.g. http
console.log(url.path);       // e.g. /aaa/bbb/ddd
```
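Because `parse` exposes the registered domain separately from the host, it can be used to decide whether a link stays on the same site. A minimal sketch using only the `parse(url, currentUrl)` call shown above; the `isSameDomain` helper is hypothetical, not part of the library:

```js
const cup = require('crawler-url-parser');

// Hypothetical helper: true when a link resolves to the same registered
// domain as the page it was found on.
function isSameDomain(href, currentUrl) {
  const target = cup.parse(href, currentUrl);
  const current = cup.parse(currentUrl, currentUrl);
  return target.domain === current.domain;
}

console.log(isSameDomain("../ddd", "http://question.stackoverflow.com/aaa/bbb/ccc/"));      // true
console.log(isSameDomain("http://www.google.com/x", "http://question.stackoverflow.com/")); // false
```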
### Extract

```js
const cup = require('crawler-url-parser');

let htmlStr =
'<html> \
<body> \
<a href="http://www.stackoverflow.com/internal-1">test-link-4</a><br /> \
<a href="http://www.stackoverflow.com/internal-2">test-link-5</a><br /> \
<a href="http://www.stackoverflow.com/internal-2">test-link-6</a><br /> \
<a href="http://faq.stackoverflow.com/subdomain-1">test-link-7</a><br /> \
<a href="http://faq.stackoverflow.com/subdomain-2">test-link-8</a><br /> \
<a href="http://faq.stackoverflow.com/subdomain-2">test-link-9</a><br /> \
<a href="http://www.google.com/external-1">test-link-10</a><br /> \
<a href="http://www.google.com/external-2">test-link-11</a><br /> \
<a href="http://www.google.com/external-2">test-link-12</a><br /> \
</body> \
</html>';

// Extract all links from the HTML string, resolved against the current URL.
let currentUrl = "http://www.stackoverflow.com/aaa/bbb/ccc";
let urls = cup.extract(htmlStr, currentUrl);
console.log(urls.length); // number of extracted links
```
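The extracted entries can then be fed into a crawl queue. A minimal sketch, assuming each entry in the returned array exposes a `normalized` field like the `parse()` result; deduplication via `Set` is illustrative, not part of the library:

```js
// Collect and print unique normalized URLs from the extraction result.
// Assumes each entry has a `normalized` property (assumption).
const seen = new Set();
for (const u of urls) {
  if (!seen.has(u.normalized)) {
    seen.add(u.normalized);
    console.log(u.normalized);
  }
}
```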
### Level
```js
const cup = require('crawler-url-parser');

let level = cup.getlevel("sub.domain.com/aaa/bbb/", "sub.domain.com/aaa/bbb/ccc");
console.log(level); // first URL is one path level above the second

level = cup.getlevel("sub.domain.com/aaa/bbb/ccc/ddd", "sub.domain.com/aaa/bbb/ccc");
console.log(level); // first URL is one path level below the second

level = cup.getlevel("sub.domain.com/aaa/bbb/eee", "sub.domain.com/aaa/bbb/ccc");
console.log(level); // sibling paths at the same depth

level = cup.getlevel("sub.domain.com/aaa/bbb/eee", "sub.anotherdomain.com/aaa/bbb/ccc");
console.log(level); // URLs on different domains
```
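In a crawler, `getlevel` can serve as a cheap same-site check before enqueueing a link. A sketch under the assumption that `getlevel` returns a number for URLs on the same host and a non-numeric value otherwise; `sameSite` is a hypothetical helper:

```js
const cup = require('crawler-url-parser');

// Hypothetical filter: keep only links whose level relative to the start
// URL is defined, i.e. both URLs live on the same host (assumption).
function sameSite(candidateUrl, startUrl) {
  return typeof cup.getlevel(candidateUrl, startUrl) === 'number';
}

console.log(sameSite("sub.domain.com/aaa", "sub.domain.com/bbb/ccc"));    // expected: true
console.log(sameSite("sub.domain.com/aaa", "sub.anotherdomain.com/bbb")); // expected: false (assumption)
```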
### Query
```js
const cup = require('crawler-url-parser');

// Count the query-string parameters of a URL.
let count = cup.querycount("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3");
console.log(count); // 3
```
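Counting query parameters is handy for avoiding crawler traps such as faceted search or calendar pages. A minimal sketch; the `worthCrawling` helper and its threshold are illustrative, assuming only that `querycount(url)` returns the number of key=value pairs as shown above:

```js
const cup = require('crawler-url-parser');

// Hypothetical filter: skip URLs that carry too many query parameters.
function worthCrawling(url, maxParams = 3) {
  return cup.querycount(url) <= maxParams;
}

console.log(worthCrawling("sub.domain.com/aaa?q1=1&q2=2"));       // true
console.log(worthCrawling("sub.domain.com/aaa?a=1&b=2&c=3&d=4")); // false
```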
## Test

```shell
mocha
# or
npm test
```

The suite contains more than 200 unit test cases. Check the `test` folder and `QUICKSTART.js` for more usage examples.