Socket
Socket
Sign inDemoInstall

crawler-url-parser

Package Overview
Dependencies
37
Maintainers
1
Versions
21
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 1.5.1 to 2.0.0

.vscode/settings.json

202

crawler-url-parser.js

@@ -1,43 +0,55 @@

const url = require('url');
const URL = require('url');
const psl = require('psl');
const normalizeUrl = require('normalize-url');
const cleanUrl = require('url-clean');
const cheerio = require('cheerio');
const normalize_options={
removeDirectoryIndex:false,
removeTrailingSlash:false,
stripWWW:false,
stripFragment:true,
normalizeHttps:false,
normalizeProtocol:true,
removeQueryParameters: [/^utm_\w+/i, 'ref']
}
const result_normalize_options={
removeDirectoryIndex:true,
removeTrailingSlash:true,
stripWWW:true,
stripFragment:true,
normalizeHttps:false,
normalizeProtocol:true,
removeQueryParameters: [/^utm_\w+/i, 'ref']
}
function parse(currentUrlStr,baseUrlStr)
{
let ret = {url:null,normalized:null,protocol:null,host:null,domain:null,subdomain:null,path:null}
let ret = {url:null,baseurl:null,normalized:null,protocol:null,host:null,domain:null,subdomain:null,path:null,search:null,querycount:0}
if(typeof currentUrlStr === 'undefined') return null;
let currentNormUrlStr = cleanUrl(currentUrlStr,normalize_options);
if(currentNormUrlStr === "") return null;
//check if currentUrlStr format like "aaa"
let currentNormUrlStr = normalizeUrl(currentUrlStr);
let parsedUrl = URL.parse(currentNormUrlStr,true,true);
//test for normalised url like "http://aaa"
if(/^http:\/\/[^.]+$/.test(currentNormUrlStr)){
currentNormUrlStr = currentNormUrlStr.replace("http://","");
currentNormUrlStr = currentNormUrlStr.replace("/?","?");
}
let parsedUrl = url.parse(currentNormUrlStr,true,true);
if(parsedUrl.protocol !='http:' && parsedUrl.protocol !='https:' && parsedUrl.protocol != null) return null;
//current url is relative like "abc", "/abc" or "../abc"
if(parsedUrl.host == null && typeof baseUrlStr !== "undefined")
let normalizedBaseUrl = null;
if(parsedUrl.host == null && typeof baseUrlStr !== "undefined" && baseUrlStr != null)
{
let normalizedBaseUrl = normalizeUrl(baseUrlStr,{removeTrailingSlash: false});
let parsedBaseUrl = url.parse(normalizedBaseUrl,{removeTrailingSlash: false});
let absoluteUrl = url.parse(url.resolve(parsedBaseUrl,parsedUrl));
currentUrlStr = url.format(absoluteUrl);
normalizedBaseUrl = cleanUrl(baseUrlStr,normalize_options);
ret.baseurl = normalizedBaseUrl;
let parsedBaseUrl = URL.parse(normalizedBaseUrl,normalize_options);
let absoluteUrl = URL.parse(URL.resolve(parsedBaseUrl,parsedUrl));
currentUrlStr = URL.format(absoluteUrl);
}
ret.url = currentUrlStr;
ret.normalized = normalizeUrl(currentUrlStr);
ret.normalized = cleanUrl(currentUrlStr,result_normalize_options);
if(/^http:\/\/[^.]+$/.test(ret.normalized)){
ret.normalized = ret.normalized.replace("http://","");
ret.normalized = ret.normalized.replace("/?","?");
}
parsedUrl = URL.parse(ret.normalized,true,true);
parsedUrl = url.parse(ret.normalized,true,true);
ret.protocol = parsedUrl.protocol;

@@ -53,2 +65,7 @@ ret.host = parsedUrl.host;

ret.search=parsedUrl.search;
ret.querycount = parsedUrl.search ? parsedUrl.search.split("=").length -1 : 0;
//ret.type = normalizedBaseUrl ? gettype(ret.normalized,normalizedBaseUrl):"none";
return ret;

@@ -64,13 +81,18 @@ }

let embedBaseUrl = parse(embedBaseUrlStr);
baseUrl = embedBaseUrl ? embedBaseUrl : baseUrl;
let baseUrlStr = baseUrl ? baseUrl.normalized : null;
$('a').each(function(i, el) {
let href = $(this).attr('href');
let text = $(this).text();
if(typeof href == "undefined" && href.length < 3 && /^(javascript|mailto:|ftp:)/ig.test(href)) return;
let text = $(this).text().trim();
//href = href.replace(/;.*$/g,"");
if(typeof href == "undefined" || href.length < 3 || /^(javascript|mailto:|ftp:)/ig.test(href)) return;
let currentUrl = embedBaseUrl == null ? parse(href,baseUrl.normalized) : parse(href,embedBaseUrl.normalized);
if(currentUrl == null) return;
//let currentUrl = embedBaseUrl == null ? parse(href,baseUrl.normalized) : parse(href,embedBaseUrl.normalized);
let currentUrl = parse(href,baseUrlStr);
if(!urlMap.has(currentUrl.normalized)){
urlMap.set(currentUrl.normalized,{url:currentUrl,text:text});
currentUrl.text = text == null ? "": text;
currentUrl.baseurl = baseUrlStr;
urlMap.set(currentUrl.normalized,currentUrl);
}

@@ -80,3 +102,3 @@ else{

if(! tmpUrl.text.includes(text)){
tmpUrl.text += ` ${text}`;
tmpUrl.text = `${tmpUrl.text} ${text}`;
}

@@ -87,75 +109,56 @@ }

//remove base url
urlMap.delete(baseUrl.normalized);
if(embedBaseUrl!=null){
urlMap.delete(embedBaseUrl.normalized);
}
urlMap.delete(baseUrlStr);
for (let currentUrl of urlMap.values()) {
if(baseUrl.host == currentUrl.host){
//internal
currentUrl.type="internal";
}
else if(baseUrl.domain == currentUrl.domain){
//subdomain
currentUrl.type="subdomain";
}
else{
//external
currentUrl.type="external";
}
currentUrl.type = gettype(currentUrl,baseUrl);
}
return Array.from(urlMap.values());
let retArr = Array.from(urlMap.values());
retArr = retArr.map(function(el) {
return {url:el.normalized, text:el.text, type:el.type}
});
return retArr;
}
function getlevel(current,base){
//samelevel,sublevel,uplevel
//if baseurl "sub.domain.com/aaa/bbb/ccc"
// "sub.domain.com/aaa/bbb/" - uplevel
// "sub.domain.com/aaa/ddd/" - samelevel
// "sub.domain.com/aaa/bbb/ccc/ddd" - sublevel
//else
// null
let ret = null ;
let normlizedCurrent = normalizeUrl(current);
let normlizedBase = normalizeUrl(base);
let parsedCurrentUrl = url.parse(normlizedCurrent);
let parsedBaseUrl = url.parse(normlizedBase);
function gettype(linkurl,pageurl){
if(parsedCurrentUrl.host==parsedBaseUrl.host){
let cPath = parsedCurrentUrl.pathname;
let bPath = parsedBaseUrl.pathname;
let cCount = cPath.split("/").length -1;
let bCount = bPath.split("/").length -1;
if(cCount!=bCount){
if(cPath.includes(bPath)){
ret = "uplevel";
}
else if(bPath.includes(cPath)){
ret = "sublevel"
}
if(typeof linkurl == "string") linkurl = parse(linkurl);
if(typeof pageurl == "string") pageurl = parse(pageurl);
let linkurl_subdomain_len = linkurl.subdomain ? linkurl.subdomain.length : 0;
let pageurl_subdomain_len = pageurl.subdomain ? pageurl.subdomain.length : 0;
let linkurl_path = linkurl.path ? linkurl.path : "";
let pageurl_path = pageurl.path ? pageurl.path : "";
let linkurl_parts = linkurl_path.split("/").filter(function(elem, index, array){ return elem.length > 0});
let pageurl_parts = pageurl_path.split("/").filter(function(elem, index, array){ return elem.length > 0});
if(pageurl.host == linkurl.host){
let part_count_diff = linkurl_parts.length - pageurl_parts.length;
if(part_count_diff == 0){
let linkurl_without_last_part = linkurl_path.replace(/(\/[^\/]*)[\/]?$/,"");
let pageurl_without_last_part = pageurl_path.replace(/(\/[^\/]*)[\/]?$/,"");
if(linkurl_without_last_part == pageurl_without_last_part) return "samelevel"
}
else if(cCount == bCount){
cPath = cPath.replace(/(\/[^\/]*)[\/]?$/,"");
bPath = bPath.replace(/(\/[^\/]*)[\/]?$/,"");
if(cPath == bPath){
return "samelevel";
}
else if(part_count_diff == 1){
if(linkurl_path.includes(pageurl_path)) return "sublevel";
}
else if(part_count_diff == -1){
if(pageurl_path.includes(linkurl_path)) return "uplevel";
}
return "internal";
}
return ret;
}
function querycount(current){
let ret = 0 ;
let normlizedCurrent = normalizeUrl(current);
let parsedCurrentUrl = url.parse(normlizedCurrent);
else if(linkurl.domain == pageurl.domain){
if(linkurl_subdomain_len < pageurl_subdomain_len) return "updomain";
return "subdomain";
}
if(parsedCurrentUrl.search!=null){
ret = parsedCurrentUrl.search.split("=").length -1;
}
return ret;
return "external";
}

@@ -165,5 +168,5 @@

module.exports.extract = extract;
module.exports.getlevel = getlevel;
module.exports.querycount = querycount;
module.exports.gettype = gettype;
//for testing purpose

@@ -177,3 +180,12 @@ if (!module.parent){

//debugger;
process.exit();
//let res = parse("ddd","http://www.stackoverflow.com/aaa/bbb/ccc/");
let page = 'http://journals.tubitak.gov.tr/';
let link = 'http://journals.tubitak.gov.tr/genel/telifhakki.pdf';
let res = gettype(link,page);
debugger
res = gettype(page,link);
debugger
//process.exit();
}
{
"name": "crawler-url-parser",
"version": "1.5.1",
"version": "2.0.0",
"description": "An `URL` parser for crawling purpose.",

@@ -38,9 +38,10 @@ "main": "crawler-url-parser.js",

"cheerio": "^1.0.0-rc.2",
"normalize-url": "^2.0.0",
"psl": "^1.1.20",
"url": "^0.11.0"
"url": "^0.11.0",
"url-clean": "1.0.2"
},
"devDependencies": {
"mocha": "^4.0.1",
"path": "^0.12.7"
"path": "^0.12.7",
"crawler-request": "^1.1.3"
},

@@ -47,0 +48,0 @@ "scripts": {

@@ -20,12 +20,74 @@ # crawler-url-parser

//// parse(current_url,base_url)
let url = cup.parse("../ddd","http://question.stackoverflow.com/aaa/bbb/ccc/");
console.log(url.normalized);//http://question.stackoverflow.com/aaa/bbb/ddd
console.log(url.host); // question.stackoverflow.com
console.log(url.domain); // stackoverflow.com
console.log(url.subdomain); // question
console.log(url.protocol); // http:
console.log(url.path); // /aaa/bbb/ddd
//// parse(current_url[,base_url])
let result = cup.parse("http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2");
console.log(result.url);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
console.log(result.baseurl);
// null
console.log(result.normalized);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
console.log(result.host);
// question.stackoverflow.com
console.log(result.domain);
// stackoverflow.com
console.log(result.subdomain);
// question
console.log(result.protocol);
// http:
console.log(result.path);
// /aaa/bbb/ddd
console.log(result.search);
// ?q1=query1&q2=query2
console.log(result.querycount);
// 2
```
### Parse with baseURL
```js
const cup = require('crawler-url-parser');
//// parse(current_url[,base_url])
let result = cup.parse("../ddd?q1=query1&q2=query2","http://question.stackoverflow.com/aaa/bbb/ccc/");
console.log(result.url);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
console.log(result.baseurl);
// http://question.stackoverflow.com/aaa/bbb/ccc
console.log(result.normalized);
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2
console.log(result.host);
// question.stackoverflow.com
console.log(result.domain);
// stackoverflow.com
console.log(result.subdomain);
// question
console.log(result.protocol);
// http:
console.log(result.path);
// /aaa/bbb/ddd
console.log(result.search);
// ?q1=query1&q2=query2
console.log(result.querycount);
// 2
```
### Extract

@@ -36,19 +98,33 @@ ```js

//// extract(html_str,current_url);
let htmlStr=
'html> \
<body> \
<a href="http://www.stackoverflow.com/internal-1">test-link-4</a><br /> \
<a href="http://www.stackoverflow.com/internal-2">test-link-5</a><br /> \
<a href="http://www.stackoverflow.com/internal-2">test-link-6</a><br /> \
<a href="http://faq.stackoverflow.com/subdomain-1">test-link-7</a><br /> \
<a href="http://faq.stackoverflow.com/subdomain-2">test-link-8</a><br /> \
<a href="http://faq.stackoverflow.com/subdomain-2">test-link-9</a><br /> \
<a href="http://www.google.com/external-1">test-link-10</a><br /> \
<a href="http://www.google.com/external-2">test-link-11</a><br /> \
<a href="http://www.google.com/external-2">test-link-12</a><br /> \
</body> \
</html>';
let currentUrl= "http://www.stackoverflow.com/aaa/bbb/ccc";
let htmlStr='<html><body> \
<a href="http://best.question.stackoverflow.com">subdomain</a><br /> \
<a href="http://faq.stackoverflow.com">subdomain</a><br /> \
<a href="http://stackoverflow.com">updomain</a><br /> \
<a href="http://www.google.com">external</a><br /> \
<a href="http://www.facebook.com">external</a><br /> \
<a href="http://question.stackoverflow.com/aaa/bbb/ccc">sublevel</a><br /> \
<a href="http://question.stackoverflow.com/aaa/bbb/zzz">sublevel</a><br /> \
<a href="http://question.stackoverflow.com/aaa/">uplevel</a><br /> \
<a href="http://question.stackoverflow.com/aaa/ddd">samelevel</a><br /> \
<a href="http://question.stackoverflow.com/aaa/eee">samelevel</a><br /> \
<a href="http://question.stackoverflow.com/aaa/ddd/eee">internal</a><br /> \
<a href="http://question.stackoverflow.com/zzz">internal</a><br /> \
</body></html>';
let currentUrl= "http://question.stackoverflow.com/aaa/bbb";
let urls = cup.extract(htmlStr,currentUrl);
console.log(urls.length); // 12
console.log(urls[0].type); //subdomain
console.log(urls[1].type); //subdomain
console.log(urls[2].type); //updomain
console.log(urls[3].type); //external
console.log(urls[4].type); //external
console.log(urls[5].type); //sublevel
console.log(urls[6].type); //sublevel
console.log(urls[7].type); //uplevel
console.log(urls[8].type); //samelevel
console.log(urls[9].type); //samelevel
console.log(urls[10].type); //internal
console.log(urls[11].type); //internal
```

@@ -60,25 +136,16 @@

//// getlevel(current_url,base_url);
let level = cup.getlevel("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc");
//// gettype(current_url,base_url);
let level = cup.gettype("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc");
console.log(level); //uplevel
level = cup.getlevel("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc");
level = cup.gettype("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc");
console.log(level); //sublevel
level = cup.getlevel("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc");
level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc");
console.log(level); //samelevel
level = cup.getlevel("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc");
console.log(level); //null
level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc");
console.log(level); //external
```
### Query
```js
const cup = require('crawler-url-parser');
//// querycount(url)
let count = cup.querycount("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3");
console.log(count); //3
```
## Test

@@ -85,0 +152,0 @@ `mocha` or `npm test`

@@ -281,3 +281,13 @@ const assert = require('assert');

describe('parse paths with invalid protocol "ftp://www.google.com"', function() {
it('should be null "ftp://www.google.com"', function() {
let res = cup.parse("ftp://www.google.com");
assert.equal(res,null);
});
});
describe('parse paths with invalid protocol "htp://www.google.com"', function() {
it('should be null "htp://www.google.com"', function() {

@@ -284,0 +294,0 @@ let res = cup.parse("htp://www.google.com");

const assert = require('assert');
const cup = require("../");
describe('getlevel url as samelevel, sublevel, uplevel', function() {
it('should getlevel sublevel urls', function() {
let res = cup.getlevel("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc");
assert.equal(res,"sublevel");
describe('gettype url as samelevel, sublevel, uplevel', function() {
it('should gettype sublevel urls', function() {
let res = cup.gettype("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc");
assert.equal(res,"uplevel");
});
it('should getlevel uplevel urls', function() {
let res = cup.getlevel("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc");
assert.equal(res,"uplevel");
it('should gettype uplevel urls', function() {
let res = cup.gettype("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc");
assert.equal(res,"sublevel");
});
it('should getlevel samelevel urls', function() {
let res = cup.getlevel("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc");
it('should gettype samelevel urls', function() {
let res = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc");
assert.equal(res,"samelevel");;
});
it('should handle unvalid urls', function() {
let res = cup.getlevel("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc");
assert.equal(res,null);
it('should handle invalid urls', function() {
let res = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc");
assert.equal(res,"external");
});
});

@@ -6,25 +6,25 @@ const assert = require('assert');

it('should calculate urls query-0', function() {
let res = cup.querycount("sub.domain.com/aaa/bbb");
assert.equal(res,0);
let res = cup.parse("sub.domain.com/aaa/bbb");
assert.equal(res.querycount,0);
});
it('should calculate urls query-1', function() {
let res = cup.querycount("sub.domain.com/aaa/bbb?q1=data1");
assert.equal(res,1);
let res = cup.parse("sub.domain.com/aaa/bbb?q1=data1");
assert.equal(res.querycount,1);
});
it('should calculate urls query-2', function() {
let res = cup.querycount("sub.domain.com/aaa/bbb?q1=data1&q2=data2");
assert.equal(res,2);
let res = cup.parse("sub.domain.com/aaa/bbb?q1=data1&q2=data2");
assert.equal(res.querycount,2);
});
it('should calculate urls query-3', function() {
let res = cup.querycount("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3");
assert.equal(res,3);
let res = cup.parse("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3");
assert.equal(res.querycount,3);
});
it('should calculate urls query-4', function() {
let res = cup.querycount("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3&q4=data4");
assert.equal(res,4);
let res = cup.parse("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3&q4=data4");
assert.equal(res.querycount,4);
});
});
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc