crawler-url-parser
Advanced tools
Comparing version 1.5.1 to 2.0.0
@@ -1,43 +0,55 @@ | ||
const url = require('url'); | ||
const URL = require('url'); | ||
const psl = require('psl'); | ||
const normalizeUrl = require('normalize-url'); | ||
const cleanUrl = require('url-clean'); | ||
const cheerio = require('cheerio'); | ||
const normalize_options={ | ||
removeDirectoryIndex:false, | ||
removeTrailingSlash:false, | ||
stripWWW:false, | ||
stripFragment:true, | ||
normalizeHttps:false, | ||
normalizeProtocol:true, | ||
removeQueryParameters: [/^utm_\w+/i, 'ref'] | ||
} | ||
const result_normalize_options={ | ||
removeDirectoryIndex:true, | ||
removeTrailingSlash:true, | ||
stripWWW:true, | ||
stripFragment:true, | ||
normalizeHttps:false, | ||
normalizeProtocol:true, | ||
removeQueryParameters: [/^utm_\w+/i, 'ref'] | ||
} | ||
function parse(currentUrlStr,baseUrlStr) | ||
{ | ||
let ret = {url:null,normalized:null,protocol:null,host:null,domain:null,subdomain:null,path:null} | ||
let ret = {url:null,baseurl:null,normalized:null,protocol:null,host:null,domain:null,subdomain:null,path:null,search:null,querycount:0} | ||
if(typeof currentUrlStr === 'undefined') return null; | ||
let currentNormUrlStr = cleanUrl(currentUrlStr,normalize_options); | ||
if(currentNormUrlStr === "") return null; | ||
//check if currentUrlStr format like "aaa" | ||
let currentNormUrlStr = normalizeUrl(currentUrlStr); | ||
let parsedUrl = URL.parse(currentNormUrlStr,true,true); | ||
//test for normalised url like "http://aaa" | ||
if(/^http:\/\/[^.]+$/.test(currentNormUrlStr)){ | ||
currentNormUrlStr = currentNormUrlStr.replace("http://",""); | ||
currentNormUrlStr = currentNormUrlStr.replace("/?","?"); | ||
} | ||
let parsedUrl = url.parse(currentNormUrlStr,true,true); | ||
if(parsedUrl.protocol !='http:' && parsedUrl.protocol !='https:' && parsedUrl.protocol != null) return null; | ||
//current url is relative like "abc", "/abc" or "../abc" | ||
if(parsedUrl.host == null && typeof baseUrlStr !== "undefined") | ||
let normalizedBaseUrl = null; | ||
if(parsedUrl.host == null && typeof baseUrlStr !== "undefined" && baseUrlStr != null) | ||
{ | ||
let normalizedBaseUrl = normalizeUrl(baseUrlStr,{removeTrailingSlash: false}); | ||
let parsedBaseUrl = url.parse(normalizedBaseUrl,{removeTrailingSlash: false}); | ||
let absoluteUrl = url.parse(url.resolve(parsedBaseUrl,parsedUrl)); | ||
currentUrlStr = url.format(absoluteUrl); | ||
normalizedBaseUrl = cleanUrl(baseUrlStr,normalize_options); | ||
ret.baseurl = normalizedBaseUrl; | ||
let parsedBaseUrl = URL.parse(normalizedBaseUrl,normalize_options); | ||
let absoluteUrl = URL.parse(URL.resolve(parsedBaseUrl,parsedUrl)); | ||
currentUrlStr = URL.format(absoluteUrl); | ||
} | ||
ret.url = currentUrlStr; | ||
ret.normalized = normalizeUrl(currentUrlStr); | ||
ret.normalized = cleanUrl(currentUrlStr,result_normalize_options); | ||
if(/^http:\/\/[^.]+$/.test(ret.normalized)){ | ||
ret.normalized = ret.normalized.replace("http://",""); | ||
ret.normalized = ret.normalized.replace("/?","?"); | ||
} | ||
parsedUrl = URL.parse(ret.normalized,true,true); | ||
parsedUrl = url.parse(ret.normalized,true,true); | ||
ret.protocol = parsedUrl.protocol; | ||
@@ -53,2 +65,7 @@ ret.host = parsedUrl.host; | ||
ret.search=parsedUrl.search; | ||
ret.querycount = parsedUrl.search ? parsedUrl.search.split("=").length -1 : 0; | ||
//ret.type = normalizedBaseUrl ? gettype(ret.normalized,normalizedBaseUrl):"none"; | ||
return ret; | ||
@@ -64,13 +81,18 @@ } | ||
let embedBaseUrl = parse(embedBaseUrlStr); | ||
baseUrl = embedBaseUrl ? embedBaseUrl : baseUrl; | ||
let baseUrlStr = baseUrl ? baseUrl.normalized : null; | ||
$('a').each(function(i, el) { | ||
let href = $(this).attr('href'); | ||
let text = $(this).text(); | ||
if(typeof href == "undefined" && href.length < 3 && /^(javascript|mailto:|ftp:)/ig.test(href)) return; | ||
let text = $(this).text().trim(); | ||
//href = href.replace(/;.*$/g,""); | ||
if(typeof href == "undefined" || href.length < 3 || /^(javascript|mailto:|ftp:)/ig.test(href)) return; | ||
let currentUrl = embedBaseUrl == null ? parse(href,baseUrl.normalized) : parse(href,embedBaseUrl.normalized); | ||
if(currentUrl == null) return; | ||
//let currentUrl = embedBaseUrl == null ? parse(href,baseUrl.normalized) : parse(href,embedBaseUrl.normalized); | ||
let currentUrl = parse(href,baseUrlStr); | ||
if(!urlMap.has(currentUrl.normalized)){ | ||
urlMap.set(currentUrl.normalized,{url:currentUrl,text:text}); | ||
currentUrl.text = text == null ? "": text; | ||
currentUrl.baseurl = baseUrlStr; | ||
urlMap.set(currentUrl.normalized,currentUrl); | ||
} | ||
@@ -80,3 +102,3 @@ else{ | ||
if(! tmpUrl.text.includes(text)){ | ||
tmpUrl.text += ` ${text}`; | ||
tmpUrl.text = `${tmpUrl.text} ${text}`; | ||
} | ||
@@ -87,75 +109,56 @@ } | ||
//remove base url | ||
urlMap.delete(baseUrl.normalized); | ||
if(embedBaseUrl!=null){ | ||
urlMap.delete(embedBaseUrl.normalized); | ||
} | ||
urlMap.delete(baseUrlStr); | ||
for (let currentUrl of urlMap.values()) { | ||
if(baseUrl.host == currentUrl.host){ | ||
//internal | ||
currentUrl.type="internal"; | ||
} | ||
else if(baseUrl.domain == currentUrl.domain){ | ||
//subdomain | ||
currentUrl.type="subdomain"; | ||
} | ||
else{ | ||
//external | ||
currentUrl.type="external"; | ||
} | ||
currentUrl.type = gettype(currentUrl,baseUrl); | ||
} | ||
return Array.from(urlMap.values()); | ||
let retArr = Array.from(urlMap.values()); | ||
retArr = retArr.map(function(el) { | ||
return {url:el.normalized, text:el.text, type:el.type} | ||
}); | ||
return retArr; | ||
} | ||
function getlevel(current,base){ | ||
//samelevel,sublevel,uplevel | ||
//if baseurl "sub.domain.com/aaa/bbb/ccc" | ||
// "sub.domain.com/aaa/bbb/" - uplevel | ||
// "sub.domain.com/aaa/ddd/" - samelevel | ||
// "sub.domain.com/aaa/bbb/ccc/ddd" - sublevel | ||
//else | ||
// null | ||
let ret = null ; | ||
let normlizedCurrent = normalizeUrl(current); | ||
let normlizedBase = normalizeUrl(base); | ||
let parsedCurrentUrl = url.parse(normlizedCurrent); | ||
let parsedBaseUrl = url.parse(normlizedBase); | ||
function gettype(linkurl,pageurl){ | ||
if(parsedCurrentUrl.host==parsedBaseUrl.host){ | ||
let cPath = parsedCurrentUrl.pathname; | ||
let bPath = parsedBaseUrl.pathname; | ||
let cCount = cPath.split("/").length -1; | ||
let bCount = bPath.split("/").length -1; | ||
if(cCount!=bCount){ | ||
if(cPath.includes(bPath)){ | ||
ret = "uplevel"; | ||
} | ||
else if(bPath.includes(cPath)){ | ||
ret = "sublevel" | ||
} | ||
if(typeof linkurl == "string") linkurl = parse(linkurl); | ||
if(typeof pageurl == "string") pageurl = parse(pageurl); | ||
let linkurl_subdomain_len = linkurl.subdomain ? linkurl.subdomain.length : 0; | ||
let pageurl_subdomain_len = pageurl.subdomain ? pageurl.subdomain.length : 0; | ||
let linkurl_path = linkurl.path ? linkurl.path : ""; | ||
let pageurl_path = pageurl.path ? pageurl.path : ""; | ||
let linkurl_parts = linkurl_path.split("/").filter(function(elem, index, array){ return elem.length > 0}); | ||
let pageurl_parts = pageurl_path.split("/").filter(function(elem, index, array){ return elem.length > 0}); | ||
if(pageurl.host == linkurl.host){ | ||
let part_count_diff = linkurl_parts.length - pageurl_parts.length; | ||
if(part_count_diff == 0){ | ||
let linkurl_without_last_part = linkurl_path.replace(/(\/[^\/]*)[\/]?$/,""); | ||
let pageurl_without_last_part = pageurl_path.replace(/(\/[^\/]*)[\/]?$/,""); | ||
if(linkurl_without_last_part == pageurl_without_last_part) return "samelevel" | ||
} | ||
else if(cCount == bCount){ | ||
cPath = cPath.replace(/(\/[^\/]*)[\/]?$/,""); | ||
bPath = bPath.replace(/(\/[^\/]*)[\/]?$/,""); | ||
if(cPath == bPath){ | ||
return "samelevel"; | ||
} | ||
else if(part_count_diff == 1){ | ||
if(linkurl_path.includes(pageurl_path)) return "sublevel"; | ||
} | ||
else if(part_count_diff == -1){ | ||
if(pageurl_path.includes(linkurl_path)) return "uplevel"; | ||
} | ||
return "internal"; | ||
} | ||
return ret; | ||
} | ||
function querycount(current){ | ||
let ret = 0 ; | ||
let normlizedCurrent = normalizeUrl(current); | ||
let parsedCurrentUrl = url.parse(normlizedCurrent); | ||
else if(linkurl.domain == pageurl.domain){ | ||
if(linkurl_subdomain_len < pageurl_subdomain_len) return "updomain"; | ||
return "subdomain"; | ||
} | ||
if(parsedCurrentUrl.search!=null){ | ||
ret = parsedCurrentUrl.search.split("=").length -1; | ||
} | ||
return ret; | ||
return "external"; | ||
} | ||
@@ -165,5 +168,5 @@ | ||
module.exports.extract = extract; | ||
module.exports.getlevel = getlevel; | ||
module.exports.querycount = querycount; | ||
module.exports.gettype = gettype; | ||
//for testing purpose | ||
@@ -177,3 +180,12 @@ if (!module.parent){ | ||
//debugger; | ||
process.exit(); | ||
//let res = parse("ddd","http://www.stackoverflow.com/aaa/bbb/ccc/"); | ||
let page = 'http://journals.tubitak.gov.tr/'; | ||
let link = 'http://journals.tubitak.gov.tr/genel/telifhakki.pdf'; | ||
let res = gettype(link,page); | ||
debugger | ||
res = gettype(page,link); | ||
debugger | ||
//process.exit(); | ||
} |
{ | ||
"name": "crawler-url-parser", | ||
"version": "1.5.1", | ||
"version": "2.0.0", | ||
"description": "An `URL` parser for crawling purpose.", | ||
@@ -38,9 +38,10 @@ "main": "crawler-url-parser.js", | ||
"cheerio": "^1.0.0-rc.2", | ||
"normalize-url": "^2.0.0", | ||
"psl": "^1.1.20", | ||
"url": "^0.11.0" | ||
"url": "^0.11.0", | ||
"url-clean": "1.0.2" | ||
}, | ||
"devDependencies": { | ||
"mocha": "^4.0.1", | ||
"path": "^0.12.7" | ||
"path": "^0.12.7", | ||
"crawler-request": "^1.1.3" | ||
}, | ||
@@ -47,0 +48,0 @@ "scripts": { |
145
README.md
@@ -20,12 +20,74 @@ # crawler-url-parser | ||
//// parse(current_url,base_url) | ||
let url = cup.parse("../ddd","http://question.stackoverflow.com/aaa/bbb/ccc/"); | ||
console.log(url.normalized);//http://question.stackoverflow.com/aaa/bbb/ddd | ||
console.log(url.host); // question.stackoverflow.com | ||
console.log(url.domain); // stackoverflow.com | ||
console.log(url.subdomain); // question | ||
console.log(url.protocol); // http: | ||
console.log(url.path); // /aaa/bbb/ddd | ||
//// parse(current_url[,base_url]) | ||
let result = cup.parse("http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2"); | ||
console.log(result.url); | ||
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2 | ||
console.log(result.baseurl); | ||
// null | ||
console.log(result.normalized); | ||
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2 | ||
console.log(result.host); | ||
// question.stackoverflow.com | ||
console.log(result.domain); | ||
// stackoverflow.com | ||
console.log(result.subdomain); | ||
// question | ||
console.log(result.protocol); | ||
// http: | ||
console.log(result.path); | ||
// /aaa/bbb/ddd | ||
console.log(result.search); | ||
// ?q1=query1&q2=query2 | ||
console.log(result.querycount); | ||
// 2 | ||
``` | ||
### Parse with baseURL | ||
```js | ||
const cup = require('crawler-url-parser'); | ||
//// parse(current_url[,base_url]) | ||
let result = cup.parse("../ddd?q1=query1&q2=query2","http://question.stackoverflow.com/aaa/bbb/ccc/"); | ||
console.log(result.url); | ||
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2 | ||
console.log(result.baseurl); | ||
// http://question.stackoverflow.com/aaa/bbb/ccc | ||
console.log(result.normalized); | ||
// http://question.stackoverflow.com/aaa/bbb/ddd?q1=query1&q2=query2 | ||
console.log(result.host); | ||
// question.stackoverflow.com | ||
console.log(result.domain); | ||
// stackoverflow.com | ||
console.log(result.subdomain); | ||
// question | ||
console.log(result.protocol); | ||
// http: | ||
console.log(result.path); | ||
// /aaa/bbb/ddd | ||
console.log(result.search); | ||
// ?q1=query1&q2=query2 | ||
console.log(result.querycount); | ||
// 2 | ||
``` | ||
### Extract | ||
@@ -36,19 +98,33 @@ ```js | ||
//// extract(html_str,current_url); | ||
let htmlStr= | ||
'html> \ | ||
<body> \ | ||
<a href="http://www.stackoverflow.com/internal-1">test-link-4</a><br /> \ | ||
<a href="http://www.stackoverflow.com/internal-2">test-link-5</a><br /> \ | ||
<a href="http://www.stackoverflow.com/internal-2">test-link-6</a><br /> \ | ||
<a href="http://faq.stackoverflow.com/subdomain-1">test-link-7</a><br /> \ | ||
<a href="http://faq.stackoverflow.com/subdomain-2">test-link-8</a><br /> \ | ||
<a href="http://faq.stackoverflow.com/subdomain-2">test-link-9</a><br /> \ | ||
<a href="http://www.google.com/external-1">test-link-10</a><br /> \ | ||
<a href="http://www.google.com/external-2">test-link-11</a><br /> \ | ||
<a href="http://www.google.com/external-2">test-link-12</a><br /> \ | ||
</body> \ | ||
</html>'; | ||
let currentUrl= "http://www.stackoverflow.com/aaa/bbb/ccc"; | ||
let htmlStr='<html><body> \ | ||
<a href="http://best.question.stackoverflow.com">subdomain</a><br /> \ | ||
<a href="http://faq.stackoverflow.com">subdomain</a><br /> \ | ||
<a href="http://stackoverflow.com">updomain</a><br /> \ | ||
<a href="http://www.google.com">external</a><br /> \ | ||
<a href="http://www.facebook.com">external</a><br /> \ | ||
<a href="http://question.stackoverflow.com/aaa/bbb/ccc">sublevel</a><br /> \ | ||
<a href="http://question.stackoverflow.com/aaa/bbb/zzz">sublevel</a><br /> \ | ||
<a href="http://question.stackoverflow.com/aaa/">uplevel</a><br /> \ | ||
<a href="http://question.stackoverflow.com/aaa/ddd">samelevel</a><br /> \ | ||
<a href="http://question.stackoverflow.com/aaa/eee">samelevel</a><br /> \ | ||
<a href="http://question.stackoverflow.com/aaa/ddd/eee">internal</a><br /> \ | ||
<a href="http://question.stackoverflow.com/zzz">internal</a><br /> \ | ||
</body></html>'; | ||
let currentUrl= "http://question.stackoverflow.com/aaa/bbb"; | ||
let urls = cup.extract(htmlStr,currentUrl); | ||
console.log(urls.length); // 12 | ||
console.log(urls[0].type); //subdomain | ||
console.log(urls[1].type); //subdomain | ||
console.log(urls[2].type); //updomain | ||
console.log(urls[3].type); //external | ||
console.log(urls[4].type); //external | ||
console.log(urls[5].type); //sublevel | ||
console.log(urls[6].type); //sublevel | ||
console.log(urls[7].type); //uplevel | ||
console.log(urls[8].type); //samelevel | ||
console.log(urls[9].type); //samelevel | ||
console.log(urls[10].type); //internal | ||
console.log(urls[11].type); //internal | ||
``` | ||
@@ -60,25 +136,16 @@ | ||
//// getlevel(current_url,base_url); | ||
let level = cup.getlevel("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc"); | ||
//// gettype(current_url,base_url); | ||
let level = cup.gettype("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc"); | ||
console.log(level); //uplevel | ||
level = cup.getlevel("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc"); | ||
level = cup.gettype("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc"); | ||
console.log(level); //sublevel | ||
level = cup.getlevel("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc"); | ||
level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc"); | ||
console.log(level); //samelevel | ||
level = cup.getlevel("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc"); | ||
console.log(level); //null | ||
level = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc"); | ||
console.log(level); //external | ||
``` | ||
### Query | ||
```js | ||
const cup = require('crawler-url-parser'); | ||
//// querycount(url) | ||
let count = cup.querycount("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3"); | ||
console.log(count); //3 | ||
``` | ||
## Test | ||
@@ -85,0 +152,0 @@ `mocha` or `npm test` |
@@ -281,3 +281,13 @@ const assert = require('assert'); | ||
describe('parse paths with invalid protocol "ftp://www.google.com"', function() { | ||
it('should be null "ftp://www.google.com"', function() { | ||
let res = cup.parse("ftp://www.google.com"); | ||
assert.equal(res,null); | ||
}); | ||
}); | ||
describe('parse paths with invalid protocol "htp://www.google.com"', function() { | ||
it('should be null "htp://www.google.com"', function() { | ||
@@ -284,0 +294,0 @@ let res = cup.parse("htp://www.google.com"); |
const assert = require('assert'); | ||
const cup = require("../"); | ||
describe('getlevel url as samelevel, sublevel, uplevel', function() { | ||
it('should getlevel sublevel urls', function() { | ||
let res = cup.getlevel("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc"); | ||
assert.equal(res,"sublevel"); | ||
describe('gettype url as samelevel, sublevel, uplevel', function() { | ||
it('should gettype sublevel urls', function() { | ||
let res = cup.gettype("sub.domain.com/aaa/bbb/","sub.domain.com/aaa/bbb/ccc"); | ||
assert.equal(res,"uplevel"); | ||
}); | ||
it('should getlevel uplevel urls', function() { | ||
let res = cup.getlevel("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc"); | ||
assert.equal(res,"uplevel"); | ||
it('should gettype uplevel urls', function() { | ||
let res = cup.gettype("sub.domain.com/aaa/bbb/ccc/ddd","sub.domain.com/aaa/bbb/ccc"); | ||
assert.equal(res,"sublevel"); | ||
}); | ||
it('should getlevel samelevel urls', function() { | ||
let res = cup.getlevel("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc"); | ||
it('should gettype samelevel urls', function() { | ||
let res = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.domain.com/aaa/bbb/ccc"); | ||
assert.equal(res,"samelevel");; | ||
}); | ||
it('should handle unvalid urls', function() { | ||
let res = cup.getlevel("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc"); | ||
assert.equal(res,null); | ||
it('should handle invalid urls', function() { | ||
let res = cup.gettype("sub.domain.com/aaa/bbb/eee","sub.anotherdomain.com/aaa/bbb/ccc"); | ||
assert.equal(res,"external"); | ||
}); | ||
}); |
@@ -6,25 +6,25 @@ const assert = require('assert'); | ||
it('should calculate urls query-0', function() { | ||
let res = cup.querycount("sub.domain.com/aaa/bbb"); | ||
assert.equal(res,0); | ||
let res = cup.parse("sub.domain.com/aaa/bbb"); | ||
assert.equal(res.querycount,0); | ||
}); | ||
it('should calculate urls query-1', function() { | ||
let res = cup.querycount("sub.domain.com/aaa/bbb?q1=data1"); | ||
assert.equal(res,1); | ||
let res = cup.parse("sub.domain.com/aaa/bbb?q1=data1"); | ||
assert.equal(res.querycount,1); | ||
}); | ||
it('should calculate urls query-2', function() { | ||
let res = cup.querycount("sub.domain.com/aaa/bbb?q1=data1&q2=data2"); | ||
assert.equal(res,2); | ||
let res = cup.parse("sub.domain.com/aaa/bbb?q1=data1&q2=data2"); | ||
assert.equal(res.querycount,2); | ||
}); | ||
it('should calculate urls query-3', function() { | ||
let res = cup.querycount("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3"); | ||
assert.equal(res,3); | ||
let res = cup.parse("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3"); | ||
assert.equal(res.querycount,3); | ||
}); | ||
it('should calculate urls query-4', function() { | ||
let res = cup.querycount("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3&q4=data4"); | ||
assert.equal(res,4); | ||
let res = cup.parse("sub.domain.com/aaa/bbb?q1=data1&q2=data2&q3=data3&q4=data4"); | ||
assert.equal(res.querycount,4); | ||
}); | ||
}); |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
93934
22
1088
154
3
3
+ Added url-clean@1.0.2
+ Added url-clean@1.0.2 (transitive)
- Removed normalize-url@^2.0.0
- Removed is-plain-obj@1.1.0 (transitive)
- Removed normalize-url@2.0.1 (transitive)
- Removed prepend-http@2.0.0 (transitive)
- Removed sort-keys@2.0.0 (transitive)