robots-parser
Comparing version 1.0.0 to 1.0.1
 {
 	"name": "robots-parser",
-	"version": "1.0.0",
+	"version": "1.0.1",
 	"description": "Robots.txt parser.",
@@ -10,7 +10,7 @@ "main": "index.js",
 	"scripts": {
-		"test": "./node_modules/mocha/bin/mocha test"
+		"test": "istanbul cover _mocha --print detail --report html --dir ./report/coverage"
 	},
-	"repository" : {
-		"type" : "git",
-		"url" : "https://github.com/samclarke/robots-parser.git"
+	"repository": {
+		"type": "git",
+		"url": "https://github.com/samclarke/robots-parser.git"
 	},
@@ -20,5 +20,6 @@ "author": "Sam Clarke <sam@samclarke.com>",
 	"devDependencies": {
-		"chai": "^1.9.1",
-		"mocha": "^1.21.4"
+		"chai": "^3.5.0",
+		"istanbul": "^0.4.5",
+		"mocha": "^3.1.2"
 	}
 }
@@ -102,3 +102,3 @@ var libUrl = require('url');
-		if (!line || line[0].indexOf('#') === 0) {
+		if (!line || !line[0] || line[0].indexOf('#') === 0) {
 			continue;
@@ -113,3 +113,5 @@ }
-				currentUserAgents.push(formatUserAgent(line[1]));
+				if (line[1]) {
+					currentUserAgents.push(formatUserAgent(line[1]));
+				}
 				break;
@@ -126,10 +128,14 @@ case 'disallow':
 			case 'sitemap':
-				robots.addSitemap(line[1]);
+				if (line[1]) {
+					robots.addSitemap(line[1]);
+				}
 				break;
 			case 'host':
-				robots.setPreferredHost(line[1].toLowerCase());
+				if (line[1]) {
+					robots.setPreferredHost(line[1].toLowerCase());
+				}
 				break;
 		}
-		isNoneUserAgentState = line[0] !== 'user-agent';
+		isNoneUserAgentState = line[0].toLowerCase() !== 'user-agent';
 	}
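With these guards in place, directives that have no value are skipped instead of being handed to the directive handlers. A minimal sketch of the resulting behaviour, based on the new tests later in this diff (the URL and robots.txt contents are illustrative):

    var robotsParser = require('robots-parser');

    // An empty "User-agent:" line opens no group, so the disallow rules that
    // follow are not attached to any agent and every URL stays allowed.
    var robots = robotsParser('http://www.example.com/robots.txt', [
    	'User-agent:',
    	'Disallow: /fish/',
    	'Disallow: /test.html'
    ].join('\n'));

    console.log(robots.isAllowed('http://www.example.com/fish/')); // true
    console.log(robots.getPreferredHost());                        // null, no host directive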
@@ -257,6 +263,2 @@ }
  *
- * If the rules don't specify it will return the
- * return true unless defaultToDissallow option
- * is set to true.
- *
  * @param {string} url
@@ -267,3 +269,2 @@ * @param {string?} ua
 Robots.prototype.isAllowed = function (url, ua) {
-	var rule = true;
 	var parsedUrl = libUrl.parse(url);
@@ -282,11 +283,5 @@ var userAgent = formatUserAgent(ua || '*');
-	if (!this._rules[userAgent]) {
-		userAgent = '*';
-	}
+	var rules = this._rules[userAgent] || this._rules['*'] || [];
-	if (this._rules[userAgent]) {
-		rule = isPathAllowed(parsedUrl.path, this._rules[userAgent]);
-	}
-	return rule;
+	return isPathAllowed(parsedUrl.path, rules);
 };
@@ -337,2 +332,3 @@
-module.exports = Robots;
+module.exports = Robots;
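The rewritten isAllowed collapses the user-agent lookup into one expression: the requested agent's rules are used when they exist, otherwise the '*' group applies, and an empty rule set allows everything. A short usage sketch of that behaviour (agent names and URLs are illustrative; the fallback itself is exercised by the existing "fallback to *" tests below):

    var robotsParser = require('robots-parser');

    var robots = robotsParser('http://www.example.com/robots.txt', [
    	'User-agent: *',
    	'Disallow: /secret.html',
    	'',
    	'User-agent: b',
    	'Disallow: /'
    ].join('\n'));

    // An agent with its own group does not fall back to '*':
    console.log(robots.isAllowed('http://www.example.com/index.html', 'b'));  // false

    // An agent without a group of its own uses the '*' rules:
    console.log(robots.isAllowed('http://www.example.com/secret.html', 'a')); // false
    console.log(robots.isAllowed('http://www.example.com/index.html', 'a'));  // true

The remaining hunks are from the test suite: they drop the mocha done callbacks from tests that are entirely synchronous and add coverage for comments, invalid lines, empty directives, multi-agent groups and null robots.txt contents.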
@@ -14,3 +14,3 @@ var robotsParser = require('../index');
 	disallowed.forEach(function (url) {
-		expect(robots.isAllowed(url)).to.equal(false);
+		expect(robots.isDisallowed(url)).to.equal(true);
 	});
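The shared helper now checks the disallowed list through isDisallowed rather than negating isAllowed; the two calls report the same decision from opposite directions. A tiny sketch (URL and contents are illustrative):

    var robotsParser = require('robots-parser');

    var robots = robotsParser('http://www.example.com/robots.txt', [
    	'User-agent: *',
    	'Disallow: /secret.html'
    ].join('\n'));

    console.log(robots.isAllowed('http://www.example.com/secret.html'));    // false
    console.log(robots.isDisallowed('http://www.example.com/secret.html')); // true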
@@ -20,3 +20,3 @@ }
 describe('Robots', function () {
-	it('should parse the disallow directive', function (done) {
+	it('should parse the disallow directive', function () {
 		var contents = [
@@ -40,7 +40,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should parse the allow directive', function (done) {
+	it('should parse the allow directive', function () {
 		var contents = [
@@ -67,7 +65,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should parse patterns', function (done) {
+	it('should parse patterns', function () {
 		var contents = [
@@ -92,7 +88,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should have the correct order presidence for allow and disallow', function (done) {
+	it('should have the correct order presidence for allow and disallow', function () {
 		var contents = [
@@ -119,7 +113,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should ignore rules that are not in a group', function (done) {
+	it('should ignore rules that are not in a group', function () {
 		var contents = [
@@ -137,9 +129,90 @@ 'Disallow: /secret.html',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, []);
-		done();
 	});
+	it('should ignore comments', function () {
+		var contents = [
+			'#',
+			'# This is a comment',
+			'#',
+			'User-agent: *',
+			'# This is a comment',
+			'Disallow: /fish/',
+			'# Disallow: fish',
+			'Disallow: /test.html'
+		].join('\n');
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
-	it('should return undefined for invalid urls', function (done) {
+	it('should ignore invalid lines', function () {
+		var contents = [
+			'invalid line',
+			'User-agent: *',
+			'Disallow: /fish/',
+			':::::another invalid line:::::',
+			'Disallow: /test.html',
+			'Unknown: tule'
+		].join('\n');
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+	it('should ignore empty user-agent lines', function () {
+		var contents = [
+			'User-agent:',
+			'Disallow: /fish/',
+			'Disallow: /test.html'
+		].join('\n');
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html',
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+		var disallowed = [];
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+	it('should support groups with multiple user agents (case insensitive)', function () {
+		var contents = [
+			'User-agent: agenta',
+			'User-agent: agentb',
+			'Disallow: /fish',
+		].join('\n');
+		var robots = robotsParser('http://www.example.com/robots.txt', contents);
+		expect(robots.isAllowed("http://www.example.com/fish", "agenta")).to.equal(false);
+	});
+	it('should return undefined for invalid urls', function () {
 		var contents = [
 			'User-agent: *',
 			'Disallow: /secret.html',
@@ -160,7 +233,5 @@ 'Disallow: /test',
 		});
-		done();
 	});
-	it('should handle Unicode and punycode URLs', function (done) {
+	it('should handle Unicode and punycode URLs', function () {
 		var contents = [
@@ -183,7 +254,5 @@ 'User-agent: *',
 		testRobots('http://www.münich.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should allow all if empty robots.txt', function (done) {
+	it('should allow all if empty robots.txt', function () {
 		var allowed = [
@@ -200,7 +269,12 @@ 'http://www.example.com/secret.html',
 		});
-		done();
 	});
+	it('should treat null as allowing all', function () {
+		var robots = robotsParser('http://www.example.com/robots.txt', null);
+		expect(robots.isAllowed("http://www.example.com/", "userAgent")).to.equal(true);
+		expect(robots.isAllowed("http://www.example.com/")).to.equal(true);
+	});
-	it('should parse the crawl-delay directive', function (done) {
+	it('should parse the crawl-delay directive', function () {
 		var contents = [
@@ -224,7 +298,6 @@ 'user-agent: a',
 		expect(robots.getCrawlDelay('d')).to.equal(10);
-		done();
+		expect(robots.getCrawlDelay()).to.equal(undefined);
 	});
-	it('should ignore invalid crawl-delay directives', function (done) {
+	it('should ignore invalid crawl-delay directives', function () {
 		var contents = [
@@ -248,7 +321,5 @@ 'user-agent: a',
 		expect(robots.getCrawlDelay('d')).to.equal(undefined);
-		done();
 	});
-	it('should parse the sitemap directive', function (done) {
+	it('should parse the sitemap directive', function () {
 		var contents = [
@@ -273,7 +344,5 @@ 'user-agent: a',
 		]);
-		done();
 	});
-	it('should parse the host directive', function (done) {
+	it('should parse the host directive', function () {
 		var contents = [
@@ -293,7 +362,24 @@ 'user-agent: a',
 		expect(robots.getPreferredHost()).to.equal('example.com');
-		done();
 	});
+	it('should parse empty and invalid directives', function () {
+		var contents = [
+			'user-agent:',
+			'user-agent:::: a::',
+			'crawl-delay:',
+			'crawl-delay:::: 0:',
+			'host:',
+			'host:: example.com',
+			'sitemap:',
+			'sitemap:: site:map.xml',
+			'disallow:',
+			'disallow::: /:',
+			'allow:',
+			'allow::: /:',
+		].join('\n');
+		var robots = robotsParser('http://www.example.com/robots.txt', contents);
+	});
-	it('should treat only the last host directive as valid', function (done) {
+	it('should treat only the last host directive as valid', function () {
 		var contents = [
@@ -314,7 +400,5 @@ 'user-agent: a',
 		expect(robots.getPreferredHost()).to.equal('example.com');
-		done();
 	});
-	it('should return null when there is no host directive', function (done) {
+	it('should return null when there is no host directive', function () {
 		var contents = [
@@ -331,7 +415,5 @@ 'user-agent: a',
 		expect(robots.getPreferredHost()).to.equal(null);
-		done();
 	});
-	it('should fallback to * when a UA has no rules of its own', function (done) {
+	it('should fallback to * when a UA has no rules of its own', function () {
 		var contents = [
@@ -354,7 +436,5 @@ 'user-agent: *',
 		expect(robots.getCrawlDelay('dd')).to.equal(1);
-		done();
 	});
-	it('should not fallback to * when a UA has rules', function (done) {
+	it('should not fallback to * when a UA has rules', function () {
 		var contents = [
@@ -371,7 +451,5 @@ 'user-agent: *',
 		expect(robots.getCrawlDelay('b')).to.equal(undefined);
-		done();
 	});
-	it('should ignore version numbers in the UA string', function (done) {
+	it('should ignore version numbers in the UA string', function () {
 		var contents = [
@@ -395,5 +473,4 @@ 'user-agent: *',
 		expect(robots.getCrawlDelay('b / 1.0')).to.equal(12);
-		done();
 	});
 });
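One of the newly covered behaviours, pulled out as a quick usage sketch (the URL and agent name are illustrative): passing null as the robots.txt contents is treated like an empty file, so every URL is allowed.

    var robotsParser = require('robots-parser');

    var robots = robotsParser('http://www.example.com/robots.txt', null);

    console.log(robots.isAllowed('http://www.example.com/', 'userAgent')); // true
    console.log(robots.isAllowed('http://www.example.com/'));              // true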