robots-parser

robots-parser - npm package version comparison

Comparing version 1.0.0 to 1.0.1

package.json
 {
   "name": "robots-parser",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "Robots.txt parser.",

@@ -10,7 +10,7 @@ "main": "index.js",
   "scripts": {
-    "test": "./node_modules/mocha/bin/mocha test"
+    "test": "istanbul cover _mocha --print detail --report html --dir ./report/coverage"
   },
-  "repository" : {
-    "type" : "git",
-    "url" : "https://github.com/samclarke/robots-parser.git"
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/samclarke/robots-parser.git"
   },

@@ -20,5 +20,6 @@ "author": "Sam Clarke <sam@samclarke.com>",
   "devDependencies": {
-    "chai": "^1.9.1",
-    "mocha": "^1.21.4"
+    "chai": "^3.5.0",
+    "istanbul": "^0.4.5",
+    "mocha": "^3.1.2"
   }
 }
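Note on the script change: the test command now runs mocha under istanbul, so coverage is measured on every test run; per the script's flags, it prints a per-file detail summary and writes an HTML coverage report to ./report/coverage, which is why istanbul is added to devDependencies.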

index.js

@@ -102,3 +102,3 @@ var libUrl = require('url');
-		if (!line || line[0].indexOf('#') === 0) {
+		if (!line || !line[0] || line[0].indexOf('#') === 0) {
 			continue;

@@ -113,3 +113,5 @@ }
-				currentUserAgents.push(formatUserAgent(line[1]));
+				if (line[1]) {
+					currentUserAgents.push(formatUserAgent(line[1]));
+				}
 				break;

@@ -126,10 +128,14 @@ case 'disallow':
 			case 'sitemap':
-				robots.addSitemap(line[1]);
+				if (line[1]) {
+					robots.addSitemap(line[1]);
+				}
 				break;
 			case 'host':
-				robots.setPreferredHost(line[1].toLowerCase());
+				if (line[1]) {
+					robots.setPreferredHost(line[1].toLowerCase());
+				}
 				break;
 		}

-		isNoneUserAgentState = line[0] !== 'user-agent';
+		isNoneUserAgentState = line[0].toLowerCase() !== 'user-agent';
 	}
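These parser changes guard every directive value (line[1]) before use, so a directive with a missing value no longer passes undefined into formatUserAgent() or setPreferredHost(), and the lowercased comparison makes user-agent group detection case-insensitive. A minimal sketch of the resulting behavior, mirroring the "should ignore empty user-agent lines" test added below (assumes robots-parser 1.0.1 is installed):

var robotsParser = require('robots-parser');

// A bare "user-agent:" line has no value; 1.0.1 skips it instead of
// passing an undefined value along. The Disallow line never attaches
// to a user-agent group, so it is ignored entirely.
var robots = robotsParser('http://www.example.com/robots.txt', [
	'User-agent:',
	'Disallow: /fish/'
].join('\n'));

console.log(robots.isAllowed('http://www.example.com/fish/')); // true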

@@ -257,6 +263,2 @@ }
- *
- * If the rules don't specify it will return the
- * return true unless defaultToDissallow option
- * is set to true.
- *
  * @param {string} url

@@ -267,3 +269,2 @@ * @param {string?} ua
 Robots.prototype.isAllowed = function (url, ua) {
-	var rule = true;
 	var parsedUrl = libUrl.parse(url);

@@ -282,11 +283,5 @@ var userAgent = formatUserAgent(ua || '*');
-	if (!this._rules[userAgent]) {
-		userAgent = '*';
-	}
-
-	if (this._rules[userAgent]) {
-		rule = isPathAllowed(parsedUrl.path, this._rules[userAgent]);
-	}
-
-	return rule;
+	var rules = this._rules[userAgent] || this._rules['*'] || [];
+
+	return isPathAllowed(parsedUrl.path, rules);
 };
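The isAllowed() rewrite collapses the old two-branch fallback into a single lookup chain: the requesting agent's own rules win, else the '*' group applies, else an empty rule set, which (as the "should allow all if empty robots.txt" test asserts) resolves to allowed. A sketch of the preserved fallback behavior, using a hypothetical user agent 'somebot':

var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /secret.html'
].join('\n'));

// 'somebot' has no group of its own, so the '*' rules apply:
console.log(robots.isAllowed('http://www.example.com/secret.html', 'somebot')); // false

// No rules at all resolves to allowed:
console.log(robotsParser('http://www.example.com/robots.txt', '')
	.isAllowed('http://www.example.com/anything')); // true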

@@ -337,2 +332,3 @@
-module.exports = Robots;
+module.exports = Robots;

@@ -14,3 +14,3 @@ var robotsParser = require('../index');
 	disallowed.forEach(function (url) {
-		expect(robots.isAllowed(url)).to.equal(false);
+		expect(robots.isDisallowed(url)).to.equal(true);
 	});
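The suite's shared helper now asserts through isDisallowed(), which this change treats as the complement of isAllowed() for the same URL. A self-contained sketch of the equivalence:

var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt',
	'User-agent: *\nDisallow: /secret.html');

// Complementary results for the same input:
console.log(robots.isAllowed('http://www.example.com/secret.html'));    // false
console.log(robots.isDisallowed('http://www.example.com/secret.html')); // true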

@@ -20,3 +20,3 @@ }
 describe('Robots', function () {
-	it('should parse the disallow directive', function (done) {
+	it('should parse the disallow directive', function () {
 		var contents = [

@@ -40,7 +40,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should parse the allow directive', function (done) {
+	it('should parse the allow directive', function () {
 		var contents = [

@@ -67,7 +65,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should parse patterns', function (done) {
+	it('should parse patterns', function () {
 		var contents = [

@@ -92,7 +88,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should have the correct order presidence for allow and disallow', function (done) {
+	it('should have the correct order presidence for allow and disallow', function () {
 		var contents = [

@@ -119,7 +113,5 @@ 'User-agent: *',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should ignore rules that are not in a group', function (done) {
+	it('should ignore rules that are not in a group', function () {
 		var contents = [

@@ -137,9 +129,90 @@ 'Disallow: /secret.html',
 		testRobots('http://www.example.com/robots.txt', contents, allowed, []);
-		done();
 	});

+	it('should ignore comments', function () {
+		var contents = [
+			'#',
+			'# This is a comment',
+			'#',
+			'User-agent: *',
+			'# This is a comment',
+			'Disallow: /fish/',
+			'# Disallow: fish',
+			'Disallow: /test.html'
+		].join('\n');
+
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
+	it('should ignore invalid lines', function () {
+		var contents = [
+			'invalid line',
+			'User-agent: *',
+			'Disallow: /fish/',
+			':::::another invalid line:::::',
+			'Disallow: /test.html',
+			'Unknown: tule'
+		].join('\n');
+
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html'
+		];
+
+		var disallowed = [
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
+	it('should ignore empty user-agent lines', function () {
+		var contents = [
+			'User-agent:',
+			'Disallow: /fish/',
+			'Disallow: /test.html'
+		].join('\n');
+
+		var allowed = [
+			'http://www.example.com/fish',
+			'http://www.example.com/Test.html',
+			'http://www.example.com/fish/index.php',
+			'http://www.example.com/fish/',
+			'http://www.example.com/test.html'
+		];
+
+		var disallowed = [];
+
+		testRobots('http://www.example.com/robots.txt', contents, allowed, disallowed);
+	});
+
+	it('should support groups with multiple user agents (case insensitive)', function () {
+		var contents = [
+			'User-agent: agenta',
+			'User-agent: agentb',
+			'Disallow: /fish',
+		].join('\n');
+
+		var robots = robotsParser('http://www.example.com/robots.txt', contents);
+
+		expect(robots.isAllowed("http://www.example.com/fish", "agenta")).to.equal(false);
+	});

-	it('should return undefined for invalid urls', function (done) {
+	it('should return undefined for invalid urls', function () {
 		var contents = [
 			'User-agent: *',
 			'Disallow: /secret.html',

@@ -160,7 +233,5 @@ 'Disallow: /test',
 		});
-		done();
 	});
-	it('should handle Unicode and punycode URLs', function (done) {
+	it('should handle Unicode and punycode URLs', function () {
 		var contents = [

@@ -183,7 +254,5 @@ 'User-agent: *',
 		testRobots('http://www.münich.com/robots.txt', contents, allowed, disallowed);
-		done();
 	});
-	it('should allow all if empty robots.txt', function (done) {
+	it('should allow all if empty robots.txt', function () {
 		var allowed = [

@@ -200,7 +269,12 @@ 'http://www.example.com/secret.html',
 		});
-		done();
 	});

+	it('should treat null as allowing all', function () {
+		var robots = robotsParser('http://www.example.com/robots.txt', null);
+
+		expect(robots.isAllowed("http://www.example.com/", "userAgent")).to.equal(true);
+		expect(robots.isAllowed("http://www.example.com/")).to.equal(true);
+	});

-	it('should parse the crawl-delay directive', function (done) {
+	it('should parse the crawl-delay directive', function () {
 		var contents = [

@@ -224,7 +298,6 @@ 'user-agent: a',
 		expect(robots.getCrawlDelay('d')).to.equal(10);
-		done();
+		expect(robots.getCrawlDelay()).to.equal(undefined);
 	});
-	it('should ignore invalid crawl-delay directives', function (done) {
+	it('should ignore invalid crawl-delay directives', function () {
 		var contents = [

@@ -248,7 +321,5 @@ 'user-agent: a',
 		expect(robots.getCrawlDelay('d')).to.equal(undefined);
-		done();
 	});
-	it('should parse the sitemap directive', function (done) {
+	it('should parse the sitemap directive', function () {
 		var contents = [

@@ -273,7 +344,5 @@ 'user-agent: a',
 		]);
-		done();
 	});
-	it('should parse the host directive', function (done) {
+	it('should parse the host directive', function () {
 		var contents = [

@@ -293,7 +362,24 @@ 'user-agent: a',
 		expect(robots.getPreferredHost()).to.equal('example.com');
-		done();
 	});

+	it('should parse empty and invalid directives', function () {
+		var contents = [
+			'user-agent:',
+			'user-agent:::: a::',
+			'crawl-delay:',
+			'crawl-delay:::: 0:',
+			'host:',
+			'host:: example.com',
+			'sitemap:',
+			'sitemap:: site:map.xml',
+			'disallow:',
+			'disallow::: /:',
+			'allow:',
+			'allow::: /:',
+		].join('\n');
+
+		var robots = robotsParser('http://www.example.com/robots.txt', contents);
+	});

-	it('should treat only the last host directive as valid', function (done) {
+	it('should treat only the last host directive as valid', function () {
 		var contents = [

@@ -314,7 +400,5 @@ 'user-agent: a',
 		expect(robots.getPreferredHost()).to.equal('example.com');
-		done();
 	});
-	it('should return null when there is no host directive', function (done) {
+	it('should return null when there is no host directive', function () {
 		var contents = [

@@ -331,7 +415,5 @@ 'user-agent: a',
 		expect(robots.getPreferredHost()).to.equal(null);
-		done();
 	});
-	it('should fallback to * when a UA has no rules of its own', function (done) {
+	it('should fallback to * when a UA has no rules of its own', function () {
 		var contents = [

@@ -354,7 +436,5 @@ 'user-agent: *',
 		expect(robots.getCrawlDelay('dd')).to.equal(1);
-		done();
 	});
-	it('should not fallback to * when a UA has rules', function (done) {
+	it('should not fallback to * when a UA has rules', function () {
 		var contents = [

@@ -371,7 +451,5 @@ 'user-agent: *',
 		expect(robots.getCrawlDelay('b')).to.equal(undefined);
-		done();
 	});
-	it('should ignore version numbers in the UA string', function (done) {
+	it('should ignore version numbers in the UA string', function () {
 		var contents = [

@@ -395,5 +473,4 @@ 'user-agent: *',
 		expect(robots.getCrawlDelay('b / 1.0')).to.equal(12);
-		done();
 	});
 });

