Comparing version 0.1.1 to 0.1.2
@@ -17,3 +17,3 @@ const moment = require('moment');
     if (!crawlRequest) {
-      logger.info(`Queue empty`);
+      this.logger.info(`Queue empty`);
       return;
@@ -23,5 +23,67 @@ }
     if (!this.seen[crawlRequest.url]) {
-      this._getRequestor().getAll(crawlRequest.url).then(body => {
-        if (err || (response && response.statusCode >= 300)) {
-          if (response && (response.statusCode >= 500 || response.statusCode === 403)) {
+      this._getRequestor().getAll(crawlRequest.url).then(
+        body => {
+          self.seen[crawlRequest.url] = true;
+          body._metadata = {
+            type: crawlRequest.type,
+            url: crawlRequest.url,
+            fetchedAt: moment.utc().toISOString(),
+            links: {}
+          };
+          let document = null;
+          switch (crawlRequest.type) {
+            case 'orgs': {
+              document = self._processCollection(body, 'login', crawlRequest.context);
+              break;
+            }
+            case 'repo': {
+              document = self._processRepo(body, crawlRequest.context);
+              break;
+            }
+            case 'login': {
+              document = self._processLogin(body, crawlRequest.context);
+              break;
+            }
+            case 'repos': {
+              document = self._processCollection(body, 'repo', crawlRequest.context);
+              break;
+            }
+            case 'issues': {
+              document = self._processCollection(body, 'issue', crawlRequest.context);
+              break;
+            }
+            case 'issue': {
+              document = self._processIssue(body, crawlRequest.context);
+              break;
+            }
+            case 'issue_comments': {
+              document = self._processCollection(body, 'issue_comment', crawlRequest.context);
+              break;
+            }
+            case 'issue_comment': {
+              document = self._processIssueComment(body, crawlRequest.context);
+              break;
+            }
+            case 'commits': {
+              document = self._processCollection(body, 'commit', crawlRequest.context);
+              break;
+            }
+            case 'commit': {
+              document = self._processCommit(body, crawlRequest.context);
+              break;
+            }
+          }
+          self.logger.info(`Crawled ${crawlRequest.url} [${crawlRequest.type}]`);
+          if (document && self.store) {
+            self.store.upsert(document, () => {
+              setTimeout(self.start.bind(self), 0);
+            });
+          } else {
+            setTimeout(self.start.bind(self), 0);
+          }
+        },
+        err => {
+          if (!err.response || err.response && (err.response.statusCode >= 500 || err.response.statusCode === 403)) {
+            // should mark the request with an attempt count so we don't get stuck doing this forever
+            self.queue.push(crawlRequest);
@@ -31,66 +93,5 @@ }
       return;
     }
-    self.seen[crawlRequest.url] = true;
-    body._metadata = {
-      type: crawlRequest.type,
-      url: crawlRequest.url,
-      fetchedAt: moment.utc().toISOString(),
-      links: {}
-    };
-    let document = null;
-    switch (crawlRequest.type) {
-      case 'orgs': {
-        document = self._processCollection(body, 'login', crawlRequest.context);
-        break;
-      }
-      case 'repo': {
-        document = self._processRepo(body, crawlRequest.context);
-        break;
-      }
-      case 'login': {
-        document = self._processLogin(body, crawlRequest.context);
-        break;
-      }
-      case 'repos': {
-        document = self._processCollection(body, 'repo', crawlRequest.context);
-        break;
-      }
-      case 'issues': {
-        document = self._processCollection(body, 'issue', crawlRequest.context);
-        break;
-      }
-      case 'issue': {
-        document = self._processIssue(body, crawlRequest.context);
-        break;
-      }
-      case 'issue_comments': {
-        document = self._processCollection(body, 'issue_comment', crawlRequest.context);
-        break;
-      }
-      case 'issue_comment': {
-        document = self._processIssueComment(body, crawlRequest.context);
-        break;
-      }
-      case 'commits': {
-        document = self._processCollection(body, 'commit', crawlRequest.context);
-        break;
-      }
-      case 'commit': {
-        document = self._processCommit(body, crawlRequest.context);
-        break;
-      }
-    }
-    logger.info(`Crawled ${crawlRequest.url} [${crawlRequest.type}]`);
-    if (document && self.store) {
-      self.store.upsert(document, () => {
-        setTimeout(self.start.bind(self), 0);
-      });
-    } else {
-      setTimeout(self.start.bind(self), 0);
-    }
-    }).auth('', process.env['GITHUB_TOKEN']);
-    }
-    else {
-      logger.info(`Skipped ${crawlRequest.url} [${crawlRequest.type}]`);
-    });
+    } else {
+      self.logger.info(`Skipped ${crawlRequest.url} [${crawlRequest.type}]`);
+      setTimeout(self.start.bind(self), 0);
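Taken together, these two hunks are the heart of the 0.1.2 refactor: the 0.1.1 flow, with its single callback, stray `err`/`response` checks, and inline `.auth('', process.env['GITHUB_TOKEN'])` call, becomes the two-argument form of `.then(onSuccess, onError)`. Successes are marked as seen, dispatched by `crawlRequest.type` to a processor, optionally upserted into the store, and the loop is re-entered via `setTimeout`. Failures that look retryable (no response at all, a 5xx, or a 403 that usually means rate limiting) are pushed back onto the queue. A minimal runnable sketch of that control flow, using hypothetical stand-ins (`getAll`, `queue`, `seen`) for the real requestor and crawler state:

// Sketch of the 0.1.2 crawl loop; getAll/queue/seen are stand-ins.
const queue = [{ url: 'https://api.github.com/orgs/contoso', type: 'orgs' }];
const seen = {};

function getAll(url) {
  // Stand-in for the requestor: resolves with a response body, or rejects
  // with an error that may carry a `response` object.
  return Promise.resolve({ login: 'contoso' });
}

function start() {
  const crawlRequest = queue.shift();
  if (!crawlRequest) {
    console.log('Queue empty');
    return;
  }
  if (seen[crawlRequest.url]) {
    console.log(`Skipped ${crawlRequest.url} [${crawlRequest.type}]`);
    setTimeout(start, 0);
    return;
  }
  getAll(crawlRequest.url).then(
    body => {
      seen[crawlRequest.url] = true;
      // ...dispatch on crawlRequest.type and upsert the document here...
      console.log(`Crawled ${crawlRequest.url} [${crawlRequest.type}]`);
      setTimeout(start, 0); // re-enter the loop without growing the stack
    },
    err => {
      // Retry transport errors, 5xx, and 403 (likely rate limiting).
      if (!err.response || err.response.statusCode >= 500 || err.response.statusCode === 403) {
        queue.push(crawlRequest); // still no attempt count, per the TODO in the diff
      }
      setTimeout(start, 0);
    });
}

start();

Note the `setTimeout(..., 0)` rather than direct recursion: each crawl step is scheduled as a fresh task, so an arbitrarily long queue cannot overflow the stack.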
@@ -101,5 +102,5 @@ }
   _getRequestor() {
-    return new requestor({
+    return new this.requestor({
       headers: {
-        authorization: config.githubToken
+        authorization: this.config.githubToken
       }
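This last hunk completes the same theme: the requestor class and the config holding the GitHub token are read off the instance instead of module scope, which makes both injectable. A sketch of the constructor shape this implies; the parameter list is assumed from the fields the diff touches, not confirmed by the package source:

// Assumed constructor: each dependency the diff references becomes injectable.
class Crawler {
  constructor(queue, store, requestor, config, logger) {
    this.queue = queue;         // crawl requests waiting to be fetched
    this.store = store;         // document store behind upsert()
    this.requestor = requestor; // class instantiated per call in _getRequestor()
    this.config = config;       // supplies githubToken for the authorization header
    this.logger = logger;       // replaces the free `logger` identifier from 0.1.1
    this.seen = {};
  }
}

Injected dependencies also explain the first hunk above: `logger` was a free identifier in 0.1.1, while 0.1.2 consistently uses `this.logger` (or `self.logger` inside callbacks).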
 {
   "name": "ghcrawler",
-  "version": "0.1.1",
+  "version": "0.1.2",
   "description": "A robust GitHub API crawler that walks a queue of GitHub entities retrieving and storing their contents.",
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but do indicate a change to the security surface area of a package.
Found 1 instance in 1 package
Environment variable access
Supply chain risk: Package accesses environment variables, which may be a sign of credential stuffing or data theft.
Found 1 instance in 1 package
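This alert corresponds to code visible in the diff above: 0.1.1 read `process.env['GITHUB_TOKEN']` inline via `.auth()` on every request, while 0.1.2 sends a token from injected config as an `authorization` header. A minimal sketch of the header-based variant; the `config` shape and the `token ` prefix are assumptions, not taken from the package:

// Hypothetical: the token still comes from the environment, but once at
// configuration time rather than at every call site.
const https = require('https');

const config = { githubToken: `token ${process.env.GITHUB_TOKEN || ''}` };

function get(url) {
  return new Promise((resolve, reject) => {
    const options = { headers: { authorization: config.githubToken, 'user-agent': 'ghcrawler-sketch' } };
    https.get(url, options, res => {
      let data = '';
      res.on('data', chunk => (data += chunk));
      res.on('end', () => {
        try { resolve(JSON.parse(data)); } catch (e) { reject(e); }
      });
    }).on('error', reject);
  });
}

// Usage: get('https://api.github.com/user').then(console.log, console.error);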