website-scraper
Advanced tools
Comparing version 0.2.2 to 0.2.3
@@ -8,2 +8,3 @@ var Promise = require('bluebird'); | ||
var css = require('./utils/css'); | ||
var request = require('./utils/request'); | ||
var Logger = require('./log.js'); | ||
@@ -15,3 +16,4 @@ var defaults = require('./config/defaults.js'); | ||
loadedFiles = {}, | ||
logger; | ||
logger, | ||
makeRequest; | ||
@@ -37,11 +39,12 @@ options = _.extend(defaults, data); | ||
filename = utils.trimFilename(filename); | ||
var occupiedFilenames = getAllLoadedFilenames(), | ||
ext = path.extname(filename), | ||
staticDir = getDirectoryByExtension(ext), | ||
fullPath = path.join(options.directory, staticDir, filename), | ||
basename = path.basename(filename, ext), | ||
index = 0; | ||
var occupiedFilenames = getAllLoadedFilenames(); | ||
var ext = path.extname(filename); | ||
var staticDir = getDirectoryByExtension(ext); | ||
var fullPath = path.join(options.directory, staticDir, filename); | ||
var basename = path.basename(filename, ext); | ||
var index = 1; | ||
while (occupiedFilenames.indexOf(fullPath) >= 0) { | ||
fullPath = path.join(options.directory, staticDir, basename + '_' + ++index + ext); | ||
while (_.contains(occupiedFilenames, fullPath)) { | ||
fullPath = path.join(options.directory, staticDir, basename + '_' + index + ext); | ||
index++; | ||
} | ||
@@ -59,4 +62,4 @@ return fullPath; | ||
function getFileProcessingFunction(url, filename) { | ||
var ext = path.extname(filename); | ||
function getFileProcessingFunction(options) { | ||
var ext = path.extname(options.filename); | ||
var processingFunction; | ||
@@ -66,6 +69,6 @@ | ||
case '.css': | ||
processingFunction = loadCssSources.bind(null, url, filename); | ||
processingFunction = loadCssSources.bind(null, options.url, options.filename); | ||
break; | ||
case '.html': | ||
processingFunction = loadHtml.bind(null, url, filename); | ||
processingFunction = loadHtml.bind(null, options.url, options.filename); | ||
break; | ||
@@ -86,18 +89,15 @@ default: | ||
if (!localFilename) { | ||
// Get filename | ||
localFilename = getFilename(options.filename || path.basename(options.url)); | ||
// Set file as loaded | ||
setLoadedFilename(options.url, localFilename); | ||
options.filename = getFilename(options.filename || path.basename(options.url)); | ||
setLoadedFilename(options.url, options.filename); | ||
// Request -> processing -> save to fs | ||
return utils.makeRequest(options.url).then(function requestCompleted(data) { | ||
return makeRequest(options.url).then(function requestCompleted(data) { | ||
options.url = data.url; // Url may be changed in redirects | ||
fileProcessingFunction = getFileProcessingFunction(options.url, localFilename); | ||
fileProcessingFunction = getFileProcessingFunction(options); | ||
return fileProcessingFunction(data.body); | ||
}).then(function saveFileToFS(text) { | ||
return fs.outputFileAsync(localFilename, text, {encoding: 'binary'}); | ||
return fs.outputFileAsync(options.filename, text, {encoding: 'binary'}); | ||
}).then(function fileSavedToFS() { | ||
logger.log(options.url + ' -> ' + localFilename); | ||
return localFilename; | ||
logger.log(options.url + ' -> ' + options.filename); | ||
return options.filename; | ||
}); | ||
@@ -139,13 +139,2 @@ } | ||
// // Update hrefs for loaded paths | ||
// $('a[href]').each(function () { | ||
// var self = $(this); | ||
// var href = self.attr('href'); | ||
// var url = utils.getUrl(currentOptions.url, href); | ||
// var localFileForUrl = _.findWhere(options.urlsToLoad, { url: url }); | ||
// if (localFileForUrl) { | ||
// self.attr('href', localFileForUrl.filename); | ||
// } | ||
// }); | ||
return Promise.resolve($.html()); | ||
@@ -234,2 +223,5 @@ } | ||
// Create makeRequest function with custom request params | ||
makeRequest = request.makeRequest.bind(null, options.request); | ||
// Create map { url -> local filename } for downloading | ||
@@ -236,0 +228,0 @@ return Promise.resolve(adaptLoadOptions(options.urls)); |
@@ -10,16 +10,12 @@ var _ = require('underscore'); | ||
var commentRegexp = /\/\*([\s\S]*?)\*\//g; | ||
var sourcesRegexps = [ | ||
/(@import[\s]*['"]?[\s]*)([\s\S]*?)([\s]*['"]?;)/ig, | ||
/((?:@import[\s]*)?url[\s]*\([\s'"]*)([\s\S]*?)([\s'"]*\))/ig | ||
]; | ||
var sourcesRegexp = /((?:@import\s+)?url\s*\(['"]?)(\S*?)(['"]?\s*\))|(@import\s+['"]?)([^;'"]+)/ig | ||
var paths = []; | ||
var urlMatch; | ||
text = text.replace(commentRegexp, ''); | ||
_.each(sourcesRegexps, function (regexp) { | ||
var urlMatch; | ||
while (urlMatch = regexp.exec(text)) { | ||
paths.push(urlMatch[2]); | ||
} | ||
}); | ||
while (urlMatch = sourcesRegexp.exec(text)) { | ||
// Match 2 group if '[@import] url(path)', match 5 group if '@import path' | ||
paths.push(urlMatch[2]||urlMatch[5]); | ||
} | ||
@@ -26,0 +22,0 @@ return _.chain(paths) |
var url = require('url'); | ||
var Promise = require('bluebird'); | ||
var request = Promise.promisifyAll(require('request')); | ||
@@ -28,17 +26,2 @@ function trimFilename(filename) { | ||
function makeRequest(url) { | ||
return request.getAsync({ | ||
url: url, | ||
method: 'GET', | ||
encoding: 'binary', | ||
strictSSL: false, | ||
jar: true | ||
}).then(function (data) { | ||
return { | ||
url: data[0].request.href, | ||
body: data[0].body | ||
} | ||
}); | ||
} | ||
module.exports.isUrl = isUrl; | ||
@@ -48,2 +31,1 @@ module.exports.getUrl = getUrl; | ||
module.exports.trimFilename = trimFilename; | ||
module.exports.makeRequest = makeRequest; |
{ | ||
"name": "website-scraper", | ||
"version": "0.2.2", | ||
"version": "0.2.3", | ||
"description": "full web-page's scraping including all css, images, js, etc.", | ||
@@ -34,3 +34,3 @@ "readmeFilename": "README.md", | ||
"bluebird": "^2.3.2", | ||
"fs-extra": "^0.12.0", | ||
"fs-extra": "^0.16.4", | ||
"cheerio": "0.11.0", | ||
@@ -41,5 +41,5 @@ "request": "^2.42.0", | ||
"devDependencies": { | ||
"should": "^4.3.0", | ||
"should": "^5.0.0", | ||
"mocha": "^2.0.1" | ||
} | ||
} |
@@ -1,2 +0,2 @@ | ||
##Introduction | ||
## Introduction | ||
Node.js module for website's scraping with images, css, js, etc. | ||
@@ -12,6 +12,6 @@ | ||
##Installation | ||
## Installation | ||
`npm install website-scraper` | ||
##Usage | ||
## Usage | ||
```javascript | ||
@@ -35,3 +35,3 @@ var scraper = require('website-scraper'); | ||
##API | ||
## API | ||
### scrape(options, callback) | ||
@@ -48,2 +48,3 @@ Makes request to `url` and saves all files found with `sources` to `directory`. | ||
- `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)* | ||
- `request`: object, custom options for [request](https://github.com/request/request) *(optional, see example below)* | ||
@@ -59,3 +60,3 @@ | ||
##Examples | ||
## Examples | ||
Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`. | ||
@@ -90,3 +91,8 @@ Imagine we want to load: | ||
{selector: 'script', attr: 'src'} | ||
] | ||
], | ||
request: { | ||
headers: { | ||
'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19' | ||
} | ||
} | ||
}).then(function (result) { | ||
@@ -97,3 +103,3 @@ console.log(result); | ||
##Dependencies | ||
## Dependencies | ||
@@ -100,0 +106,0 @@ - cheerio |
@@ -7,6 +7,5 @@ var css = require('../lib/utils/css'); | ||
it('should return true if src is base64-encoded', function(){ | ||
var base64_1 = ' \ | ||
ABlBMVEUAAAD///+l2Z/dAAAAM0lEQVR4nGP4/5/h/1+G/58ZDrAz3D/McH8yw83NDDeN \ | ||
Ge4Ug9C9zwz3gVLMDA/A6P9/AFGGFyjOXZtQAAAAAElFTkSuQmCC'; | ||
var base64_1 = ''; | ||
var base64_2 = 'data:text/plain;base64,SGVsbG8sIFdvcmxkIQ%3D%3D'; | ||
css.isEmbedded(base64_1).should.be.true; | ||
@@ -18,2 +17,3 @@ css.isEmbedded(base64_2).should.be.true; | ||
var url = 'https://www.google.com.ua/images/srpr/logo11w.png'; | ||
css.isEmbedded(path).should.be.false; | ||
@@ -24,3 +24,42 @@ css.isEmbedded(url).should.be.false; | ||
describe('#getSourcesPaths(text)', function(){ | ||
it('should return array of entries from url(...), @import url(...) and @import ...', function(){ | ||
var text = '\ | ||
@import url("a.css"); \ | ||
@import url(\'b.css\') tv; \ | ||
@import url(c.css); \ | ||
@import "d.css" screen; \ | ||
@import \'e.css\'; \ | ||
@import f.css; \ | ||
background-image url ("g.css"); \ | ||
background-image url (\'h.css\'); \ | ||
background-image url (i.css); \ | ||
'; | ||
var paths = css.getSourcesPaths(text); | ||
paths.should.be.instanceof(Array).and.have.lengthOf(9); | ||
paths.should.containEql('a.css'); | ||
paths.should.containEql('b.css'); | ||
paths.should.containEql('c.css'); | ||
paths.should.containEql('d.css'); | ||
paths.should.containEql('e.css'); | ||
paths.should.containEql('f.css'); | ||
paths.should.containEql('g.css'); | ||
paths.should.containEql('h.css'); | ||
paths.should.containEql('i.css'); | ||
}); | ||
it('should not return duplicate paths', function(){ | ||
var text = '\ | ||
@import url("a.css"); \ | ||
@import a.css; \ | ||
background-image url ("a.css"); \ | ||
'; | ||
var paths = css.getSourcesPaths(text); | ||
paths.should.be.instanceof(Array).and.have.lengthOf(1); | ||
paths.should.containEql('a.css'); | ||
}); | ||
}); | ||
}); |
@@ -54,8 +54,2 @@ var utils = require('../lib/utils/utils'); | ||
}); | ||
describe('#makeRequest(url)', function(){ | ||
// it('should be a promise', function(){ | ||
// | ||
// }); | ||
}); | ||
}); |
24485
15
507
105
+ Added fs-extra@0.16.5 (transitive)
+ Added graceful-fs@3.0.12 (transitive)
+ Added natives@1.1.6 (transitive)
- Removed fs-extra@0.12.0 (transitive)
- Removed minimist@1.2.8 (transitive)
- Removed mkdirp@0.5.6 (transitive)
- Removed ncp@0.6.0 (transitive)
Updated fs-extra@^0.16.4