website-scraper
Advanced tools
Comparing version 0.1.0 to 0.1.1
var Scraper = require('./lib/load.js'); | ||
module.exports.scrape = function (data, callback) { | ||
return new Scraper(data).scrape(callback); | ||
module.exports.scrape = function (options, callback) { | ||
return new Scraper(options).scrape(callback); | ||
}; |
@@ -7,34 +7,34 @@ var config = { | ||
selector: 'img', | ||
attributeName: 'src' | ||
attr: 'src' | ||
}, | ||
{ | ||
selector: 'input', | ||
attributeName: 'src' | ||
attr: 'src' | ||
}, | ||
{ | ||
selector: 'object', | ||
attributeName: 'data' | ||
attr: 'data' | ||
}, | ||
{ | ||
selector: 'embed', | ||
attributeName: 'src' | ||
attr: 'src' | ||
}, | ||
{ | ||
selector: 'param[name="movie"]', | ||
attributeName: 'value' | ||
attr: 'value' | ||
}, | ||
{ | ||
selector: 'script', | ||
attributeName: 'src' | ||
attr: 'src' | ||
}, | ||
{ | ||
selector: 'link[rel="stylesheet"]', | ||
attributeName: 'href' | ||
attr: 'href' | ||
}, | ||
{ | ||
selector: 'link[rel*="icon"]', | ||
attributeName: 'href' | ||
attr: 'href' | ||
}, | ||
], | ||
staticDirectories: [ | ||
directories: [ | ||
{ | ||
@@ -41,0 +41,0 @@ directory: 'images', |
@@ -9,9 +9,8 @@ var Promise = require('bluebird'), | ||
Logger = require('./log.js'), | ||
config = require('./defaults.js'); | ||
defaults = require('./defaults.js'); | ||
var encoding = 'binary'; | ||
/** @constructor */ | ||
var Loader = function (data) { | ||
var options = {}, | ||
var options = _.clone(data), | ||
encoding = 'binary', | ||
loadedFiles = {}, | ||
@@ -21,12 +20,6 @@ staticFullPaths, | ||
options.url = data.url; | ||
options.path = data.path; | ||
options.indexFile = _.has(data, 'indexFile') ? data.indexFile : config.indexFile; | ||
options.srcToLoad = _.has(data, 'srcToLoad') ? data.srcToLoad : config.srcToLoad; | ||
options.staticDirectories = _.has(data, 'staticDirectories') ? data.staticDirectories : config.staticDirectories; | ||
options.log = _.has(data, 'log') ? data.log : config.log; | ||
_.each(_.keys(defaults), function (key) { | ||
options[key] = _.has(options, key) ? options[key] : defaults[key]; | ||
}); | ||
staticFullPaths = _.map(options.staticDirectories, function (dir) { | ||
return path.resolve(options.path, dir.directory) | ||
}); | ||
logger = new Logger(options.log); | ||
@@ -100,3 +93,3 @@ | ||
function getDirectoryByExtension(ext) { | ||
var dirObj = _.chain(options.staticDirectories) | ||
var dirObj = _.chain(options.directories) | ||
.filter(function (dir) { | ||
@@ -251,5 +244,4 @@ return _.indexOf(dir.extensions, ext) >= 0 | ||
sourcesRegexps = [ | ||
/(url[\s]*\([\s'"]*)(.+?)([\s'"]*\))/gi, | ||
/(@import[\s]*['"]?[\s]*)(.+?)([\s]*['"]?;)/gi, | ||
/(@import[\s]*url[\s]*\([\s'"]*)(.+?)([\s'"]*\))/gi | ||
/((?:@import[\s]*)?url[\s]*\([\s'"]*)(.+?)([\s'"]*\))/gi | ||
], | ||
@@ -295,5 +287,10 @@ urlPromises = []; | ||
}) | ||
.then(function (html) { // Load css sources in index page | ||
.then(function (html) { | ||
fs.ensureDirSync(options.path); | ||
setLoadedFilename(options.url, options.path); | ||
staticFullPaths = _.map(options.directories, function (dir) { | ||
return path.resolve(options.path, dir.directory) | ||
}); | ||
return loadCssSources(html, options.url); | ||
@@ -310,3 +307,3 @@ }) | ||
p = p.then(function (newHtml) { | ||
return loadSources(newHtml, src.selector, src.attributeName) | ||
return loadSources(newHtml, src.selector, src.attr) | ||
}); | ||
@@ -318,7 +315,5 @@ }); | ||
p = p.then(function (html) { | ||
return fs.outputFileAsync(indexFilePath, html, {encoding: encoding}) | ||
}) | ||
.then(function () { | ||
return {status: 'success'} | ||
}); | ||
fs.outputFileSync(indexFilePath, html, {encoding: encoding}); | ||
return {html: html} | ||
}); | ||
@@ -328,15 +323,27 @@ return p; | ||
function errorCleanup() { | ||
return fs.removeAsync(options.path); | ||
} | ||
function noop() {} | ||
return { | ||
scrape: function (callback) { | ||
callback = typeof callback === 'function' ? callback : noop; | ||
if (!options.path) { | ||
return callback(new Error('Path is not defined')); | ||
} | ||
if (fs.existsSync(options.path)) { | ||
return callback(new Error('Path ' + options.path + ' exists!'), null); | ||
return callback(new Error('Path ' + options.path + ' exists')); | ||
} | ||
process() | ||
.then(function (res) { | ||
return callback(null, res) | ||
}) | ||
.catch(function (e) { | ||
return callback(e, null) | ||
}) | ||
.then(function (res, e) { | ||
if (e) { | ||
errorCleanup(); | ||
res = null; | ||
} | ||
return callback(e, res); | ||
}); | ||
} | ||
@@ -343,0 +350,0 @@ } |
{ | ||
"name": "website-scraper", | ||
"version": "0.1.0", | ||
"version": "0.1.1", | ||
"description": "full web-page's scraping including all css, images, js, etc.", | ||
@@ -35,4 +35,5 @@ "main": "index.js", | ||
"cheerio": "0.11.0", | ||
"request": "^2.42.0", | ||
"underscore": "^1.7.0" | ||
} | ||
} |
Sorry, the diff of this file is not supported yet
14901
5
8
417
+ Added request@^2.42.0
+ Added ajv@6.12.6 (transitive)
+ Added asn1@0.2.6 (transitive)
+ Added assert-plus@1.0.0 (transitive)
+ Added asynckit@0.4.0 (transitive)
+ Added aws-sign2@0.7.0 (transitive)
+ Added aws4@1.13.2 (transitive)
+ Added bcrypt-pbkdf@1.0.2 (transitive)
+ Added caseless@0.12.0 (transitive)
+ Added combined-stream@1.0.8 (transitive)
+ Added core-util-is@1.0.2 (transitive)
+ Added dashdash@1.14.1 (transitive)
+ Added delayed-stream@1.0.0 (transitive)
+ Added ecc-jsbn@0.1.2 (transitive)
+ Added extend@3.0.2 (transitive)
+ Added extsprintf@1.3.0 (transitive)
+ Added fast-deep-equal@3.1.3 (transitive)
+ Added fast-json-stable-stringify@2.1.0 (transitive)
+ Added forever-agent@0.6.1 (transitive)
+ Added form-data@2.3.3 (transitive)
+ Added getpass@0.1.7 (transitive)
+ Added har-schema@2.0.0 (transitive)
+ Added har-validator@5.1.5 (transitive)
+ Added http-signature@1.2.0 (transitive)
+ Added is-typedarray@1.0.0 (transitive)
+ Added isstream@0.1.2 (transitive)
+ Added jsbn@0.1.1 (transitive)
+ Added json-schema@0.4.0 (transitive)
+ Added json-schema-traverse@0.4.1 (transitive)
+ Added json-stringify-safe@5.0.1 (transitive)
+ Added jsprim@1.4.2 (transitive)
+ Added mime-db@1.52.0 (transitive)
+ Added mime-types@2.1.35 (transitive)
+ Added oauth-sign@0.9.0 (transitive)
+ Added performance-now@2.1.0 (transitive)
+ Added psl@1.15.0 (transitive)
+ Added punycode@2.3.1 (transitive)
+ Added qs@6.5.3 (transitive)
+ Added request@2.88.2 (transitive)
+ Added safe-buffer@5.2.1 (transitive)
+ Added safer-buffer@2.1.2 (transitive)
+ Added sshpk@1.18.0 (transitive)
+ Added tough-cookie@2.5.0 (transitive)
+ Added tunnel-agent@0.6.0 (transitive)
+ Added tweetnacl@0.14.5 (transitive)
+ Added uri-js@4.4.1 (transitive)
+ Added uuid@3.4.0 (transitive)
+ Added verror@1.10.0 (transitive)