New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign in | Demo | Install
Socket

website-scraper

Package Overview
Dependencies
Maintainers
1
Versions
60
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

website-scraper - npm Package Compare versions

Comparing version 0.2.2 to 0.2.3

lib/utils/request.js

60

lib/load.js

@@ -8,2 +8,3 @@ var Promise = require('bluebird');

var css = require('./utils/css');
var request = require('./utils/request');
var Logger = require('./log.js');

@@ -15,3 +16,4 @@ var defaults = require('./config/defaults.js');

loadedFiles = {},
logger;
logger,
makeRequest;

@@ -37,11 +39,12 @@ options = _.extend(defaults, data);

filename = utils.trimFilename(filename);
var occupiedFilenames = getAllLoadedFilenames(),
ext = path.extname(filename),
staticDir = getDirectoryByExtension(ext),
fullPath = path.join(options.directory, staticDir, filename),
basename = path.basename(filename, ext),
index = 0;
var occupiedFilenames = getAllLoadedFilenames();
var ext = path.extname(filename);
var staticDir = getDirectoryByExtension(ext);
var fullPath = path.join(options.directory, staticDir, filename);
var basename = path.basename(filename, ext);
var index = 1;
while (occupiedFilenames.indexOf(fullPath) >= 0) {
fullPath = path.join(options.directory, staticDir, basename + '_' + ++index + ext);
while (_.contains(occupiedFilenames, fullPath)) {
fullPath = path.join(options.directory, staticDir, basename + '_' + index + ext);
index++;
}

@@ -59,4 +62,4 @@ return fullPath;

function getFileProcessingFunction(url, filename) {
var ext = path.extname(filename);
function getFileProcessingFunction(options) {
var ext = path.extname(options.filename);
var processingFunction;

@@ -66,6 +69,6 @@

case '.css':
processingFunction = loadCssSources.bind(null, url, filename);
processingFunction = loadCssSources.bind(null, options.url, options.filename);
break;
case '.html':
processingFunction = loadHtml.bind(null, url, filename);
processingFunction = loadHtml.bind(null, options.url, options.filename);
break;

@@ -86,18 +89,15 @@ default:

if (!localFilename) {
// Get filename
localFilename = getFilename(options.filename || path.basename(options.url));
// Set file as loaded
setLoadedFilename(options.url, localFilename);
options.filename = getFilename(options.filename || path.basename(options.url));
setLoadedFilename(options.url, options.filename);
// Request -> processing -> save to fs
return utils.makeRequest(options.url).then(function requestCompleted(data) {
return makeRequest(options.url).then(function requestCompleted(data) {
options.url = data.url; // Url may be changed in redirects
fileProcessingFunction = getFileProcessingFunction(options.url, localFilename);
fileProcessingFunction = getFileProcessingFunction(options);
return fileProcessingFunction(data.body);
}).then(function saveFileToFS(text) {
return fs.outputFileAsync(localFilename, text, {encoding: 'binary'});
return fs.outputFileAsync(options.filename, text, {encoding: 'binary'});
}).then(function fileSavedToFS() {
logger.log(options.url + ' -> ' + localFilename);
return localFilename;
logger.log(options.url + ' -> ' + options.filename);
return options.filename;
});

@@ -139,13 +139,2 @@ }

// // Update hrefs for loaded paths
// $('a[href]').each(function () {
// var self = $(this);
// var href = self.attr('href');
// var url = utils.getUrl(currentOptions.url, href);
// var localFileForUrl = _.findWhere(options.urlsToLoad, { url: url });
// if (localFileForUrl) {
// self.attr('href', localFileForUrl.filename);
// }
// });
return Promise.resolve($.html());

@@ -234,2 +223,5 @@ }

// Create makeRequest function with custom request params
makeRequest = request.makeRequest.bind(null, options.request);
// Create map { url -> local filename } for downloading

@@ -236,0 +228,0 @@ return Promise.resolve(adaptLoadOptions(options.urls));

@@ -10,16 +10,12 @@ var _ = require('underscore');

var commentRegexp = /\/\*([\s\S]*?)\*\//g;
var sourcesRegexps = [
/(@import[\s]*['"]?[\s]*)([\s\S]*?)([\s]*['"]?;)/ig,
/((?:@import[\s]*)?url[\s]*\([\s'"]*)([\s\S]*?)([\s'"]*\))/ig
];
var sourcesRegexp = /((?:@import\s+)?url\s*\(['"]?)(\S*?)(['"]?\s*\))|(@import\s+['"]?)([^;'"]+)/ig
var paths = [];
var urlMatch;
text = text.replace(commentRegexp, '');
_.each(sourcesRegexps, function (regexp) {
var urlMatch;
while (urlMatch = regexp.exec(text)) {
paths.push(urlMatch[2]);
}
});
while (urlMatch = sourcesRegexp.exec(text)) {
// Match 2 group if '[@import] url(path)', match 5 group if '@import path'
paths.push(urlMatch[2]||urlMatch[5]);
}

@@ -26,0 +22,0 @@ return _.chain(paths)

var url = require('url');
var Promise = require('bluebird');
var request = Promise.promisifyAll(require('request'));

@@ -28,17 +26,2 @@ function trimFilename(filename) {

/**
 * Issues a GET request for `url` through the promisified `request` module and
 * reduces the result to the two fields the rest of the scraper consumes.
 *
 * @param {string} url - address to fetch
 * @returns {Promise} resolves to `{ url, body }`, where `url` is the `href` of
 *   the request object attached to the first element of the resolved array and
 *   `body` is that element's body
 */
function makeRequest(url) {
  // Fixed request settings: binary encoding, cookie jar on, strict SSL off.
  var requestOptions = {
    url: url,
    method: 'GET',
    encoding: 'binary',
    strictSSL: false,
    jar: true
  };

  // getAsync resolves with an array; only its first element is used.
  function toResult(data) {
    var response = data[0];
    return {
      url: response.request.href,
      body: response.body
    };
  }

  return request.getAsync(requestOptions).then(toResult);
}
module.exports.isUrl = isUrl;

@@ -48,2 +31,1 @@ module.exports.getUrl = getUrl;

module.exports.trimFilename = trimFilename;
module.exports.makeRequest = makeRequest;
{
"name": "website-scraper",
"version": "0.2.2",
"version": "0.2.3",
"description": "full web-page's scraping including all css, images, js, etc.",

@@ -34,3 +34,3 @@ "readmeFilename": "README.md",

"bluebird": "^2.3.2",
"fs-extra": "^0.12.0",
"fs-extra": "^0.16.4",
"cheerio": "0.11.0",

@@ -41,5 +41,5 @@ "request": "^2.42.0",

"devDependencies": {
"should": "^4.3.0",
"should": "^5.0.0",
"mocha": "^2.0.1"
}
}

@@ -1,2 +0,2 @@

##Introduction
## Introduction
Node.js module for website's scraping with images, css, js, etc.

@@ -12,6 +12,6 @@

##Installation
## Installation
`npm install website-scraper`
##Usage
## Usage
```javascript

@@ -35,3 +35,3 @@ var scraper = require('website-scraper');

##API
## API
### scrape(options, callback)

@@ -48,2 +48,3 @@ Makes request to `url` and saves all files found with `sources` to `directory`.

- `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- `request`: object, custom options for [request](https://github.com/request/request) *(optional, see example below)*

@@ -59,3 +60,3 @@

##Examples
## Examples
Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.

@@ -90,3 +91,8 @@ Imagine we want to load:

{selector: 'script', attr: 'src'}
]
],
request: {
headers: {
'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'
}
}
}).then(function (result) {

@@ -97,3 +103,3 @@ console.log(result);

##Dependencies
## Dependencies

@@ -100,0 +106,0 @@ - cheerio

@@ -7,6 +7,5 @@ var css = require('../lib/utils/css');

it('should return true if src is base64-encoded', function(){
var base64_1 = ' \
ABlBMVEUAAAD///+l2Z/dAAAAM0lEQVR4nGP4/5/h/1+G/58ZDrAz3D/McH8yw83NDDeN \
Ge4Ug9C9zwz3gVLMDA/A6P9/AFGGFyjOXZtQAAAAAElFTkSuQmCC';
var base64_1 = '';
var base64_2 = 'data:text/plain;base64,SGVsbG8sIFdvcmxkIQ%3D%3D';
css.isEmbedded(base64_1).should.be.true;

@@ -18,2 +17,3 @@ css.isEmbedded(base64_2).should.be.true;

var url = 'https://www.google.com.ua/images/srpr/logo11w.png';
css.isEmbedded(path).should.be.false;

@@ -24,3 +24,42 @@ css.isEmbedded(url).should.be.false;

// Specs for css.getSourcesPaths(text): the paths it returns for CSS text that
// contains url(...) and @import references.
describe('#getSourcesPaths(text)', function(){
// The fixture mixes `@import url(...)`, bare `@import ...` and plain `url (...)`
// references, each with a double-quoted, single-quoted and unquoted path, so
// all nine distinct paths (a.css through i.css) are expected in the result.
it('should return array of entries from url(...), @import url(...) and @import ...', function(){
var text = '\
@import url("a.css"); \
@import url(\'b.css\') tv; \
@import url(c.css); \
@import "d.css" screen; \
@import \'e.css\'; \
@import f.css; \
background-image url ("g.css"); \
background-image url (\'h.css\'); \
background-image url (i.css); \
';
var paths = css.getSourcesPaths(text);
paths.should.be.instanceof(Array).and.have.lengthOf(9);
paths.should.containEql('a.css');
paths.should.containEql('b.css');
paths.should.containEql('c.css');
paths.should.containEql('d.css');
paths.should.containEql('e.css');
paths.should.containEql('f.css');
paths.should.containEql('g.css');
paths.should.containEql('h.css');
paths.should.containEql('i.css');
});
// The same path referenced through three different syntaxes must be reported
// only once.
it('should not return duplicate paths', function(){
var text = '\
@import url("a.css"); \
@import a.css; \
background-image url ("a.css"); \
';
var paths = css.getSourcesPaths(text);
paths.should.be.instanceof(Array).and.have.lengthOf(1);
paths.should.containEql('a.css');
});
});
});

@@ -54,8 +54,2 @@ var utils = require('../lib/utils/utils');

});
describe('#makeRequest(url)', function(){
// it('should be a promise', function(){
//
// });
});
});
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc