Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

crawlit

Package Overview
Dependencies
Maintainers
1
Versions
6
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

crawlit - npm Package Compare versions

Comparing version 0.1.2 to 0.1.3

config.local.js

44

lib/crawlQueue.js

@@ -14,3 +14,4 @@ "use strict";

maxConnections: 2,
crawlingTaskCount: 0
crawlingTaskCount: 0,
updateMode: true
};

@@ -20,2 +21,3 @@

PREPARE: 0,
UPDATE: 5,
RETRY: 1,

@@ -33,4 +35,6 @@ PROCESS: 2,

//crawl objects queue data cache.
var QUEUE = [];
var crawlQueue = function (baseDir, crawl) {
var QUEUE = [];
var that = {};

@@ -59,2 +63,11 @@

QUEUE = utilBox.loadJSON(queueDumFile, []);
QUEUE.forEach(function (uriObj) {
if (uriObj.status === QUEUE_STATUS.PROCESS) {
process.nextTick(crawl.bind(null, uriObj));
queueConfig.crawlingTaskCount += 1;
}
if (queueConfig.updateMode && uriObj.status === QUEUE_STATUS.CRAWLED && uriObj.type === 'link') {
uriObj.status = QUEUE_STATUS.UPDATE;
}
});
assert(Array.isArray(QUEUE));

@@ -71,2 +84,5 @@ };

*/
var memUsage = process.memoryUsage().rss / (1024 * 1024);
var next = function () {

@@ -79,14 +95,20 @@ var count = 0;

if (queueConfig.crawlingTaskCount < queueConfig.maxConnections &&
(uriObj.status === QUEUE_STATUS.PREPARE ||
uriObj.status === QUEUE_STATUS.RETRY && uriObj.failedCount < maxRetryCount)) {
if (memUsage < queueConfig.maxMem &&
queueConfig.crawlingTaskCount < queueConfig.maxConnections &&
( uriObj.status === QUEUE_STATUS.PREPARE ||
(uriObj.status === QUEUE_STATUS.RETRY && uriObj.failedCount < maxRetryCount) ||
(uriObj.status === QUEUE_STATUS.UPDATE && queueConfig.updateMode) )) {
process.nextTick(crawl.bind(null, uriObj));
setImmediate(crawl.bind(null, uriObj));
//process.nextTick(crawl.bind(null, uriObj));
uriObj.status = QUEUE_STATUS.PROCESS;
count += 1;
queueConfig.crawlingTaskCount += 1;
return queueConfig.crawlingTaskCount >= queueConfig.maxConnections;
}
});
queueConfig.crawlingTaskCount = count;
if (queueConfig.crawlingTaskCount === 0) {
if (retry > 2) {
if (retry > 0) {
clearInterval(timer);

@@ -109,3 +131,2 @@ console.log('queue is empty Exit dump Queue listener....');

var memUsage = process.memoryUsage().rss / (1024 * 1024);

@@ -116,3 +137,3 @@ console.log('Memory usage:', memUsage.toFixed(2), 'Mb');

}
if (memUsage > queueConfig.maxMem) {
if (memUsage > queueConfig.maxMem && queueConfig.crawlingTaskCount === 0) {
console.error("Over max memory usage, exit process.");

@@ -125,3 +146,3 @@ process.exit(1);

//TODO Update and dump queue by Event driven, remove setInterval() calling..
var timer = setInterval(dump, 5 * 1000);
var timer = setInterval(dump, 8 * 1000);
};

@@ -134,5 +155,4 @@

* type:[attachment,css,link.img],
* status:0:未开始 1:更新, 2:Retry 3:进行中,4:成功,5:失败,
* status: @see QUEUE_STATUS
* failedCount:3,
* updateFlag:anyeType}
*/

@@ -139,0 +159,0 @@ that.push = function (uriObj) {

@@ -14,4 +14,4 @@ "use strict";

var request = require('request');
var jsdom = require('jsdom');
//var request = require('http').request;
//var jsdom = require('jsdom');
var cheerio = require('cheerio');
var iconv = require('iconv-lite');

@@ -23,2 +23,3 @@ var charset = require('charset');

var utilBox = require('./utilbox.js');
var jsdomCount = 0;

@@ -28,2 +29,3 @@ //Load jQuery source code to string.

var resourceParser = config.crawlOptions.resourceParser;
//prepare working path

@@ -37,3 +39,3 @@ var ROOT_PATH = config.crawlOptions.working_root_path;

var getFilename = function (uriObj) {
return config.crawlOptions.resourceParser.getFileName(ROOT_PATH, uriObj);
return resourceParser.getFileName(uriObj);
};

@@ -51,9 +53,8 @@

var getBaseURI = function (window) {
var getBaseURI = function ($base, locationHref) {
//get baseURI, document.baseURI is not working in jsdom.
var $base = window.$('base');
var urlObj = url.parse(window.location.href);
var urlObj = url.parse(locationHref);
var basePath = $base.length === 1 ? $base.attr('href') : path.dirname(urlObj.pathname);
if (!url.parse(basePath).host) {
basePath = url.resolve(window.location.href, basePath);
basePath = url.resolve(locationHref, basePath);
}

@@ -69,7 +70,12 @@ return basePath;

var uri = uriObj['uri'];
console.log("Crawl :", uriObj.type, uri);
console.log("Crawl :", uriObj.type, uri);
console.debug('File', uriObj.filename);
Object.defineProperty(uriObj, 'distFilename', {
value: path.join(ROOT_PATH, url.parse(uriObj.uri).hostname, uriObj.filename),
writable: true,
numerable: false,
configurable: true});
utilBox.preparePath(path.dirname(uriObj.filename));
utilBox.preparePath(path.dirname(uriObj.distFilename));

@@ -103,3 +109,3 @@ /**

}
}).pipe(fs.createWriteStream(uriObj.filename));
}).pipe(fs.createWriteStream(uriObj.distFilename));
} else {

@@ -135,60 +141,67 @@ request.get({uri: uri, jar: true, encoding: null, headers: requestOptions}, function (err, res, body) {

//2. Remove absolute path to relative path
var regexp = new RegExp('http:\/\/' + config.crawlOptions.host.replace(/\./g, '\.'), 'g');
var outputCrawled = body.replace(regexp, '');
//3. save to disk
fs.writeFile(uriObj.filename, outputCrawled, function (err) {
if (err) {
return crawlEnd(err, uriObj);
}
outputCrawled = null;
console.info('Save to:', uriObj.filename);
});
var saveCrawledUri = function (output) {
var regexp = new RegExp('http:\/\/' + config.crawlOptions.host.replace(/\./g, '\.'), 'g');
var outputCrawled = output || body;
outputCrawled = outputCrawled.replace(regexp, '');
//console.warn(outputCrawled);
fs.writeFile(uriObj.distFilename, outputCrawled, function (err) {
if (err) {
return crawlEnd(err, uriObj);
}
outputCrawled = null;
console.info('Save to:', uriObj.filename);
});
};
if (!resourceParser.modify) {
saveCrawledUri();
}
//4. Remove all script for parsing HTML DOM.
body = body.replace(/<script.*?>.*?<\/script>/ig, '');
//5. parse links
jsdom.env({ html: body, src: [jQuerySrc],
done: function (err, window) {
body = null; //release memory
var $ = cheerio.load(body, { normalizeWhitespace: true, xmlMode: false });
if (err) {
window && window.close && window.close();
return crawlEnd(err, uriObj);
console.debug('Parse HTML :', uri);
var baseURI = getBaseURI($('base'), uriObj.uri);
//todo move linkStacks into resourcePraser.
var linkStacks = [];
var handler = {
parse: function (jQuerySelector, resourceType, callback) {
var resources = jQuerySelector;
if (resources) {
//todo use Object[key] to count;
linkStacks.push(resourceType + ': ' + resources.length);
resources.each(function (index, link) {
callback($(link));
});
} else {
linkStacks.push(resourceType, 0);
}
},
push: function (uriObj) {
var host2 = url.parse(uriObj.uri).host;
if (!host2) {
uriObj.uri = getAbsUrlPath(baseURI, uriObj.uri);
}
uriObj.filename = getFilename(uriObj);
crawler.push(uriObj);
}};
console.debug('Parse HTML :', uri);
var baseURI = getBaseURI(window);
var linkStacks = [];
var handler = {
parse: function (jQuerySelector, resourceType, callback) {
var resources = window.$(jQuerySelector);
if (resources) {
linkStacks.push(resourceType + ': ' + resources.length);
resources.each(function (index, link) {
callback(link);
});
} else {
linkStacks.push(resourceType, 0);
}
},
push: function (uriObj) {
var host2 = url.parse(uriObj.uri).host;
if (!host2) {
uriObj.uri = getAbsUrlPath(baseURI, uriObj.uri);
}
uriObj.filename = getFilename(uriObj);
crawler.push(uriObj);
}};
resourceParser.parseLinks($, handler);
if (resourceParser.modify) {
saveCrawledUri(resourceParser.modify($, handler));
} else {
//TODO change rules then move parseLines method downside.
//resourceParser.parseLinks($, handler);
}
config.crawlOptions.resourceParser.parseLinks(window.$, handler);
window.close();//Call window.close(); for memory leak.
window = null;
handler = null;
console.info('Resource count in page:', uriObj.uri, linkStacks);
linkStacks = null;
crawlEnd(null, uriObj);
}
});
handler = null;
console.info('Resource count in page:', uriObj.uri, linkStacks);
linkStacks = null;
crawlEnd(null, uriObj);
});

@@ -195,0 +208,0 @@ }

@@ -25,2 +25,4 @@ "use strict";

var utilBox = require('../../utilbox.js');
var console = require('../../logger.js').getLogger('info');
console.log = console.debug;

@@ -38,8 +40,10 @@ var finishedStack = {};

var attachmentFilePath = function (baseDir, uriObj) {
var attachmentFilePath = function (uriObj) {
//path name = __dirname/hostname/attachment/tid/uid-paramnames-n
var realIdArray = decode(uriObj.query.aid).split('|');
console.log('realId.array: ', realIdArray);
console.log('16num to 10', parseInt(realIdArray[1], 16));
var filename = Object.keys(uriObj.query).reduce(function (pre, current, index) {
//console.log(index, 'pre:', pre, 'current:', current);
console.log(index, 'current:', current, 'pre:', pre);
//Add other param to filenameArray

@@ -52,3 +56,6 @@ if (!/(mod|aid)/i.test(current)) {

return path.join(baseDir, uriObj.hostname, 'attachment', realIdArray.pop(), filename.join('-'));
console.log('realId.array: ', realIdArray);
console.log('filename:', filename);
return path.join('attachment', realIdArray.pop(), filename.join('-'));
};

@@ -129,18 +136,7 @@

exports.getAttFilePath = function (dir, uri) {
exports.getAttFilePath = function (uri) {
var urlObj = url.parse(uri, true);
return attachmentFilePath(dir, urlObj);
return attachmentFilePath(urlObj);
};
exports.exists = function (uri) {
var uriObj = url.parse(uri, true);
if (isAttach(uriObj)) {
var filePath = attachmentFilePath(uriObj);
//console.log('filePath:', filePath, ' exist:', fs.existsSync(filePath));
return fs.existsSync(filePath);
} else {
return false;
}
};
exports.clear = function () {

@@ -147,0 +143,0 @@ clearFishied.call(null, cleanAttFiles);

@@ -19,25 +19,25 @@ "use strict";

//pipe images
handler.parse('img[file^="forum.php?mod=attachment"]', "images", function (img) {
handler.push({uri: img.getAttribute('file'), type: 'attachment'});
handler.push({uri: img.getAttribute('zoomfile'), type: 'attachment'});
handler.parse($('img[file^="forum.php?mod=attachment"]'), "images", function (img) {
handler.push({uri: img.attr('file'), type: 'attachment'});
handler.push({uri: img.attr('zoomfile'), type: 'attachment'});
});
//pipe attachments and not need JB
handler.parse('a[href^="forum.php?mod=attachment"]', 'attachments', function (attachment) {
var needJB = $('#' + (attachment.parentNode.id || attachment.id) + '_menu').text().indexOf('金币') > 0;
handler.parse($('a[href^="forum.php?mod=attachment"]'), 'attachments', function (attachment) {
var needJB = $('#' + (attachment.parent.attr('id') || attachment.attr('id')) + '_menu').text().indexOf('金币') > 0;
if (!needJB) {
//console.log('attachment:', attachment.href, $('#' + $(attachment).parent().id + '_menu').text());
handler.push({uri: (attachment.href), type: 'attachment'});
handler.push({uri: (attachment.attr('href')), type: 'attachment'});
}
});
handler.parse('img', 'image', function (link) {
handler.push({uri: link.src, type: 'img'});
handler.parse($('img'), 'image', function (link) {
handler.push({uri: link.attr('src'), type: 'img'});
});
handler.parse('link[rel="stylesheet"]', 'css', function (link) {
handler.push({uri: link.href, type: 'css'});
handler.parse($('link[rel="stylesheet"]'), 'css', function (link) {
handler.push({uri: link.attr('href'), type: 'css'});
});
handler.parse('sscript[src^="static/"]', 'js', function (link) {
handler.parse($('sscript[src^="static/"]'), 'js', function (link) {
handler.push({uri: $(link).attr('src'), type: 'js'});

@@ -50,43 +50,41 @@ });

//thread links
handler.parse('a[href$=".html"][href^="forum"]', 'forum', function (link) {
handler.push({uri: link.href, type: 'link'});
handler.parse($('a[href$=".html"][href^="forum"]'), 'forum', function (link) {
handler.push({uri: link.attr('href'), type: 'link'});
});
handler.parse('a[href$=".html"][href^="thread"]', 'thread', function (link) {
handler.push({uri: link.href, type: 'link'});
handler.parse($('a[href$=".html"][href^="thread"]'), 'thread', function (link) {
handler.push({uri: link.attr('href'), type: 'link'});
});
//archive links
handler.parse('a[href^="archiver/"],a[href^="?tid-"]', 'Archive', function (a) {
handler.push({uri: a.href, type: 'link'});
handler.parse($('a[href^="archiver/"],a[href^="?tid-"]'), 'Archive', function (a) {
handler.push({uri: a.attr('href'), type: 'link'});
});
//forum links
handler.parse('a[href^="forum.php?gid="] ', 'forum', function (a) {
handler.push({uri: a.href, type: 'link'});
handler.parse($('a[href^="forum.php?gid="] '), 'forum', function (a) {
handler.push({uri: a.attr('href'), type: 'link'});
});
handler.parse('a[href="group.php"],a[href="portal.php"],a[href="home.php"]', 'p&f&g', function (link) {
handler.parse($('a[href="group.php"],a[href="portal.php"],a[href="home.php"]'), 'p&f&g', function (link) {
//console.log('group:', link.href);
if (/\.php$/.test(link.href)) {
if (/\.php$/.test(link.attr('href'))) {
//console.log('push:', link.href, crawler.push({uri:link.href, type:'link'}));
handler.push({uri: link.href, type: 'link'});
handler.push({uri: link.attr('href'), type: 'link'});
}
});
handler.parse('a[href^="group.php?gid="]', 'group', function (link) {
handler.parse($('a[href^="group.php?gid="]'), 'group', function (link) {
//console.log('group:', link.href);
if (/group\.php\?gid=\(d{1,3}$/.test(link.href)) {
handler.push({uri: link.href, type: 'link'});
if (/group\.php\?gid=\(d{1,3}$/.test(link.attr('href'))) {
handler.push({uri: link.attr('href'), type: 'link'});
}
});
handler.parse('a[href^="home.php?mod=space&uid="]', 'space', function (link) {
if (/home\.php\?mod=space&uid=\d{1,6}$/.test(link.href)) {
handler.push({uri: link.href, type: 'link'});
handler.parse($('a[href^="home.php?mod=space&uid="]'), 'space', function (link) {
if (/home\.php\?mod=space&uid=\d{1,6}$/.test(link.attr('href'))) {
handler.push({uri: link.attr('href'), type: 'link'});
}
});
}
handler.callback && handler.callback();
};

@@ -93,0 +91,0 @@

@@ -8,3 +8,7 @@ /**

var attachment = require('../discuz/attachment');
var fs = require('fs');
var assert = require('assert');
var template = fs.readFileSync(path.join(__dirname, '../../../template/index.html')).toString();
exports.parseLinks = function ($, handler) {

@@ -14,40 +18,63 @@ "use strict";

//pipe images
handler.parse('img[file*="forum.php?mod=attachment"]', "images", function (img) {
handler.push({uri: img.getAttribute('file'), type: 'attachment'});
handler.push({uri: img.getAttribute('src'), type: 'attachment'});
handler.parse($('img[file*="forum.php?mod=attachment"]'), "images", function (img) {
//discuz default src is a blank image, use file attribute.
assert(img);
img = $(img);
assert(img.attr('file'));
handler.push({uri: img.attr('file'), type: 'attachment'});
});
/*//pipe attachments and not need JB
handler.parse('a[href*="forum.php?mod=attachment"]', 'attachments', function (attachment) {
var needJB = $('#' + (attachment.parentNode.id || attachment.id) + '_menu').text().indexOf('金币') > 0;
if (!needJB) {
//console.log('attachment:', attachment.href, $('#' + $(attachment).parent().id + '_menu').text());
handler.push({uri: (attachment.href), type: 'attachment'});
}
});*/
handler.parse($('a[href*="forum.php?mod=attachment"]'), 'attachments', function (attachment) {
var needJB = $('#' + (attachment.parentNode.id || attachment.id) + '_menu').text().indexOf('金币') > 0;
if (!needJB) {
//console.log('attachment:', attachment.href, $('#' + $(attachment).parent().id + '_menu').text());
handler.push({uri: (attachment.href), type: 'attachment'});
}
});*/
/*handler.parse('img', 'image', function (link) {
handler.push({uri: link.src, type: 'img'});
handler.parse($('img[src$=gif]', '#postlist'), 'image', function (link) {
link = $(link);
var attr = link.attr('src');
assert(attr);
if (/\.gif$/.test(attr))
handler.push({uri: attr, type: 'img'});
});
handler.parse('link[rel="stylesheet"]', 'css', function (link) {
handler.push({uri: link.href, type: 'css'});
});
*/
handler.parse('a[href*="thread-50247-"]', 'link', function (link) {
handler.push({uri: link.href, type: 'link'});
});
/*handler.parse('link[rel="stylesheet"]', 'css', function (link) {
handler.push({uri: link.href, type: 'css'});
});
*/
if (config.crawlOptions.recursive) {
handler.parse($('a[href*="thread-50247-"]'), 'link', function (link) {
link = $(link);
handler.push({uri: link.attr('href'), type: 'link'});
});
}
};
/**
 * Build the relative file path used to store an attachment image.
 *
 * The `aid` query parameter is decoded from base64, the decoded text is split
 * on `|`, and the first field becomes the file's base name.
 *
 * @param {Object} uriObj - parsed URL object whose `query.aid` holds the
 *     base64-encoded attachment id.
 * @returns {string} `attachment/<first field>.jpg`, joined with the platform
 *     path separator.
 * @throws {TypeError} if `uriObj.query.aid` is missing (raised by `Buffer.from`).
 */
var attachmentFilePath = function (uriObj) {
    // Buffer.from replaces the deprecated `new Buffer(string, encoding)` constructor.
    var realIdArray = Buffer.from(uriObj.query.aid, 'base64').toString().split('|');
    // NOTE(review): every attachment is given a '.jpg' extension regardless of
    // its real type — confirm this is intended.
    var filename = realIdArray.shift() + '.jpg';
    return path.join('attachment', filename);
};
//Get file path of url
exports.getFileName = function (baseDir, uriObj) {
exports.getFileName = function (uriObj) {
var filename, uri = uriObj.uri;
var urlObj = url.parse(uri);
var urlObj = url.parse(uri, true);
if (uriObj.type === "attachment" && /\.php$/.test(urlObj.pathname)) {
return attachment.getAttFilePath(baseDir, uri);
}
if (/(css)|(js)/.test(uriObj.type)) {
filename = path.join(path.dirname(urlObj.pathname), attachmentFilePath(urlObj));
} else if (/(css)|(js)/.test(uriObj.type)) {
filename = urlObj.pathname;

@@ -58,5 +85,57 @@ } else {

if (/\/$/.test(filename)) {
filename = 'index.html';
filename = path.join(filename, 'index.html');
}
return path.join(baseDir, urlObj.hostname, filename);
return filename;
};
exports.modify = function ($, handler) {
"use strict";
var attaches = $('img[file]');
attaches.each(function (index, attach) {
attach = $(attach);
attach.attr('src', attachmentFilePath(url.parse(attach.attr('file'), true)));
});
var $ul = $('<ul id="postList"></ul>');
var posts = $('#postlist').children('div[id^=post_]');
//console.log('get posts length:', posts.length);
posts.each(function (index, post) {
var $post = $(post);
//console.dir(post);
console.assert($post.attr('id'));
var postContent = $('<div class="postContent"></div>');
var postMessage = $('<div class="postMessage"></div>')
.append($post.find('div.pct div.pcb div.t_fsz td[id^=postmessage]').html());
postContent.append(postMessage);
var postImage = $post.find('div.pct div.pcb div.t_fsz div.pattl')
.attr('class', 'postImage');
postImage.find('em').remove();
postImage.find('div.attp').remove();
postImage.find('dl dt').remove();
postImage.find('a').each(function (index, a) {
a = $(a);
a.parent().text(a.text());
});
//postImage.find('p.mbn').each(function(index,p){
//});
if (postImage.length) {
postContent.append(postImage);
}
$ul.append($('<li id="' + $post.attr('id') + '"></li>')
.append('<div class="uname"><strong>' + $post.find('div.authi a.xw1').text() + '</strong></div>')
.append('<div class="postTime">' + $post.find('div.pi em[id^=authorposton]').text() + '</div>')
.append(postContent));
});
$ul.find('*').removeAttr('onclick').removeAttr('onmouseover').removeAttr('file');
var result = template.replace(/\{\{content\}\}/ig, $ul.toString())
.replace(/\{\{pageNav\}\}/ig, $('div.pgt').html());
//exports.parseLinks($, handler);
return result;
};
{
"name": "crawlit",
"version": "0.1.2",
"version": "0.1.3",
"description": " A node.js crawler support custom crawl rules for special site with thirdpart plugin.",

@@ -11,3 +11,2 @@ "main": "lib/index.js",

"dependencies": {
"cookies.txt": "*",
"iconv-lite": "*",

@@ -17,5 +16,8 @@ "request": "*",

"charset": "*",
"jsdom": "*"
"cheerio": "*"
},
"devDependencies": { },
"devDependencies": {
"connect":"~2.x",
"cookies.txt": "*"
},
"scripts": {},

@@ -22,0 +24,0 @@ "repository": {

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc