Comparing version 0.1.2 to 0.1.3
@@ -14,3 +14,4 @@ "use strict"; | ||
maxConnections: 2, | ||
crawlingTaskCount: 0 | ||
crawlingTaskCount: 0, | ||
updateMode: true | ||
}; | ||
@@ -20,2 +21,3 @@ | ||
PREPARE: 0, | ||
UPDATE: 5, | ||
RETRY: 1, | ||
@@ -33,4 +35,6 @@ PROCESS: 2, | ||
//crawl objects queue data cache. | ||
var QUEUE = []; | ||
var crawlQueue = function (baseDir, crawl) { | ||
var QUEUE = []; | ||
var that = {}; | ||
@@ -59,2 +63,11 @@ | ||
QUEUE = utilBox.loadJSON(queueDumFile, []); | ||
QUEUE.forEach(function (uriObj) { | ||
if (uriObj.status === QUEUE_STATUS.PROCESS) { | ||
process.nextTick(crawl.bind(null, uriObj)); | ||
queueConfig.crawlingTaskCount += 1; | ||
} | ||
if (queueConfig.updateMode && uriObj.status === QUEUE_STATUS.CRAWLED && uriObj.type === 'link') { | ||
uriObj.status = QUEUE_STATUS.UPDATE; | ||
} | ||
}); | ||
assert(Array.isArray(QUEUE)); | ||
@@ -71,2 +84,5 @@ }; | ||
*/ | ||
var memUsage = process.memoryUsage().rss / (1024 * 1024); | ||
var next = function () { | ||
@@ -79,14 +95,20 @@ var count = 0; | ||
if (queueConfig.crawlingTaskCount < queueConfig.maxConnections && | ||
(uriObj.status === QUEUE_STATUS.PREPARE || | ||
uriObj.status === QUEUE_STATUS.RETRY && uriObj.failedCount < maxRetryCount)) { | ||
if (memUsage < queueConfig.maxMem && | ||
queueConfig.crawlingTaskCount < queueConfig.maxConnections && | ||
( uriObj.status === QUEUE_STATUS.PREPARE || | ||
(uriObj.status === QUEUE_STATUS.RETRY && uriObj.failedCount < maxRetryCount) || | ||
(uriObj.status === QUEUE_STATUS.UPDATE && queueConfig.updateMode) )) { | ||
process.nextTick(crawl.bind(null, uriObj)); | ||
setImmediate(crawl.bind(null, uriObj)); | ||
//process.nextTick(crawl.bind(null, uriObj)); | ||
uriObj.status = QUEUE_STATUS.PROCESS; | ||
count += 1; | ||
queueConfig.crawlingTaskCount += 1; | ||
return queueConfig.crawlingTaskCount >= queueConfig.maxConnections; | ||
} | ||
}); | ||
queueConfig.crawlingTaskCount = count; | ||
if (queueConfig.crawlingTaskCount === 0) { | ||
if (retry > 2) { | ||
if (retry > 0) { | ||
clearInterval(timer); | ||
@@ -109,3 +131,2 @@ console.log('queue is empty Exit dump Queue listener....'); | ||
var memUsage = process.memoryUsage().rss / (1024 * 1024); | ||
@@ -116,3 +137,3 @@ console.log('Memory usage:', memUsage.toFixed(2), 'Mb'); | ||
} | ||
if (memUsage > queueConfig.maxMem) { | ||
if (memUsage > queueConfig.maxMem && queueConfig.crawlingTaskCount === 0) { | ||
console.error("Over max memory usage, exit process."); | ||
@@ -125,3 +146,3 @@ process.exit(1); | ||
//TODO Update and dump queue by Event driven, remove setInterval() calling.. | ||
var timer = setInterval(dump, 5 * 1000); | ||
var timer = setInterval(dump, 8 * 1000); | ||
}; | ||
@@ -134,5 +155,4 @@ | ||
* type:[attachment,css,link.img], | ||
* status:0:未开始 1:更新, 2:Retry 3:进行中,4:成功,5:失败, | ||
* status: @see QUEUE_STATUS | ||
* failedCount:3, | ||
* updateFlag:anyeType} | ||
*/ | ||
@@ -139,0 +159,0 @@ that.push = function (uriObj) { |
@@ -14,4 +14,4 @@ "use strict"; | ||
var request = require('request'); | ||
var jsdom = require('jsdom'); | ||
//var request = require('http').request; | ||
//var jsdom = require('jsdom'); | ||
var cheerio = require('cheerio'); | ||
var iconv = require('iconv-lite'); | ||
@@ -23,2 +23,3 @@ var charset = require('charset'); | ||
var utilBox = require('./utilbox.js'); | ||
var jsdomCount = 0; | ||
@@ -28,2 +29,3 @@ //Load jQuery source code to string. | ||
var resourceParser = config.crawlOptions.resourceParser; | ||
//prepare working path | ||
@@ -37,3 +39,3 @@ var ROOT_PATH = config.crawlOptions.working_root_path; | ||
var getFilename = function (uriObj) { | ||
return config.crawlOptions.resourceParser.getFileName(ROOT_PATH, uriObj); | ||
return resourceParser.getFileName(uriObj); | ||
}; | ||
@@ -51,9 +53,8 @@ | ||
var getBaseURI = function (window) { | ||
var getBaseURI = function ($base, locationHref) { | ||
//get baseURI, document.baseURI is not working in jsdom. | ||
var $base = window.$('base'); | ||
var urlObj = url.parse(window.location.href); | ||
var urlObj = url.parse(locationHref); | ||
var basePath = $base.length === 1 ? $base.attr('href') : path.dirname(urlObj.pathname); | ||
if (!url.parse(basePath).host) { | ||
basePath = url.resolve(window.location.href, basePath); | ||
basePath = url.resolve(locationHref, basePath); | ||
} | ||
@@ -69,7 +70,12 @@ return basePath; | ||
var uri = uriObj['uri']; | ||
console.log("Crawl :", uriObj.type, uri); | ||
console.log("Crawl :", uriObj.type, uri); | ||
console.debug('File', uriObj.filename); | ||
Object.defineProperty(uriObj, 'distFilename', { | ||
value: path.join(ROOT_PATH, url.parse(uriObj.uri).hostname, uriObj.filename), | ||
writable: true, | ||
numerable: false, | ||
configurable: true}); | ||
utilBox.preparePath(path.dirname(uriObj.filename)); | ||
utilBox.preparePath(path.dirname(uriObj.distFilename)); | ||
@@ -103,3 +109,3 @@ /** | ||
} | ||
}).pipe(fs.createWriteStream(uriObj.filename)); | ||
}).pipe(fs.createWriteStream(uriObj.distFilename)); | ||
} else { | ||
@@ -135,60 +141,67 @@ request.get({uri: uri, jar: true, encoding: null, headers: requestOptions}, function (err, res, body) { | ||
//2. Remove absolute path to relative path | ||
var regexp = new RegExp('http:\/\/' + config.crawlOptions.host.replace(/\./g, '\.'), 'g'); | ||
var outputCrawled = body.replace(regexp, ''); | ||
//3. save to disk | ||
fs.writeFile(uriObj.filename, outputCrawled, function (err) { | ||
if (err) { | ||
return crawlEnd(err, uriObj); | ||
} | ||
outputCrawled = null; | ||
console.info('Save to:', uriObj.filename); | ||
}); | ||
var saveCrawledUri = function (output) { | ||
var regexp = new RegExp('http:\/\/' + config.crawlOptions.host.replace(/\./g, '\.'), 'g'); | ||
var outputCrawled = output || body; | ||
outputCrawled = outputCrawled.replace(regexp, ''); | ||
//console.warn(outputCrawled); | ||
fs.writeFile(uriObj.distFilename, outputCrawled, function (err) { | ||
if (err) { | ||
return crawlEnd(err, uriObj); | ||
} | ||
outputCrawled = null; | ||
console.info('Save to:', uriObj.filename); | ||
}); | ||
}; | ||
if (!resourceParser.modify) { | ||
saveCrawledUri(); | ||
} | ||
//4. Remove all script for parsing HTML DOM. | ||
body = body.replace(/<script.*?>.*?<\/script>/ig, ''); | ||
//5. parse links | ||
jsdom.env({ html: body, src: [jQuerySrc], | ||
done: function (err, window) { | ||
body = null; //release memory | ||
var $ = cheerio.load(body, { normalizeWhitespace: true, xmlMode: false }); | ||
if (err) { | ||
window && window.close && window.close(); | ||
return crawlEnd(err, uriObj); | ||
console.debug('Parse HTML :', uri); | ||
var baseURI = getBaseURI($('base'), uriObj.uri); | ||
//todo move linkStacks into resourcePraser. | ||
var linkStacks = []; | ||
var handler = { | ||
parse: function (jQuerySelector, resourceType, callback) { | ||
var resources = jQuerySelector; | ||
if (resources) { | ||
//todo use Object[key] to count; | ||
linkStacks.push(resourceType + ': ' + resources.length); | ||
resources.each(function (index, link) { | ||
callback($(link)); | ||
}); | ||
} else { | ||
linkStacks.push(resourceType, 0); | ||
} | ||
}, | ||
push: function (uriObj) { | ||
var host2 = url.parse(uriObj.uri).host; | ||
if (!host2) { | ||
uriObj.uri = getAbsUrlPath(baseURI, uriObj.uri); | ||
} | ||
uriObj.filename = getFilename(uriObj); | ||
crawler.push(uriObj); | ||
}}; | ||
console.debug('Parse HTML :', uri); | ||
var baseURI = getBaseURI(window); | ||
var linkStacks = []; | ||
var handler = { | ||
parse: function (jQuerySelector, resourceType, callback) { | ||
var resources = window.$(jQuerySelector); | ||
if (resources) { | ||
linkStacks.push(resourceType + ': ' + resources.length); | ||
resources.each(function (index, link) { | ||
callback(link); | ||
}); | ||
} else { | ||
linkStacks.push(resourceType, 0); | ||
} | ||
}, | ||
push: function (uriObj) { | ||
var host2 = url.parse(uriObj.uri).host; | ||
if (!host2) { | ||
uriObj.uri = getAbsUrlPath(baseURI, uriObj.uri); | ||
} | ||
uriObj.filename = getFilename(uriObj); | ||
crawler.push(uriObj); | ||
}}; | ||
resourceParser.parseLinks($, handler); | ||
if (resourceParser.modify) { | ||
saveCrawledUri(resourceParser.modify($, handler)); | ||
} else { | ||
//TODO change rules then move parseLines method downside. | ||
//resourceParser.parseLinks($, handler); | ||
} | ||
config.crawlOptions.resourceParser.parseLinks(window.$, handler); | ||
window.close();//Call window.close(); for memory leak. | ||
window = null; | ||
handler = null; | ||
console.info('Resource count in page:', uriObj.uri, linkStacks); | ||
linkStacks = null; | ||
crawlEnd(null, uriObj); | ||
} | ||
}); | ||
handler = null; | ||
console.info('Resource count in page:', uriObj.uri, linkStacks); | ||
linkStacks = null; | ||
crawlEnd(null, uriObj); | ||
}); | ||
@@ -195,0 +208,0 @@ } |
@@ -25,2 +25,4 @@ "use strict"; | ||
var utilBox = require('../../utilbox.js'); | ||
var console = require('../../logger.js').getLogger('info'); | ||
console.log = console.debug; | ||
@@ -38,8 +40,10 @@ var finishedStack = {}; | ||
var attachmentFilePath = function (baseDir, uriObj) { | ||
var attachmentFilePath = function (uriObj) { | ||
//path name = __dirname/hostname/attachment/tid/uid-paramnames-n | ||
var realIdArray = decode(uriObj.query.aid).split('|'); | ||
console.log('realId.array: ', realIdArray); | ||
console.log('16num to 10', parseInt(realIdArray[1], 16)); | ||
var filename = Object.keys(uriObj.query).reduce(function (pre, current, index) { | ||
//console.log(index, 'pre:', pre, 'current:', current); | ||
console.log(index, 'current:', current, 'pre:', pre); | ||
//Add other param to filenameArray | ||
@@ -52,3 +56,6 @@ if (!/(mod|aid)/i.test(current)) { | ||
return path.join(baseDir, uriObj.hostname, 'attachment', realIdArray.pop(), filename.join('-')); | ||
console.log('realId.array: ', realIdArray); | ||
console.log('filename:', filename); | ||
return path.join('attachment', realIdArray.pop(), filename.join('-')); | ||
}; | ||
@@ -129,18 +136,7 @@ | ||
exports.getAttFilePath = function (dir, uri) { | ||
exports.getAttFilePath = function (uri) { | ||
var urlObj = url.parse(uri, true); | ||
return attachmentFilePath(dir, urlObj); | ||
return attachmentFilePath(urlObj); | ||
}; | ||
exports.exists = function (uri) { | ||
var uriObj = url.parse(uri, true); | ||
if (isAttach(uriObj)) { | ||
var filePath = attachmentFilePath(uriObj); | ||
//console.log('filePath:', filePath, ' exist:', fs.existsSync(filePath)); | ||
return fs.existsSync(filePath); | ||
} else { | ||
return false; | ||
} | ||
}; | ||
exports.clear = function () { | ||
@@ -147,0 +143,0 @@ clearFishied.call(null, cleanAttFiles); |
@@ -19,25 +19,25 @@ "use strict"; | ||
//pipe images | ||
handler.parse('img[file^="forum.php?mod=attachment"]', "images", function (img) { | ||
handler.push({uri: img.getAttribute('file'), type: 'attachment'}); | ||
handler.push({uri: img.getAttribute('zoomfile'), type: 'attachment'}); | ||
handler.parse($('img[file^="forum.php?mod=attachment"]'), "images", function (img) { | ||
handler.push({uri: img.attr('file'), type: 'attachment'}); | ||
handler.push({uri: img.attr('zoomfile'), type: 'attachment'}); | ||
}); | ||
//pipe attachments and not need JB | ||
handler.parse('a[href^="forum.php?mod=attachment"]', 'attachments', function (attachment) { | ||
var needJB = $('#' + (attachment.parentNode.id || attachment.id) + '_menu').text().indexOf('金币') > 0; | ||
handler.parse($('a[href^="forum.php?mod=attachment"]'), 'attachments', function (attachment) { | ||
var needJB = $('#' + (attachment.parent.attr('id') || attachment.attr('id')) + '_menu').text().indexOf('金币') > 0; | ||
if (!needJB) { | ||
//console.log('attachment:', attachment.href, $('#' + $(attachment).parent().id + '_menu').text()); | ||
handler.push({uri: (attachment.href), type: 'attachment'}); | ||
handler.push({uri: (attachment.attr('href')), type: 'attachment'}); | ||
} | ||
}); | ||
handler.parse('img', 'image', function (link) { | ||
handler.push({uri: link.src, type: 'img'}); | ||
handler.parse($('img'), 'image', function (link) { | ||
handler.push({uri: link.attr('src'), type: 'img'}); | ||
}); | ||
handler.parse('link[rel="stylesheet"]', 'css', function (link) { | ||
handler.push({uri: link.href, type: 'css'}); | ||
handler.parse($('link[rel="stylesheet"]'), 'css', function (link) { | ||
handler.push({uri: link.attr('href'), type: 'css'}); | ||
}); | ||
handler.parse('sscript[src^="static/"]', 'js', function (link) { | ||
handler.parse($('sscript[src^="static/"]'), 'js', function (link) { | ||
handler.push({uri: $(link).attr('src'), type: 'js'}); | ||
@@ -50,43 +50,41 @@ }); | ||
//thread links | ||
handler.parse('a[href$=".html"][href^="forum"]', 'forum', function (link) { | ||
handler.push({uri: link.href, type: 'link'}); | ||
handler.parse($('a[href$=".html"][href^="forum"]'), 'forum', function (link) { | ||
handler.push({uri: link.attr('href'), type: 'link'}); | ||
}); | ||
handler.parse('a[href$=".html"][href^="thread"]', 'thread', function (link) { | ||
handler.push({uri: link.href, type: 'link'}); | ||
handler.parse($('a[href$=".html"][href^="thread"]'), 'thread', function (link) { | ||
handler.push({uri: link.attr('href'), type: 'link'}); | ||
}); | ||
//archive links | ||
handler.parse('a[href^="archiver/"],a[href^="?tid-"]', 'Archive', function (a) { | ||
handler.push({uri: a.href, type: 'link'}); | ||
handler.parse($('a[href^="archiver/"],a[href^="?tid-"]'), 'Archive', function (a) { | ||
handler.push({uri: a.attr('href'), type: 'link'}); | ||
}); | ||
//forum links | ||
handler.parse('a[href^="forum.php?gid="] ', 'forum', function (a) { | ||
handler.push({uri: a.href, type: 'link'}); | ||
handler.parse($('a[href^="forum.php?gid="] '), 'forum', function (a) { | ||
handler.push({uri: a.attr('href'), type: 'link'}); | ||
}); | ||
handler.parse('a[href="group.php"],a[href="portal.php"],a[href="home.php"]', 'p&f&g', function (link) { | ||
handler.parse($('a[href="group.php"],a[href="portal.php"],a[href="home.php"]'), 'p&f&g', function (link) { | ||
//console.log('group:', link.href); | ||
if (/\.php$/.test(link.href)) { | ||
if (/\.php$/.test(link.attr('href'))) { | ||
//console.log('push:', link.href, crawler.push({uri:link.href, type:'link'})); | ||
handler.push({uri: link.href, type: 'link'}); | ||
handler.push({uri: link.attr('href'), type: 'link'}); | ||
} | ||
}); | ||
handler.parse('a[href^="group.php?gid="]', 'group', function (link) { | ||
handler.parse($('a[href^="group.php?gid="]'), 'group', function (link) { | ||
//console.log('group:', link.href); | ||
if (/group\.php\?gid=\(d{1,3}$/.test(link.href)) { | ||
handler.push({uri: link.href, type: 'link'}); | ||
if (/group\.php\?gid=\(d{1,3}$/.test(link.attr('href'))) { | ||
handler.push({uri: link.attr('href'), type: 'link'}); | ||
} | ||
}); | ||
handler.parse('a[href^="home.php?mod=space&uid="]', 'space', function (link) { | ||
if (/home\.php\?mod=space&uid=\d{1,6}$/.test(link.href)) { | ||
handler.push({uri: link.href, type: 'link'}); | ||
handler.parse($('a[href^="home.php?mod=space&uid="]'), 'space', function (link) { | ||
if (/home\.php\?mod=space&uid=\d{1,6}$/.test(link.attr('href'))) { | ||
handler.push({uri: link.attr('href'), type: 'link'}); | ||
} | ||
}); | ||
} | ||
handler.callback && handler.callback(); | ||
}; | ||
@@ -93,0 +91,0 @@ |
@@ -8,3 +8,7 @@ /** | ||
var attachment = require('../discuz/attachment'); | ||
var fs = require('fs'); | ||
var assert = require('assert'); | ||
var template = fs.readFileSync(path.join(__dirname, '../../../template/index.html')).toString(); | ||
exports.parseLinks = function ($, handler) { | ||
@@ -14,40 +18,63 @@ "use strict"; | ||
//pipe images | ||
handler.parse('img[file*="forum.php?mod=attachment"]', "images", function (img) { | ||
handler.push({uri: img.getAttribute('file'), type: 'attachment'}); | ||
handler.push({uri: img.getAttribute('src'), type: 'attachment'}); | ||
handler.parse($('img[file*="forum.php?mod=attachment"]'), "images", function (img) { | ||
//discuz default src is a blank image, use file attribute. | ||
assert(img); | ||
img = $(img); | ||
assert(img.attr('file')); | ||
handler.push({uri: img.attr('file'), type: 'attachment'}); | ||
}); | ||
/*//pipe attachments and not need JB | ||
handler.parse('a[href*="forum.php?mod=attachment"]', 'attachments', function (attachment) { | ||
var needJB = $('#' + (attachment.parentNode.id || attachment.id) + '_menu').text().indexOf('金币') > 0; | ||
if (!needJB) { | ||
//console.log('attachment:', attachment.href, $('#' + $(attachment).parent().id + '_menu').text()); | ||
handler.push({uri: (attachment.href), type: 'attachment'}); | ||
} | ||
});*/ | ||
handler.parse($('a[href*="forum.php?mod=attachment"]'), 'attachments', function (attachment) { | ||
var needJB = $('#' + (attachment.parentNode.id || attachment.id) + '_menu').text().indexOf('金币') > 0; | ||
if (!needJB) { | ||
//console.log('attachment:', attachment.href, $('#' + $(attachment).parent().id + '_menu').text()); | ||
handler.push({uri: (attachment.href), type: 'attachment'}); | ||
} | ||
});*/ | ||
/*handler.parse('img', 'image', function (link) { | ||
handler.push({uri: link.src, type: 'img'}); | ||
handler.parse($('img[src$=gif]', '#postlist'), 'image', function (link) { | ||
link = $(link); | ||
var attr = link.attr('src'); | ||
assert(attr); | ||
if (/\.gif$/.test(attr)) | ||
handler.push({uri: attr, type: 'img'}); | ||
}); | ||
handler.parse('link[rel="stylesheet"]', 'css', function (link) { | ||
handler.push({uri: link.href, type: 'css'}); | ||
}); | ||
*/ | ||
handler.parse('a[href*="thread-50247-"]', 'link', function (link) { | ||
handler.push({uri: link.href, type: 'link'}); | ||
}); | ||
/*handler.parse('link[rel="stylesheet"]', 'css', function (link) { | ||
handler.push({uri: link.href, type: 'css'}); | ||
}); | ||
*/ | ||
if (config.crawlOptions.recursive) { | ||
handler.parse($('a[href*="thread-50247-"]'), 'link', function (link) { | ||
link = $(link); | ||
handler.push({uri: link.attr('href'), type: 'link'}); | ||
}); | ||
} | ||
}; | ||
/**
 * Build the relative file path used for a Discuz attachment image.
 *
 * The `aid` query parameter is base64 text; once decoded it is split on '|'
 * and the first segment becomes the file's base name.
 *
 * @param {Object} uriObj URL object whose `query` is already expanded
 *                        (i.e. produced by url.parse(uri, true)).
 * @returns {string} 'attachment/<first segment>.jpg' (joined with path.join).
 * @throws {TypeError} when uriObj.query.aid is not a string.
 */
var attachmentFilePath = function (uriObj) {
    // Buffer.from replaces the deprecated `new Buffer(string, encoding)` constructor.
    var realIdArray = Buffer.from(uriObj.query.aid, 'base64').toString().split('|');
    // The id comes from crawled page data; keep only its last path component so a
    // crafted value such as '../../x' cannot point outside the attachment directory.
    var filename = path.basename(realIdArray.shift()) + '.jpg';
    return path.join('attachment', filename);
};
//Get file path of url | ||
exports.getFileName = function (baseDir, uriObj) { | ||
exports.getFileName = function (uriObj) { | ||
var filename, uri = uriObj.uri; | ||
var urlObj = url.parse(uri); | ||
var urlObj = url.parse(uri, true); | ||
if (uriObj.type === "attachment" && /\.php$/.test(urlObj.pathname)) { | ||
return attachment.getAttFilePath(baseDir, uri); | ||
} | ||
if (/(css)|(js)/.test(uriObj.type)) { | ||
filename = path.join(path.dirname(urlObj.pathname), attachmentFilePath(urlObj)); | ||
} else if (/(css)|(js)/.test(uriObj.type)) { | ||
filename = urlObj.pathname; | ||
@@ -58,5 +85,57 @@ } else { | ||
if (/\/$/.test(filename)) { | ||
filename = 'index.html'; | ||
filename = path.join(filename, 'index.html'); | ||
} | ||
return path.join(baseDir, urlObj.hostname, filename); | ||
return filename; | ||
}; | ||
exports.modify = function ($, handler) { | ||
"use strict"; | ||
var attaches = $('img[file]'); | ||
attaches.each(function (index, attach) { | ||
attach = $(attach); | ||
attach.attr('src', attachmentFilePath(url.parse(attach.attr('file'), true))); | ||
}); | ||
var $ul = $('<ul id="postList"></ul>'); | ||
var posts = $('#postlist').children('div[id^=post_]'); | ||
//console.log('get posts length:', posts.length); | ||
posts.each(function (index, post) { | ||
var $post = $(post); | ||
//console.dir(post); | ||
console.assert($post.attr('id')); | ||
var postContent = $('<div class="postContent"></div>'); | ||
var postMessage = $('<div class="postMessage"></div>') | ||
.append($post.find('div.pct div.pcb div.t_fsz td[id^=postmessage]').html()); | ||
postContent.append(postMessage); | ||
var postImage = $post.find('div.pct div.pcb div.t_fsz div.pattl') | ||
.attr('class', 'postImage'); | ||
postImage.find('em').remove(); | ||
postImage.find('div.attp').remove(); | ||
postImage.find('dl dt').remove(); | ||
postImage.find('a').each(function (index, a) { | ||
a = $(a); | ||
a.parent().text(a.text()); | ||
}); | ||
//postImage.find('p.mbn').each(function(index,p){ | ||
//}); | ||
if (postImage.length) { | ||
postContent.append(postImage); | ||
} | ||
$ul.append($('<li id="' + $post.attr('id') + '"></li>') | ||
.append('<div class="uname"><strong>' + $post.find('div.authi a.xw1').text() + '</strong></div>') | ||
.append('<div class="postTime">' + $post.find('div.pi em[id^=authorposton]').text() + '</div>') | ||
.append(postContent)); | ||
}); | ||
$ul.find('*').removeAttr('onclick').removeAttr('onmouseover').removeAttr('file'); | ||
var result = template.replace(/\{\{content\}\}/ig, $ul.toString()) | ||
.replace(/\{\{pageNav\}\}/ig, $('div.pgt').html()); | ||
//exports.parseLinks($, handler); | ||
return result; | ||
}; |
{ | ||
"name": "crawlit", | ||
"version": "0.1.2", | ||
"version": "0.1.3", | ||
"description": " A node.js crawler support custom crawl rules for special site with thirdpart plugin.", | ||
@@ -11,3 +11,2 @@ "main": "lib/index.js", | ||
"dependencies": { | ||
"cookies.txt": "*", | ||
"iconv-lite": "*", | ||
@@ -17,5 +16,8 @@ "request": "*", | ||
"charset": "*", | ||
"jsdom": "*" | ||
"cheerio": "*" | ||
}, | ||
"devDependencies": { }, | ||
"devDependencies": { | ||
"connect":"~2.x", | ||
"cookies.txt": "*" | ||
}, | ||
"scripts": {}, | ||
@@ -22,0 +24,0 @@ "repository": { |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Wildcard dependency
Quality: Package has a dependency with a floating version range. This can cause issues if the dependency publishes a new major version.
Found 1 instance in 1 package
No README
Quality: Package does not have a README. This may indicate a failed publish or a low quality package.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Wildcard dependency
Quality: Package has a dependency with a floating version range. This can cause issues if the dependency publishes a new major version.
Found 2 instances in 1 package
37413
5
15
1045
2
0
6
+ Addedcheerio@*
+ Addedboolbase@1.0.0(transitive)
+ Addedcheerio@1.0.0(transitive)
+ Addedcheerio-select@2.1.0(transitive)
+ Addedcss-select@5.1.0(transitive)
+ Addedcss-what@6.1.0(transitive)
+ Addeddom-serializer@2.0.0(transitive)
+ Addeddomelementtype@2.3.0(transitive)
+ Addeddomhandler@5.0.3(transitive)
+ Addeddomutils@3.1.0(transitive)
+ Addedencoding-sniffer@0.2.0(transitive)
+ Addedhtmlparser2@9.1.0(transitive)
+ Addednth-check@2.1.1(transitive)
+ Addedparse5-htmlparser2-tree-adapter@7.1.0(transitive)
+ Addedparse5-parser-stream@7.1.2(transitive)
+ Addedundici@6.20.1(transitive)
- Removedcookies.txt@*
- Removedjsdom@*
- Removedagent-base@7.1.1(transitive)
- Removedcookies.txt@0.1.2(transitive)
- Removedcssstyle@4.1.0(transitive)
- Removeddata-urls@5.0.0(transitive)
- Removeddebug@4.3.7(transitive)
- Removeddecimal.js@10.4.3(transitive)
- Removedform-data@4.0.1(transitive)
- Removedhtml-encoding-sniffer@4.0.0(transitive)
- Removedhttp-proxy-agent@7.0.2(transitive)
- Removedhttps-proxy-agent@7.0.5(transitive)
- Removedis-potential-custom-element-name@1.0.1(transitive)
- Removedjsdom@25.0.1(transitive)
- Removedms@2.1.3(transitive)
- Removednwsapi@2.2.13(transitive)
- Removedrrweb-cssom@0.7.1(transitive)
- Removedsaxes@6.0.0(transitive)
- Removedsymbol-tree@3.2.4(transitive)
- Removedtldts@6.1.59(transitive)
- Removedtldts-core@6.1.59(transitive)
- Removedtough-cookie@5.0.0(transitive)
- Removedtr46@5.0.0(transitive)
- Removedw3c-xmlserializer@5.0.0(transitive)
- Removedwebidl-conversions@7.0.0(transitive)
- Removedwhatwg-url@14.0.0(transitive)
- Removedws@8.18.0(transitive)
- Removedxml-name-validator@5.0.0(transitive)
- Removedxmlchars@2.2.0(transitive)