declarative-crawler
Advanced tools
Comparing version 0.0.4 to 0.0.5
@@ -5,2 +5,5 @@ // @flow | ||
/** | ||
* Description 新闻爬虫 | ||
*/ | ||
export default class NewsCrawler extends Crawler { | ||
@@ -7,0 +10,0 @@ initialize() { |
@@ -31,5 +31,8 @@ // @flow | ||
// 存在有效二级链接 | ||
return href.indexOf("zhihu.com") > -1 | ||
? href | ||
: `https://www.zhihu.com${href}`; | ||
return { | ||
url: href.indexOf("zhihu.com") > -1 | ||
? href | ||
: `https://www.zhihu.com${href}`, | ||
extra: feedItem | ||
}; | ||
} | ||
@@ -36,0 +39,0 @@ }); |
// @flow | ||
import { $ } from "../../../src/utils/parser/HTMLParser"; | ||
import HeadlessChromeSpider from "../../../src/spider/web/HeadlessChromeSpider"; | ||
import HeadlessChromeSpider | ||
from "../../../src/source/spider/web/HeadlessChromeSpider"; | ||
/** | ||
@@ -9,2 +9,4 @@ * @function 知乎某个话题答案的爬虫 | ||
export default class TopicSpider extends HeadlessChromeSpider { | ||
static displayName = "话题蜘蛛"; | ||
// 定义模型 | ||
@@ -11,0 +13,0 @@ model = { |
@@ -10,2 +10,6 @@ "use strict"; | ||
/** | ||
* Description 获取操作系统信息 | ||
* @returns {Promise} | ||
*/ | ||
var getOSInfo = function () { | ||
@@ -69,3 +73,2 @@ var _ref = _asyncToGenerator(regeneratorRuntime.mark(function _callee() { | ||
var router = new Router(); | ||
var CrawlerServer = function () { | ||
@@ -108,2 +111,3 @@ | ||
// 启动整个爬虫 | ||
// 这里不需要等待启动返回,因此直接使用 Promise 异步执行 | ||
router.get("/start", function (ctx, next) { | ||
@@ -170,3 +174,5 @@ // 启动整个爬虫 | ||
app.listen(this.httpOption.port, this.httpOption.host, function () { | ||
console.log("服务端开始运行"); | ||
var baseUrl = _this.httpOption.host + ":" + _this.httpOption.port; | ||
console.log("\n \u722C\u866B\u670D\u52A1\u7AEF\u5F00\u59CB\u8FD0\u884C\uFF1A\n " + baseUrl + "/ - \u67E5\u770B\u722C\u866B\u5217\u8868\n " + baseUrl + "/:crawlerName - \u67E5\u770B\u67D0\u4E2A\u722C\u866B\u8BE6\u60C5\n " + baseUrl + "/start - \u542F\u52A8\u6240\u6709\u722C\u866B\n " + baseUrl + "/status - \u67E5\u770B\u7CFB\u7EDF\u72B6\u6001\n "); | ||
}); | ||
@@ -173,0 +179,0 @@ |
@@ -184,3 +184,3 @@ "use strict"; | ||
if (spiderTask.request.extra) { | ||
spiderTask.nextSpiderInstance.setExtra(spiderTask.request.extra); | ||
spiderTask.spiderInstance.setExtra(spiderTask.request.extra); | ||
} | ||
@@ -187,0 +187,0 @@ |
{ | ||
"name": "declarative-crawler", | ||
"version": "0.0.4", | ||
"version": "0.0.5", | ||
"description": "Declarative and Observable Distributed Crawler For Web, RDB, OS, also can act as a Monitor or ETL for your system", | ||
@@ -26,21 +26,21 @@ "scripts": { | ||
"cheerio": "^0.22.0", | ||
"chrome-remote-interface": "^0.20.0", | ||
"core-decorators": "^0.17.0", | ||
"es6-promise": "^3.2.1", | ||
"fluent-fetcher": "0.2.4", | ||
"image-downloader": "^3.2.1", | ||
"chrome-remote-interface": "^0.24.0", | ||
"core-decorators": "^0.19.0", | ||
"es6-promise": "^4.1.1", | ||
"fluent-fetcher": "^0.3.0", | ||
"image-downloader": "^3.2.2", | ||
"isomorphic-fetch": "^2.2.1", | ||
"isomorphic-urlencode": "0.0.9", | ||
"koa": "^2.2.0", | ||
"koa": "^2.3.0", | ||
"koa-router": "next", | ||
"md5": "^2.2.1", | ||
"mysql": "^2.13.0", | ||
"pidusage": "^1.1.1", | ||
"pidusage": "^1.1.5", | ||
"qrcode-terminal": "^0.11.0", | ||
"wolfy87-eventemitter": "^5.1.0" | ||
"wolfy87-eventemitter": "^5.2.1" | ||
}, | ||
"devDependencies": { | ||
"babel-cli": "^6.14.0", | ||
"babel-core": "^6.14.0", | ||
"babel-jest": "^19.0.0", | ||
"babel-core": "^6.25.0", | ||
"babel-jest": "^20.0.3", | ||
"babel-plugin-async-to-promises": "^1.0.5", | ||
@@ -47,0 +47,0 @@ "babel-plugin-transform-class-properties": "^6.24.1", |
@@ -7,3 +7,4 @@ // @flow | ||
import CrawlerScheduler from "../source/crawler/CrawlerScheduler"; | ||
import CrawlerStatistics from "../source/crawler/store/entity/CrawlerStatistics"; | ||
import CrawlerStatistics | ||
from "../source/crawler/store/entity/CrawlerStatistics"; | ||
const pusage = require("pidusage"); | ||
@@ -16,2 +17,6 @@ const os = require("os"); | ||
/** | ||
* Description 获取操作系统信息 | ||
* @returns {Promise} | ||
*/ | ||
async function getOSInfo() { | ||
@@ -64,2 +69,3 @@ return new Promise(resolve => { | ||
// 启动整个爬虫 | ||
// 这里不需要等待启动返回,因此直接使用 Promise 异步执行 | ||
router.get("/start", (ctx, next) => { | ||
@@ -105,5 +111,15 @@ // 启动整个爬虫 | ||
app.listen(this.httpOption.port, this.httpOption.host, () => { | ||
console.log("服务端开始运行"); | ||
const baseUrl = `${this.httpOption.host}:${this.httpOption.port}`; | ||
console.log( | ||
` | ||
爬虫服务端开始运行: | ||
${baseUrl}/ - 查看爬虫列表 | ||
${baseUrl}/:crawlerName - 查看某个爬虫详情 | ||
${baseUrl}/start - 启动所有爬虫 | ||
${baseUrl}/status - 查看系统状态 | ||
` | ||
); | ||
}); | ||
} | ||
} |
@@ -136,3 +136,3 @@ // @flow | ||
// 取出某个任务实例 | ||
let spiderTask = this._spiderTasks.shift(); | ||
let spiderTask:SpiderTask = this._spiderTasks.shift(); | ||
@@ -147,3 +147,3 @@ // 设置爬虫的请求 | ||
if (spiderTask.request.extra) { | ||
spiderTask.nextSpiderInstance.setExtra(spiderTask.request.extra); | ||
spiderTask.spiderInstance.setExtra(spiderTask.request.extra); | ||
} | ||
@@ -150,0 +150,0 @@ |
332434
6038
+ Added chrome-remote-interface@0.24.5 (transitive)
+ Added core-decorators@0.19.0 (transitive)
+ Added es6-promise@4.2.8 (transitive)
+ Added fluent-fetcher@0.3.3 (transitive)
- Removed chrome-remote-interface@0.20.0 (transitive)
- Removed core-decorators@0.17.0 (transitive)
- Removed fluent-fetcher@0.2.4 (transitive)
Updated core-decorators@^0.19.0
Updated es6-promise@^4.1.1
Updated fluent-fetcher@^0.3.0
Updated image-downloader@^3.2.2
Updated koa@^2.3.0
Updated pidusage@^1.1.5
Updated wolfy87-eventemitter@^5.2.1