Skip to content

Commit

Permalink
fix the method retry, parameter current should implement ITask;
Browse files Browse the repository at this point in the history
fix the method queue: a task's `special` option is no longer supported; use `info` instead;
fix the function `startTask`: if an error occurs while executing plan.process, throw it;
import uuid/v1 instead of uuid;
delete outdated comments;
add a warning about the package's stability;
update the package to version 0.6.1
  • Loading branch information
Bin-Huang committed Jul 27, 2017
1 parent 6b6ecc2 commit d01d03a
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 33 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
**NOTE** The package nodespider is still under development. It is not suggested to using it in your project.
**NOTE** The package nodespider is still under development. That means frequent changes and potential bugs, so using it in your project is not recommended.

# Features
- 简单高效,开箱即用
Expand Down
8 changes: 4 additions & 4 deletions build/spider.d.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/// <reference types="node" />
import { EventEmitter } from "events";
import Queue from "./queue";
import { IPipe, IPlan, IState } from "./types";
import { ICurrent, IDefaultPlanOptionCallback, IDefaultPlanOptionInput } from "./defaultPlan";
import { IPipe, IPlan, IState, ITask } from "./types";
import { IDefaultPlanOptionCallback, IDefaultPlanOptionInput } from "./defaultPlan";
/**
* class of NodeSpider
* @class NodeSpider
Expand Down Expand Up @@ -34,15 +34,15 @@ export default class NodeSpider extends EventEmitter {
* @param {number} maxRetry Maximum number of retries for this task
* @param {function} finalErrorCallback The function called when the maximum number of retries is reached
*/
retry(current: ICurrent, maxRetry?: number, finalErrorCallback?: (current: ICurrent) => void): void;
retry(current: ITask, maxRetry?: number, finalErrorCallback?: (current: ITask) => void): void;
plan(item: IPlan | IDefaultPlanOptionInput | IDefaultPlanOptionCallback): symbol;
/**
* 添加待爬取链接到队列,并指定爬取计划。
* @param planKey 指定的爬取计划
* @param url 待爬取的链接(们)
* @param info (optional) extra information attached to the queued task(s); replaces the former `special` option
*/
queue(planKey: symbol, url: string | string[], special?: any): number;
queue(planKey: symbol, url: string | string[], info?: any): number;
pipe(pipeObject: IPipe): symbol;
save(pipeKey: symbol, data: any): Error | undefined;
}
17 changes: 7 additions & 10 deletions build/spider.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
"use strict";
// TODO: 更好的报错机制: 报错建议?以及去除多余的 console.error
// BUG: 使用url.resolve补全url,可能导致 'http://www.xxx.com//www.xxx.com' 的问题。补全前,使用 is-absolute-url 包判断, 或考录使用 relative-url 代替
// TODO: 使用 node 自带 stringdecode 代替 iconv-lite
// mysql 插件
// redis queue
// TODO B 注册pipe和queue可能存在异步操作,此时应该封装到promise或async函数。但依然存在问题:当还没注册好,就调动了queue或者save
// TODO C 兼容新 plan 系统的 queue
// TODO C 更良好的报错提示
Object.defineProperty(exports, "__esModule", { value: true });
const events_1 = require("events");
const uuid = require("uuid");
const uuid = require("uuid/v1");
const defaultPlan_1 = require("./defaultPlan");
const queue_1 = require("./queue");
const defaultOption = {
Expand Down Expand Up @@ -108,9 +106,9 @@ class NodeSpider extends events_1.EventEmitter {
retry(current, maxRetry = 1, finalErrorCallback) {
const task = {
hasRetried: current.hasRetried,
info: current.info,
maxRetry: current.maxRetry,
planKey: current.planKey,
special: current.special,
url: current.url,
};
if (!task.hasRetried) {
Expand All @@ -127,7 +125,7 @@ class NodeSpider extends events_1.EventEmitter {
if (task.hasRetried >= task.maxRetry) {
return finalErrorCallback(current);
}
const plan = current.plan;
const plan = this._STATE.planStore.get(task.planKey);
task.hasRetried++;
this._STATE.queue.jumpTask(task, plan.type); // 插队到队列,重新等待执行
}
Expand Down Expand Up @@ -157,7 +155,7 @@ class NodeSpider extends events_1.EventEmitter {
* @param url 待爬取的链接(们)
* @param info (optional) extra information attached to the queued task(s); replaces the former `special` option
*/
queue(planKey, url, special) {
queue(planKey, url, info) {
// 参数检验
if (typeof planKey !== "symbol") {
throw new TypeError("queue 参数错误");
Expand All @@ -167,16 +165,15 @@ class NodeSpider extends events_1.EventEmitter {
throw new Error("指定plan不存在");
}
// 添加到队列
// TODO C 完善 special: 过滤掉其中不相干的成员?
if (!Array.isArray(url)) {
this._STATE.queue.addTask({ url, planKey, special }, plan.type);
this._STATE.queue.addTask({ url, planKey, info }, plan.type);
}
else {
url.map((u) => {
if (typeof u !== "string") {
return new Error("url数组中存在非字符串成员");
}
this._STATE.queue.addTask({ url: u, planKey, special }, plan.type);
this._STATE.queue.addTask({ url: u, planKey, info }, plan.type);
});
}
this._STATE.working = true;
Expand Down Expand Up @@ -246,9 +243,9 @@ function startTask(type, task, self) {
self._STATE.currentConnections[type]--;
self._STATE.currentTotalConnections--;
}).catch((e) => {
console.log(e);
self._STATE.currentConnections[type]--;
self._STATE.currentTotalConnections--;
throw e;
});
}
/**
Expand Down
1 change: 0 additions & 1 deletion build/types.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ export interface IDefaultOption {
export interface ITask {
url: string;
planKey: symbol;
special?: any;
maxRetry?: number;
hasRetried?: number;
info?: any;
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "nodespider",
"version": "0.6.0",
"version": "0.6.1",
"description": "20 lines of code to build a web crawler as a geek",
"main": "./build/index.js",
"typings": "./build/index.d.ts",
Expand Down
25 changes: 10 additions & 15 deletions src/spider.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
// TODO: 更好的报错机制: 报错建议?以及去除多余的 console.error
// BUG: 使用url.resolve补全url,可能导致 'http://www.xxx.com//www.xxx.com' 的问题。补全前,使用 is-absolute-url 包判断, 或考录使用 relative-url 代替
// TODO: 使用 node 自带 stringdecode 代替 iconv-lite
// mysql 插件
// redis queue
// TODO B 注册pipe和queue可能存在异步操作,此时应该封装到promise或async函数。但依然存在问题:当还没注册好,就调动了queue或者save
// TODO C 兼容新 plan 系统的 queue
// TODO C 更良好的报错提示

// TODO: 修改spider的state、option、timer,以适应新的多任务系统

import * as charset from "charset";
import * as cheerio from "cheerio";
import { EventEmitter } from "events";
Expand All @@ -17,7 +13,7 @@ import * as iconv from "iconv-lite";
import * as request from "request";
import * as stream from "stream";
import * as url from "url";
import * as uuid from "uuid";
import * as uuid from "uuid/v1";
import defaultPlan from "./defaultPlan";
import Queue from "./queue";
import {
Expand Down Expand Up @@ -150,15 +146,15 @@ export default class NodeSpider extends EventEmitter {
*/
// TODO C current 应该能适应所有的plan
public retry(
current: ICurrent,
current: ITask,
maxRetry = 1,
finalErrorCallback?: (current: ICurrent) => void,
finalErrorCallback?: (current: ITask) => void,
) {
const task = {
hasRetried: current.hasRetried,
info: current.info,
maxRetry: current.maxRetry,
planKey: current.planKey,
special: current.special,
url: current.url,
};
if (! task.hasRetried) {
Expand All @@ -168,15 +164,15 @@ export default class NodeSpider extends EventEmitter {
task.maxRetry = maxRetry;
}
if (! finalErrorCallback) {
finalErrorCallback = (currentTask: ICurrent) => {
finalErrorCallback = (currentTask: ITask) => {
console.log("达到最大重试次数,但依旧错误");
};
}
if (task.hasRetried >= task.maxRetry) {
return finalErrorCallback(current);
}

const plan = current.plan;
const plan = this._STATE.planStore.get(task.planKey) as IPlan;
task.hasRetried ++;
this._STATE.queue.jumpTask(task, plan.type); // 插队到队列,重新等待执行
}
Expand Down Expand Up @@ -211,7 +207,7 @@ export default class NodeSpider extends EventEmitter {
* @param url 待爬取的链接(们)
* @param info (optional) extra information attached to the queued task(s); replaces the former `special` option
*/
public queue(planKey: symbol, url: string | string[], special?: any): number {
public queue(planKey: symbol, url: string | string[], info?: any): number {
// 参数检验
if (typeof planKey !== "symbol") {
throw new TypeError("queue 参数错误");
Expand All @@ -222,15 +218,14 @@ export default class NodeSpider extends EventEmitter {
}

// 添加到队列
// TODO C 完善 special: 过滤掉其中不相干的成员?
if (! Array.isArray(url)) {
this._STATE.queue.addTask({url, planKey, special}, plan.type);
this._STATE.queue.addTask({url, planKey, info}, plan.type);
} else {
url.map((u) => {
if (typeof u !== "string") {
return new Error("url数组中存在非字符串成员");
}
this._STATE.queue.addTask({url: u, planKey, special}, plan.type);
this._STATE.queue.addTask({url: u, planKey, info}, plan.type);
});
}

Expand Down Expand Up @@ -306,9 +301,9 @@ function startTask(type: string, task: ITask, self: NodeSpider) {
self._STATE.currentConnections[type] --;
self._STATE.currentTotalConnections --;
}).catch((e: Error) => {
console.log(e);
self._STATE.currentConnections[type] --;
self._STATE.currentTotalConnections --;
throw e;
});
}

Expand Down
1 change: 0 additions & 1 deletion src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ export interface IDefaultOption {
export interface ITask {
url: string;
planKey: symbol;
special?: any;
maxRetry?: number;
hasRetried?: number;
info?: any;
Expand Down

0 comments on commit d01d03a

Please sign in to comment.