Skip to content

Commit

Permalink
merge to master
Browse files Browse the repository at this point in the history
  • Loading branch information
mike442144 committed Aug 9, 2018
2 parents dc4eff3 + 9ab744e commit fe1255d
Show file tree
Hide file tree
Showing 28 changed files with 768 additions and 1,027 deletions.
33 changes: 33 additions & 0 deletions .eslintrc.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
module.exports = {
"env": {
"es6": true,
"node": true,
"mocha": true,

},
"extends": "eslint:recommended",
"parserOptions": {
"sourceType": "module"
},
"rules": {
"indent": [
"error",
"tab"
],
"linebreak-style": [
"error",
"unix"
],
"quotes": [
"error",
"single"
],
"semi": [
"error",
"always"
],
"no-console":[
"off"
]
}
};
6 changes: 3 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
language: node_js
node_js:
- 4.6
- 6.7
- 6.9
- 6
- 8
- 10
os:
- linux
149 changes: 101 additions & 48 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,84 +2,137 @@

[![build status](https://secure.travis-ci.org/mike442144/seenreq.png)](https://travis-ci.org/mike442144/seenreq)
[![Dependency Status](https://david-dm.org/mike442144/seenreq/status.svg)](https://david-dm.org/mike442144/seenreq)
[![NPM download][download-image]][download-url]
[![NPM quality][quality-image]][quality-url]

[quality-image]: http://npm.packagequality.com/shield/seenreq.svg?style=flat-square
[quality-url]: http://packagequality.com/#?package=seenreq
[download-image]: https://img.shields.io/npm/dm/seenreq.svg?style=flat-square
[download-url]: https://npmjs.org/package/seenreq

# seenreq
A library to test if a url/request is crawled, usually used in a web crawler. Compatible with [request](https://github.com/request/request) and [node-crawler](https://github.com/bda-research/node-crawler)
A library to test if a url/request is crawled, usually used in a web crawler. Compatible with [request](https://github.com/request/request) and [node-crawler](https://github.com/bda-research/node-crawler). The 1.x or newer version has quite different APIs and is not compatible with 0.x versions. Please read the [upgrade guide](./UPGRADE.md) document.

# Table of Contents

* [Quick Start](#quick-start)
* [Installation](#installation)
* [Basic Usage](#basic-usage)
* [Use Redis](#use-redis)
* [Use Mongodb](#use-mongodb)
* [Class:seenreq](#classseenreq)
* [seen.initialize()](#seeninitialize)
* [seen.normalize(uri|option[,options])](#seennormalizeurioptionoptions)
* [seen.exists(uri|option|array[,options])](#seenexistsurioptionarrayoptions)
* [seen.dispose()](#seendispose)
* [Options](#options)

# Install
## Quick Start

$ npm install seenreq
### Installation

# Basic Usage
$ npm install seenreq --save

### Basic Usage

```javascript
var seenreq = require('seenreq')
var seen = new seenreq();
const seenreq = require('seenreq')
, seen = new seenreq();

//url to be normalized
var url = "http://www.GOOGLE.com";
console.log(seen.normalize(url));//GET http://www.google.com/\r\n
let url = "http://www.GOOGLE.com";
console.log(seen.normalize(url));//{ sign: "GET http://www.google.com/\r\n", options: {} }

//request options to be normalized
var option = {
uri:'http://www.GOOGLE.com'
let option = {
uri: 'http://www.GOOGLE.com',
rupdate: false
};

console.log(seen.normalize(option));//GET http://www.google.com/\r\n

//return false if ask for a `request` never see
console.log(seen.exists(url));//false

//return true if got same `request`
console.log(seen.exists(opt));//true
console.log(seen.normalize(option));//{sign: "GET http://www.google.com/\r\n", options:{rupdate: false} }

seen.initialize().then(()=>{
return seen.exists(url);
}).then( (rst) => {
console.log(rst[0]);//false if ask for a `request` never see
return seen.exists(option);
}).then( (rst) => {
console.log(rst[0]);//true if got same `request`
}).catch( (e) => {
console.error(e);
});
```
When you call `exists`, the module will do normalization itself first and then check if exists.

# Use Redis to store keys
`seenreq` default stores keys in memory, so process will use unlimited memory if there are unlimited keys. Redis will solve this problem. All `ioredis` options are recived and supported.
### Use Redis
`seenreq` stores keys in memory by default, so memory usage will soar as the number of keys increases. Redis will solve this problem. Because seenreq uses `ioredis` as its Redis client, all `ioredis` [options](https://github.com/luin/ioredis/blob/master/API.md) are received and supported. You should first install:

```bash
npm install seenreq-repo-redis --save
```
and then set repo to `redis`:

```javascript
var seenreq = require('seenreq')
var seen = new seenreq({
const seenreq = require('seenreq')
let seen = new seenreq({
repo:'redis',// use redis instead of memory
host:'127.0.0.1',
host:'127.0.0.1',
port:6379,
clearOnQuit:false // default true.
clearOnQuit:false // clear redis cache or don't when calling dispose(), default true.
});

var url = "http://www.GOOGLE.com";

//because of non-blocking I/O, you have to use a callback function to get result
seen.exists(url,{
callback:function(err,result){
if(err){
console.error(err);
}else{
console.log(result);
}
}
});
seen.initialize().then(()=>{
//do stuff...
}).catch( (e) => {
console.error(e);
});
```

### Use mongodb
It is similar to Redis above:

```bash
npm install seenreq-repo-mongo --save
```
Class:Seenreq
-------------

Instance of Seenreq
```javascript
const seenreq = require('seenreq')
let seen = new seenreq({
repo:'mongo',
url:'mongodb://xxx/seenreq',
collection: 'foor'
});
```


## Class:seenreq

Instance of seenreq

### __seen.initialize()__
Initialize the repo, returns a promise.

### __seen.normalize(uri|option[,options])__
* `uri` String, `option` is Option of [request](https://github.com/request/request) or [node-crawler](https://github.com/bda-research/node-crawler)
* [options](#options)

Returns normalized Object: {sign,options}.

### __seen.exists(uri|option|array[,options])__
* uri|option
* [options](#options)

Returns a promise with a Boolean array, e.g. [true, false, true, false, false].

__seen.normalize(uri|option)__
* `uri` String, `option` is Option of `request` or `node-webcrawler`. return normalized String.
### __seen.dispose()__

__seen.exists(uri|option|[uri][,options])__
* [options](#options), Warning: When using default `repo` if you call `exists` with an array of `uri` that have duplicate uris, the function won't remove.
Dispose resources of repo. If you are using a repo other than memory, such as Redis, you should call `dispose` to release the connection. Returns a promise.

__seen.dispose()__
* dispose resources of repo. If you are using Redis and do not call `dispose` the connection will keep forever, that is your process will never exit.
## Options

Options
-----------------
* removeKeys: Array, Ignore specified keys when doing normalization. For instance, there is a `ts` property in the url like `http://www.xxx.com/index?ts=1442382602504` which is timestamp and it should be same whenever you visit.
* stripFragment: Boolean, Remove the fragment at the end of the URL (Default true).
* update: Boolean, Store in repo so that `seenreq` can hit the `req` next time (Default true).
* callback: Function, return result if using Redis repo.
* rupdate: Boolean, it is short for `repo update`. Store in repo so that `seenreq` can hit the same `req` next time (Default true).

# RoadMap
* add `mysql` repo to persist keys to disk.
Expand Down
6 changes: 6 additions & 0 deletions UPGRADE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Upgrade guide
## api changes from version below 1.0 to 1.x
* `exists` is changed to node-style arguments with callback. In previous versions it used memory as the default repo to store keys and you could get the result from the return value, but you can't in the new version. seenreq uses `process.nextTick` to produce an asynchronous callback, so be careful to change your code to get the result in the callback even if you use the default memory repo.
* `normalize` return value is an object now, it looks like: `{sign:"GET http://www.google.com\r\n",options:{key1:"val"}}`, the sign is same as the returned string by normalize before.
* `options.update` is changed to `options.rupdate` to avoid duplicate, so you can place `rupdate` in `request`, e.g. `{uri:"http://www.google.com", rupdate:false}`. It also takes effect to place it in `options`.
* Call `initialize` before using `exists`.
115 changes: 115 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@

'use strict';

const URL = require('node-url-utils');

/*
*
*
*/

function seenreq(options) {
let Repo = null;
const Normalizers = [];

options = options || {};
if(!options.repo || options.repo==='default' || options.repo==='memory'){
Repo = require('./lib/repo/default.js');
}else{
const moduleName = `seenreq-repo-${options.repo}`;
try{
Repo = require(moduleName);
}catch(e){
console.error(`Cannot load module ${moduleName}, please run 'npm install ${moduleName}' and retry`);
return;
}
}

this.repo = new Repo(options);

if(!options.normalizer){
Normalizers.push(require('./lib/normalizer/default.js'));
}else{
let moduleNames = null;
if(typeof options.normalizer === 'string'){
moduleNames = [options.normalizer];
}else{
moduleNames = options.normalizer;
}

moduleNames.map(moduleName=>{
moduleName = `seenreq-nmlz-${moduleName}`;
try{
Normalizers.push(require(moduleName));
}catch(e){
console.error(`Cannot load module ${moduleName}, please run 'npm install ${moduleName}' and retry`);
}
});
}

this.normalizers = Normalizers.map(ctor => new ctor(options));
this.globalOptions = options;
}

/* Initialize the repo by delegating to `repo.initialize()`.
 * Takes no arguments.
 * @return, whatever the repo's initialize() returns (the README describes a Promise)
 */
seenreq.prototype.initialize = function(){
	const repo = this.repo;
	return repo.initialize();
};

/* Build the signature of a request: method + normalized uri + body.
 * - req, String|Object. A uri string, or a request object carrying `uri` (or `url`)
 * - [options], Object. Merged over the instance's global options into a fresh object
 * @return, Object. {sign, options}. `sign` is "<method> <normalized uri>\r\n<body>";
 *   `options` is the merged options plus every key of the normalized request
 *   that is not a known request argument (e.g. `rupdate`)
 * @throws Error when req is falsy
 */
seenreq.prototype.normalize = function(req, options) {
	if(!req){
		throw new Error('Argument req is required.');
	}

	// Defaults; a request object may override both.
	const request = {
		method: 'GET',
		body: null
	};

	// Merge into a new object so neither globalOptions nor the caller's object is mutated.
	const merged = Object.assign({}, this.globalOptions, options);

	switch (typeof req) {
	case 'string':
		request.uri = req;
		break;
	case 'object':
		Object.assign(request, req);
		request.uri = request.uri || request.url;
		break;
	}

	/* Run the request through every normalizer in order; each receives the previous result */
	let normalized = request;
	for (const normalizer of this.normalizers) {
		normalized = normalizer.normalize(normalized, merged);
	}

	const requestLine = [normalized.method, URL.normalize(normalized.uri, merged)].join(' ');
	const sign = [requestLine, normalized.body].join('\r\n');

	const knownRequestArgs = new Set(['uri','url','qs','method','headers','body','form','json','multipart','followRedirect','followAllRedirects', 'maxRedirects','encoding','pool','timeout','proxy','auth','oauth','strictSSL','jar','aws','gzip','time','tunnel','proxyHeaderWhiteList','proxyHeaderExclusiveList','localAddress','forever']);

	// Anything that is not a request argument is treated as an option and surfaced to the caller.
	for (const key of Object.keys(normalized)) {
		if (knownRequestArgs.has(key)) {
			continue;
		}
		merged[key] = normalized[key];
	}

	return {sign, options: merged};
};

/* Check whether one or more requests have been seen, delegating to the repo.
 * - req, String|Object|Array. A single request, or an array of requests;
 *   a single request is wrapped into a one-element array
 * - [options], Object. Passed to normalize() for every request and to repo.exists()
 * @return, whatever repo.exists() returns (the README describes a Promise of a Boolean array)
 * @throws Error when req is falsy
 */
seenreq.prototype.exists = function(req, options) {
	if(!req){
		throw new Error('Argument req is required.');
	}

	// Array.isArray, unlike `instanceof Array`, also recognizes arrays created in
	// another realm (e.g. a vm context), which would otherwise be treated as one request.
	const reqs = Array.isArray(req) ? req : [req];

	const rs = reqs.map(r => this.normalize(r, options));
	return this.repo.exists(rs, options);
};

/* Release the repo's resources by delegating to `repo.dispose()`.
 * @return, whatever the repo's dispose() returns (the README describes a Promise)
 */
seenreq.prototype.dispose = function() {
	const repo = this.repo;
	return repo.dispose();
};

module.exports = seenreq;
Loading

0 comments on commit fe1255d

Please sign in to comment.