Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Master #1

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.idea
25 changes: 23 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,29 @@ npm install scraper-engine
```
Create `app.js`:
```
var port=4000;
require('scraper-engine').start(__dirname,port);
var express = require('express');
var app = express();


var nu = require('nu-widget');
require('scraper-engine');

// require extensions
require('example_module_extensions');


app.get('/:site', function(req, res) {
var site = req.params.site;
nu.scraperEngine({
req : req,
res : res,
dir : __dirname,
site : site,
});
});

app.listen(4000);

```

```
Expand Down
9 changes: 9 additions & 0 deletions example/google.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
var S = require('string');
exports.scraper = {
url: function () {
return "http://google.co.id"
},
evaluate : function(){
return this.engine.click('input:contains("Google Search")');
},
}
40 changes: 40 additions & 0 deletions lib/example_module_extensions.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
var nu = require('nu-widget');
var cheerio = require('cheerio');

/** tukangtest extension for scraperEngine **/

// Holds the stock implementation(s) taken from the scraperEngine prototype at
// load time, so code that later replaces a method can still call the original.
var base = {};
base.scrape = nu.lukluk.scraperEngine.prototype.scrape;

// Extends the scraperEngine prototype with an optional browser-driven scrape.
//
// When `options.evaluate` is set (a function), scrape() opens the page with
// node-horseman, lets `evaluate` queue browser actions, captures the
// resulting HTML and hands it to load_html(). Otherwise the original scrape()
// captured in `base` is used unchanged.
nu.extend(true, nu.lukluk.scraperEngine.prototype, {

  options: {
    evaluate: false,
    actions: []
  },

  // Dispatches to the browser-driven scrape or the original one.
  scrape: function (scr, i, callback) {
    if (this.options.evaluate) {
      this.scrape_tukang.apply(this, arguments);
    } else {
      base.scrape.apply(this, arguments);
    }
  },

  // Browser-driven scrape.
  //   scr      - scraper definition; scr.url(i) yields the address to open
  //   i        - page index passed through to scr.url and load_html
  //   callback - passed through to load_html
  scrape_tukang: function (scr, i, callback) {
    var self = this;
    var url = scr.url(i);
    var Horseman = require('node-horseman');
    var horseman = new Horseman();

    // Exposed as this.engine so that options.evaluate can chain onto it.
    this.engine = horseman
      .userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0')
      .open(url);

    // evaluate() is expected to return the extended chain; if it returns
    // nothing, keep the chain we already have instead of crashing on
    // `undefined.html()`.
    var chained = this.options.evaluate.apply(this, arguments);
    if (chained) {
      this.engine = chained;
    }

    this.engine.html()
      .then(function (result) {
        var s = cheerio.load(result);
        self.load_html(scr, i, callback, s);
      })
      .catch(function (err) {
        // Previously a failed navigation/action became an unhandled rejection
        // and the HTTP request was left hanging.
        console.error('scrape_tukang failed for ' + url, err);
        if (scr.res && !scr.res.headersSent) {
          scr.res.status(500);
          scr.res.send('error ' + err);
        }
      })
      .finally(function () {
        // Always shut the browser process down; it was never closed before.
        return horseman.close();
      });
  },
});
280 changes: 137 additions & 143 deletions lib/listingScraperAPI.js
Original file line number Diff line number Diff line change
@@ -1,166 +1,160 @@
var express = require('express');
var app = express();
var nu = require('nu-widget');
var request = require('request');
var json2csv = require('nice-json2csv');
var cheerio = require('cheerio');

var extend = require('extend');

var fs = require('fs');

function doScrape(scr, i, callback) {
var json = [];
console.log("------->" + scr.url(i));
request(scr.url(i), function(initialerror, firstresponse, firsthtml) {
if (initialerror) {
var content = initialerror;
res.status(404);
res.send('eroor ' + content);
nu.widget('lukluk.scraperEngine',{
options : {
dir : __dirname,
output : 'json'
},
_create : function(){
if(this.options.site)
this._get_site_params();
if(this.options.setup)
this.options.setup(this.options.req);
this.run(this.options,[], function(result, res) {
switch (self.options.output){
default:
case 'json':
res.json(result);
break;
case 'csv':
res.csv(result);
break;
}
},this.options.res);
},
// Loads the scraper definition for options.site.
//
// Looks for `<options.dir>/<options.site>.js`. If the file does not exist, a
// 404 with the body 'no input' is sent on options.res. Otherwise the module's
// exported `scraper` object is deep-merged into this.options.
//
// NOTE(review): options.site is concatenated into a path that is passed to
// require(); if it comes from the request URL it should be validated - confirm
// against the caller.
_get_site_params : function(){
var fileName = this.options.site;
// Existence check; a missing file is folded into the "no file name" case below.
if (!fs.existsSync(this.options.dir + '/' + fileName + '.js')) {
fileName = false;
}
if (!fileName) {
this.options.res.status(404);
this.options.res.send('no input');
} else {
// Deep merge (first argument `true`) of the site's scraper into the widget options.
extend(true,this.options,require(this.options.dir + '/' + fileName + '.js').scraper);
}

// NOTE(review): `firsthtml` is not defined anywhere in this method and `s` is
// never used; this looks like a removed line from the diff view - confirm and drop.
var s = cheerio.load(firsthtml);

},
run : function(scr,allJson, callback,res) {
var self = this;
var index = scr.index ? scr.index : 1;

this.scrape(scr, index, function(error, json, s) {
if (json.length > 0)
allJson = allJson.concat(json);
if (scr.next && scr.next(s, index)) {
index += 1;
scr.index = index;
self.run(scr, allJson, callback, res);
} else {

callback && callback(allJson, res);
}
});
},
scrape : function(scr,i,callback){
var self = this;
var json = [];
console.log("------->" + scr.url(i));
request(scr.url(i), function(initialerror, firstresponse, firsthtml) {
if (initialerror) {
var content = initialerror;
res.status(404);
res.send('eroor ' + content);
}

var s = cheerio.load(firsthtml);
self.load_html(scr,i,callback,s);
})
},
// Routes a parsed page `s` to the matching extractor:
//   - scr.rows is set  -> this.load_rows(scr,i,callback,s)
//   - scr.list is set  -> this.load_list(scr,i,callback,s)
//   - otherwise        -> the page HTML (s.html()) is sent on scr.res
//
// NOTE(review): this span was taken from a diff view without +/- markers. The
// inline row loop and the `var list = scr.list(s);` line below repeat what
// load_rows/load_list do and look like removed lines - confirm against the
// real file before relying on them.
load_html : function(scr,i,callback,s){
if (scr.rows) {
var rows = scr.rows(s);

var tot = rows.length;
rows.each(function() {
var output = {};
for (var key in scr.fields) {
if (typeof scr.fields[key] == 'function')

output[key] = scr.fields[key](s(this),s);
}
// NOTE(review): `json` is not declared in this method.
json.push(output);
if (json.length >= tot) {
if (scr.finish)
scr.finish();
console.log("--->" + json.length + " records");
callback && callback(false, json, s);
}
});
this.load_rows(scr,i,callback,s);
} else {
// NOTE(review): called before the `if(scr.list)` check, so this throws when
// scr.list is not defined.
var list = scr.list(s);
if(scr.list){
this.load_list(scr,i,callback,s);
}else{
scr.res.send(s.html());
}
}
},
load_rows : function(scr,i,callback,s){
var self = this;
var rows = scr.rows(s);

var tot = rows.length;
rows.each(function() {
var output = {};
for (var key in scr.fields) {
if (typeof scr.fields[key] == 'function')

output[key] = scr.fields[key](s(this),s);
}
json.push(output);
if (json.length >= tot) {
if (scr.finish)
scr.finish();
console.log("--->" + json.length + " records");
callback && callback(false, json, s);
}
});
},
load_list : function(scr,i,callback,s){
var self = this;
var list = scr.list(s);
// console.log('list',list)
var tot = 0;
var tot = 0;

for (var i in list)
for (var i in list)
// var i=0;
{
//Detail
console.log("--->" + list[i]);
request(list[i], function(error, response, dom) {
var ss = cheerio.load(dom);
var sublist = scr.sublist(ss);
{
//Detail
console.log("--->" + list[i]);
request(list[i], function(error, response, dom) {
var ss = cheerio.load(dom);
var sublist = scr.sublist(ss);
// console.log('sublist',sublist)
tot = tot+sublist.length;
tot = tot+sublist.length;

for (var ii in sublist)
for (var ii in sublist)
// var ii=0;
{

//Detail
console.log("--->" + sublist[ii]);
request(sublist[ii], function(error, response, dom) {
if (!error) {

var jqq = cheerio.load(dom);

var output = {};
for (var key in scr.fields) {
if (typeof scr.fields[key] == 'function')
output[key] = scr.fields[key](jqq, dom);
}
json.push(output);
console.log(json.length,tot);
if (json.length >= tot) {
if (scr.finish)
scr.finish();
callback && callback(false, json, s);
}
} else {
if (error) {


}
{

}
})
}
})
}
}
})
}

function doRun(scr, allJson, callback, res) {
var index = scr.index ? scr.index : 1;

doScrape(scr, index, function(error, json, s) {
if (json.length > 0)
allJson = allJson.concat(json);
if (scr.next && scr.next(s, index)) {
index += 1;
scr.index = index;
doRun(scr, allJson, callback, res);
} else {

callback && callback(allJson, res);
}
});
}

exports.start = function(dir, port) {
if (!port) {
port = 4000;
}
app.use(json2csv.expressDecorator);
app.get('/output.csv', function(req, res) {
var fileName = req.param('site');
if (!fs.existsSync(dir + '/' + fileName + '.js')) {
fileName = false;
}
if (!fileName) {
res.status(404);
res.send('no input');
} else {
var scr = require(dir + '/' + fileName + '.js').scraper;
if (scr.setup)
scr.setup(req);
doRun(scr, [], function(result, res) {

res.csv (result,"output.csv");
}, res);

}
//Detail
console.log("--->" + sublist[ii]);
request(sublist[ii], function(error, response, dom) {
if (!error) {

var jqq = cheerio.load(dom);

var output = {};
for (var key in scr.fields) {
if (typeof scr.fields[key] == 'function')
output[key] = scr.fields[key](jqq, dom);
}
json.push(output);
console.log(json.length,tot);
if (json.length >= tot) {
if (scr.finish)
scr.finish();
callback && callback(false, json, s);
}
} else {
if (error) {

})
app.get('/output.json', function(req, res) {
var fileName = req.param('site');
if (!fs.existsSync(dir + '/' + fileName + '.js')) {
fileName = false;
}
if (!fileName) {
res.status(404);
res.send('no input');
} else {
var scr = require(dir + '/' + fileName + '.js').scraper;
if (scr.setup)
scr.setup(req);
doRun(scr, [], function(result, res) {

res.json (result);
}, res);
}

}
})
}
})
}



})
var server = app.listen(port, function() {
console.log("scraper engine ready!");
console.log('Listening on port %d', server.address().port);
});

}
}
});
Loading