
Commit

v1.1.0
zhegexiaohuozi committed Jul 6, 2016
1 parent 417070c commit 4c46f73
Showing 3 changed files with 27 additions and 16 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -6,7 +6,7 @@ SeimiCrawler aims to be the most practical crawler framework in Java; let's…

  # Introduction #

- SeimiCrawler is an agile, independently deployable, distributed-capable Java crawler framework that aims to lower, as far as possible, the barrier for newcomers to build a highly usable and decently performing crawler system, and to raise the efficiency of crawler development. In the world of SeimiCrawler, most people only need to write the crawling business logic; Seimi handles the rest. In its design SeimiCrawler is heavily inspired by Scrapy, the Python crawler framework, while blending in characteristics of the Java language itself and features of Spring. It also hopes to make the more efficient XPath the convenient and common way to parse HTML in China, so SeimiCrawler's default HTML parser is [JsoupXpath](http://jsoupxpath.wanghaomiao.cn) (an independent extension project, not bundled with jsoup), and by default all HTML data extraction is done with XPath (you may, of course, choose another parser for data processing). Combined with [SeimiAgent](https://github.com/zhegexiaohuozi/SeimiAgent), it thoroughly solves the crawling of complex, dynamically rendered pages.
+ SeimiCrawler is an agile, independently deployable, distributed-capable Java crawler framework that aims to lower, as far as possible, the barrier for newcomers to build a highly usable and decently performing crawler system, and to raise the efficiency of crawler development. In the world of SeimiCrawler, most people only need to write the crawling business logic; Seimi handles the rest. In its design SeimiCrawler is inspired by Scrapy, the Python crawler framework, while blending in characteristics of the Java language itself and features of Spring. It also hopes to make the more efficient XPath the convenient and common way to parse HTML in China, so SeimiCrawler's default HTML parser is [JsoupXpath](http://jsoupxpath.wanghaomiao.cn) (an independent extension project, not bundled with jsoup), and by default all HTML data extraction is done with XPath (you may, of course, choose another parser for data processing). Combined with [SeimiAgent](https://github.com/zhegexiaohuozi/SeimiAgent), it thoroughly solves the crawling of complex, dynamically rendered pages.

  # News #
  - 2016.04.14
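For readers new to the framework, the model this README paragraph describes comes down to an annotated crawler class plus XPath callbacks. Below is a minimal sketch in the style of the repository's demos; the package paths, target URL, and XPath expressions are illustrative assumptions, not code from this commit:

```java
package cn.wanghaomiao.crawlers;

// NOTE: package paths are assumed from the v1.x demo sources; verify against the repo.
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;
import cn.wanghaomiao.xpath.model.JXDocument;

import java.util.List;

/**
 * Annotation-configured crawler: seeds come from startUrls(),
 * and each callback extracts data with XPath (JsoupXpath).
 */
@Crawler(name = "basic")
public class Basic extends BaseSeimiCrawler {
    @Override
    public String[] startUrls() {
        // Seed URLs; responses are dispatched to start(Response) by default
        return new String[]{"http://www.oschina.net/"};
    }

    @Override
    public void start(Response response) {
        try {
            JXDocument doc = response.document();
            // Illustrative XPath: queue every link target for getTitle()
            List<Object> urls = doc.sel("//a/@href");
            for (Object url : urls) {
                push(Request.build(url.toString(), "getTitle"));
            }
        } catch (Exception e) {
            logger.error("extract failed", e);
        }
    }

    public void getTitle(Response response) {
        try {
            // Illustrative XPath: log the page title
            logger.info("title:{}", response.document().sel("//title/text()"));
        } catch (Exception e) {
            logger.error("extract failed", e);
        }
    }
}
```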
demo/src/main/java/cn/wanghaomiao/crawlers/UseCookie.java: 28 changes (15 additions, 13 deletions)
@@ -10,10 +10,12 @@
  import org.apache.commons.lang3.StringUtils;

  import java.util.HashMap;
+ import java.util.LinkedList;
+ import java.util.List;
  import java.util.Map;

  /**
   * Configured via annotations
   * A demo of logging in to oschina
   * With cookies enabled, the whole run shares one cookie store, so state carries across consecutive requests (including scenarios that require being logged in); this is off by default.
   * @author 汪浩淼 [[email protected]]
   * @since 2015/10/21.
@@ -22,26 +24,26 @@
  public class UseCookie extends BaseSeimiCrawler {
      @Override
      public String[] startUrls() {
-         //Used to trigger the first callback
-         return new String[]{"http://www.oschina.net/"};
+         return null;
      }

      @Override
-     public void start(Response response) {
-         //Submit the login request
-         Request login = Request.build("https://www.oschina.net/action/user/hash_login","afterLogin");
+     public List<Request> startRequests() {
+         List<Request> requests = new LinkedList<>();
+         Request start = Request.build("https://www.oschina.net/action/user/hash_login","start");
          Map<String,String> params = new HashMap<>();
-         params.put("email","xx@xx.xx");
-         params.put("pwd","xxxxxxxxxxxxxxxxxxxxxxxxxx");
+         params.put("email","xxx@xx.com");
+         params.put("pwd","xxxxxxxxxxxxxxxxxxx");
          params.put("save_login","1");
          params.put("verifyCode","");
-         login.setHttpMethod(HttpMethod.POST);
-         login.setParams(params);
-         push(login);
+         start.setHttpMethod(HttpMethod.POST);
+         start.setParams(params);
+         requests.add(start);
+         return requests;
      }

-     public void afterLogin(Response response){
+     @Override
+     public void start(Response response) {
          logger.info(response.getContent());
          push(Request.build("http://www.oschina.net/home/go?page=blog","minePage"));
      }
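Pulled together, the reworked demo now reads roughly as below. This is a hedged sketch rather than the file's full contents; the useCookie attribute on @Crawler and the package paths are assumptions based on the cookie behavior the javadoc describes and on the v1.x demo sources:

```java
// ASSUMPTION: @Crawler exposes a useCookie switch that turns on the shared
// cookie store described in the javadoc; package paths assumed from the demos.
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.http.HttpMethod;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

@Crawler(name = "usecookie", useCookie = true)
public class CookieLogin extends BaseSeimiCrawler {
    @Override
    public String[] startUrls() {
        return null; // seeds now come from startRequests()
    }

    @Override
    public List<Request> startRequests() {
        // The login POST is the seed; its callback "start" routes the
        // response to start(Response) below.
        Request login = Request.build("https://www.oschina.net/action/user/hash_login", "start");
        Map<String, String> params = new HashMap<>();
        params.put("email", "xxx@xx.com"); // placeholder credentials
        params.put("pwd", "xxxxxxxx");
        params.put("save_login", "1");
        params.put("verifyCode", "");
        login.setHttpMethod(HttpMethod.POST);
        login.setParams(params);
        return Collections.singletonList(login);
    }

    @Override
    public void start(Response response) {
        logger.info(response.getContent());
        // Same cookie store, so this request runs as the logged-in user
        push(Request.build("http://www.oschina.net/home/go?page=blog", "minePage"));
    }

    public void minePage(Response response) {
        logger.info(response.getContent());
    }
}
```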
project/src/main/java/cn/wanghaomiao/seimi/core/Seimi.java: 13 changes (11 additions, 2 deletions)
@@ -93,6 +93,7 @@ public void startWorkers(){

  private void sendRequest(String crawlerName, SeimiQueue queue, BaseSeimiCrawler instance){
      String[] startUrls = instance.startUrls();
+     boolean trigger = false;
      if (ArrayUtils.isNotEmpty(startUrls)){
          for (String url:startUrls){
              Request request = new Request();
@@ -106,12 +107,20 @@ private void sendRequest(String crawlerName, SeimiQueue queue, BaseSeimiCrawler
              queue.push(request);
              logger.info("{} url={} started",crawlerName,url);
          }
-     }else if (!CollectionUtils.isEmpty(instance.startRequests())){
+         trigger = true;
+     }
+     if (!CollectionUtils.isEmpty(instance.startRequests())){
          for (Request request:instance.startRequests()){
              request.setCrawlerName(crawlerName);
+             if (StringUtils.isBlank(request.getCallBack())){
+                 request.setCallBack("start");
+             }
              queue.push(request);
              logger.info("{} url={} started",crawlerName,request.getUrl());
          }
-     }else {
+         trigger = true;
+     }
+     if (!trigger){
          logger.error("crawler:{} can not find start urls!",crawlerName);
      }
  }
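The net effect of this change: startUrls() and startRequests() are no longer either/or, seeds from both sources are pushed in the same run, a seed Request with a blank callback is defaulted to "start", and the "can not find start urls" error now fires only when neither source yields anything. A sketch of a crawler relying on the relaxed contract (package paths assumed as in the demos above):

```java
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.http.HttpMethod;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;

import java.util.Collections;
import java.util.List;

// Sketch: with the reworked sendRequest(), GET seeds from startUrls() and a
// POST seed from startRequests() can feed the same run.
@Crawler(name = "mixedseeds")
public class MixedSeeds extends BaseSeimiCrawler {
    @Override
    public String[] startUrls() {
        return new String[]{"http://www.oschina.net/"}; // plain GET seeds
    }

    @Override
    public List<Request> startRequests() {
        // Explicit callback here; per the diff above, a blank callback
        // would be defaulted to "start" at push time anyway.
        Request post = Request.build("https://www.oschina.net/action/user/hash_login", "start");
        post.setHttpMethod(HttpMethod.POST);
        return Collections.singletonList(post);
    }

    @Override
    public void start(Response response) {
        logger.info(response.getContent());
    }
}
```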
