From 4c46f73f5339bd674b38122211acef7a486886aa Mon Sep 17 00:00:00 2001
From: xiaohuo
Date: Thu, 7 Jul 2016 00:10:22 +0800
Subject: [PATCH] v1.1.0

---
 README.md                                      |  2 +-
 .../cn/wanghaomiao/crawlers/UseCookie.java     | 28 ++++++++++---------
 .../java/cn/wanghaomiao/seimi/core/Seimi.java  | 13 +++++++--
 3 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 457b1aca..ed3b373b 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ SeimiCrawler aims to be the most practical crawler framework in Java; let's kee
 
 # Introduction #
 
-SeimiCrawler is an agile, independently deployable, distribution-capable Java crawler framework. It tries to lower, as much as possible, the barrier for newcomers to build a highly available and reasonably performant crawler system, and to raise the efficiency of developing such systems. In the SeimiCrawler world, most users only need to write the scraping business logic; Seimi takes care of the rest. Its design was heavily inspired by Scrapy, the Python crawler framework, while blending the strengths of the Java language with features of Spring. It also promotes the more efficient XPath as the convenient, default way to parse HTML, so SeimiCrawler's default HTML parser is [JsoupXpath](http://jsoupxpath.wanghaomiao.cn) (an independent extension project, not bundled with jsoup), and HTML data extraction is done with XPath by default (other parsers can of course be used for data processing). Combined with [SeimiAgent](https://github.com/zhegexiaohuozi/SeimiAgent), it fully solves the problem of scraping complex, dynamically rendered pages.
+SeimiCrawler is an agile, independently deployable, distribution-capable Java crawler framework. It tries to lower, as much as possible, the barrier for newcomers to build a highly available and reasonably performant crawler system, and to raise the efficiency of developing such systems. In the SeimiCrawler world, most users only need to write the scraping business logic; Seimi takes care of the rest. Its design was inspired by Scrapy, the Python crawler framework, while blending the strengths of the Java language with features of Spring. It also promotes the more efficient XPath as the convenient, default way to parse HTML, so SeimiCrawler's default HTML parser is [JsoupXpath](http://jsoupxpath.wanghaomiao.cn) (an independent extension project, not bundled with jsoup), and HTML data extraction is done with XPath by default (other parsers can of course be used for data processing). Combined with [SeimiAgent](https://github.com/zhegexiaohuozi/SeimiAgent), it fully solves the problem of scraping complex, dynamically rendered pages.
 
 # News #
 - 2016.04.14
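For readers skimming the patch: the XPath-first workflow that README paragraph describes looks roughly like the sketch below. This is a minimal illustration only; the package paths, the @Crawler annotation, and document().sel(...) are assumptions based on SeimiCrawler/JsoupXpath conventions of this period, not part of this patch.

// Minimal sketch of the XPath-first workflow described above. Package
// paths, @Crawler, and document().sel(...) are assumed conventions.
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;

import java.util.List;

@Crawler(name = "basic")
public class Basic extends BaseSeimiCrawler {
    @Override
    public String[] startUrls() {
        // Seed URL; each response is handed to the start(Response) callback.
        return new String[]{"http://www.oschina.net/"};
    }

    @Override
    public void start(Response response) {
        try {
            // Extract link targets with an XPath expression via JsoupXpath
            // rather than CSS selectors; the XPath itself is illustrative.
            List<Object> urls = response.document().sel("//a[@class='blog']/@href");
            for (Object url : urls) {
                // Queue each extracted URL with its own callback method.
                push(Request.build(url.toString(), "renderBlog"));
            }
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }

    public void renderBlog(Response response) {
        logger.info(response.getContent());
    }
}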
diff --git a/demo/src/main/java/cn/wanghaomiao/crawlers/UseCookie.java b/demo/src/main/java/cn/wanghaomiao/crawlers/UseCookie.java
index 4bb8c2d8..51e284db 100644
--- a/demo/src/main/java/cn/wanghaomiao/crawlers/UseCookie.java
+++ b/demo/src/main/java/cn/wanghaomiao/crawlers/UseCookie.java
@@ -10,10 +10,12 @@
 import org.apache.commons.lang3.StringUtils;
 
 import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Map;
 
 /**
- * Configuration via annotations
+ * Demo: logging in to oschina
  * Once cookies are enabled, all requests in a crawl share the same cookie store, so state carries over across consecutive requests, including scenarios that require login. Disabled by default.
  * @author 汪浩淼 [et.tw@163.com]
  * @since 2015/10/21.
@@ -22,26 +24,26 @@ public class UseCookie extends BaseSeimiCrawler {
 
     @Override
     public String[] startUrls() {
-        //used to trigger the first callback
-        return new String[]{"http://www.oschina.net/"};
+        return null;
     }
 
     @Override
-    public void start(Response response) {
-        //submit the login request
-        Request login = Request.build("https://www.oschina.net/action/user/hash_login","afterLogin");
-
+    public List<Request> startRequests() {
+        List<Request> requests = new LinkedList<>();
+        Request start = Request.build("https://www.oschina.net/action/user/hash_login","start");
         Map<String,String> params = new HashMap<>();
-        params.put("email","xx@xx.xx");
-        params.put("pwd","xxxxxxxxxxxxxxxxxxxxxxxxxx");
+        params.put("email","xxx@xx.com");
+        params.put("pwd","xxxxxxxxxxxxxxxxxxx");
         params.put("save_login","1");
         params.put("verifyCode","");
-        login.setHttpMethod(HttpMethod.POST);
-        login.setParams(params);
-        push(login);
+        start.setHttpMethod(HttpMethod.POST);
+        start.setParams(params);
+        requests.add(start);
+        return requests;
     }
 
-    public void afterLogin(Response response){
+    @Override
+    public void start(Response response) {
         logger.info(response.getContent());
         push(Request.build("http://www.oschina.net/home/go?page=blog","minePage"));
     }
diff --git a/project/src/main/java/cn/wanghaomiao/seimi/core/Seimi.java b/project/src/main/java/cn/wanghaomiao/seimi/core/Seimi.java
index dbfffb93..96dea36b 100644
--- a/project/src/main/java/cn/wanghaomiao/seimi/core/Seimi.java
+++ b/project/src/main/java/cn/wanghaomiao/seimi/core/Seimi.java
@@ -93,6 +93,7 @@ public void startWorkers(){
 
     private void sendRequest(String crawlerName, SeimiQueue queue, BaseSeimiCrawler instance){
         String[] startUrls = instance.startUrls();
+        boolean trigger = false;
         if (ArrayUtils.isNotEmpty(startUrls)){
             for (String url:startUrls){
                 Request request = new Request();
@@ -106,12 +107,20 @@ private void sendRequest(String crawlerName, SeimiQueue queue, BaseSeimiCrawler
                 queue.push(request);
                 logger.info("{} url={} started",crawlerName,url);
             }
-        }else if (!CollectionUtils.isEmpty(instance.startRequests())){
+            trigger = true;
+        }
+        if (!CollectionUtils.isEmpty(instance.startRequests())){
             for (Request request:instance.startRequests()){
+                request.setCrawlerName(crawlerName);
+                if (StringUtils.isBlank(request.getCallBack())){
+                    request.setCallBack("start");
+                }
                 queue.push(request);
                 logger.info("{} url={} started",crawlerName,request.getUrl());
             }
-        }else {
+            trigger = true;
+        }
+        if (!trigger){
             logger.error("crawler:{} can not find start urls!",crawlerName);
         }
     }
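Taken together, the Seimi.java hunks change sendRequest(...) from an either/or dispatch into an accumulating one: startUrls() and startRequests() can now both seed the queue, and a start request with a blank callback falls back to the "start" method. Below is a minimal sketch of a crawler leaning on both behaviors; the package paths and @Crawler attributes are assumed from the demo code rather than guaranteed by this patch.

// Sketch of a crawler relying on the new dispatch rules in sendRequest(...):
// startUrls() and startRequests() both seed the queue, and a start request
// with a blank callback defaults to "start". Package paths and @Crawler
// attributes are assumptions based on the demo code.
import cn.wanghaomiao.seimi.annotation.Crawler;
import cn.wanghaomiao.seimi.def.BaseSeimiCrawler;
import cn.wanghaomiao.seimi.http.HttpMethod;
import cn.wanghaomiao.seimi.struct.Request;
import cn.wanghaomiao.seimi.struct.Response;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

@Crawler(name = "mixedseeds", useCookie = true)
public class MixedSeeds extends BaseSeimiCrawler {
    @Override
    public String[] startUrls() {
        // Plain GET seeds; sendRequest(...) wraps each one in a Request
        // whose callback is "start".
        return new String[]{"http://www.oschina.net/"};
    }

    @Override
    public List<Request> startRequests() {
        // A POST seed submitted alongside the GET seeds above. The callback
        // is left blank on purpose: the new isBlank(...) branch in
        // sendRequest(...) fills in "start".
        Request login = Request.build("https://www.oschina.net/action/user/hash_login", "");
        Map<String, String> params = new HashMap<>();
        params.put("email", "xxx@xx.com");          // placeholder credentials
        params.put("pwd", "xxxxxxxxxxxxxxxxxxx");   // placeholder credentials
        login.setHttpMethod(HttpMethod.POST);
        login.setParams(params);
        return Collections.singletonList(login);
    }

    @Override
    public void start(Response response) {
        // Both the GET seeds and the POST login response land here.
        logger.info(response.getContent());
    }
}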