Skip to content

Commit 5d55bf3

Browse files
committed
Merge branch 'release/0.10.0'
2 parents 19288e9 + 73dd2eb commit 5d55bf3

File tree

18 files changed

+118
-53
lines changed

18 files changed

+118
-53
lines changed

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,9 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/))
118118

119119
There are more examples in `webmagic-samples` package.
120120

121-
### Lisence:
121+
### License:
122122

123-
Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0)
123+
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
124124

125125
### Thanks:
126126

pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<?xml version="1.0" encoding="UTF-8"?>
22
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
33
<groupId>us.codecraft</groupId>
4-
<version>0.9.1</version>
4+
<version>0.10.0</version>
55
<modelVersion>4.0.0</modelVersion>
66
<packaging>pom</packaging>
77
<properties>

src/site/site.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<skin>
66
<groupId>org.apache.maven.skins</groupId>
77
<artifactId>maven-fluido-skin</artifactId>
8-
<version>1.9</version>
8+
<version>1.11.1</version>
99
</skin>
1010
<body>
1111
<menu ref="parent" inherit="top" />

webmagic-core/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<groupId>us.codecraft</groupId>
55
<artifactId>webmagic-parent</artifactId>
6-
<version>0.9.1</version>
6+
<version>0.10.0</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

webmagic-core/src/main/java/us/codecraft/webmagic/Page.java

+47-15
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,34 @@ public class Page {
4949

5050
private byte[] bytes;
5151

52-
private List<Request> targetRequests = new ArrayList<Request>();
52+
private List<Request> targetRequests = new ArrayList<>();
5353

5454
private String charset;
5555

5656
public Page() {
5757
}
5858

59-
public static Page fail(){
59+
/**
60+
* Returns a {@link Page} whose {@link #downloadSuccess} is {@code false}.
61+
*
62+
* @return the page.
63+
* @deprecated Use {@link #fail(Request)} instead.
64+
*/
65+
@Deprecated
66+
public static Page fail() {
67+
return fail(null);
68+
}
69+
70+
/**
71+
* Returns a {@link Page} whose {@link #downloadSuccess} is {@code false}
72+
* and whose {@link #request} is the given request.
73+
*
74+
* @return the page.
75+
* @since 0.10.0
76+
*/
77+
public static Page fail(Request request){
6078
Page page = new Page();
79+
page.setRequest(request);
6180
page.setDownloadSuccess(false);
6281
return page;
6382
}
@@ -123,13 +142,7 @@ public List<Request> getTargetRequests() {
123142
* @param requests requests
124143
*/
125144
public void addTargetRequests(Iterable<String> requests) {
126-
for (String s : requests) {
127-
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
128-
continue;
129-
}
130-
s = UrlUtils.canonicalizeUrl(s, url.toString());
131-
targetRequests.add(new Request(s));
132-
}
145+
addTargetRequests(requests, 0); // Default priority is 0
133146
}
134147

135148
/**
@@ -139,13 +152,32 @@ public void addTargetRequests(Iterable<String> requests) {
139152
* @param priority priority
140153
*/
141154
public void addTargetRequests(Iterable<String> requests, long priority) {
142-
for (String s : requests) {
143-
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
144-
continue;
145-
}
146-
s = UrlUtils.canonicalizeUrl(s, url.toString());
147-
targetRequests.add(new Request(s).setPriority(priority));
155+
if(requests == null) {
156+
return;
157+
}
158+
159+
for (String req : requests) {
160+
addRequestIfValid(req, priority);
161+
}
162+
}
163+
164+
/**
165+
* Helper method to add a request if it's valid.
166+
*
167+
* @param url URL to add
168+
* @param priority Priority for the URL
169+
*/
170+
private void addRequestIfValid(String url, long priority) {
171+
if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
172+
return;
173+
}
174+
175+
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
176+
Request req = new Request(canonicalizedUrl);
177+
if(priority > 0) {
178+
req.setPriority(priority);
148179
}
180+
targetRequests.add(req);
149181
}
150182

151183
/**

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java

+36
Original file line numberDiff line numberDiff line change
@@ -36,26 +36,62 @@ public Html download(String url, String charset) {
3636
return (Html) page.getHtml();
3737
}
3838

39+
/**
40+
* @param request the {@link Request}.
41+
* @deprecated Use {@link #onSuccess(Page, Task)} instead.
42+
*/
3943
@Deprecated
4044
protected void onSuccess(Request request) {
4145
}
4246

4347
/**
48+
* @param request the {@link Request}.
49+
* @param task the {@link Task}.
4450
* @since 0.7.6
51+
* @deprecated Use {@link #onSuccess(Page, Task)} instead.
4552
*/
53+
@Deprecated
4654
protected void onSuccess(Request request, Task task) {
4755
this.onSuccess(request);
4856
}
4957

58+
/**
59+
* @param page the {@link Page}.
60+
* @param task the {@link Task}.
61+
* @since 0.10.0
62+
*/
63+
protected void onSuccess(Page page, Task task) {
64+
this.onSuccess(page.getRequest(), task);
65+
}
66+
67+
/**
68+
* @param request the {@link Request}.
69+
* @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
70+
*/
5071
@Deprecated
5172
protected void onError(Request request) {
5273
}
5374

5475
/**
76+
* @param request the {@link Request}.
77+
* @param task the {@link Task}.
78+
* @param e the exception.
5579
* @since 0.7.6
80+
* @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
5681
*/
82+
@Deprecated
5783
protected void onError(Request request, Task task, Throwable e) {
5884
this.onError(request);
5985
}
6086

87+
/**
88+
* @param page the {@link Page}.
89+
* @param task the {@link Task}.
90+
* @param e the exception.
91+
* @since 0.10.0
92+
*/
93+
protected void onError(Page page, Task task, Throwable e) {
94+
this.onError(page.getRequest(), task, e);
95+
}
96+
6197
}

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -79,18 +79,18 @@ public Page download(Request request, Task task) {
7979
CloseableHttpClient httpClient = getHttpClient(task.getSite());
8080
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
8181
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
82-
Page page = Page.fail();
82+
Page page = Page.fail(request);
8383
try {
8484
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
8585
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
8686

87-
onSuccess(request, task);
87+
onSuccess(page, task);
8888
logger.info("downloading page success {}", request.getUrl());
8989

9090
return page;
9191
} catch (IOException e) {
9292

93-
onError(request, task, e);
93+
onError(page, task, e);
9494
logger.info("download page {} error", request.getUrl(), e);
9595

9696
return page;

webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java

+11-8
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import java.util.ArrayList;
55
import java.util.List;
6+
67
import org.apache.commons.collections4.CollectionUtils;
78

89
/**
@@ -55,11 +56,12 @@ public Selectable jsonPath(String jsonPath) {
5556

5657
@Override
5758
public String get() {
58-
if (CollectionUtils.isNotEmpty(all())) {
59-
return all().get(0);
60-
} else {
61-
return null;
62-
}
59+
List<String> sourceTexts = all();
60+
if (CollectionUtils.isNotEmpty(sourceTexts)) {
61+
return sourceTexts.get(0);
62+
}
63+
return null;
64+
6365
}
6466

6567
@Override
@@ -91,8 +93,9 @@ public Selectable replace(String regex, String replacement) {
9193
}
9294

9395
public String getFirstSourceText() {
94-
if (getSourceTexts() != null && getSourceTexts().size() > 0) {
95-
return getSourceTexts().get(0);
96+
List<String> sourceTexts = getSourceTexts();
97+
if (CollectionUtils.isNotEmpty(sourceTexts)) {
98+
return sourceTexts.get(0);
9699
}
97100
return null;
98101
}
@@ -104,6 +107,6 @@ public String toString() {
104107

105108
@Override
106109
public boolean match() {
107-
return getSourceTexts() != null && getSourceTexts().size() > 0;
110+
return CollectionUtils.isNotEmpty(getSourceTexts());
108111
}
109112
}

webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java

+1-7
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,6 @@
66
public abstract class NumberUtils {
77

88
public static int compareLong(long o1, long o2) {
9-
if (o1 < o2) {
10-
return -1;
11-
} else if (o1 == o2) {
12-
return 0;
13-
} else {
14-
return 1;
15-
}
9+
return Long.compare(o1, o2);
1610
}
1711
}

webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@ public static <T> Set<T> newHashSet(T... t){
2121
}
2222

2323
public static <T> List<T> newArrayList(T... t){
24-
List<T> set = new ArrayList<T>(t.length);
24+
List<T> list = new ArrayList<T>(t.length);
2525
for (T t1 : t) {
26-
set.add(t1);
26+
list.add(t1);
2727
}
28-
return set;
28+
return list;
2929
}
3030
}

webmagic-coverage/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<parent>
99
<groupId>us.codecraft</groupId>
1010
<artifactId>webmagic-parent</artifactId>
11-
<version>0.9.1</version>
11+
<version>0.10.0</version>
1212
</parent>
1313

1414
<artifactId>webmagic-coverage</artifactId>

webmagic-extension/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<groupId>us.codecraft</groupId>
55
<artifactId>webmagic-parent</artifactId>
6-
<version>0.9.1</version>
6+
<version>0.10.0</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ public Page download(Request request, Task task) {
8888
logger.info("downloading page: " + request.getUrl());
8989
}
9090

91-
Page page = Page.fail();
91+
Page page = Page.fail(request);
9292
try {
9393
String content = getPage(request);
9494
if (!content.contains("HTTP request failed")) {
@@ -98,9 +98,9 @@ public Page download(Request request, Task task) {
9898
page.setRequest(request);
9999
page.setStatusCode(200);
100100
}
101-
onSuccess(request, task);
101+
onSuccess(page, task);
102102
} catch (Exception e) {
103-
onError(request, task, e);
103+
onError(page, task, e);
104104
logger.warn("download page {} error", request.getUrl(), e);
105105
}
106106
return page;

webmagic-samples/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>webmagic-parent</artifactId>
55
<groupId>us.codecraft</groupId>
6-
<version>0.9.1</version>
6+
<version>0.10.0</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

webmagic-saxon/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>webmagic-parent</artifactId>
55
<groupId>us.codecraft</groupId>
6-
<version>0.9.1</version>
6+
<version>0.10.0</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

webmagic-scripts/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>webmagic-parent</artifactId>
55
<groupId>us.codecraft</groupId>
6-
<version>0.9.1</version>
6+
<version>0.10.0</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

webmagic-selenium/pom.xml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>webmagic-parent</artifactId>
55
<groupId>us.codecraft</groupId>
6-
<version>0.9.1</version>
6+
<version>0.10.0</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99

webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ public SeleniumDownloader setSleepTime(int sleepTime) {
7474
public Page download(Request request, Task task) {
7575
checkInit();
7676
WebDriver webDriver = null;
77-
Page page = Page.fail();
77+
Page page = Page.fail(request);
7878
try {
7979
webDriver = webDriverPool.get();
8080

@@ -111,10 +111,10 @@ public Page download(Request request, Task task) {
111111
page.setHtml(new Html(content, request.getUrl()));
112112
page.setUrl(new PlainText(request.getUrl()));
113113
page.setRequest(request);
114-
onSuccess(request, task);
114+
onSuccess(page, task);
115115
} catch (Exception e) {
116116
logger.warn("download page {} error", request.getUrl(), e);
117-
onError(request, task, e);
117+
onError(page, task, e);
118118
} finally {
119119
if (webDriver != null) {
120120
webDriverPool.returnToPool(webDriver);

0 commit comments

Comments
 (0)