MD5DocumentChecksummer#setField(String) has been deprecated in favor
of MD5DocumentChecksummer#setFields(String...).
diff --git a/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/AbstractCrawlerConfig.java b/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/AbstractCrawlerConfig.java
index 28abc5f..2b27d82 100644
--- a/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/AbstractCrawlerConfig.java
+++ b/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/AbstractCrawlerConfig.java
@@ -1,4 +1,4 @@
-/* Copyright 2014 Norconex Inc.
+/* Copyright 2014-2015 Norconex Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -67,7 +67,7 @@ public abstract class AbstractCrawlerConfig implements ICrawlerConfig {
private int numThreads = 2;
private File workDir = new File("./work");
private int maxDocuments = -1;
- private OrphansStrategy orphansStrategy = OrphansStrategy.IGNORE;
+ private OrphansStrategy orphansStrategy = OrphansStrategy.PROCESS;
private ICrawlDataStoreFactory crawlDataStoreFactory =
new MapDBCrawlDataStoreFactory();
diff --git a/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/ICrawlerConfig.java b/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/ICrawlerConfig.java
index 65c00cc..7be8455 100644
--- a/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/ICrawlerConfig.java
+++ b/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/ICrawlerConfig.java
@@ -1,4 +1,4 @@
-/* Copyright 2014 Norconex Inc.
+/* Copyright 2014-2015 Norconex Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -80,14 +80,28 @@ enum OrphansStrategy {
int getMaxDocuments();
/**
- * Gets the strategy to adopt when there are orphans. Orphans are
+ * Gets the strategy to adopt when there are orphans. Orphans are
* references that were processed in a previous run, but were not in the
* current run. In other words, they are leftovers from a previous run
* that were not re-encountered in the current.
- *
+ *
* Unless explicitly stated otherwise by an implementing class, the default
- * strategy is to DELETE orphans. Setting a <code>null</code> value is
- * the same as setting IGNORE.
+ * strategy is to <code>PROCESS</code> orphans.
+ * Setting a <code>null</code> value is the same as setting
+ * <code>IGNORE</code>.
+ *
+ * Since 1.2.0, unless otherwise stated in implementing classes,
+ * the default orphan strategy is now <code>PROCESS</code>.
+ *
+ * Be careful: Setting the orphan strategy to DELETE
+ * is NOT recommended in most cases. With some collectors, a temporary
+ * failure, such as a network outage or a web page timing out, may cause
+ * some documents not to be crawled. When this happens, unreachable
+ * documents would be considered "orphans" and be deleted while, under
+ * normal circumstances, they should be kept. Re-processing them
+ * (default) is usually the safest approach to confirm they still
+ * exist before deleting or updating them.
+ *
* @return orphans strategy
*/
OrphansStrategy getOrphansStrategy();