From ab6a92ad71553979e8489e1e70e450acabbcf437 Mon Sep 17 00:00:00 2001 From: Pascal Essiembre Date: Fri, 17 Jul 2015 15:50:47 -0400 Subject: [PATCH] The OrphansStrategy default in crawler config is now PROCESS. --- .../src/changes/changes.xml | 6 +++++ .../core/crawler/AbstractCrawlerConfig.java | 4 ++-- .../core/crawler/ICrawlerConfig.java | 24 +++++++++++++++---- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/norconex-collector-core/src/changes/changes.xml b/norconex-collector-core/src/changes/changes.xml index 7ed93c6..9bdf373 100644 --- a/norconex-collector-core/src/changes/changes.xml +++ b/norconex-collector-core/src/changes/changes.xml @@ -35,6 +35,12 @@ given once chance to recover before a deletion request gets sent. This can be overwritten. + + The OrphansStrategy default in crawler config is now PROCESS + to get around cases where temporary conditions prevent accessing + some documents that normally should be accessible (and should not avoid re-processing + on incremental crawls). + MD5DocumentChecksummer#setField(String) has been deprecated in favor of MD5DocumentChecksummer#setFields(String...). diff --git a/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/AbstractCrawlerConfig.java b/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/AbstractCrawlerConfig.java index 28abc5f..2b27d82 100644 --- a/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/AbstractCrawlerConfig.java +++ b/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/AbstractCrawlerConfig.java @@ -1,4 +1,4 @@ -/* Copyright 2014 Norconex Inc. +/* Copyright 2014-2015 Norconex Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -67,7 +67,7 @@ public abstract class AbstractCrawlerConfig implements ICrawlerConfig { private int numThreads = 2; private File workDir = new File("./work"); private int maxDocuments = -1; - private OrphansStrategy orphansStrategy = OrphansStrategy.IGNORE; + private OrphansStrategy orphansStrategy = OrphansStrategy.PROCESS; private ICrawlDataStoreFactory crawlDataStoreFactory = new MapDBCrawlDataStoreFactory(); diff --git a/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/ICrawlerConfig.java b/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/ICrawlerConfig.java index 65c00cc..7be8455 100644 --- a/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/ICrawlerConfig.java +++ b/norconex-collector-core/src/main/java/com/norconex/collector/core/crawler/ICrawlerConfig.java @@ -1,4 +1,4 @@ -/* Copyright 2014 Norconex Inc. +/* Copyright 2014-2015 Norconex Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,14 +80,28 @@ enum OrphansStrategy { int getMaxDocuments(); /** - * Gets the strategy to adopt when there are orphans. Orphans are + *

Gets the strategy to adopt when there are orphans. Orphans are * references that were processed in a previous run, but were not in the * current run. In other words, they are leftovers from a previous run * that were not re-encountered in the current. - *

+ *

* Unless explicitly stated otherwise by an implementing class, the default - * strategy is to DELETE orphans. Setting a null value is - * the same as setting IGNORE. + * strategy is to PROCESS orphans. + * Setting a null value is the same as setting + * IGNORE. + *

+ * Since 1.2.0, unless otherwise stated in implementing classes, + * the default orphan strategy is now PROCESS. + *

+ * Be careful: Setting the orphan strategy to DELETE + * is NOT recommended in most cases. With some collectors, a temporary + * failure, such as a network outage or a web page timing out, may cause + * some documents not to be crawled. When this happens, unreachable + * documents would be considered "orphans" and be deleted, whereas under + * normal circumstances they should be kept. Re-processing them + * (the default) is usually the safest approach to confirm they still + * exist before deleting or updating them. + *

* @return orphans strategy */ OrphansStrategy getOrphansStrategy();