Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added extra wait time after frontier runs out of links #182

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
- Upgrade `commons-validator` library to version 1.6
- Upgrade `okhttp3` library to version 3.14.0
- Fix issue #177: Links from recent TLDs are considered invalid
- Add wait time after the frontier runs out of links, to avoid race conditions (issue #147)

## Version 0.11.0

Expand Down
64 changes: 49 additions & 15 deletions src/main/java/focusedCrawler/crawler/async/AsyncCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@

public class AsyncCrawler extends AbstractExecutionThreadService {

private static final int RUN_OUT_OF_LINKS_DEFAULT = -1;
private static final int MAX_RUN_OUT_OF_LINKS_TIME_MS = 5000;
private static final int RUN_OUT_OF_LINKS_WAIT_TIME = 1000;

private static final Logger logger = LoggerFactory.getLogger(AsyncCrawler.class);

private final TargetStorage targetStorage;
Expand All @@ -29,6 +33,7 @@ public class AsyncCrawler extends AbstractExecutionThreadService {
private final Map<LinkRelevance.Type, HttpDownloader.Callback> handlers = new HashMap<>();
private MetricsManager metricsManager;
private Configuration config;
private long runOutOfLinksTime = RUN_OUT_OF_LINKS_DEFAULT;

public AsyncCrawler(String crawlerId, TargetStorage targetStorage, LinkStorage linkStorage,
Configuration config, String dataPath, MetricsManager metricsManager) {
Expand Down Expand Up @@ -60,6 +65,7 @@ protected void run() {
try {
LinkRelevance link = (LinkRelevance) linkStorage.select(null);
if (link != null) {
this.runOutOfLinksTime = RUN_OUT_OF_LINKS_DEFAULT;
Callback handler = handlers.get(link.getType());
if (handler == null) {
logger.error("No registered handler for link type: " + link.getType());
Expand All @@ -68,29 +74,57 @@ protected void run() {
downloader.dipatchDownload(link, handler);
}
} catch (DataNotFoundException e) {
// There are no more links available in the frontier right now
if (downloader.hasPendingDownloads() || !e.ranOutOfLinks()) {
// If there are still pending downloads, new links
// may be found in these pages, so we should wait some
// time until more links are available and try again
try {
logger.info("Waiting for links from pages being downloaded...");
Thread.sleep(1000);
} catch (InterruptedException ie) {
}
// There are no more links available in the frontier right now. We need to check
// whether it is a temporary state to decide if the crawler should stop running.

boolean hasPendingLinks = downloader.hasPendingDownloads() || !e.ranOutOfLinks();
if (hasPendingLinks) {
// If there are still pending downloads, new links may be found in these pages,
// so we should wait some time until more links are available and try again.
waitMilliseconds(RUN_OUT_OF_LINKS_WAIT_TIME);
continue;
}
// There are no more pending downloads and there are no
// more links available in the frontier, so stop crawler
logger.info("LinkStorage ran out of links, stopping crawler.");
stopAsync();
break;

// Even when the frontier runs out of links and there are no pending downloads,
// there may be still some pages being processed, in which case the crawler may
// find some new links. Therefore, we still keep trying to select from the frontier
// for a fixed amount of time (MAX_RUN_OUT_OF_LINKS_TIME_MS) to avoid race conditions.
if (!hasPendingLinks && this.runOutOfLinksTime == RUN_OUT_OF_LINKS_DEFAULT) {
this.runOutOfLinksTime = System.currentTimeMillis();
}

// The crawler should stop only after having run out of links for a few seconds.
// This grace period gives pages that are still being processed a chance to add new links.
long timeSinceRunOutOfLinks = System.currentTimeMillis() - this.runOutOfLinksTime;
if (this.runOutOfLinksTime != RUN_OUT_OF_LINKS_DEFAULT &&
timeSinceRunOutOfLinks > MAX_RUN_OUT_OF_LINKS_TIME_MS) {
// There are no more pending downloads, no more links available in the frontier,
// and we already waited some time for new links. Now we can stop the crawler.
logger.info("LinkStorage ran out of links for {} ms, stopping crawler.",
timeSinceRunOutOfLinks);
stopAsync();
break;
} else {
logger.info("LinkStorage ran out of links for {} ms...",
timeSinceRunOutOfLinks);
}

logger.info("Waiting for links from pages being processed...");
waitMilliseconds(RUN_OUT_OF_LINKS_WAIT_TIME);

} catch (Exception e) {
logger.error("An unexpected error happened.", e);
}
}
}

/**
 * Sleeps the current thread for the given number of milliseconds.
 * If the sleep is interrupted, the method returns early and restores
 * the thread's interrupt status so callers (and the service framework)
 * can still observe the interruption.
 *
 * @param ms how long to wait, in milliseconds
 */
private void waitMilliseconds(long ms) {
    try {
        Thread.sleep(ms);
    } catch (InterruptedException e) {
        // Never swallow an interrupt: re-set the flag so the crawler's
        // run loop / shutdown machinery can react to it.
        Thread.currentThread().interrupt();
    }
}

@Override
public void shutDown() {
logger.info("Starting crawler shutdown...");
Expand Down