From 036ebd6c3c9470c00c0c3836262c70f047e6543d Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Sun, 3 Nov 2024 14:09:36 +0900 Subject: [PATCH] fix #2856 Add crawl order configuration to control URL processing order --- .../crawler/service/FessUrlQueueService.java | 60 +++++++++++++++++++ .../es/config/exentity/CrawlingConfig.java | 1 + .../resources/crawler_es+urlQueueService.xml | 9 +++ 3 files changed, 70 insertions(+) create mode 100644 src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java create mode 100644 src/main/resources/crawler_es+urlQueueService.xml diff --git a/src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java b/src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java new file mode 100644 index 000000000..98e39fcb3 --- /dev/null +++ b/src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java @@ -0,0 +1,60 @@ +/* + * Copyright 2012-2024 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +package org.codelibs.fess.crawler.service; + +import java.util.List; +import java.util.Map; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.codelibs.fess.crawler.entity.EsUrlQueue; +import org.codelibs.fess.crawler.service.impl.EsUrlQueueService; +import org.codelibs.fess.crawler.util.EsCrawlerConfig; +import org.codelibs.fess.es.config.exentity.CrawlingConfig; +import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName; +import org.codelibs.fess.helper.CrawlingConfigHelper; +import org.codelibs.fess.util.ComponentUtil; +import org.opensearch.index.query.QueryBuilders; +import org.opensearch.index.query.functionscore.FunctionScoreQueryBuilder; +import org.opensearch.index.query.functionscore.RandomScoreFunctionBuilder; +import org.opensearch.search.sort.SortBuilders; +import org.opensearch.search.sort.SortOrder; + +/** URL queue service that chooses how queued URLs are ordered when fetched, based on the crawl.order parameter (CrawlingConfig.Param.Config.CRAWL_ORDER) of the crawling config looked up by session id. */ public class FessUrlQueueService extends EsUrlQueueService { + private static final Logger logger = LogManager.getLogger(FessUrlQueueService.class); + + /** Creates the service, passing crawlerConfig through to the EsUrlQueueService constructor. @param crawlerConfig crawler configuration handed to the superclass */ public FessUrlQueueService(final EsCrawlerConfig crawlerConfig) { + super(crawlerConfig); + } + + /** Fetches queued URLs for the given session in the configured crawl order. Reads CRAWL_ORDER from the CONFIG parameter map of the crawling config, defaulting to {@code sequential}. For {@code random} it calls getList with a function score query over match-all using a random score seeded with sessionId.hashCode(), sorted by score ascending. For any other value it calls getList with a null query sorted by CREATE_TIME ascending, logging a warning first when the value is not {@code sequential}. Both calls pass 0 and pollingFetchSize to getList (presumably offset and page size; confirm against EsUrlQueueService). @param sessionId crawler session id used to look up the crawling config and passed to getList @return the list returned by getList */ @Override + protected List fetchUrlQueueList(final String sessionId) { + final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper(); + /* NOTE(review): the result of get(sessionId) is dereferenced on the next statement without a null check; confirm the helper always returns a config for an active session. */ final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(sessionId); + final Map configParams = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG); + final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, "sequential"); + if ("random".equals(crawlOrder)) { + /* The seed is derived from the session id, so the same session id always yields the same seed value. NOTE(review): the seed is set without a field; confirm against the random score documentation whether a field should accompany it. */ return getList(EsUrlQueue.class, sessionId, + QueryBuilders.functionScoreQuery(QueryBuilders.matchAllQuery(), + new FunctionScoreQueryBuilder.FilterFunctionBuilder[] { new FunctionScoreQueryBuilder.FilterFunctionBuilder( + new RandomScoreFunctionBuilder().seed(sessionId.hashCode())) }), + 0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.ASC)); + } else if 
(!"sequential".equals(crawlOrder)) { + /* Unrecognized value: warn only, then fall through to the sequential fetch below. */ logger.warn("Invalid crawl order specified: {}. Falling back to sequential.", crawlOrder); + } + /* Sequential order: no query is supplied and results are sorted by CREATE_TIME ascending. */ return getList(EsUrlQueue.class, sessionId, null, 0, pollingFetchSize, SortBuilders.fieldSort(CREATE_TIME).order(SortOrder.ASC)); + } +} diff --git a/src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java b/src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java index 945d2c90a..6da8d3e22 100644 --- a/src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java +++ b/src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java @@ -133,6 +133,7 @@ public static class Config { public static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags"; public static final String SCRIPT_TYPE = "script.type"; public static final String HTML_CHILD_URL_RULES = "html.child.url.rules"; + public static final String CRAWL_ORDER = "crawl.order"; } // meta.* diff --git a/src/main/resources/crawler_es+urlQueueService.xml b/src/main/resources/crawler_es+urlQueueService.xml new file mode 100644 index 000000000..54f8cc92a --- /dev/null +++ b/src/main/resources/crawler_es+urlQueueService.xml @@ -0,0 +1,9 @@ + + + + + crawlerConfig + +