From c3514c5b3a9c0172b516ed0cd60191c3227f309a Mon Sep 17 00:00:00 2001 From: Shinsuke Sugaya Date: Thu, 19 Dec 2024 17:05:54 +0900 Subject: [PATCH] fix #2860 Refactor URL handling logic. --- .../fess/crawler/FessCrawlerThread.java | 17 ++++++++--------- .../crawler/service/FessUrlQueueService.java | 13 +++++++++---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java b/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java index 399717c5e..769f0c922 100644 --- a/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java +++ b/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java @@ -95,7 +95,8 @@ protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper(); if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) { // head method - responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build()); + responseData = + client.execute(RequestDataBuilder.newRequestData().head().url(url).weight(urlQueue.getWeight()).build()); if (responseData == null) { return true; } @@ -202,14 +203,12 @@ protected void storeChildUrlsToQueue(final UrlQueue urlQueue, final Set getAnchorSet(final Object obj) { List anchorList; - if (obj instanceof String) { - anchorList = new ArrayList<>(); - anchorList.add(obj.toString()); - } else if (obj instanceof List) { - anchorList = (List) obj; + if (obj instanceof final String s) { + anchorList = List.of(s); + } else if (obj instanceof final List l) { + anchorList = l.stream().map(String::valueOf).toList(); } else { return null; } @@ -263,11 +262,11 @@ protected void processResponse(final UrlQueue urlQueue, final ResponseData re } @Override - protected void storeChildUrl(final String childUrl, final String parentUrl, final String metaData, final int depth) { + protected void storeChildUrl(final String childUrl, final String parentUrl, final float weight, final int depth) { if (StringUtil.isNotBlank(childUrl)) { final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper(); final String url = duplicateHostHelper.convert(childUrl); - super.storeChildUrl(url, parentUrl, metaData, depth); + super.storeChildUrl(url, parentUrl, weight, depth); } } diff --git a/src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java b/src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java index 5553cbfb7..913f11fed 100644 --- a/src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java +++ b/src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java @@ -34,8 +34,13 @@ import org.opensearch.search.sort.SortOrder; public class FessUrlQueueService extends OpenSearchUrlQueueService { + private static final Logger logger = LogManager.getLogger(FessUrlQueueService.class); + protected static final String ORDER_SEQUENTIAL = "sequential"; + + protected static final String ORDER_RANDOM = "random"; + public FessUrlQueueService(final OpenSearchCrawlerConfig crawlerConfig) { super(crawlerConfig); } @@ -45,14 +50,14 @@ protected List fetchUrlQueueList(final String sessionId) { final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper(); final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(sessionId); final Map configParams = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG); - final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, "sequential"); - if ("random".equals(crawlOrder)) { + final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, ORDER_SEQUENTIAL); + if (ORDER_RANDOM.equals(crawlOrder)) { return getList(OpenSearchUrlQueue.class, sessionId, QueryBuilders.functionScoreQuery(QueryBuilders.matchAllQuery(), new FunctionScoreQueryBuilder.FilterFunctionBuilder[] { new FunctionScoreQueryBuilder.FilterFunctionBuilder( new RandomScoreFunctionBuilder().seed(sessionId.hashCode())) }), - 0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.ASC)); - } else if (!"sequential".equals(crawlOrder)) { + 0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.DESC)); + } else if (!ORDER_SEQUENTIAL.equals(crawlOrder)) { logger.warn("Invalid crawl order specified: {}. Falling back to sequential.", crawlOrder); } return super.fetchUrlQueueList(sessionId);