Skip to content

Commit

Permalink
fix #2860 Refactor URL handling logic.
Browse files: browse the repository at this point in the history
  • Loading branch information
marevol committed Dec 19, 2024
1 parent 69f21a8 commit c3514c5
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 13 deletions.
17 changes: 8 additions & 9 deletions src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?>
final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
// head method
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
responseData =
client.execute(RequestDataBuilder.newRequestData().head().url(url).weight(urlQueue.getWeight()).build());
if (responseData == null) {
return true;
}
Expand Down Expand Up @@ -202,14 +203,12 @@ protected void storeChildUrlsToQueue(final UrlQueue<?> urlQueue, final Set<Reque
}
}

@SuppressWarnings("unchecked")
protected Set<RequestData> getAnchorSet(final Object obj) {
List<String> anchorList;
if (obj instanceof String) {
anchorList = new ArrayList<>();
anchorList.add(obj.toString());
} else if (obj instanceof List<?>) {
anchorList = (List<String>) obj;
if (obj instanceof final String s) {
anchorList = List.of(s);
} else if (obj instanceof final List<?> l) {
anchorList = l.stream().map(String::valueOf).toList();
} else {
return null;
}
Expand Down Expand Up @@ -263,11 +262,11 @@ protected void processResponse(final UrlQueue<?> urlQueue, final ResponseData re
}

/**
 * Stores a child URL for later crawling after passing it through
 * {@code DuplicateHostHelper.convert}. When {@code childUrl} is blank
 * (per {@code StringUtil.isNotBlank}) nothing is stored.
 *
 * NOTE(review): this rendered diff lists two variants of the signature and of
 * the {@code super.storeChildUrl} call, one taking {@code String metaData} and
 * one taking {@code float weight} as the third argument; presumably the
 * {@code weight} variant is the post-change code. Confirm against the actual file.
 */
@Override
protected void storeChildUrl(final String childUrl, final String parentUrl, final String metaData, final int depth) {
protected void storeChildUrl(final String childUrl, final String parentUrl, final float weight, final int depth) {
if (StringUtil.isNotBlank(childUrl)) {
final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
// Convert the child URL via the duplicate-host helper before delegating to the superclass.
final String url = duplicateHostHelper.convert(childUrl);
super.storeChildUrl(url, parentUrl, metaData, depth);
super.storeChildUrl(url, parentUrl, weight, depth);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@
import org.opensearch.search.sort.SortOrder;

public class FessUrlQueueService extends OpenSearchUrlQueueService {

private static final Logger logger = LogManager.getLogger(FessUrlQueueService.class);

protected static final String ORDER_SEQUENTIAL = "sequential";

protected static final String ORDER_RANDOM = "random";

/**
 * Creates the URL queue service by passing the given crawler configuration
 * to the {@code OpenSearchUrlQueueService} constructor.
 *
 * @param crawlerConfig the crawler configuration handed to the superclass
 */
public FessUrlQueueService(final OpenSearchCrawlerConfig crawlerConfig) {
super(crawlerConfig);
}
Expand All @@ -45,14 +50,14 @@ protected List<OpenSearchUrlQueue> fetchUrlQueueList(final String sessionId) {
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(sessionId);
final Map<String, String> configParams = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, "sequential");
if ("random".equals(crawlOrder)) {
final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, ORDER_SEQUENTIAL);
if (ORDER_RANDOM.equals(crawlOrder)) {
return getList(OpenSearchUrlQueue.class, sessionId,
QueryBuilders.functionScoreQuery(QueryBuilders.matchAllQuery(),
new FunctionScoreQueryBuilder.FilterFunctionBuilder[] { new FunctionScoreQueryBuilder.FilterFunctionBuilder(
new RandomScoreFunctionBuilder().seed(sessionId.hashCode())) }),
0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.ASC));
} else if (!"sequential".equals(crawlOrder)) {
0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.DESC));
} else if (!ORDER_SEQUENTIAL.equals(crawlOrder)) {
logger.warn("Invalid crawl order specified: {}. Falling back to sequential.", crawlOrder);
}
return super.fetchUrlQueueList(sessionId);
Expand Down

0 comments on commit c3514c5

Please sign in to comment.