Skip to content

Commit

Permalink
fix #2795 Updated user-agent version handling in crawler to use dynamic versioning based on Fess version
Browse files Browse the repository at this point in the history
  • Loading branch information
marevol committed Jan 7, 2024
1 parent 1b22604 commit b7041af
Show file tree
Hide file tree
Showing 6 changed files with 228 additions and 4 deletions.
4 changes: 4 additions & 0 deletions src/main/java/org/codelibs/fess/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ public class Constants extends CoreLibConstants {

public static final long DEFAULT_CRAWLING_EXECUTION_INTERVAL = 5000L;

/** Leading part of the default crawler user-agent string; the product version is appended directly after it. */
public static final String CRAWLING_USER_AGENT_PREFIX = "Mozilla/5.0 (compatible; Fess/";

/** Trailing part of the default crawler user-agent string; it follows the product version. */
public static final String CRAWLING_USER_AGENT_SUFFIX = "; +http://fess.codelibs.org/bot.html)";

// fess properties
public static final String USER_INFO_PROPERTY = "user.info";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,13 @@ public CrawlerClientFactory initializeClientFactory(final Supplier<CrawlerClient

final String userAgent = getUserAgent();
if (StringUtil.isNotBlank(userAgent)) {
paramMap.put(Client.USER_AGENT, userAgent);
if (userAgent.startsWith(Constants.CRAWLING_USER_AGENT_PREFIX) && userAgent.endsWith(Constants.CRAWLING_USER_AGENT_SUFFIX)) {
paramMap.put(Client.USER_AGENT, fessConfig.getUserAgentName());
} else {
paramMap.put(Client.USER_AGENT, userAgent);
}
} else {
paramMap.put(Client.USER_AGENT, fessConfig.getUserAgentName());
}

final List<WebAuthentication> webAuthList = webAuthenticationService.getWebAuthenticationList(getId());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -694,8 +694,8 @@ default String getNotificationSearchTop() {
}

/**
 * Returns the user-agent name to use for crawling.
 * <p>
 * The value is looked up via the system property {@link Constants#CRAWLING_USER_AGENT_PROPERTY}.
 * The default handed to that lookup is built from {@link Constants#CRAWLING_USER_AGENT_PREFIX},
 * the product version reported by the system helper, and {@link Constants#CRAWLING_USER_AGENT_SUFFIX}.
 *
 * @return the value produced by the system-property lookup
 */
default String getUserAgentName() {
    // Single return only: a second return statement after this one would be unreachable and fail to compile.
    final String defaultUserAgent = Constants.CRAWLING_USER_AGENT_PREFIX + ComponentUtil.getSystemHelper().getProductVersion()
            + Constants.CRAWLING_USER_AGENT_SUFFIX;
    return getSystemProperty(Constants.CRAWLING_USER_AGENT_PROPERTY, defaultUserAgent);
}

default void setLtrModelName(final String value) {
Expand Down
9 changes: 8 additions & 1 deletion src/main/java/org/codelibs/fess/util/ComponentUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
import org.lastaflute.core.message.MessageManager;
import org.lastaflute.core.security.PrimaryCipher;
import org.lastaflute.di.core.SingletonLaContainer;
import org.lastaflute.di.core.exception.AutoBindingFailureException;
import org.lastaflute.di.core.exception.ComponentNotFoundException;
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
import org.lastaflute.di.core.smart.hot.HotdeployUtil;
Expand Down Expand Up @@ -520,6 +521,7 @@ public static RankFusionProcessor getRankFusionProcessor() {
return getComponent(RANK_FUSION_PROCESSOR);
}

@SuppressWarnings("unchecked")
public static <T> T getComponent(final Class<T> clazz) {
try {
return SingletonLaContainer.getComponent(clazz);
Expand All @@ -528,6 +530,11 @@ public static <T> T getComponent(final Class<T> clazz) {
throw new ContainerNotAvailableException(clazz.getCanonicalName(), e);
}
throw new ContainerNotAvailableException(clazz.getCanonicalName());
} catch (final ComponentNotFoundException | AutoBindingFailureException e) {
if (componentMap.containsKey(clazz.getCanonicalName())) {
return (T) componentMap.get(clazz.getCanonicalName());
}
throw e;
}
}

Expand All @@ -540,7 +547,7 @@ public static <T> T getComponent(final String componentName) {
throw new ContainerNotAvailableException(componentName, e);
}
throw new ContainerNotAvailableException(componentName);
} catch (final ComponentNotFoundException e) {
} catch (final ComponentNotFoundException | AutoBindingFailureException e) {
if (componentMap.containsKey(componentName)) {
return (T) componentMap.get(componentName);
}
Expand Down
179 changes: 179 additions & 0 deletions src/test/java/org/codelibs/fess/es/config/exentity/WebConfigTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/*
* Copyright 2012-2023 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.es.config.exentity;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.app.service.RequestHeaderService;
import org.codelibs.fess.app.service.WebAuthenticationService;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.client.http.Authentication;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.mylasta.direction.FessProp;
import org.codelibs.fess.unit.UnitFessTestCase;
import org.codelibs.fess.util.ComponentUtil;
import org.opensearch.common.SetOnce;

/**
 * Tests for {@link WebConfig#initializeClientFactory}, checking the init parameters
 * that end up being handed to the {@link CrawlerClientFactory}.
 */
public class WebConfigTest extends UnitFessTestCase {

    /** Product version reported by the stubbed {@link SystemHelper}. */
    private static final String PRODUCT_VERSION = "98.76";

    // NOTE(review): presumably asks the base test case for a fresh container per test - confirm in UnitFessTestCase.
    @Override
    protected boolean isUseOneTimeContainer() {
        return true;
    }

    /**
     * A stored user agent in the Fess default format but with another version ("1.0")
     * must be replaced by the default user agent carrying the current product version.
     */
    public void test_initializeClientFactory() {
        FessProp.propMap.clear();
        registerStubComponents(createFessConfig(null));

        final WebConfig webConfig = new WebConfig();
        webConfig.setUserAgent(Constants.CRAWLING_USER_AGENT_PREFIX + "1.0" + Constants.CRAWLING_USER_AGENT_SUFFIX);

        final Map<String, Object> initParamMap = initializeAndCaptureParams(webConfig);
        assertEquals(0, ((org.codelibs.fess.crawler.client.http.RequestHeader[]) initParamMap.get("requestHeaders")).length);
        assertEquals("Mozilla/5.0 (compatible; Fess/98.76; +http://fess.codelibs.org/bot.html)", initParamMap.get("userAgent"));
        assertEquals(0, ((Authentication[]) initParamMap.get("basicAuthentications")).length);
        assertTrue(Boolean.valueOf(initParamMap.get("robotsTxtEnabled").toString()).booleanValue());
    }

    /**
     * A custom user agent must be passed through unchanged, and a {@code client.*}
     * config parameter must be reflected in the init parameters.
     */
    public void test_initializeClientFactoryWithConfigParameter() {
        FessProp.propMap.clear();
        registerStubComponents(createFessConfig(".*password|.*key|.*token|.*secret"));

        final WebConfig webConfig = new WebConfig();
        final String userAgent = "TestAgent";
        webConfig.setUserAgent(userAgent);
        webConfig.setConfigParameter("""
                client.robotsTxtEnabled=false
                """);

        final Map<String, Object> initParamMap = initializeAndCaptureParams(webConfig);
        assertEquals(0, ((org.codelibs.fess.crawler.client.http.RequestHeader[]) initParamMap.get("requestHeaders")).length);
        assertEquals(userAgent, initParamMap.get("userAgent"));
        assertEquals(0, ((Authentication[]) initParamMap.get("basicAuthentications")).length);
        assertFalse(Boolean.valueOf(initParamMap.get("robotsTxtEnabled").toString()).booleanValue());
    }

    /**
     * Builds the stub configuration shared by the tests: system properties resolve to their
     * defaults, robots.txt is not ignored, and no HTTP proxy is configured.
     *
     * @param appEncryptPropertyPattern pattern returned by {@code getAppEncryptPropertyPattern()},
     *        or {@code null} to keep the inherited implementation
     * @return the stub configuration
     */
    private FessConfig createFessConfig(final String appEncryptPropertyPattern) {
        // Never populated by these tests, so every lookup falls back to the supplied default.
        final Map<String, String> systemPropMap = new HashMap<>();
        return new FessConfig.SimpleImpl() {
            @Override
            public String getSystemProperty(final String key, final String defaultValue) {
                return systemPropMap.getOrDefault(key, defaultValue);
            }

            @Override
            public boolean isCrawlerIgnoreRobotsTxt() {
                return false;
            }

            @Override
            public String getHttpProxyHost() {
                return StringUtil.EMPTY;
            }

            @Override
            public String getHttpProxyPort() {
                return StringUtil.EMPTY;
            }

            @Override
            public String getAppEncryptPropertyPattern() {
                if (appEncryptPropertyPattern == null) {
                    return super.getAppEncryptPropertyPattern();
                }
                return appEncryptPropertyPattern;
            }
        };
    }

    /**
     * Installs the given configuration and registers the stub components used by the tests:
     * a {@link SystemHelper} with a fixed product version, and services that return
     * no web authentications and no request headers.
     *
     * @param fessConfig the configuration to install
     */
    private void registerStubComponents(final FessConfig fessConfig) {
        ComponentUtil.setFessConfig(fessConfig);

        final SystemHelper systemHelper = new SystemHelper() {
            @Override
            public String getProductVersion() {
                return PRODUCT_VERSION;
            }
        };
        ComponentUtil.register(systemHelper, "systemHelper");

        final WebAuthenticationService webAuthenticationService = new WebAuthenticationService() {
            @Override
            public List<WebAuthentication> getWebAuthenticationList(final String webConfigId) {
                return Collections.emptyList();
            }
        };
        ComponentUtil.register(webAuthenticationService, WebAuthenticationService.class.getCanonicalName());

        final RequestHeaderService requestHeaderService = new RequestHeaderService() {
            @Override
            public List<RequestHeader> getRequestHeaderList(final String webConfigId) {
                return Collections.emptyList();
            }
        };
        ComponentUtil.register(requestHeaderService, RequestHeaderService.class.getCanonicalName());
    }

    /**
     * Runs {@code initializeClientFactory} on the given config with a factory that records
     * the init parameter map it receives, and asserts that both the factory and the map exist.
     *
     * @param webConfig the web config under test
     * @return the init parameter map passed to the factory
     */
    private Map<String, Object> initializeAndCaptureParams(final WebConfig webConfig) {
        final SetOnce<Map<String, Object>> captured = new SetOnce<>();
        final CrawlerClientFactory crawlerClientFactory = webConfig.initializeClientFactory(() -> new CrawlerClientFactory() {
            // @Override makes the compiler reject a signature drift that would otherwise silently disable the capture.
            @Override
            public void setInitParameterMap(final Map<String, Object> params) {
                captured.set(params);
            }
        });
        assertNotNull(crawlerClientFactory);
        final Map<String, Object> initParamMap = captured.get();
        assertNotNull(initParamMap);
        return initParamMap;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,14 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.codelibs.core.io.FileUtil;
import org.codelibs.core.misc.DynamicProperties;
import org.codelibs.fess.Constants;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.unit.UnitFessTestCase;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.PrunedTag;
import org.codelibs.nekohtml.parsers.DOMParser;
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
Expand Down Expand Up @@ -262,6 +266,30 @@ public String getUserCodePattern() {
assertFalse(fessConfig.isValidUserCode("123456789?"));
}

/**
 * Checks that {@code getUserAgentName()} yields the default user agent built around the
 * product version when no override is present, and the overriding value once the
 * crawling user-agent system property is set.
 */
public void test_getUserAgentName() throws IOException {
    FessProp.propMap.clear();
    final Map<String, String> overrides = new HashMap<>();

    // Configuration whose system properties are served from the local override map.
    final FessConfig config = new FessConfig.SimpleImpl() {
        @Override
        public String getSystemProperty(final String key, final String defaultValue) {
            if (overrides.containsKey(key)) {
                return overrides.get(key);
            }
            return defaultValue;
        }
    };
    ComponentUtil.setFessConfig(config);

    // System helper pinned to a fixed product version.
    final SystemHelper fixedVersionHelper = new SystemHelper() {
        @Override
        public String getProductVersion() {
            return "98.76";
        }
    };
    ComponentUtil.register(fixedVersionHelper, "systemHelper");

    // No override yet: the default carries the stubbed version.
    final String defaultAgent = config.getUserAgentName();
    assertEquals("Mozilla/5.0 (compatible; Fess/98.76; +http://fess.codelibs.org/bot.html)", defaultAgent);

    // With the property set, the override is returned as-is.
    overrides.put(Constants.CRAWLING_USER_AGENT_PROPERTY, "TestAgent");
    final String overriddenAgent = config.getUserAgentName();
    assertEquals("TestAgent", overriddenAgent);
}

private void assertArrays(final String[] expected, final String[] actual) {
Arrays.sort(expected);
Arrays.sort(actual);
Expand Down

0 comments on commit b7041af

Please sign in to comment.