From 48840f078aa251e8c34a6dcd79502b0f93804ad9 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Thu, 12 Sep 2024 16:24:17 -0700 Subject: [PATCH 1/2] NUTCH-3064 Upgrade com.maxmind.geoip2:geoip2 dependency in geoip-index to v4.2.0 --- build.xml | 1 + conf/nutch-default.xml | 24 +- src/plugin/build.xml | 2 +- src/plugin/index-geoip/ivy.xml | 2 +- src/plugin/index-geoip/plugin.xml | 4 +- .../indexer/geoip/GeoIPDocumentCreator.java | 335 +++++++++++------- .../indexer/geoip/GeoIPIndexingFilter.java | 210 +++++------ .../geoip/TestGeoIPIndexingFilter.java | 204 +++++++++++ 8 files changed, 509 insertions(+), 273 deletions(-) create mode 100644 src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/TestGeoIPIndexingFilter.java diff --git a/build.xml b/build.xml index 845bdfce89..245026cefd 100644 --- a/build.xml +++ b/build.xml @@ -1180,6 +1180,7 @@ + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index c00d9776b3..d59894ccae 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2206,15 +2206,25 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this index.geoip.usage - insightsService + + To use the geoip-plugin you must set 'store.ip.address' to true. A string representing the information source to be used for GeoIP information - association. Either enter 'cityDatabase', 'connectionTypeDatabase', - 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the - Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, - GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the classpath and - available at runtime. Alternatively, also the GeoLite2 IP databases (GeoLite2-*.mmdb) - can be used. + lookup. Options are 'anonymous', 'asn', 'city', 'connection', 'domain', + 'insights' or 'isp'. 
+ If you wish to use any one of the Database options, you should make the relevant + *.mmdb file(s) available on the Hadoop classpath and available at runtime. + This can be achieved by adding it to `$NUTCH_HOME/conf`. + + + + + index.geoip.db.file + + + The GeoIP2 or GeoLite2 database file in the MMDB format. More information available at + https://support.maxmind.com/hc/en-us/articles/4408216157723-Database-Formats + This file must be available on the Hadoop classpath. diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 498259a950..b7a5a77216 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -120,7 +120,7 @@ - + diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml index a6ddc2949a..ec2b61aba0 100644 --- a/src/plugin/index-geoip/ivy.xml +++ b/src/plugin/index-geoip/ivy.xml @@ -37,7 +37,7 @@ - + diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml index dda1b6a7be..aed01cd94c 100644 --- a/src/plugin/index-geoip/plugin.xml +++ b/src/plugin/index-geoip/plugin.xml @@ -26,8 +26,8 @@ - - + + diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java index 64b3862be8..2a62628236 100644 --- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java +++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java @@ -20,6 +20,7 @@ import java.lang.invoke.MethodHandles; import java.net.InetAddress; import java.net.UnknownHostException; +import java.util.Optional; import org.apache.nutch.indexer.NutchDocument; import org.slf4j.Logger; @@ -27,12 +28,14 @@ import com.maxmind.geoip2.DatabaseReader; import com.maxmind.geoip2.WebServiceClient; -import com.maxmind.geoip2.exception.AddressNotFoundException; import com.maxmind.geoip2.exception.GeoIp2Exception; -import com.maxmind.geoip2.model.InsightsResponse; 
+import com.maxmind.geoip2.model.AbstractCityResponse; +import com.maxmind.geoip2.model.AbstractCountryResponse; +import com.maxmind.geoip2.model.AbstractResponse; +import com.maxmind.geoip2.model.AnonymousIpResponse; +import com.maxmind.geoip2.model.AsnResponse; import com.maxmind.geoip2.model.CityResponse; import com.maxmind.geoip2.model.ConnectionTypeResponse; -import com.maxmind.geoip2.model.CountryResponse; import com.maxmind.geoip2.model.DomainResponse; import com.maxmind.geoip2.model.IspResponse; import com.maxmind.geoip2.record.City; @@ -46,109 +49,250 @@ /** *

- * Simple utility class which enables efficient, structured - * {@link org.apache.nutch.indexer.NutchDocument} building based on input from + * Simple utility class which builds a + * {@link org.apache.nutch.indexer.NutchDocument} based on input from * {@link GeoIPIndexingFilter}, where configuration is also read. *

- *

- * Based on the nature of the input, this class wraps factory type - * implementations for populating {@link org.apache.nutch.indexer.NutchDocument} - * 's with the correct {@link org.apache.nutch.indexer.NutchField} information. - * */ public class GeoIPDocumentCreator { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); + private static final String NETWORK_ADDRESS = "networkAddress"; + + private GeoIPDocumentCreator() {} + /** * Add field to document but only if value isn't null * @param doc the {@link NutchDocument} to augment * @param name the name of the target field * @param value the String value to associate with the target field */ - public static void addIfNotNull(NutchDocument doc, String name, + private static void addIfNotNull(NutchDocument doc, String name, Object value) { if (value != null) { doc.add(name, value); } } - public static NutchDocument createDocFromInsightsService(String serverIp, - NutchDocument doc, WebServiceClient client) throws UnknownHostException, - IOException, GeoIp2Exception { + /** + * + * @param serverIp + * @param doc + * @param reader + * @return + * @throws IOException + * @throws GeoIp2Exception + */ + public static NutchDocument createDocFromAnonymousIpDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws IOException, GeoIp2Exception { + Optional opt = reader.tryAnonymousIp(InetAddress.getByName(serverIp)); + if (opt.isPresent()) { + AnonymousIpResponse response = opt.get(); + addIfNotNull(doc, "ip", response.getIpAddress()); + addIfNotNull(doc, NETWORK_ADDRESS, response.getNetwork().toString()); + addIfNotNull(doc, "isAnonymous", response.isAnonymous()); + addIfNotNull(doc, "isAnonymousVpn", response.isAnonymousVpn()); + addIfNotNull(doc, "isHostingProxy", response.isHostingProvider()); + addIfNotNull(doc, "isPublicProxy", response.isPublicProxy()); + addIfNotNull(doc, "isResidentialProxy", response.isResidentialProxy()); + addIfNotNull(doc, 
"isTorExitNode", response.isTorExitNode()); + } else { + LOG.debug("'{}' IP address not found in Anonymous IP DB.", serverIp); + } + return doc; + } + + /** + * + * @param serverIp + * @param doc + * @param reader + * @return + * @throws IOException + * @throws GeoIp2Exception + */ + public static NutchDocument createDocFromAsnDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws IOException, GeoIp2Exception { + Optional opt = reader.tryAsn(InetAddress.getByName(serverIp)); + if (opt.isPresent()) { + AsnResponse response = opt.get(); + addIfNotNull(doc, "ip", response.getIpAddress()); + addIfNotNull(doc, NETWORK_ADDRESS, response.getNetwork().toString()); + addIfNotNull(doc, "autonomousSystemNumber", response.getAutonomousSystemNumber()); + addIfNotNull(doc, "autonomousSystemOrganization", response.getAutonomousSystemOrganization()); + } else { + LOG.debug("'{}' IP address not found in ASN DB.", serverIp); + } + return doc; + } + + /** + * + * @param serverIp + * @param doc + * @param reader + * @return + * @throws IOException + * @throws GeoIp2Exception + */ + public static NutchDocument createDocFromCityDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws IOException, GeoIp2Exception { addIfNotNull(doc, "ip", serverIp); - InsightsResponse response = client - .insights(InetAddress.getByName(serverIp)); + Optional opt = reader.tryCity(InetAddress.getByName(serverIp)); + if (opt.isPresent()) { + processDocument(doc, opt.get()); + } else { + LOG.debug("'{}' IP address not found in City DB.", serverIp); + } + return doc; + } - City city = response.getCity(); - addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis' - addIfNotNull(doc, "cityConfidence", city.getConfidence()); // 50 + private static NutchDocument processDocument(NutchDocument doc, AbstractResponse response) { + City city = ((AbstractCityResponse) response).getCity(); + addIfNotNull(doc, "cityName", city.getName()); + addIfNotNull(doc, "cityConfidence", 
city.getConfidence()); addIfNotNull(doc, "cityGeoNameId", city.getGeoNameId()); - Continent continent = response.getContinent(); + Continent continent = ((AbstractCountryResponse) response).getContinent(); addIfNotNull(doc, "continentCode", continent.getCode()); addIfNotNull(doc, "continentGeoNameId", continent.getGeoNameId()); addIfNotNull(doc, "continentName", continent.getName()); - Country country = response.getCountry(); - addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US' - addIfNotNull(doc, "countryName", country.getName()); // 'United States' - addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99 + Country country = ((AbstractCountryResponse) response).getRegisteredCountry(); + addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); + addIfNotNull(doc, "countryName", country.getName()); + addIfNotNull(doc, "countryConfidence", country.getConfidence()); addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId()); + addIfNotNull(doc, "countryInEuropeanUnion", country.isInEuropeanUnion()); - Location location = response.getLocation(); - addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, - // -93.2323 - addIfNotNull(doc, "accRadius", location.getAccuracyRadius()); // 3 - addIfNotNull(doc, "timeZone", location.getTimeZone()); // 'America/Chicago' + Location location = ((AbstractCityResponse) response).getLocation(); + addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); + addIfNotNull(doc, "accuracyRadius", location.getAccuracyRadius()); + addIfNotNull(doc, "timeZone", location.getTimeZone()); addIfNotNull(doc, "metroCode", location.getMetroCode()); + addIfNotNull(doc, "populationDensity", location.getPopulationDensity()); + addIfNotNull(doc, "timezone", location.getTimeZone()); - Postal postal = response.getPostal(); - addIfNotNull(doc, "postalCode", postal.getCode()); // '55455' - addIfNotNull(doc, "postalConfidence", postal.getConfidence()); // 
40 + Postal postal = ((AbstractCityResponse) response).getPostal(); + addIfNotNull(doc, "postalCode", postal.getCode()); + addIfNotNull(doc, "postalConfidence", postal.getConfidence()); - RepresentedCountry rCountry = response.getRepresentedCountry(); + RepresentedCountry rCountry = ((AbstractCountryResponse) response).getRepresentedCountry(); addIfNotNull(doc, "countryType", rCountry.getType()); - Subdivision subdivision = response.getMostSpecificSubdivision(); - addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota' - addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN' - addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90 - addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId()); + Subdivision mostSubdivision = ((AbstractCityResponse) response).getMostSpecificSubdivision(); + addIfNotNull(doc, "mostSpecificSubDivName", mostSubdivision.getName()); + addIfNotNull(doc, "mostSpecificSubDivIsoCode", mostSubdivision.getIsoCode()); + addIfNotNull(doc, "mostSpecificSubDivConfidence", mostSubdivision.getConfidence()); + addIfNotNull(doc, "mostSpecificSubDivGeoNameId", mostSubdivision.getGeoNameId()); - Traits traits = response.getTraits(); - addIfNotNull(doc, "autonSystemNum", traits.getAutonomousSystemNumber()); - addIfNotNull(doc, "autonSystemOrg", traits.getAutonomousSystemOrganization()); + Subdivision leastSubdivision = ((AbstractCityResponse) response).getLeastSpecificSubdivision(); + addIfNotNull(doc, "leastSpecificSubDivName", leastSubdivision.getName()); + addIfNotNull(doc, "leastSpecificSubDivIsoCode", leastSubdivision.getIsoCode()); + addIfNotNull(doc, "leastSpecificSubDivConfidence", leastSubdivision.getConfidence()); + addIfNotNull(doc, "leastSpecificSubDivGeoNameId", leastSubdivision.getGeoNameId()); + + Traits traits = ((AbstractCountryResponse) response).getTraits(); + addIfNotNull(doc, "autonomousSystemNumber", traits.getAutonomousSystemNumber()); + addIfNotNull(doc, 
"autonomousSystemOrganization", traits.getAutonomousSystemOrganization()); + addIfNotNull(doc, "connectionType", traits.getConnectionType().toString()); addIfNotNull(doc, "domain", traits.getDomain()); addIfNotNull(doc, "isp", traits.getIsp()); - addIfNotNull(doc, "org", traits.getOrganization()); + addIfNotNull(doc, "mobileCountryCode", traits.getMobileCountryCode()); + addIfNotNull(doc, "mobileNetworkCode", traits.getMobileNetworkCode()); + addIfNotNull(doc, NETWORK_ADDRESS, traits.getNetwork().toString()); + addIfNotNull(doc, "organization", traits.getOrganization()); + addIfNotNull(doc, "staticIpScore", traits.getStaticIpScore()); + addIfNotNull(doc, "userCount", traits.getUserCount()); addIfNotNull(doc, "userType", traits.getUserType()); - //for better results, users should upgrade to - //https://www.maxmind.com/en/solutions/geoip2-enterprise-product-suite/anonymous-ip-database - addIfNotNull(doc, "isAnonProxy", String.valueOf(traits.isAnonymousProxy())); + addIfNotNull(doc, "isAnonymous", traits.isAnonymous()); + addIfNotNull(doc, "isAnonymousVpn", traits.isAnonymousVpn()); + addIfNotNull(doc, "isAnycast", traits.isAnycast()); + addIfNotNull(doc, "isHostingProvider", traits.isHostingProvider()); + addIfNotNull(doc, "isLegitimateProxy", traits.isLegitimateProxy()); + addIfNotNull(doc, "isPublicProxy", traits.isPublicProxy()); + addIfNotNull(doc, "isResidentialProxy", traits.isResidentialProxy()); + addIfNotNull(doc, "isTorExitNode", traits.isTorExitNode()); return doc; } - @SuppressWarnings("unused") - public static NutchDocument createDocFromCityService(String serverIp, - NutchDocument doc, WebServiceClient client) throws UnknownHostException, - IOException, GeoIp2Exception { - CityResponse response = client.city(InetAddress.getByName(serverIp)); + /** + * Populate a {@link org.apache.nutch.indexer.NutchDocument} based on lookup + * of IP in ConnectionDb. 
+ * @param serverIp the server IP + * @param doc NutchDocument to populate + * @param reader instantiated DatabaseReader object + * @return populated NutchDocument + * @throws UnknownHostException if IP address of host could not be determined + * @throws IOException if an error occurs performing the Db lookup + * @throws GeoIp2Exception generic GeoIp2 exception + */ + public static NutchDocument createDocFromConnectionDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws IOException, GeoIp2Exception { + Optional opt = reader.tryConnectionType(InetAddress + .getByName(serverIp)); + if (opt.isPresent()) { + ConnectionTypeResponse response = opt.get(); + addIfNotNull(doc, "ip", response.getIpAddress()); + addIfNotNull(doc, "connectionType", response.getConnectionType().toString()); + addIfNotNull(doc, NETWORK_ADDRESS, response.getNetwork().toString()); + } else { + LOG.debug("'{}' IP address not found in Connection DB.", serverIp); + } return doc; } - @SuppressWarnings("unused") - public static NutchDocument createDocFromCountryService(String serverIp, - NutchDocument doc, WebServiceClient client) throws UnknownHostException, - IOException, GeoIp2Exception { - CountryResponse response = client.country(InetAddress.getByName(serverIp)); + /** + * + * @param serverIp + * @param doc + * @param reader + * @return + * @throws IOException + * @throws GeoIp2Exception + */ + public static NutchDocument createDocFromDomainDb(String serverIp, + NutchDocument doc, DatabaseReader reader) throws IOException, GeoIp2Exception { + Optional opt = reader.tryDomain(InetAddress.getByName(serverIp)); + if (opt.isPresent()) { + DomainResponse response = opt.get(); + addIfNotNull(doc, "ip", response.getIpAddress()); + addIfNotNull(doc, "domain", response.getDomain()); + addIfNotNull(doc, NETWORK_ADDRESS, response.getNetwork().toString()); + } else { + LOG.debug("'{}' IP address not found in Domain DB.", serverIp); + } return doc; } + /** + * + * @param serverIp + * @param 
doc + * @param client + * @return + * @throws IOException + * @throws GeoIp2Exception + */ + public static NutchDocument createDocFromInsightsService(String serverIp, + NutchDocument doc, WebServiceClient client) throws IOException, GeoIp2Exception { + addIfNotNull(doc, "ip", serverIp); + return processDocument(doc, client.insights(InetAddress.getByName(serverIp))); + } + + /** + * + * @param serverIp + * @param doc + * @param reader + * @return + * @throws IOException + * @throws GeoIp2Exception + */ public static NutchDocument createDocFromIspDb(String serverIp, - NutchDocument doc, DatabaseReader reader) throws UnknownHostException, - IOException, GeoIp2Exception { + NutchDocument doc, DatabaseReader reader) throws IOException, GeoIp2Exception { IspResponse response = reader.isp(InetAddress.getByName(serverIp)); addIfNotNull(doc, "ip", serverIp); addIfNotNull(doc, "autonSystemNum", response.getAutonomousSystemNumber()); @@ -158,81 +302,4 @@ public static NutchDocument createDocFromIspDb(String serverIp, return doc; } - public static NutchDocument createDocFromDomainDb(String serverIp, - NutchDocument doc, DatabaseReader reader) throws UnknownHostException, - IOException, GeoIp2Exception { - DomainResponse response; - try { - response = reader.domain(InetAddress.getByName(serverIp)); - } catch (AddressNotFoundException e) { - LOG.debug("IP address not found: {}", serverIp); - return doc; - } - addIfNotNull(doc, "ip", serverIp); - addIfNotNull(doc, "domain", response.getDomain()); - return doc; - } - - public static NutchDocument createDocFromConnectionDb(String serverIp, - NutchDocument doc, DatabaseReader reader) throws UnknownHostException, - IOException, GeoIp2Exception { - ConnectionTypeResponse response = reader.connectionType(InetAddress - .getByName(serverIp)); - addIfNotNull(doc, "ip", serverIp); - addIfNotNull(doc, "connType", response.getConnectionType().toString()); - return doc; - } - - public static NutchDocument createDocFromCityDb(String serverIp, 
- NutchDocument doc, DatabaseReader reader) throws UnknownHostException, - IOException, GeoIp2Exception { - addIfNotNull(doc, "ip", serverIp); - - CityResponse response; - try { - response = reader.city(InetAddress.getByName(serverIp)); - } catch (AddressNotFoundException e) { - LOG.debug("IP address not found: {}", serverIp); - return doc; - } - - City city = response.getCity(); - addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis' - addIfNotNull(doc, "cityConfidence", city.getConfidence()); // 50 - addIfNotNull(doc, "cityGeoNameId", city.getGeoNameId()); - - - Continent continent = response.getContinent(); - addIfNotNull(doc, "continentCode", continent.getCode()); - addIfNotNull(doc, "continentGeoNameId", continent.getGeoNameId()); - addIfNotNull(doc, "continentName", continent.getName()); - - Country country = response.getCountry(); - addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US' - addIfNotNull(doc, "countryName", country.getName()); // 'United States' - addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99 - addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId()); - - Location location = response.getLocation(); - addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, - // -93.2323 - addIfNotNull(doc, "accRadius", location.getAccuracyRadius()); // 3 - addIfNotNull(doc, "timeZone", location.getTimeZone()); // 'America/Chicago' - addIfNotNull(doc, "metroCode", location.getMetroCode()); - - Postal postal = response.getPostal(); - addIfNotNull(doc, "postalCode", postal.getCode()); // '55455' - addIfNotNull(doc, "postalConfidence", postal.getConfidence()); // 40 - - RepresentedCountry rCountry = response.getRepresentedCountry(); - addIfNotNull(doc, "countryType", rCountry.getType()); - - Subdivision subdivision = response.getMostSpecificSubdivision(); - addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota' - addIfNotNull(doc, "subDivIsoCode", 
subdivision.getIsoCode()); // 'MN' - addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90 - addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId()); - return doc; - } - } diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java index ea30b8c7b2..eea14fc127 100644 --- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java +++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java @@ -16,10 +16,12 @@ */ package org.apache.nutch.indexer.geoip; -import java.lang.invoke.MethodHandles; -import java.net.URL; import java.io.File; import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.net.URL; +import java.util.Objects; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; @@ -32,104 +34,47 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.maxmind.db.CHMCache; import com.maxmind.geoip2.DatabaseReader; import com.maxmind.geoip2.WebServiceClient; +import com.maxmind.geoip2.exception.GeoIp2Exception; /** - * This plugin implements an indexing filter which takes advantage of the GeoIP2-java API. + *

This plugin implements an indexing filter which takes advantage of the GeoIP2-java API.

*

* The third party library distribution provides an API for the GeoIP2 Precision web - * services and databases. The - * API also works with the free GeoLite2 databases. + * href="https://dev.maxmind.com/geoip/geolocate-an-ip/web-services">Precision web + * services, + * GeoLite2 (free) and databases. *

*

- * Depending on the service level agreement, you have with the GeoIP service - * provider, the plugin can add a number of the following fields to the index - * data model: - *

    - *
  1. Continent
  2. - *
  3. Country
  4. - *
  5. Regional Subdivision
  6. - *
  7. City
  8. - *
  9. Postal Code
  10. - *
  11. Latitude/Longitude
  12. - *
  13. ISP/Organization
  14. - *
  15. AS Number
  16. - *
  17. Confidence Factors
  18. - *
  19. Radius
  20. - *
  21. User Type
  22. - *
- * - *

- * Some of the services are documented at the GeoIP2 Precision - * Services webpage where more information can be obtained. + * Services webpage for more information. *

- * *

- * You should also consult the following three properties in - * nutch-site.xml + * You should consult and configure the index.geoip.* properties in + * nutch-site.xml. *

- * - *
- *  {@code
- * 
- * 
- *   index.geoip.usage
- *   insightsService
- *   
- *   A string representing the information source to be used for GeoIP information
- *   association. Either enter 'cityDatabase', 'connectionTypeDatabase', 
- *   'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the 
- *   Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, 
- *   GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath 
- *   and available at runtime. This can be achieved by adding it to `$NUTCH_HOME/conf`.
- *   Alternatively, also the GeoLite2 IP databases (GeoLite2-*.mmdb) can be used.
- *   
- * 
- * 
- * 
- *   index.geoip.userid
- *   
- *   
- *   The userId associated with the GeoIP2 Precision Services account.
- *   
- * 
- * 
- * 
- *   index.geoip.licensekey
- *   
- *   
- *   The license key associated with the GeoIP2 Precision Services account.
- *   
- * 
- * }
- * 
- * */ public class GeoIPIndexingFilter implements IndexingFilter { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); - private Configuration conf; - - private String usage = null; - - WebServiceClient client = null; - - DatabaseReader reader = null; - - // private AbstractResponse response = null; + private String usage; + private static final String INSIGHTS_SERVICE = "insights"; + private WebServiceClient client; + private DatabaseReader reader; /** * Default constructor for this plugin */ public GeoIPIndexingFilter() { + //Constructor } /** @@ -141,60 +86,58 @@ public Configuration getConf() { } /** + * Set plugin {@link org.apache.hadoop.conf.Configuration} * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration) */ @Override - public void setConf(Configuration conf) { - this.conf = conf; - usage = conf.get("index.geoip.usage", "insightsService"); - LOG.debug("GeoIP usage medium set to: {}", usage); - if (usage.equalsIgnoreCase("insightsService")) { + public void setConf(Configuration config) { + conf = config; + if (!config.getBoolean("store.ip.address", false)) { + LOG.warn("Plugin index-geoip is active but IP address is not stored. 
" + + "'store.ip.address' must be set to true in nutch-site.xml."); + } + usage = config.get("index.geoip.usage"); + if (usage != null && usage.equalsIgnoreCase(INSIGHTS_SERVICE)) { client = new WebServiceClient.Builder( - conf.getInt("index.geoip.userid", 12345), - conf.get("index.geoip.licensekey")).build(); - } else { - String dbSuffix = null; - if (usage.equalsIgnoreCase("cityDatabase")) { - dbSuffix = "-City.mmdb"; - } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) { - dbSuffix = "-Connection-Type.mmdb"; - } else if (usage.equalsIgnoreCase("domainDatabase")) { - dbSuffix = "-Domain.mmdb"; - } else if (usage.equalsIgnoreCase("ispDatabase")) { - dbSuffix = "-ISP.mmdb"; - } - String[] dbPrefixes = {"GeoIP2", "GeoLite2"}; - for (String dbPrefix : dbPrefixes) { - String db = dbPrefix + dbSuffix; - URL dbFileUrl = conf.getResource(db); + Integer.parseInt(config.get("index.geoip.userid")), + config.get("index.geoip.licensekey")).build(); + LOG.debug("Established geoip-index InsightsService client."); + } else if (usage != null && !usage.equalsIgnoreCase(INSIGHTS_SERVICE)) { + String dbFile = config.get("index.geoip.db.file"); + if (dbFile != null) { + LOG.debug("GeoIP db file: {}", dbFile); + URL dbFileUrl = config.getResource(dbFile); if (dbFileUrl == null) { - LOG.error("GeoDb file {} not found on classpath", db); + LOG.error("Db file {} not found on classpath", dbFile); } else { try { - LOG.info("Reading GeoDb file {}", db); buildDb(new File(dbFileUrl.getFile())); } catch (Exception e) { - LOG.error("Failed to read geoDb file {}: ", db, e); + LOG.error("Failed to read Db file: {} {}", dbFile, e.getMessage()); } } } } - if (!conf.getBoolean("store.ip.address", false)) { - LOG.warn("Plugin index-geoip is active but IP address is not stored" - + "(store.ip.address == false)"); - } } + /* + * Build the Database and + * + * associated cache. + * @param geoDb the GeoIP2 database to be used for IP lookups. 
+ */ private void buildDb(File geoDb) { try { - reader = new DatabaseReader.Builder(geoDb).build(); - } catch (IOException e) { - LOG.error("Failed to build geoDb:", e); + LOG.info("Reading index-geoip Db file: {}", geoDb); + reader = Objects.requireNonNull(new DatabaseReader.Builder(geoDb).withCache(new CHMCache()).build()); + } catch (IOException | NullPointerException e) { + LOG.error("Failed to build Db: {}", e.getMessage()); } } /** - * + * Filter the document. + * @return A {@link org.apache.nutch.indexer.NutchDocument} with added geoip fields. * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument, * org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text, * org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks) @@ -202,31 +145,42 @@ private void buildDb(File geoDb) { @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { - return addServerGeo(doc, parse.getData(), url.toString()); + return augmentNutchDocWithIPData(doc, parse.getData()); } - private NutchDocument addServerGeo(NutchDocument doc, ParseData data, - String url) { - + private NutchDocument augmentNutchDocWithIPData(NutchDocument doc, ParseData data) { String serverIp = data.getContentMeta().get("_ip_"); - if (serverIp != null && reader != null) { + // The global DatabaseReader variable is already NonNull so no null check required. 
+ if (!serverIp.isEmpty()) { try { - if (usage.equalsIgnoreCase("cityDatabase")) { + switch (conf.get("index.geoip.usage").toLowerCase()) { + case "anonymous": + doc = GeoIPDocumentCreator.createDocFromAnonymousIpDb(serverIp, doc, reader); + break; + case "asn": + doc = GeoIPDocumentCreator.createDocFromAsnDb(serverIp, doc, reader); + break; + case "city": doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, reader); - } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) { - doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc, - reader); - } else if (usage.equalsIgnoreCase("domainDatabase")) { - doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc, - reader); - } else if (usage.equalsIgnoreCase("ispDatabase")) { + break; + case "connection": + doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc, reader); + break; + case "domain": + doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc, reader); + break; + case INSIGHTS_SERVICE: + doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, doc, client); + break; + case "isp": doc = GeoIPDocumentCreator.createDocFromIspDb(serverIp, doc, reader); - } else if (usage.equalsIgnoreCase("insightsService")) { - doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, doc, - client); + break; + default: + LOG.error("Failed to determine 'index.geoip.usage' value: {}", usage); } - } catch (Exception e) { - LOG.error("Failed to determine geoip:", e); + } catch (IOException | GeoIp2Exception e) { + LOG.error("Error creating index-geoip fields _ip_: {}, databe type: {} \n{}", + serverIp, reader.getMetadata().getDatabaseType(), e.getMessage()); } } return doc; diff --git a/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/TestGeoIPIndexingFilter.java b/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/TestGeoIPIndexingFilter.java new file mode 100644 index 0000000000..9e2c52ed9a --- /dev/null +++ 
b/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/TestGeoIPIndexingFilter.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.geoip; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import javax.annotation.processing.Filer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * + */ +public class TestGeoIPIndexingFilter { + + private Configuration conf; + private GeoIPIndexingFilter filter; + 
private NutchDocument doc; + private ParseImpl parseImpl; + private Text text; + private CrawlDatum crawlDatum; + private Inlinks inlinks; + + @Before + public void setUp() { + conf = NutchConfiguration.create(); + conf.setBoolean("store.ip.address", true); + filter = new GeoIPIndexingFilter(); + filter.setConf(conf); + doc = new NutchDocument(); + parseImpl = new ParseImpl("foo bar", new ParseData()); + inlinks = new Inlinks(); + text = new Text("http://nutch.apache.org/index.html"); + crawlDatum = new CrawlDatum(); + } + + @After + public void teardown() { + filter.getConf().clear(); + } + + /** + * Test method for {@link org.apache.nutch.indexer.geoip.GeoIPIndexingFilter#getConf()}. + */ + @Test + public final void testGetConf() { + assertTrue(filter.getConf().getBoolean("store.ip.address", true)); + } + + /** + * Test method for {@link org.apache.nutch.indexer.geoip.GeoIPIndexingFilter#setConf(org.apache.hadoop.conf.Configuration)}. + */ + @Test + public final void testSetConfCaseInsensitive() { + assertNull(filter.getConf().get("index.geoip.usage")); + // test for case insensitivity + filter.getConf().set("index.geoip.usage", "InSiGhTs"); + assertTrue(filter.getConf().get("index.geoip.usage").equalsIgnoreCase("insights")); + } + + /** + * Test method for {@link org.apache.nutch.indexer.geoip.GeoIPIndexingFilter#setConf(org.apache.hadoop.conf.Configuration)}. 
+ */ + @Test + public final void testSetConfDbFile() { + assertNull(filter.getConf().get("index.geoip.db.file")); + // test for case insensitivity + filter.getConf().set("index.geoip.usage", "CiTy"); + filter.getConf().set("index.geoip.db.file", "GeoIP2-City-Test.mmdb"); + assertEquals(filter.getConf().get("index.geoip.db.file"), "GeoIP2-City-Test.mmdb"); + } + + /** + * Test method for {@link org.apache.nutch.indexer.geoip.GeoIPIndexingFilter#filter(org.apache.nutch.indexer.NutchDocument, org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text, org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)}. + * Uses the GeoIP2 Anonymous IP database to augment NutchDocument fields. + * @throws IndexingException + */ + @Test + public void testAnonymousIPDatabaseGeoIPIndexingFilter() { + conf.set("index.geoip.usage", "anonymous"); + conf.set("index.geoip.db.file", "GeoIP2-Anonymous-IP-Test.mmdb"); + filter.setConf(conf); + parseImpl.getData().getContentMeta().add("_ip_", "::81.2.69.0/120"); + try { + filter.filter(doc, parseImpl, text, crawlDatum, inlinks); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + assertNotNull(doc); + System.out.print(doc.toString()); + assertNotNull(doc.getField("is_anonymous")); + assertNotEquals(doc.getFieldValue("is_tor_exit_node"), "true"); + assertEquals(10, doc.getFieldNames().size()); + assertTrue("NutchDocument contains 'is_public_proxy' field.", + doc.getFieldNames().contains("is_public_proxy")); + } +// +// @Test +// public void testAsnIPDatabaseGeoIPIndexingFilter() { +// conf.set("index.geoip.usage", "asn"); +// conf.set("index.geoip.db.file", "GeoLite2-ASN-Test.mmdb"); +// +// GeoIPIndexingFilter filter = new GeoIPIndexingFilter(); +// filter.setConf(conf); +// Assert.assertNotNull(filter); +// +// NutchDocument doc = new NutchDocument(); +// } +// +// @Test +// public void testCityDatabaseGeoIPIndexingFilter() { +// conf.set("index.geoip.usage", "city"); +// conf.set("index.geoip.db.file", 
"GeoIP2-City-Test.mmdb"); +// +// GeoIPIndexingFilter filter = new GeoIPIndexingFilter(); +// filter.setConf(conf); +// Assert.assertNotNull(filter); +// +// NutchDocument doc = new NutchDocument(); +// } +// +// @Test +// public void testConnectionDatabaseGeoIPIndexingFilter() { +// conf.set("index.geoip.usage", "connection"); +// conf.set("index.geoip.db.file", "GeoIP2-Connection-Type-Test.mmdb"); +// +// GeoIPIndexingFilter filter = new GeoIPIndexingFilter(); +// filter.setConf(conf); +// Assert.assertNotNull(filter); +// +// NutchDocument doc = new NutchDocument(); +// } +// +// @Test +// public void testDomainDatabaseGeoIPIndexingFilter() { +// conf.set("index.geoip.usage", "domain"); +// conf.set("index.geoip.db.file", "GeoIP2-Domain-Test.mmdb"); +// +// GeoIPIndexingFilter filter = new GeoIPIndexingFilter(); +// filter.setConf(conf); +// Assert.assertNotNull(filter); +// +// NutchDocument doc = new NutchDocument(); +// } +// +// @Test +// public void testInsightsGeoIPIndexingFilter() { +// conf.set("index.geoip.usage", "insights"); +// conf.set("index.geoip.userid", ""); +// conf.set("index.geoip.licensekey", ""); +// +// GeoIPIndexingFilter filter = new GeoIPIndexingFilter(); +// filter.setConf(conf); +// Assert.assertNotNull(filter); +// +// NutchDocument doc = new NutchDocument(); +// } +// +// @Test +// public void testIspDatabaseGeoIPIndexingFilter() { +// conf.set("index.geoip.usage", "isp"); +// conf.set("index.geoip.db.file", "GeoIP2-ISP-Test.mmdb"); +// +// GeoIPIndexingFilter filter = new GeoIPIndexingFilter(); +// filter.setConf(conf); +// Assert.assertNotNull(filter); +// +// NutchDocument doc = new NutchDocument(); +// } +} From 28eb77278e1626a2d348f1b67ac7c6a1cb21a390 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Wed, 25 Sep 2024 21:56:21 -0700 Subject: [PATCH 2/2] NUTCH-3064 Upgrade com.maxmind.geoip2:geoip2 dependency in geoip-index to v4.2.0 --- .../indexer/geoip/GeoIPIndexingFilter.java | 45 ++++++++++--------- 
.../geoip/TestGeoIPIndexingFilter.java | 2 +- .../urlfilter/api/RegexURLFilterBaseTest.java | 30 ++++++------- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java index eea14fc127..0b551d7485 100644 --- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java +++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java @@ -16,10 +16,11 @@ */ package org.apache.nutch.indexer.geoip; -import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.lang.invoke.MethodHandles; -import java.net.URL; +import java.util.Arrays; +import java.util.List; import java.util.Objects; import org.apache.hadoop.conf.Configuration; @@ -67,6 +68,8 @@ public class GeoIPIndexingFilter implements IndexingFilter { private Configuration conf; private String usage; private static final String INSIGHTS_SERVICE = "insights"; + private static final List<String> DB_TYPES = Arrays.asList( + "anonymous", "asn", "city", "connection", "domain", "isp"); private WebServiceClient client; private DatabaseReader reader; @@ -99,39 +102,37 @@ public void setConf(Configuration config) { usage = config.get("index.geoip.usage"); if (usage != null && usage.equalsIgnoreCase(INSIGHTS_SERVICE)) { client = new WebServiceClient.Builder( - Integer.parseInt(config.get("index.geoip.userid")), - config.get("index.geoip.licensekey")).build(); + Integer.parseInt(config.get("index.geoip.userid")), + config.get("index.geoip.licensekey")).build(); LOG.debug("Established geoip-index InsightsService client."); - } else if (usage != null && !usage.equalsIgnoreCase(INSIGHTS_SERVICE)) { + } else if (usage != null && DB_TYPES.contains(usage.toLowerCase())) { String dbFile = config.get("index.geoip.db.file"); if (dbFile != null) { - LOG.debug("GeoIP db file: {}",
dbFile); - URL dbFileUrl = config.getResource(dbFile); - if (dbFileUrl == null) { - LOG.error("Db file {} not found on classpath", dbFile); + InputStream db = config.getConfResourceAsInputStream(dbFile); + if (db == null) { + LOG.error("GeoIP DB file {} not found on classpath", dbFile); } else { - try { - buildDb(new File(dbFileUrl.getFile())); - } catch (Exception e) { - LOG.error("Failed to read Db file: {} {}", dbFile, e.getMessage()); - } + buildDb(db, dbFile); } } + } else { + LOG.warn("Error processing index-geoip plugin configuration."); } } - /* + /** * Build the Database and * * associated cache. - * @param geoDb the GeoIP2 database to be used for IP lookups. + * @param db an {@link InputStream} representing the GeoIP2 DB to be used for IP lookups. + * @param dbFile the GeoIP DB file name */ - private void buildDb(File geoDb) { + private void buildDb(InputStream db, String dbFile) { try { - LOG.info("Reading index-geoip Db file: {}", geoDb); - reader = Objects.requireNonNull(new DatabaseReader.Builder(geoDb).withCache(new CHMCache()).build()); + reader = Objects.requireNonNull(new DatabaseReader.Builder(db).withCache(new CHMCache()).build()); + LOG.info("Built in-memory GeoIP lookup DB from file: {}", db); } catch (IOException | NullPointerException e) { - LOG.error("Failed to build Db: {}", e.getMessage()); + LOG.error("Failed to read Db file: {} {}", dbFile, e.getMessage()); } } @@ -179,8 +180,8 @@ private NutchDocument augmentNutchDocWithIPData(NutchDocument doc, ParseData dat LOG.error("Failed to determine 'index.geoip.usage' value: {}", usage); } } catch (IOException | GeoIp2Exception e) { - LOG.error("Error creating index-geoip fields _ip_: {}, databe type: {} \n{}", - serverIp, reader.getMetadata().getDatabaseType(), e.getMessage()); + LOG.error("Error creating index-geoip fields _ip_: {}, databe type: {} \n{}", + serverIp, reader.getMetadata().getDatabaseType(), e.getMessage()); } } return doc; diff --git 
a/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/TestGeoIPIndexingFilter.java b/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/TestGeoIPIndexingFilter.java index 9e2c52ed9a..ee8331fb4b 100644 --- a/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/TestGeoIPIndexingFilter.java +++ b/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/TestGeoIPIndexingFilter.java @@ -42,7 +42,7 @@ import org.junit.Test; /** - * + * Tests for {@link org.apache.nutch.indexer.geoip.GeoIPIndexingFilter} */ public class TestGeoIPIndexingFilter { diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java index 080b2e5870..48784e09b4 100644 --- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java +++ b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java @@ -33,7 +33,7 @@ import org.apache.nutch.net.URLFilter; /** - * JUnit based test of class RegexURLFilterBase. 
+ * Base test class for {@link org.apache.nutch.urlfilter.api.RegexURLFilterBase} * * @author Jérôme Charron */ @@ -43,14 +43,14 @@ public abstract class RegexURLFilterBaseTest { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); - protected final static String SEPARATOR = System.getProperty("file.separator"); - protected final static String SAMPLES = System.getProperty("test.data", "."); + protected static final String SEPARATOR = System.getProperty("file.separator"); + protected static final String SAMPLES = System.getProperty("test.data", "."); protected abstract URLFilter getURLFilter(Reader rules); protected void bench(int loops, String file) { - try { - bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"), + try (FileReader rulesReader = new FileReader(SAMPLES + SEPARATOR + file + ".rules");) { + bench(loops, rulesReader, new FileReader(SAMPLES + SEPARATOR + file + ".urls")); } catch (Exception e) { Assert.fail(e.toString()); @@ -74,8 +74,8 @@ protected void bench(int loops, Reader rules, Reader urls) { } protected void bench(int loops, String rulesFile, String urlsFile) { - try { - bench(loops, new FileReader(SAMPLES + SEPARATOR + rulesFile), + try (FileReader rulesReader = new FileReader(SAMPLES + SEPARATOR + rulesFile);) { + bench(loops, rulesReader, new FileReader(SAMPLES + SEPARATOR + urlsFile)); } catch (Exception e) { Assert.fail(e.toString()); @@ -83,18 +83,18 @@ protected void bench(int loops, String rulesFile, String urlsFile) { } protected void test(String rulesFile, String urlsFile) { - try { - test(new FileReader(SAMPLES + SEPARATOR + rulesFile), - new FileReader(SAMPLES + SEPARATOR + urlsFile)); + try (FileReader rulesReader = new FileReader(SAMPLES + SEPARATOR + rulesFile); + FileReader urlsReader = new FileReader(SAMPLES + SEPARATOR + urlsFile);) { + test(rulesReader, urlsReader); } catch (Exception e) { Assert.fail(e.toString()); } } protected void test(String file) { - try { - 
test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"), - new FileReader(SAMPLES + SEPARATOR + file + ".urls")); + try (FileReader rulesReader = new FileReader(SAMPLES + SEPARATOR + file + ".rules"); + FileReader urlsReader = new FileReader(SAMPLES + SEPARATOR + file + ".urls");) { + test(rulesReader, urlsReader); } catch (Exception e) { Assert.fail(e.toString()); } @@ -121,14 +121,14 @@ protected void test(URLFilter filter, FilteredURL[] expected) { private static FilteredURL[] readURLFile(Reader reader) throws IOException { BufferedReader in = new BufferedReader(reader); - List<FilteredURL> list = new ArrayList<FilteredURL>(); + List<FilteredURL> list = new ArrayList<>(); String line; while ((line = in.readLine()) != null) { if (line.length() != 0) { list.add(new FilteredURL(line)); } } - return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]); + return list.toArray(new FilteredURL[list.size()]); } private static class FilteredURL {