forked from eclipse-rdf4j/rdf4j
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
eclipse-rdf4jGH-5090 Lucene 9 version of the Lucene SAIL
- Copied the Lucene SAIL implementation and upgraded it to Lucene 9 Signed-off-by: Richard Eckart de Castilho <[email protected]>
- Loading branch information
Showing
23 changed files
with
5,124 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<parent>
		<groupId>org.eclipse.rdf4j</groupId>
		<artifactId>rdf4j-sail</artifactId>
		<version>5.0.2-SNAPSHOT</version>
	</parent>
	<artifactId>rdf4j-sail-lucene-v9</artifactId>
	<!-- The display name carries the Lucene major version so it matches the artifactId in build output. -->
	<name>RDF4J: Lucene 9 Sail Index</name>
	<description>StackableSail implementation offering full-text search on literals, based on Apache Lucene 9.</description>
	<properties>
		<!-- Single place to bump the version of all Lucene artifacts declared below. -->
		<lucene.version>9.11.1</lucene.version>
	</properties>
	<dependencies>
		<dependency>
			<groupId>${project.groupId}</groupId>
			<artifactId>rdf4j-sail-lucene-api</artifactId>
			<version>${project.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queries</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-highlighter</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analysis-common</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queryparser</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<!-- LuceneDocument imports org.apache.lucene.sandbox.document.LatLonBoundingBox directly,
			     so the sandbox module is declared explicitly instead of being relied on transitively. -->
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-sandbox</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-spatial-extras</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-backward-codecs</artifactId>
			<version>${lucene.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>${project.groupId}</groupId>
			<artifactId>rdf4j-queryalgebra-geosparql</artifactId>
			<version>${project.version}</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>${project.groupId}</groupId>
			<artifactId>rdf4j-rio-rdfxml</artifactId>
			<version>${project.version}</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>${project.groupId}</groupId>
			<artifactId>rdf4j-sail-memory</artifactId>
			<version>${project.version}</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>org.locationtech.jts</groupId>
			<artifactId>jts-core</artifactId>
		</dependency>
		<dependency>
			<groupId>org.junit.vintage</groupId>
			<artifactId>junit-vintage-engine</artifactId>
			<scope>test</scope>
		</dependency>
	</dependencies>
</project>
199 changes: 199 additions & 0 deletions
199
core/sail/lucene-v9/src/main/java/org/eclipse/rdf4j/sail/lucene/impl/LuceneDocument.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,199 @@ | ||
/******************************************************************************* | ||
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others. | ||
* | ||
* All rights reserved. This program and the accompanying materials | ||
* are made available under the terms of the Eclipse Distribution License v1.0 | ||
* which accompanies this distribution, and is available at | ||
* http://www.eclipse.org/org/documents/edl-v10.php. | ||
* | ||
* SPDX-License-Identifier: BSD-3-Clause | ||
*******************************************************************************/ | ||
package org.eclipse.rdf4j.sail.lucene.impl; | ||
|
||
import java.io.IOException; | ||
import java.text.ParseException; | ||
import java.util.Arrays; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
import org.apache.lucene.document.Document; | ||
import org.apache.lucene.document.Field; | ||
import org.apache.lucene.document.LatLonPoint; | ||
import org.apache.lucene.document.LatLonShape; | ||
import org.apache.lucene.geo.Line; | ||
import org.apache.lucene.geo.Polygon; | ||
import org.apache.lucene.geo.Rectangle; | ||
import org.apache.lucene.geo.SimpleWKTShapeParser; | ||
import org.apache.lucene.index.IndexableField; | ||
import org.apache.lucene.sandbox.document.LatLonBoundingBox; | ||
import org.apache.lucene.spatial.SpatialStrategy; | ||
import org.eclipse.rdf4j.sail.lucene.LuceneSail; | ||
import org.eclipse.rdf4j.sail.lucene.SearchDocument; | ||
import org.eclipse.rdf4j.sail.lucene.SearchFields; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import com.google.common.base.Function; | ||
|
||
public class LuceneDocument implements SearchDocument { | ||
|
||
private final Document doc; | ||
private final Logger logger = LoggerFactory.getLogger(getClass()); | ||
|
||
private static final String POINT_FIELD_PREFIX = "_pt_"; | ||
private static final String GEO_FIELD_PREFIX = "_geo_"; | ||
|
||
private final Function<? super String, ? extends SpatialStrategy> geoStrategyMapper; | ||
|
||
public LuceneDocument(Function<? super String, ? extends SpatialStrategy> geoStrategyMapper) { | ||
this(new Document(), geoStrategyMapper); | ||
} | ||
|
||
public LuceneDocument(Document doc, Function<? super String, ? extends SpatialStrategy> geoStrategyMapper) { | ||
this.doc = doc; | ||
this.geoStrategyMapper = geoStrategyMapper; | ||
} | ||
|
||
public LuceneDocument(String id, String resourceId, String context, | ||
Function<? super String, ? extends SpatialStrategy> geoStrategyMapper) { | ||
this(geoStrategyMapper); | ||
setId(id); | ||
setResource(resourceId); | ||
setContext(context); | ||
} | ||
|
||
private void setId(String id) { | ||
LuceneIndex.addIDField(id, doc); | ||
} | ||
|
||
private void setContext(String context) { | ||
LuceneIndex.addContextField(context, doc); | ||
} | ||
|
||
private void setResource(String resourceId) { | ||
LuceneIndex.addResourceField(resourceId, doc); | ||
} | ||
|
||
public Document getDocument() { | ||
return doc; | ||
} | ||
|
||
@Override | ||
public String getId() { | ||
return doc.get(SearchFields.ID_FIELD_NAME); | ||
} | ||
|
||
@Override | ||
public String getResource() { | ||
return doc.get(SearchFields.URI_FIELD_NAME); | ||
} | ||
|
||
@Override | ||
public String getContext() { | ||
return doc.get(SearchFields.CONTEXT_FIELD_NAME); | ||
} | ||
|
||
@Override | ||
public Set<String> getPropertyNames() { | ||
List<IndexableField> fields = doc.getFields(); | ||
Set<String> names = new HashSet<>(); | ||
for (IndexableField field : fields) { | ||
String name = field.name(); | ||
if (SearchFields.isPropertyField(name)) { | ||
names.add(name); | ||
} | ||
} | ||
return names; | ||
} | ||
|
||
@Override | ||
public void addProperty(String name) { | ||
// don't need to do anything | ||
} | ||
|
||
/** | ||
* Stores and indexes a property in a Document. We don't have to recalculate the concatenated text: just add another | ||
* TEXT field and Lucene will take care of this. Additional advantage: Lucene may be able to handle the invididual | ||
* strings in a way that may affect e.g. phrase and proximity searches (concatenation basically means loss of | ||
* information). NOTE: The TEXT_FIELD_NAME has to be stored, see in LuceneSail | ||
* | ||
* @see LuceneSail | ||
*/ | ||
@Override | ||
public void addProperty(String name, String text) { | ||
LuceneIndex.addPredicateField(name, text, doc); | ||
LuceneIndex.addTextField(text, doc); | ||
} | ||
|
||
/** | ||
* Checks whether a field occurs with a specified value in a Document. | ||
*/ | ||
@Override | ||
public boolean hasProperty(String fieldName, String value) { | ||
String[] fields = doc.getValues(fieldName); | ||
if (fields != null) { | ||
for (String field : fields) { | ||
if (value.equals(field)) { | ||
return true; | ||
} | ||
} | ||
} | ||
|
||
return false; | ||
} | ||
|
||
@Override | ||
public List<String> getProperty(String name) { | ||
return Arrays.asList(doc.getValues(name)); | ||
} | ||
|
||
private void indexShape(Object shape, String field) { | ||
|
||
if (shape instanceof Object[]) { // case of GEOMETRYCOLLECTION | ||
Object[] geometries = (Object[]) shape; | ||
|
||
for (int i = 0; i < geometries.length; i++) { | ||
indexShape(geometries[i], field); | ||
} | ||
} else { | ||
if (shape instanceof Polygon) { // WKT:POLYGON | ||
for (Field f : LatLonShape.createIndexableFields(GEO_FIELD_PREFIX + field, (Polygon) shape)) { | ||
doc.add(f); | ||
} | ||
} else if (shape instanceof Line) { // WKT:LINESTRING | ||
for (Field f : LatLonShape.createIndexableFields(GEO_FIELD_PREFIX + field, (Line) shape)) { | ||
doc.add(f); | ||
} | ||
} else if (shape instanceof double[]) { // WKT:POINT | ||
double[] point = (double[]) shape; | ||
|
||
for (Field f : LatLonShape.createIndexableFields(GEO_FIELD_PREFIX + field, point[1], | ||
point[0])) { | ||
doc.add(f); | ||
} | ||
doc.add(new LatLonPoint(POINT_FIELD_PREFIX + field, point[1], point[0])); | ||
} else if (shape instanceof Rectangle) { // WKT:ENVELOPE / RECTANGLE | ||
Rectangle box = (Rectangle) shape; | ||
doc.add(new LatLonBoundingBox(GEO_FIELD_PREFIX + field, box.minLat, box.minLon, box.maxLat, | ||
box.maxLon)); | ||
} else { | ||
throw new IllegalArgumentException("Geometry for shape " + shape.toString() + " is not supported"); | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public void addGeoProperty(String field, String value) { | ||
LuceneIndex.addStoredOnlyPredicateField(field, value, doc); | ||
try { | ||
String wkt = value; | ||
Object shape = SimpleWKTShapeParser.parse(wkt); | ||
indexShape(shape, field); | ||
} catch (ParseException e) { | ||
logger.warn("error while processing geo property", e); | ||
} catch (IOException e) { | ||
logger.warn("error while parsing wkt geometry", e); | ||
} | ||
} | ||
} |
67 changes: 67 additions & 0 deletions
67
...il/lucene-v9/src/main/java/org/eclipse/rdf4j/sail/lucene/impl/LuceneDocumentDistance.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/******************************************************************************* | ||
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others. | ||
* | ||
* All rights reserved. This program and the accompanying materials | ||
* are made available under the terms of the Eclipse Distribution License v1.0 | ||
* which accompanies this distribution, and is available at | ||
* http://www.eclipse.org/org/documents/edl-v10.php. | ||
* | ||
* SPDX-License-Identifier: BSD-3-Clause | ||
*******************************************************************************/ | ||
package org.eclipse.rdf4j.sail.lucene.impl; | ||
|
||
import java.text.ParseException; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
import org.apache.lucene.search.ScoreDoc; | ||
import org.eclipse.rdf4j.model.IRI; | ||
import org.eclipse.rdf4j.sail.lucene.DocumentDistance; | ||
import org.eclipse.rdf4j.sail.lucene.SearchFields; | ||
import org.eclipse.rdf4j.sail.lucene.util.GeoUnits; | ||
import org.locationtech.spatial4j.shape.Point; | ||
import org.locationtech.spatial4j.shape.Shape; | ||
|
||
import com.google.common.collect.Sets; | ||
|
||
public class LuceneDocumentDistance extends LuceneDocumentResult implements DocumentDistance { | ||
|
||
private final String geoProperty; | ||
|
||
private final IRI units; | ||
|
||
private final Point origin; | ||
|
||
private static Set<String> requiredFields(String geoProperty, boolean includeContext) { | ||
Set<String> fields = Sets.newHashSet(SearchFields.URI_FIELD_NAME, geoProperty); | ||
if (includeContext) { | ||
fields.add(SearchFields.CONTEXT_FIELD_NAME); | ||
} | ||
return fields; | ||
} | ||
|
||
public LuceneDocumentDistance(ScoreDoc doc, String geoProperty, IRI units, Point origin, boolean includeContext, | ||
LuceneIndex index) { | ||
super(doc, index, requiredFields(geoProperty, includeContext)); | ||
this.geoProperty = geoProperty; | ||
this.units = units; | ||
this.origin = origin; | ||
} | ||
|
||
@Override | ||
public double getDistance() { | ||
List<String> wkts = getDocument().getProperty(geoProperty); | ||
double min = Double.POSITIVE_INFINITY; | ||
for (String wkt : wkts) { | ||
Shape shape; | ||
try { | ||
shape = index.getSpatialContext(geoProperty).readShapeFromWkt(wkt); | ||
double dist = index.getSpatialContext(geoProperty).calcDistance(shape.getCenter(), origin); | ||
min = Math.min(dist, min); | ||
} catch (ParseException e) { | ||
// ignore | ||
} | ||
} | ||
return GeoUnits.fromDegrees(min, units); | ||
} | ||
} |
44 changes: 44 additions & 0 deletions
44
...sail/lucene-v9/src/main/java/org/eclipse/rdf4j/sail/lucene/impl/LuceneDocumentResult.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
/******************************************************************************* | ||
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others. | ||
* | ||
* All rights reserved. This program and the accompanying materials | ||
* are made available under the terms of the Eclipse Distribution License v1.0 | ||
* which accompanies this distribution, and is available at | ||
* http://www.eclipse.org/org/documents/edl-v10.php. | ||
* | ||
* SPDX-License-Identifier: BSD-3-Clause | ||
*******************************************************************************/ | ||
package org.eclipse.rdf4j.sail.lucene.impl; | ||
|
||
import java.util.Set; | ||
|
||
import org.apache.lucene.document.Document; | ||
import org.apache.lucene.search.ScoreDoc; | ||
import org.eclipse.rdf4j.sail.lucene.DocumentResult; | ||
import org.eclipse.rdf4j.sail.lucene.SearchDocument; | ||
|
||
public class LuceneDocumentResult implements DocumentResult { | ||
|
||
protected final ScoreDoc scoreDoc; | ||
|
||
protected final LuceneIndex index; | ||
|
||
private final Set<String> fields; | ||
|
||
private LuceneDocument fullDoc; | ||
|
||
public LuceneDocumentResult(ScoreDoc doc, LuceneIndex index, Set<String> fields) { | ||
this.scoreDoc = doc; | ||
this.index = index; | ||
this.fields = fields; | ||
} | ||
|
||
@Override | ||
public SearchDocument getDocument() { | ||
if (fullDoc == null) { | ||
Document doc = index.getDocument(scoreDoc.doc, fields); | ||
fullDoc = new LuceneDocument(doc, index.getSpatialStrategyMapper()); | ||
} | ||
return fullDoc; | ||
} | ||
} |
Oops, something went wrong.