Skip to content

Commit

Permalink
eclipse-rdf4jGH-5090 Lucene 9 version of the Lucene SAIL
Browse files Browse the repository at this point in the history
- Copied the Lucene SAIL implementation and upgraded it to Lucene 9

Signed-off-by: Richard Eckart de Castilho <[email protected]>
  • Loading branch information
reckart committed Jul 26, 2024
1 parent 2a998c3 commit 3db999c
Show file tree
Hide file tree
Showing 23 changed files with 5,124 additions and 0 deletions.
85 changes: 85 additions & 0 deletions core/sail/lucene-v9/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!-- Maven module descriptor for the rdf4j-sail-lucene-v9 artifact, a child of the rdf4j-sail parent POM. -->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.eclipse.rdf4j</groupId>
<artifactId>rdf4j-sail</artifactId>
<version>5.0.2-SNAPSHOT</version>
</parent>
<artifactId>rdf4j-sail-lucene-v9</artifactId>
<!-- NOTE(review): name and description do not mention Lucene 9; confirm they are distinguishable from any other Lucene Sail module in the build. -->
<name>RDF4J: Lucene Sail Index</name>
<description>StackableSail implementation offering full-text search on literals, based on Apache Lucene.</description>
<properties>
<!-- Single Lucene version applied to every org.apache.lucene dependency declared below. -->
<lucene.version>9.11.1</lucene.version>
</properties>
<dependencies>
<!-- RDF4J Lucene Sail API from the same project version. -->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>rdf4j-sail-lucene-api</artifactId>
<version>${project.version}</version>
</dependency>
<!-- Apache Lucene modules, all pinned to ${lucene.version}. -->
<!-- NOTE(review): LuceneDocument imports org.apache.lucene.sandbox.document.LatLonBoundingBox, but lucene-sandbox is not declared here; presumably it arrives transitively. Confirm, or declare it explicitly. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-spatial-extras</artifactId>
<version>${lucene.version}</version>
</dependency>
<!-- Runtime scope only: not placed on the compile classpath. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-backward-codecs</artifactId>
<version>${lucene.version}</version>
<scope>runtime</scope>
</dependency>
<!-- RDF4J modules needed by the tests only. -->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>rdf4j-queryalgebra-geosparql</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>rdf4j-rio-rdfxml</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>rdf4j-sail-memory</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<!-- No version declared for the two dependencies below; presumably managed by a parent POM (TODO confirm). -->
<dependency>
<groupId>org.locationtech.jts</groupId>
<artifactId>jts-core</artifactId>
</dependency>
<dependency>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/*******************************************************************************
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.sail.lucene.impl;

import java.io.IOException;
import java.text.ParseException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LatLonPoint;
import org.apache.lucene.document.LatLonShape;
import org.apache.lucene.geo.Line;
import org.apache.lucene.geo.Polygon;
import org.apache.lucene.geo.Rectangle;
import org.apache.lucene.geo.SimpleWKTShapeParser;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.sandbox.document.LatLonBoundingBox;
import org.apache.lucene.spatial.SpatialStrategy;
import org.eclipse.rdf4j.sail.lucene.LuceneSail;
import org.eclipse.rdf4j.sail.lucene.SearchDocument;
import org.eclipse.rdf4j.sail.lucene.SearchFields;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Function;

/**
 * {@link SearchDocument} implementation that wraps a Lucene {@link Document}.
 * <p>
 * Identifier, resource and context fields are written through the static helpers on {@link LuceneIndex}; property
 * values are read back directly from the wrapped document. Geometry values are parsed from WKT and indexed under
 * field names derived from the property name with the {@code _geo_} / {@code _pt_} prefixes.
 */
public class LuceneDocument implements SearchDocument {

    private final Document doc;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Prefix of the extra {@link LatLonPoint} field that is added for WKT points. */
    private static final String POINT_FIELD_PREFIX = "_pt_";

    /** Prefix of the field(s) that hold the indexed shape of a geometry. */
    private static final String GEO_FIELD_PREFIX = "_geo_";

    /** Supplied by the creator of this document; it is stored but not read anywhere inside this class. */
    private final Function<? super String, ? extends SpatialStrategy> geoStrategyMapper;

    /**
     * Creates a wrapper around a new, empty Lucene document.
     *
     * @param geoStrategyMapper mapper from a property name to a spatial strategy
     */
    public LuceneDocument(Function<? super String, ? extends SpatialStrategy> geoStrategyMapper) {
        this(new Document(), geoStrategyMapper);
    }

    /**
     * Creates a wrapper around an existing Lucene document.
     *
     * @param doc               the Lucene document to wrap; it is used as-is, not copied
     * @param geoStrategyMapper mapper from a property name to a spatial strategy
     */
    public LuceneDocument(Document doc, Function<? super String, ? extends SpatialStrategy> geoStrategyMapper) {
        this.doc = doc;
        this.geoStrategyMapper = geoStrategyMapper;
    }

    /**
     * Creates a wrapper around a new Lucene document and adds its id, resource and context fields.
     *
     * @param id                value passed to {@link LuceneIndex#addIDField}
     * @param resourceId        value passed to {@link LuceneIndex#addResourceField}
     * @param context           value passed to {@link LuceneIndex#addContextField}
     * @param geoStrategyMapper mapper from a property name to a spatial strategy
     */
    public LuceneDocument(String id, String resourceId, String context,
            Function<? super String, ? extends SpatialStrategy> geoStrategyMapper) {
        this(geoStrategyMapper);
        setId(id);
        setResource(resourceId);
        setContext(context);
    }

    private void setId(String id) {
        LuceneIndex.addIDField(id, doc);
    }

    private void setContext(String context) {
        LuceneIndex.addContextField(context, doc);
    }

    private void setResource(String resourceId) {
        LuceneIndex.addResourceField(resourceId, doc);
    }

    /**
     * @return the wrapped Lucene document (the live instance, not a copy)
     */
    public Document getDocument() {
        return doc;
    }

    @Override
    public String getId() {
        return doc.get(SearchFields.ID_FIELD_NAME);
    }

    @Override
    public String getResource() {
        return doc.get(SearchFields.URI_FIELD_NAME);
    }

    @Override
    public String getContext() {
        return doc.get(SearchFields.CONTEXT_FIELD_NAME);
    }

    /**
     * Collects the distinct names of all fields that {@link SearchFields#isPropertyField(String)} accepts.
     *
     * @return a new, mutable set of property field names
     */
    @Override
    public Set<String> getPropertyNames() {
        Set<String> names = new HashSet<>();
        for (IndexableField field : doc.getFields()) {
            String name = field.name();
            if (SearchFields.isPropertyField(name)) {
                names.add(name);
            }
        }
        return names;
    }

    @Override
    public void addProperty(String name) {
        // don't need to do anything
    }

    /**
     * Stores and indexes a property in a Document. We don't have to recalculate the concatenated text: just add another
     * TEXT field and Lucene will take care of this. Additional advantage: Lucene may be able to handle the individual
     * strings in a way that may affect e.g. phrase and proximity searches (concatenation basically means loss of
     * information). NOTE: The TEXT_FIELD_NAME has to be stored, see in LuceneSail
     *
     * @param name the property (field) name
     * @param text the property value
     * @see LuceneSail
     */
    @Override
    public void addProperty(String name, String text) {
        LuceneIndex.addPredicateField(name, text, doc);
        LuceneIndex.addTextField(text, doc);
    }

    /**
     * Checks whether a field occurs with a specified value in a Document.
     *
     * @param fieldName name of the field to inspect
     * @param value     value to look for; must not be {@code null}
     * @return {@code true} if one of the field's values equals {@code value}
     */
    @Override
    public boolean hasProperty(String fieldName, String value) {
        String[] fields = doc.getValues(fieldName);
        if (fields != null) {
            for (String field : fields) {
                if (value.equals(field)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * @param name name of the field to read
     * @return a fixed-size list view over the values returned by {@link Document#getValues(String)}
     */
    @Override
    public List<String> getProperty(String name) {
        return Arrays.asList(doc.getValues(name));
    }

    /**
     * Adds the index fields for one parsed WKT geometry to the wrapped document.
     * <p>
     * Arrays are treated as collections and indexed element by element. A {@code null} shape is skipped.
     *
     * @param shape the object produced by {@link SimpleWKTShapeParser#parse(String)}; may be {@code null}
     * @param field the property name the geometry belongs to
     * @throws IllegalArgumentException if the shape is of a type this method does not handle
     */
    private void indexShape(Object shape, String field) {
        if (shape == null) {
            // The WKT parser can hand back null (e.g. for an EMPTY geometry). There is nothing to index, and
            // falling through would fail with a NullPointerException while building the error message below.
            return;
        }

        if (shape instanceof Object[]) { // case of GEOMETRYCOLLECTION
            for (Object geometry : (Object[]) shape) {
                indexShape(geometry, field);
            }
            return;
        }

        String geoField = GEO_FIELD_PREFIX + field;
        if (shape instanceof Polygon) { // WKT:POLYGON
            for (Field f : LatLonShape.createIndexableFields(geoField, (Polygon) shape)) {
                doc.add(f);
            }
        } else if (shape instanceof Line) { // WKT:LINESTRING
            for (Field f : LatLonShape.createIndexableFields(geoField, (Line) shape)) {
                doc.add(f);
            }
        } else if (shape instanceof double[]) { // WKT:POINT
            double[] point = (double[]) shape;

            // index 1 is passed as latitude and index 0 as longitude
            for (Field f : LatLonShape.createIndexableFields(geoField, point[1], point[0])) {
                doc.add(f);
            }
            doc.add(new LatLonPoint(POINT_FIELD_PREFIX + field, point[1], point[0]));
        } else if (shape instanceof Rectangle) { // WKT:ENVELOPE / RECTANGLE
            Rectangle box = (Rectangle) shape;
            doc.add(new LatLonBoundingBox(geoField, box.minLat, box.minLon, box.maxLat, box.maxLon));
        } else {
            throw new IllegalArgumentException("Geometry for shape " + shape.toString() + " is not supported");
        }
    }

    /**
     * Stores the raw WKT value of a geo property and, if it can be parsed, adds the fields that index its shape.
     * <p>
     * Parse and I/O failures are logged at warn level and otherwise ignored, so the stored-only field is still
     * present when the geometry could not be indexed.
     *
     * @param field the property (field) name
     * @param value the geometry as a WKT string
     */
    @Override
    public void addGeoProperty(String field, String value) {
        LuceneIndex.addStoredOnlyPredicateField(field, value, doc);
        try {
            Object shape = SimpleWKTShapeParser.parse(value);
            indexShape(shape, field);
        } catch (ParseException e) {
            logger.warn("error while processing geo property", e);
        } catch (IOException e) {
            logger.warn("error while parsing wkt geometry", e);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*******************************************************************************
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.sail.lucene.impl;

import java.text.ParseException;
import java.util.List;
import java.util.Set;

import org.apache.lucene.search.ScoreDoc;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.sail.lucene.DocumentDistance;
import org.eclipse.rdf4j.sail.lucene.SearchFields;
import org.eclipse.rdf4j.sail.lucene.util.GeoUnits;
import org.locationtech.spatial4j.shape.Point;
import org.locationtech.spatial4j.shape.Shape;

import com.google.common.collect.Sets;

/**
 * A {@link LuceneDocumentResult} that can additionally report how far the matched document's geometry lies from a
 * given origin point.
 */
public class LuceneDocumentDistance extends LuceneDocumentResult implements DocumentDistance {

    /** Name of the field whose WKT values are measured. */
    private final String geoProperty;

    /** Unit passed to {@link GeoUnits#fromDegrees} when converting the result. */
    private final IRI units;

    /** Point the distance is measured from. */
    private final Point origin;

    /**
     * Builds the set of fields to load: the resource URI, the geo property and, optionally, the context.
     */
    private static Set<String> requiredFields(String geoProperty, boolean includeContext) {
        Set<String> needed = Sets.newHashSet(SearchFields.URI_FIELD_NAME, geoProperty);
        if (!includeContext) {
            return needed;
        }
        needed.add(SearchFields.CONTEXT_FIELD_NAME);
        return needed;
    }

    /**
     * @param doc            the Lucene hit this result represents
     * @param geoProperty    name of the field holding the WKT geometry
     * @param units          unit the distance is reported in
     * @param origin         point the distance is measured from
     * @param includeContext whether the context field should be loaded as well
     * @param index          index used to load the document and to obtain the spatial context
     */
    public LuceneDocumentDistance(ScoreDoc doc, String geoProperty, IRI units, Point origin, boolean includeContext,
            LuceneIndex index) {
        super(doc, index, requiredFields(geoProperty, includeContext));
        this.geoProperty = geoProperty;
        this.units = units;
        this.origin = origin;
    }

    /**
     * Returns the smallest distance between the origin and the center of any shape stored under the geo property,
     * converted with {@link GeoUnits#fromDegrees}. Values that cannot be parsed as WKT are skipped; if no value
     * yields a distance, {@link Double#POSITIVE_INFINITY} is what gets converted.
     */
    @Override
    public double getDistance() {
        double closest = Double.POSITIVE_INFINITY;
        List<String> storedShapes = getDocument().getProperty(geoProperty);
        for (String wkt : storedShapes) {
            try {
                Shape parsed = index.getSpatialContext(geoProperty).readShapeFromWkt(wkt);
                double separation = index.getSpatialContext(geoProperty).calcDistance(parsed.getCenter(), origin);
                closest = Math.min(separation, closest);
            } catch (ParseException e) {
                // ignore
            }
        }
        return GeoUnits.fromDegrees(closest, units);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*******************************************************************************
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.sail.lucene.impl;

import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.search.ScoreDoc;
import org.eclipse.rdf4j.sail.lucene.DocumentResult;
import org.eclipse.rdf4j.sail.lucene.SearchDocument;

/**
 * {@link DocumentResult} for a single Lucene hit. The underlying document is fetched from the index on the first call
 * to {@link #getDocument()} and the resulting wrapper is reused afterwards.
 */
public class LuceneDocumentResult implements DocumentResult {

    /** The Lucene hit this result represents. */
    protected final ScoreDoc scoreDoc;

    /** Index the document is loaded from. */
    protected final LuceneIndex index;

    /** Field names handed to the index when the document is loaded. */
    private final Set<String> fields;

    /** Cached wrapper; {@code null} until the document has been loaded. */
    private LuceneDocument fullDoc;

    /**
     * @param doc    the Lucene hit
     * @param index  index used to load the document
     * @param fields names of the fields to load
     */
    public LuceneDocumentResult(ScoreDoc doc, LuceneIndex index, Set<String> fields) {
        this.scoreDoc = doc;
        this.index = index;
        this.fields = fields;
    }

    /**
     * Returns the document for this hit, loading it from the index on first use. No synchronization is applied
     * around the lazy load.
     */
    @Override
    public SearchDocument getDocument() {
        if (fullDoc != null) {
            return fullDoc;
        }
        Document loaded = index.getDocument(scoreDoc.doc, fields);
        fullDoc = new LuceneDocument(loaded, index.getSpatialStrategyMapper());
        return fullDoc;
    }
}
Loading

0 comments on commit 3db999c

Please sign in to comment.