Skip to content

Commit

Permalink
GH-5149 numdocs (#5202)
Browse files Browse the repository at this point in the history
  • Loading branch information
hmottestad authored Nov 20, 2024
2 parents f687e85 + 5833e57 commit 8bb861d
Show file tree
Hide file tree
Showing 21 changed files with 405 additions and 82 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public interface Model extends Set<Statement>, Serializable, NamespaceAware {
*/
default Namespace setNamespace(String prefix, String name) {
Optional<? extends Namespace> result = getNamespace(prefix);
if (!result.isPresent() || !result.get().getName().equals(name)) {
if (result.isEmpty() || !result.get().getName().equals(name)) {
result = Optional.of(new ModelNamespace(prefix, name));
setNamespace(result.get());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ public static Optional<Resource> getSubjectByType(Model model, IRI type, IRI leg

private static void logDiscrepancyWarning(Optional<? extends Value> preferred,
Optional<? extends Value> fallback) {
if (!fallback.isEmpty() && !preferred.equals(fallback)) {
if (fallback.isPresent() && !preferred.equals(fallback)) {
var msg = "Discrepancy between use of the old and new config vocabulary.";
// depending on whether preferred is set, we log on warn or debug
if (preferred.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ private static boolean isomorphicSingleContext(Model model1, Model model2) {

// Because we have previously already checked that the models are the same size, we don't have to check both
// ways to establish model equality.
return !missingInModel2.isPresent();
return missingInModel2.isEmpty();
}

private static boolean mappingsIncompatible(Map<BNode, HashCode> mapping1, Map<BNode, HashCode> mapping2) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public Literal evaluate(ValueFactory valueFactory, Value... args) throws ValueEx
Literal rightLit = (Literal) rightVal;

if (leftLit.getLanguage().isPresent()) {
if (!rightLit.getLanguage().isPresent() || rightLit.getLanguage().equals(leftLit.getLanguage())) {
if (rightLit.getLanguage().isEmpty() || rightLit.getLanguage().equals(leftLit.getLanguage())) {

String leftLexVal = leftLit.getLabel();
String rightLexVal = rightLit.getLabel();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ public static Literal createLiteral(String label, String lang, IRI datatype, Par
try {
// Removes datatype for langString datatype with no language tag when VERIFY_DATATYPE_VALUES is False.
if ((workingDatatype == null || RDF.LANGSTRING.equals(workingDatatype))
&& (!workingLang.isPresent() || workingLang.get().isEmpty())
&& (workingLang.isEmpty() || workingLang.get().isEmpty())
&& !parserConfig.get(BasicParserSettings.VERIFY_DATATYPE_VALUES)) {
workingLang = Optional.ofNullable(null);
workingDatatype = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public void handleStatement(RDFDataset result, Statement nextStatement) {

// In RDF-1.1, RDF-1.0 Plain Literals are now Typed Literals with
// type xsd:String
if (!literal.getLanguage().isPresent() && datatype == null) {
if (literal.getLanguage().isEmpty() && datatype == null) {
datatype = XSD.STRING.stringValue();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,13 @@

/**
* Requires an Elasticsearch cluster with the DeleteByQuery plugin.
*
* <p>
* Note that, while RDF4J is licensed under the EDL, several ElasticSearch dependencies are licensed under the Elastic
* license or the SSPL, which may have implications for some projects.
*
* <p>
* Please consult the ElasticSearch website and license FAQ for more information.
*
* @see <a href="https://www.elastic.co/licensing/elastic-license/faq">Elastic License FAQ</a>
*
* @see LuceneSail
*/
public class ElasticsearchIndex extends AbstractSearchIndex {
Expand Down Expand Up @@ -409,13 +408,8 @@ protected SearchDocument getDocument(String id) throws IOException {
@Override
protected Iterable<? extends SearchDocument> getDocuments(String resourceId) throws IOException {
SearchHits hits = getDocuments(QueryBuilders.termQuery(SearchFields.URI_FIELD_NAME, resourceId));
return Iterables.transform(hits, new Function<>() {

@Override
public SearchDocument apply(SearchHit hit) {
return new ElasticsearchDocument(hit, geoContextMapper);
}
});
return Iterables.transform(hits,
(Function<SearchHit, SearchDocument>) hit -> new ElasticsearchDocument(hit, geoContextMapper));
}

@Override
Expand Down Expand Up @@ -577,18 +571,26 @@ protected Iterable<? extends DocumentScore> query(Resource subject, QuerySpec sp
}

SearchHits hits;
if (subject != null) {
hits = search(subject, request, qb);

int numDocs;

Integer specNumDocs = spec.getNumDocs();
if (specNumDocs != null) {
if (specNumDocs < 0) {
throw new IllegalArgumentException("numDocs must be >= 0");
}
numDocs = specNumDocs;
} else {
hits = search(request, qb);
numDocs = -1;
}
return Iterables.transform(hits, new Function<>() {

@Override
public DocumentScore apply(SearchHit hit) {
return new ElasticsearchDocumentScore(hit, geoContextMapper);
}
});
if (subject != null) {
hits = search(subject, request, qb, numDocs);
} else {
hits = search(request, qb, numDocs);
}
return Iterables.transform(hits,
(Function<SearchHit, DocumentScore>) hit -> new ElasticsearchDocumentScore(hit, geoContextMapper));
}

/**
Expand All @@ -600,11 +602,24 @@ public DocumentScore apply(SearchHit hit) {
* @return search hits
*/
public SearchHits search(Resource resource, SearchRequestBuilder request, QueryBuilder query) {
return search(resource, request, query, -1);
}

/**
* Evaluates the given query only for the given resource.
*
* @param resource
* @param request
* @param query
* @param numDocs
* @return search hits
*/
public SearchHits search(Resource resource, SearchRequestBuilder request, QueryBuilder query, int numDocs) {
// rewrite the query
QueryBuilder idQuery = QueryBuilders.termQuery(SearchFields.URI_FIELD_NAME,
SearchFields.getResourceID(resource));
QueryBuilder combinedQuery = QueryBuilders.boolQuery().must(idQuery).must(query);
return search(request, combinedQuery);
return search(request, combinedQuery, numDocs);
}

@Override
Expand Down Expand Up @@ -686,13 +701,8 @@ protected Iterable<? extends DocumentResult> geoRelationQuery(String relation, I

SearchRequestBuilder request = client.prepareSearch();
SearchHits hits = search(request, QueryBuilders.boolQuery().must(qb).filter(fb));
return Iterables.transform(hits, new Function<>() {

@Override
public DocumentResult apply(SearchHit hit) {
return new ElasticsearchDocumentResult(hit, geoContextMapper);
}
});
return Iterables.transform(hits,
(Function<SearchHit, DocumentResult>) hit -> new ElasticsearchDocumentResult(hit, geoContextMapper));
}

private ShapeRelation toSpatialOp(String relation) {
Expand All @@ -712,25 +722,43 @@ private ShapeRelation toSpatialOp(String relation) {
* Evaluates the given query and returns the results as a TopDocs instance.
*/
public SearchHits search(SearchRequestBuilder request, QueryBuilder query) {
return search(request, query, -1);
}

/**
* Evaluates the given query and returns the results as a TopDocs instance.
*/
public SearchHits search(SearchRequestBuilder request, QueryBuilder query, int numDocs) {
String[] types = getTypes();
int nDocs;
if (maxDocs > 0) {
nDocs = maxDocs;
} else {

if (numDocs < -1) {
throw new IllegalArgumentException("numDocs should be 0 or greater if defined by the user");
}

int size = defaultNumDocs;
if (numDocs >= 0) {
// If the user has set numDocs we will use that. If it is 0 then the implementation may end up throwing an
// exception.
size = Math.min(maxDocs, numDocs);
}

if (size < 0) {
// defaultNumDocs is not set
long docCount = client.prepareSearch(indexName)
.setTypes(types)
.setSource(new SearchSourceBuilder().size(0).query(query))
.get()
.getHits()
.getTotalHits().value;
nDocs = Math.max((int) Math.min(docCount, Integer.MAX_VALUE), 1);
size = Math.max((int) Math.min(docCount, maxDocs), 1);
}

SearchResponse response = request.setIndices(indexName)
.setTypes(types)
.setVersion(false)
.seqNoAndPrimaryTerm(true)
.setQuery(query)
.setSize(nDocs)
.setSize(size)
.execute()
.actionGet();
return response.getHits();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.eclipse.rdf4j.query.MalformedQueryException;
import org.eclipse.rdf4j.query.algebra.Var;
import org.eclipse.rdf4j.query.algebra.evaluation.QueryBindingSet;
import org.eclipse.rdf4j.sail.Sail;
import org.eclipse.rdf4j.sail.SailException;
import org.eclipse.rdf4j.sail.lucene.util.MapOfListMaps;
import org.locationtech.spatial4j.context.SpatialContext;
Expand All @@ -65,7 +66,8 @@ public abstract class AbstractSearchIndex implements SearchIndex {
REJECTED_DATATYPES.add("http://www.w3.org/2001/XMLSchema#float");
}

protected int maxDocs;
protected int defaultNumDocs = -1;
protected int maxDocs = Integer.MAX_VALUE;

protected Set<String> wktFields = Collections.singleton(SearchFields.getPropertyField(GEO.AS_WKT));

Expand All @@ -75,8 +77,29 @@ public abstract class AbstractSearchIndex implements SearchIndex {

@Override
public void initialize(Properties parameters) throws Exception {
String maxDocParam = parameters.getProperty(LuceneSail.MAX_DOCUMENTS_KEY);
maxDocs = (maxDocParam != null) ? Integer.parseInt(maxDocParam) : -1;
String maxDocumentsParam = parameters.getProperty(LuceneSail.MAX_DOCUMENTS_KEY);
String defaultNumDocsParam = parameters.getProperty(LuceneSail.DEFAULT_NUM_DOCS_KEY);

if ((maxDocumentsParam != null)) {
maxDocs = Integer.parseInt(maxDocumentsParam);

// if maxDocs is set then defaultNumDocs is set to maxDocs if it is not set, because we now have a known
// upper limit
defaultNumDocs = (defaultNumDocsParam != null) ? Math.min(maxDocs, Integer.parseInt(defaultNumDocsParam))
: maxDocs;
} else {
// we can never return more than Integer.MAX_VALUE documents
maxDocs = Integer.MAX_VALUE;

// legacy behaviour is to return the number of documents that the query would return if there was no limit,
// so if the defaultNumDocs is not set, we set it to -1 to signal that there is no limit
defaultNumDocs = (defaultNumDocsParam != null) ? Integer.parseInt(defaultNumDocsParam) : -1;
}

if (defaultNumDocs > maxDocs) {
throw new IllegalArgumentException(LuceneSail.DEFAULT_NUM_DOCS_KEY + " must be less than or equal to "
+ LuceneSail.MAX_DOCUMENTS_KEY + " (" + defaultNumDocs + " > " + maxDocs + ")");
}

String wktFieldParam = parameters.getProperty(LuceneSail.WKT_FIELDS);
if (wktFieldParam != null) {
Expand Down Expand Up @@ -146,7 +169,7 @@ public boolean accept(Literal literal) {

// we reject literals that aren't in the list of the indexed lang
if (indexedLangs != null
&& (!literal.getLanguage().isPresent()
&& (literal.getLanguage().isEmpty()
|| !indexedLangs.contains(literal.getLanguage().get().toLowerCase()
))) {
return false;
Expand Down Expand Up @@ -353,11 +376,8 @@ public final synchronized void addRemoveStatements(Collection<Statement> added,
// remove value from both property field and the
// corresponding text field
String field = SearchFields.getPropertyField(r.getPredicate());
Set<String> removedValues = removedOfResource.get(field);
if (removedValues == null) {
removedValues = new HashSet<>();
removedOfResource.put(field, removedValues);
}
Set<String> removedValues = removedOfResource.computeIfAbsent(field,
k -> new HashSet<>());
removedValues.add(val);
}
}
Expand Down Expand Up @@ -545,7 +565,8 @@ private Iterable<? extends DocumentScore> evaluateQuery(QuerySpec query) {
hits = query(query.getSubject(), query);
}
} catch (Exception e) {
logger.error("There was a problem evaluating query '" + query.getCatQuery() + "'!", e);
logger.error("There was a problem evaluating query '{}'!", query.getCatQuery(), e);
assert false : "There was a problem evaluating query '" + query.getCatQuery() + "'!";
}

return hits;
Expand Down Expand Up @@ -717,6 +738,8 @@ private Iterable<? extends DocumentDistance> evaluateQuery(DistanceQuerySpec que
} catch (Exception e) {
logger.error("There was a problem evaluating distance query 'within " + distance + getUnitSymbol(units)
+ " of " + from.getLabel() + "'!", e);
assert false : "There was a problem evaluating distance query 'within " + distance + getUnitSymbol(units)
+ " of " + from.getLabel() + "'!";
}

return hits;
Expand Down Expand Up @@ -825,6 +848,8 @@ private Iterable<? extends DocumentResult> evaluateQuery(GeoRelationQuerySpec qu
} catch (Exception e) {
logger.error("There was a problem evaluating spatial relation query '" + query.getRelation() + " "
+ qgeom.getLabel() + "'!", e);
assert false : "There was a problem evaluating spatial relation query '" + query.getRelation() + " "
+ qgeom.getLabel() + "'!";
}

return hits;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,17 @@ public class LuceneSail extends NotifyingSailWrapper {
public static final String LUCENE_RAMDIR_KEY = "useramdir";

/**
* Set the key "maxDocuments=&lt;n&gt;" as sail parameter to limit the maximum number of documents to return from a
* search query. The default is to return all documents. NB: this may involve extra cost for some SearchIndex
* Set the key "defaultNumDocs=&lt;n&gt;" as sail parameter to limit the maximum number of documents to return from
* a search query. The default is to return all documents. NB: this may involve extra cost for some SearchIndex
* implementations as they may have to determine this number.
*/
public static final String DEFAULT_NUM_DOCS_KEY = "defaultNumDocs";

/**
* Set the key "maxDocuments=&lt;n&gt;" as sail parameter to limit the maximum number of documents the user can
* query at a time to return from a search query. The default is the value of the {@link #DEFAULT_NUM_DOCS_KEY}
* parameter.
*/
public static final String MAX_DOCUMENTS_KEY = "maxDocuments";

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public class LuceneSailSchema {

public static final IRI CONTEXT;

public static final IRI NUM_DOCS;

static {
ValueFactory factory = SimpleValueFactory.getInstance(); // compatible with beta4:
// creating a new factory
Expand All @@ -73,5 +75,6 @@ public class LuceneSailSchema {
WITHIN_DISTANCE = factory.createIRI(NAMESPACE + "withinDistance");
DISTANCE = factory.createIRI(NAMESPACE + "distance");
CONTEXT = factory.createIRI(NAMESPACE + "context");
NUM_DOCS = factory.createIRI(NAMESPACE + "numDocs");
}
}
Loading

0 comments on commit 8bb861d

Please sign in to comment.