Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-5149 Lucene numdocs param #5163

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.Set;

Expand Down Expand Up @@ -577,10 +578,11 @@ protected Iterable<? extends DocumentScore> query(Resource subject, QuerySpec sp
}

SearchHits hits;
int numDocs = Objects.requireNonNullElse(spec.getNumDocs(), -1);
if (subject != null) {
hits = search(subject, request, qb);
hits = search(subject, request, qb, numDocs);
} else {
hits = search(request, qb);
hits = search(request, qb, numDocs);
}
return Iterables.transform(hits, new Function<>() {

Expand All @@ -600,11 +602,24 @@ public DocumentScore apply(SearchHit hit) {
* @return search hits
*/
public SearchHits search(Resource resource, SearchRequestBuilder request, QueryBuilder query) {
// No per-query limit requested: -1 is the same sentinel query() passes when the QuerySpec carries no numDocs.
return search(resource, request, query, -1);
}

/**
* Evaluates the given query only for the given resource.
*
* @param resource
* @param request
* @param query
* @param numDocs number of hits requested by the query; a non-positive value means no explicit request was made
* @return search hits
*/
public SearchHits search(Resource resource, SearchRequestBuilder request, QueryBuilder query, int numDocs) {
// rewrite the query
QueryBuilder idQuery = QueryBuilders.termQuery(SearchFields.URI_FIELD_NAME,
SearchFields.getResourceID(resource));
QueryBuilder combinedQuery = QueryBuilders.boolQuery().must(idQuery).must(query);
return search(request, combinedQuery);
return search(request, combinedQuery, numDocs);
}

@Override
Expand Down Expand Up @@ -712,10 +727,23 @@ private ShapeRelation toSpatialOp(String relation) {
* Evaluates the given query and returns the results as a SearchHits instance.
*/
public SearchHits search(SearchRequestBuilder request, QueryBuilder query) {
// -1 is not > 0, so the numDocs-aware overload skips its per-query branch and falls back to its default sizing.
return search(request, query, -1);
}

/**
* Evaluates the given query and returns the results as a SearchHits instance; a positive numDocs is the number of
* hits requested by the query.
*/
public SearchHits search(SearchRequestBuilder request, QueryBuilder query, int numDocs) {
String[] types = getTypes();
int nDocs;
if (maxDocs > 0) {
nDocs = maxDocs;
if (numDocs > 0) {
ate47 marked this conversation as resolved.
Show resolved Hide resolved
if (maxDocs > 0 && maxDocs < numDocs) {
nDocs = maxDocs;
} else {
nDocs = numDocs;
}
} else if (defaultNumDocs > 0) {
nDocs = defaultNumDocs;
} else {
long docCount = client.prepareSearch(indexName)
.setTypes(types)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ public abstract class AbstractSearchIndex implements SearchIndex {
REJECTED_DATATYPES.add("http://www.w3.org/2001/XMLSchema#float");
}

protected int defaultNumDocs;
protected int maxDocs;

protected Set<String> wktFields = Collections.singleton(SearchFields.getPropertyField(GEO.AS_WKT));
Expand All @@ -75,8 +76,10 @@ public abstract class AbstractSearchIndex implements SearchIndex {

@Override
public void initialize(Properties parameters) throws Exception {
String maxDocParam = parameters.getProperty(LuceneSail.MAX_DOCUMENTS_KEY);
maxDocs = (maxDocParam != null) ? Integer.parseInt(maxDocParam) : -1;
String maxDocumentsParam = parameters.getProperty(LuceneSail.MAX_DOCUMENTS_KEY);
maxDocs = (maxDocumentsParam != null) ? Integer.parseInt(maxDocumentsParam) : -1;
String defaultNumDocsParam = parameters.getProperty(LuceneSail.DEFAULT_NUM_DOCS_KEY);
defaultNumDocs = (defaultNumDocsParam != null) ? Integer.parseInt(defaultNumDocsParam) : defaultNumDocs;

String wktFieldParam = parameters.getProperty(LuceneSail.WKT_FIELDS);
if (wktFieldParam != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,17 @@ public class LuceneSail extends NotifyingSailWrapper {
public static final String LUCENE_RAMDIR_KEY = "useramdir";

/**
* Set the key "maxDocuments=&lt;n&gt;" as sail parameter to limit the maximum number of documents to return from a
* search query. The default is to return all documents. NB: this may involve extra cost for some SearchIndex
* Set the key "defaultNumDocs=&lt;n&gt;" as sail parameter to set the number of documents returned from a search
* query that does not request its own numDocs. The default is to return all documents. NB: this may involve extra cost for some SearchIndex
* implementations as they may have to determine this number.
*/
public static final String DEFAULT_NUM_DOCS_KEY = "defaultNumDocs";

/**
* Set the key "maxDocuments=&lt;n&gt;" as sail parameter to cap the number of documents that a single search query
* may request. The default is the value of the {@link #DEFAULT_NUM_DOCS_KEY}
* parameter.
*/
public static final String MAX_DOCUMENTS_KEY = "maxDocuments";

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public class LuceneSailSchema {

public static final IRI CONTEXT;

public static final IRI NUM_DOCS;

static {
ValueFactory factory = SimpleValueFactory.getInstance(); // compatible with beta4:
// creating a new factory
Expand All @@ -73,5 +75,6 @@ public class LuceneSailSchema {
WITHIN_DISTANCE = factory.createIRI(NAMESPACE + "withinDistance");
DISTANCE = factory.createIRI(NAMESPACE + "distance");
CONTEXT = factory.createIRI(NAMESPACE + "context");
NUM_DOCS = factory.createIRI(NAMESPACE + "numDocs");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
import java.util.stream.Collectors;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.query.algebra.QueryModelNode;
import org.eclipse.rdf4j.query.algebra.SingletonSet;
import org.eclipse.rdf4j.query.algebra.StatementPattern;
Expand Down Expand Up @@ -67,21 +69,43 @@ private static void append(Var var, StringBuilder buffer) {

private final StatementPattern idPattern;

private final StatementPattern numDocsPattern;

private final Resource subject;

private final String matchesVarName;

private final String scoreVarName;

private final Integer numDocs;

/**
 * Creates a query spec without a numDocs pattern. Delegates to the seven-argument constructor with a {@code null}
 * numDocs pattern, which leaves {@link #getNumDocs()} as {@code null}.
 */
public QuerySpec(StatementPattern matchesPattern, Collection<QueryParam> queryPatterns,
StatementPattern scorePattern, StatementPattern typePattern,
StatementPattern idPattern, Resource subject) {
this(matchesPattern, queryPatterns, scorePattern, typePattern, idPattern, null, subject);
}

public QuerySpec(StatementPattern matchesPattern, Collection<QueryParam> queryPatterns,
StatementPattern scorePattern, StatementPattern typePattern,
StatementPattern idPattern, StatementPattern numDocsPattern, Resource subject) {
this.matchesPattern = matchesPattern;
this.queryPatterns = queryPatterns;
this.scorePattern = scorePattern;
this.typePattern = typePattern;
this.idPattern = idPattern;
this.numDocsPattern = numDocsPattern;
this.subject = subject;
if (numDocsPattern != null) {
Value val = numDocsPattern.getObjectVar().getValue();
if (val != null && val.isLiteral()) {
this.numDocs = ((Literal) val).intValue();
} else {
throw new IllegalArgumentException("numDocs should be constant literal value");
}
} else {
this.numDocs = null;
}

if (matchesPattern != null) {
this.matchesVarName = matchesPattern.getSubjectVar().getName();
} else {
Expand All @@ -101,9 +125,11 @@ public QuerySpec(String matchesVarName, String propertyVarName, String scoreVarN
this.matchesPattern = null;
this.scorePattern = null;
this.typePattern = null;
this.numDocsPattern = null;
this.queryPatterns = Set.of();
this.idPattern = null;
this.subject = subject;
this.numDocs = null;
}

@Override
Expand All @@ -121,6 +147,7 @@ public QueryModelNode removeQueryPatterns() {
replace(getScorePattern(), replacement);
replace(getTypePattern(), replacement);
replace(getIdPattern(), replacement);
replace(getNumDocsPattern(), replacement);

final QueryModelNode placeholder = new SingletonSet();

Expand Down Expand Up @@ -154,6 +181,10 @@ public StatementPattern getScorePattern() {
return scorePattern;
}

/**
 * The numDocs statement pattern this spec was constructed with.
 *
 * @return the numDocs pattern, or {@code null} if none was supplied
 */
public StatementPattern getNumDocsPattern() {
return numDocsPattern;
}

/**
* The variable name associated with the query score
*
Expand All @@ -163,6 +194,10 @@ public String getScoreVariableName() {
return scoreVarName;
}

/**
 * The number of documents requested by the query, read at construction time from the literal object of the numDocs
 * pattern.
 *
 * @return the requested number of documents, or {@code null} if the spec has no numDocs pattern
 */
public Integer getNumDocs() {
return numDocs;
}

/**
 * The type statement pattern this spec was constructed with.
 *
 * @return the type pattern; may be {@code null}
 */
public StatementPattern getTypePattern() {
return typePattern;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.INDEXID;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.LUCENE_QUERY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.MATCHES;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.NUM_DOCS;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.PROPERTY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.QUERY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.SCORE;
Expand Down Expand Up @@ -152,7 +153,7 @@ public void process(TupleExpr tupleExpr, BindingSet bindings, Collection<SearchQ
}

// find the relevant outgoing patterns
StatementPattern typePattern, propertyPattern, scorePattern, snippetPattern;
StatementPattern typePattern, propertyPattern, scorePattern, snippetPattern, numDocsPattern;
List<StatementPattern> queryPatterns;

try {
Expand All @@ -161,6 +162,7 @@ public void process(TupleExpr tupleExpr, BindingSet bindings, Collection<SearchQ
propertyPattern = getPattern(matchesVar, filter.propertyPatterns);
scorePattern = getPattern(matchesVar, filter.scorePatterns);
snippetPattern = getPattern(matchesVar, filter.snippetPatterns);
numDocsPattern = getPattern(matchesVar, filter.numDocsPatterns);
} catch (IllegalArgumentException e) {
failOrWarn(e);
continue;
Expand Down Expand Up @@ -302,7 +304,8 @@ else if (propertyValue != null) {
queryString, propertyURI, null));
}

QuerySpec querySpec = new QuerySpec(matchesPattern, queries, scorePattern, typePattern, idPattern, subject);
QuerySpec querySpec = new QuerySpec(matchesPattern, queries, scorePattern, typePattern, idPattern,
numDocsPattern, subject);

if (querySpec.isEvaluable()) {
// constant optimizer
Expand Down Expand Up @@ -341,6 +344,10 @@ else if (propertyValue != null) {
funcCall.addArg(new ValueConstant(LuceneSailSchema.SNIPPET));
funcCall.addResultVar(snippetVar);
}
if (numDocsPattern != null) {
funcCall.addArg(new ValueConstant(LuceneSailSchema.NUM_DOCS));
funcCall.addArg(numDocsPattern.getObjectVar());
}

Join join = new Join();
matchesPattern.replaceWith(join);
Expand Down Expand Up @@ -465,6 +472,8 @@ private static class PatternFilter extends AbstractQueryModelVisitor<RuntimeExce

public ArrayList<StatementPattern> boostPatterns = new ArrayList<>();

public ArrayList<StatementPattern> numDocsPatterns = new ArrayList<>();

/**
* Method implementing the visitor pattern that gathers all statements using a predicate from the LuceneSail's
* namespace.
Expand All @@ -487,6 +496,8 @@ public void meet(StatementPattern node) {
idPatterns.add(node);
} else if (BOOST.equals(predicate)) {
boostPatterns.add(node);
} else if (NUM_DOCS.equals(predicate)) {
numDocsPatterns.add(node);
} else if (TYPE.equals(predicate)) {
Value object = node.getObjectVar().getValue();
if (LUCENE_QUERY.equals(object)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.BOOST;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.LUCENE_QUERY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.MATCHES;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.NUM_DOCS;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.QUERY;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.SCORE;
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.SNIPPET;
Expand Down Expand Up @@ -55,6 +56,7 @@ public void testQueryInterpretation() {
"<" + TYPE + "> <" + LUCENE_QUERY + ">; " +
"<" + QUERY + "> \"my Lucene query\"; " +
"<" + SCORE + "> ?Score; " +
"<" + NUM_DOCS + "> 76; " +
"<" + SNIPPET + "> ?Snippet ]. } ";
ParsedQuery query = parser.parseQuery(buffer, null);
TupleExpr tupleExpr = query.getTupleExpr();
Expand All @@ -69,6 +71,8 @@ public void testQueryInterpretation() {
assertEquals("Score", querySpec.getScorePattern().getObjectVar().getName());
assertEquals("Snippet", param.getSnippetPattern().getObjectVar().getName());
assertEquals(LUCENE_QUERY, querySpec.getTypePattern().getObjectVar().getValue());
assertEquals(76, querySpec.getNumDocs());
assertEquals(76, ((Literal) querySpec.getNumDocsPattern().getObjectVar().getValue()).intValue());
assertEquals("my Lucene query", param.getQuery());
assertNull(querySpec.getSubject());
}
Expand All @@ -80,11 +84,13 @@ public void testMultipleQueriesInterpretation() {
"<" + TYPE + "> <" + LUCENE_QUERY + ">; " +
"<" + QUERY + "> \"my Lucene query\"; " +
"<" + SCORE + "> ?score1; " +
"<" + NUM_DOCS + "> 86; " +
"<" + SNIPPET + "> ?snippet1 ]. " +
" ?sub2 <" + MATCHES + "> [ " +
"<" + TYPE + "> <" + LUCENE_QUERY + ">; " +
"<" + QUERY + "> \"second lucene query\"; " +
"<" + SCORE + "> ?score2; " +
"<" + NUM_DOCS + "> 13; " +
"<" + SNIPPET + "> ?snippet2 ]. " +
// and connect them both via any X in between, just as salt to make the
// parser do something
Expand All @@ -103,6 +109,7 @@ public void testMultipleQueriesInterpretation() {
// Matched the first
assertEquals("sub1", querySpec.getMatchesPattern().getSubjectVar().getName());
assertEquals(1, querySpec.getQueryPatterns().size());
assertEquals(86, querySpec.getNumDocs());
QuerySpec.QueryParam param = querySpec.getQueryPatterns().iterator().next();
assertEquals("my Lucene query",
((Literal) param.getQueryPattern().getObjectVar().getValue()).getLabel());
Expand All @@ -116,6 +123,7 @@ public void testMultipleQueriesInterpretation() {
// and the second
assertEquals("sub2", querySpec.getMatchesPattern().getSubjectVar().getName());
assertEquals(1, querySpec.getQueryPatterns().size());
assertEquals(13, querySpec.getNumDocs());
QuerySpec.QueryParam param = querySpec.getQueryPatterns().iterator().next();
assertEquals("second lucene query",
((Literal) param.getQueryPattern().getObjectVar().getValue()).getLabel());
Expand Down
Loading
Loading