Skip to content

Commit

Permalink
boostedfields for query
Browse files Browse the repository at this point in the history
  • Loading branch information
kaipoykio committed Nov 5, 2024
1 parent fd2d84a commit e11768d
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 6 deletions.
11 changes: 8 additions & 3 deletions rest/src/main/groovy/whelk/rest/api/SearchUtils2.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import whelk.search2.Spell;
import whelk.search2.Stats;
import whelk.search2.querytree.QueryTree;
import whelk.search2.BoostedFields;

import java.io.IOException;
import java.util.*;
Expand All @@ -37,6 +38,10 @@ Map<String, Object> doSearch(Map<String, String[]> queryParameters) throws Inval
throw new WhelkRuntimeException("ElasticSearch not configured.");
}

BoostedFields bf = new BoostedFields(queryUtil.whelk);

List<String> boostedfields = bf.boostedFields(queryParameters, queryUtil.lensBoost);

QueryParams queryParams = new QueryParams(queryParameters);

if (queryParams.q.isEmpty()) {
Expand All @@ -60,7 +65,7 @@ Map<String, Object> doSearch(Map<String, String[]> queryParameters) throws Inval
qTree.addFilters(queryParams, appParams);
qTree.setOutsetType(disambiguate);

Map<String, Object> esQueryDsl = getEsQueryDsl(qTree, queryParams, appParams.statsRepr);
Map<String, Object> esQueryDsl = getEsQueryDsl(qTree, queryParams, appParams.statsRepr, boostedfields);

QueryResult queryRes = new QueryResult(queryUtil.query(esQueryDsl));

Expand All @@ -73,9 +78,9 @@ Map<String, Object> doSearch(Map<String, String[]> queryParameters) throws Inval
return partialCollectionView;
}

private Map<String, Object> getEsQueryDsl(QueryTree queryTree, QueryParams queryParams, AppParams.StatsRepr statsRepr) {
private Map<String, Object> getEsQueryDsl(QueryTree queryTree, QueryParams queryParams, AppParams.StatsRepr statsRepr, List<String> boostedfields) {
var queryDsl = new LinkedHashMap<String, Object>();
queryDsl.put("query", queryTree.toEs(queryUtil, disambiguate));
queryDsl.put("query", queryTree.toEs(queryUtil, disambiguate, boostedfields));
queryDsl.put("size", queryParams.limit);
queryDsl.put("from", queryParams.offset);
queryDsl.put("sort", (queryParams.sortBy == Sort.DEFAULT_BY_RELEVANCY && queryTree.isWild()
Expand Down
256 changes: 256 additions & 0 deletions whelk-core/src/main/groovy/whelk/search2/BoostedFields.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
package whelk.search2

import groovy.transform.CompileStatic
import groovy.transform.TypeCheckingMode
import whelk.search.ESQueryLensBoost
import whelk.Whelk
import whelk.JsonLd
import whelk.util.DocumentUtil

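/*
Reference notes: where the boosted fields end up.

rest/api/SearchUtils2 builds the Elasticsearch query DSL in:
    private Map<String, Object> getEsQueryDsl(QueryTree queryTree, QueryParams queryParams, AppParams.StatsRepr statsRepr)
        queryDsl.put("query", queryTree.toEs(queryUtil, disambiguate));

Related signature (its location is not shown here; presumably the method the logic below was adapted from - confirm):
    Map getESQuery(Map<String, String[]> ogQueryParameters, String suggest = null, String spell = null) {
*/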


//private Map<String, List<String>> boostFieldsByType = [:]
//private ESQueryLensBoost lensBoost

/**
 * Resolves the list of boosted Elasticsearch fields (entries of the form
 * {@code field^weight}) to use for a search request, based on the
 * {@code @type} and {@code _boost} query parameters.
 *
 * On construction the Elasticsearch mappings are also scanned and a number of
 * field sets are stored on the instance. Those sets are only written, never
 * read, inside this class.
 */
@CompileStatic
class BoostedFields {

    private Whelk whelk
    private JsonLd jsonld
    private Set keywordFields
    private Set dateFields
    private Set<String> nestedFields
    private Set<String> nestedNotInParentFields
    private Set<String> numericExtractorFields

    /**
     * @param whelk supplies the JSON-LD helper and (optionally) the Elasticsearch client
     */
    BoostedFields(Whelk whelk) {
        this.whelk = whelk
        this.jsonld = whelk.jsonld
        initFieldMappings(this.whelk)
    }

    /**
     * Populates the field sets from the Elasticsearch mappings.
     *
     * When {@code whelk.elastic} is not set, every field set is initialised to
     * an empty set so that none of them is left as {@code null}.
     *
     * @param whelk the instance whose {@code elastic} mappings are read
     */
    void initFieldMappings(Whelk whelk) {
        if (whelk.elastic) {
            Map mappings = whelk.elastic.getMappings()
            this.keywordFields = getKeywordFields(mappings)
            this.dateFields = getFieldsOfType('date', mappings)
            this.nestedFields = getFieldsOfType('nested', mappings)
            this.nestedNotInParentFields = nestedFields - getFieldsWithSetting('include_in_parent', true, mappings)
            this.numericExtractorFields = getFieldsWithAnalyzer('numeric_extractor', mappings)

            /*
            if (DocumentUtil.getAtPath(mappings, ['properties', '_sortKeyByLang', 'properties', 'sv', 'fields', 'trigram'], null)) {
                ENABLE_SPELL_CHECK = true
            }
            log.info("ENABLE_SPELL_CHECK = ${ENABLE_SPELL_CHECK}")
            */
        } else {
            this.keywordFields = Collections.emptySet()
            this.dateFields = Collections.emptySet()
            this.nestedFields = Collections.emptySet()
            // Previously left null in this branch, unlike the three sets above.
            this.nestedNotInParentFields = Collections.emptySet()
            this.numericExtractorFields = Collections.emptySet()
        }
    }

    /**
     * Collects the dotted names of all fields that declare a {@code keyword}
     * sub-field under {@code fields}.
     *
     * @param mappings Elasticsearch mappings; may be null or empty
     * @return the field names, or an empty set when {@code mappings} is null/empty
     */
    Set getKeywordFields(Map mappings) {
        if (!mappings) {
            return [] as Set
        }
        return getKeywordFieldsFromProperties(mappings['properties'] as Map)
    }

    /**
     * Walks one {@code properties} map and gathers keyword fields from each entry.
     *
     * @param properties field name to field settings; null yields an empty set
     * @param parentName dotted path of the enclosing field, '' at the top level
     */
    private Set getKeywordFieldsFromProperties(Map properties, String parentName = '') {
        Set result = [] as Set
        if (properties == null) {
            // Mappings without a 'properties' entry: nothing to collect.
            return result
        }
        properties.each { fieldName, fieldSettings ->
            result.addAll(getKeywordFieldsFromProperty(fieldName as String, fieldSettings as Map, parentName))
        }
        return result
    }

    /**
     * Inspects a single field: records it when it has a {@code fields.keyword}
     * entry and recurses into its nested {@code properties}, if any.
     *
     * @param fieldName     name of the field within its parent
     * @param fieldSettings mapping settings of the field
     * @param parentName    dotted path of the enclosing field, '' at the top level
     */
    private Set getKeywordFieldsFromProperty(String fieldName, Map fieldSettings, String parentName) {
        Set result = [] as Set
        String currentField = parentName == '' ? fieldName : "${parentName}.${fieldName}"

        Map fields = (Map) fieldSettings.get('fields')
        if (fields && fields.get('keyword')) {
            result.add(currentField)
        }
        Map properties = (Map) fieldSettings.get('properties')
        if (properties) {
            result.addAll(getKeywordFieldsFromProperties(properties, currentField))
        }
        return result
    }

    /**
     * Finds every field in {@code mappings['properties']} where the key
     * {@code setting} has the given value.
     *
     * The returned names are the path to the matching key without its last
     * element and without any {@code 'properties'} segments, joined by '.'.
     *
     * @param setting  key to look for
     * @param value    value the key must equal
     * @param mappings Elasticsearch mappings
     */
    static Set getFieldsWithSetting(String setting, value, Map mappings) {
        Set fields = [] as Set
        DocumentUtil.findKey(mappings['properties'], setting) { v, path ->
            if (v == value) {
                fields.add(path.dropRight(1).findAll{ it != 'properties'}.join('.'))
            }
            DocumentUtil.NOP
        }
        return fields
    }

    /** Fields whose {@code analyzer} setting equals {@code analyzer}. */
    static Set getFieldsWithAnalyzer(String analyzer, Map mappings) {
        getFieldsWithSetting('analyzer', analyzer, mappings)
    }

    /** Fields whose {@code type} setting equals {@code type}. */
    static Set getFieldsOfType(String type, Map mappings) {
        getFieldsWithSetting('type', type, mappings)
    }

    /**
     * Resolves the boosted fields for a request.
     *
     * Reads the {@code @type} values and the first {@code _boost} value (if
     * any) from the request parameters and delegates to {@link #getBoostFields}.
     * The {@code @type} values are passed on as given; the earlier version
     * also built an expanded copy of them that was never used, which has
     * been removed.
     *
     * @param ogQueryParameters the request's query parameters
     * @param lensBoost         used to compute boosts from lenses when no explicit mode applies
     * @return boosted fields in {@code field^weight} form
     */
    @CompileStatic(TypeCheckingMode.SKIP)
    List<String> boostedFields(Map<String, String[]> ogQueryParameters, ESQueryLensBoost lensBoost) {
        String[] typeParam = ogQueryParameters.get('@type')
        String[] boostParam = ogQueryParameters.get('_boost')

        String boostMode = boostParam ? boostParam[0] : null
        return getBoostFields(typeParam, boostMode, lensBoost)
    }

    /**
     * Picks the boosted fields for the given types and boost mode.
     *
     * <ul>
     *   <li>a mode containing '^' is taken as an explicit comma-separated field list;</li>
     *   <li>{@code 'id.kb.se'} selects {@code CONCEPT_BOOST};</li>
     *   <li>{@code 'hardcoded'} selects a fixed list;</li>
     *   <li>anything else (including null) is computed by {@link #computeBoostFields}.</li>
     * </ul>
     *
     * The result is always returned explicitly. The previous per-call map that
     * was meant as a cache has been dropped since it was recreated on every
     * invocation and therefore never produced a hit.
     *
     * @param types     the {@code @type} values of the request, may be null
     * @param boostMode the first {@code _boost} value, may be null
     * @param lensBoost used when boosts have to be computed from lenses
     */
    List<String> getBoostFields(String[] types, String boostMode, ESQueryLensBoost lensBoost) {
        if (boostMode != null && boostMode.contains('^')) {
            return boostMode.tokenize(',')
        }
        if (boostMode == 'id.kb.se') {
            return CONCEPT_BOOST
        }
        if (boostMode == 'hardcoded') {
            return [
                    'prefLabel^100',
                    'code^100',
                    'name^100',
                    'familyName^100', 'givenName^100',
                    'lifeSpan^100', 'birthYear^100', 'deathYear^100',
                    'hasTitle.mainTitle^100', 'title^100',
                    'heldBy.sigel^100',
            ]
        }
        return computeBoostFields(types, lensBoost)
    }

    /**
     * Computes boosted fields for the given types.
     *
     * Types that are subclasses of 'Concept' get the hand-tuned
     * {@code CONCEPT_BOOST}; if other types are present as well, their
     * lens-derived boosts are appended, skipping any field already covered by
     * {@code CONCEPT_BOOST}. Without concept types the lens-derived boosts
     * for {@code types} are returned as is.
     *
     * @param types     the {@code @type} values of the request, may be null
     * @param lensBoost computes boosts from lenses
     */
    List<String> computeBoostFields(String[] types, ESQueryLensBoost lensBoost) {
        /* FIXME:
           lensBoost.computeBoostFieldsFromLenses does not give a good result for Concept.
           Use hand-tuned boosting instead until we improve boosting/ranking in general. See LXL-3399 for details.
        */
        def partitions = ((types ?: []) as List<String>).split { jsonld.isSubClassOf(it, 'Concept') }
        def (conceptTypes, otherTypes) = [partitions[0], partitions[1]]

        if (conceptTypes) {
            if (otherTypes) {
                def fromLens = lensBoost.computeBoostFieldsFromLenses(otherTypes as String[])
                // Compare on the bare field name, i.e. the part before '^'.
                def conceptFields = CONCEPT_BOOST.collect{ it.split('\\^')[0]}
                def otherFieldsBoost = fromLens.findAll{!conceptFields.contains(it.split('\\^')[0]) }
                return CONCEPT_BOOST + otherFieldsBoost
            }
            else {
                return CONCEPT_BOOST
            }
        }
        else {
            return lensBoost.computeBoostFieldsFromLenses(types)
        }
    }

    /** Hand-tuned boosts used for concept types and for the 'id.kb.se' boost mode. */
    private static final List<String> CONCEPT_BOOST = [
            'prefLabel^1500',
            'prefLabelByLang.sv^1500',
            'label^500',
            'labelByLang.sv^500',
            'code^200',
            'termComponentList._str.exact^125',
            'termComponentList._str^75',
            'altLabel^150',
            'altLabelByLang.sv^150',
            'hasVariant.prefLabel.exact^150',
            '_str.exact^100',
            'inScheme._str.exact^100',
            'inScheme._str^100',
            'inCollection._str.exact^10',
            'broader._str.exact^10',
            'exactMatch._str.exact^10',
            'closeMatch._str.exact^10',
            'broadMatch._str.exact^10',
            'related._str.exact^10',
            'scopeNote^10',
            'keyword._str.exact^10',
    ]

    /**
     * Expand `@type` query parameter with subclasses.
     *
     * This also removes superclasses, since we only care about the most
     * specific class.
     *
     * @param types  the requested types
     * @param jsonld used to look up super- and subclasses
     * @return the remaining types together with all their subclasses
     */
    static String[] expandTypeParam(String[] types, JsonLd jsonld) {
        // Filter out all types that have (more specific) subclasses that are
        // also in the list.
        // So for example [Instance, Electronic] should be reduced to just
        // [Electronic].
        // Afterwards, include all subclasses of the remaining types.
        Set<String> subClasses = []

        // Select types to prune
        Set<String> toBeRemoved = []
        for (String c1 : types) {
            ArrayList<String> c1SuperClasses = []
            jsonld.getSuperClasses(c1, c1SuperClasses)
            toBeRemoved.addAll(c1SuperClasses)
        }
        // Make a new pruned list without the undesired superclasses
        List<String> prunedTypes = []
        for (String type : types) {
            if (!toBeRemoved.contains(type)) {
                prunedTypes.add(type)
            }
        }
        // Add all subclasses of the remaining types
        for (String type : prunedTypes) {
            subClasses += jsonld.getSubClasses(type)
            subClasses.add(type)
        }

        // Typed toArray avoids relying on an implicit Object[] -> String[] cast.
        return subClasses.toArray(new String[0])
    }

}
3 changes: 2 additions & 1 deletion whelk-core/src/main/groovy/whelk/search2/QueryUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
public class QueryUtil {
private static final Escaper QUERY_ESCAPER = UrlEscapers.urlFormParameterEscaper();

private final Whelk whelk;
public final Whelk whelk;
public final EsMappings esMappings;
public final ESQueryLensBoost lensBoost;

Expand All @@ -34,6 +34,7 @@ public QueryUtil(Whelk whelk) {
}

public Map<?, ?> query(Map<String, Object> queryDsl) {
//System.out.println(queryDsl.toString());
return whelk.elastic.query(queryDsl);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ public QueryTree(Node tree) {
removeNeedlessWildcard();
}

public Map<String, Object> toEs(QueryUtil queryUtil, Disambiguate disambiguate) {
public Map<String, Object> toEs(QueryUtil queryUtil, Disambiguate disambiguate, List<String> boostedfields) {
return (isFiltered() ? filtered.tree : tree)
.expand(disambiguate, getOutsetType())
.insertNested(queryUtil::getNestedPath)
.toEs(queryUtil.lensBoost.computeBoostFieldsFromLenses(new String[0])); // TODO: Implement boosting
.toEs(boostedfields);
//.toEs(queryUtil.lensBoost.computeBoostFieldsFromLenses(new String[0])); // TODO: Implement boosting
}

public Map<String, Object> toSearchMapping(Map<String, String> nonQueryParams) {
Expand Down

0 comments on commit e11768d

Please sign in to comment.