Skip to content

Commit

Permalink
Add a new dynamic field for term indexing (tokenized, not stored) #337
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Oct 26, 2023
1 parent 07580b7 commit f0d2b0f
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 25 deletions.
17 changes: 15 additions & 2 deletions solr-functions
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ prepare_schema() {
}
}' $SCHEMA_URL


echo "Create *_sni dynamic field definition in ${LOCAL_CORE}."
curl -X POST -H 'Content-type:application/json' --data-binary '{
"add-dynamic-field":{
Expand All @@ -86,7 +85,6 @@ prepare_schema() {
"docValues":false,
"multiValued":false,
}}' $SCHEMA_URL

fi

NUMBER_OF_COPY_FIELD=$(curl -is "$SCHEMA_URL/copyfields?source.fl=*_ss" | grep -c '"source":"\*_ss"')
Expand All @@ -109,6 +107,21 @@ prepare_schema() {
NUMBER_OF_COPY_FIELD=$(curl -is "$SCHEMA_URL/copyfields?source.fl=*_ss" | grep -c '"source":"\*_ss"')
done
fi

# Check whether the *_tt dynamic field (text_general) already exists in the core's schema.
HAS_PROPER_TT=$(curl -is "$SCHEMA_URL/dynamicfields/*_tt" | grep -c '"type":"text_general"')
echo "Does ${LOCAL_CORE} have proper *_tt field definition? Answer: ${HAS_PROPER_TT}"
if [[ $HAS_PROPER_TT -eq 0 ]]; then
# add _tt
# <dynamicField name="*_tt" type="text_general" indexed="true" stored="false"/>
echo "Create *_tt dynamic field definition in ${LOCAL_CORE}."
# NOTE: no trailing comma after the last property — the payload must be valid JSON
# (Solr's lenient parser tolerates it, but strict proxies/parsers do not).
curl -X POST -H 'Content-type:application/json' --data-binary '{
  "add-dynamic-field":{
    "name":"*_tt",
    "type":"text_general",
    "stored":false,
    "indexed":true
  }}' "$SCHEMA_URL"
fi
}

status() {
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ public void processRecord(BibliographicRecord bibliographicRecord, int recordNum
}
client.index(document);

if (recordNumber % 10000 == 0) {
if (recordNumber % parameters.getCommitAt() == 0) {
if (parameters.doCommit())
client.commit();
logger.info(
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/de/gwdg/metadataqa/marc/cli/QACli.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ protected void initializeGroups(String groupBy, boolean isPica) {
}
}

protected <T extends CommonParameters> void saveParameters(String fileName, T parameters) {
protected void saveParameters(String fileName, T parameters) {
ObjectMapper mapper = new ObjectMapper();
try {
String json = mapper.writeValueAsString(parameters);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

public class MarcToSolrParameters extends CommonParameters {

private int DEFAULT_COMMIT_AT = 10000;
private boolean useEmbedded = false;
private String solrUrl = null;
private boolean doCommit = false;
Expand All @@ -15,16 +16,18 @@ public class MarcToSolrParameters extends CommonParameters {
private boolean indexWithTokenizedField = false;

private boolean isOptionSet = false;
private int commitAt = DEFAULT_COMMIT_AT;

protected void setOptions() {
if (!isOptionSet) {
super.setOptions();
options.addOption("s", "solrUrl", true, "the URL of Solr server");
options.addOption("c", "doCommit", false, "send commits to Solr regularly");
options.addOption("c", "doCommit", false, "commits Solr index regularly");
options.addOption("t", "solrFieldType", true,
"type of Solr fields, could be one of 'marc-tags', 'human-readable', or 'mixed'");
options.addOption("B", "useEmbedded", false, "use embedded Solr server (used in tests only)");
options.addOption("C", "indexWithTokenizedField", false, "index data elements as tokenized field as well");
options.addOption("D", "commitAt", true, "commit index after this number of records");
isOptionSet = true;
}
}
Expand All @@ -46,6 +49,9 @@ public MarcToSolrParameters(String[] arguments) throws ParseException {

if (cmd.hasOption("indexWithTokenizedField"))
indexWithTokenizedField = true;

if (cmd.hasOption("commitAt"))
commitAt = Integer.valueOf(cmd.getOptionValue("commitAt"));
}

public String getSolrUrl() {
Expand Down Expand Up @@ -84,6 +90,10 @@ public boolean indexWithTokenizedField() {
return indexWithTokenizedField;
}

/**
 * Returns the record-count interval at which the Solr index should be committed.
 * Defaults to DEFAULT_COMMIT_AT (10000) and can be overridden with the
 * -D/--commitAt command line option.
 *
 * @return the number of processed records between commits
 */
public int getCommitAt() {
  return commitAt;
}

@Override
public String formatParameters() {
String text = super.formatParameters();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.apache.solr.common.params.MapSolrParams;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
Expand All @@ -25,6 +26,8 @@ public class MarcSolrClient {
private String collection;
private boolean trimId = false;
private boolean indexWithTokenizedField = false;
private String termFieldSuffix = "_tt";
private Map<String, String> termFieldNameCache = new HashMap<>();

public MarcSolrClient() {
initialize(defaultUrl);
Expand Down Expand Up @@ -69,20 +72,26 @@ public SolrInputDocument createSolrDoc(String id, Map<String, List<String>> obje
SolrInputDocument document = new SolrInputDocument();
document.addField("id", (trimId ? id.trim() : id));
for (Map.Entry<String, List<String>> entry : objectMap.entrySet()) {
String key = entry.getKey();
String fieldName = entry.getKey();
Object value = entry.getValue();
if (value != null) {
if (!key.endsWith("_sni") && !key.endsWith("_ss"))
key += "_ss";
if (!fieldName.endsWith("_sni") && !fieldName.endsWith("_ss"))
fieldName += "_ss";
document.addField(fieldName, value);

document.addField(key, value);
if (indexWithTokenizedField && key.endsWith("_ss"))
document.addField(key.replaceAll("_ss$", "_txt"), value);
if (indexWithTokenizedField && fieldName.endsWith("_ss"))
document.addField(getTermFieldName(fieldName), value);
}
}
return document;
}

/**
 * Maps a phrase field name (ending in "_ss") to its tokenized term field
 * counterpart (ending in termFieldSuffix, "_tt"). Results are memoized in
 * termFieldNameCache so the regex replacement runs at most once per field name.
 *
 * @param phraseField the "_ss"-suffixed field name
 * @return the corresponding term field name
 */
private String getTermFieldName(String phraseField) {
  // computeIfAbsent: single lookup instead of containsKey + put + get
  return termFieldNameCache.computeIfAbsent(
    phraseField, field -> field.replaceAll("_ss$", termFieldSuffix));
}

public void indexDuplumKey(String id, Map<String, Object> objectMap)
throws IOException, SolrServerException {
SolrInputDocument document = new SolrInputDocument();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,16 @@ private String forSubfield(String code, String codeForIndex) {
key = String.format("%s%s", indexTag, codeForIndex);
break;
case MIXED:
if ((schemaType == null || !schemaType.equals(SchemaType.PICA)) && !tag.equals(indexTag) && !codeForIndex.equals("_" + code))
key = String.format("%s%s_%s%s", safeTag, code, indexTag, codeForIndex);
else if ((schemaType == null || !schemaType.equals(SchemaType.PICA)) && !tag.equals(indexTag) && codeForIndex.equals("_" + code))
key = String.format("%s%s_%s", safeTag, code, indexTag);
else
if (schemaType != null && schemaType.equals(SchemaType.PICA)) {
key = String.format("%s%s", safeTag, code);
} else {
if (!tag.equals(indexTag) && !codeForIndex.equals("_" + code))
key = String.format("%s%s_%s%s", safeTag, code, indexTag, codeForIndex);
else if (!tag.equals(indexTag) && codeForIndex.equals("_" + code))
key = String.format("%s%s_%s", safeTag, code, indexTag);
else
key = String.format("%s%s", safeTag, code);
}
break;
case MARC:
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public class PicaXmlHandler implements ContentHandler {

private String tag;

private String prev_tag = "n/a";
private String prevTag = "n/a";

private static final int COLLECTION_ID = 1;

Expand Down Expand Up @@ -147,7 +147,7 @@ record = factory.newRecord();
if (typeAttr != null && RECORD_TYPES.contains(typeAttr)) {
record.setType(typeAttr);
}
prev_tag = "n/a";
prevTag = "n/a";

break;
case LEADER_ID:
Expand All @@ -158,7 +158,7 @@ record = factory.newRecord();

if (tag == null) {
if (record != null) {
record.addError("n/a", "n/a", MarcError.MINOR_ERROR, "Missing tag element in ControlField after tag: "+ prev_tag);
record.addError("n/a", "n/a", MarcError.MINOR_ERROR, "Missing tag element in ControlField after tag: "+ prevTag);
} else {
throw new MarcException("ControlField missing tag value, found outside a record element");
}
Expand All @@ -173,7 +173,7 @@ record = factory.newRecord();

if (tag == null) {
if (record != null) {
record.addError("n/a", "n/a", MarcError.MINOR_ERROR, "Missing tag element in datafield after tag: "+prev_tag);
record.addError("n/a", "n/a", MarcError.MINOR_ERROR, "Missing tag element in datafield after tag: "+ prevTag);
} else {
throw new MarcException("DataField missing tag value, found outside a record element");
}
Expand Down Expand Up @@ -211,7 +211,7 @@ record = factory.newRecord();
sb = new StringBuffer();
break;
}
prev_tag = tag;
prevTag = tag;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ public void indexMap() throws SolrServerException, IOException {
assertNull(mainClient.get("123"));

final SolrDocument doc = mainClient.get("124");
assertEquals(Set.of("title_ss", "title_txt"), doc.getFieldNames());
assertEquals(Set.of("title_ss"), doc.getFieldNames());
assertEquals(List.of("Hello world"), doc.getFieldValues("title_ss"));
}

Expand All @@ -135,7 +135,7 @@ public void createSolrDoc() throws SolrServerException, IOException {
mainClient.indexWithTokenizedField(true);
SolrInputDocument doc = mainClient.createSolrDoc("124", Map.of("title_ss", List.of("Hello world")));

assertEquals(Set.of("id", "title_ss", "title_txt"), doc.getFieldNames());
assertEquals(Set.of("id", "title_ss", "title_tt"), doc.getFieldNames());
assertEquals(List.of("Hello world"), doc.getFieldValues("title_ss"));
}

Expand All @@ -152,7 +152,7 @@ public void merge() throws SolrServerException, IOException {
for (String field : validationValues.getFieldNames())
doc.addField(field, validationValues.getFieldValues(field));

assertEquals(Set.of("id", "title_ss", "title_txt", "groupId_is", "errorId_is"), doc.getFieldNames());
assertEquals(Set.of("id", "title_ss", "title_tt", "groupId_is", "errorId_is"), doc.getFieldNames());
assertEquals(List.of("Hello world"), doc.getFieldValues("title_ss"));
assertEquals(List.of(1, 2, 3), doc.getFieldValues("groupId_is"));
assertEquals(List.of(11, 12, 13), doc.getFieldValues("errorId_is"));
Expand All @@ -173,7 +173,7 @@ public void merge_withCommit() throws SolrServerException, IOException {
mainClient.commit();

SolrDocument savedValues = mainClient.get("123");
assertEquals(Set.of("title_ss", "title_txt", "groupId_is", "errorId_is"), savedValues.getFieldNames());
assertEquals(Set.of("title_ss", "groupId_is", "errorId_is"), savedValues.getFieldNames());
assertEquals(List.of("Hello world"), savedValues.getFieldValues("title_ss"));
assertEquals(List.of(1, 2, 3), savedValues.getFieldValues("groupId_is"));
assertEquals(List.of(11, 12, 13), savedValues.getFieldValues("errorId_is"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
<dynamicField name="*_ss" type="strings" indexed="true" stored="true"/>
<dynamicField name="*_s" type="strings" indexed="true" stored="true"/>
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true"/>
<dynamicField name="*_tt" type="text_general" indexed="true" stored="false"/>
<dynamicField name="*_is" type="pints" indexed="true" stored="true" />
<dynamicField name="*_sni" type="string_big" docValues="false" multiValued="false" indexed="false" stored="true"/>

Expand Down

0 comments on commit f0d2b0f

Please sign in to comment.