Skip to content

Commit

Permalink
Experimental .blf.yaml version 2. Improvements to processing steps.
Browse files Browse the repository at this point in the history
- multipleValues is no longer necessary, works automatically now.
- allowDuplicateValues should be replaced by new 'unique' processing step
- mapValues should be moved into a 'map' processing step with parameter 'table'
- inline tags attributes can be defined in a single 'attributes' block now,
  which replaces include/exclude/extraAttributes
- v2 removes baseFormat (inheritance), above obsolete attributes.
- v2 'append' processing step has prefix parameter (default empty) instead of
  using separator, so it doesn't always append a space first.
- removed exception for lemma and word annotations; all annotations are
  insensitive only now unless specified otherwise.
- 'split' and 'replace' now support regex flags i and u (unicode char. classes)
- multipleValuesSeparator (tabular format) deprecated; use 'split' processing step
  instead.

Squashed commit of the following:

commit dda9aca
Author: Jan Niestadt <[email protected]>
Date:   Fri Mar 7 12:58:08 2025 +0100

    Remove multipleValues.

commit bca9881
Author: Jan Niestadt <[email protected]>
Date:   Fri Mar 7 11:35:04 2025 +0100

    Single attributes key replaces include/exclude/extraAttributes.

    For v1, the old keys still work. In v2, they have been removed.

commit c02cdae
Author: Jan Niestadt <[email protected]>
Date:   Fri Mar 7 10:55:33 2025 +0100

    Various v2 changes.

commit 522d3bb
Author: Jan Niestadt <[email protected]>
Date:   Fri Mar 7 09:56:52 2025 +0100

    Remove format inheritance (baseFormat), word/lemma default sensitivity in v2.

commit b20862b
Author: Jan Niestadt <[email protected]>
Date:   Thu Mar 6 15:43:05 2025 +0100

    Document differences between v1 and v2.

commit e08a4da
Author: Jan Niestadt <[email protected]>
Date:   Thu Mar 6 15:38:04 2025 +0100

    In v2, append works more like StringBuilder.

commit 27c203f
Author: Jan Niestadt <[email protected]>
Date:   Thu Mar 6 15:18:47 2025 +0100

    Deprecate allowDuplicateValues. Experimental v2 .blf.yaml.

    allowDuplicateValues still works, and is automatically added
    as the last processing step.

    Version 2 of .blf.yaml defaults to Saxon processor.

commit fb69aa9
Author: Jan Niestadt <[email protected]>
Date:   Thu Mar 6 14:18:34 2025 +0100

    Warning. replace keep 'both'. Docs.

commit 3d0e704
Author: Jan Niestadt <[email protected]>
Date:   Thu Mar 6 13:41:24 2025 +0100

    Fix test failure.

commit 0722290
Author: Jan Niestadt <[email protected]>
Date:   Thu Mar 6 13:26:44 2025 +0100

    map, unique operations. deprecated multipleValues.

    Also deprecated multipleValuesSeparator (tabular formats).

commit 5d79000
Author: Jan Niestadt <[email protected]>
Date:   Thu Mar 6 11:57:02 2025 +0100

    WIP mapValues.

commit 2fa67b1
Author: Jan Niestadt <[email protected]>
Date:   Wed Mar 5 18:20:27 2025 +0100

    Failing unit tests.

commit fd745db
Author: Jan Niestadt <[email protected]>
Date:   Wed Mar 5 13:53:40 2025 +0100

    Implement ProcessingStep classes.

commit 10cb608
Author: Jan Niestadt <[email protected]>
Date:   Wed Mar 5 12:41:07 2025 +0100

    Regex flags for replace/split.
  • Loading branch information
jan-niestadt committed Mar 10, 2025
1 parent 660dd96 commit 0acf9b7
Show file tree
Hide file tree
Showing 39 changed files with 1,592 additions and 618 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -223,10 +223,10 @@ public AnnotationSensitivities getSensitivitySetting(String annotationName) {

// Not in parameter (or unrecognized value), use default based on
// annotationName
if (AnnotationSensitivities.defaultForAnnotation(annotationName) != AnnotationSensitivities.ONLY_INSENSITIVE) {
if (getAnnotationSensitivity(annotationName) != AnnotationSensitivities.ONLY_INSENSITIVE) {
// Word or lemma: default to sensitive/insensitive
// (deprecated, will be removed eventually)
return AnnotationSensitivities.defaultForAnnotation(annotationName);
return getAnnotationSensitivity(annotationName);
}
if (annotationName.equals(AnnotatedFieldNameUtil.PUNCTUATION_ANNOT_NAME)) {
// Punctuation: default to only insensitive
Expand All @@ -241,6 +241,10 @@ public AnnotationSensitivities getSensitivitySetting(String annotationName) {
return AnnotationSensitivities.ONLY_INSENSITIVE;
}

/**
 * Determine the default sensitivity for an annotation, applying the
 * config-version-1 rules of {@code AnnotationSensitivities.defaultForAnnotation}.
 *
 * @param name annotation name to look up
 * @return the default sensitivity for that annotation under version 1 rules
 */
protected static AnnotationSensitivities getAnnotationSensitivity(String name) {
    // This indexer always asks for the version 1 defaults.
    final int configVersion = 1;
    return AnnotationSensitivities.defaultForAnnotation(name, configVersion);
}

/**
* Set a number of parameters for this indexer
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public DocIndexerPlainTextBasic(DocWriter indexer, String fileName, Reader reade
String mainPropName = AnnotatedFieldNameUtil.DEFAULT_MAIN_ANNOT_NAME;
boolean needsPrimaryValuePayloads = getDocWriter().needsPrimaryValuePayloads();
contentsField = new AnnotatedFieldWriter(getDocWriter(), DocIndexerLegacy.DEFAULT_CONTENTS_FIELD_NAME,
mainPropName, AnnotationSensitivities.defaultForAnnotation(mainPropName),
mainPropName, getAnnotationSensitivity(mainPropName),
false, needsPrimaryValuePayloads);
annotMain = contentsField.mainAnnotation();
String propName = AnnotatedFieldNameUtil.PUNCTUATION_ANNOT_NAME;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ public DocIndexerXmlHandlers(DocWriter docWriter, String fileName, Reader reader
String mainPropName = AnnotatedFieldNameUtil.DEFAULT_MAIN_ANNOT_NAME;
boolean needsPrimaryValuePayloads = docWriter.needsPrimaryValuePayloads();
contentsField = new AnnotatedFieldWriter(getDocWriter(), DocIndexerLegacy.DEFAULT_CONTENTS_FIELD_NAME,
mainPropName, AnnotationSensitivities.defaultForAnnotation(mainPropName),
mainPropName, getAnnotationSensitivity(mainPropName),
false, needsPrimaryValuePayloads);
propMain = contentsField.mainAnnotation();
propPunct = addAnnotation(AnnotatedFieldNameUtil.PUNCTUATION_ANNOT_NAME);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
* What sensitivities are indexed for an annotation.
*/
public enum AnnotationSensitivities {
DEFAULT, // "choose default based on field name"
DEFAULT, // "insensitive (except for some internal annotations)"
LEGACY_DEFAULT, // "choose default based on field name" (DEPRECATED)
ONLY_SENSITIVE, // only index case- and diacritics-sensitively
ONLY_INSENSITIVE, // only index case- and diacritics-insensitively
SENSITIVE_AND_INSENSITIVE, // case+diac sensitive as well as case+diac insensitive
Expand All @@ -18,6 +19,8 @@ public static AnnotationSensitivities fromStringValue(String v) {
case "default":
case "":
return DEFAULT;
case "legacy-default":
return LEGACY_DEFAULT;
case "sensitive":
case "s":
return ONLY_SENSITIVE;
Expand All @@ -40,6 +43,8 @@ public String getStringValue() {
switch (this) {
case DEFAULT:
return "default";
case LEGACY_DEFAULT:
return "legacy_default";
case ONLY_SENSITIVE:
return "sensitive";
case ONLY_INSENSITIVE:
Expand All @@ -56,7 +61,8 @@ public String getStringValue() {
public String stringValueForResponse() {
switch (this) {
case DEFAULT:
return "DEFAULT";
case LEGACY_DEFAULT:
return getStringValue().toUpperCase();
case ONLY_SENSITIVE:
return "ONLY_SENSITIVE";
case ONLY_INSENSITIVE:
Expand All @@ -77,7 +83,7 @@ public String toString() {
return getStringValue();
}

public static AnnotationSensitivities defaultForAnnotation(String name) {
public static AnnotationSensitivities defaultForAnnotation(String name, int configVersion) {
if (name.equals(AnnotatedFieldNameUtil.RELATIONS_ANNOT_NAME)) {
// Relations annotation (which includes inline tags) defaults to
// insensitive nowadays (used to be sensitive), unless configured otherwise.
Expand All @@ -87,7 +93,7 @@ public static AnnotationSensitivities defaultForAnnotation(String name) {
}

// Check for legacy special cases that get sensitive+insensitive by default
if (AnnotatedFieldNameUtil.defaultSensitiveInsensitive(name))
if (configVersion < 2 && AnnotatedFieldNameUtil.defaultSensitiveInsensitive(name))
return AnnotationSensitivities.SENSITIVE_AND_INSENSITIVE;

// No special case; default to insensitive unless explicitly set to sensitive
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import org.apache.logging.log4j.Logger;

import nl.inl.blacklab.index.annotated.AnnotationSensitivities;
import nl.inl.blacklab.search.indexmetadata.AnnotatedFieldNameUtil;
import nl.inl.blacklab.indexers.config.process.ProcessingStep;
import nl.inl.blacklab.indexers.config.process.ProcessingStepUnique;

/**
* Configuration for a single annotation (formerly "property") of an annotated field.
Expand Down Expand Up @@ -77,9 +78,6 @@ public class ConfigAnnotation {

/** What UI element to show in the interface (optional) */
private String uiType = "";

/** Can this annotation have multiple values at one token position? [false] */
private boolean multipleValues = false;

/** Should we allow duplicate values at one token position? (if false, performs extra checking and discards duplicates) */
private boolean allowDuplicateValues = false;
Expand Down Expand Up @@ -131,7 +129,6 @@ public ConfigAnnotation copy() {
result.addSubAnnotation(a.copy());
}
result.setForwardIndex(forwardIndex);
result.setMultipleValues(multipleValues);
result.setAllowDuplicateValues(allowDuplicateValues);
result.setCaptureXml(captureXml);
return result;
Expand Down Expand Up @@ -237,8 +234,17 @@ public void setSensitivity(AnnotationSensitivities sensitivity) {
this.sensitivity = sensitivity;
}

public List<ConfigProcessStep> getProcess() {
return process;
ProcessingStep processSteps;

public synchronized ProcessingStep getProcess() {
if (processSteps == null) {
processSteps = ProcessingStep.fromConfig(process);
if (!allowDuplicateValues) {
// If we don't allow duplicate values, add a unique() step to the end of the processing chain
processSteps = ProcessingStep.combine(processSteps, new ProcessingStepUnique());
}
}
return processSteps;
}

public void setProcess(List<ConfigProcessStep> process) {
Expand All @@ -254,18 +260,6 @@ public void setForwardIndex(boolean forwardIndex) {
this.forwardIndex = forwardIndex;
}

public boolean isMultipleValues() {
return multipleValues;
}

public void setMultipleValues(boolean multipleValues) {
this.multipleValues = multipleValues;
}

public boolean isAllowDuplicateValues() {
return allowDuplicateValues;
}

public void setAllowDuplicateValues(boolean allowDuplicateValues) {
this.allowDuplicateValues = allowDuplicateValues;
}
Expand Down Expand Up @@ -293,9 +287,9 @@ public String toString() {

public AnnotationSensitivities getSensitivitySetting() {
AnnotationSensitivities sensitivity = getSensitivity();
if (sensitivity == AnnotationSensitivities.DEFAULT) {
if (sensitivity == AnnotationSensitivities.DEFAULT || sensitivity == AnnotationSensitivities.LEGACY_DEFAULT) {
String name = getName();
sensitivity = AnnotationSensitivities.defaultForAnnotation(name);
sensitivity = AnnotationSensitivities.defaultForAnnotation(name, sensitivity == AnnotationSensitivities.LEGACY_DEFAULT ? 1 : 2);
if (sensitivity != AnnotationSensitivities.ONLY_INSENSITIVE) {
// Historic behaviour: if no sensitivity is given, "word" and "lemma" annotations will
// get SensitivitySetting.SENSITIVE_AND_INSENSITIVE; all others get SensitivitySetting.ONLY_INSENSITIVE.
Expand All @@ -304,11 +298,12 @@ public AnnotationSensitivities getSensitivitySetting() {
if (!warnSensitivity.contains(name)) {
warnSensitivity.add(name);
logger.warn("Configuration " + getName()
+ " relies on special default sensitivity 'sensitive_insensitive' for annotation "
+ " relies on special default sensitivity 'sensitive_insensitive' for annotation '"
+ name
+ "; this behaviour "
+ "is deprecated. Please update your config to explicitly declare the sensitivity setting for this annotation. In a future version, all annotations "
+ "without explicit sensitivity will default to 'insensitive'.");
+ "'; this behaviour is deprecated in .blf.yaml version 1 and removed in version 2. "
+ "Please update your config to explicitly declare the sensitivity setting for this "
+ "annotation. Starting with .blf.yaml version 2, all annotations without explicit "
+ "sensitivity default to 'insensitive'.");
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package nl.inl.blacklab.indexers.config;

import java.util.ArrayList;
import java.util.List;

import nl.inl.blacklab.indexers.config.process.ProcessingStep;

/**
 * Configuration for attributes to index using XPath.
 *
 * <p>Holds the attribute's name, an optional XPath for its value, an exclude flag
 * and the list of configured processing steps. The processing steps are compiled
 * lazily into a single {@link ProcessingStep} by {@link #getProcess()}.
 */
public class ConfigAttribute {
    /**
     * Attribute name
     */
    private String name;

    /**
     * Exclude this attribute?
     */
    private boolean exclude = false;

    /**
     * XPath to get attribute's value, or null if this attribute is present on the tag.
     */
    private String valuePath;

    /**
     * How to process annotation values (if at all)
     */
    private final List<ConfigProcessStep> process = new ArrayList<>();

    /**
     * Compiled form of {@link #process}, built on first use by {@link #getProcess()}.
     * Null means "not built yet" (or invalidated by {@link #setProcess(List)}).
     */
    ProcessingStep processSteps;

    public ConfigAttribute() {

    }

    /** @return the attribute name (may be null, see {@link #isDefaultExclude()}) */
    public String getName() {
        return name;
    }

    /** @param name the attribute name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return XPath used to obtain the attribute's value, or null if none was set */
    public String getValuePath() {
        return valuePath;
    }

    /** @param valuePath XPath used to obtain the attribute's value */
    public void setValuePath(String valuePath) {
        this.valuePath = valuePath;
    }

    /** @param exclude whether this attribute should be excluded */
    public void setExclude(boolean exclude) {
        this.exclude = exclude;
    }

    /** @return whether this attribute should be excluded */
    public boolean isExclude() {
        return exclude;
    }

    /**
     * Get the processing step(s) to apply to this attribute's value.
     *
     * <p>The configured steps are converted with {@code ProcessingStep.fromConfig}
     * the first time this is called; the result is cached until the configuration
     * is replaced via {@link #setProcess(List)}.
     *
     * @return the compiled processing step
     */
    public synchronized ProcessingStep getProcess() {
        if (processSteps == null) {
            processSteps = ProcessingStep.fromConfig(process);
        }
        return processSteps;
    }

    /**
     * Replace the configured processing steps.
     *
     * <p>Synchronized on the same monitor as {@link #getProcess()} and resets the
     * cached compiled step, so a later {@link #getProcess()} reflects the new
     * configuration instead of returning a stale result.
     *
     * @param process the new list of processing steps (copied into this object)
     */
    public synchronized void setProcess(List<ConfigProcessStep> process) {
        this.process.clear();
        this.process.addAll(process);
        // Invalidate the cache; it was built from the previous step list.
        processSteps = null;
    }

    /**
     * Check that the required settings are present and validate every processing step.
     *
     * <p>NOTE(review): this requires both name and valuePath, while the field
     * documentation allows valuePath to be null and {@link #isDefaultExclude()}
     * allows name to be null. Presumably this is only called for "extra"
     * attributes; confirm against the callers.
     */
    public void validate() {
        ConfigInputFormat.req(name, "extra attribute", "name");
        ConfigInputFormat.req(valuePath, "extra attribute", "valuePath");
        for (ConfigProcessStep step: process) {
            step.validate();
        }
    }

    /**
     * Is this a nameless rule that simply says "exclude any attribute that isn't explicitly included?"
     *
     * @return true if exclude is set and no name was given
     */
    public boolean isDefaultExclude() {
        return exclude && name == null;
    }
}
Loading

0 comments on commit 0acf9b7

Please sign in to comment.