Skip to content

Commit

Permalink
Changes to elasticsearch/create_mapping.py related to support nested …
Browse files Browse the repository at this point in the history
…quality-metrics/qc_values
  • Loading branch information
dmichaels-harvard committed Aug 17, 2024
1 parent 8e8b126 commit 22d64de
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 24 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,15 @@ snovault
Change Log
----------

11.22.0
=======

* Changes to elasticsearch/create_mapping.py to support nested quality-metrics/qc_values
for smaht-portal; apply fall-through logic in create_mapping.schema_mapping if the type does
not match any other itemized type there; e.g. for ['boolean', 'integer', 'number', 'string']
for the quality-metrics/qc_values/value type.


11.21.1
=======

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicsnovault"
version = "11.21.1"
version = "11.21.1.1b1" # TODO: To become 11.22.00
description = "Storage support for 4DN Data Portals."
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
82 changes: 59 additions & 23 deletions snovault/elasticsearch/create_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def determine_if_is_date_field(field, schema):
return is_date_field


def schema_mapping(field, schema, top_level=False, from_array=False):
def schema_mapping(field, schema, top_level=False, from_array=False, paths_for_logging=[]):
"""
Create the mapping for a given schema. Can handle using all fields for
objects (*), but can handle specific fields using the field parameter.
Expand Down Expand Up @@ -112,12 +112,13 @@ def schema_mapping(field, schema, top_level=False, from_array=False):

# Elasticsearch handles multiple values for a field
if type_ == 'array' and schema['items']:
return schema_mapping(field, schema['items'], from_array=True)
return schema_mapping(field, schema['items'], from_array=True, paths_for_logging=[*paths_for_logging, "[]"])

if type_ == 'object':
properties = {}
paths_for_logging = [*paths_for_logging, schema.get("title", "").replace(" ", "")]
for k, v in schema.get('properties', {}).items():
mapping = schema_mapping(k, v)
mapping = schema_mapping(k, v, paths_for_logging=[*paths_for_logging, k])
if mapping is not None:
if field == '*' or k == field:
properties[k] = mapping
Expand Down Expand Up @@ -150,25 +151,26 @@ def schema_mapping(field, schema, top_level=False, from_array=False):
}
}

if type_ == ["number", "string"]:
return {
'type': 'text',
'fields': {
'value': {
'type': 'float',
'ignore_malformed': True,
},
'raw': {
'type': 'keyword',
'ignore_above': KW_IGNORE_ABOVE
},
'lower_case_sort': {
'type': 'keyword',
'normalizer': 'case_insensitive',
'ignore_above': KW_IGNORE_ABOVE
}
}
}
# Move to bottom as the default ...
# if type_ == ["number", "string"]:
# return {
# 'type': 'text',
# 'fields': {
# 'value': {
# 'type': 'float',
# 'ignore_malformed': True,
# },
# 'raw': {
# 'type': 'keyword',
# 'ignore_above': KW_IGNORE_ABOVE
# },
# 'lower_case_sort': {
# 'type': 'keyword',
# 'normalizer': 'case_insensitive',
# 'ignore_above': KW_IGNORE_ABOVE
# }
# }
# }

if type_ == 'boolean':
return {
Expand Down Expand Up @@ -241,6 +243,39 @@ def schema_mapping(field, schema, top_level=False, from_array=False):
}
}

# Fall thru case.
default_mapping = False

# Warnings for unmapped items; guard against duplicate warning by squirreling away
# the paths_for_logging in a hidden attribute (__unmapped_warnings) of this function.
if len(paths_for_logging) > 1:
if not hasattr(schema_mapping, "__unmapped_warnings"):
setattr(schema_mapping, "__unmapped_warnings", [])
paths_for_logging = ".".join([path for path in paths_for_logging if path])
if paths_for_logging not in schema_mapping.__unmapped_warnings:
schema_mapping.__unmapped_warnings.append(paths_for_logging)
log.warning(f"Using default mapping for field: {paths_for_logging} | type: {type_}")

if default_mapping:
return {
'type': 'text',
'fields': {
'value': {
'type': 'float',
'ignore_malformed': True,
},
'raw': {
'type': 'keyword',
'ignore_above': KW_IGNORE_ABOVE
},
'lower_case_sort': {
'type': 'keyword',
'normalizer': 'case_insensitive',
'ignore_above': KW_IGNORE_ABOVE
}
}
}


def _inject_custom_settings(*, template: dict, custom_settings: IndexSettings) -> dict:
""" Adds our custom settings to the base template
Expand Down Expand Up @@ -678,7 +713,8 @@ def type_mapping(types, item_type, embed=True):
type_info = types[item_type]
schema = type_info.schema
# TODO: use top_level parameter here for schema_mapping
mapping = schema_mapping('*', schema, from_array=False)
paths_for_logging = [schema.get("title", "").replace(" ", "")] if schema else []
mapping = schema_mapping('*', schema, from_array=False, paths_for_logging=paths_for_logging)
if not embed:
return mapping

Expand Down

0 comments on commit 22d64de

Please sign in to comment.