From 008edc5e5e1e53e6e6c02e496a7a43fc371426ec Mon Sep 17 00:00:00 2001 From: Yizhe Liu <59710443+yizheliu-amazon@users.noreply.github.com> Date: Tue, 7 Jan 2025 21:14:24 -0800 Subject: [PATCH] Fix bug where document embedding fails to be generated due to document has dot in field name (#1062) * Fix bug where document embedding fails to be generated due to document has dot in field name Signed-off-by: Yizhe Liu * Address comments Signed-off-by: Yizhe Liu --------- Signed-off-by: Yizhe Liu (cherry picked from commit 5b9f43b55a48ce17cd7a820bafc787b6a2c21944) --- CHANGELOG.md | 1 + .../processor/InferenceProcessor.java | 27 +- .../util/ProcessorDocumentUtils.java | 165 +++++++++ .../processor/TextEmbeddingProcessorIT.java | 61 ++++ .../TextEmbeddingProcessorTests.java | 330 +++++++++++++----- .../util/ProcessorDocumentUtilsTests.java | 146 +++++++- src/test/resources/processor/ingest_doc5.json | 24 ++ 7 files changed, 658 insertions(+), 96 deletions(-) create mode 100644 src/test/resources/processor/ingest_doc5.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 905d4a8ee..400844e19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Address inconsistent scoring in hybrid query results ([#998](https://github.com/opensearch-project/neural-search/pull/998)) - Fix bug where ingested document has list of nested objects ([#1040](https://github.com/opensearch-project/neural-search/pull/1040)) - Fixed document source and score field mismatch in sorted hybrid queries ([#1043](https://github.com/opensearch-project/neural-search/pull/1043)) +- Fix bug where embedding is missing when ingested document has "." 
in field name, and mismatches fieldMap config ([#1062](https://github.com/opensearch-project/neural-search/pull/1062)) ### Infrastructure ### Documentation ### Maintenance diff --git a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java index 0e6a4443f..3fb45ceeb 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/InferenceProcessor.java @@ -137,6 +137,7 @@ public IngestDocument execute(IngestDocument ingestDocument) throws Exception { @Override public void execute(IngestDocument ingestDocument, BiConsumer handler) { try { + preprocessIngestDocument(ingestDocument); validateEmbeddingFieldsValue(ingestDocument); Map processMap = buildMapWithTargetKeys(ingestDocument); List inferenceList = createInferenceList(processMap); @@ -150,6 +151,15 @@ public void execute(IngestDocument ingestDocument, BiConsumer sourceAndMetadataMap = ingestDocument.getSourceAndMetadata(); + Map unflattened = ProcessorDocumentUtils.unflattenJson(sourceAndMetadataMap); + unflattened.forEach(ingestDocument::setFieldValue); + sourceAndMetadataMap.keySet().removeIf(key -> key.contains(".")); + } + /** * This is the function which does actual inference work for batchExecute interface. * @param inferenceList a list of String for inference. 
@@ -244,12 +254,14 @@ private List getDataForInference(List i for (IngestDocumentWrapper ingestDocumentWrapper : ingestDocumentWrappers) { Map processMap = null; List inferenceList = null; + IngestDocument ingestDocument = ingestDocumentWrapper.getIngestDocument(); try { - validateEmbeddingFieldsValue(ingestDocumentWrapper.getIngestDocument()); - processMap = buildMapWithTargetKeys(ingestDocumentWrapper.getIngestDocument()); + preprocessIngestDocument(ingestDocument); + validateEmbeddingFieldsValue(ingestDocument); + processMap = buildMapWithTargetKeys(ingestDocument); inferenceList = createInferenceList(processMap); } catch (Exception e) { - ingestDocumentWrapper.update(ingestDocumentWrapper.getIngestDocument(), e); + ingestDocumentWrapper.update(ingestDocument, e); } finally { dataForInferences.add(new DataForInference(ingestDocumentWrapper, processMap, inferenceList)); } @@ -333,13 +345,14 @@ void buildNestedMap(String parentKey, Object processorKey, Map s } else if (sourceAndMetadataMap.get(parentKey) instanceof List) { for (Map.Entry nestedFieldMapEntry : ((Map) processorKey).entrySet()) { List> list = (List>) sourceAndMetadataMap.get(parentKey); + Pair processedNestedKey = processNestedKey(nestedFieldMapEntry); List listOfStrings = list.stream().map(x -> { - Object nestedSourceValue = x.get(nestedFieldMapEntry.getKey()); + Object nestedSourceValue = x.get(processedNestedKey.getKey()); return normalizeSourceValue(nestedSourceValue); }).collect(Collectors.toList()); Map map = new LinkedHashMap<>(); - map.put(nestedFieldMapEntry.getKey(), listOfStrings); - buildNestedMap(nestedFieldMapEntry.getKey(), nestedFieldMapEntry.getValue(), map, next); + map.put(processedNestedKey.getKey(), listOfStrings); + buildNestedMap(processedNestedKey.getKey(), processedNestedKey.getValue(), map, next); } } treeRes.merge(parentKey, next, REMAPPING_FUNCTION); @@ -387,7 +400,7 @@ private void validateEmbeddingFieldsValue(IngestDocument ingestDocument) { 
ProcessorDocumentUtils.validateMapTypeValue( FIELD_MAP_FIELD, sourceAndMetadataMap, - fieldMap, + ProcessorDocumentUtils.unflattenJson(fieldMap), indexName, clusterService, environment, diff --git a/src/main/java/org/opensearch/neuralsearch/util/ProcessorDocumentUtils.java b/src/main/java/org/opensearch/neuralsearch/util/ProcessorDocumentUtils.java index 20abebc23..6f9297e5c 100644 --- a/src/main/java/org/opensearch/neuralsearch/util/ProcessorDocumentUtils.java +++ b/src/main/java/org/opensearch/neuralsearch/util/ProcessorDocumentUtils.java @@ -12,11 +12,14 @@ import org.opensearch.env.Environment; import org.opensearch.index.mapper.MapperService; +import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Stack; /** * This class is used to accommodate the common code pieces of parsing, validating and processing the document for multiple @@ -178,4 +181,166 @@ private static void validateDepth( ); } } + + /** + * Unflatten a JSON object represented as a {@code Map}, possibly with dot in field name, + * into a nested {@code Map} + * "Object" can be either a {@code Map} or a {@code List} or simply a String. + * For example, input is {"a.b": "c"}, output is {"a":{"b": "c"}}. 
+ * Another example: + * input is {"a": [{"b.c": "d"}, {"b.c": "e"}]}, + * output is {"a": [{"b": {"c": "d"}}, {"b": {"c": "e"}}]} + * @param originalJsonMap the original JSON object represented as a {@code Map} + * @return the nested JSON object represented as a nested {@code Map} + * @throws IllegalArgumentException if the originalJsonMap is null or has invalid dot usage in field name + */ + public static Map unflattenJson(Map originalJsonMap) { + if (originalJsonMap == null) { + throw new IllegalArgumentException("originalJsonMap cannot be null"); + } + Map result = new HashMap<>(); + Stack stack = new Stack<>(); + + // Push initial items to stack + for (Map.Entry entry : originalJsonMap.entrySet()) { + stack.push(new ProcessJsonObjectItem(entry.getKey(), entry.getValue(), result)); + } + + // Process items until stack is empty + while (!stack.isEmpty()) { + ProcessJsonObjectItem item = stack.pop(); + String key = item.key; + Object value = item.value; + Map currentMap = item.targetMap; + + // Handle nested value + if (value instanceof Map) { + Map nestedMap = new HashMap<>(); + for (Map.Entry entry : ((Map) value).entrySet()) { + stack.push(new ProcessJsonObjectItem(entry.getKey(), entry.getValue(), nestedMap)); + } + value = nestedMap; + } else if (value instanceof List) { + value = handleList((List) value); + } + + // If key contains dot, split and create nested structure + unflattenSingleItem(key, value, currentMap); + } + + return result; + } + + private static List handleList(List list) { + List result = new ArrayList<>(); + Stack stack = new Stack<>(); + + // Push initial items to stack + for (int i = list.size() - 1; i >= 0; i--) { + stack.push(new ProcessJsonListItem(list.get(i), result)); + } + + // Process items until stack is empty + while (!stack.isEmpty()) { + ProcessJsonListItem item = stack.pop(); + Object value = item.value; + List targetList = item.targetList; + + if (value instanceof Map) { + Map nestedMap = new HashMap<>(); + Map sourceMap = 
(Map) value; + for (Map.Entry entry : sourceMap.entrySet()) { + stack.push(new ProcessJsonListItem(new ProcessJsonObjectItem(entry.getKey(), entry.getValue(), nestedMap), targetList)); + } + targetList.add(nestedMap); + } else if (value instanceof List) { + List nestedList = new ArrayList<>(); + for (Object listItem : (List) value) { + stack.push(new ProcessJsonListItem(listItem, nestedList)); + } + targetList.add(nestedList); + } else if (value instanceof ProcessJsonObjectItem) { + ProcessJsonObjectItem processJsonObjectItem = (ProcessJsonObjectItem) value; + Map tempMap = new HashMap<>(); + unflattenSingleItem(processJsonObjectItem.key, processJsonObjectItem.value, tempMap); + targetList.set(targetList.size() - 1, tempMap); + } else { + targetList.add(value); + } + } + + return result; + } + + private static void unflattenSingleItem(String key, Object value, Map result) { + if (StringUtils.isBlank(key)) { + throw new IllegalArgumentException("Field name cannot be null or empty"); + } + if (key.contains(".")) { + // Use split with -1 limit to preserve trailing empty strings + String[] parts = key.split("\\.", -1); + Map current = result; + + for (int i = 0; i < parts.length; i++) { + if (StringUtils.isBlank(parts[i])) { + throw new IllegalArgumentException(String.format(Locale.ROOT, "Field name '%s' contains invalid dot usage", key)); + } + if (i == parts.length - 1) { + current.put(parts[i], value); + continue; + } + current = (Map) current.computeIfAbsent(parts[i], k -> new HashMap<>()); + } + } else { + result.put(key, value); + } + } + + /** + * Validate if field name is in correct format, which is either "a", or "a.b.c". + * If field name is like "..a..b", "a..b", "a.b..", it should be invalid. + * This is done via checking if a string contains empty segments when split by dots. 
+ * + * @param input the string to check + * @throws IllegalArgumentException if the input is null or has invalid dot usage + */ + private static void validateFieldName(String input) { + if (StringUtils.isBlank(input)) { + throw new IllegalArgumentException("Field name cannot be null or empty"); + } + + // Use split with -1 limit to preserve trailing empty strings + String[] segments = input.split("\\.", -1); + + // Check if any segment is empty + for (String segment : segments) { + if (StringUtils.isBlank(segment)) { + throw new IllegalArgumentException(String.format(Locale.ROOT, "Field name '%s' contains invalid dot usage", input)); + } + } + } + + // Helper classes to maintain state during iteration + private static class ProcessJsonObjectItem { + String key; + Object value; + Map targetMap; + + ProcessJsonObjectItem(String key, Object value, Map targetMap) { + this.key = key; + this.value = value; + this.targetMap = targetMap; + } + } + + private static class ProcessJsonListItem { + Object value; + List targetList; + + ProcessJsonListItem(Object value, List targetList) { + this.value = value; + this.targetList = targetList; + } + } + } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorIT.java index 263a55273..ec40f0b40 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorIT.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorIT.java @@ -51,6 +51,7 @@ public class TextEmbeddingProcessorIT extends BaseNeuralSearchIT { private final String INGEST_DOC2 = Files.readString(Path.of(classLoader.getResource("processor/ingest_doc2.json").toURI())); private final String INGEST_DOC3 = Files.readString(Path.of(classLoader.getResource("processor/ingest_doc3.json").toURI())); private final String INGEST_DOC4 = 
Files.readString(Path.of(classLoader.getResource("processor/ingest_doc4.json").toURI())); + private final String INGEST_DOC5 = Files.readString(Path.of(classLoader.getResource("processor/ingest_doc5.json").toURI())); private final String BULK_ITEM_TEMPLATE = Files.readString( Path.of(classLoader.getResource("processor/bulk_item_template.json").toURI()) ); @@ -176,6 +177,23 @@ private void assertDoc(Map sourceMap, String textFieldValue, Opt } } + private void assertDocWithLevel2AsList(Map sourceMap) { + assertNotNull(sourceMap); + assertTrue(sourceMap.containsKey(LEVEL_1_FIELD)); + assertTrue(sourceMap.get(LEVEL_1_FIELD) instanceof List); + List> nestedPassages = (List>) sourceMap.get(LEVEL_1_FIELD); + nestedPassages.forEach(nestedPassage -> { + assertTrue(nestedPassage.containsKey(LEVEL_2_FIELD)); + Map level2 = (Map) nestedPassage.get(LEVEL_2_FIELD); + Map level3 = (Map) level2.get(LEVEL_3_FIELD_CONTAINER); + List embeddings = (List) level3.get(LEVEL_3_FIELD_EMBEDDING); + assertEquals(768, embeddings.size()); + for (Double embedding : embeddings) { + assertTrue(embedding >= 0.0 && embedding <= 1.0); + } + }); + } + public void testTextEmbeddingProcessor_withBatchSizeInProcessor() throws Exception { String modelId = null; try { @@ -240,6 +258,49 @@ public void testTextEmbeddingProcessor_withFailureAndSkip() throws Exception { } } + @SuppressWarnings("unchecked") + public void testNestedFieldMapping_whenDocumentInListIngested_thenSuccessful() throws Exception { + String modelId = null; + try { + modelId = uploadTextEmbeddingModel(); + loadModel(modelId); + createPipelineProcessor(modelId, PIPELINE_NAME, ProcessorType.TEXT_EMBEDDING_WITH_NESTED_FIELDS_MAPPING); + createTextEmbeddingIndex(); + ingestDocument(INGEST_DOC5, "5"); + + assertDocWithLevel2AsList((Map) getDocById(INDEX_NAME, "5").get("_source")); + + NeuralQueryBuilder neuralQueryBuilderQuery = NeuralQueryBuilder.builder() + .fieldName(LEVEL_1_FIELD + "." + LEVEL_2_FIELD + "." + LEVEL_3_FIELD_CONTAINER + "." 
+ LEVEL_3_FIELD_EMBEDDING) + .queryText(QUERY_TEXT) + .modelId(modelId) + .k(10) + .build(); + + QueryBuilder queryNestedLowerLevel = QueryBuilders.nestedQuery( + LEVEL_1_FIELD + "." + LEVEL_2_FIELD, + neuralQueryBuilderQuery, + ScoreMode.Total + ); + QueryBuilder queryNestedHighLevel = QueryBuilders.nestedQuery(LEVEL_1_FIELD, queryNestedLowerLevel, ScoreMode.Total); + + Map searchResponseAsMap = search(INDEX_NAME, queryNestedHighLevel, 2); + assertNotNull(searchResponseAsMap); + + Map hits = (Map) searchResponseAsMap.get("hits"); + assertNotNull(hits); + + List> listOfHits = (List>) hits.get("hits"); + assertNotNull(listOfHits); + assertEquals(1, listOfHits.size()); + + Map innerHitDetails = listOfHits.getFirst(); + assertEquals("5", innerHitDetails.get("_id")); + } finally { + wipeOfTestResources(INDEX_NAME, PIPELINE_NAME, modelId, null); + } + } + private String uploadTextEmbeddingModel() throws Exception { String requestBody = Files.readString(Path.of(classLoader.getResource("processor/UploadModelRequestBody.json").toURI())); return registerModelGroupAndUploadModel(requestBody); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java index 731f8182f..8fedd1fca 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/TextEmbeddingProcessorTests.java @@ -486,28 +486,34 @@ public void testNestedFieldInMappingForListWithNestedObj_withIngestDocumentWitho ] */ - Map child1Level2 = buildObjMapWithSingleField(CHILD_1_TEXT_FIELD, TEXT_VALUE_1); - Map child1Level1 = buildObjMapWithSingleField(CHILD_FIELD_LEVEL_1, child1Level2); - Map child2Level2 = buildObjMapWithSingleField(CHILD_1_TEXT_FIELD, TEXT_VALUE_1); - child2Level2.put(CHILD_2_TEXT_FIELD, TEXT_VALUE_2); - child2Level2.put(CHILD_3_TEXT_FIELD, TEXT_VALUE_3); - Map child2Level1 = 
buildObjMapWithSingleField(CHILD_FIELD_LEVEL_1, child2Level2); - Map sourceAndMetadata = Map.of( - PARENT_FIELD, - Arrays.asList(child1Level1, child2Level1), - IndexFieldMapper.NAME, - "my_index" + Map child1Level2 = buildObjMap(Pair.of(CHILD_1_TEXT_FIELD, TEXT_VALUE_1)); + Map child1Level1 = buildObjMap(Pair.of(CHILD_FIELD_LEVEL_1, child1Level2)); + Map child2Level2 = buildObjMap( + Pair.of(CHILD_1_TEXT_FIELD, TEXT_VALUE_1), + Pair.of(CHILD_2_TEXT_FIELD, TEXT_VALUE_2), + Pair.of(CHILD_3_TEXT_FIELD, TEXT_VALUE_3) + ); + Map child2Level1 = buildObjMap(Pair.of(CHILD_FIELD_LEVEL_1, child2Level2)); + Map sourceAndMetadata = buildObjMap( + Pair.of(PARENT_FIELD, Arrays.asList(child1Level1, child2Level1)), + Pair.of(IndexFieldMapper.NAME, "my_index") ); IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>()); Map registry = new HashMap<>(); - Map config = new HashMap<>(); - config.put(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId"); - config.put( - TextEmbeddingProcessor.FIELD_MAP_FIELD, - Map.of( - PARENT_FIELD, - Map.of(CHILD_FIELD_LEVEL_1, Map.of(CHILD_1_TEXT_FIELD, String.join(".", CHILD_FIELD_LEVEL_2, CHILD_LEVEL_2_KNN_FIELD))) + Map config = buildObjMap( + Pair.of(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId"), + Pair.of( + TextEmbeddingProcessor.FIELD_MAP_FIELD, + buildObjMap( + Pair.of( + PARENT_FIELD, + Map.of( + CHILD_FIELD_LEVEL_1, + Map.of(CHILD_1_TEXT_FIELD, String.join(".", CHILD_FIELD_LEVEL_2, CHILD_LEVEL_2_KNN_FIELD)) + ) + ) + ) ) ); TextEmbeddingProcessor processor = (TextEmbeddingProcessor) textEmbeddingProcessorFactory.create( @@ -770,6 +776,103 @@ public void testBuildVectorOutput_withNestedMap_successful() { } } + @SneakyThrows + @SuppressWarnings("unchecked") + public void testBuildVectorOutput_withFlattenedNestedMap_successful() { + Map config = createNestedMapConfiguration(); + IngestDocument ingestDocument = createFlattenedNestedMapIngestDocument(); + TextEmbeddingProcessor processor = 
createInstanceWithNestedMapConfiguration(config); + processor.preprocessIngestDocument(ingestDocument); + Map knnMap = processor.buildMapWithTargetKeys(ingestDocument); + List> modelTensorList = createRandomOneDimensionalMockVector(2, 100, 0.0f, 1.0f); + processor.buildNLPResult(knnMap, modelTensorList, ingestDocument.getSourceAndMetadata()); + /** + * "favorites.favorite": { + * "movie": "matrix", + * "actor": "Charlie Chaplin", + * "games" : { + * "adventure": { + * "action": "overwatch", + * "rpg": "elden ring" + * } + * } + * } + */ + Map favoritesMap = (Map) ingestDocument.getSourceAndMetadata().get("favorites"); + assertNotNull(favoritesMap); + Map favorites = (Map) favoritesMap.get("favorite"); + assertNotNull(favorites); + + Map favoriteGames = (Map) favorites.get("games"); + assertNotNull(favoriteGames); + Map adventure = (Map) favoriteGames.get("adventure"); + List adventureKnnVector = (List) adventure.get("with_action_knn"); + assertNotNull(adventureKnnVector); + assertEquals(100, adventureKnnVector.size()); + for (float vector : adventureKnnVector) { + assertTrue(vector >= 0.0f && vector <= 1.0f); + } + + List favoriteKnnVector = (List) favorites.get("favorite_movie_knn"); + assertNotNull(favoriteKnnVector); + assertEquals(100, favoriteKnnVector.size()); + for (float vector : favoriteKnnVector) { + assertTrue(vector >= 0.0f && vector <= 1.0f); + } + } + + @SneakyThrows + @SuppressWarnings("unchecked") + public void testBuildVectorOutput_withFlattenedNestedMapAndList_successful() { + Map config = createNestedMapConfiguration(); + IngestDocument ingestDocument = createFlattenedNestedMapAndListIngestDocument(); + TextEmbeddingProcessor processor = createInstanceWithNestedMapConfiguration(config); + processor.preprocessIngestDocument(ingestDocument); + Map knnMap = processor.buildMapWithTargetKeys(ingestDocument); + List> modelTensorList = createRandomOneDimensionalMockVector(3, 100, 0.0f, 1.0f); + processor.buildNLPResult(knnMap, modelTensorList, 
ingestDocument.getSourceAndMetadata()); + /** + * "favorites.favorite": { + * "movie": "matrix", + * "actor": "Charlie Chaplin", + * "games" : [ + * { + * "adventure": { + * "action": "overwatch", + * "rpg": "elden ring" + * } + * }, + * { + * "adventure.action": "wukong" + * } + * ] + * } + */ + Map favoritesMap = (Map) ingestDocument.getSourceAndMetadata().get("favorites"); + assertNotNull(favoritesMap); + Map favorite = (Map) favoritesMap.get("favorite"); + assertNotNull(favorite); + + List> favoriteGames = (List>) favorite.get("games"); + assertNotNull(favoriteGames); + for (Map favoriteGame : favoriteGames) { + Map adventure = (Map) favoriteGame.get("adventure"); + List adventureKnnVector = (List) adventure.get("with_action_knn"); + assertNotNull(adventureKnnVector); + assertEquals(100, adventureKnnVector.size()); + for (float vector : adventureKnnVector) { + assertTrue(vector >= 0.0f && vector <= 1.0f); + } + } + + List favoriteKnnVector = (List) favorite.get("favorite_movie_knn"); + assertNotNull(favoriteKnnVector); + assertEquals(100, favoriteKnnVector.size()); + for (float vector : favoriteKnnVector) { + assertTrue(vector >= 0.0f && vector <= 1.0f); + } + } + public void testBuildVectorOutput_withNestedList_successful() { Map config = createNestedListConfiguration(); IngestDocument ingestDocument = createNestedListIngestDocument(); @@ -853,8 +956,8 @@ public void testBuildVectorOutput_withNestedListLevel2_withPartialNullNestedFiel * } */ List> nestedList = (List>) ingestDocument.getSourceAndMetadata().get("nestedField"); - Map objWithNullText = buildObjMapWithSingleField("textField", null); - Map nestedObjWithNullText = buildObjMapWithSingleField("nestedField", objWithNullText); + Map objWithNullText = buildObjMap(Pair.of("textField", null)); + Map nestedObjWithNullText = buildObjMap(Pair.of("nestedField", objWithNullText)); nestedList.set(0, nestedObjWithNullText); TextEmbeddingProcessor textEmbeddingProcessor = 
createInstanceWithNestedMapConfiguration(config); Map knnMap = textEmbeddingProcessor.buildMapWithTargetKeys(ingestDocument); @@ -1142,21 +1245,22 @@ private void assertMapWithNestedFields(Pair actual, List @SneakyThrows private TextEmbeddingProcessor createInstanceWithNestedMapConfiguration(Map fieldMap) { Map registry = new HashMap<>(); - Map config = new HashMap<>(); - config.put(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId"); - config.put(TextEmbeddingProcessor.FIELD_MAP_FIELD, fieldMap); + Map config = buildObjMap( + Pair.of(TextEmbeddingProcessor.MODEL_ID_FIELD, "mockModelId"), + Pair.of(TextEmbeddingProcessor.FIELD_MAP_FIELD, fieldMap) + ); return (TextEmbeddingProcessor) textEmbeddingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } private Map createPlainStringConfiguration() { - Map config = new HashMap<>(); - config.put("oriKey1", "oriKey1_knn"); - config.put("oriKey2", "oriKey2_knn"); - config.put("oriKey3", "oriKey3_knn"); - config.put("oriKey4", "oriKey4_knn"); - config.put("oriKey5", "oriKey5_knn"); - config.put("oriKey6", "oriKey6_knn"); - return config; + return buildObjMap( + Pair.of("oriKey1", "oriKey1_knn"), + Pair.of("oriKey2", "oriKey2_knn"), + Pair.of("oriKey3", "oriKey3_knn"), + Pair.of("oriKey4", "oriKey4_knn"), + Pair.of("oriKey5", "oriKey5_knn"), + Pair.of("oriKey6", "oriKey6_knn") + ); } /** @@ -1169,24 +1273,24 @@ private Map createPlainStringConfiguration() { * } */ private Map createNestedMapConfiguration() { - Map adventureGames = new HashMap<>(); - adventureGames.put("adventure.action", "with_action_knn"); - Map favorite = new HashMap<>(); - favorite.put("favorite.movie", "favorite_movie_knn"); - favorite.put("favorite.games", adventureGames); - Map result = new HashMap<>(); - result.put("favorites", favorite); + Map adventureGames = buildObjMap(Pair.of("adventure.action", "with_action_knn")); + Map favorite = buildObjMap( + Pair.of("favorite.movie", "favorite_movie_knn"), + Pair.of("favorite.games", 
adventureGames) + ); + Map result = buildObjMap(Pair.of("favorites", favorite)); return result; } private IngestDocument createPlainIngestDocument() { - Map result = new HashMap<>(); - result.put("oriKey1", "oriValue1"); - result.put("oriKey2", "oriValue2"); - result.put("oriKey3", "oriValue3"); - result.put("oriKey4", "oriValue4"); - result.put("oriKey5", "oriValue5"); - result.put("oriKey6", ImmutableList.of("oriValue6", "oriValue7")); + Map result = buildObjMap( + Pair.of("oriKey1", "oriValue1"), + Pair.of("oriKey2", "oriValue2"), + Pair.of("oriKey3", "oriValue3"), + Pair.of("oriKey4", "oriValue4"), + Pair.of("oriKey5", "oriValue5"), + Pair.of("oriKey6", ImmutableList.of("oriValue6", "oriValue7")) + ); return new IngestDocument(result, new HashMap<>()); } @@ -1206,81 +1310,131 @@ private IngestDocument createPlainIngestDocument() { * } */ private IngestDocument createNestedMapIngestDocument() { - Map adventureGames = new HashMap<>(); - adventureGames.put("action", "overwatch"); - adventureGames.put("rpg", "elden ring"); - Map favGames = new HashMap<>(); - favGames.put("adventure", adventureGames); - Map favorites = new HashMap<>(); - favorites.put("movie", "matrix"); - favorites.put("games", favGames); - favorites.put("actor", "Charlie Chaplin"); - Map favorite = new HashMap<>(); - favorite.put("favorite", favorites); - Map result = new HashMap<>(); - result.put("favorites", favorite); + Map adventureGames = buildObjMap(Pair.of("action", "overwatch"), Pair.of("rpg", "elden ring")); + Map favGames = buildObjMap(Pair.of("adventure", adventureGames)); + Map favorites = buildObjMap( + Pair.of("movie", "matrix"), + Pair.of("games", favGames), + Pair.of("actor", "Charlie Chaplin") + ); + Map favorite = buildObjMap(Pair.of("favorite", favorites)); + Map result = buildObjMap(Pair.of("favorites", favorite)); + return new IngestDocument(result, new HashMap<>()); + } + + /** + * Create following document with flattened nested map + * "favorites.favorite": { + * "movie": 
"matrix", + * "actor": "Charlie Chaplin", + * "games" : { + * "adventure": { + * "action": "overwatch", + * "rpg": "elden ring" + * } + * } + * } + */ + private IngestDocument createFlattenedNestedMapIngestDocument() { + Map adventureGames = buildObjMap(Pair.of("action", "overwatch"), Pair.of("rpg", "elden ring")); + Map favGames = buildObjMap(Pair.of("adventure", adventureGames)); + Map favorites = buildObjMap( + Pair.of("movie", "matrix"), + Pair.of("games", favGames), + Pair.of("actor", "Charlie Chaplin") + ); + Map result = buildObjMap(Pair.of("favorites.favorite", favorites)); + return new IngestDocument(result, new HashMap<>()); + } + + /** + * Create following document with flattened nested map and list + * "favorites.favorite": { + * "movie": "matrix", + * "actor": "Charlie Chaplin", + * "games" : [ + * { + * "adventure": { + * "action": "overwatch", + * "rpg": "elden ring" + * } + * }, + * { + * "adventure.action": "wukong" + * } + * ] + * } + */ + private IngestDocument createFlattenedNestedMapAndListIngestDocument() { + Map adventureGames = buildObjMap(Pair.of("action", "overwatch"), Pair.of("rpg", "elden ring")); + Map game1 = buildObjMap(Pair.of("adventure", adventureGames)); + Map game2 = buildObjMap(Pair.of("adventure.action", "wukong")); + Map favorites = buildObjMap( + Pair.of("movie", "matrix"), + Pair.of("games", Arrays.asList(game1, game2)), + Pair.of("actor", "Charlie Chaplin") + ); + Map result = buildObjMap(Pair.of("favorites.favorite", favorites)); return new IngestDocument(result, new HashMap<>()); } private Map createNestedListConfiguration() { - Map nestedConfig = buildObjMapWithSingleField("textField", "vectorField"); - return buildObjMapWithSingleField("nestedField", nestedConfig); + Map nestedConfig = buildObjMap(Pair.of("textField", "vectorField")); + return buildObjMap(Pair.of("nestedField", nestedConfig)); } private Map createNestedList2LevelConfiguration() { - Map nestedConfig = buildObjMapWithSingleField("textField", 
"vectorField"); - Map nestConfigLevel1 = buildObjMapWithSingleField("nestedField", nestedConfig); - return buildObjMapWithSingleField("nestedField", nestConfigLevel1); + Map nestedConfig = buildObjMap(Pair.of("textField", "vectorField")); + Map nestConfigLevel1 = buildObjMap(Pair.of("nestedField", nestedConfig)); + return buildObjMap(Pair.of("nestedField", nestConfigLevel1)); } private IngestDocument createNestedListIngestDocument() { - Map nestedObj1 = buildObjMapWithSingleField("textField", "This is a text field"); - Map nestedObj2 = buildObjMapWithSingleField("textField", "This is another text field"); - Map nestedList = buildObjMapWithSingleField("nestedField", Arrays.asList(nestedObj1, nestedObj2)); + Map nestedObj1 = buildObjMap(Pair.of("textField", "This is a text field")); + Map nestedObj2 = buildObjMap(Pair.of("textField", "This is another text field")); + Map nestedList = buildObjMap(Pair.of("nestedField", Arrays.asList(nestedObj1, nestedObj2))); return new IngestDocument(nestedList, new HashMap<>()); } private IngestDocument createNestedListWithNotEmbeddingFieldIngestDocument() { - Map nestedObj1 = buildObjMapWithSingleField("textFieldNotForEmbedding", "This is a text field"); - Map nestedObj2 = buildObjMapWithSingleField("textField", "This is another text field"); - Map nestedList = buildObjMapWithSingleField("nestedField", Arrays.asList(nestedObj1, nestedObj2)); + Map nestedObj1 = buildObjMap(Pair.of("textFieldNotForEmbedding", "This is a text field")); + Map nestedObj2 = buildObjMap(Pair.of("textField", "This is another text field")); + Map nestedList = buildObjMap(Pair.of("nestedField", Arrays.asList(nestedObj1, nestedObj2))); return new IngestDocument(nestedList, new HashMap<>()); } private IngestDocument create2LevelNestedListIngestDocument() { - Map nestedObj1 = buildObjMapWithSingleField("textField", "This is a text field"); - Map nestedObj2 = buildObjMapWithSingleField("textField", "This is another text field"); - Map nestedList = 
buildObjMapWithSingleField("nestedField", Arrays.asList(nestedObj1, nestedObj2)); - Map nestedList1 = buildObjMapWithSingleField("nestedField", nestedList); + Map nestedObj1 = buildObjMap(Pair.of("textField", "This is a text field")); + Map nestedObj2 = buildObjMap(Pair.of("textField", "This is another text field")); + Map nestedList = buildObjMap(Pair.of("nestedField", Arrays.asList(nestedObj1, nestedObj2))); + Map nestedList1 = buildObjMap(Pair.of("nestedField", nestedList)); return new IngestDocument(nestedList1, new HashMap<>()); } private IngestDocument create2LevelNestedListWithNestedFieldsIngestDocument() { - Map nestedObj1Level2 = buildObjMapWithSingleField("textField", "This is a text field"); - Map nestedObj1Level1 = buildObjMapWithSingleField("nestedField", nestedObj1Level2); + Map nestedObj1Level2 = buildObjMap(Pair.of("textField", "This is a text field")); + Map nestedObj1Level1 = buildObjMap(Pair.of("nestedField", nestedObj1Level2)); - Map nestedObj2Level2 = buildObjMapWithSingleField("textField", "This is another text field"); - Map nestedObj2Level1 = buildObjMapWithSingleField("nestedField", nestedObj2Level2); + Map nestedObj2Level2 = buildObjMap(Pair.of("textField", "This is another text field")); + Map nestedObj2Level1 = buildObjMap(Pair.of("nestedField", nestedObj2Level2)); - Map nestedList = buildObjMapWithSingleField("nestedField", Arrays.asList(nestedObj1Level1, nestedObj2Level1)); + Map nestedList = buildObjMap(Pair.of("nestedField", Arrays.asList(nestedObj1Level1, nestedObj2Level1))); return new IngestDocument(nestedList, new HashMap<>()); } - private Map buildObjMapWithSingleField(String fieldName, Object fieldValue) { + private Map buildObjMap(Pair... 
pairs) { Map objMap = new HashMap<>(); - objMap.put(fieldName, fieldValue); + for (Pair pair : pairs) { + objMap.put(pair.getKey(), pair.getValue()); + } return objMap; } private IngestDocument create2LevelNestedListWithNotEmbeddingFieldIngestDocument() { - HashMap nestedObj1 = new HashMap<>(); - nestedObj1.put("textFieldNotForEmbedding", "This is a text field"); - HashMap nestedObj2 = new HashMap<>(); - nestedObj2.put("textField", "This is another text field"); - HashMap nestedList = new HashMap<>(); - nestedList.put("nestedField", Arrays.asList(nestedObj1, nestedObj2)); - HashMap nestedList1 = new HashMap<>(); - nestedList1.put("nestedField", nestedList); + Map nestedObj1 = buildObjMap(Pair.of("textFieldNotForEmbedding", "This is a text field")); + Map nestedObj2 = buildObjMap(Pair.of("textField", "This is another text field")); + Map nestedList = buildObjMap(Pair.of("nestedField", Arrays.asList(nestedObj1, nestedObj2))); + Map nestedList1 = buildObjMap(Pair.of("nestedField", nestedList)); return new IngestDocument(nestedList1, new HashMap<>()); } } diff --git a/src/test/java/org/opensearch/neuralsearch/util/ProcessorDocumentUtilsTests.java b/src/test/java/org/opensearch/neuralsearch/util/ProcessorDocumentUtilsTests.java index 068edcf2f..2a08a350d 100644 --- a/src/test/java/org/opensearch/neuralsearch/util/ProcessorDocumentUtilsTests.java +++ b/src/test/java/org/opensearch/neuralsearch/util/ProcessorDocumentUtilsTests.java @@ -18,6 +18,10 @@ import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; import java.util.Map; import static org.mockito.ArgumentMatchers.anyString; @@ -37,7 +41,7 @@ public void setup() { MockitoAnnotations.openMocks(this); } - public void test_with_different_configurations() throws URISyntaxException, IOException { + public void testValidateMapTypeValue_withDifferentConfigurations_thenSuccess() throws 
URISyntaxException, IOException { Settings settings = Settings.builder().put("index.mapping.depth.limit", 20).build(); when(clusterService.state().metadata().index(anyString()).getSettings()).thenReturn(settings); String processorDocumentTestJson = Files.readString( @@ -80,4 +84,144 @@ public void test_with_different_configurations() throws URISyntaxException, IOEx } } + public void testUnflatten_withSimpleDotNotation_thenSuccess() { + Map<String, Object> input = Map.of("a.b", "c"); + + Map<String, Object> nested = Map.of("b", "c"); + Map<String, Object> expected = Map.of("a", nested); + + Map<String, Object> result = ProcessorDocumentUtils.unflattenJson(input); + assertEquals(expected, result); + } + + public void testUnflatten_withSimpleNoDot_thenSuccess() { + Map<String, Object> nestedA = Map.of("b", "c"); + Map<String, Object> input = Map.of("a", nestedA); + + Map<String, Object> result = ProcessorDocumentUtils.unflattenJson(input); + assertEquals(input, result); + } + + public void testUnflatten_withMultipleDotNotation_thenSuccess() { + Map<String, Object> input = Map.of("a.b.c", "d", "a.b.e", "f", "x.y", "z"); + + Map<String, Object> nestedAB = Map.of("c", "d", "e", "f"); + Map<String, Object> nestedA = Map.of("b", nestedAB); + Map<String, Object> nestedX = Map.of("y", "z"); + + Map<String, Object> expected = Map.of("a", nestedA, "x", nestedX); + + Map<String, Object> result = ProcessorDocumentUtils.unflattenJson(input); + assertEquals(expected, result); + } + + public void testUnflatten_withList_thenSuccess() { + Map<String, Object> map1 = Map.of("b.c", "d"); + Map<String, Object> map2 = Map.of("b.c", "e"); + List<Map<String, Object>> list = Arrays.asList(map1, map2); + Map<String, Object> input = Map.of("a", list); + + Map<String, Object> nestedB1 = Map.of("c", "d"); + Map<String, Object> expectedMap1 = Map.of("b", nestedB1); + Map<String, Object> nestedB2 = Map.of("c", "e"); + Map<String, Object> expectedMap2 = Map.of("b", nestedB2); + + List<Map<String, Object>> expectedList = Arrays.asList(expectedMap1, expectedMap2); + + Map<String, Object> expected = Map.of("a", expectedList); + + Map<String, Object> result = ProcessorDocumentUtils.unflattenJson(input); + assertEquals(expected, result); + } + + public void testUnflatten_withMixedContent_thenSuccess() { + Map<String, Object> input = Map.of("a.b", "c", "d", "e", "f.g.h", "i"); + + Map<String, Object> nestedA = 
Map.of("b", "c"); + Map<String, Object> nestedG = Map.of("h", "i"); + Map<String, Object> nestedF = Map.of("g", nestedG); + Map<String, Object> expected = Map.of("a", nestedA, "d", "e", "f", nestedF); + + Map<String, Object> result = ProcessorDocumentUtils.unflattenJson(input); + assertEquals(expected, result); + } + + public void testUnflatten_withEmptyMap_thenSuccess() { + Map<String, Object> result = ProcessorDocumentUtils.unflattenJson(Map.of()); + assertTrue(result.isEmpty()); + } + + public void testUnflatten_withNullValue_thenSuccess() { + Map<String, Object> input = new HashMap<>(); + input.put("a.b", null); + Map<String, Object> nested = new HashMap<>(); + nested.put("b", null); + Map<String, Object> expected = Map.of("a", nested); + + Map<String, Object> result = ProcessorDocumentUtils.unflattenJson(input); + assertEquals(expected, result); + } + + public void testUnflatten_withNestedListWithMultipleLevels_thenSuccess() { + Map<String, Object> map1 = Map.of("b.c.d", "e"); + Map<String, Object> map2 = Map.of("b.c.f", "g"); + List<Map<String, Object>> outerList = Arrays.asList(map1, map2); + + Map<String, Object> input = Map.of("a", outerList); + + Map<String, Object> nestedC1 = Map.of("d", "e"); + Map<String, Object> nestedB1 = Map.of("c", nestedC1); + Map<String, Object> expectedMap1 = Map.of("b", nestedB1); + Map<String, Object> nestedC2 = Map.of("f", "g"); + Map<String, Object> nestedB2 = Map.of("c", nestedC2); + Map<String, Object> expectedMap2 = Map.of("b", nestedB2); + List<Map<String, Object>> expectedOuterList = Arrays.asList(expectedMap1, expectedMap2); + + Map<String, Object> expected = Map.of("a", expectedOuterList); + + Map<String, Object> result = ProcessorDocumentUtils.unflattenJson(input); + assertEquals(expected, result); + } + + public void testUnflatten_withNullInput_thenFail() { + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> ProcessorDocumentUtils.unflattenJson(null) + ); + + assertEquals("originalJsonMap cannot be null", illegalArgumentException.getMessage()); + } + + public void testUnflatten_withSimpleField_withLeadingDots_thenFail() { + String fieldName = ".a.b.c"; + Map<String, Object> input = Map.of(fieldName, "d"); + testUnflatten_withInvalidUsageOfDots_thenFail(fieldName, input); + } + + public void 
testUnflatten_withSimpleField_withInBetweenMultiDots_thenFail() { + String fieldName = "a..b.c"; + Map<String, Object> input = Map.of(fieldName, "d"); + testUnflatten_withInvalidUsageOfDots_thenFail(fieldName, input); + } + + public void testUnflatten_withSimpleField_withTrailingDots_thenFail() { + String fieldName = "a.b.c."; + Map<String, Object> input = Map.of(fieldName, "d"); + testUnflatten_withInvalidUsageOfDots_thenFail(fieldName, input); + } + + public void testUnflatten_withNestedField_withTrailingDots_thenFail() { + String fieldName = "b.c.d."; + Map<String, Object> input = Map.of("a", Map.of(fieldName, "e")); + testUnflatten_withInvalidUsageOfDots_thenFail(fieldName, input); + } + + private void testUnflatten_withInvalidUsageOfDots_thenFail(String fieldName, Map<String, Object> input) { + IllegalArgumentException illegalArgumentException = assertThrows( + IllegalArgumentException.class, + () -> ProcessorDocumentUtils.unflattenJson(input) + ); + assert (illegalArgumentException.getMessage() + .contains(String.format(Locale.ROOT, "Field name '%s' contains invalid dot usage", fieldName))); + } } diff --git a/src/test/resources/processor/ingest_doc5.json b/src/test/resources/processor/ingest_doc5.json new file mode 100644 index 000000000..18a6e6922 --- /dev/null +++ b/src/test/resources/processor/ingest_doc5.json @@ -0,0 +1,24 @@ +{ + "title": "This is a good day", + "description": "daily logging", + "favor_list": [ + "key", + "hey", + "click" + ], + "favorites": { + "game": "cossacks", + "movie": "matrix" + }, + "nested_passages":[ + { + "level_2": + { + "level_3_text": "clown" + } + }, + { + "level_2.level_3_text": "batman" + } + ] +}