From 56b61c2774c47835dda8b06bfec9c7ee49a299fe Mon Sep 17 00:00:00 2001 From: Tyler Ohlsen Date: Fri, 27 Dec 2024 14:38:36 -0800 Subject: [PATCH] Auto-update mappings Signed-off-by: Tyler Ohlsen --- common/constants.ts | 92 +++++++++--- .../ingest_inputs/advanced_settings.tsx | 57 +++++++- .../new_workflow/quick_configure_inputs.tsx | 4 +- public/utils/utils.ts | 134 ++++++++++++++++-- 4 files changed, 249 insertions(+), 38 deletions(-) diff --git a/common/constants.ts b/common/constants.ts index 76435d0b..e52e4028 100644 --- a/common/constants.ts +++ b/common/constants.ts @@ -77,31 +77,81 @@ export const SEARCH_CONNECTORS_NODE_API_PATH = `${BASE_CONNECTOR_NODE_API_PATH}/ * based on the specified remote model from a remote service, if found */ -// Cohere -export const COHERE_DIMENSIONS = { - [`embed-english-v3.0`]: 1024, - [`embed-english-light-v3.0`]: 384, - [`embed-multilingual-v3.0`]: 1024, - [`embed-multilingual-light-v3.0`]: 384, - [`embed-english-v2.0`]: 4096, - [`embed-english-light-v2.0`]: 1024, - [`embed-multilingual-v2.0`]: 768, +interface RemoteEmbeddingModelConfig { + dimension: number; + fieldName: string; +} + +// Amazon BedRock +export const BEDROCK_CONFIGS = { + [`amazon.titan-embed-text-v1`]: { + dimension: 1536, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`amazon.titan-embed-text-v2`]: { + dimension: 1024, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`amazon.titan-embed-image-v1`]: { + dimension: 1024, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`cohere.embed-english-v3`]: { + dimension: 1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`cohere.embed-multilingual-v3`]: { + dimension: 1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, }; -// OpenAI -export const OPENAI_DIMENSIONS = { - [`text-embedding-3-small`]: 1536, - [`text-embedding-3-large`]: 3072, - [`text-embedding-ada-002`]: 1536, +// Cohere +export const COHERE_CONFIGS = { + [`embed-english-v3.0`]: { + dimension: 1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-english-light-v3.0`]: { + dimension: 384, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-multilingual-v3.0`]: { + dimension: 1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-multilingual-light-v3.0`]: { + dimension: 384, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-english-v2.0`]: { + dimension: 4096, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-english-light-v2.0`]: { + dimension: 1024, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, + [`embed-multilingual-v2.0`]: { + dimension: 768, + fieldName: 'embeddings', + } as RemoteEmbeddingModelConfig, }; -// Amazon BedRock -export const BEDROCK_DIMENSIONS = { - [`amazon.titan-embed-text-v1`]: 1536, - [`amazon.titan-embed-text-v2`]: 1024, - [`amazon.titan-embed-image-v1`]: 1024, - [`cohere.embed-english-v3`]: 1024, // same as Cohere directly - [`cohere.embed-multilingual-v3`]: 1024, // same as Cohere directly +// OpenAI +export const OPENAI_CONFIGS = { + [`text-embedding-3-small`]: { + dimension: 1536, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`text-embedding-3-large`]: { + dimension: 3072, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, + [`text-embedding-ada-002`]: { + dimension: 1536, + fieldName: 'embedding', + } as RemoteEmbeddingModelConfig, }; /** diff --git a/public/pages/workflow_detail/workflow_inputs/ingest_inputs/advanced_settings.tsx b/public/pages/workflow_detail/workflow_inputs/ingest_inputs/advanced_settings.tsx index a9bf8f42..cfbebf13 100644 --- a/public/pages/workflow_detail/workflow_inputs/ingest_inputs/advanced_settings.tsx +++ b/public/pages/workflow_detail/workflow_inputs/ingest_inputs/advanced_settings.tsx @@ -17,9 +17,12 @@ import { getIn, useFormikContext } from 'formik'; import { WorkflowFormValues } from '../../../../../common'; import { AppState } from '../../../../store'; import { - getEmbeddingDimensions, + getEmbeddingField, + getEmbeddingModelDimensions, + getUpdatedIndexMappings, getUpdatedIndexSettings, isKnnIndex, + removeVectorFieldFromIndexMappings, } from '../../../../utils'; interface AdvancedSettingsProps {} @@ -30,8 +33,10 @@ interface AdvancedSettingsProps {} export function AdvancedSettings(props: AdvancedSettingsProps) { const { values, setFieldValue } = useFormikContext(); const { models, connectors } = useSelector((state: AppState) => state.ml); - const ingestProcessors = Object.values(values?.ingest?.enrich) as []; - const ingestProcessorModelIds = ingestProcessors + const ingestMLProcessors = (Object.values( + values?.ingest?.enrich + ) as any[]).filter((ingestProcessor) => ingestProcessor?.model !== undefined); + const ingestProcessorModelIds = ingestMLProcessors .map((ingestProcessor) => ingestProcessor?.model?.id as string | undefined) .filter((modelId) => !isEmpty(modelId)); const indexMappingsPath = 'ingest.index.mappings'; @@ -40,7 +45,7 @@ export function AdvancedSettings(props: AdvancedSettingsProps) { const curSettings = getIn(values, indexSettingsPath); // listen on when processor with models are added / removed. dynamically update index - // mappings and settings, if applicable. + // settings to be knn-enabled or knn-disabled. useEffect(() => { if (ingestProcessorModelIds.length > 0) { ingestProcessorModelIds.forEach((ingestProcessorModelId) => { @@ -49,9 +54,11 @@ export function AdvancedSettings(props: AdvancedSettingsProps) { ); if (processorModel?.connectorId !== undefined) { const processorConnector = connectors[processorModel?.connectorId]; - const dimension = getEmbeddingDimensions(processorConnector); + const dimension = getEmbeddingModelDimensions(processorConnector); + + // If a dimension is found, it is a known embedding model. + // Ensure the index is configured to be knn-enabled. if (dimension !== undefined) { - // TODO: update mappings if (!isKnnIndex(curSettings)) { setFieldValue( indexSettingsPath, @@ -62,7 +69,6 @@ export function AdvancedSettings(props: AdvancedSettingsProps) { } }); } else { - // TODO: update mappings if (isKnnIndex(curSettings)) { setFieldValue( indexSettingsPath, @@ -72,6 +78,43 @@ export function AdvancedSettings(props: AdvancedSettingsProps) { } }, [ingestProcessorModelIds.length]); + // listener on when there are updates to any ingest processors. Try to update + // any index mappings accordingly, such as setting the knn_vector mappings + // for models that output vector embeddings, or removing any mappings, if no ML + // processor defined. + useEffect(() => { + if (ingestMLProcessors.length > 0) { + ingestMLProcessors.forEach((ingestMLProcessor) => { + const processorModel = Object.values(models).find( + (model) => model.id === ingestMLProcessor?.model?.id + ); + if (processorModel?.connectorId !== undefined) { + const processorConnector = connectors[processorModel?.connectorId]; + const dimension = getEmbeddingModelDimensions(processorConnector); + const embeddingFieldName = getEmbeddingField( + processorConnector, + ingestMLProcessor + ); + if (embeddingFieldName !== undefined && dimension !== undefined) { + setFieldValue( + indexMappingsPath, + getUpdatedIndexMappings( + curMappings, + embeddingFieldName, + dimension + ) + ); + } + } + }); + } else { + setFieldValue( + indexMappingsPath, + removeVectorFieldFromIndexMappings(curMappings) + ); + } + }, [getIn(values, 'ingest.enrich')]); + return ( diff --git a/public/pages/workflows/new_workflow/quick_configure_inputs.tsx b/public/pages/workflows/new_workflow/quick_configure_inputs.tsx index 7b062c5f..90626b5a 100644 --- a/public/pages/workflows/new_workflow/quick_configure_inputs.tsx +++ b/public/pages/workflows/new_workflow/quick_configure_inputs.tsx @@ -27,7 +27,7 @@ import { WORKFLOW_TYPE, } from '../../../../common'; import { AppState } from '../../../store'; -import { getEmbeddingDimensions, parseModelInputs } from '../../../utils'; +import { getEmbeddingModelDimensions, parseModelInputs } from '../../../utils'; import { get } from 'lodash'; interface QuickConfigureInputsProps { @@ -120,7 +120,7 @@ export function QuickConfigureInputs(props: QuickConfigureInputsProps) { if (connector !== undefined) { setFieldValues({ ...fieldValues, - embeddingLength: getEmbeddingDimensions(connector), + embeddingLength: getEmbeddingModelDimensions(connector), }); } } diff --git a/public/utils/utils.ts b/public/utils/utils.ts index acf5aaf4..c33a290b 100644 --- a/public/utils/utils.ts +++ b/public/utils/utils.ts @@ -5,7 +5,7 @@ import yaml from 'js-yaml'; import jsonpath from 'jsonpath'; -import { escape, get, isEmpty, set } from 'lodash'; +import { escape, findKey, get, isEmpty, set, unset } from 'lodash'; import semver from 'semver'; import queryString from 'query-string'; import { useLocation } from 'react-router-dom'; @@ -27,10 +27,12 @@ import { WORKFLOW_STEP_TYPE, Workflow, WorkflowResource, - BEDROCK_DIMENSIONS, - COHERE_DIMENSIONS, - OPENAI_DIMENSIONS, + BEDROCK_CONFIGS, + COHERE_CONFIGS, + OPENAI_CONFIGS, customStringify, + NO_TRANSFORMATION, + TRANSFORM_TYPE, } from '../../common'; import { getCore, getDataSourceEnabled } from '../services'; import { @@ -40,6 +42,7 @@ import { ModelInputMap, ModelOutputMap, OutputMapEntry, + OutputMapFormValue, QueryParam, } from '../../common/interfaces'; import * as pluginManifest from '../../opensearch_dashboards.json'; @@ -610,7 +613,7 @@ export function injectParameters( } // Fetch embedding dimensions, if the selected model is a known one -export function getEmbeddingDimensions( +export function getEmbeddingModelDimensions( connector: Connector ): number | undefined { // some APIs allow specifically setting the dimensions at runtime, @@ -620,11 +623,11 @@ export function getEmbeddingDimensions( } else if (connector.parameters?.model !== undefined) { return ( // @ts-ignore - COHERE_DIMENSIONS[connector.parameters?.model] || + COHERE_CONFIGS[connector.parameters?.model]?.dimension || // @ts-ignore - OPENAI_DIMENSIONS[connector.parameters?.model] || + OPENAI_CONFIGS[connector.parameters?.model]?.dimension || // @ts-ignore - BEDROCK_DIMENSIONS[connector.parameters?.model] + BEDROCK_CONFIGS[connector.parameters?.model]?.dimension ); } else { return undefined; @@ -655,3 +658,118 @@ export function getUpdatedIndexSettings( return existingSettings; } } + +// Get any embedding fields, if a known embedding model +function getEmbeddingFieldFromConnector( + connector: Connector +): string | undefined { + if (connector.parameters?.model !== undefined) { + return ( + // @ts-ignore + COHERE_CONFIGS[connector.parameters?.model]?.fieldName || + // @ts-ignore + OPENAI_CONFIGS[connector.parameters?.model]?.fieldName || + // @ts-ignore + BEDROCK_CONFIGS[connector.parameters?.model]?.fieldName + ); + } else { + return undefined; + } +} + +// Try to determine the embedding field based on the processor config. +// First check if it is a known model, then make a best guess based on +// the output map configuration, if there is any transformations made +export function getEmbeddingField( + connector: Connector, + processorConfig: any +): string | undefined { + let embeddingField = getEmbeddingFieldFromConnector(connector); + const outputMap = processorConfig?.output_map as OutputMapFormValue; + if ( + outputMap !== undefined && + outputMap[0] !== undefined && + Array.isArray(outputMap[0]) && + outputMap[0].length > 0 + ) { + const relevantOutputMapEntry = + embeddingField !== undefined + ? outputMap[0].find( + (outputMapEntry) => outputMapEntry.key === embeddingField + ) + : outputMap[0][0]; + switch (relevantOutputMapEntry?.value?.transformType) { + case TRANSFORM_TYPE.FIELD: { + embeddingField = relevantOutputMapEntry?.value?.value; + break; + } + case TRANSFORM_TYPE.EXPRESSION: { + embeddingField = get(relevantOutputMapEntry, 'value.nestedVars.0.name'); + break; + } + case NO_TRANSFORMATION: + case undefined: + default: { + embeddingField = relevantOutputMapEntry?.key; + break; + } + } + // if (relevantOutputMapEntry?.value?.transformType === NO_TRANSFORMATION) { + // embeddingField = relevantOutputMapEntry?.key; + // } else if ( + // relevantOutputMapEntry?.value?.transformType === TRANSFORM_TYPE.FIELD + // ) { + // embeddingField = relevantOutputMapEntry?.value?.value; + // } else if ( + // relevantOutputMapEntry?.value?.transformType === TRANSFORM_TYPE.EXPRESSION + // ) { + // embeddingField = get(relevantOutputMapEntry, 'value.nestedVars.0.name'); + // } + } + return embeddingField; +} + +// Update the index mappings based on parameters passed. +// Currently used for updating the knn_vector field configuration, & removing +// any old/existing knn_vector field in the process. +export function getUpdatedIndexMappings( + existingMappings: string, + embeddingFieldName: string, + dimension: number +): string { + try { + const mappingsWithRemovedVectorField = removeVectorFieldFromIndexMappings( + existingMappings + ); + return customStringify( + set( + JSON.parse(mappingsWithRemovedVectorField), + `properties.${embeddingFieldName}`, + { + type: 'knn_vector', + dimension, + } + ) + ); + } catch { + return existingMappings; + } +} + +export function removeVectorFieldFromIndexMappings( + existingMappings: string +): string { + try { + let existingMappingsObj = JSON.parse(existingMappings); + const existingEmbeddingField = findKey( + existingMappingsObj?.properties, + (field) => field.type === 'knn_vector' + ); + if (existingEmbeddingField !== undefined) { + unset(existingMappingsObj?.properties, existingEmbeddingField); + } + return customStringify(existingMappingsObj); + } catch { + return existingMappings; + } +}