Skip to content

Commit

Permalink
Auto-update mappings
Browse files Browse the repository at this point in the history
Signed-off-by: Tyler Ohlsen <[email protected]>
  • Loading branch information
ohltyler committed Dec 27, 2024
1 parent 29e3a3c commit 56b61c2
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 38 deletions.
92 changes: 71 additions & 21 deletions common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,31 +77,81 @@ export const SEARCH_CONNECTORS_NODE_API_PATH = `${BASE_CONNECTOR_NODE_API_PATH}/
* based on the specified remote model from a remote service, if found
*/

// Cohere
export const COHERE_DIMENSIONS = {
[`embed-english-v3.0`]: 1024,
[`embed-english-light-v3.0`]: 384,
[`embed-multilingual-v3.0`]: 1024,
[`embed-multilingual-light-v3.0`]: 384,
[`embed-english-v2.0`]: 4096,
[`embed-english-light-v2.0`]: 1024,
[`embed-multilingual-v2.0`]: 768,
/**
 * Static facts recorded for a known remote embedding model.
 */
interface RemoteEmbeddingModelConfig {
  // name of the field associated with the model's embedding output
  fieldName: string;
  // number of dimensions of the embedding vectors
  dimension: number;
}

/**
 * Amazon Bedrock: known embedding models, keyed by model name.
 * Each entry records the embedding dimension and the embedding field name.
 */
export const BEDROCK_CONFIGS = {
  // Amazon Titan models
  'amazon.titan-embed-text-v1': {
    dimension: 1536,
    fieldName: 'embedding',
  } as RemoteEmbeddingModelConfig,
  'amazon.titan-embed-text-v2': {
    dimension: 1024,
    fieldName: 'embedding',
  } as RemoteEmbeddingModelConfig,
  'amazon.titan-embed-image-v1': {
    dimension: 1024,
    fieldName: 'embedding',
  } as RemoteEmbeddingModelConfig,
  // Cohere models exposed through Bedrock
  'cohere.embed-english-v3': {
    dimension: 1024,
    fieldName: 'embeddings',
  } as RemoteEmbeddingModelConfig,
  'cohere.embed-multilingual-v3': {
    dimension: 1024,
    fieldName: 'embeddings',
  } as RemoteEmbeddingModelConfig,
};

// OpenAI
export const OPENAI_DIMENSIONS = {
[`text-embedding-3-small`]: 1536,
[`text-embedding-3-large`]: 3072,
[`text-embedding-ada-002`]: 1536,
/**
 * Cohere: known embedding models, keyed by model name.
 * Each entry records the embedding dimension and the embedding field name.
 */
export const COHERE_CONFIGS = {
  // v3 models
  'embed-english-v3.0': {
    dimension: 1024,
    fieldName: 'embeddings',
  } as RemoteEmbeddingModelConfig,
  'embed-english-light-v3.0': {
    dimension: 384,
    fieldName: 'embeddings',
  } as RemoteEmbeddingModelConfig,
  'embed-multilingual-v3.0': {
    dimension: 1024,
    fieldName: 'embeddings',
  } as RemoteEmbeddingModelConfig,
  'embed-multilingual-light-v3.0': {
    dimension: 384,
    fieldName: 'embeddings',
  } as RemoteEmbeddingModelConfig,
  // v2 models
  'embed-english-v2.0': {
    dimension: 4096,
    fieldName: 'embeddings',
  } as RemoteEmbeddingModelConfig,
  'embed-english-light-v2.0': {
    dimension: 1024,
    fieldName: 'embeddings',
  } as RemoteEmbeddingModelConfig,
  'embed-multilingual-v2.0': {
    dimension: 768,
    fieldName: 'embeddings',
  } as RemoteEmbeddingModelConfig,
};

// Amazon BedRock
export const BEDROCK_DIMENSIONS = {
[`amazon.titan-embed-text-v1`]: 1536,
[`amazon.titan-embed-text-v2`]: 1024,
[`amazon.titan-embed-image-v1`]: 1024,
[`cohere.embed-english-v3`]: 1024, // same as Cohere directly
[`cohere.embed-multilingual-v3`]: 1024, // same as Cohere directly
/**
 * OpenAI: known embedding models, keyed by model name.
 * Each entry records the embedding dimension and the embedding field name.
 */
export const OPENAI_CONFIGS = {
  'text-embedding-3-small': {
    dimension: 1536,
    fieldName: 'embedding',
  } as RemoteEmbeddingModelConfig,
  'text-embedding-3-large': {
    dimension: 3072,
    fieldName: 'embedding',
  } as RemoteEmbeddingModelConfig,
  'text-embedding-ada-002': {
    dimension: 1536,
    fieldName: 'embedding',
  } as RemoteEmbeddingModelConfig,
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ import { getIn, useFormikContext } from 'formik';
import { WorkflowFormValues } from '../../../../../common';
import { AppState } from '../../../../store';
import {
getEmbeddingDimensions,
getEmbeddingField,
getEmbeddingModelDimensions,
getUpdatedIndexMappings,
getUpdatedIndexSettings,
isKnnIndex,
removeVectorFieldFromIndexMappings,
} from '../../../../utils';

interface AdvancedSettingsProps {}
Expand All @@ -30,8 +33,10 @@ interface AdvancedSettingsProps {}
export function AdvancedSettings(props: AdvancedSettingsProps) {
const { values, setFieldValue } = useFormikContext<WorkflowFormValues>();
const { models, connectors } = useSelector((state: AppState) => state.ml);
const ingestProcessors = Object.values(values?.ingest?.enrich) as [];
const ingestProcessorModelIds = ingestProcessors
const ingestMLProcessors = (Object.values(
values?.ingest?.enrich
) as any[]).filter((ingestProcessor) => ingestProcessor?.model !== undefined);
const ingestProcessorModelIds = ingestMLProcessors
.map((ingestProcessor) => ingestProcessor?.model?.id as string | undefined)
.filter((modelId) => !isEmpty(modelId));
const indexMappingsPath = 'ingest.index.mappings';
Expand All @@ -40,7 +45,7 @@ export function AdvancedSettings(props: AdvancedSettingsProps) {
const curSettings = getIn(values, indexSettingsPath);

// listen on when processor with models are added / removed. dynamically update index
// mappings and settings, if applicable.
// settings to be knn-enabled or knn-disabled.
useEffect(() => {
if (ingestProcessorModelIds.length > 0) {
ingestProcessorModelIds.forEach((ingestProcessorModelId) => {
Expand All @@ -49,9 +54,11 @@ export function AdvancedSettings(props: AdvancedSettingsProps) {
);
if (processorModel?.connectorId !== undefined) {
const processorConnector = connectors[processorModel?.connectorId];
const dimension = getEmbeddingDimensions(processorConnector);
const dimension = getEmbeddingModelDimensions(processorConnector);

// If a dimension is found, it is a known embedding model.
// Ensure the index is configured to be knn-enabled.
if (dimension !== undefined) {
// TODO: update mappings
if (!isKnnIndex(curSettings)) {
setFieldValue(
indexSettingsPath,
Expand All @@ -62,7 +69,6 @@ export function AdvancedSettings(props: AdvancedSettingsProps) {
}
});
} else {
// TODO: update mappings
if (isKnnIndex(curSettings)) {
setFieldValue(
indexSettingsPath,
Expand All @@ -72,6 +78,43 @@ export function AdvancedSettings(props: AdvancedSettingsProps) {
}
}, [ingestProcessorModelIds.length]);

// listener on when there are updates to any ingest processors. Try to update
// any index mappings accordingly, such as setting the knn_vector mappings
// for models that output vector embeddings, or removing any mappings, if no ML
// processor defined.
useEffect(() => {
if (ingestMLProcessors.length > 0) {
ingestMLProcessors.forEach((ingestMLProcessor) => {
const processorModel = Object.values(models).find(
(model) => model.id === ingestMLProcessor?.model?.id
);
if (processorModel?.connectorId !== undefined) {
const processorConnector = connectors[processorModel?.connectorId];
const dimension = getEmbeddingModelDimensions(processorConnector);
const embeddingFieldName = getEmbeddingField(
processorConnector,
ingestMLProcessor
);
if (embeddingFieldName !== undefined && dimension !== undefined) {
setFieldValue(
indexMappingsPath,
getUpdatedIndexMappings(
curMappings,
embeddingFieldName,
dimension
)
);
}
}
});
} else {
setFieldValue(
indexMappingsPath,
removeVectorFieldFromIndexMappings(curMappings)
);
}
}, [getIn(values, 'ingest.enrich')]);

return (
<EuiFlexGroup direction="column">
<EuiFlexItem grow={false}>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import {
WORKFLOW_TYPE,
} from '../../../../common';
import { AppState } from '../../../store';
import { getEmbeddingDimensions, parseModelInputs } from '../../../utils';
import { getEmbeddingModelDimensions, parseModelInputs } from '../../../utils';
import { get } from 'lodash';

interface QuickConfigureInputsProps {
Expand Down Expand Up @@ -120,7 +120,7 @@ export function QuickConfigureInputs(props: QuickConfigureInputsProps) {
if (connector !== undefined) {
setFieldValues({
...fieldValues,
embeddingLength: getEmbeddingDimensions(connector),
embeddingLength: getEmbeddingModelDimensions(connector),
});
}
}
Expand Down
134 changes: 126 additions & 8 deletions public/utils/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import yaml from 'js-yaml';
import jsonpath from 'jsonpath';
import { escape, get, isEmpty, set } from 'lodash';
import { escape, findKey, get, isEmpty, set, unset } from 'lodash';
import semver from 'semver';
import queryString from 'query-string';
import { useLocation } from 'react-router-dom';
Expand All @@ -27,10 +27,12 @@ import {
WORKFLOW_STEP_TYPE,
Workflow,
WorkflowResource,
BEDROCK_DIMENSIONS,
COHERE_DIMENSIONS,
OPENAI_DIMENSIONS,
BEDROCK_CONFIGS,
COHERE_CONFIGS,
OPENAI_CONFIGS,
customStringify,
NO_TRANSFORMATION,
TRANSFORM_TYPE,
} from '../../common';
import { getCore, getDataSourceEnabled } from '../services';
import {
Expand All @@ -40,6 +42,7 @@ import {
ModelInputMap,
ModelOutputMap,
OutputMapEntry,
OutputMapFormValue,
QueryParam,
} from '../../common/interfaces';
import * as pluginManifest from '../../opensearch_dashboards.json';
Expand Down Expand Up @@ -610,7 +613,7 @@ export function injectParameters(
}

// Fetch embedding dimensions, if the selected model is a known one
export function getEmbeddingDimensions(
export function getEmbeddingModelDimensions(
connector: Connector
): number | undefined {
// some APIs allow specifically setting the dimensions at runtime,
Expand All @@ -620,11 +623,11 @@ export function getEmbeddingDimensions(
} else if (connector.parameters?.model !== undefined) {
return (
// @ts-ignore
COHERE_DIMENSIONS[connector.parameters?.model] ||
COHERE_CONFIGS[connector.parameters?.model]?.dimension ||
// @ts-ignore
OPENAI_DIMENSIONS[connector.parameters?.model] ||
OPENAI_CONFIGS[connector.parameters?.model]?.dimension ||
// @ts-ignore
BEDROCK_DIMENSIONS[connector.parameters?.model]
BEDROCK_CONFIGS[connector.parameters?.model]?.dimension
);
} else {
return undefined;
Expand Down Expand Up @@ -655,3 +658,118 @@ export function getUpdatedIndexSettings(
return existingSettings;
}
}

/**
 * Looks up the embedding field name for the model referenced by a connector.
 *
 * @param connector connector whose `parameters.model` names the remote model
 * @returns the `fieldName` of the first matching entry in the Cohere, OpenAI
 * or Bedrock configs (checked in that order), or undefined when the connector
 * has no model or the model is not a known one
 */
function getEmbeddingFieldFromConnector(
  connector: Connector
): string | undefined {
  const modelName = connector.parameters?.model;
  if (modelName === undefined) {
    return undefined;
  }
  // @ts-ignore
  const cohereFieldName = COHERE_CONFIGS[modelName]?.fieldName;
  // @ts-ignore
  const openAiFieldName = OPENAI_CONFIGS[modelName]?.fieldName;
  // @ts-ignore
  const bedrockFieldName = BEDROCK_CONFIGS[modelName]?.fieldName;
  return cohereFieldName || openAiFieldName || bedrockFieldName;
}

/**
 * Tries to determine the embedding field for an ML processor.
 *
 * The field name known for the connector's model (if any) is the starting
 * point. If the processor config has a non-empty first output map, the result
 * is instead derived from one of its entries:
 *  - known model: the entry whose key equals the known field name
 *  - unknown model: the first entry, as a best guess
 *
 * The selected entry is then resolved by its transform type:
 *  - field transform: the entry's transformed value
 *  - expression transform: the name of the entry's first nested variable
 *  - no transform / anything else: the entry's key
 *
 * Note: if the model is known but the output map has no entry keyed by its
 * field name, the result is undefined.
 *
 * @param connector connector of the model used by the processor
 * @param processorConfig the processor's form config; only `output_map` is read
 * @returns the resolved embedding field name, or undefined if none was found
 */
export function getEmbeddingField(
  connector: Connector,
  processorConfig: any
): string | undefined {
  let embeddingField = getEmbeddingFieldFromConnector(connector);
  const outputMap = processorConfig?.output_map as
    | OutputMapFormValue
    | null
    | undefined;
  // Only the first output map is inspected. Optional chaining keeps a null or
  // missing output_map from throwing; in that case the connector-derived
  // field (if any) is returned as-is.
  const outputMapEntries = outputMap?.[0];
  if (Array.isArray(outputMapEntries) && outputMapEntries.length > 0) {
    const relevantOutputMapEntry =
      embeddingField !== undefined
        ? outputMapEntries.find(
            (outputMapEntry) => outputMapEntry.key === embeddingField
          )
        : outputMapEntries[0];
    switch (relevantOutputMapEntry?.value?.transformType) {
      case TRANSFORM_TYPE.FIELD: {
        embeddingField = relevantOutputMapEntry?.value?.value;
        break;
      }
      case TRANSFORM_TYPE.EXPRESSION: {
        embeddingField = get(relevantOutputMapEntry, 'value.nestedVars.0.name');
        break;
      }
      case NO_TRANSFORMATION:
      case undefined:
      default: {
        embeddingField = relevantOutputMapEntry?.key;
        break;
      }
    }
  }
  return embeddingField;
}

/**
 * Returns the index mappings updated with a knn_vector field configuration.
 * Any old/existing knn_vector field found by
 * `removeVectorFieldFromIndexMappings` is removed in the process.
 *
 * @param existingMappings the current index mappings, as a JSON string
 * @param embeddingFieldName name of the field to configure as a knn_vector
 * @param dimension the dimension to set on the knn_vector field
 * @returns the updated mappings as a string, or `existingMappings` unchanged
 * if they cannot be parsed / updated
 */
export function getUpdatedIndexMappings(
  existingMappings: string,
  embeddingFieldName: string,
  dimension: number
): string {
  try {
    const mappingsWithRemovedVectorField = removeVectorFieldFromIndexMappings(
      existingMappings
    );
    return customStringify(
      set(
        JSON.parse(mappingsWithRemovedVectorField),
        // Array path: the field name is used verbatim as a single key under
        // `properties`. A string path would be split on any '.' or '[' in the
        // name, producing nested plain objects instead of one vector field.
        ['properties', embeddingFieldName],
        {
          type: 'knn_vector',
          dimension,
        }
      )
    );
  } catch {
    return existingMappings;
  }
}

/**
 * Removes a knn_vector field from the top-level `properties` of the given
 * index mappings. Only the first such field found is removed.
 *
 * @param existingMappings the current index mappings, as a JSON string
 * @returns the re-stringified mappings without that field, or
 * `existingMappings` unchanged if anything fails (e.g., invalid JSON)
 */
export function removeVectorFieldFromIndexMappings(
  existingMappings: string
): string {
  try {
    const parsedMappings = JSON.parse(existingMappings);
    const fieldDefinitions = parsedMappings?.properties;
    const vectorFieldName = findKey(
      fieldDefinitions,
      (fieldDefinition) => fieldDefinition.type === 'knn_vector'
    );
    if (vectorFieldName !== undefined) {
      unset(fieldDefinitions, vectorFieldName);
    }
    return customStringify(parsedMappings);
  } catch {
    return existingMappings;
  }
}

0 comments on commit 56b61c2

Please sign in to comment.