From 03b5c44871f8db4d76ccdb494f3e4fff5f92991d Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 08:45:54 -0700 Subject: [PATCH] Support optional params for all processors; onboard text_chunking processor (#265) (#266) Signed-off-by: Tyler Ohlsen (cherry picked from commit e2d22c36afbb575a829cdb7e8336eb71641b69af) Co-authored-by: Tyler Ohlsen --- common/constants.ts | 27 +++- common/interfaces.ts | 7 +- public/configs/base_config.ts | 5 +- public/configs/ingest_processors/index.ts | 1 + .../text_chunking_ingest_processor.ts | 75 +++++++++++ public/configs/ml_processor.ts | 38 +++++- public/configs/sort_processor.ts | 17 ++- public/configs/split_processor.ts | 23 +++- .../workflow_inputs/config_field_list.tsx | 60 +++++++-- .../input_fields/boolean_field.tsx | 54 ++++++-- .../workflow_inputs/input_fields/index.ts | 1 + .../input_fields/json_field.tsx | 10 +- .../input_fields/number_field.tsx | 68 ++++++++++ .../input_fields/select_field.tsx | 7 +- .../input_fields/text_field.tsx | 7 +- .../processor_inputs/ml_processor_inputs.tsx | 22 ++- .../processor_inputs/processor_inputs.tsx | 46 ++++++- .../text_chunking_processor_inputs.tsx | 127 ++++++++++++++++++ .../workflow_inputs/processors_list.tsx | 10 ++ .../workflow_inputs/workflow_inputs.tsx | 1 + public/utils/config_to_form_utils.ts | 10 ++ public/utils/config_to_schema_utils.ts | 22 ++- public/utils/config_to_template_utils.ts | 112 +++++++++++---- public/utils/config_to_workspace_utils.ts | 7 + public/utils/form_to_config_utils.ts | 7 + public/utils/utils.ts | 10 ++ 26 files changed, 695 insertions(+), 79 deletions(-) create mode 100644 public/configs/ingest_processors/text_chunking_ingest_processor.ts create mode 100644 public/pages/workflow_detail/workflow_inputs/input_fields/number_field.tsx create mode 100644 public/pages/workflow_detail/workflow_inputs/processor_inputs/text_chunking_processor_inputs.tsx diff --git a/common/constants.ts b/common/constants.ts index 7ca3dcfb..b0ccccc8 100644 --- a/common/constants.ts +++ b/common/constants.ts @@ -62,10 +62,13 @@ export enum WORKFLOW_TYPE { UNKNOWN = 'Unknown', } +// the names should be consistent with the underlying implementation. used when generating the +// final ingest/search pipeline configurations. export enum PROCESSOR_TYPE { - ML = 'ml_processor', - SPLIT = 'split_processor', - SORT = 'sort_processor', + ML = 'ml_inference', + SPLIT = 'split', + SORT = 'sort', + TEXT_CHUNKING = 'text_chunking', } export enum MODEL_TYPE { @@ -118,6 +121,24 @@ export const ML_INFERENCE_DOCS_LINK = 'https://opensearch.org/docs/latest/ingest-pipelines/processors/ml-inference/#configuration-parameters'; export const ML_CHOOSE_MODEL_LINK = 'https://opensearch.org/docs/latest/ml-commons-plugin/integrating-ml-models/#choosing-a-model'; +export const TEXT_CHUNKING_PROCESSOR_LINK = + 'https://opensearch.org/docs/latest/ingest-pipelines/processors/text-chunking/'; + +/** + * Text chunking algorithm constants + */ +export enum TEXT_CHUNKING_ALGORITHM { + FIXED_TOKEN_LENGTH = 'fixed_token_length', + DELIMITER = 'delimiter', +} +export const FIXED_TOKEN_LENGTH_OPTIONAL_FIELDS = [ + 'token_limit', + 'tokenizer', + 'overlap_rate', +]; +export const DELIMITER_OPTIONAL_FIELDS = ['delimiter']; +export const SHARED_OPTIONAL_FIELDS = ['max_chunk_limit', 'description', 'tag']; + /** * MISCELLANEOUS */ diff --git a/common/interfaces.ts b/common/interfaces.ts index 2c4759ce..0feac2fd 100644 --- a/common/interfaces.ts +++ b/common/interfaces.ts @@ -24,15 +24,15 @@ export type ConfigFieldType = | 'select' | 'model' | 'map' - | 'mapArray'; + | 'mapArray' + | 'boolean' + | 'number'; export type ConfigFieldValue = string | {}; export interface IConfigField { type: ConfigFieldType; id: string; - optional?: boolean; - label?: string; value?: ConfigFieldValue; selectOptions?: ConfigFieldValue[]; } @@ -41,6 +41,7 @@ export interface IConfig { id: string; name: string; fields: IConfigField[]; + optionalFields?: IConfigField[]; } export interface IProcessorConfig extends IConfig { diff --git a/public/configs/base_config.ts b/public/configs/base_config.ts index 3f7572b3..298e3913 100644 --- a/public/configs/base_config.ts +++ b/public/configs/base_config.ts @@ -12,14 +12,14 @@ export abstract class BaseConfig implements IConfig { id: string; name: string; fields: IConfigField[]; - // TODO: have a dedicated optional fields list to display more fields & have more - // flexibility for the users to customize + optionalFields?: IConfigField[]; // No-op constructor. If there are general / defaults for field values, add in here. constructor() { this.id = ''; this.name = ''; this.fields = []; + this.optionalFields = []; } // Persist a standard toObj() fn that all component classes can use. This is necessary @@ -29,6 +29,7 @@ export abstract class BaseConfig implements IConfig { id: this.id, name: this.name, fields: this.fields, + optionalFields: this.optionalFields, } as IConfig; } } diff --git a/public/configs/ingest_processors/index.ts b/public/configs/ingest_processors/index.ts index 573c82fc..5b4c680f 100644 --- a/public/configs/ingest_processors/index.ts +++ b/public/configs/ingest_processors/index.ts @@ -6,3 +6,4 @@ export * from './ml_ingest_processor'; export * from './split_ingest_processor'; export * from './sort_ingest_processor'; +export * from './text_chunking_ingest_processor'; diff --git a/public/configs/ingest_processors/text_chunking_ingest_processor.ts b/public/configs/ingest_processors/text_chunking_ingest_processor.ts new file mode 100644 index 00000000..8caa71a5 --- /dev/null +++ b/public/configs/ingest_processors/text_chunking_ingest_processor.ts @@ -0,0 +1,75 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +import { PROCESSOR_TYPE, TEXT_CHUNKING_ALGORITHM } from '../../../common'; +import { generateId } from '../../utils'; +import { Processor } from '../processor'; + +/** + * The text chunking ingest processor + */ +export class TextChunkingIngestProcessor extends Processor { + constructor() { + super(); + this.name = 'Text Chunking Processor'; + this.type = PROCESSOR_TYPE.TEXT_CHUNKING; + this.id = generateId('text_chunking_processor_ingest'); + this.fields = [ + { + id: 'field_map', + type: 'map', + }, + { + id: 'algorithm', + type: 'select', + selectOptions: [ + TEXT_CHUNKING_ALGORITHM.FIXED_TOKEN_LENGTH, + TEXT_CHUNKING_ALGORITHM.DELIMITER, + ], + }, + ]; + // optional params include all of those possible from both text chunking algorithms. + // for more details, see https://opensearch.org/docs/latest/ingest-pipelines/processors/text-chunking/ + // the list of optional params per algorithm and shared across algorithms is persisted in + // common/constants.ts + this.optionalFields = [ + // fixed_token_length optional params + { + id: 'token_limit', + type: 'number', + value: 384, + }, + { + id: 'tokenizer', + type: 'string', + value: 'standard', + }, + { + id: 'overlap_rate', + type: 'number', + value: 0, + }, + // delimiter optional params + { + id: 'delimiter', + type: 'string', + }, + // shared optional params (independent of algorithm) + { + id: 'max_chunk_limit', + type: 'number', + value: 100, + }, + { + id: 'description', + type: 'string', + }, + { + id: 'tag', + type: 'string', + }, + ]; + } +} diff --git a/public/configs/ml_processor.ts b/public/configs/ml_processor.ts index ed2065a9..bbfac637 100644 --- a/public/configs/ml_processor.ts +++ b/public/configs/ml_processor.ts @@ -21,13 +21,47 @@ export abstract class MLProcessor extends Processor { type: 'model', }, { - id: 'inputMap', + id: 'input_map', type: 'mapArray', }, { - id: 'outputMap', + id: 'output_map', type: 'mapArray', }, ]; + this.optionalFields = [ + { + id: 'description', + type: 'string', + }, + { + id: 'model_config', + type: 'json', + }, + { + id: 'full_response_path', + type: 'boolean', + value: false, + }, + { + id: 'ignore_missing', + type: 'boolean', + value: false, + }, + { + id: 'ignore_failure', + type: 'boolean', + value: false, + }, + { + id: 'max_prediction_tasks', + type: 'number', + value: 10, + }, + { + id: 'tag', + type: 'string', + }, + ]; } } diff --git a/public/configs/sort_processor.ts b/public/configs/sort_processor.ts index 7307063e..bc30d410 100644 --- a/public/configs/sort_processor.ts +++ b/public/configs/sort_processor.ts @@ -19,16 +19,27 @@ export abstract class SortProcessor extends Processor { { id: 'field', type: 'string', - label: 'Field', }, + ]; + this.optionalFields = [ { id: 'order', type: 'select', - label: 'Order', - optional: true, selectOptions: [SORT_ORDER.ASC, SORT_ORDER.DESC], value: SORT_ORDER.ASC, }, + { + id: 'target_field', + type: 'string', + }, + { + id: 'description', + type: 'string', + }, + { + id: 'tag', + type: 'string', + }, ]; } } diff --git a/public/configs/split_processor.ts b/public/configs/split_processor.ts index 3a523683..3e0349f7 100644 --- a/public/configs/split_processor.ts +++ b/public/configs/split_processor.ts @@ -19,12 +19,31 @@ export abstract class SplitProcessor extends Processor { { id: 'field', type: 'string', - label: 'Field', }, { id: 'separator', type: 'string', - label: 'Separator', + }, + ]; + this.optionalFields = [ + // TODO: although listed in docs, this field doesn't seem to exist. Fails + // at regular API level. + // { + // id: 'preserve_field', + // type: 'boolean', + // value: false, + // }, + { + id: 'target_field', + type: 'string', + }, + { + id: 'description', + type: 'string', + }, + { + id: 'tag', + type: 'string', }, ]; } diff --git a/public/pages/workflow_detail/workflow_inputs/config_field_list.tsx b/public/pages/workflow_detail/workflow_inputs/config_field_list.tsx index e28ca6bf..929e2f1e 100644 --- a/public/pages/workflow_detail/workflow_inputs/config_field_list.tsx +++ b/public/pages/workflow_detail/workflow_inputs/config_field_list.tsx @@ -5,8 +5,14 @@ import React from 'react'; import { EuiFlexItem, EuiSpacer } from '@elastic/eui'; -import { TextField, ModelField, SelectField } from './input_fields'; -import { IConfig } from '../../../../common'; +import { + TextField, + SelectField, + BooleanField, + NumberField, +} from './input_fields'; +import { IConfigField } from '../../../../common'; +import { camelCaseToTitleString } from '../../../utils'; /** * A helper component to format all of the input fields for a component. Dynamically @@ -14,7 +20,8 @@ import { IConfig } from '../../../../common'; */ interface ConfigFieldListProps { - config: IConfig; + configId: string; + configFields: IConfigField[]; baseConfigPath: string; // the base path of the nested config, if applicable. e.g., 'ingest.enrich' onFormChange: () => void; } @@ -22,20 +29,17 @@ interface ConfigFieldListProps { const CONFIG_FIELD_SPACER_SIZE = 'm'; export function ConfigFieldList(props: ConfigFieldListProps) { - const configFields = props.config.fields || []; - const configId = props.config.id; return ( - {configFields.map((field, idx) => { + {props.configFields.map((field, idx) => { let el; switch (field.type) { case 'string': { el = ( @@ -49,7 +53,43 @@ export function ConfigFieldList(props: ConfigFieldListProps) { + + + ); + break; + } + case 'boolean': { + el = ( + + + + + ); + break; + } + case 'number': { + el = ( + + diff --git a/public/pages/workflow_detail/workflow_inputs/input_fields/boolean_field.tsx b/public/pages/workflow_detail/workflow_inputs/input_fields/boolean_field.tsx index 98368d5d..cf9dbcd3 100644 --- a/public/pages/workflow_detail/workflow_inputs/input_fields/boolean_field.tsx +++ b/public/pages/workflow_detail/workflow_inputs/input_fields/boolean_field.tsx @@ -5,13 +5,24 @@ import React from 'react'; import { Field, FieldProps } from 'formik'; -import { EuiCompressedRadioGroup, EuiRadioGroupOption } from '@elastic/eui'; +import { + EuiCompressedFormRow, + EuiCompressedRadioGroup, + EuiLink, + EuiRadioGroupOption, + EuiText, +} from '@elastic/eui'; +import { camelCaseToTitleString } from '../../../../utils'; interface BooleanFieldProps { fieldPath: string; // the full path in string-form to the field (e.g., 'ingest.enrich.processors.text_embedding_processor.inputField') onFormChange: () => void; enabledOption: EuiRadioGroupOption; disabledOption: EuiRadioGroupOption; + label?: string; + helpLink?: string; + helpText?: string; + showLabel?: boolean; } /** @@ -22,18 +33,37 @@ export function BooleanField(props: BooleanFieldProps) { {({ field, form }: FieldProps) => { return ( - { - form.setFieldValue(field.name, !field.value); - props.onFormChange(); - }} - /> + labelAppend={ + props.helpLink ? ( + + + Learn more + + + ) : undefined + } + helpText={props.helpText || undefined} + isInvalid={false} + > + { + form.setFieldValue(field.name, !field.value); + props.onFormChange(); + }} + /> + ); }} diff --git a/public/pages/workflow_detail/workflow_inputs/input_fields/index.ts b/public/pages/workflow_detail/workflow_inputs/input_fields/index.ts index 58fd6418..3928800a 100644 --- a/public/pages/workflow_detail/workflow_inputs/input_fields/index.ts +++ b/public/pages/workflow_detail/workflow_inputs/input_fields/index.ts @@ -10,3 +10,4 @@ export { MapField } from './map_field'; export { MapArrayField } from './map_array_field'; export { BooleanField } from './boolean_field'; export { SelectField } from './select_field'; +export { NumberField } from './number_field'; diff --git a/public/pages/workflow_detail/workflow_inputs/input_fields/json_field.tsx b/public/pages/workflow_detail/workflow_inputs/input_fields/json_field.tsx index 833d4c00..0cace946 100644 --- a/public/pages/workflow_detail/workflow_inputs/input_fields/json_field.tsx +++ b/public/pages/workflow_detail/workflow_inputs/input_fields/json_field.tsx @@ -5,8 +5,14 @@ import React, { useEffect, useState } from 'react'; import { Field, FieldProps, getIn, useFormikContext } from 'formik'; -import { EuiCodeEditor, EuiCompressedFormRow, EuiLink, EuiText } from '@elastic/eui'; +import { + EuiCodeEditor, + EuiCompressedFormRow, + EuiLink, + EuiText, +} from '@elastic/eui'; import { WorkspaceFormValues } from '../../../../../common'; +import { camelCaseToTitleString } from '../../../../utils'; interface JsonFieldProps { fieldPath: string; // the full path in string-form to the field (e.g., 'ingest.enrich.processors.text_embedding_processor.inputField') @@ -44,7 +50,7 @@ export function JsonField(props: JsonFieldProps) { return ( diff --git a/public/pages/workflow_detail/workflow_inputs/input_fields/number_field.tsx b/public/pages/workflow_detail/workflow_inputs/input_fields/number_field.tsx new file mode 100644 index 00000000..7bca568c --- /dev/null +++ b/public/pages/workflow_detail/workflow_inputs/input_fields/number_field.tsx @@ -0,0 +1,68 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +import React from 'react'; +import { Field, FieldProps, getIn, useFormikContext } from 'formik'; +import { + EuiCompressedFormRow, + EuiLink, + EuiText, + EuiFieldNumber, +} from '@elastic/eui'; +import { WorkspaceFormValues } from '../../../../../common'; +import { camelCaseToTitleString, getInitialValue } from '../../../../utils'; + +interface NumberFieldProps { + fieldPath: string; // the full path in string-form to the field (e.g., 'ingest.enrich.processors.text_embedding_processor.inputField') + onFormChange: () => void; + label?: string; + helpLink?: string; + helpText?: string; + placeholder?: string; + showError?: boolean; +} + +/** + * An input field for a component where users input numbers + */ +export function NumberField(props: NumberFieldProps) { + const { errors, touched } = useFormikContext(); + + return ( + + {({ field, form }: FieldProps) => { + return ( + + + Learn more + + + ) : undefined + } + helpText={props.helpText || undefined} + error={props.showError && getIn(errors, field.name)} + isInvalid={getIn(errors, field.name) && getIn(touched, field.name)} + > + { + form.setFieldValue(props.fieldPath, e.target.value); + props.onFormChange(); + }} + /> + + ); + }} + + ); +} diff --git a/public/pages/workflow_detail/workflow_inputs/input_fields/select_field.tsx b/public/pages/workflow_detail/workflow_inputs/input_fields/select_field.tsx index 2b850b3a..23d4deb2 100644 --- a/public/pages/workflow_detail/workflow_inputs/input_fields/select_field.tsx +++ b/public/pages/workflow_detail/workflow_inputs/input_fields/select_field.tsx @@ -12,11 +12,13 @@ import { EuiText, } from '@elastic/eui'; import { WorkspaceFormValues, IConfigField } from '../../../../../common'; +import { camelCaseToTitleString } from '../../../../utils'; interface SelectFieldProps { field: IConfigField; fieldPath: string; // the full path in string-form to the field (e.g., 'ingest.enrich.processors.text_embedding_processor.inputField') onFormChange: () => void; + onSelectChange?: (option: string) => void; } /** @@ -29,7 +31,7 @@ export function SelectField(props: SelectFieldProps) { {({ field, form }: FieldProps) => { return ( - + field.id === 'inputMap' + (field) => field.id === 'input_map' ) as IConfigField; const inputMapFieldPath = `${props.baseConfigPath}.${props.config.id}.${inputMapField.id}`; const inputMapValue = getIn(values, inputMapFieldPath); const outputMapField = props.config.fields.find( - (field) => field.id === 'outputMap' + (field) => field.id === 'output_map' ) as IConfigField; const outputMapFieldPath = `${props.baseConfigPath}.${props.config.id}.${outputMapField.id}`; const outputMapValue = getIn(values, outputMapFieldPath); @@ -113,7 +115,7 @@ export function MLProcessorInputs(props: MLProcessorInputsProps) { // Hook to listen when the selected model has changed. We do a few checks here: // 1: update model interface states - // 2. clear out any persisted inputMap/outputMap form values, as those would now be invalid + // 2. clear out any persisted input_map/output_map form values, as those would now be invalid function onModelChange(modelId: string) { updateModelInterfaceStates(modelId); setFieldValue(inputMapFieldPath, []); @@ -296,6 +298,20 @@ export function MLProcessorInputs(props: MLProcessorInputsProps) { color="danger" /> )} + + + + + )} diff --git a/public/pages/workflow_detail/workflow_inputs/processor_inputs/processor_inputs.tsx b/public/pages/workflow_detail/workflow_inputs/processor_inputs/processor_inputs.tsx index e55b3beb..975dc8f3 100644 --- a/public/pages/workflow_detail/workflow_inputs/processor_inputs/processor_inputs.tsx +++ b/public/pages/workflow_detail/workflow_inputs/processor_inputs/processor_inputs.tsx @@ -4,7 +4,8 @@ */ import React from 'react'; -import { EuiFlexItem, EuiSpacer } from '@elastic/eui'; +import { EuiAccordion, EuiFlexItem, EuiSpacer } from '@elastic/eui'; +import { isEmpty } from 'lodash'; import { IProcessorConfig, PROCESSOR_CONTEXT, @@ -13,6 +14,7 @@ import { } from '../../../../../common'; import { MLProcessorInputs } from './ml_processor_inputs'; import { ConfigFieldList } from '../config_field_list'; +import { TextChunkingProcessorInputs } from './text_chunking_processor_inputs'; /** * Base component for rendering processor form inputs based on the processor type @@ -28,8 +30,13 @@ interface ProcessorInputsProps { const PROCESSOR_INPUTS_SPACER_SIZE = 'm'; +// Component to dynamically render the processor inputs based on the processor types. +// For most processors, we can use the standard/default ConfigFieldList components +// for rendering the required and optional fields. For more complex processors, we have +// standalone, specialized components. export function ProcessorInputs(props: ProcessorInputsProps) { const configType = props.config.type; + return ( {(() => { @@ -50,14 +57,47 @@ export function ProcessorInputs(props: ProcessorInputsProps) { ); break; } - default: { + case PROCESSOR_TYPE.TEXT_CHUNKING: { el = ( - + + + ); + break; + } + default: { + el = ( + + <> + + {!isEmpty(props.config.optionalFields) && ( + + + + + )} + ); break; diff --git a/public/pages/workflow_detail/workflow_inputs/processor_inputs/text_chunking_processor_inputs.tsx b/public/pages/workflow_detail/workflow_inputs/processor_inputs/text_chunking_processor_inputs.tsx new file mode 100644 index 00000000..607be6f9 --- /dev/null +++ b/public/pages/workflow_detail/workflow_inputs/processor_inputs/text_chunking_processor_inputs.tsx @@ -0,0 +1,127 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +import React, { useState } from 'react'; +import { getIn, useFormikContext } from 'formik'; +import { EuiAccordion, EuiCallOut, EuiSpacer } from '@elastic/eui'; +import { + IProcessorConfig, + IConfigField, + PROCESSOR_CONTEXT, + WorkflowConfig, + TEXT_CHUNKING_ALGORITHM, + FIXED_TOKEN_LENGTH_OPTIONAL_FIELDS, + DELIMITER_OPTIONAL_FIELDS, + SHARED_OPTIONAL_FIELDS, + WorkflowFormValues, + MapFormValue, + TEXT_CHUNKING_PROCESSOR_LINK, +} from '../../../../../common'; +import { MapField, SelectField } from '../input_fields'; +import { ConfigFieldList } from '../config_field_list'; + +interface TextChunkingProcessorInputsProps { + uiConfig: WorkflowConfig; + config: IProcessorConfig; + baseConfigPath: string; // the base path of the nested config, if applicable. e.g., 'ingest.enrich' + onFormChange: () => void; + context: PROCESSOR_CONTEXT; +} + +/** + * Specialized component to render the text chunking ingest processor. The list of optional + * params we display is dependent on the source algorithm that is chosen. Internally, we persist + * all of the params, but only choose the relevant ones when constructing the final ingest processor + * template. This is to minimize the amount of ui config / form / schema updates we would need + * to do if we only persisted the subset of optional params specific to the currently-chosen algorithm. + */ +export function TextChunkingProcessorInputs( + props: TextChunkingProcessorInputsProps +) { + const { values } = useFormikContext(); + + // extracting field info from the text chunking processor config + // TODO: have a better mechanism for guaranteeing the expected fields/config instead of hardcoding them here + const algorithmFieldPath = `${props.baseConfigPath}.${props.config.id}.algorithm`; + const algorithmField = props.config.fields.find( + (field) => field.id === 'algorithm' + ) as IConfigField; + const fieldMapFieldPath = `${props.baseConfigPath}.${props.config.id}.field_map`; + const fieldMapValue = getIn(values, fieldMapFieldPath) as MapFormValue; + + // algorithm optional fields state + const [algorithmOptionalFields, setAlgorithmOptionalFields] = useState< + string[] + >( + algorithmField !== undefined && algorithmField.value !== undefined + ? algorithmField.value === TEXT_CHUNKING_ALGORITHM.FIXED_TOKEN_LENGTH + ? FIXED_TOKEN_LENGTH_OPTIONAL_FIELDS + : DELIMITER_OPTIONAL_FIELDS + : FIXED_TOKEN_LENGTH_OPTIONAL_FIELDS + ); + + // Update the optional fields to display when the algorithm is changed + function onAlgorithmChange(algorithm: string) { + setAlgorithmOptionalFields( + algorithm === TEXT_CHUNKING_ALGORITHM.FIXED_TOKEN_LENGTH + ? FIXED_TOKEN_LENGTH_OPTIONAL_FIELDS + : DELIMITER_OPTIONAL_FIELDS + ); + } + + return ( + <> + + + + {fieldMapValue?.length === 0 && ( + <> + + + + )} + + + + + algorithmOptionalFields.includes(optionalField.id) + ) || []), + ...(props.config.optionalFields?.filter((optionalField) => + SHARED_OPTIONAL_FIELDS.includes(optionalField.id) + ) || []), + ]} + baseConfigPath={props.baseConfigPath} + onFormChange={props.onFormChange} + /> + + + ); +} diff --git a/public/pages/workflow_detail/workflow_inputs/processors_list.tsx b/public/pages/workflow_detail/workflow_inputs/processors_list.tsx index 638bf926..bee13710 100644 --- a/public/pages/workflow_detail/workflow_inputs/processors_list.tsx +++ b/public/pages/workflow_detail/workflow_inputs/processors_list.tsx @@ -32,6 +32,7 @@ import { SortSearchResponseProcessor, SplitIngestProcessor, SplitSearchResponseProcessor, + TextChunkingIngestProcessor, } from '../../../configs'; import { ProcessorInputs } from './processor_inputs'; @@ -225,6 +226,15 @@ export function ProcessorsList(props: ProcessorsListProps) { addProcessor(new SortIngestProcessor().toObj()); }, }, + { + name: 'Text Chunking Processor', + onClick: () => { + closePopover(); + addProcessor( + new TextChunkingIngestProcessor().toObj() + ); + }, + }, ] : props.context === PROCESSOR_CONTEXT.SEARCH_REQUEST ? [ diff --git a/public/pages/workflow_detail/workflow_inputs/workflow_inputs.tsx b/public/pages/workflow_detail/workflow_inputs/workflow_inputs.tsx index abb36f5e..906159ee 100644 --- a/public/pages/workflow_detail/workflow_inputs/workflow_inputs.tsx +++ b/public/pages/workflow_detail/workflow_inputs/workflow_inputs.tsx @@ -482,6 +482,7 @@ export function WorkflowInputs(props: WorkflowInputsProps) { ), }} + showLabel={false} /> )} diff --git a/public/utils/config_to_form_utils.ts b/public/utils/config_to_form_utils.ts index 3fdcfcab..6563f6f6 100644 --- a/public/utils/config_to_form_utils.ts +++ b/public/utils/config_to_form_utils.ts @@ -67,6 +67,10 @@ export function processorConfigToFormik( processorConfig.fields.forEach((field) => { fieldValues[field.id] = field.value || getInitialValue(field.type); }); + processorConfig.optionalFields?.forEach((optionalField) => { + fieldValues[optionalField.id] = + optionalField.value || getInitialValue(optionalField.type); + }); return fieldValues; } @@ -125,5 +129,11 @@ export function getInitialValue(fieldType: ConfigFieldType): ConfigFieldValue { case 'mapArray': { return []; } + case 'boolean': { + return false; + } + case 'number': { + return 0; + } } } diff --git a/public/utils/config_to_schema_utils.ts b/public/utils/config_to_schema_utils.ts index 030fcf4f..6a4801d4 100644 --- a/public/utils/config_to_schema_utils.ts +++ b/public/utils/config_to_schema_utils.ts @@ -74,6 +74,12 @@ function processorsConfigToSchema(processorsConfig: ProcessorsConfig): Schema { processorConfig.fields.forEach((field) => { processorSchemaObj[field.id] = getFieldSchema(field); }); + processorConfig.optionalFields?.forEach((optionalField) => { + processorSchemaObj[optionalField.id] = getFieldSchema( + optionalField, + true + ); + }); processorsSchemaObj[processorConfig.id] = yup.object(processorSchemaObj); }); @@ -84,7 +90,10 @@ function processorsConfigToSchema(processorsConfig: ProcessorsConfig): Schema { **************** Yup (validation) utils ********************** */ -function getFieldSchema(field: IConfigField): Schema { +function getFieldSchema( + field: IConfigField, + optional: boolean = false +): Schema { let baseSchema: Schema; switch (field.type) { case 'string': @@ -157,9 +166,14 @@ function getFieldSchema(field: IConfigField): Schema { ); break; } + case 'boolean': { + baseSchema = yup.boolean(); + break; + } + case 'number': { + baseSchema = yup.number(); + } } - return field.optional - ? baseSchema.optional() - : baseSchema.required('Required'); + return optional ? baseSchema.optional() : baseSchema.required('Required'); } diff --git a/public/utils/config_to_template_utils.ts b/public/utils/config_to_template_utils.ts index 8cde6131..af857405 100644 --- a/public/utils/config_to_template_utils.ts +++ b/public/utils/config_to_template_utils.ts @@ -3,6 +3,8 @@ * SPDX-License-Identifier: Apache-2.0 */ +import { FormikValues } from 'formik'; +import { isEmpty } from 'lodash'; import { TemplateFlows, TemplateNode, @@ -27,6 +29,10 @@ import { SearchConfig, MapFormValue, MapEntry, + TEXT_CHUNKING_ALGORITHM, + SHARED_OPTIONAL_FIELDS, + FIXED_TOKEN_LENGTH_OPTIONAL_FIELDS, + DELIMITER_OPTIONAL_FIELDS, } from '../../common'; import { processorConfigToFormik } from './config_to_form_utils'; import { generateId } from './utils'; @@ -133,8 +139,6 @@ function searchConfigToTemplateNodes( // General fn to process all processor configs and convert them // into a final list of template-formatted IngestProcessor/SearchProcessors. -// TODO: improve the type safety of the returned form values. Have defined interfaces -// for each processor type, including the handling of any configured optional fields export function processorConfigsToTemplateProcessors( processorConfigs: IProcessorConfig[] ): (IngestProcessor | SearchProcessor)[] { @@ -143,12 +147,12 @@ export function processorConfigsToTemplateProcessors( processorConfigs.forEach((processorConfig) => { switch (processorConfig.type) { case PROCESSOR_TYPE.ML: { - const { model, inputMap, outputMap } = processorConfigToFormik( + const { model, input_map, output_map } = processorConfigToFormik( processorConfig ) as { model: ModelFormValue; - inputMap: MapArrayFormValue; - outputMap: MapArrayFormValue; + input_map: MapArrayFormValue; + output_map: MapArrayFormValue; }; let processor = { @@ -156,46 +160,84 @@ export function processorConfigsToTemplateProcessors( model_id: model.id, }, } as MLInferenceProcessor; - if (inputMap?.length > 0) { - processor.ml_inference.input_map = inputMap.map((mapFormValue) => + if (input_map?.length > 0) { + processor.ml_inference.input_map = input_map.map((mapFormValue) => mergeMapIntoSingleObj(mapFormValue) ); } - if (outputMap?.length > 0) { - processor.ml_inference.output_map = outputMap.map((mapFormValue) => + if (output_map?.length > 0) { + processor.ml_inference.output_map = output_map.map((mapFormValue) => mergeMapIntoSingleObj(mapFormValue) ); } processorsList.push(processor); break; } - case PROCESSOR_TYPE.SPLIT: { - const { field, separator } = processorConfigToFormik( - processorConfig - ) as { field: string; separator: string }; - processorsList.push({ - split: { - field, - separator, - }, + // only include the optional field form values that are relevant + // to the selected algorithm. always add any common/shared form values. + case PROCESSOR_TYPE.TEXT_CHUNKING: { + const formValues = processorConfigToFormik(processorConfig); + let finalFormValues = {} as FormikValues; + const algorithm = formValues['algorithm'] as TEXT_CHUNKING_ALGORITHM; + Object.keys(formValues).forEach((formKey: string) => { + const formValue = formValues[formKey]; + if (SHARED_OPTIONAL_FIELDS.includes(formKey)) { + finalFormValues = optionallyAddToFinalForm( + finalFormValues, + formKey, + formValue + ); + } else { + if (algorithm === TEXT_CHUNKING_ALGORITHM.FIXED_TOKEN_LENGTH) { + if (FIXED_TOKEN_LENGTH_OPTIONAL_FIELDS.includes(formKey)) { + finalFormValues = optionallyAddToFinalForm( + finalFormValues, + formKey, + formValue + ); + } + } else { + if (DELIMITER_OPTIONAL_FIELDS.includes(formKey)) { + finalFormValues = optionallyAddToFinalForm( + finalFormValues, + formKey, + formValue + ); + } + } + } }); - break; - } - case PROCESSOR_TYPE.SORT: { - const { field, order } = processorConfigToFormik(processorConfig) as { - field: string; - order: string; + // add the field map config obj + finalFormValues = { + ...finalFormValues, + field_map: mergeMapIntoSingleObj( + formValues['field_map'] as MapFormValue + ), }; processorsList.push({ - sort: { - field, - order, - }, + [processorConfig.type]: finalFormValues, }); break; } + case PROCESSOR_TYPE.SPLIT: + case PROCESSOR_TYPE.SORT: default: { + const formValues = processorConfigToFormik(processorConfig); + let finalFormValues = {} as FormikValues; + // iterate through the form values, ignoring any empty + // field (empty fields can be possible if the field is optional) + Object.keys(formValues).forEach((formKey: string) => { + const formValue = formValues[formKey]; + finalFormValues = optionallyAddToFinalForm( + finalFormValues, + formKey, + formValue + ); + }); + processorsList.push({ + [processorConfig.type]: finalFormValues, + }); break; } } @@ -294,3 +336,17 @@ function mergeMapIntoSingleObj(mapFormValue: MapFormValue): {} { }); return curMap; } + +// utility fn used to build the final set of processor config fields, filtering +// by only adding if the field is valid +function optionallyAddToFinalForm( + finalFormValues: FormikValues, + formKey: string, + formValue: any +): FormikValues { + if (!isEmpty(formValue) || typeof formValue === 'boolean') { + finalFormValues[formKey] = + typeof formValue === 'boolean' ? formValue : formValue; + } + return finalFormValues; +} diff --git a/public/utils/config_to_workspace_utils.ts b/public/utils/config_to_workspace_utils.ts index f2de8a00..f57d3ad7 100644 --- a/public/utils/config_to_workspace_utils.ts +++ b/public/utils/config_to_workspace_utils.ts @@ -315,6 +315,13 @@ function processorsConfigToWorkspaceFlow( transformerNodeId = generateId(COMPONENT_CLASS.TRANSFORMER); break; } + case PROCESSOR_TYPE.TEXT_CHUNKING: { + transformer = new BaseTransformer( + processorConfig.name, + 'A processor to split long documents into shorter passages' + ); + transformerNodeId = generateId(COMPONENT_CLASS.TRANSFORMER); + } default: { transformer = new BaseTransformer(processorConfig.name, ''); transformerNodeId = generateId(COMPONENT_CLASS.TRANSFORMER); diff --git a/public/utils/form_to_config_utils.ts b/public/utils/form_to_config_utils.ts index 34020117..153977ab 100644 --- a/public/utils/form_to_config_utils.ts +++ b/public/utils/form_to_config_utils.ts @@ -101,6 +101,13 @@ function formikToProcessorsUiConfig( getInitialValue(processorField.type) ); }); + processorConfig.optionalFields?.forEach((processorField) => { + processorField.value = get( + processorFormValues, + processorField.id, + undefined + ); + }); }); return existingConfig; } diff --git a/public/utils/utils.ts b/public/utils/utils.ts index 4e78bbb8..828052c0 100644 --- a/public/utils/utils.ts +++ b/public/utils/utils.ts @@ -228,3 +228,13 @@ export function parseModelOutputs( } as ModelOutputFormField) ); } + +// converts camelCase to a space-delimited string with the first word capitalized. +// useful for converting config IDs (in snake_case) to a formatted form title +export function camelCaseToTitleString(snakeCaseString: string): string { + return snakeCaseString + .split('_') + .filter((word) => word.length > 0) + .map((word) => word.charAt(0).toUpperCase() + word.slice(1)) + .join(' '); +}