Skip to content

Commit

Permalink
feat(ChatKnowledge):chunk add enable_merge parameter (#1014)
Browse files Browse the repository at this point in the history
Co-authored-by: Aralhi <[email protected]>
  • Loading branch information
Aries-ckt and Aralhi authored Jan 4, 2024
1 parent fd30588 commit ca83443
Show file tree
Hide file tree
Showing 26 changed files with 98 additions and 41 deletions.
4 changes: 2 additions & 2 deletions dbgpt/app/knowledge/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from dbgpt.rag.chunk import Chunk
from dbgpt.rag.chunk_manager import ChunkParameters
from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
from dbgpt.rag.knowledge.base import KnowledgeType
from dbgpt.rag.knowledge.base import KnowledgeType, ChunkStrategy
from dbgpt.rag.knowledge.factory import KnowledgeFactory
from dbgpt.rag.text_splitter.text_splitter import (
RecursiveCharacterTextSplitter,
Expand Down Expand Up @@ -234,7 +234,7 @@ def batch_document_sync(
f" doc:{doc.doc_name} status is {doc.status}, can not sync"
)
chunk_parameters = sync_request.chunk_parameters
if "Automatic" == chunk_parameters.chunk_strategy:
if chunk_parameters.chunk_strategy != ChunkStrategy.CHUNK_BY_SIZE.name:
space_context = self.get_space_context(space_name)
chunk_parameters.chunk_size = (
CFG.KNOWLEDGE_CHUNK_SIZE
Expand Down
2 changes: 1 addition & 1 deletion dbgpt/app/static/404.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/404/index.html

Large diffs are not rendered by default.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

This file was deleted.

17 changes: 17 additions & 0 deletions dbgpt/app/static/_next/static/chunks/10-f02ccef88f814547.js

Large diffs are not rendered by default.

11 changes: 0 additions & 11 deletions dbgpt/app/static/_next/static/chunks/450-bd680f0e37e9b4b9.js

This file was deleted.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/agent/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/chat/[scene]/[id]/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/chat/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/database/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/knowledge/chunk/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/knowledge/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/models/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dbgpt/app/static/prompt/index.html

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions dbgpt/rag/chunk_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ class ChunkParameters(BaseModel):
default="\n",
description="chunk separator",
)
enable_merge: bool = Field(
default=None,
description="enable chunk merge by chunk_size.",
)


class ChunkManager:
Expand Down Expand Up @@ -134,4 +138,5 @@ def _select_text_splitter(
chunk_size=self._chunk_parameters.chunk_size,
chunk_overlap=self._chunk_parameters.chunk_overlap,
separator=self._chunk_parameters.separator,
enable_merge=self._chunk_parameters.enable_merge,
)
39 changes: 35 additions & 4 deletions dbgpt/rag/knowledge/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,22 +47,52 @@ class ChunkStrategy(Enum):
CHUNK_BY_SIZE = (
RecursiveCharacterTextSplitter,
[
{"param_name": "chunk_size", "param_type": "int", "default_value": 512},
{"param_name": "chunk_overlap", "param_type": "int", "default_value": 50},
{
"param_name": "chunk_size",
"param_type": "int",
"default_value": 512,
"description": "The size of the data chunks used in processing.",
},
{
"param_name": "chunk_overlap",
"param_type": "int",
"default_value": 50,
"description": "The amount of overlap between adjacent data chunks.",
},
],
"chunk size",
"split document by chunk size",
)
CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
CHUNK_BY_PARAGRAPH = (
ParagraphTextSplitter,
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
[
{
"param_name": "separator",
"param_type": "string",
"default_value": "\\n",
"description": "paragraph separator",
}
],
"paragraph",
"split document by paragraph",
)
CHUNK_BY_SEPARATOR = (
SeparatorTextSplitter,
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
[
{
"param_name": "separator",
"param_type": "string",
"default_value": "\\n",
"description": "chunk separator",
},
{
"param_name": "enable_merge",
"param_type": "boolean",
"default_value": False,
"description": "Whether to merge according to the chunk_size after splitting by the separator.",
},
],
"separator",
"split document by separator",
)
Expand All @@ -80,6 +110,7 @@ def __init__(self, splitter_class, parameters, alias, description):
self.description = description

def match(self, *args, **kwargs):
    """Build the splitter registered as the first element of this member's value.

    Keyword arguments whose value is ``None`` are treated as "not supplied"
    and are left out, so the splitter's own defaults apply to them.
    Positional arguments are forwarded unchanged.
    """
    supplied = {}
    for option, setting in kwargs.items():
        # Only an explicit None is dropped; falsy values such as 0, "" or
        # False are legitimate settings and must reach the splitter.
        if setting is not None:
            supplied[option] = setting
    splitter_factory = self.value[0]
    return splitter_factory(*args, **supplied)


Expand Down
5 changes: 4 additions & 1 deletion dbgpt/rag/text_splitter/text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,7 @@ class SeparatorTextSplitter(CharacterTextSplitter):

def __init__(self, separator: str = "\n", filters: list = [], **kwargs: Any):
    """Create a new TextSplitter.

    Args:
        separator: String the text is split on.
        filters: Stored on the instance as ``_filter``.
        **kwargs: Remaining options, forwarded to the parent constructor.
            An optional ``enable_merge`` entry is consumed here and is not
            forwarded; a missing or falsy value disables merging.
    """
    # ``enable_merge`` is optional, so pop it with a default: a bare
    # ``pop("enable_merge")`` raises KeyError whenever the caller omits
    # the key (for example when an unset/None value was dropped upstream).
    self._merge = kwargs.pop("enable_merge", False) or False
    super().__init__(**kwargs)
    self._separator = separator
    # NOTE(review): the ``filters`` default is a single shared list object;
    # it is only stored here, so confirm nothing mutates ``_filter`` in place.
    self._filter = filters
Expand All @@ -696,7 +697,9 @@ def split_text(
splits = text.split(separator)
else:
splits = list(text)
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
if self._merge:
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
return list(filter(None, text.split(separator)))


class PageTextSplitter(TextSplitter):
Expand Down
6 changes: 2 additions & 4 deletions web/app/i18n.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ const resources = {
recall_score: 'recall_score',
Set_a_threshold_score: 'Set a threshold score for the retrieval of similar vectors',
recall_type: 'recall_type',
Recall_Type: 'recall type',
model: 'model',
A_model_used: 'A model used to create vector representations of text or other data',
Automatic: 'Automatic',
Expand Down Expand Up @@ -239,12 +238,11 @@ const resources = {
Please_select_a_file: '请上传一个文件',
Please_input_the_text: '请输入文本',
Embedding: '嵌入',
topk: '',
topk: 'TopK',
the_top_k_vectors: '基于相似度得分的前 k 个向量',
recall_score: '召回分数',
Set_a_threshold_score: '设置相似向量检索的阈值分数',
recall_type: '回忆类型',
Recall_Type: '回忆类型',
recall_type: '召回类型',
model: '模型',
A_model_used: '用于创建文本或其他数据的矢量表示的模型',
Automatic: '自动切片',
Expand Down
2 changes: 1 addition & 1 deletion web/components/knowledge/arguments-modal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ export default function ArgumentsModal({ space, argumentsShow, setArgumentsShow
</Form.Item>
</Col>
<Col span={12}>
<Form.Item<IArguments> tooltip={t(`Recall_Type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
<Form.Item<IArguments> tooltip={t(`recall_type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
<Input className="mb-5 h-12" />
</Form.Item>
</Col>
Expand Down
21 changes: 17 additions & 4 deletions web/components/knowledge/strategy-form.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { IChunkStrategyResponse } from '@/types/knowledge';
import { Alert, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
import { Alert, Checkbox, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
import { useState } from 'react';
import { useTranslation } from 'react-i18next';
const { TextArea } = Input;
Expand All @@ -25,7 +25,7 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
const [selectedStrategy, setSelectedStrategy] = useState<string>();
const { t } = useTranslation();
const DEFAULT_STRATEGY = {
strategy: t('Automatic'),
strategy: 'Automatic',
name: t('Automatic'),
desc: t('Automatic_desc'),
};
Expand All @@ -50,17 +50,30 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
{parameters?.map((param) => (
<Form.Item
key={`param_${param.param_name}`}
label={`${param.param_name}: ${param.param_type}`}
label={param.param_name}
name={[field!.name, 'chunk_parameters', param.param_name]}
rules={[{ required: true, message: t('Please_input_the_name') }]}
initialValue={param.default_value}
valuePropName={param.param_type === 'boolean' ? 'checked' : 'value'}
tooltip={param.description}
>
{param.param_type === 'int' ? <InputNumber className="w-full" min={1} /> : <TextArea className="w-full" rows={2} maxLength={6} />}
{renderParamByType(param.param_type)}
</Form.Item>
))}
</div>
);
}

/**
 * Pick the antd input control used to edit a chunk-strategy parameter.
 *
 * @param type - The parameter's `param_type` as reported by the backend
 *   ('int', 'string' or 'boolean').
 * @returns A number input for 'int', a checkbox for 'boolean', and a text
 *   area for 'string' or any type this form does not know about.
 */
function renderParamByType(type: string) {
  switch (type) {
    case 'int':
      return <InputNumber className="w-full" min={1} />;
    case 'boolean':
      return <Checkbox />;
    case 'string':
    default:
      // Without a default branch an unrecognised type returned undefined,
      // leaving the Form.Item with no control at all. Fall back to free
      // text so a newly added backend type stays editable.
      return <TextArea className="w-full" rows={2} />;
  }
}
return (
<>
<Form.Item name={[field!.name, 'chunk_parameters', 'chunk_strategy']} initialValue={DEFAULT_STRATEGY.strategy}>
Expand Down
1 change: 1 addition & 0 deletions web/types/knowledge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ export type IStrategyParameter = {
param_name: string;
param_type: string;
default_value?: string | number;
description: string;
};

export type IChunkStrategyResponse = {
Expand Down

0 comments on commit ca83443

Please sign in to comment.