Merge branch 'main' into feat/web-theme

eosphoros-ai · Jan 4, 2024 · 71d86cb
2 parents 0cd1837 + ca83443
commit 71d86cb
Show file tree

Hide file tree

Showing 26 changed files with 98 additions and 41 deletions.
diff --git a/dbgpt/app/knowledge/service.py b/dbgpt/app/knowledge/service.py
@@ -7,7 +7,7 @@
 from dbgpt.rag.chunk import Chunk
 from dbgpt.rag.chunk_manager import ChunkParameters
 from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
-from dbgpt.rag.knowledge.base import KnowledgeType
+from dbgpt.rag.knowledge.base import KnowledgeType, ChunkStrategy
 from dbgpt.rag.knowledge.factory import KnowledgeFactory
 from dbgpt.rag.text_splitter.text_splitter import (
     RecursiveCharacterTextSplitter,
@@ -234,7 +234,7 @@ def batch_document_sync(
                     f" doc:{doc.doc_name} status is {doc.status}, can not sync"
                 )
             chunk_parameters = sync_request.chunk_parameters
-            if "Automatic" == chunk_parameters.chunk_strategy:
+            if chunk_parameters.chunk_strategy != ChunkStrategy.CHUNK_BY_SIZE.name:
                 space_context = self.get_space_context(space_name)
                 chunk_parameters.chunk_size = (
                     CFG.KNOWLEDGE_CHUNK_SIZE

diff --git a/dbgpt/app/static/404.html b/dbgpt/app/static/404.html
diff --git a/dbgpt/app/static/404/index.html b/dbgpt/app/static/404/index.html
diff --git a/dbgpt/app/static/_next/static/I7dQBXvA2vgamhtZ4dNdp/_buildManifest.js b/dbgpt/app/static/_next/static/I7dQBXvA2vgamhtZ4dNdp/_buildManifest.js
diff --git a/dbgpt/app/static/_next/static/ZbyASIrV2Qi_gF86DN3uR/_ssgManifest.js b/dbgpt/app/static/_next/static/I7dQBXvA2vgamhtZ4dNdp/_ssgManifest.js
diff --git a/dbgpt/app/static/_next/static/ZbyASIrV2Qi_gF86DN3uR/_buildManifest.js b/dbgpt/app/static/_next/static/ZbyASIrV2Qi_gF86DN3uR/_buildManifest.js
diff --git a/dbgpt/app/static/_next/static/chunks/10-f02ccef88f814547.js b/dbgpt/app/static/_next/static/chunks/10-f02ccef88f814547.js
diff --git a/dbgpt/app/static/_next/static/chunks/450-bd680f0e37e9b4b9.js b/dbgpt/app/static/_next/static/chunks/450-bd680f0e37e9b4b9.js
diff --git a/dbgpt/app/static/_next/static/chunks/pages/_app-27d17772c0f13b37.js b/dbgpt/app/static/_next/static/chunks/pages/_app-acb9967659ff9821.js
diff --git a/dbgpt/app/static/_next/static/chunks/pages/knowledge-b9300e7addf1931f.js b/dbgpt/app/static/_next/static/chunks/pages/knowledge-f3c914cac944c089.js
diff --git a/dbgpt/app/static/agent/index.html b/dbgpt/app/static/agent/index.html
diff --git a/dbgpt/app/static/chat/[scene]/[id]/index.html b/dbgpt/app/static/chat/[scene]/[id]/index.html
diff --git a/dbgpt/app/static/chat/index.html b/dbgpt/app/static/chat/index.html
diff --git a/dbgpt/app/static/database/index.html b/dbgpt/app/static/database/index.html
diff --git a/dbgpt/app/static/index.html b/dbgpt/app/static/index.html
diff --git a/dbgpt/app/static/knowledge/chunk/index.html b/dbgpt/app/static/knowledge/chunk/index.html
diff --git a/dbgpt/app/static/knowledge/index.html b/dbgpt/app/static/knowledge/index.html
diff --git a/dbgpt/app/static/models/index.html b/dbgpt/app/static/models/index.html
diff --git a/dbgpt/app/static/prompt/index.html b/dbgpt/app/static/prompt/index.html
diff --git a/dbgpt/rag/chunk_manager.py b/dbgpt/rag/chunk_manager.py
@@ -45,6 +45,10 @@ class ChunkParameters(BaseModel):
         default="\n",
         description="chunk separator",
     )
+    enable_merge: bool = Field(
+        default=None,
+        description="enable chunk merge by chunk_size.",
+    )
 
 
 class ChunkManager:
@@ -134,4 +138,5 @@ def _select_text_splitter(
             chunk_size=self._chunk_parameters.chunk_size,
             chunk_overlap=self._chunk_parameters.chunk_overlap,
             separator=self._chunk_parameters.separator,
+            enable_merge=self._chunk_parameters.enable_merge,
         )
diff --git a/dbgpt/rag/knowledge/base.py b/dbgpt/rag/knowledge/base.py
@@ -47,22 +47,52 @@ class ChunkStrategy(Enum):
     CHUNK_BY_SIZE = (
         RecursiveCharacterTextSplitter,
         [
-            {"param_name": "chunk_size", "param_type": "int", "default_value": 512},
-            {"param_name": "chunk_overlap", "param_type": "int", "default_value": 50},
+            {
+                "param_name": "chunk_size",
+                "param_type": "int",
+                "default_value": 512,
+                "description": "The size of the data chunks used in processing.",
+            },
+            {
+                "param_name": "chunk_overlap",
+                "param_type": "int",
+                "default_value": 50,
+                "description": "The amount of overlap between adjacent data chunks.",
+            },
         ],
         "chunk size",
         "split document by chunk size",
     )
     CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
     CHUNK_BY_PARAGRAPH = (
         ParagraphTextSplitter,
-        [{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
+        [
+            {
+                "param_name": "separator",
+                "param_type": "string",
+                "default_value": "\\n",
+                "description": "paragraph separator",
+            }
+        ],
         "paragraph",
         "split document by paragraph",
     )
     CHUNK_BY_SEPARATOR = (
         SeparatorTextSplitter,
-        [{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
+        [
+            {
+                "param_name": "separator",
+                "param_type": "string",
+                "default_value": "\\n",
+                "description": "chunk separator",
+            },
+            {
+                "param_name": "enable_merge",
+                "param_type": "boolean",
+                "default_value": False,
+                "description": "Whether to merge according to the chunk_size after splitting by the separator.",
+            },
+        ],
         "separator",
         "split document by separator",
     )
@@ -80,6 +110,7 @@ def __init__(self, splitter_class, parameters, alias, description):
         self.description = description
 
     def match(self, *args, **kwargs):
+        kwargs = {k: v for k, v in kwargs.items() if v is not None}
         return self.value[0](*args, **kwargs)
 
 

diff --git a/dbgpt/rag/text_splitter/text_splitter.py b/dbgpt/rag/text_splitter/text_splitter.py
@@ -682,6 +682,7 @@ class SeparatorTextSplitter(CharacterTextSplitter):
 
     def __init__(self, separator: str = "\n", filters: list = [], **kwargs: Any):
         """Create a new TextSplitter."""
+        self._merge = kwargs.pop("enable_merge") or False
         super().__init__(**kwargs)
         self._separator = separator
         self._filter = filters
@@ -696,7 +697,9 @@ def split_text(
             splits = text.split(separator)
         else:
             splits = list(text)
-        return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
+        if self._merge:
+            return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
+        return list(filter(None, text.split(separator)))
 
 
 class PageTextSplitter(TextSplitter):

diff --git a/web/app/i18n.ts b/web/app/i18n.ts
@@ -61,7 +61,6 @@ const resources = {
       recall_score: 'recall_score',
       Set_a_threshold_score: 'Set a threshold score for the retrieval of similar vectors',
       recall_type: 'recall_type',
-      Recall_Type: 'recall type',
       model: 'model',
       A_model_used: 'A model used to create vector representations of text or other data',
       Automatic: 'Automatic',
@@ -240,12 +239,11 @@ const resources = {
       Please_select_a_file: '请上传一个文件',
       Please_input_the_text: '请输入文本',
       Embedding: '嵌入',
-      topk: '球',
+      topk: 'TopK',
       the_top_k_vectors: '基于相似度得分的前 k 个向量',
       recall_score: '召回分数',
       Set_a_threshold_score: '设置相似向量检索的阈值分数',
-      recall_type: '回忆类型',
-      Recall_Type: '回忆类型',
+      recall_type: '召回类型',
       model: '模型',
       A_model_used: '用于创建文本或其他数据的矢量表示的模型',
       Automatic: '自动切片',

diff --git a/web/components/knowledge/arguments-modal.tsx b/web/components/knowledge/arguments-modal.tsx
@@ -47,7 +47,7 @@ export default function ArgumentsModal({ space, argumentsShow, setArgumentsShow
           </Form.Item>
         </Col>
         <Col span={12}>
-          <Form.Item<IArguments> tooltip={t(`Recall_Type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
+          <Form.Item<IArguments> tooltip={t(`recall_type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
             <Input className="mb-5  h-12" />
           </Form.Item>
         </Col>

diff --git a/web/components/knowledge/strategy-form.tsx b/web/components/knowledge/strategy-form.tsx
@@ -1,5 +1,5 @@
 import { IChunkStrategyResponse } from '@/types/knowledge';
-import { Alert, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
+import { Alert, Checkbox, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
 import { useState } from 'react';
 import { useTranslation } from 'react-i18next';
 const { TextArea } = Input;
@@ -25,7 +25,7 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
   const [selectedStrategy, setSelectedStrategy] = useState<string>();
   const { t } = useTranslation();
   const DEFAULT_STRATEGY = {
-    strategy: t('Automatic'),
+    strategy: 'Automatic',
     name: t('Automatic'),
     desc: t('Automatic_desc'),
   };
@@ -50,17 +50,30 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
         {parameters?.map((param) => (
           <Form.Item
             key={`param_${param.param_name}`}
-            label={`${param.param_name}: ${param.param_type}`}
+            label={param.param_name}
             name={[field!.name, 'chunk_parameters', param.param_name]}
             rules={[{ required: true, message: t('Please_input_the_name') }]}
             initialValue={param.default_value}
+            valuePropName={param.param_type === 'boolean' ? 'checked' : 'value'}
+            tooltip={param.description}
           >
-            {param.param_type === 'int' ? <InputNumber className="w-full" min={1} /> : <TextArea className="w-full" rows={2} maxLength={6} />}
+            {renderParamByType(param.param_type)}
           </Form.Item>
         ))}
       </div>
     );
   }
+
+  function renderParamByType(type: string) {
+    switch (type) {
+      case 'int':
+        return <InputNumber className="w-full" min={1} />;
+      case 'string':
+        return <TextArea className="w-full" rows={2} />;
+      case 'boolean':
+        return <Checkbox />;
+    }
+  }
   return (
     <>
       <Form.Item name={[field!.name, 'chunk_parameters', 'chunk_strategy']} initialValue={DEFAULT_STRATEGY.strategy}>

diff --git a/web/types/knowledge.ts b/web/types/knowledge.ts
@@ -82,6 +82,7 @@ export type IStrategyParameter = {
   param_name: string;
   param_type: string;
   default_value?: string | number;
+  description: string;
 };
 
 export type IChunkStrategyResponse = {