Skip to content

Commit

Permalink
feat: support configure chunking setting on knowledge base API (#623)
Browse files Browse the repository at this point in the history
close #221
part of #527
  • Loading branch information
Mini256 authored Feb 17, 2025
1 parent 13604e6 commit 51ef131
Show file tree
Hide file tree
Showing 15 changed files with 383 additions and 95 deletions.
31 changes: 31 additions & 0 deletions backend/app/alembic/versions/211f3c5aa125_chunking_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""chunking_settings
Revision ID: 211f3c5aa125
Revises: 2adc0b597dcd
Create Date: 2025-02-17 14:20:56.253857
"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "211f3c5aa125"
down_revision = "2adc0b597dcd"
branch_labels = None
depends_on = None


def upgrade():
    """Apply the migration: add a nullable JSON `chunking_config` column
    to the `knowledge_bases` table.
    """
    chunking_config_column = sa.Column("chunking_config", sa.JSON(), nullable=True)
    op.add_column("knowledge_bases", chunking_config_column)


def downgrade():
    """Revert the migration: drop the `chunking_config` column
    from the `knowledge_bases` table.
    """
    op.drop_column(table_name="knowledge_bases", column_name="chunking_config")
9 changes: 6 additions & 3 deletions backend/app/api/admin_routes/knowledge_base/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
)
from app.exceptions import KBNoVectorIndexConfigured
from app.models import KgIndexStatus
from app.models.knowledge_base import IndexMethod
from app.models.knowledge_base import IndexMethod, GeneralChunkingConfig, ChunkingConfig


class KnowledgeBaseCreate(BaseModel):
Expand All @@ -25,6 +25,7 @@ class KnowledgeBaseCreate(BaseModel):
)
llm_id: Optional[int] = None
embedding_model_id: Optional[int] = None
chunking_config: ChunkingConfig = Field(default_factory=GeneralChunkingConfig)
data_sources: list[KBDataSourceCreate] = Field(default_factory=list)

@field_validator("name")
Expand All @@ -45,6 +46,7 @@ def index_methods_must_has_vector(cls, v: list[IndexMethod]) -> list[IndexMethod
class KnowledgeBaseUpdate(BaseModel):
name: Optional[str] = None
description: Optional[str] = None
chunking_config: Optional[ChunkingConfig] = None


class KnowledgeBaseDetail(BaseModel):
Expand All @@ -54,12 +56,13 @@ class KnowledgeBaseDetail(BaseModel):

id: int
name: str
description: str
description: Optional[str] = None
documents_total: int
data_sources_total: int
# Notice: By default, SQLModel will not serialize list type relationships.
# https://github.com/fastapi/sqlmodel/issues/37#issuecomment-2093607242
data_sources: list[KBDataSource]
chunking_config: Optional[ChunkingConfig] = None
index_methods: list[IndexMethod]
llm_id: int | None = None
llm: LLMDescriptor | None = None
Expand All @@ -77,7 +80,7 @@ class KnowledgeBaseItem(BaseModel):

id: int
name: str
description: str
description: Optional[str] = None
documents_total: int
data_sources_total: int
index_methods: list[IndexMethod]
Expand Down
1 change: 1 addition & 0 deletions backend/app/api/admin_routes/knowledge_base/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def create_knowledge_base(
index_methods=create.index_methods,
llm_id=create.llm_id,
embedding_model_id=create.embedding_model_id,
chunking_config=create.chunking_config.model_dump(),
data_sources=data_sources,
created_by=user.id,
updated_by=user.id,
Expand Down
8 changes: 2 additions & 6 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,8 @@ def _validate_sentry_sample_rate(self) -> Self:
COMPLIED_INTENT_ANALYSIS_PROGRAM_PATH: str | None = None
COMPLIED_PREREQUISITE_ANALYSIS_PROGRAM_PATH: str | None = None

# CAUTION: Do not change EMBEDDING_DIMS after initializing the database.
# Changing the embedding dimensions requires recreating the database and tables.
# The default EMBEDDING_DIMS and EMBEDDING_MAX_TOKENS are set for the OpenAI text-embedding-3-small model.
# If using a different embedding model, adjust these values according to the model's specifications.
# For example:
# maidalun1020/bce-embedding-base_v1: EMBEDDING_DIMS=768 EMBEDDING_MAX_TOKENS=512
# NOTICE: EMBEDDING_DIMS and EMBEDDING_MAX_TOKENS are deprecated and
# will be removed in the future.
EMBEDDING_DIMS: int = 1536
EMBEDDING_MAX_TOKENS: int = 2048

Expand Down
5 changes: 5 additions & 0 deletions backend/app/models/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ class DocIndexTaskStatus(str, enum.Enum):
FAILED = "failed"


class ContentFormat(str, enum.Enum):
    """Serialized format of a document's stored content."""

    TEXT = "text"
    MARKDOWN = "markdown"


class Document(UpdatableBaseModel, table=True):
# Avoid "expected `enum` but got `str`" error.
model_config = ConfigDict(use_enum_values=True)
Expand Down
4 changes: 2 additions & 2 deletions backend/app/models/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
from tidb_vector.sqlalchemy import VectorType
from sqlalchemy import Index

from app.core.config import settings
from app.models.knowledge_base import KnowledgeBase
from app.models.knowledge_base_scoped.registry import get_kb_scoped_registry
from app.models.knowledge_base_scoped.table_naming import (
get_kb_entities_table_name,
get_kb_vector_dims,
)
from app.models.patch.sql_model import SQLModel as PatchSQLModel
from app.core.config import settings


class EntityType(str, enum.Enum):
Expand All @@ -37,7 +37,7 @@ class EntityBase(SQLModel):
synopsis_info: List | Dict | None = Field(default=None, sa_column=Column(JSON))


# Notice: DO NOT forget to modify the definition in `get_kb_chunk_model` to
# Notice: DO NOT forget to modify the definition in `get_kb_entity_model` to
# keep the table structure on both sides consistent.
class Entity(EntityBase, table=True):
id: Optional[int] = Field(default=None, primary_key=True)
Expand Down
96 changes: 89 additions & 7 deletions backend/app/models/knowledge_base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import enum
from datetime import datetime
from typing import Optional
from typing import Dict, Optional, Union
from uuid import UUID

from pydantic import BaseModel
from sqlalchemy import JSON, func
from sqlalchemy.dialects.mysql import MEDIUMTEXT
from sqlmodel import (
Expand All @@ -12,13 +13,21 @@
Relationship as SQLRelationship,
SQLModel,
)

from llama_index.core.node_parser.text.sentence import (
DEFAULT_PARAGRAPH_SEP,
SENTENCE_CHUNK_OVERLAP,
)
from app.rag.node_parser.file.markdown import (
DEFAULT_CHUNK_HEADER_LEVEL,
DEFAULT_CHUNK_SIZE,
)
from app.api.admin_routes.models import KnowledgeBaseDescriptor
from app.exceptions import KBDataSourceNotFound
from app.models.auth import User
from app.models.data_source import DataSource
from app.models.embed_model import EmbeddingModel
from app.models.llm import LLM
from app.types import MimeTypes

# For compatibility with old code, define a fake knowledge base id.
PHONY_KNOWLEDGE_BASE_ID = 0
Expand All @@ -36,10 +45,87 @@ class KnowledgeBaseDataSource(SQLModel, table=True):
__tablename__ = "knowledge_base_datasources"


# Chunking Settings.


class ChunkSplitter(str, enum.Enum):
    """Names of the splitter implementations selectable for chunking.

    Values mirror the llama-index node-parser class names.
    """

    SENTENCE_SPLITTER = "SentenceSplitter"
    MARKDOWN_NODE_PARSER = "MarkdownNodeParser"


class SentenceSplitterOptions(BaseModel):
    """Options for the sentence-based splitter (plain-text chunking)."""

    chunk_size: int = Field(
        description="The token chunk size for each chunk.",
        default=1000,
        gt=0,
    )
    chunk_overlap: int = Field(
        description="The overlap size for each chunk.",
        default=SENTENCE_CHUNK_OVERLAP,
        # Fix: zero overlap (fully disjoint chunks) is a valid configuration,
        # so allow >= 0 instead of rejecting 0 with > 0.
        ge=0,
    )
    paragraph_separator: str = Field(
        description="The paragraph separator for splitting the text.",
        default=DEFAULT_PARAGRAPH_SEP,
    )


class MarkdownNodeParserOptions(BaseModel):
    """Options for the markdown-aware node parser (header-based chunking)."""

    chunk_size: int = Field(
        description="The token chunk size for each chunk.",
        default=1000,
        gt=0,
    )
    chunk_header_level: int = Field(
        description="The header level to split on",
        default=DEFAULT_CHUNK_HEADER_LEVEL,
        # Markdown defines header levels 1 (`#`) through 6 (`######`).
        ge=1,
        le=6,
    )


class ChunkSplitterConfig(BaseModel):
    """Pairs a splitter implementation with its options for one rule."""

    splitter: ChunkSplitter = Field(default=ChunkSplitter.SENTENCE_SPLITTER)
    # NOTE(review): `Field()` gives no default, so `splitter_options` is required
    # even though `splitter` itself defaults — confirm this asymmetry is intended.
    splitter_options: Union[SentenceSplitterOptions, MarkdownNodeParserOptions] = (
        Field()
    )


class ChunkingMode(str, enum.Enum):
    """Discriminator for the chunking configuration variants below."""

    GENERAL = "general"
    ADVANCED = "advanced"


class BaseChunkingConfig(BaseModel):
    """Common base for chunking configs; `mode` discriminates the subclass."""

    mode: ChunkingMode = Field(default=ChunkingMode.GENERAL)


class GeneralChunkingConfig(BaseChunkingConfig):
    """One-size-fits-all chunking: a single size/overlap/separator applied to
    every document regardless of its type.
    """

    mode: ChunkingMode = Field(default=ChunkingMode.GENERAL)
    chunk_size: int = Field(default=DEFAULT_CHUNK_SIZE, gt=0)
    # Fix: zero overlap (fully disjoint chunks) is a valid configuration,
    # so allow >= 0 instead of rejecting 0 with > 0.
    chunk_overlap: int = Field(default=SENTENCE_CHUNK_OVERLAP, ge=0)
    paragraph_separator: str = Field(default=DEFAULT_PARAGRAPH_SEP)


class AdvancedChunkingConfig(BaseChunkingConfig):
    """Per-type chunking: maps each document MIME type to its own splitter
    configuration.
    """

    mode: ChunkingMode = Field(default=ChunkingMode.ADVANCED)
    # Fix: the field is a Dict, so the default must be produced by `dict`;
    # the previous `default_factory=list` yielded an empty *list* here.
    rules: Dict[MimeTypes, ChunkSplitterConfig] = Field(default_factory=dict)


ChunkingConfig = Union[GeneralChunkingConfig | AdvancedChunkingConfig]

# Knowledge Base Model


class KnowledgeBase(SQLModel, table=True):
id: Optional[int] = Field(default=None, primary_key=True)
name: str = Field(max_length=255, nullable=False)
description: str = Field(sa_column=Column(MEDIUMTEXT))
description: Optional[str] = Field(sa_column=Column(MEDIUMTEXT), default=None)

# The config for chunking, the process to break down the document into smaller chunks.
chunking_config: Dict = Field(
sa_column=Column(JSON), default=GeneralChunkingConfig().model_dump()
)

# Data sources config.
data_sources: list["DataSource"] = SQLRelationship(
Expand All @@ -64,10 +150,6 @@ class KnowledgeBase(SQLModel, table=True):
"foreign_keys": "KnowledgeBase.embedding_model_id",
},
)

# TODO: Support knowledge-base level retrieval config.

# TODO: Store the statistics of the knowledge base.
documents_total: int = Field(default=0)
data_sources_total: int = Field(default=0)

Expand Down
2 changes: 1 addition & 1 deletion backend/app/models/relationship.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class RelationshipBase(SQLModel):
chunk_id: Optional[UUID] = Field(default=None, nullable=True)


# Notice: DO NOT forget to modify the definition in `get_kb_chunk_model` to
# Notice: DO NOT forget to modify the definition in `get_kb_relationship_model` to
# keep the table structure on both sides consistent.
class Relationship(RelationshipBase, table=True):
id: Optional[int] = Field(default=None, primary_key=True)
Expand Down
Loading

0 comments on commit 51ef131

Please sign in to comment.