Skip to content

Commit

Permalink
feat: support configure chunking setting on knowledge base API (#623)
Browse files Browse the repository at this point in the history
close #221
part of #527
  • Loading branch information
Mini256 authored Feb 17, 2025
1 parent 13604e6 commit 51ef131
Show file tree
Hide file tree
Showing 15 changed files with 383 additions and 95 deletions.
31 changes: 31 additions & 0 deletions backend/app/alembic/versions/211f3c5aa125_chunking_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""chunking_settings
Revision ID: 211f3c5aa125
Revises: 2adc0b597dcd
Create Date: 2025-02-17 14:20:56.253857
"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "211f3c5aa125"
down_revision = "2adc0b597dcd"
branch_labels = None
depends_on = None


def upgrade():
    """Apply the migration: add a nullable JSON `chunking_config` column
    to the `knowledge_bases` table.
    """
    chunking_config_column = sa.Column("chunking_config", sa.JSON(), nullable=True)
    op.add_column("knowledge_bases", chunking_config_column)


def downgrade():
    """Revert the migration: drop the `chunking_config` column
    from the `knowledge_bases` table.
    """
    op.drop_column(table_name="knowledge_bases", column_name="chunking_config")
9 changes: 6 additions & 3 deletions backend/app/api/admin_routes/knowledge_base/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
)
from app.exceptions import KBNoVectorIndexConfigured
from app.models import KgIndexStatus
from app.models.knowledge_base import IndexMethod
from app.models.knowledge_base import IndexMethod, GeneralChunkingConfig, ChunkingConfig


class KnowledgeBaseCreate(BaseModel):
Expand All @@ -25,6 +25,7 @@ class KnowledgeBaseCreate(BaseModel):
)
llm_id: Optional[int] = None
embedding_model_id: Optional[int] = None
chunking_config: ChunkingConfig = Field(default_factory=GeneralChunkingConfig)
data_sources: list[KBDataSourceCreate] = Field(default_factory=list)

@field_validator("name")
Expand All @@ -45,6 +46,7 @@ def index_methods_must_has_vector(cls, v: list[IndexMethod]) -> list[IndexMethod
class KnowledgeBaseUpdate(BaseModel):
name: Optional[str] = None
description: Optional[str] = None
chunking_config: Optional[ChunkingConfig] = None


class KnowledgeBaseDetail(BaseModel):
Expand All @@ -54,12 +56,13 @@ class KnowledgeBaseDetail(BaseModel):

id: int
name: str
description: str
description: Optional[str] = None
documents_total: int
data_sources_total: int
# Notice: By default, SQLModel will not serialize list type relationships.
# https://github.com/fastapi/sqlmodel/issues/37#issuecomment-2093607242
data_sources: list[KBDataSource]
chunking_config: Optional[ChunkingConfig] = None
index_methods: list[IndexMethod]
llm_id: int | None = None
llm: LLMDescriptor | None = None
Expand All @@ -77,7 +80,7 @@ class KnowledgeBaseItem(BaseModel):

id: int
name: str
description: str
description: Optional[str] = None
documents_total: int
data_sources_total: int
index_methods: list[IndexMethod]
Expand Down
1 change: 1 addition & 0 deletions backend/app/api/admin_routes/knowledge_base/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def create_knowledge_base(
index_methods=create.index_methods,
llm_id=create.llm_id,
embedding_model_id=create.embedding_model_id,
chunking_config=create.chunking_config.model_dump(),
data_sources=data_sources,
created_by=user.id,
updated_by=user.id,
Expand Down
8 changes: 2 additions & 6 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,8 @@ def _validate_sentry_sample_rate(self) -> Self:
COMPLIED_INTENT_ANALYSIS_PROGRAM_PATH: str | None = None
COMPLIED_PREREQUISITE_ANALYSIS_PROGRAM_PATH: str | None = None

# CAUTION: Do not change EMBEDDING_DIMS after initializing the database.
# Changing the embedding dimensions requires recreating the database and tables.
# The default EMBEDDING_DIMS and EMBEDDING_MAX_TOKENS are set for the OpenAI text-embedding-3-small model.
# If using a different embedding model, adjust these values according to the model's specifications.
# For example:
# maidalun1020/bce-embedding-base_v1: EMBEDDING_DIMS=768 EMBEDDING_MAX_TOKENS=512
# NOTICE: EMBEDDING_DIMS and EMBEDDING_MAX_TOKENS are deprecated and
# will be removed in the future.
EMBEDDING_DIMS: int = 1536
EMBEDDING_MAX_TOKENS: int = 2048

Expand Down
5 changes: 5 additions & 0 deletions backend/app/models/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ class DocIndexTaskStatus(str, enum.Enum):
FAILED = "failed"


class ContentFormat(str, enum.Enum):
    """Serialized format of a document's stored content."""

    TEXT = "text"
    MARKDOWN = "markdown"


class Document(UpdatableBaseModel, table=True):
# Avoid "expected `enum` but got `str`" error.
model_config = ConfigDict(use_enum_values=True)
Expand Down
4 changes: 2 additions & 2 deletions backend/app/models/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
from tidb_vector.sqlalchemy import VectorType
from sqlalchemy import Index

from app.core.config import settings
from app.models.knowledge_base import KnowledgeBase
from app.models.knowledge_base_scoped.registry import get_kb_scoped_registry
from app.models.knowledge_base_scoped.table_naming import (
get_kb_entities_table_name,
get_kb_vector_dims,
)
from app.models.patch.sql_model import SQLModel as PatchSQLModel
from app.core.config import settings


class EntityType(str, enum.Enum):
Expand All @@ -37,7 +37,7 @@ class EntityBase(SQLModel):
synopsis_info: List | Dict | None = Field(default=None, sa_column=Column(JSON))


# Notice: DO NOT forget to modify the definition in `get_kb_chunk_model` to
# Notice: DO NOT forget to modify the definition in `get_kb_entity_model` to
# keep the table structure on both sides consistent.
class Entity(EntityBase, table=True):
id: Optional[int] = Field(default=None, primary_key=True)
Expand Down
96 changes: 89 additions & 7 deletions backend/app/models/knowledge_base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import enum
from datetime import datetime
from typing import Optional
from typing import Dict, Optional, Union
from uuid import UUID

from pydantic import BaseModel
from sqlalchemy import JSON, func
from sqlalchemy.dialects.mysql import MEDIUMTEXT
from sqlmodel import (
Expand All @@ -12,13 +13,21 @@
Relationship as SQLRelationship,
SQLModel,
)

from llama_index.core.node_parser.text.sentence import (
DEFAULT_PARAGRAPH_SEP,
SENTENCE_CHUNK_OVERLAP,
)
from app.rag.node_parser.file.markdown import (
DEFAULT_CHUNK_HEADER_LEVEL,
DEFAULT_CHUNK_SIZE,
)
from app.api.admin_routes.models import KnowledgeBaseDescriptor
from app.exceptions import KBDataSourceNotFound
from app.models.auth import User
from app.models.data_source import DataSource
from app.models.embed_model import EmbeddingModel
from app.models.llm import LLM
from app.types import MimeTypes

# For compatibility with old code, define a fake knowledge base id.
PHONY_KNOWLEDGE_BASE_ID = 0
Expand All @@ -36,10 +45,87 @@ class KnowledgeBaseDataSource(SQLModel, table=True):
__tablename__ = "knowledge_base_datasources"


# Chunking Settings.


class ChunkSplitter(str, enum.Enum):
    """Names of the splitter implementations selectable for chunking.

    Values mirror the llama-index node-parser class names.
    """

    SENTENCE_SPLITTER = "SentenceSplitter"
    MARKDOWN_NODE_PARSER = "MarkdownNodeParser"


class SentenceSplitterOptions(BaseModel):
    """Options for the sentence-based splitter (plain-text chunking)."""

    chunk_size: int = Field(
        description="The token chunk size for each chunk.",
        default=1000,
        gt=0,
    )
    chunk_overlap: int = Field(
        description="The overlap size for each chunk.",
        default=SENTENCE_CHUNK_OVERLAP,
        # Fix: zero overlap (fully disjoint chunks) is a valid configuration,
        # so allow >= 0 instead of rejecting 0 with > 0.
        ge=0,
    )
    paragraph_separator: str = Field(
        description="The paragraph separator for splitting the text.",
        default=DEFAULT_PARAGRAPH_SEP,
    )


class MarkdownNodeParserOptions(BaseModel):
    """Options for the markdown-aware node parser (header-based chunking)."""

    chunk_size: int = Field(
        description="The token chunk size for each chunk.",
        default=1000,
        gt=0,
    )
    chunk_header_level: int = Field(
        description="The header level to split on",
        default=DEFAULT_CHUNK_HEADER_LEVEL,
        # Markdown defines header levels 1 (`#`) through 6 (`######`).
        ge=1,
        le=6,
    )


class ChunkSplitterConfig(BaseModel):
    """Pairs a splitter implementation with its options for one rule."""

    splitter: ChunkSplitter = Field(default=ChunkSplitter.SENTENCE_SPLITTER)
    # NOTE(review): `Field()` gives no default, so `splitter_options` is required
    # even though `splitter` itself defaults — confirm this asymmetry is intended.
    splitter_options: Union[SentenceSplitterOptions, MarkdownNodeParserOptions] = (
        Field()
    )


class ChunkingMode(str, enum.Enum):
    """Discriminator for the chunking configuration variants below."""

    GENERAL = "general"
    ADVANCED = "advanced"


class BaseChunkingConfig(BaseModel):
    """Common base for chunking configs; `mode` discriminates the subclass."""

    mode: ChunkingMode = Field(default=ChunkingMode.GENERAL)


class GeneralChunkingConfig(BaseChunkingConfig):
    """One-size-fits-all chunking: a single size/overlap/separator applied to
    every document regardless of its type.
    """

    mode: ChunkingMode = Field(default=ChunkingMode.GENERAL)
    chunk_size: int = Field(default=DEFAULT_CHUNK_SIZE, gt=0)
    # Fix: zero overlap (fully disjoint chunks) is a valid configuration,
    # so allow >= 0 instead of rejecting 0 with > 0.
    chunk_overlap: int = Field(default=SENTENCE_CHUNK_OVERLAP, ge=0)
    paragraph_separator: str = Field(default=DEFAULT_PARAGRAPH_SEP)


class AdvancedChunkingConfig(BaseChunkingConfig):
    """Per-type chunking: maps each document MIME type to its own splitter
    configuration.
    """

    mode: ChunkingMode = Field(default=ChunkingMode.ADVANCED)
    # Fix: the field is a Dict, so the default must be produced by `dict`;
    # the previous `default_factory=list` yielded an empty *list* here.
    rules: Dict[MimeTypes, ChunkSplitterConfig] = Field(default_factory=dict)


ChunkingConfig = Union[GeneralChunkingConfig | AdvancedChunkingConfig]

# Knowledge Base Model


class KnowledgeBase(SQLModel, table=True):
id: Optional[int] = Field(default=None, primary_key=True)
name: str = Field(max_length=255, nullable=False)
description: str = Field(sa_column=Column(MEDIUMTEXT))
description: Optional[str] = Field(sa_column=Column(MEDIUMTEXT), default=None)

# The config for chunking, the process to break down the document into smaller chunks.
chunking_config: Dict = Field(
sa_column=Column(JSON), default=GeneralChunkingConfig().model_dump()
)

# Data sources config.
data_sources: list["DataSource"] = SQLRelationship(
Expand All @@ -64,10 +150,6 @@ class KnowledgeBase(SQLModel, table=True):
"foreign_keys": "KnowledgeBase.embedding_model_id",
},
)

# TODO: Support knowledge-base level retrieval config.

# TODO: Store the statistics of the knowledge base.
documents_total: int = Field(default=0)
data_sources_total: int = Field(default=0)

Expand Down
2 changes: 1 addition & 1 deletion backend/app/models/relationship.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class RelationshipBase(SQLModel):
chunk_id: Optional[UUID] = Field(default=None, nullable=True)


# Notice: DO NOT forget to modify the definition in `get_kb_chunk_model` to
# Notice: DO NOT forget to modify the definition in `get_kb_relationship_model` to
# keep the table structure on both sides consistent.
class Relationship(RelationshipBase, table=True):
id: Optional[int] = Field(default=None, primary_key=True)
Expand Down
Loading

0 comments on commit 51ef131

Please sign in to comment.