Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(RAG): add metadata properties filters #1395

Merged
merged 3 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions dbgpt/rag/knowledge/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, List, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type, Union

from dbgpt.core import Document
from dbgpt.rag.text_splitter.text_splitter import (
Expand Down Expand Up @@ -147,16 +147,18 @@ def __init__(
self,
path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = None,
data_loader: Optional[Any] = None,
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments."""
self._path = path
self._type = knowledge_type
self._data_loader = data_loader
self._loader = loader
self._metadata = metadata

def load(self) -> List[Document]:
"""Load knowledge from data_loader."""
"""Load knowledge from data loader."""
documents = self._load()
return self._postprocess(documents)

Expand All @@ -171,12 +173,12 @@ def document_type(cls) -> Any:
return None

def _postprocess(self, docs: List[Document]) -> List[Document]:
"""Post process knowledge from data_loader."""
"""Post process knowledge from data loader."""
return docs

@abstractmethod
def _load(self) -> List[Document]:
"""Preprocess knowledge from data_loader."""
"""Preprocess knowledge from data loader."""

@classmethod
def support_chunk_strategy(cls) -> List[ChunkStrategy]:
Expand Down
15 changes: 11 additions & 4 deletions dbgpt/rag/knowledge/csv.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""CSV Knowledge."""
import csv
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
Expand All @@ -21,6 +21,7 @@ def __init__(
source_column: Optional[str] = None,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create CSV Knowledge with Knowledge arguments.
Expand All @@ -32,9 +33,13 @@ def __init__(
encoding(str, optional): csv encoding
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._encoding = encoding
self._source_column = source_column

Expand Down Expand Up @@ -67,6 +72,8 @@ def _load(self) -> List[Document]:
f"file."
)
metadata = {"source": source, "row": i}
if self._metadata:
metadata.update(self._metadata) # type: ignore
doc = Document(content=content, metadata=metadata)
docs.append(doc)

Expand Down
17 changes: 10 additions & 7 deletions dbgpt/rag/knowledge/datasource.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Datasource Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.datasource import BaseConnector
Expand All @@ -16,26 +16,29 @@ def __init__(
connector: BaseConnector,
summary_template: str = "{table_name}({columns})",
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create Datasource Knowledge with Knowledge arguments.

Args:
path(str, optional): file path
connector(BaseConnector): connector
summary_template(str, optional): summary template
knowledge_type(KnowledgeType, optional): knowledge type
data_loader(Any, optional): loader
metadata(Dict[str, Union[str, List[str]]], optional): metadata
"""
self._connector = connector
self._summary_template = summary_template
super().__init__(knowledge_type=knowledge_type, **kwargs)
super().__init__(knowledge_type=knowledge_type, metadata=metadata, **kwargs)

def _load(self) -> List[Document]:
"""Load datasource document from data_loader."""
docs = []
for table_summary in _parse_db_summary(self._connector, self._summary_template):
docs.append(
Document(content=table_summary, metadata={"source": "database"})
)
metadata = {"source": "database"}
if self._metadata:
metadata.update(self._metadata) # type: ignore
docs.append(Document(content=table_summary, metadata=metadata))
return docs

@classmethod
Expand Down
20 changes: 13 additions & 7 deletions dbgpt/rag/knowledge/docx.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Docx Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

import docx

Expand All @@ -21,6 +21,7 @@ def __init__(
knowledge_type: Any = KnowledgeType.DOCUMENT,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create Docx Knowledge with Knowledge arguments.
Expand All @@ -31,9 +32,13 @@ def __init__(
encoding(str, optional): file encoding
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._encoding = encoding

def _load(self) -> List[Document]:
Expand All @@ -48,9 +53,10 @@ def _load(self) -> List[Document]:
para = doc.paragraphs[i]
text = para.text
content.append(text)
docs.append(
Document(content="\n".join(content), metadata={"source": self._path})
)
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata) # type: ignore
docs.append(Document(content="\n".join(content), metadata=metadata))
return docs
return [Document.langchain2doc(lc_document) for lc_document in documents]

Expand Down
20 changes: 16 additions & 4 deletions dbgpt/rag/knowledge/factory.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Knowledge Factory to create knowledge from file path and url."""
from typing import List, Optional, Type
from typing import Dict, List, Optional, Type, Union

from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
from dbgpt.rag.knowledge.string import StringKnowledge
Expand All @@ -13,6 +13,7 @@ def __init__(
self,
file_path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
):
"""Create Knowledge Factory with file path and knowledge type.

Expand All @@ -22,18 +23,21 @@ def __init__(
"""
self._file_path = file_path
self._knowledge_type = knowledge_type
self._metadata = metadata

@classmethod
def create(
cls,
datasource: str = "",
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
):
"""Create knowledge from file path, url or text.

Args:
datasource: path of the file to convert
knowledge_type: type of knowledge
metadata: optional extra metadata forwarded to the created knowledge (passed on for DOCUMENT and TEXT types; not forwarded for URL)

Examples:
.. code-block:: python
Expand All @@ -52,12 +56,16 @@ def create(
match knowledge_type:
case KnowledgeType.DOCUMENT:
return cls.from_file_path(
file_path=datasource, knowledge_type=knowledge_type
file_path=datasource,
knowledge_type=knowledge_type,
metadata=metadata,
)
case KnowledgeType.URL:
return cls.from_url(url=datasource, knowledge_type=knowledge_type)
case KnowledgeType.TEXT:
return cls.from_text(text=datasource, knowledge_type=knowledge_type)
return cls.from_text(
text=datasource, knowledge_type=knowledge_type, metadata=metadata
)
case _:
raise Exception(f"Unsupported knowledge type '{knowledge_type}'")

Expand All @@ -66,6 +74,7 @@ def from_file_path(
cls,
file_path: str = "",
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
) -> Knowledge:
"""Create knowledge from path.

Expand All @@ -82,10 +91,11 @@ def from_file_path(
datasource="path/to/document.pdf",
knowledge_type=KnowledgeType.DOCUMENT,
)

"""
factory = cls(file_path=file_path, knowledge_type=knowledge_type)
return factory._select_document_knowledge(
file_path=file_path, knowledge_type=knowledge_type
file_path=file_path, knowledge_type=knowledge_type, metadata=metadata
)

@staticmethod
Expand Down Expand Up @@ -117,6 +127,7 @@ def from_url(
def from_text(
text: str = "",
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
) -> Knowledge:
"""Create knowledge from text.

Expand All @@ -127,6 +138,7 @@ def from_text(
return StringKnowledge(
text=text,
knowledge_type=knowledge_type,
metadata=metadata,
)

def _select_document_knowledge(self, **kwargs):
Expand Down
15 changes: 11 additions & 4 deletions dbgpt/rag/knowledge/html.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""HTML Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

import chardet

Expand All @@ -20,6 +20,7 @@ def __init__(
file_path: Optional[str] = None,
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create HTML Knowledge with Knowledge arguments.
Expand All @@ -29,9 +30,13 @@ def __init__(
knowledge_type(KnowledgeType, optional): knowledge type
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)

def _load(self) -> List[Document]:
"""Load html document from loader."""
Expand All @@ -48,6 +53,8 @@ def _load(self) -> List[Document]:
else:
text = raw_text.decode(result["encoding"])
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata) # type: ignore
return [Document(content=text, metadata=metadata)]

return [Document.langchain2doc(lc_document) for lc_document in documents]
Expand Down
15 changes: 11 additions & 4 deletions dbgpt/rag/knowledge/markdown.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Markdown Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
Expand All @@ -19,6 +19,7 @@ def __init__(
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create Markdown Knowledge with Knowledge arguments.
Expand All @@ -29,9 +30,13 @@ def __init__(
encoding(str, optional): encoding used to read the markdown file
loader(Any, optional): loader
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._encoding = encoding

def _load(self) -> List[Document]:
Expand All @@ -44,6 +49,8 @@ def _load(self) -> List[Document]:
with open(self._path, encoding=self._encoding, errors="ignore") as f:
markdown_text = f.read()
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata) # type: ignore
documents = [Document(content=markdown_text, metadata=metadata)]
return documents
return [Document.langchain2doc(lc_document) for lc_document in documents]
Expand Down
15 changes: 11 additions & 4 deletions dbgpt/rag/knowledge/pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""PDF Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
Expand All @@ -19,6 +19,7 @@ def __init__(
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional[Any] = None,
language: Optional[str] = "zh",
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create PDF Knowledge with Knowledge arguments.
Expand All @@ -29,9 +30,13 @@ def __init__(
loader(Any, optional): loader
language(str, optional): language
"""
self._path = file_path
self._type = knowledge_type
self._loader = loader
super().__init__(
path=file_path,
knowledge_type=knowledge_type,
data_loader=loader,
metadata=metadata,
**kwargs,
)
self._language = language

def _load(self) -> List[Document]:
Expand Down Expand Up @@ -65,6 +70,8 @@ def _load(self) -> List[Document]:
page = "\n".join(cleaned_lines)
# cleaned_pages.append(page)
metadata = {"source": self._path, "page": page_num}
if self._metadata:
metadata.update(self._metadata) # type: ignore
# text = "\f".join(cleaned_pages)
document = Document(content=page, metadata=metadata)
documents.append(document)
Expand Down
Loading
Loading