Skip to content

Commit

Permalink
feat: add metadata properties filters
Browse files Browse the repository at this point in the history
  • Loading branch information
Aries-ckt committed Apr 9, 2024
1 parent bb77e13 commit cc39637
Show file tree
Hide file tree
Showing 20 changed files with 516 additions and 112 deletions.
4 changes: 3 additions & 1 deletion dbgpt/rag/knowledge/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, List, Optional, Tuple, Type
from typing import Any, Dict, List, Optional, Tuple, Type, Union

from dbgpt.core import Document
from dbgpt.rag.text_splitter.text_splitter import (
Expand Down Expand Up @@ -148,12 +148,14 @@ def __init__(
path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = None,
data_loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Initialize with Knowledge arguments."""
self._path = path
self._type = knowledge_type
self._data_loader = data_loader
self._metadata = metadata

def load(self) -> List[Document]:
"""Load knowledge from data_loader."""
Expand Down
6 changes: 5 additions & 1 deletion dbgpt/rag/knowledge/csv.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""CSV Knowledge."""
import csv
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
Expand All @@ -21,6 +21,7 @@ def __init__(
source_column: Optional[str] = None,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create CSV Knowledge with Knowledge arguments.
Expand All @@ -37,6 +38,7 @@ def __init__(
self._loader = loader
self._encoding = encoding
self._source_column = source_column
self._metadata = metadata

def _load(self) -> List[Document]:
"""Load csv document from loader."""
Expand Down Expand Up @@ -67,6 +69,8 @@ def _load(self) -> List[Document]:
f"file."
)
metadata = {"source": source, "row": i}
if self._metadata:
metadata.update(self._metadata)
doc = Document(content=content, metadata=metadata)
docs.append(doc)

Expand Down
11 changes: 7 additions & 4 deletions dbgpt/rag/knowledge/datasource.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Datasource Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.datasource import BaseConnector
Expand All @@ -16,6 +16,7 @@ def __init__(
connector: BaseConnector,
summary_template: str = "{table_name}({columns})",
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create Datasource Knowledge with Knowledge arguments.
Expand All @@ -27,15 +28,17 @@ def __init__(
"""
self._connector = connector
self._summary_template = summary_template
self._metadata = metadata
super().__init__(knowledge_type=knowledge_type, **kwargs)

def _load(self) -> List[Document]:
"""Load datasource document from data_loader."""
docs = []
for table_summary in _parse_db_summary(self._connector, self._summary_template):
docs.append(
Document(content=table_summary, metadata={"source": "database"})
)
metadata = {"source": "database"}
if self._metadata:
metadata.update(self._metadata)
docs.append(Document(content=table_summary, metadata=metadata))
return docs

@classmethod
Expand Down
11 changes: 7 additions & 4 deletions dbgpt/rag/knowledge/docx.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Docx Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

import docx

Expand All @@ -21,6 +21,7 @@ def __init__(
knowledge_type: Any = KnowledgeType.DOCUMENT,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create Docx Knowledge with Knowledge arguments.
Expand All @@ -35,6 +36,7 @@ def __init__(
self._type = knowledge_type
self._loader = loader
self._encoding = encoding
self._metadata = metadata

def _load(self) -> List[Document]:
"""Load docx document from loader."""
Expand All @@ -48,9 +50,10 @@ def _load(self) -> List[Document]:
para = doc.paragraphs[i]
text = para.text
content.append(text)
docs.append(
Document(content="\n".join(content), metadata={"source": self._path})
)
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata)
docs.append(Document(content="\n".join(content), metadata=metadata))
return docs
return [Document.langchain2doc(lc_document) for lc_document in documents]

Expand Down
20 changes: 16 additions & 4 deletions dbgpt/rag/knowledge/factory.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Knowledge Factory to create knowledge from file path and url."""
from typing import List, Optional, Type
from typing import Dict, List, Optional, Type, Union

from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
from dbgpt.rag.knowledge.string import StringKnowledge
Expand All @@ -13,6 +13,7 @@ def __init__(
self,
file_path: Optional[str] = None,
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
):
"""Create Knowledge Factory with file path and knowledge type.
Expand All @@ -22,18 +23,21 @@ def __init__(
"""
self._file_path = file_path
self._knowledge_type = knowledge_type
self._metadata = metadata

@classmethod
def create(
cls,
datasource: str = "",
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
):
"""Create knowledge from file path, url or text.
Args:
datasource: path of the file to convert
knowledge_type: type of knowledge
metadata: optional extra metadata forwarded to the created knowledge (passed on for the DOCUMENT and TEXT knowledge types)
Examples:
.. code-block:: python
Expand All @@ -52,12 +56,16 @@ def create(
match knowledge_type:
case KnowledgeType.DOCUMENT:
return cls.from_file_path(
file_path=datasource, knowledge_type=knowledge_type
file_path=datasource,
knowledge_type=knowledge_type,
metadata=metadata,
)
case KnowledgeType.URL:
return cls.from_url(url=datasource, knowledge_type=knowledge_type)
case KnowledgeType.TEXT:
return cls.from_text(text=datasource, knowledge_type=knowledge_type)
return cls.from_text(
text=datasource, knowledge_type=knowledge_type, metadata=metadata
)
case _:
raise Exception(f"Unsupported knowledge type '{knowledge_type}'")

Expand All @@ -66,6 +74,7 @@ def from_file_path(
cls,
file_path: str = "",
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
) -> Knowledge:
"""Create knowledge from path.
Expand All @@ -82,10 +91,11 @@ def from_file_path(
datasource="path/to/document.pdf",
knowledge_type=KnowledgeType.DOCUMENT,
)
"""
factory = cls(file_path=file_path, knowledge_type=knowledge_type)
return factory._select_document_knowledge(
file_path=file_path, knowledge_type=knowledge_type
file_path=file_path, knowledge_type=knowledge_type, metadata=metadata
)

@staticmethod
Expand Down Expand Up @@ -117,6 +127,7 @@ def from_url(
def from_text(
text: str = "",
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
) -> Knowledge:
"""Create knowledge from text.
Expand All @@ -127,6 +138,7 @@ def from_text(
return StringKnowledge(
text=text,
knowledge_type=knowledge_type,
metadata=metadata,
)

def _select_document_knowledge(self, **kwargs):
Expand Down
6 changes: 5 additions & 1 deletion dbgpt/rag/knowledge/markdown.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Markdown Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
Expand All @@ -19,6 +19,7 @@ def __init__(
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create Markdown Knowledge with Knowledge arguments.
Expand All @@ -33,6 +34,7 @@ def __init__(
self._type = knowledge_type
self._loader = loader
self._encoding = encoding
self._metadata = metadata

def _load(self) -> List[Document]:
"""Load markdown document from loader."""
Expand All @@ -44,6 +46,8 @@ def _load(self) -> List[Document]:
with open(self._path, encoding=self._encoding, errors="ignore") as f:
markdown_text = f.read()
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata)
documents = [Document(content=markdown_text, metadata=metadata)]
return documents
return [Document.langchain2doc(lc_document) for lc_document in documents]
Expand Down
6 changes: 5 additions & 1 deletion dbgpt/rag/knowledge/pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""PDF Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
Expand All @@ -19,6 +19,7 @@ def __init__(
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional[Any] = None,
language: Optional[str] = "zh",
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create PDF Knowledge with Knowledge arguments.
Expand All @@ -33,6 +34,7 @@ def __init__(
self._type = knowledge_type
self._loader = loader
self._language = language
self._metadata = metadata

def _load(self) -> List[Document]:
"""Load pdf document from loader."""
Expand Down Expand Up @@ -65,6 +67,8 @@ def _load(self) -> List[Document]:
page = "\n".join(cleaned_lines)
# cleaned_pages.append(page)
metadata = {"source": self._path, "page": page_num}
if self._metadata:
metadata.update(self._metadata)
# text = "\f".join(cleaned_pages)
document = Document(content=page, metadata=metadata)
documents.append(document)
Expand Down
11 changes: 7 additions & 4 deletions dbgpt/rag/knowledge/pptx.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""PPTX Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import (
Expand All @@ -19,6 +19,7 @@ def __init__(
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional[Any] = None,
language: Optional[str] = "zh",
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create PPTX Knowledge with Knowledge arguments.
Expand All @@ -32,6 +33,7 @@ def __init__(
self._type = knowledge_type
self._loader = loader
self._language = language
self._metadata = metadata

def _load(self) -> List[Document]:
"""Load pptx document from loader."""
Expand All @@ -47,9 +49,10 @@ def _load(self) -> List[Document]:
for shape in slide.shapes:
if hasattr(shape, "text") and shape.text:
content += shape.text
docs.append(
Document(content=content, metadata={"source": slide.slide_id})
)
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata)
docs.append(Document(content=content, metadata=metadata))
return docs
return [Document.langchain2doc(lc_document) for lc_document in documents]

Expand Down
6 changes: 5 additions & 1 deletion dbgpt/rag/knowledge/string.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""String Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

from dbgpt.core import Document
from dbgpt.rag.knowledge.base import ChunkStrategy, Knowledge, KnowledgeType
Expand All @@ -14,6 +14,7 @@ def __init__(
knowledge_type: KnowledgeType = KnowledgeType.TEXT,
encoding: Optional[str] = "utf-8",
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create String knowledge parameters.
Expand All @@ -28,10 +29,13 @@ def __init__(
self._type = knowledge_type
self._loader = loader
self._encoding = encoding
self._metadata = metadata

def _load(self) -> List[Document]:
    """Wrap the stored raw text in a single Document.

    The document metadata starts as ``{"source": "raw text"}``. When
    user-supplied metadata was given to the constructor, it is merged on
    top, so a caller-provided key replaces a default key of the same name.

    Returns:
        A one-element list holding the Document built from ``self._text``.
    """
    merged = dict(source="raw text")
    extra = self._metadata
    if extra:
        # Merge after the defaults so user values take precedence.
        merged.update(extra)
    return [Document(content=self._text, metadata=merged)]

Expand Down
6 changes: 5 additions & 1 deletion dbgpt/rag/knowledge/txt.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""TXT Knowledge."""
from typing import Any, List, Optional
from typing import Any, Dict, List, Optional, Union

import chardet

Expand All @@ -20,6 +20,7 @@ def __init__(
file_path: Optional[str] = None,
knowledge_type: KnowledgeType = KnowledgeType.DOCUMENT,
loader: Optional[Any] = None,
metadata: Optional[Dict[str, Union[str, List[str]]]] = None,
**kwargs: Any,
) -> None:
"""Create TXT Knowledge with Knowledge arguments.
Expand All @@ -32,6 +33,7 @@ def __init__(
self._path = file_path
self._type = knowledge_type
self._loader = loader
self._metadata = metadata

def _load(self) -> List[Document]:
"""Load txt document from loader."""
Expand All @@ -48,6 +50,8 @@ def _load(self) -> List[Document]:
else:
text = raw_text.decode(result["encoding"])
metadata = {"source": self._path}
if self._metadata:
metadata.update(self._metadata)
return [Document(content=text, metadata=metadata)]

return [Document.langchain2doc(lc_document) for lc_document in documents]
Expand Down
Loading

0 comments on commit cc39637

Please sign in to comment.