Skip to content

Commit

Permalink
chore: Add pylint for rag
Browse files Browse the repository at this point in the history
  • Loading branch information
fangyinc committed Jan 13, 2024
1 parent a4a0505 commit 80f6a51
Show file tree
Hide file tree
Showing 50 changed files with 234 additions and 178 deletions.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,12 @@ fmt: setup ## Format Python code
$(VENV_BIN)/isort dbgpt/core/
$(VENV_BIN)/isort dbgpt/datasource/
$(VENV_BIN)/isort dbgpt/model/
$(VENV_BIN)/isort dbgpt/rag/
# TODO: $(VENV_BIN)/isort dbgpt/serve
$(VENV_BIN)/isort dbgpt/serve/core/
$(VENV_BIN)/isort dbgpt/serve/agent/
$(VENV_BIN)/isort dbgpt/serve/conversation/
$(VENV_BIN)/isort dbgpt/serve/rag/
$(VENV_BIN)/isort dbgpt/serve/utils/_template_files
$(VENV_BIN)/isort dbgpt/storage/
$(VENV_BIN)/isort dbgpt/train/
Expand All @@ -68,6 +70,7 @@ fmt: setup ## Format Python code
$(VENV_BIN)/blackdoc dbgpt/core/
$(VENV_BIN)/blackdoc dbgpt/datasource/
$(VENV_BIN)/blackdoc dbgpt/model/
$(VENV_BIN)/blackdoc dbgpt/rag/
$(VENV_BIN)/blackdoc dbgpt/serve/
# TODO: $(VENV_BIN)/blackdoc dbgpt/storage/
$(VENV_BIN)/blackdoc dbgpt/train/
Expand Down
4 changes: 0 additions & 4 deletions dbgpt/app/scene/chat_knowledge/v1/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,6 @@ def __init__(self, chat_param: Dict):
vector_store_config=config,
)
query_rewrite = None
self.worker_manager = CFG.SYSTEM_APP.get_component(
ComponentType.WORKER_MANAGER_FACTORY, WorkerManagerFactory
).create()
self.llm_client = DefaultLLMClient(worker_manager=self.worker_manager)
if CFG.KNOWLEDGE_SEARCH_REWRITE:
query_rewrite = QueryRewrite(
llm_client=self.llm_client,
Expand Down
2 changes: 1 addition & 1 deletion dbgpt/rag/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import uuid
from typing import Any, Dict

from pydantic import Field, BaseModel
from pydantic import BaseModel, Field


class Document(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion dbgpt/rag/chunk_manager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Optional, List, Any
from typing import Any, List, Optional

from pydantic import BaseModel, Field

Expand Down
3 changes: 2 additions & 1 deletion dbgpt/rag/embedding/embedding_factory.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any, Type, TYPE_CHECKING
from typing import TYPE_CHECKING, Any, Type

from dbgpt.component import BaseComponent
from dbgpt.rag.embedding.embeddings import HuggingFaceEmbeddings
Expand Down
22 changes: 11 additions & 11 deletions dbgpt/rag/embedding/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Any, Dict, List, Optional

import requests
from pydantic import Field, Extra, BaseModel
from pydantic import BaseModel, Extra, Field

DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large"
Expand Down Expand Up @@ -54,12 +54,12 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
from .embeddings import HuggingFaceEmbeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}
hf = HuggingFaceEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
encode_kwargs=encode_kwargs,
)
"""

Expand Down Expand Up @@ -142,12 +142,12 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
from langchain.embeddings import HuggingFaceInstructEmbeddings
model_name = "hkunlp/instructor-large"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceInstructEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
encode_kwargs=encode_kwargs,
)
"""

Expand Down Expand Up @@ -221,12 +221,12 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):
from langchain.embeddings import HuggingFaceBgeEmbeddings
model_name = "BAAI/bge-large-en"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
model_name=model_name,
model_kwargs=model_kwargs,
encode_kwargs=encode_kwargs
encode_kwargs=encode_kwargs,
)
"""

Expand Down Expand Up @@ -336,7 +336,7 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
hf_embeddings = HuggingFaceInferenceAPIEmbeddings(
api_key="your_api_key",
model_name="sentence-transformers/all-MiniLM-l6-v2"
model_name="sentence-transformers/all-MiniLM-l6-v2",
)
texts = ["Hello, world!", "How are you?"]
hf_embeddings.embed_documents(texts)
Expand Down
2 changes: 1 addition & 1 deletion dbgpt/rag/extractor/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from abc import abstractmethod, ABC
from abc import ABC, abstractmethod
from typing import List

from dbgpt.core import LLMClient
Expand Down
2 changes: 1 addition & 1 deletion dbgpt/rag/extractor/summary.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import List, Optional

from dbgpt._private.llm_metadata import LLMMetadata
from dbgpt.core import LLMClient, ModelRequest, ModelMessageRoleType
from dbgpt.core import LLMClient, ModelMessageRoleType, ModelRequest
from dbgpt.rag.chunk import Chunk
from dbgpt.rag.extractor.base import Extractor
from dbgpt.util import utils
Expand Down
5 changes: 3 additions & 2 deletions dbgpt/rag/graph/graph_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import Any, Optional, Callable, Tuple, List
from typing import Any, Callable, List, Optional, Tuple

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
Expand Down Expand Up @@ -87,9 +87,10 @@ def _extract_triplets(self, text: str) -> List[Tuple[str, str, str]]:

def _llm_extract_triplets(self, text: str) -> List[Tuple[str, str, str]]:
"""Extract triplets from text by llm"""
import uuid

from dbgpt.app.scene import ChatScene
from dbgpt.util.chat_util import llm_chat_response_nostream
import uuid

chat_param = {
"chat_session_id": uuid.uuid1(),
Expand Down
1 change: 1 addition & 0 deletions dbgpt/rag/graph/graph_factory.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any, Type

Expand Down
7 changes: 4 additions & 3 deletions dbgpt/rag/graph/graph_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
import os
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from typing import List, Optional, Dict, Any, Set, Callable
from typing import Any, Callable, Dict, List, Optional, Set

from langchain.schema import Document

from dbgpt.rag.graph.node import BaseNode, TextNode, NodeWithScore
from dbgpt.rag.graph.node import BaseNode, NodeWithScore, TextNode
from dbgpt.rag.graph.search import BaseSearch, SearchMode

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -77,9 +77,10 @@ async def _extract_subject_entities(self, query_str: str) -> Set[str]:

async def _extract_entities_by_llm(self, text: str) -> Set[str]:
"""extract subject entities from text by llm"""
import uuid

from dbgpt.app.scene import ChatScene
from dbgpt.util.chat_util import llm_chat_response_nostream
import uuid

chat_param = {
"chat_session_id": uuid.uuid1(),
Expand Down
3 changes: 1 addition & 2 deletions dbgpt/rag/graph/index_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@

from dataclasses_json import DataClassJsonMixin


from dbgpt.rag.graph.index_type import IndexStructType
from dbgpt.rag.graph.node import TextNode, BaseNode
from dbgpt.rag.graph.node import BaseNode, TextNode

# TODO: legacy backport of old Node class
Node = TextNode
Expand Down
1 change: 1 addition & 0 deletions dbgpt/rag/graph/kv_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Optional

from llama_index.data_structs.data_structs import IndexStruct
from llama_index.storage.index_store.utils import (
index_struct_to_json,
Expand Down
2 changes: 1 addition & 1 deletion dbgpt/rag/graph/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
from typing import Any, Dict, List, Optional, Union

from langchain.schema import Document
from dbgpt._private.pydantic import BaseModel, Field, root_validator
from typing_extensions import Self

from dbgpt._private.pydantic import BaseModel, Field, root_validator

DEFAULT_TEXT_NODE_TMPL = "{metadata_str}\n\n{content}"
DEFAULT_METADATA_TMPL = "{key}: {value}"
Expand Down
10 changes: 5 additions & 5 deletions dbgpt/rag/knowledge/base.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from abc import abstractmethod, ABC
from abc import ABC, abstractmethod
from enum import Enum
from typing import Optional, Any, List
from typing import Any, List, Optional

from dbgpt.rag.chunk import Document
from dbgpt.rag.text_splitter.text_splitter import (
RecursiveCharacterTextSplitter,
MarkdownHeaderTextSplitter,
ParagraphTextSplitter,
CharacterTextSplitter,
MarkdownHeaderTextSplitter,
PageTextSplitter,
ParagraphTextSplitter,
RecursiveCharacterTextSplitter,
SeparatorTextSplitter,
)

Expand Down
7 changes: 4 additions & 3 deletions dbgpt/rag/knowledge/csv.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from typing import Optional, Any, List
import csv
from typing import Any, List, Optional

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
KnowledgeType,
Knowledge,
ChunkStrategy,
DocumentType,
Knowledge,
KnowledgeType,
)


Expand Down
9 changes: 5 additions & 4 deletions dbgpt/rag/knowledge/docx.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from typing import Optional, Any, List
from typing import Any, List, Optional

import docx

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
KnowledgeType,
Knowledge,
ChunkStrategy,
DocumentType,
Knowledge,
KnowledgeType,
)
import docx


class DocxKnowledge(Knowledge):
Expand Down
62 changes: 44 additions & 18 deletions dbgpt/rag/knowledge/factory.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Optional
from typing import List
from typing import List, Optional

from dbgpt.rag.knowledge.base import KnowledgeType, Knowledge
from dbgpt.rag.knowledge.base import Knowledge, KnowledgeType
from dbgpt.rag.knowledge.string import StringKnowledge
from dbgpt.rag.knowledge.url import URLKnowledge

Expand Down Expand Up @@ -32,11 +31,21 @@ def create(
Args:
datasource: path of the file to convert
knowledge_type: type of knowledge
Example:
Examples:
.. code-block:: python
>>> from dbgpt.rag.knowledge.factory import KnowledgeFactory
>>> url_knowlege = KnowledgeFactory.create(datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL)
>>> doc_knowlege = KnowledgeFactory.create(datasource="path/to/document.pdf", knowledge_type=KnowledgeType.DOCUMENT)
from dbgpt.rag.knowledge.factory import KnowledgeFactory
url_knowledge = KnowledgeFactory.create(
datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
)
doc_knowledge = KnowledgeFactory.create(
datasource="path/to/document.pdf",
knowledge_type=KnowledgeType.DOCUMENT,
)
"""
match knowledge_type:
case KnowledgeType.DOCUMENT:
Expand All @@ -57,13 +66,22 @@ def from_file_path(
knowledge_type: Optional[KnowledgeType] = KnowledgeType.DOCUMENT,
) -> Knowledge:
"""Create knowledge from path
Args:
file_path: path of the file to convert
knowledge_type: type of knowledge
Example:
Examples:
.. code-block:: python
>>> from dbgpt.rag.knowledge.factory import KnowledgeFactory
>>> doc_knowlege = KnowledgeFactory.create(datasource="path/to/document.pdf", knowledge_type=KnowledgeType.DOCUMENT)
from dbgpt.rag.knowledge.factory import KnowledgeFactory
doc_knowledge = KnowledgeFactory.create(
datasource="path/to/document.pdf",
knowledge_type=KnowledgeType.DOCUMENT,
)
"""
factory = cls(file_path=file_path, knowledge_type=knowledge_type)
return factory._select_document_knowledge(
Expand All @@ -76,13 +94,21 @@ def from_url(
knowledge_type: Optional[KnowledgeType] = KnowledgeType.URL,
) -> Knowledge:
"""Create knowledge from url
Args:
url: url of the file to convert
knowledge_type: type of knowledge
Example:
Examples:
.. code-block:: python
>>> from dbgpt.rag.knowledge.factory import KnowledgeFactory
>>> url_knowlege = KnowledgeFactory.create(datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL)
from dbgpt.rag.knowledge.factory import KnowledgeFactory
url_knowledge = KnowledgeFactory.create(
datasource="https://www.baidu.com", knowledge_type=KnowledgeType.URL
)
"""
return URLKnowledge(
url=url,
Expand Down Expand Up @@ -130,14 +156,14 @@ def subclasses(cls):
def _get_knowledge_subclasses() -> List[Knowledge]:
"""get all knowledge subclasses"""
from dbgpt.rag.knowledge.base import Knowledge
from dbgpt.rag.knowledge.pdf import PDFKnowledge
from dbgpt.rag.knowledge.csv import CSVKnowledge
from dbgpt.rag.knowledge.docx import DocxKnowledge
from dbgpt.rag.knowledge.html import HTMLKnowledge
from dbgpt.rag.knowledge.markdown import MarkdownKnowledge
from dbgpt.rag.knowledge.csv import CSVKnowledge
from dbgpt.rag.knowledge.txt import TXTKnowledge
from dbgpt.rag.knowledge.pdf import PDFKnowledge
from dbgpt.rag.knowledge.pptx import PPTXKnowledge
from dbgpt.rag.knowledge.html import HTMLKnowledge
from dbgpt.rag.knowledge.url import URLKnowledge
from dbgpt.rag.knowledge.string import StringKnowledge
from dbgpt.rag.knowledge.txt import TXTKnowledge
from dbgpt.rag.knowledge.url import URLKnowledge

return Knowledge.__subclasses__()
6 changes: 3 additions & 3 deletions dbgpt/rag/knowledge/html.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from typing import Optional, Any, List
from typing import Any, List, Optional

import chardet

from dbgpt.rag.chunk import Document
from dbgpt.rag.knowledge.base import (
Knowledge,
KnowledgeType,
ChunkStrategy,
DocumentType,
Knowledge,
KnowledgeType,
)


Expand Down
Loading

0 comments on commit 80f6a51

Please sign in to comment.