-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJSONLoader.py
123 lines (108 loc) · 4.85 KB
/
JSONLoader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s')
class JSONLoader(BaseLoader):
def __init__(
self,
file_path: Union[str, Path],
content_key: Optional[str] = None,
metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
text_content: bool = True,
json_lines: bool = False,
):
"""
Initializes the JSONLoader with a file path, an optional content key to extract specific content,
and an optional metadata function to extract metadata from each record.
"""
try:
self.file_path = Path(file_path).resolve()
self._content_key = content_key
self._metadata_func = metadata_func
self._text_content = text_content
self._json_lines = json_lines
except Exception as e:
logging.error("Error occurred in JSONLoader init for file '{}': {}".format(self.file_path, e))
return None
def load(self) -> List[Document]:
"""Load and return documents from the JSON file."""
docs: List[Document] = []
if self._json_lines:
with self.file_path.open(encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
self._parse(line, docs)
else:
self._parse(self.file_path.read_text(encoding="utf-8"), docs)
return docs
def _parse(self, content: str, docs: List[Document]) -> None:
"""Convert given content to documents."""
data = json.loads(content)
# Perform some validation
# This is not a perfect validation, but it should catch most cases
# and prevent the user from getting a cryptic error later on.
if self._content_key is not None:
self._validate_content_key(data)
if self._metadata_func is not None:
self._validate_metadata_func(data)
for i, sample in enumerate(data, len(docs) + 1):
text = self._get_text(key=sample,data=data)
metadata = self._get_metadata(sample=sample, source=str(self.file_path), seq_num=i)
docs.append(Document(page_content=text, metadata=metadata))
def _get_text(self, key: Any,data) -> str:
"""Convert sample to string format"""
if self._content_key is not None:
content = key.get(self._content_key)
else:
content = data[key]
if self._text_content and not isinstance(content, str):
raise ValueError(
f"Expected page_content is string, got {type(content)} instead. \
Set `text_content=False` if the desired input for \
`page_content` is not a string"
)
# In case the text is None, set it to an empty string
elif isinstance(content, str):
return content
elif isinstance(content, dict):
return json.dumps(content) if content else ""
else:
return str(content) if content is not None else ""
def _get_metadata(self, sample: Dict[str, Any], **additional_fields: Any) -> Dict[str, Any]:
"""
Return a metadata dictionary base on the existence of metadata_func
:param sample: single data payload
:param additional_fields: key-word arguments to be added as metadata values
:return:
"""
if self._metadata_func is not None:
return self._metadata_func(sample, additional_fields)
else:
return additional_fields
def _validate_content_key(self, data: Any) -> None:
"""Check if a content key is valid"""
sample = data.first()
if not isinstance(sample, dict):
raise ValueError(
f"Expected the jq schema to result in a list of objects (dict), \
so sample must be a dict but got `{type(sample)}`"
)
if sample.get(self._content_key) is None:
raise ValueError(
f"Expected the jq schema to result in a list of objects (dict) \
with the key `{self._content_key}`"
)
def _validate_metadata_func(self, data: Any) -> None:
"""Check if the metadata_func output is valid"""
sample = data.first()
if self._metadata_func is not None:
sample_metadata = self._metadata_func(sample, {})
if not isinstance(sample_metadata, dict):
raise ValueError(
f"Expected the metadata_func to return a dict but got \
`{type(sample_metadata)}`"
)