Skip to content

Commit

Permalink
Move MIME type to metadata instead of its own field/column
Browse files: browse the repository at this point in the history
  • Loading branch information
smokestacklightnin committed Feb 2, 2025
1 parent 435a268 commit ec92164
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 17 deletions.
13 changes: 5 additions & 8 deletions ragna/core/_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ def __init__(
name: str,
metadata: dict[str, Any],
handler: Optional[DocumentHandler] = None,
mime_type: str | None = None,
):
self.id = id or uuid.uuid4()
self.name = name
self.metadata = metadata
self.handler = handler or self.get_handler(name)
self.mime_type = (
mime_type or mimetypes.guess_type(name)[0] or "application/octet-stream"
)
if "mime_type" not in self.metadata:
self.metadata["mime_type"] = (
mimetypes.guess_type(name)[0] or "application/octet-stream"
)

@staticmethod
def supported_suffixes() -> set[str]:
Expand Down Expand Up @@ -81,11 +81,8 @@ def __init__(
name: str,
metadata: dict[str, Any],
handler: Optional[DocumentHandler] = None,
mime_type: str | None = None,
):
super().__init__(
id=id, name=name, metadata=metadata, handler=handler, mime_type=mime_type
)
super().__init__(id=id, name=name, metadata=metadata, handler=handler)
if "path" not in self.metadata:
metadata["path"] = str(ragna.local_root() / "documents" / str(self.id))

Expand Down
2 changes: 1 addition & 1 deletion ragna/deploy/_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ async def get_document_content(

return StreamingResponse(
io.BytesIO(core_document.read()),
media_type=core_document.mime_type,
media_type=core_document.metadata["mime_type"],
headers=headers,
)

Expand Down
2 changes: 0 additions & 2 deletions ragna/deploy/_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,6 @@ def document(
user_id=user_id,
name=document.name,
metadata_=document.metadata,
mime_type=document.mime_type,
)

def source(self, source: schemas.Source) -> orm.Source:
Expand Down Expand Up @@ -358,7 +357,6 @@ def document(self, document: orm.Document) -> schemas.Document:
id=document.id,
name=document.name,
metadata=document.metadata_,
mime_type=document.mime_type,
)

def source(self, source: orm.Source) -> schemas.Source:
Expand Down
2 changes: 0 additions & 2 deletions ragna/deploy/_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,6 @@ def document(self, document: schemas.Document) -> core.Document:
id=document.id,
name=document.name,
metadata=document.metadata,
mime_type=document.mime_type,
)

def source(self, source: schemas.Source) -> core.Source:
Expand Down Expand Up @@ -335,7 +334,6 @@ def document(self, document: core.Document) -> schemas.Document:
id=document.id,
name=document.name,
metadata=document.metadata,
mime_type=document.mime_type,
)

def source(self, source: core.Source) -> schemas.Source:
Expand Down
1 change: 0 additions & 1 deletion ragna/deploy/_orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ class Document(Base):
# Mind the trailing underscore here. Unfortunately, this is necessary, because
# metadata without the underscore is reserved by SQLAlchemy
metadata_ = Column(Json, nullable=False)
mime_type = Column(types.String, nullable=False)
chats = relationship(
"Chat",
secondary=document_chat_association_table,
Expand Down
1 change: 0 additions & 1 deletion ragna/deploy/_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ class Document(BaseModel):
id: uuid.UUID = Field(default_factory=uuid.uuid4)
name: str
metadata: dict[str, Any]
mime_type: str


class Source(BaseModel):
Expand Down
14 changes: 12 additions & 2 deletions tests/deploy/api/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,12 @@ def test_e2e(tmp_local_root, multiple_answer_chunks, stream_answer, corpus_name)
)
assert corpus_name in corpuses_metadata[source_storage]
metadata_keys = corpuses_metadata[source_storage][corpus_name].keys()
assert list(metadata_keys) == ["document_id", "document_name", "path"]
assert list(metadata_keys) == [
"document_id",
"document_name",
"mime_type",
"path",
]
for key in metadata_keys:
assert corpuses_metadata[source_storage][corpus_name][key][0] == "str"

Expand All @@ -126,7 +131,12 @@ def test_e2e(tmp_local_root, multiple_answer_chunks, stream_answer, corpus_name)
)
assert corpus_name in corpuses_metadata[source_storage]
metadata_keys = corpuses_metadata[source_storage][corpus_name].keys()
assert list(metadata_keys) == ["document_id", "document_name", "path"]
assert list(metadata_keys) == [
"document_id",
"document_name",
"mime_type",
"path",
]
for key in metadata_keys:
assert corpuses_metadata[source_storage][corpus_name][key][0] == "str"

Expand Down

0 comments on commit ec92164

Please sign in to comment.