Skip to content

Commit

Permalink
Switch from puremagic to magika.
Browse files Browse the repository at this point in the history
  • Loading branch information
afourney committed Mar 8, 2025
1 parent 99d8e56 commit 58a687c
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 56 deletions.
3 changes: 1 addition & 2 deletions packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ dependencies = [
"beautifulsoup4",
"requests",
"markdownify",
"puremagic",
"pathvalidate",
"magika>=0.6.0rc1",
"charset-normalizer",
]

Expand Down
3 changes: 0 additions & 3 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@
from pathlib import Path
from urllib.parse import urlparse
from warnings import warn

# File-format detection
import puremagic
import requests

from ._stream_info import StreamInfo, _guess_stream_info_from_stream
Expand Down
72 changes: 21 additions & 51 deletions packages/markitdown/src/markitdown/_stream_info.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
import puremagic
import mimetypes
import os
from dataclasses import dataclass, asdict
from typing import Optional, BinaryIO, List, TypeVar, Type
from magika import Magika

# Mimetype substitutions table
MIMETYPE_SUBSTITUTIONS = {
"application/excel": "application/vnd.ms-excel",
"application/mspowerpoint": "application/vnd.ms-powerpoint",
}
magika = Magika()


@dataclass(kw_only=True, frozen=True)
Expand Down Expand Up @@ -59,6 +55,25 @@ def _guess_stream_info_from_stream(
"""
guesses: List[StreamInfo] = []

# Call magika to guess from the stream
cur_pos = file_stream.tell()
try:
result = magika.identify_bytes(file_stream.read())
if result.status == "ok" and result.prediction.output.label != "unknown":
extension = None
if len(result.prediction.output.extensions) > 0:
extension = result.prediction.output.extensions[0]
if extension and not extension.startswith("."):
extension = "." + extension
guesses.append(
StreamInfo(
mimetype=result.prediction.output.mime_type,
extension=extension,
)
)
finally:
file_stream.seek(cur_pos)

# Add a guess purely based on the filename hint
if filename_hint:
try:
Expand All @@ -74,49 +89,4 @@ def _guess_stream_info_from_stream(
)
)

def _puremagic(
file_stream, filename_hint
) -> List[puremagic.main.PureMagicWithConfidence]:
"""Wrap guesses to handle exceptions."""
try:
return puremagic.magic_stream(file_stream, filename=filename_hint)
except puremagic.main.PureError as e:
return []

cur_pos = file_stream.tell()
type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
if len(type_guesses) == 0:
# Fix for: https://github.com/microsoft/markitdown/issues/222
# If there are no guesses, then try again after trimming leading ASCII whitespace.
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).

# Eat all the leading whitespace
file_stream.seek(cur_pos)
while True:
char = file_stream.read(1)
if not char: # End of file
break
if not char.isspace():
file_stream.seek(file_stream.tell() - 1)
break

# Try again
type_guesses = _puremagic(file_stream, filename_hint=filename_hint)
file_stream.seek(cur_pos)

# Convert and return the guesses
for guess in type_guesses:
kwargs: dict[str, str] = {}
if guess.extension:
kwargs["extension"] = guess.extension
if guess.mime_type:
kwargs["mimetype"] = MIMETYPE_SUBSTITUTIONS.get(
guess.mime_type, guess.mime_type
)
if len(kwargs) > 0:
# We don't add the filename_hint, because sometimes it's just a placeholder,
# and, in any case, doesn't add new information.
guesses.append(StreamInfo(**kwargs))

return guesses

0 comments on commit 58a687c

Please sign in to comment.