Skip to content

Commit

Permalink
chore: using magika instead of guesslang
Browse files Browse the repository at this point in the history
  • Loading branch information
lh0x00 committed Mar 10, 2025
1 parent 58a687c commit 65b3f4a
Showing 1 changed file with 18 additions and 0 deletions.
18 changes: 18 additions & 0 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse
from magika import Magika

# Module-level Magika instance, constructed once at import time and used by
# the code-language callback defined in _CustomMarkdownify.__init__.
magika = Magika()

class _CustomMarkdownify(markdownify.MarkdownConverter):
"""
Expand All @@ -17,6 +19,22 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):

def __init__(self, **options: Any):
    """Set converter defaults, then defer to the markdownify base class.

    Two options are filled in only when the caller did not supply them:

    * ``heading_style`` falls back to ``markdownify.ATX``.
    * ``code_language_callback`` falls back to a Magika-based guesser
      (see the nested function below).

    All options, caller-supplied or defaulted, are forwarded unchanged to
    ``super().__init__``.
    """
    options.setdefault("heading_style", markdownify.ATX)

    def code_language_callback(el):
        """Guess a language label for the text contained in *el*.

        Only ``el.get_text()`` is used. Returns the label reported by
        Magika when identification succeeds and the detected content
        belongs to the "text" or "code" group; otherwise returns "".
        """
        snippet = el.get_text()
        if snippet:
            # Magika works on bytes; str.encode() defaults to UTF-8.
            detection = magika.identify_bytes(snippet.encode())
            # Check the status first so the prediction is only read on
            # a successful identification.
            if detection.status == "ok":
                guess = detection.prediction.output
                if guess.group in ("text", "code"):
                    return guess.label
        # Empty snippet, failed identification, or a non-text/code group.
        return ""

    # A caller-provided callback always wins over the Magika default.
    options.setdefault("code_language_callback", code_language_callback)

    # Hand the (possibly augmented) options to MarkdownConverter.
    super().__init__(**options)

Expand Down

0 comments on commit 65b3f4a

Please sign in to comment.