def get_content_headers(self, filename: str, disposition: str = "attachment") -> dict[str, str]:
    """
    Build the Content-Disposition header describing a file.

    The ``filename`` parameter is always emitted as a quoted string. When the
    name contains non-ASCII characters, ``filename`` carries an ASCII-only
    fallback and an additional ``filename*`` parameter carries the
    percent-encoded UTF-8 form of the real name.

    Args:
        filename: The filename to encode
        disposition: The disposition type placed at the start of the header
            value, defaults to attachment

    Returns:
        A single entry dictionary holding the Content-Disposition header

    Raises:
        UnicodeEncodeError: If the filename cannot be encoded as UTF-8
            (for example, it contains lone surrogates)
    """
    is_ascii = filename.isascii()

    if is_ascii:
        fallback = filename
    else:
        # Swap only the characters which cannot be represented for "_", so a
        # literal "?" which is part of the real name is left untouched
        fallback = "".join(char if char.isascii() else "_" for char in filename)

    # Inside a quoted string both the backslash and the double quote need
    # escaping. Backslashes go first, otherwise the escapes added for the
    # quotes would themselves be doubled
    fallback = fallback.replace("\\", "\\\\").replace('"', '\\"')

    header_value = f'{disposition}; filename="{fallback}"'

    if not is_ascii:
        # safe="" so "/" is percent-encoded too; quote() leaves it as-is by default
        encoded = quote(filename, safe="", encoding="utf-8")
        header_value += f"; filename*=UTF-8''{encoded}"

    return {"Content-Disposition": header_value}