diff --git a/docs/user/forms.md b/docs/user/forms.md index e622b4138..c2517fefe 100644 --- a/docs/user/forms.md +++ b/docs/user/forms.md @@ -30,6 +30,7 @@ writer.update_page_form_field_values( writer.pages[0], {"fieldname": "some filled in text"}, auto_regenerate=False, + flatten=False, ) with open("filled-out.pdf", "wb") as output_stream: @@ -41,6 +42,12 @@ parameter is `True` by default for legacy compatibility, but this flags the PDF processor to recompute the field's rendering, and may trigger a "save changes" dialog for users who open the generated PDF. +If you want to flatten your form, that is, keeping all form field contents while +removing the form fields themselves, you can set `flatten=True` to convert form +field contents to regular pdf content, and then use +`writer.remove_annotations(subtypes="/Widget")` to remove all form fields. This +will result in a flattened pdf. + ## Some notes about form fields and annotations PDF forms have a dual-nature approach to the fields: diff --git a/pypdf/_font.py b/pypdf/_font.py index 06f78ea77..5f8d98d2f 100644 --- a/pypdf/_font.py +++ b/pypdf/_font.py @@ -1,5 +1,4 @@ from dataclasses import dataclass, field -from typing import Optional from pypdf.generic import DictionaryObject @@ -29,10 +28,16 @@ class FontDescriptor: character_widths: dict[str, int] = field(default_factory=dict) @classmethod - def from_font_resource(cls, pdf_font_dict: DictionaryObject) -> "Optional[FontDescriptor]": + def from_font_resource(cls, pdf_font_dict: DictionaryObject) -> "FontDescriptor": from pypdf._codecs.core_fontmetrics import CORE_FONT_METRICS # noqa: PLC0415 # Prioritize information from the PDF font dictionary - font_name = pdf_font_dict.get("/BaseFont", "Unknown") - if font_name[1:] in CORE_FONT_METRICS: - return CORE_FONT_METRICS.get(font_name[1:]) + font_name = pdf_font_dict.get("/BaseFont", "Unknown").removeprefix("/") + if font_name in CORE_FONT_METRICS: + return CORE_FONT_METRICS[font_name] return cls(name=font_name) + + 
def text_width(self, text: str) -> float: + """Sum of character widths specified in PDF font for the supplied text.""" + return sum( + [self.character_widths.get(char, self.character_widths.get("default", 0)) for char in text], 0.0 + ) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index c4ebea929..92b0db325 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -48,7 +48,6 @@ cast, ) -from ._cmap import _default_fonts_space_width, build_char_map_from_dict from ._doc_common import DocumentInformation, PdfDocCommon from ._encryption import EncryptAlgorithm, Encryption from ._page import PageObject, Transformation @@ -85,7 +84,6 @@ BooleanObject, ByteStringObject, ContentStream, - DecodedStreamObject, Destination, DictionaryObject, EmbeddedFile, @@ -107,6 +105,7 @@ hex_to_rgb, is_null_or_none, ) +from .generic._appearance_stream import TextStreamAppearance from .pagerange import PageRange, PageRangeSpec from .types import ( AnnotationSubtype, @@ -119,7 +118,6 @@ from .xmp import XmpInformation ALL_DOCUMENT_PERMISSIONS = UserAccessPermissions.all() -DEFAULT_FONT_HEIGHT_IN_MULTILINE = 12 class ObjectDeletionFlag(enum.IntFlag): @@ -874,7 +872,6 @@ def _add_apstream_object( object_name: str, x_offset: float, y_offset: float, - font_res: Optional[DictionaryObject] = None ) -> None: """ Adds an appearance stream to the page content in the form of @@ -886,17 +883,25 @@ def _add_apstream_object( object_name: The name of the appearance stream. x_offset: The horizontal offset for the appearance stream. y_offset: The vertical offset for the appearance stream. - font_res: The appearance stream's font resource (if given). """ - # Prepare XObject resource dictionary on the page + # Prepare XObject resource dictionary on the page. This currently + # only deals with font resources, but can easily be adapted to also + # include other resources. 
pg_res = cast(DictionaryObject, page[PG.RESOURCES]) - if font_res is not None: - font_name = font_res["/BaseFont"] # [/"Name"] often also exists, but is deprecated + if "/Resources" in appearance_stream_obj: + ap_stream_res = cast(DictionaryObject, appearance_stream_obj["/Resources"]) + # No need to check "if "/Font" in ap_stream_res", because the only reason this + # code runs would be if we are flattening form fields, and the associated code + # either adds a Font resource or no resource at all. This probably needs to + # change if we want to use this method to flatten markup annotations. + ap_stream_font_dict = cast(DictionaryObject, ap_stream_res["/Font"]) if "/Font" not in pg_res: pg_res[NameObject("/Font")] = DictionaryObject() - pg_ft_res = cast(DictionaryObject, pg_res[NameObject("/Font")]) - if font_name not in pg_ft_res: - pg_ft_res[NameObject(font_name)] = font_res + pg_font_res = cast(DictionaryObject, pg_res["/Font"]) + # Merge fonts from the appearance stream into the page's font resources + for font_name, font_ref in ap_stream_font_dict.items(): + if font_name not in pg_font_res: + pg_font_res[font_name] = font_ref # Always add the resolved stream object to the writer to get a new IndirectObject. # This ensures we have a valid IndirectObject managed by *this* writer. 
xobject_ref = self._add_object(appearance_stream_obj) @@ -915,160 +920,6 @@ def _add_apstream_object( xobject_drawing_commands = f"q\n{xobject_cm._to_cm()}\n{xobject_name} Do\nQ".encode() self._merge_content_stream_to_page(page, xobject_drawing_commands) - def _update_field_annotation( - self, - page: PageObject, - field: DictionaryObject, - annotation: DictionaryObject, - font_name: str = "", - font_size: float = -1, - flatten: bool = False, - ) -> None: - # Calculate rectangle dimensions - _rct = cast(RectangleObject, annotation[AA.Rect]) - rct = RectangleObject((0, 0, abs(_rct[2] - _rct[0]), abs(_rct[3] - _rct[1]))) - - # Extract font information - da = annotation.get_inherited( - AA.DA, - cast(DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM]).get( - AA.DA, None - ), - ) - if da is None: - da = TextStringObject("/Helv 0 Tf 0 g") - else: - da = da.get_object() - font_properties = da.replace("\n", " ").replace("\r", " ").split(" ") - font_properties = [x for x in font_properties if x != ""] - if font_name: - font_properties[font_properties.index("Tf") - 2] = font_name - else: - font_name = font_properties[font_properties.index("Tf") - 2] - font_height = ( - font_size - if font_size >= 0 - else float(font_properties[font_properties.index("Tf") - 1]) - ) - if font_height == 0: - if field.get(FA.Ff, 0) & FA.FfBits.Multiline: - font_height = DEFAULT_FONT_HEIGHT_IN_MULTILINE - else: - font_height = rct.height - 2 - font_properties[font_properties.index("Tf") - 1] = str(font_height) - da = " ".join(font_properties) - y_offset = rct.height - 1 - font_height - - # Retrieve font information from local DR ... 
- dr: Any = cast( - DictionaryObject, - cast( - DictionaryObject, - annotation.get_inherited( - "/DR", - cast( - DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM] - ).get("/DR", DictionaryObject()), - ), - ).get_object(), - ) - dr = dr.get("/Font", DictionaryObject()).get_object() - # _default_fonts_space_width keys is the list of Standard fonts - if font_name not in dr and font_name not in _default_fonts_space_width: - # ...or AcroForm dictionary - dr = cast( - dict[Any, Any], - cast( - DictionaryObject, self.root_object[CatalogDictionary.ACRO_FORM] - ).get("/DR", {}), - ) - dr = dr.get_object().get("/Font", DictionaryObject()).get_object() - font_res = dr.get(font_name, None) - if not is_null_or_none(font_res): - font_res = cast(DictionaryObject, font_res.get_object()) - _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( - 200, font_res - ) - try: # remove width stored in -1 key - del font_map[-1] - except KeyError: - pass - font_full_rev: dict[str, bytes] - if isinstance(font_encoding, str): - font_full_rev = { - v: k.encode(font_encoding) for k, v in font_map.items() - } - else: - font_full_rev = {v: bytes((k,)) for k, v in font_encoding.items()} - font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} - for key, value in font_map.items(): - font_full_rev[value] = font_encoding_rev.get(key, key) - else: - logger_warning(f"Font dictionary for {font_name} not found.", __name__) - font_full_rev = {} - - # Retrieve field text and selected values - field_flags = field.get(FA.Ff, 0) - if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: - txt = "\n".join(annotation.get_inherited(FA.Opt, [])) - sel = field.get("/V", []) - if not isinstance(sel, list): - sel = [sel] - else: # /Tx - txt = field.get("/V", "") - sel = [] - # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) - txt = txt.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") - # Generate appearance stream - 
ap_stream = generate_appearance_stream( - txt, sel, da, font_full_rev, rct, font_height, y_offset - ) - - # Create appearance dictionary - dct = DecodedStreamObject.initialize_from_dictionary( - { - NameObject("/Type"): NameObject("/XObject"), - NameObject("/Subtype"): NameObject("/Form"), - NameObject("/BBox"): rct, - "__streamdata__": ByteStringObject(ap_stream), - "/Length": 0, - } - ) - if AA.AP in annotation: - for k, v in cast(DictionaryObject, annotation[AA.AP]).get("/N", {}).items(): - if k not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: - dct[k] = v - - # Update Resources with font information if necessary - if font_res is not None: - dct[NameObject("/Resources")] = DictionaryObject( - { - NameObject("/Font"): DictionaryObject( - { - NameObject(font_name): getattr( - font_res, "indirect_reference", font_res - ) - } - ) - } - ) - if AA.AP not in annotation: - annotation[NameObject(AA.AP)] = DictionaryObject( - {NameObject("/N"): self._add_object(dct)} - ) - elif "/N" not in cast(DictionaryObject, annotation[AA.AP]): - cast(DictionaryObject, annotation[NameObject(AA.AP)])[ - NameObject("/N") - ] = self._add_object(dct) - else: # [/AP][/N] exists - n = annotation[AA.AP]["/N"].indirect_reference.idnum # type: ignore - self._objects[n - 1] = dct - dct.indirect_reference = IndirectObject(n, 0, self) - - if flatten: - field_name = self._get_qualified_field_name(annotation) - self._add_apstream_object(page, dct, field_name, _rct[0], _rct[1], font_res) - FFBITS_NUL = FA.FfBits(0) def update_page_form_field_values( @@ -1111,8 +962,8 @@ def update_page_form_field_values( """ if CatalogDictionary.ACRO_FORM not in self._root_object: raise PyPdfError("No /AcroForm dictionary in PDF of PdfWriter Object") - af = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]) - if InteractiveFormDictEntries.Fields not in af: + acro_form = cast(DictionaryObject, self._root_object[CatalogDictionary.ACRO_FORM]) + if InteractiveFormDictEntries.Fields not in 
acro_form: raise PyPdfError("No /Fields dictionary in PDF of PdfWriter Object") if isinstance(auto_regenerate, bool): self.set_need_appearances_writer(auto_regenerate) @@ -1139,6 +990,7 @@ def update_page_form_field_values( ).get_object() for field, value in fields.items(): + rectangle = cast(RectangleObject, annotation[AA.Rect]) if not ( self._get_qualified_field_name(parent_annotation) == field or parent_annotation.get("/T", None) == field @@ -1151,6 +1003,7 @@ def update_page_form_field_values( del parent_annotation["/I"] if flags: annotation[NameObject(FA.Ff)] = NumberObject(flags) + # Set the field value if not (value is None and flatten): # Only change values if given by user and not flattening. if isinstance(value, list): lst = ArrayObject(TextStringObject(v) for v in value) @@ -1161,37 +1014,52 @@ def update_page_form_field_values( ) else: parent_annotation[NameObject(FA.V)] = TextStringObject(value) + # Get or create the field's appearance stream object if parent_annotation.get(FA.FT) == "/Btn": - # Checkbox button (no /FT found in Radio widgets) + # Checkbox button (no /FT found in Radio widgets); + # We can find the associated appearance stream object + # within the annotation. v = NameObject(value) ap = cast(DictionaryObject, annotation[NameObject(AA.AP)]) normal_ap = cast(DictionaryObject, ap["/N"]) if v not in normal_ap: v = NameObject("/Off") appearance_stream_obj = normal_ap.get(v) - # other cases will be updated through the for loop + # Other cases will be updated through the for loop annotation[NameObject(AA.AS)] = v annotation[NameObject(FA.V)] = v - if flatten and appearance_stream_obj is not None: - # We basically copy the entire appearance stream, which should be an XObject that - # is already registered. No need to add font resources. 
- rct = cast(RectangleObject, annotation[AA.Rect]) - self._add_apstream_object(page, appearance_stream_obj, field, rct[0], rct[1]) elif ( parent_annotation.get(FA.FT) == "/Tx" or parent_annotation.get(FA.FT) == "/Ch" ): - # textbox + # Textbox; we need to generate the appearance stream object if isinstance(value, tuple): - self._update_field_annotation( - page, parent_annotation, annotation, value[1], value[2], flatten=flatten + appearance_stream_obj = TextStreamAppearance.from_text_annotation( + acro_form, parent_annotation, annotation, value[1], value[2] ) else: - self._update_field_annotation(page, parent_annotation, annotation, flatten=flatten) + appearance_stream_obj = TextStreamAppearance.from_text_annotation( + acro_form, parent_annotation, annotation + ) + # Add the appearance stream object + if AA.AP not in annotation: + annotation[NameObject(AA.AP)] = DictionaryObject( + {NameObject("/N"): self._add_object(appearance_stream_obj)} + ) + elif "/N" not in (ap:= cast(DictionaryObject, annotation[AA.AP])): + cast(DictionaryObject, annotation[NameObject(AA.AP)])[ + NameObject("/N") + ] = self._add_object(appearance_stream_obj) + else: # [/AP][/N] exists + n = annotation[AA.AP]["/N"].indirect_reference.idnum # type: ignore + self._objects[n - 1] = appearance_stream_obj + appearance_stream_obj.indirect_reference = IndirectObject(n, 0, self) elif ( annotation.get(FA.FT) == "/Sig" ): # deprecated # not implemented yet logger_warning("Signature forms not implemented yet", __name__) + if flatten and appearance_stream_obj is not None: + self._add_apstream_object(page, appearance_stream_obj, field, rectangle[0], rectangle[1]) def reattach_fields( self, page: Optional[PageObject] = None @@ -3432,36 +3300,3 @@ def _create_outline_item( format_flag += OutlineFontFlag.bold outline_item.update({NameObject("/F"): NumberObject(format_flag)}) return outline_item - - -def generate_appearance_stream( - txt: str, - sel: list[str], - da: str, - font_full_rev: dict[str, bytes], - 
rct: RectangleObject, - font_height: float, - y_offset: float, -) -> bytes: - ap_stream = f"q\n/Tx BMC \nq\n1 1 {rct.width - 1} {rct.height - 1} re\nW\nBT\n{da}\n".encode() - for line_number, line in enumerate(txt.replace("\n", "\r").split("\r")): - if line in sel: - # may be improved but cannot find how to get fill working => replaced with lined box - ap_stream += ( - f"1 {y_offset - (line_number * font_height * 1.4) - 1} {rct.width - 2} {font_height + 2} re\n" - f"0.5 0.5 0.5 rg s\n{da}\n" - ).encode() - if line_number == 0: - ap_stream += f"2 {y_offset} Td\n".encode() - else: - # Td is a relative translation - ap_stream += f"0 {- font_height * 1.4} Td\n".encode() - enc_line: list[bytes] = [ - font_full_rev.get(c, c.encode("utf-16-be")) for c in line - ] - if any(len(c) >= 2 for c in enc_line): - ap_stream += b"<" + (b"".join(enc_line)).hex().encode() + b"> Tj\n" - else: - ap_stream += b"(" + b"".join(enc_line) + b") Tj\n" - ap_stream += b"ET\nQ\nEMC\nQ\n" - return ap_stream diff --git a/pypdf/generic/_appearance_stream.py b/pypdf/generic/_appearance_stream.py new file mode 100644 index 000000000..de71eff87 --- /dev/null +++ b/pypdf/generic/_appearance_stream.py @@ -0,0 +1,473 @@ +import re +from typing import Any, Optional, Union, cast + +from .._cmap import build_char_map_from_dict +from .._codecs.core_fontmetrics import CORE_FONT_METRICS +from .._font import FontDescriptor +from .._utils import logger_warning +from ..constants import AnnotationDictionaryAttributes, FieldDictionaryAttributes +from ..generic import ( + DecodedStreamObject, + DictionaryObject, + NameObject, + NumberObject, + RectangleObject, +) +from ..generic._base import ByteStringObject, TextStringObject, is_null_or_none + +DEFAULT_FONT_SIZE_IN_MULTILINE = 12 + + +class TextStreamAppearance(DecodedStreamObject): + """ + A class representing the appearance stream for a text-based form field. 
+ + This class generates the content stream (the `ap_stream_data`) that dictates + how text is rendered within a form field's bounding box. It handles properties + like font, font size, color, multiline text, and text selection highlighting. + """ + def _scale_text( + self, + font_descriptor: FontDescriptor, + font_size: float, + field_width: float, + field_height: float, + txt: str, + multiline: bool, + min_font_size: float = 4.0, # Minimum font size to attempt + font_size_step: float = 0.2 # How much to decrease font size by each step + ) -> tuple[list[tuple[float, str]], float]: + """ + Takes a piece of text and scales it to field_width or field_height, given font_descriptor + and font_size. For multiline fields, wraps the text into separate lines. + + Args: + font_descriptor: A FontDescriptor for the font to be used. + font_size: The font size in points. + field_width: The width of the field in which to fit the text. + field_height: The height of the field in which to fit the text. + txt: The text to fit within the field. + multiline: Whether to scale and wrap the text, or only to scale. + min_font_size: The minimum font size at which to scale the text. + font_size_step: The amount by which to decrement font size per step while scaling. + + Returns: + The text in the form of a list of tuples, each tuple containing the width of a line + and its contents, and the font_size for these lines and widths. + """ + # Single line: + if not multiline: + test_width = font_descriptor.text_width(txt) * font_size / 1000 + if test_width > field_width or font_size > field_height: + new_font_size = font_size - font_size_step + if new_font_size >= min_font_size: + # Text overflows width or height; retry with smaller font size. + return self._scale_text( + font_descriptor, + round(new_font_size, 1), + field_width, + field_height, + txt, + multiline, + min_font_size, + font_size_step + ) + # Font size lower than set minimum font size, give up.
+ return [(test_width, txt)], font_size + return [(test_width, txt)], font_size + # Multiline: + orig_txt = txt + paragraphs = re.sub(r"\n", "\r", txt).split("\r") + wrapped_lines = [] + current_line_words: list[str] = [] + current_line_width: float = 0 + space_width = font_descriptor.text_width(" ") * font_size / 1000 + for paragraph in paragraphs: + if not paragraph.strip(): + wrapped_lines.append((0.0, "")) + continue + words = paragraph.split(" ") + for i, word in enumerate(words): + word_width = font_descriptor.text_width(word) * font_size / 1000 + test_width = current_line_width + word_width + (space_width if i else 0) + if test_width > field_width and current_line_words: + wrapped_lines.append((current_line_width, " ".join(current_line_words))) + current_line_words = [word] + current_line_width = word_width + elif not current_line_words and word_width > field_width: + wrapped_lines.append((word_width, word)) + current_line_words = [] + current_line_width = 0 + else: + if current_line_words: + current_line_width += space_width + current_line_words.append(word) + current_line_width += word_width + if current_line_words: + wrapped_lines.append((current_line_width, " ".join(current_line_words))) + current_line_words = [] + current_line_width = 0 + # Estimate total height. + # Assumed line spacing of 1.4 + estimated_total_height = font_size + (len(wrapped_lines) - 1) * 1.4 * font_size + if estimated_total_height > field_height: + new_font_size = font_size - font_size_step + if new_font_size >= min_font_size: + # Text overflows height; Retry with smaller font size. + return self._scale_text( + font_descriptor, + round(new_font_size, 1), + field_width, + field_height, + orig_txt, + multiline, + min_font_size, + font_size_step + ) + # Font size lower than set minimum font size, give up. 
+ return (wrapped_lines, font_size) + return (wrapped_lines, font_size) + + def _generate_appearance_stream_data( + self, + text: str = "", + selection: Optional[list[str]] = None, + rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), + font_descriptor: FontDescriptor = CORE_FONT_METRICS["Helvetica"], + font_glyph_byte_map: Optional[dict[str, bytes]] = None, + font_name: str = "/Helv", + font_size: float = 0.0, + font_color: str = "0 g", + multiline: bool = False, + alignment: int = 0 + ) -> bytes: + """ + Generates the raw bytes of the PDF appearance stream for a text field. + + This private method assembles the PDF content stream operators to draw + the provided text within the specified rectangle. It handles text positioning, + font application, color, and special formatting like selected text. + + Args: + text: The text to be rendered in the form field. + selection: An optional list of strings that should be highlighted as selected. + font_glyph_byte_map: An optional dictionary mapping characters to their + byte representation for glyph encoding. + rectangle: The bounding box of the form field. Can be a `RectangleObject` + or a tuple of four floats (x1, y1, x2, y2). + font_name: The name of the font resource to use (e.g., "/Helv"). + font_size: The font size. If 0, it is automatically calculated + based on whether the field is multiline or not. + font_color: The color to apply to the font, represented as a PDF + graphics state string (e.g., "0 g" for black). + multiline: A boolean indicating if the text field is multiline. + alignment: Left-aligned (0), centered (1) or right-aligned (2) text. + + Returns: + A byte string containing the PDF content stream data.
+ + """ + font_glyph_byte_map = font_glyph_byte_map or {} + if isinstance(rectangle, tuple): + rectangle = RectangleObject(rectangle) + + # If font_size is 0, apply the logic for multiline or large-as-possible font + if font_size == 0: + if selection: # Don't wrap text when dealing with a /Ch field, in order to prevent problems + multiline = False # with matching "selection" with "line" later on. + if multiline: + font_size = DEFAULT_FONT_SIZE_IN_MULTILINE + else: + font_size = rectangle.height - 2 + lines, font_size = self._scale_text( + font_descriptor, + font_size, + rectangle.width - 3, # One point margin left and right, and an additional point because the first + # offset takes one extra point (see below, "desired_abs_x_start") + rectangle.height - 3, # One point margin for top and bottom, one point extra for the first line + # (see y_offset) + text, + multiline, + ) + else: + lines = [( + font_descriptor.text_width(line) * font_size / 1000, + line + ) for line in text.replace("\n", "\r").split("\r")] + + # Set the vertical offset + y_offset = rectangle.height - 1 - font_size + default_appearance = f"{font_name} {font_size} Tf {font_color}" + + ap_stream = ( + f"q\n/Tx BMC \nq\n1 1 {rectangle.width - 1} {rectangle.height - 1} " + f"re\nW\nBT\n{default_appearance}\n".encode() + ) + current_x_pos: float = 0 # Initial virtual position within the text object. 
+ for line_number, (line_width, line) in enumerate(lines): + if selection and line in selection: + # Might be improved, but cannot find how to get fill working => replaced with lined box + ap_stream += ( + f"1 {y_offset - (line_number * font_size * 1.4) - 1} {rectangle.width - 2} {font_size + 2} re\n" + f"0.5 0.5 0.5 rg s\n{default_appearance}\n" + ).encode() + + # Calculate the desired absolute starting X for the current line + desired_abs_x_start: float = 0 + if alignment == 2: # Right aligned + desired_abs_x_start = rectangle.width - 2 - line_width + elif alignment == 1: # Centered + desired_abs_x_start = (rectangle.width - line_width) / 2 + else: # Left aligned; default + desired_abs_x_start = 2 + # Calculate x_rel_offset: how much to move from the current_x_pos + # to reach the desired_abs_x_start. + x_rel_offset = desired_abs_x_start - current_x_pos + + # Y-offset: + y_rel_offset: float = 0 + if line_number == 0: + y_rel_offset = y_offset # Initial vertical position + else: + y_rel_offset = - font_size * 1.4 # Move down by line height + + # Td is a relative translation (Tx and Ty). + # It updates the current text position. + ap_stream += f"{x_rel_offset} {y_rel_offset} Td\n".encode() + # Update current_x_pos based on the Td operation for the next iteration. + # This is the X position where the *current line* will start. 
+ current_x_pos = desired_abs_x_start + + encoded_line: list[bytes] = [ + font_glyph_byte_map.get(c, c.encode("utf-16-be")) for c in line + ] + if any(len(c) >= 2 for c in encoded_line): + ap_stream += b"<" + (b"".join(encoded_line)).hex().encode() + b"> Tj\n" + else: + ap_stream += b"(" + b"".join(encoded_line) + b") Tj\n" + ap_stream += b"ET\nQ\nEMC\nQ\n" + return ap_stream + + def __init__( + self, + text: str = "", + selection: Optional[list[str]] = None, + rectangle: Union[RectangleObject, tuple[float, float, float, float]] = (0.0, 0.0, 0.0, 0.0), + font_resource: Optional[DictionaryObject] = None, + font_name: str = "/Helv", + font_size: float = 0.0, + font_color: str = "0 g", + multiline: bool = False, + alignment: int = 0 + ) -> None: + """ + Initializes a TextStreamAppearance object. + + This constructor creates a new PDF stream object configured as an XObject + of subtype Form. It uses the `_generate_appearance_stream_data` method to generate + the content for the stream. + + Args: + text: The text to be rendered in the form field. + selection: An optional list of strings that should be highlighted as selected. + rectangle: The bounding box of the form field. Can be a `RectangleObject` + or a tuple of four floats (x1, y1, x2, y2). + font_resource: An optional variable that represents a PDF font dictionary. + font_name: The name of the font resource, e.g., "/Helv". + font_size: The font size. If 0, it's auto-calculated. + font_color: The font color string. + multiline: A boolean indicating if the text field is multiline. + alignment: Left-aligned (0), centered (1) or right-aligned (2) text.
+ + """ + super().__init__() + + # If a font resource was added, get the font character map + if font_resource: + font_resource = cast(DictionaryObject, font_resource.get_object()) + font_descriptor = FontDescriptor.from_font_resource(font_resource) + else: + logger_warning(f"Font dictionary for {font_name} not found; defaulting to Helvetica.", __name__) + font_name = "/Helv" + font_resource = DictionaryObject() + font_resource[NameObject("/Type")] = NameObject("/Font") + font_resource[NameObject("/Subtype")] = NameObject("/Type1") + font_resource[NameObject("/Name")] = NameObject("/Helv") + font_resource[NameObject("/BaseFont")] = NameObject("/Helvetica") + font_resource[NameObject("/Encoding")] = NameObject("/MacRomanEncoding") + font_descriptor = CORE_FONT_METRICS["Helvetica"] + + # Get the font glyph data + _font_subtype, _, font_encoding, font_map = build_char_map_from_dict( + 200, font_resource + ) + try: # remove width stored in -1 key + del font_map[-1] + except KeyError: + pass + font_glyph_byte_map: dict[str, bytes] + if isinstance(font_encoding, str): + font_glyph_byte_map = { + v: k.encode(font_encoding) for k, v in font_map.items() + } + else: + font_glyph_byte_map = {v: bytes((k,)) for k, v in font_encoding.items()} + font_encoding_rev = {v: bytes((k,)) for k, v in font_encoding.items()} + for key, value in font_map.items(): + font_glyph_byte_map[value] = font_encoding_rev.get(key, key) + + ap_stream_data = self._generate_appearance_stream_data( + text, + selection, + rectangle, + font_descriptor, + font_glyph_byte_map, + font_name, + font_size, + font_color, + multiline, + alignment + ) + + self[NameObject("/Type")] = NameObject("/XObject") + self[NameObject("/Subtype")] = NameObject("/Form") + self[NameObject("/BBox")] = RectangleObject(rectangle) + self.set_data(ByteStringObject(ap_stream_data)) + self[NameObject("/Length")] = NumberObject(len(ap_stream_data)) + # Update Resources with font information + self[NameObject("/Resources")] = 
DictionaryObject({ + NameObject("/Font"): DictionaryObject({ + NameObject(font_name): getattr(font_resource, "indirect_reference", font_resource) + }) + }) + + @classmethod + def from_text_annotation( + cls, + acro_form: DictionaryObject, # _root_object[CatalogDictionary.ACRO_FORM]) + field: DictionaryObject, + annotation: DictionaryObject, + user_font_name: str = "", + user_font_size: float = -1, + ) -> "TextStreamAppearance": + """ + Creates a TextStreamAppearance object from a text field annotation. + + This class method is a factory for creating a `TextStreamAppearance` + instance by extracting all necessary information (bounding box, font, + text content, etc.) from the PDF field and annotation dictionaries. + It respects inheritance for properties like default appearance (`/DA`). + + Args: + acro_form: The root AcroForm dictionary from the PDF catalog. + field: The field dictionary object. + annotation: The widget annotation dictionary object associated with the field. + user_font_name: An optional user-provided font name to override the + default. Defaults to an empty string. + user_font_size: An optional user-provided font size to override the + default. A value of -1 indicates no override. + + Returns: + A new `TextStreamAppearance` instance configured for the given field. 
+ + """ + # Calculate rectangle dimensions + _rectangle = cast(RectangleObject, annotation[AnnotationDictionaryAttributes.Rect]) + rectangle = RectangleObject((0, 0, abs(_rectangle[2] - _rectangle[0]), abs(_rectangle[3] - _rectangle[1]))) + + # Get default appearance dictionary from annotation + default_appearance = annotation.get_inherited( + AnnotationDictionaryAttributes.DA, + acro_form.get(AnnotationDictionaryAttributes.DA, None), + ) + if not default_appearance: + # Create a default appearance if none was found in the annotation + default_appearance = TextStringObject("/Helv 0 Tf 0 g") + else: + default_appearance = default_appearance.get_object() + + # Derive font name, size and color from the default appearance. Also set + # user-provided font name and font size in the default appearance, if given. + # For a font name, this presumes that we can find an associated font resource + # dictionary. Uses the variable font_properties as an intermediate. + # As per the PDF spec: + # "At a minimum, the string [that is, default_appearance] shall include a Tf (text + # font) operator along with its two operands, font and size" (Section 12.7.4.3 + # "Variable text" of the PDF 2.0 specification). 
+ font_properties = [prop for prop in re.split(r"\s", default_appearance) if prop] + font_name = font_properties.pop(font_properties.index("Tf") - 2) + font_size = float(font_properties.pop(font_properties.index("Tf") - 1)) + font_properties.remove("Tf") + font_color = " ".join(font_properties) + # Determine the font name to use, prioritizing the user's input + if user_font_name: + font_name = user_font_name + # Determine the font size to use, prioritizing the user's input + if user_font_size > 0: + font_size = user_font_size + + # Try to find a resource dictionary for the font + document_resources: Any = cast( + DictionaryObject, + cast( + DictionaryObject, + annotation.get_inherited( + "/DR", + acro_form.get("/DR", DictionaryObject()), + ), + ).get_object(), + ) + document_font_resources = document_resources.get("/Font", DictionaryObject()).get_object() + # CORE_FONT_METRICS is the dict with Standard font metrics + if font_name not in document_font_resources and font_name.removeprefix("/") not in CORE_FONT_METRICS: + # ...or AcroForm dictionary + document_resources = cast( + dict[Any, Any], + acro_form.get("/DR", {}), + ) + document_font_resources = document_resources.get_object().get("/Font", DictionaryObject()).get_object() + font_resource = document_font_resources.get(font_name, None) + if not is_null_or_none(font_resource): + font_resource = cast(DictionaryObject, font_resource.get_object()) + + # Retrieve field text, selected values and formatting information + multiline = False + field_flags = field.get(FieldDictionaryAttributes.Ff, 0) + alignment = field.get("/Q", 0) + if field_flags & FieldDictionaryAttributes.FfBits.Multiline: + multiline = True + if ( + field.get(FieldDictionaryAttributes.FT, "/Tx") == "/Ch" and + field_flags & FieldDictionaryAttributes.FfBits.Combo == 0 + ): + text = "\n".join(annotation.get_inherited(FieldDictionaryAttributes.Opt, [])) + selection = field.get("/V", []) + if not isinstance(selection, list): + selection = [selection] + 
else: # /Tx + text = field.get("/V", "") + selection = [] + + # Escape parentheses (PDF 1.7 reference, table 3.2, Literal Strings) + text = text.replace("\\", "\\\\").replace("(", r"\(").replace(")", r"\)") + + # Create the TextStreamAppearance instance + new_appearance_stream = cls( + text, + selection, + rectangle, + font_resource, + font_name, + font_size, + font_color, + multiline, + alignment + ) + if AnnotationDictionaryAttributes.AP in annotation: + for key, value in ( + cast(DictionaryObject, annotation[AnnotationDictionaryAttributes.AP]).get("/N", {}).items() + ): + if key not in {"/BBox", "/Length", "/Subtype", "/Type", "/Filter"}: + new_appearance_stream[key] = value + + return new_appearance_stream diff --git a/tests/test_appearance_stream.py b/tests/test_appearance_stream.py new file mode 100644 index 000000000..f2561372b --- /dev/null +++ b/tests/test_appearance_stream.py @@ -0,0 +1,62 @@ +"""Test the pypdf.generic._appearance_stream module.""" + +from pypdf.generic._appearance_stream import TextStreamAppearance + + +def test_scale_text(): + rectangle = (0, 0, 9.1, 55.4) + font_size = 10.1 + text = "Hello World" + multiline = False + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, multiline=multiline + ) + assert (str(font_size) + r" Tf").encode() in appearance_stream.get_data() + text = "This is a very very long sentence that probably will scale below the minimum font size" + font_size = 0.0 + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, multiline=multiline + ) + assert (b"4.0 Tf") in appearance_stream.get_data() + rectangle = (0, 0, 160, 360) + font_size = 0.0 + text = """Welcome to pypdf +pypdf is a free and open source pure-python PDF library capable of splitting, merging, cropping, and +transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF +files. pypdf can retrieve text and metadata from PDFs as well. 
+ +See pdfly for a CLI application that uses pypdf to interact with PDFs. + """ + multiline = True + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, multiline=multiline + ) + assert (b"12 Tf") in appearance_stream.get_data() + assert b"pypdf is a free and open" in appearance_stream.get_data() + rectangle = (0, 0, 160, 160) + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, multiline=multiline + ) + assert (b"8.8 Tf") in appearance_stream.get_data() + rectangle = (0, 0, 160, 12) + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, multiline=multiline + ) + text = """Option A +Option B +Option C +Option D +""" + selection = "Option A" + assert (b"4.0 Tf") in appearance_stream.get_data() + text = "pneumonoultramicroscopicsilicovolcanoconiosis" + appearance_stream = TextStreamAppearance( + text, selection, rectangle=rectangle, font_size=font_size, multiline=multiline + ) + assert (b"7.2 Tf") in appearance_stream.get_data() + rectangle = (0, 0, 10, 100) + text = "OneWord" + appearance_stream = TextStreamAppearance( + text, rectangle=rectangle, font_size=font_size, multiline=multiline + ) + assert (b"OneWord") in appearance_stream.get_data() diff --git a/tests/test_writer.py b/tests/test_writer.py index ce8d72c5f..be4c7d9c5 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2406,7 +2406,7 @@ def test_selfont(): b"Text_1" in writer.pages[0]["/Annots"][1].get_object()["/AP"]["/N"].get_data() ) assert ( - b"/F3 12 Tf" + b"/F3 12.0 Tf" in writer.pages[0]["/Annots"][2].get_object()["/AP"]["/N"].get_data() ) assert ( @@ -2427,7 +2427,7 @@ def test_no_resource_for_14_std_fonts(caplog): writer.update_page_form_field_values( p, {a["/T"]: "Brooks"}, auto_regenerate=False ) - assert "Font dictionary for /Helvetica not found." in caplog.text + assert "Font dictionary for /Helvetica not found; defaulting to Helvetica." 
in caplog.text @pytest.mark.enable_socket @@ -2439,7 +2439,7 @@ def test_field_box_upside_down(): writer.update_page_form_field_values(None, {"FreightTrainMiles": "0"}) assert writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"].get_data() == ( b"q\n/Tx BMC \nq\n1 1 105.29520000000001 10.835000000000036 re\n" - b"W\nBT\n/Arial 8.0 Tf 0 g\n2 2.8350000000000364 Td\n(0) Tj\nET\n" + b"W\nBT\n/Helv 8.0 Tf 0 g\n2 2.8350000000000364 Td\n(0) Tj\nET\n" b"Q\nEMC\nQ\n" ) box = writer.pages[0]["/Annots"][13].get_object()["/AP"]["/N"]["/BBox"]