Skip to content

Commit

Permalink
add dryrun option to truncateXMLDump()
Browse files (browse the repository at this point in the history)
  • Loading branch information
yzqzss committed Jul 9, 2024
1 parent 3254eb3 commit ed8ea38
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,8 +24,12 @@ def addNewline(filename: str) -> None:
f.write("\n")


def truncateXMLDump(filename: str) -> str:
"""Removes incomplete <page> elements from the end of XML dump files"""
def truncateXMLDump(filename: str, dryrun: bool = False) -> str:
"""
Removes incomplete <page> elements from the end of XML dump files
dryrun: bool - returns the incomplete segment without truncating the file
"""

with FileReadBackwards(filename, encoding="utf-8") as frb:
incomplete_segment: str = ""
Expand All @@ -36,6 +40,8 @@ def truncateXMLDump(filename: str) -> str:
while xml_line and "</page>" not in xml_line:
incomplete_segment = xml_line + incomplete_segment
xml_line = frb.readline()
if dryrun:
return incomplete_segment
incomplete_segment_size = len(incomplete_segment.encode("utf-8"))
file_size = os.path.getsize(filename)
if file_size > incomplete_segment_size:
Expand Down

0 comments on commit ed8ea38

Please sign in to comment.