Skip to content

Commit

Permalink
multi jpeg extract
Browse files Browse the repository at this point in the history
  • Loading branch information
fligt committed Jul 8, 2024
1 parent 2b50ba6 commit fc8af81
Show file tree
Hide file tree
Showing 3 changed files with 179 additions and 60 deletions.
175 changes: 136 additions & 39 deletions notebooks/20_extracting-image-data.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion notebooks/40_pdz-archeology.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"\n",
"As explained in the previous section, the currently prevalent pdz file format version **pdz25** contains multiple blocks of different types and variable size. The first two bytes of a **pdz25** file decode as `25`. This is not true for old 'legacy format' pdz files. In earlier days pdz files were formatted as one single block of data of fixed length. If a pdz file has a file size of 8698 or 4454 bytes you can tell that this is a legacy format pdz file. These files always start with two bytes with hexadecimal code `\\x01\\x01`. I call this format **pdz11**. \n",
"\n",
"To check a pdz file type import the `check_pdz_type()` function and provide a valid file path to a pdz file. Listing pdz files is easily done with the standard library function `glob.glob(*.pdz)`. "
"To check the type of a pdz file, import the `check_pdz_type()` function and provide a valid file path to a pdz file. Listing pdz files is easily done with the standard library function `glob.glob('*.pdz')`. "
]
},
{
Expand Down
62 changes: 42 additions & 20 deletions read_pdz/jpg_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,44 +3,66 @@
# %% auto 0
__all__ = ['extract_jpg']

# %% ../notebooks/20_extracting-image-data.ipynb 12
# %% ../notebooks/20_extracting-image-data.ipynb 16
from . import file_to_bytes, get_blocks, get_blocktypes
import numpy as np
import io
import re
import matplotlib.pyplot as plt
from PIL import Image

# %% ../notebooks/20_extracting-image-data.ipynb 13
def _iter_jpg_bytes(data):
    '''Yield every start-marker..end-marker byte run found in `data`, in order.'''
    soi = b'\xff\xd8'  # jpg start marker searched for by the original code
    eoi = b'\xff\xd9'  # jpg end marker searched for by the original code
    pos = 0
    while True:
        start = data.find(soi, pos)
        if start < 0:
            return
        # Search for the end marker only *after* the start marker. Searching
        # from the beginning of the buffer would pick up a stray end marker
        # that precedes the image and produce an empty, undecodable slice.
        end = data.find(eoi, start + len(soi))
        if end < 0:
            return
        # Advance a cursor instead of re-slicing the buffer, so the remaining
        # bytes are not copied once per image.
        pos = end + len(eoi)
        yield data[start:pos]


def extract_jpg(pdz_file, BLOCKTYPE=137, save_file=False):
    '''Extract (and optionally save) the jpg images stored in `pdz_file`.

    The file is parsed into blocks and the first block whose type equals
    `BLOCKTYPE` is scanned for jpg start/end markers. Every marker-delimited
    byte run is decoded with PIL and converted to a numpy array.

    Parameters
    ----------
    pdz_file : path of the pdz file to read.
    BLOCKTYPE : block type number that holds the image data (default 137).
    save_file : if truthy, write each image next to `pdz_file` as
        `<name>-<i>.jpg`, where `<name>` is `pdz_file` without a trailing
        `.pdz` and `<i>` is the index of the image in the returned list.

    Returns
    -------
    A list of image arrays `[im0, im1, ...]` (possibly empty when the block
    contains no marker pair), or `None` when the file has no block of type
    `BLOCKTYPE`.
    '''

    # parse into blocks
    pdz_bytes = file_to_bytes(pdz_file)
    block_list = get_blocks(pdz_bytes, verbose=False)

    # locate the image block (if present)
    blocktypes_list = get_blocktypes(block_list)

    if BLOCKTYPE not in blocktypes_list:
        print(f'Could not find jpg image data in: {pdz_file}')
        return None

    jpg_dict = block_list[blocktypes_list.index(BLOCKTYPE)]
    jpg_sandwich = jpg_dict['bytes'].tobytes()

    # multi-image extraction idea thanks to Lars Maxfield
    ims = [
        np.array(Image.open(io.BytesIO(jpg)))
        for jpg in _iter_jpg_bytes(jpg_sandwich)
    ]

    if save_file:
        # Strip the extension explicitly: a regex substitution that does not
        # match would leave the name unchanged and the save below would then
        # overwrite the pdz file itself.
        name = str(pdz_file)
        stem = name[:-len('.pdz')] if name.endswith('.pdz') else name

        for i, im in enumerate(ims):
            jpg_file = f'{stem}-{i}.jpg'
            print(f"Saving image file: '{jpg_file}'")
            plt.imsave(jpg_file, im)

    return ims

0 comments on commit fc8af81

Please sign in to comment.