multi jpeg extract

fligt · Jul 8, 2024 · fc8af81 · fc8af81
1 parent 2b50ba6
commit fc8af81
Show file tree

Hide file tree

Showing 3 changed files with 179 additions and 60 deletions.
diff --git a/notebooks/20_extracting-image-data.ipynb b/notebooks/20_extracting-image-data.ipynb
diff --git a/notebooks/40_pdz-archeology.ipynb b/notebooks/40_pdz-archeology.ipynb
@@ -39,7 +39,7 @@
     "\n",
     "As explained in the previous section, the currently prevalent pdz file format version **pdz25** contains multiple blocks of different types and variable size. The first two bytes of a **pdz25** file decode as `25`. This is not true for old 'legacy format' pdz files. In earlier days pdz files were formatted as one single block of data of fixed length. If a pdz file has a file size of 8698 or 4454 bytes you can tell that this is a legacy format pdz file. These files always start with two bytes with hexadecimal code `\\x01\\x01`. I call this format **pdz11**. \n",
     "\n",
-    "To check a pdz file type import the `check_pdz_type()` function and provide a valid file path to a pdz file. Listing pdz files is easily done  with the standard library function `glob.glob(*.pdz)`.       "
+    "To check a pdz file type, import the `check_pdz_type()` function and provide a valid file path to a pdz file. Listing pdz files is easily done with the standard library function `glob.glob('*.pdz')`."
    ]
   },
   {

diff --git a/read_pdz/jpg_extractor.py b/read_pdz/jpg_extractor.py
@@ -3,44 +3,66 @@
 # %% auto 0
 __all__ = ['extract_jpg']
 
-# %% ../notebooks/20_extracting-image-data.ipynb 12
+# %% ../notebooks/20_extracting-image-data.ipynb 16
 from . import file_to_bytes, get_blocks, get_blocktypes 
 import numpy as np 
 import io 
 import re
 import matplotlib.pyplot as plt 
 from PIL import Image
 
-# %% ../notebooks/20_extracting-image-data.ipynb 13
-def extract_jpg(pdz_file, BLOCKTYPE=137, save_file=False): 
-    '''Extract jpg image from `pdz_file`.'''
+# %% ../notebooks/20_extracting-image-data.ipynb 17
+def extract_jpg(pdz_file, BLOCKTYPE=137, save_file=False):
+    '''Extract and save jpg images from `pdz_file`.
     
-    # parse into blocks 
+    Returns a list of jpg images where ims = [ im0, im1, ... ].
+    '''
+
+    # parse into blocks
     pdz_bytes = file_to_bytes(pdz_file)
     block_list = get_blocks(pdz_bytes, verbose=False)
 
     # read block 137 (if present)
+
     blocktypes_list = get_blocktypes(block_list)
 
-    if BLOCKTYPE not in blocktypes_list: 
+    if BLOCKTYPE not in blocktypes_list:
+
         print(f'Could not find jpg image data in: {pdz_file}')
 
-    else: 
-        jpg_i = blocktypes_list.index(137)
+    else:
+        jpg_i = blocktypes_list.index(BLOCKTYPE)
         jpg_dict = block_list[jpg_i]
+
         jpg_sandwich = jpg_dict['bytes'].tobytes()
-
-        jpg_start = re.search(b'\xff\xd8', jpg_sandwich).span()[0]
-        jpg_end = re.search(b'\xff\xd9', jpg_sandwich).span()[1]
-        jpg = jpg_sandwich[jpg_start:jpg_end]
-
-        im = np.array(Image.open(io.BytesIO(jpg))) 
 
-        if save_file is True: 
-            jpg_file = re.sub('\.pdz$', '.jpg', pdz_file) 
-            print(f"Saving image file: '{jpg_file}'")
-            plt.imsave(jpg_file, im) 
+        # code below thanks to Lars Maxfield
+        # Repeatedly search for jpgs by consuming jpg_sandwich
+
+        ims = []
+        while True:
+            match_jpg_start = re.search(b'\xff\xd8', jpg_sandwich)
+            match_jpg_end = re.search(b'\xff\xd9', jpg_sandwich)
+
+            if not match_jpg_start or not match_jpg_end:
+
+                break
+
+            jpg_start = match_jpg_start.span()[0]
+            jpg_end = match_jpg_end.span()[1]
+            jpg = jpg_sandwich[jpg_start:jpg_end]
+
+            ims.append(np.array(Image.open(io.BytesIO(jpg))))
+
+            jpg_sandwich = jpg_sandwich[jpg_end:]
+
+        if save_file is True:
+            for i, im in enumerate(ims):
+                jpg_file = re.sub('\.pdz$', f'-{i}.jpg', pdz_file)
+
+                print(f"Saving image file: '{jpg_file}'")
+                plt.imsave(jpg_file, im)
 
-        return im 
+        return ims
 
-    return None 
+    return None
-Original file line number
+Diff line change
@@ Expand Up / @@ -39,7 +39,7 @@ @@
         "\n",
         "As explained in the previous section, the currently prevalent pdz file format version **pdz25** contains multiple blocks of different types and variable size. The first two bytes of a **pdz25** file decode as `25`. This is not true for old 'legacy format' pdz files. In earlier days pdz files were formatted as one single block of data of fixed length. If a pdz file has a file size of 8698 or 4454 bytes you can tell that this is a legacy format pdz file. These files always start with two bytes with hexadecimal code `\\x01\\x01`. I call this format **pdz11**. \n",
         "\n",
-        "To check a pdz file type import the `check_pdz_type()` function and provide a valid file path to a pdz file. Listing pdz files is easily done  with the standard library function `glob.glob(*.pdz)`.       "
+        "To check a pdz file type, import the `check_pdz_type()` function and provide a valid file path to a pdz file. Listing pdz files is easily done with the standard library function `glob.glob('*.pdz')`."
        ]
       },
       {
@@ Expand Down @@