From 96b52354ece55a6adc5bad80eb3b8244ff07bd38 Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Sat, 9 Nov 2024 20:52:46 -0800 Subject: [PATCH] Handle .npz Numpy files --- .gitattributes | 7 ++ .vscode/settings.json | 3 +- setup.cfg | 2 +- src/picklescan/scanner.py | 19 ++- tests/data2/int_array.npy | Bin 0 -> 144 bytes tests/data2/int_arrays.npz | Bin 0 -> 514 bytes tests/data2/int_arrays_compressed.npz | Bin 0 -> 387 bytes tests/{data => data2}/object_array.npy | Bin tests/data2/object_arrays.npz | Bin 0 -> 810 bytes tests/data2/object_arrays_compressed.npz | Bin 0 -> 648 bytes tests/test_scanner.py | 140 +++++++++++++++++++---- 11 files changed, 145 insertions(+), 26 deletions(-) create mode 100644 .gitattributes create mode 100644 tests/data2/int_array.npy create mode 100644 tests/data2/int_arrays.npz create mode 100644 tests/data2/int_arrays_compressed.npz rename tests/{data => data2}/object_array.npy (100%) create mode 100644 tests/data2/object_arrays.npz create mode 100644 tests/data2/object_arrays_compressed.npz diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..bdb1b77 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,7 @@ +*.pkl binary +*.pickle binary +*.bin binary +*.pt binary +*.zip binary +*.npy binary +*.npz binary \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 709a7ba..f9ca979 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,5 +5,6 @@ "tests" ], "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true + "python.testing.pytestEnabled": true, + "sarif-viewer.connectToGithubCodeScanning": "off" } \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index c52dab0..171d577 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = picklescan -version = 0.0.17 +version = 0.0.18 author = Matthieu Maitre author_email = mmaitre314@users.noreply.github.com description = Security scanner detecting Python Pickle files performing suspicious actions diff --git a/src/picklescan/scanner.py b/src/picklescan/scanner.py index aa358b7..61ce5dc 100644 --- a/src/picklescan/scanner.py +++ b/src/picklescan/scanner.py @@ -82,6 +82,13 @@ def __str__(self) -> str: "IntStorage", "ByteStorage", }, + "numpy": { + "dtype", + "ndarray", + }, + "numpy.core.multiarray": { + "_reconstruct", + }, "torch._utils": {"_rebuild_tensor_v2"}, } @@ -141,8 +148,7 @@ def __str__(self) -> str: # https://www.tensorflow.org/api_docs/python/tf/keras/models/load_model # -# TODO: support .npz files -_numpy_file_extensions = {".npy"} +_numpy_file_extensions = {".npy"} # Note: .npz is handled as zip files _pytorch_file_extensions = {".bin", ".pt", ".pth", ".ckpt"} _pickle_file_extensions = {".pkl", ".pickle", ".joblib", ".dat", ".data"} _zip_file_extensions = {".zip", ".npz"} @@ -301,10 +307,15 @@ def scan_zip_bytes(data: IO[bytes], file_id) -> ScanResult: file_names = zip.namelist() _log.debug("Files in archive %s: %s", file_id, file_names) for file_name in file_names: - if os.path.splitext(file_name)[1] in _pickle_file_extensions: + file_ext = os.path.splitext(file_name)[1] + if file_ext in _pickle_file_extensions: _log.debug("Scanning file %s in zip archive %s", file_name, file_id) with zip.open(file_name, "r") as file: result.merge(scan_pickle_bytes(file, f"{file_id}:{file_name}")) + elif file_ext in _numpy_file_extensions: + _log.debug("Scanning file %s in zip archive %s", file_name, file_id) + with zip.open(file_name, "r") as file: + result.merge(scan_numpy(file, f"{file_id}:{file_name}")) return result @@ -323,7 +334,7 @@ def scan_numpy(data: IO[bytes], file_id) -> ScanResult: data.seek(-min(N, len(magic)), 1) # back-up if magic.startswith(_ZIP_PREFIX) or magic.startswith(_ZIP_SUFFIX): # .npz file - raise NotImplementedError("Scanning of .npz files is not implemented yet") + raise ValueError(f".npz file not handled as zip file: {file_id}") elif magic == np.lib.format.MAGIC_PREFIX: # .npy file diff --git a/tests/data2/int_array.npy b/tests/data2/int_array.npy new file mode 100644 index 0000000000000000000000000000000000000000..2cc58255330b0883efe0639eeb5bb9563c7f6997 GIT binary patch literal 144 zcmbR27wQ`j$;eQ~P_3SlTAW;@Zl$1ZlWC%^qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= jXCxM+0{I$7ItoUbItsN4WCJcn1_lNuAZ7+)79a)y4!s+A literal 0 HcmV?d00001 diff --git a/tests/data2/int_arrays.npz b/tests/data2/int_arrays.npz new file mode 100644 index 0000000000000000000000000000000000000000..c052742d7d787db9a41d1145fd08715e0af573cb GIT binary patch literal 514 zcmWIWW@Zs#fB;1XrZ;<6_5e8`%*r6bkf@heP|3(302Tl#0!e_tWWP|~fJjD$GKOmP zl+@znB6TYTb(>5RbsYuuwEUuyqQt!T{Gyapkhoi7PH`$wyf`DVAQi~hFxJu3QK(g* z0C0iazzD=lKpcSL1$_ws?eF@PjNVD=mFSJDC} zPXrtbSUYc0)SQ?lc}o}MP0n1jZt;S7)8a#drpyqZA3tRhmr%L)8BIUdF7b2~u_a7D zrH)z6vf|pxCBmS`rjWprq^7{e5P;$YeGU#cOQUsC!fysWMz5$Vp3}p<} z>M5zk$wlf`3hFif>N*PQY57GZMTvRw`9&$IAYr$}oZ?iVcyUHzK`M~1VXUL6qfo0r z3E*mAPR=XMEvVE>&M!*U%Pq|*$xJLNO049HFG@|$&nqq|DorloDrA7D;>t^b$QCks zGqe^mIWr~|GPf791QoJscr$u4w-&NN6{eI_7Nl|&vPUohJ?U>z$kEx+UdS0#$fe=U z90BrsO(D0RpP$!%AOI8I_9cZpNu9BUyc*sNK(m-qOA7gt^a4`*RRz@ZfU;sh@dU}Nl1E~gZG6kpJ0B=++sBwX;B^;;;95VsltZX0& OCLl}$((6EpfdK%LtGsLg literal 0 HcmV?d00001 diff --git a/tests/data2/object_arrays_compressed.npz b/tests/data2/object_arrays_compressed.npz new file mode 100644 index 0000000000000000000000000000000000000000..53777456038de5a48497cefc794390dcecc4ca8f GIT binary patch literal 648 zcmWIWW@Zs#U|`??VnqgtgC>*C16c};3=FIcA`FRoc?Fe>3<6*Qpg03a5(H+y5q~8u zaPmaJv4FMnCPmGOS(3MOLEPi@>lQDVH!VISXvz%n`SDXGaS4@qpV9PV?GjH{5nICa zQ|g%2EGw?9TvHgXO1jdbi`}jk40CW{Mqwp(xOXG zgw{E+b!e^IBhrzhxY*n|TXC{+a*$$iwW4=`;_5c@#pcD?iqav8UY@CrTH%V-EW*sp z&HG&y-P7Hx71IM0r?;6GvnU6wSvKMJ`Sa&r+Y2;s=Kr4LG;hhd8%}Y&`D_a!eO9Wr z)_7Ji1fYb%jXTd;kV7G9_=ds)u6(ilYgZ;Y?NhM@h5{p#2r$;Mrv`AmgNX)U>R^DR g4RkH2L5Hk`4X6nmkOAJTY#<3HAWQ?&JWL=S0E61vzyJUM literal 0 HcmV?d00001 diff --git a/tests/test_scanner.py b/tests/test_scanner.py index 40f6e27..bd9e88c 100644 --- a/tests/test_scanner.py +++ b/tests/test_scanner.py @@ -183,15 +183,50 @@ def initialize_zip_file(path, file_name, data): zip.writestr(file_name, data) -def initialize_numpy_file(path): +def initialize_numpy_files(): import numpy as np - # create numpy object array - with open(path, "wb") as f: - data = [(1, 2), (3, 4)] + os.makedirs(f"{_root_path}/data2", exist_ok=True) + + path = f"{_root_path}/data2/object_array.npy" + if not os.path.exists(path): x = np.empty((2, 2), dtype=object) - x[:] = data - np.save(f, x) + x[:] = [(1, 2), (3, 4)] + np.save(path, x) + + path = f"{_root_path}/data2/int_array.npy" + if not os.path.exists(path): + x = np.empty((2, 2), dtype=int) + x[:] = [(1, 2), (3, 4)] + np.save(path, x) + + path = f"{_root_path}/data2/object_arrays.npz" + if not os.path.exists(path): + np.savez( + path, + a=np.array([0, 1, 2], dtype=object), + b=np.array([3, 4, 5], dtype=object), + ) + + path = f"{_root_path}/data2/int_arrays.npz" + if not os.path.exists(path): + np.savez( + path, a=np.array([0, 1, 2], dtype=int), b=np.array([3, 4, 5], dtype=int) + ) + + path = f"{_root_path}/data2/object_arrays_compressed.npz" + if not os.path.exists(path): + np.savez_compressed( + path, + a=np.array([0, 1, 2], dtype=object), + b=np.array([3, 4, 5], dtype=object), + ) + + path = f"{_root_path}/data2/int_arrays_compressed.npz" + if not os.path.exists(path): + np.savez_compressed( + path, a=np.array([0, 1, 2], dtype=int), b=np.array([3, 4, 5], dtype=int) + ) def initialize_pickle_files(): @@ -364,13 +399,12 @@ def initialize_pickle_files(): pickle.dumps(Malicious1(), protocol=4), ) - initialize_numpy_file(f"{_root_path}/data/object_array.npy") - # Fake PyTorch file (PNG file format) simulating https://huggingface.co/RectalWorm/loras_new/blob/main/Owl_Mage_no_background.pt initialize_data_file(f"{_root_path}/data/bad_pytorch.pt", b"\211PNG\r\n\032\n") initialize_pickle_files() +initialize_numpy_files() def compare_scan_results(sr1: ScanResult, sr2: ScanResult): @@ -411,19 +445,32 @@ def test_scan_zip_bytes(): def test_scan_numpy(): - scan_result = ScanResult( - [ - Global("numpy.core.multiarray", "_reconstruct", SafetyLevel.Suspicious), - Global("numpy", "ndarray", SafetyLevel.Suspicious), - Global("numpy", "dtype", SafetyLevel.Suspicious), - ], - 1, - 0, - 0, - ) - with open(f"{_root_path}/data/object_array.npy", "rb") as f: + with open(f"{_root_path}/data2/object_array.npy", "rb") as f: compare_scan_results( - scan_numpy(io.BytesIO(f.read()), "object_array.npy"), scan_result + scan_numpy(io.BytesIO(f.read()), "object_array.npy"), + ScanResult( + [ + Global( + "numpy.core.multiarray", "_reconstruct", SafetyLevel.Innocuous + ), + Global("numpy", "ndarray", SafetyLevel.Innocuous), + Global("numpy", "dtype", SafetyLevel.Innocuous), + ], + 1, + 0, + 0, + ), + ) + + with open(f"{_root_path}/data2/int_array.npy", "rb") as f: + compare_scan_results( + scan_numpy(io.BytesIO(f.read()), "int_array.npy"), + ScanResult( + [], + 1, + 0, + 0, + ), ) @@ -581,6 +628,59 @@ def test_scan_file_path(): ) +def test_scan_file_path_npz(): + + compare_scan_results( + scan_file_path(f"{_root_path}/data2/object_arrays.npz"), + ScanResult( + [ + Global("numpy.core.multiarray", "_reconstruct", SafetyLevel.Innocuous), + Global("numpy", "ndarray", SafetyLevel.Innocuous), + Global("numpy", "dtype", SafetyLevel.Innocuous), + ] + * 2, + 2, + 0, + 0, + ), + ) + + compare_scan_results( + scan_file_path(f"{_root_path}/data2/int_arrays.npz"), + ScanResult( + [], + 2, + 0, + 0, + ), + ) + + compare_scan_results( + scan_file_path(f"{_root_path}/data2/object_arrays_compressed.npz"), + ScanResult( + [ + Global("numpy.core.multiarray", "_reconstruct", SafetyLevel.Innocuous), + Global("numpy", "ndarray", SafetyLevel.Innocuous), + Global("numpy", "dtype", SafetyLevel.Innocuous), + ] + * 2, + 2, + 0, + 0, + ), + ) + + compare_scan_results( + scan_file_path(f"{_root_path}/data2/int_arrays_compressed.npz"), + ScanResult( + [], + 2, + 0, + 0, + ), + ) + + def test_scan_directory_path(): sr = ScanResult( globals=[