def tag_data_files_with_no_clues(project):
    """
    Tag CodebaseResources that have a file type of `data` and no detected clues
    to be uninteresting.

    A resource matches when its `file_type` is exactly "data" and each of the
    `copyrights`, `holders`, `authors`, `licenses`, `license_expressions`,
    `emails` and `urls` fields is an empty list. Matching resources get the
    "ignored-data-file-no-clues" status.
    """
    lookup = Q(
        file_type="data",
        copyrights=[],
        holders=[],
        authors=[],
        licenses=[],
        license_expressions=[],
        emails=[],
        urls=[],
    )
    # NOTE(review): unlike the other tagging helpers of this module, the filter
    # is applied to every resource of the project and not only to the
    # `no_status()` ones, so a status set by an earlier step is replaced.
    # Confirm this is intended before narrowing the queryset.
    project.codebaseresources.filter(lookup).update(status="ignored-data-file-no-clues")


def tag_media_files_as_uninteresting(project):
    """
    Tag CodebaseResources that are media files to be uninteresting.

    A resource is treated as a media file when its `mime_type` contains one of
    the `mimes` fragments or its `file_type` contains one of the `types`
    fragments, compared case-insensitively. Only the resources returned by
    `no_status()` are updated; they get the "ignored-media-file" status.
    """
    # `mimes` and `types` were taken from TypeCode
    # https://github.com/nexB/typecode/blob/c38f6831c59acae02a34a1288b9ce16e2e1f1733/src/typecode/contenttype.py#L528
    mimes = (
        "image",
        "picture",
        "audio",
        "video",
        "graphic",
        "sound",
    )
    # Each fragment is listed once: a repeated entry only adds a redundant OR
    # clause to the generated query.
    types = (
        "image data",
        "graphics image",
        "ms-windows metafont .wmf",
        "windows enhanced metafile",
        "png image",
        "interleaved image",
        "microsoft asf",
        "image text",
        "photoshop image",
        "shop pro image",
        "ogg data",
        "vorbis",
        "mpeg",
        "theora",
        "bitmap",
        "audio",
        "video",
        "sound",
        "riff",
        "icon",
        "pc bitmap",
        "netpbm",
    )

    # Start from an empty Q and OR every fragment lookup into it.
    lookup = Q()
    for mime_fragment in mimes:
        lookup |= Q(mime_type__icontains=mime_fragment)
    for type_fragment in types:
        lookup |= Q(file_type__icontains=type_fragment)

    qs = project.codebaseresources.no_status()
    qs.filter(lookup).update(status="ignored-media-file")
not want to tag the files in the `site-packages` directory as being # from Python proper. The packages found here are oftentime third-party # packages from outside the Python foundation - q_objects = [~Q(rootfs_path__icontains='site-packages')] + q_objects = [~Q(rootfs_path__icontains="site-packages")] for python_version, python_path in python_paths_by_versions.items(): python_package = Package( type="windows-program", @@ -155,7 +158,10 @@ def tag_known_software(project): homepage_url="https://www.python.org/", ) tag_installed_package_files( - project=project, root_dir_pattern=python_path, package=python_package, q_objects=q_objects + project=project, + root_dir_pattern=python_path, + package=python_package, + q_objects=q_objects, ) qs = project.codebaseresources.no_status() @@ -210,9 +216,7 @@ def tag_program_files(project): """ qs = project.codebaseresources.no_status() # Get all files from Program Files and Program Files (x86) - program_files_one_directory_below_pattern = ( - r"(^.*Program Files( \(x86\))?/([^/]+))" - ) + program_files_one_directory_below_pattern = r"(^.*Program Files( \(x86\))?/([^/]+))" program_files_one_directory_below_pattern_compiled = re.compile( program_files_one_directory_below_pattern ) diff --git a/scanpipe/tests/test_pipes.py b/scanpipe/tests/test_pipes.py index 1072aac79..d3fcc729b 100644 --- a/scanpipe/tests/test_pipes.py +++ b/scanpipe/tests/test_pipes.py @@ -778,9 +778,9 @@ def test_scanpipe_pipes_windows_tag_uninteresting_windows_codebase_resources(sel ) resource4 = CodebaseResource.objects.create( project=p1, - path="root/Files/User/Test/foo.dat", - rootfs_path="/Files/User/Test/foo.dat", - extension=".dat", + path="root/Files/should-not-be-ignored.txt", + rootfs_path="/Files/should-not-be-ignored.txt", + extension=".txt", ) windows.tag_uninteresting_windows_codebase_resources(p1) @@ -790,7 +790,7 @@ def test_scanpipe_pipes_windows_tag_uninteresting_windows_codebase_resources(sel resource4.refresh_from_db() 
def test_scanpipe_pipes_rootfs_tag_data_files_with_no_clues(self):
    """
    Only `data` files without any detected clue are tagged.
    """
    p1 = Project.objects.create(name="Analysis")
    # `data` file type, no clues: expected to be tagged.
    resource1 = CodebaseResource.objects.create(
        project=p1,
        path="root/user/foo.data",
        rootfs_path="/user/foo.data",
        file_type="data",
    )
    # `data` file type with a license clue: expected to be left alone.
    resource2 = CodebaseResource.objects.create(
        project=p1,
        path="root/user/bar.data",
        rootfs_path="/user/bar.data",
        file_type="data",
        license_expressions=["apache-2.0"],
    )
    # Negative control: no clues, but the file type is not `data`.
    resource3 = CodebaseResource.objects.create(
        project=p1,
        path="root/user/notes.txt",
        rootfs_path="/user/notes.txt",
        file_type="ASCII text",
    )

    rootfs.tag_data_files_with_no_clues(p1)

    resource1.refresh_from_db()
    resource2.refresh_from_db()
    resource3.refresh_from_db()
    self.assertEqual("ignored-data-file-no-clues", resource1.status)
    self.assertEqual("", resource2.status)
    self.assertEqual("", resource3.status)


def test_scanpipe_pipes_rootfs_tag_media_files_as_uninteresting(self):
    """
    Media files are tagged through their mime type or file type; other files
    keep an empty status.
    """
    p1 = Project.objects.create(name="Analysis")
    resource1 = CodebaseResource.objects.create(
        project=p1,
        path="root/user/foo.png",
        rootfs_path="/user/foo.png",
        mime_type="image/png",
        file_type="image/png",
    )
    resource2 = CodebaseResource.objects.create(
        project=p1,
        path="root/user/bar.jpg",
        rootfs_path="/user/bar.jpg",
        mime_type="image/jpeg",
        file_type="JPEG image data",
    )
    # Negative control: without it the test would still pass if the helper
    # tagged every resource of the project.
    resource3 = CodebaseResource.objects.create(
        project=p1,
        path="root/user/readme.txt",
        rootfs_path="/user/readme.txt",
        mime_type="text/plain",
        file_type="ASCII text",
    )

    rootfs.tag_media_files_as_uninteresting(p1)

    resource1.refresh_from_db()
    resource2.refresh_from_db()
    resource3.refresh_from_db()
    self.assertEqual("ignored-media-file", resource1.status)
    self.assertEqual("ignored-media-file", resource2.status)
    self.assertEqual("", resource3.status)