Skip to content

Commit

Permalink
Create new pipes for ignoring files #238
Browse files Browse the repository at this point in the history
    * Create pipes that ignore media files and data files with no clues
    * Update test results

Signed-off-by: Jono Yang <[email protected]>
  • Loading branch information
JonoYang committed Aug 2, 2021
1 parent aaafc04 commit 730e808
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 11 deletions.
13 changes: 12 additions & 1 deletion scanpipe/pipelines/windows_docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

from scanpipe.pipelines.docker import Docker
from scanpipe.pipes import docker
from scanpipe.pipes import rootfs
from scanpipe.pipes import windows


Expand All @@ -46,6 +47,7 @@ def steps(cls):
cls.scan_for_application_packages,
cls.scan_for_files,
cls.analyze_scanned_files,
cls.tag_data_files_with_no_clues,
cls.tag_not_analyzed_codebase_resources,
)

Expand All @@ -58,14 +60,23 @@ def tag_known_software_packages(self):

def tag_uninteresting_codebase_resources(self):
    """
    Flag files that are known to be uninteresting.

    Delegates, in this order, to the whiteout, Windows, ignorable-resource
    and media-file tagging pipes for the current project.
    """
    project = self.project
    docker.tag_whiteout_codebase_resources(project)
    windows.tag_uninteresting_windows_codebase_resources(project)
    rootfs.tag_ignorable_codebase_resources(project)
    rootfs.tag_media_files_as_uninteresting(project)

def tag_program_files_dirs_as_packages(self):
    """
    Report each immediate subdirectory of `Program Files` and
    `Program Files (x86)` as a package.
    """
    project = self.project
    windows.tag_program_files(project)

def tag_data_files_with_no_clues(self):
    """
    Mark data files that carry no clues towards their origin as
    uninteresting.
    """
    project = self.project
    rootfs.tag_data_files_with_no_clues(project)
65 changes: 65 additions & 0 deletions scanpipe/pipes/rootfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,3 +358,68 @@ def tag_ignorable_codebase_resources(project):
lookups |= Q(rootfs_path__iregex=translated_pattern)
qs = project.codebaseresources.no_status()
qs.filter(lookups).update(status="ignored-default-ignores")


def tag_data_files_with_no_clues(project):
    """
    Tag CodebaseResources that have a file type of `data` and no detected
    clues to be uninteresting.

    A resource matches when its `file_type` is exactly "data" and every clue
    field listed below is an empty list. No status filter is applied before
    the update; matching resources get the "ignored-data-file-no-clues"
    status.
    """
    clue_fields = (
        "copyrights",
        "holders",
        "authors",
        "licenses",
        "license_expressions",
        "emails",
        "urls",
    )
    # Every clue field must be empty for the resource to be considered clueless.
    empty_clues = {field_name: [] for field_name in clue_fields}
    data_without_clues = Q(file_type="data", **empty_clues)

    matching_resources = project.codebaseresources.filter(data_without_clues)
    matching_resources.update(status="ignored-data-file-no-clues")


def tag_media_files_as_uninteresting(project):
    """
    Tag CodebaseResources that are media files to be uninteresting.

    A resource is treated as a media file when its `mime_type` or its
    `file_type` contains (case-insensitively) one of the markers listed
    below. Only resources without a status are considered; matches get the
    "ignored-media-file" status.
    """
    # `mimes` and `types` were taken from TypeCode
    # https://github.com/nexB/typecode/blob/c38f6831c59acae02a34a1288b9ce16e2e1f1733/src/typecode/contenttype.py#L528
    mimes = (
        "image",
        "picture",
        "audio",
        "video",
        "graphic",
        "sound",
    )
    # "image data" used to be listed twice; the duplicate only added a
    # redundant OR clause to the generated query.
    types = (
        "image data",
        "graphics image",
        "ms-windows metafont .wmf",
        "windows enhanced metafile",
        "png image",
        "interleaved image",
        "microsoft asf",
        "image text",
        "photoshop image",
        "shop pro image",
        "ogg data",
        "vorbis",
        "mpeg",
        "theora",
        "bitmap",
        "audio",
        "video",
        "sound",
        "riff",
        "icon",
        "pc bitmap",
        "netpbm",
    )

    # OR together one case-insensitive substring lookup per marker.
    lookup = Q()
    for mime_marker in mimes:
        lookup |= Q(mime_type__icontains=mime_marker)
    for type_marker in types:
        lookup |= Q(file_type__icontains=type_marker)

    qs = project.codebaseresources.no_status()
    qs.filter(lookup).update(status="ignored-media-file")
16 changes: 10 additions & 6 deletions scanpipe/pipes/windows.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def tag_uninteresting_windows_codebase_resources(project):
"SECURITY",
"SOFTWARE",
"SYSTEM",
"system.ini",
)

uninteresting_file_extensions = (
Expand All @@ -68,11 +69,13 @@ def tag_uninteresting_windows_codebase_resources(project):
".LOG",
".inf_loc",
".NLS",
".dat",
".pem",
)

lookups = Q()
for file_name in uninteresting_files:
lookups |= Q(path__iendswith=file_name)
lookups |= Q(rootfs_path__iendswith=file_name)
for file_extension in uninteresting_file_extensions:
lookups |= Q(extension__icontains=file_extension)

Expand Down Expand Up @@ -144,7 +147,7 @@ def tag_known_software(project):
# We do not want to tag the files in the `site-packages` directory as being
# from Python proper. The packages found here are oftentimes third-party
# packages from outside the Python foundation.
q_objects = [~Q(rootfs_path__icontains='site-packages')]
q_objects = [~Q(rootfs_path__icontains="site-packages")]
for python_version, python_path in python_paths_by_versions.items():
python_package = Package(
type="windows-program",
Expand All @@ -155,7 +158,10 @@ def tag_known_software(project):
homepage_url="https://www.python.org/",
)
tag_installed_package_files(
project=project, root_dir_pattern=python_path, package=python_package, q_objects=q_objects
project=project,
root_dir_pattern=python_path,
package=python_package,
q_objects=q_objects,
)

qs = project.codebaseresources.no_status()
Expand Down Expand Up @@ -210,9 +216,7 @@ def tag_program_files(project):
"""
qs = project.codebaseresources.no_status()
# Get all files from Program Files and Program Files (x86)
program_files_one_directory_below_pattern = (
r"(^.*Program Files( \(x86\))?/([^/]+))"
)
program_files_one_directory_below_pattern = r"(^.*Program Files( \(x86\))?/([^/]+))"
program_files_one_directory_below_pattern_compiled = re.compile(
program_files_one_directory_below_pattern
)
Expand Down
51 changes: 47 additions & 4 deletions scanpipe/tests/test_pipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,9 +778,9 @@ def test_scanpipe_pipes_windows_tag_uninteresting_windows_codebase_resources(sel
)
resource4 = CodebaseResource.objects.create(
project=p1,
path="root/Files/User/Test/foo.dat",
rootfs_path="/Files/User/Test/foo.dat",
extension=".dat",
path="root/Files/should-not-be-ignored.txt",
rootfs_path="/Files/should-not-be-ignored.txt",
extension=".txt",
)

windows.tag_uninteresting_windows_codebase_resources(p1)
Expand All @@ -790,7 +790,7 @@ def test_scanpipe_pipes_windows_tag_uninteresting_windows_codebase_resources(sel
resource4.refresh_from_db()
self.assertEqual("ignored-not-interesting", resource1.status)
self.assertEqual("ignored-not-interesting", resource2.status)
self.assertEqual("", resource3.status)
self.assertEqual("ignored-not-interesting", resource3.status)
self.assertEqual("", resource4.status)

def test_scanpipe_pipes_windows_tag_known_software(self):
Expand Down Expand Up @@ -914,6 +914,49 @@ def test_scanpipe_pipes_rootfs_tag_ignorable_codebase_resources(self):
self.assertEqual("ignored-default-ignores", resource4.status)
self.assertEqual("", resource5.status)

def test_scanpipe_pipes_rootfs_tag_data_files_with_no_clues(self):
    """
    A `data` file without clues is tagged; one with a license expression
    keeps its empty status.
    """
    project = Project.objects.create(name="Analysis")
    create_resource = CodebaseResource.objects.create
    clueless = create_resource(
        project=project,
        path="root/user/foo.data",
        rootfs_path="/user/foo.data",
        file_type="data",
    )
    licensed = create_resource(
        project=project,
        path="root/user/bar.data",
        rootfs_path="/user/bar.data",
        file_type="data",
        license_expressions=["apache-2.0"],
    )

    rootfs.tag_data_files_with_no_clues(project)

    for resource in (clueless, licensed):
        resource.refresh_from_db()
    self.assertEqual("ignored-data-file-no-clues", clueless.status)
    self.assertEqual("", licensed.status)

def test_scanpipe_pipes_rootfs_tag_media_files_as_uninteresting(self):
    """
    Resources whose mime type or file type identifies an image are tagged
    as ignored media files.
    """
    project = Project.objects.create(name="Analysis")
    create_resource = CodebaseResource.objects.create
    png_resource = create_resource(
        project=project,
        path="root/user/foo.png",
        rootfs_path="/user/foo.png",
        mime_type="image/png",
        file_type="image/png",
    )
    jpeg_resource = create_resource(
        project=project,
        path="root/user/bar.jpg",
        rootfs_path="/user/bar.jpg",
        mime_type="image/jpeg",
        file_type="JPEG image data",
    )

    rootfs.tag_media_files_as_uninteresting(project)

    for resource in (png_resource, jpeg_resource):
        resource.refresh_from_db()
    self.assertEqual("ignored-media-file", png_resource.status)
    self.assertEqual("ignored-media-file", jpeg_resource.status)


class ScanPipePipesTransactionTest(TransactionTestCase):
"""
Expand Down

0 comments on commit 730e808

Please sign in to comment.