From c3598908506b5cfb28b707204c315bf4d1626852 Mon Sep 17 00:00:00 2001
From: Chris Adams <cadams@loc.gov>
Date: Tue, 15 Oct 2024 16:35:23 -0400
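Subject: [PATCH] Use Ruff + pre-commit for linting & formatting

* Add a `ruff` job to the GitHub Actions workflow; the `test` job now
  depends on it.
* Add a pre-commit configuration running Ruff and the standard
  pre-commit-hooks checks.
* Apply the resulting formatting changes to bagit.py, bench.py and
  pyproject.toml.
* Remove the Python 2 `force_unicode` shim from bagit.py and call `str`
  directly (on Python 3 `force_unicode` was already an alias for `str`).
* README: document `python -m unittest discover` in place of
  `python setup.py test`.
* test.py: update the expected SHA-1 of bag-info.txt in
  test_sha1_tagfile.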
---
 .github/workflows/test.yml |  8 +++++
 .pre-commit-config.yaml    | 36 +++++++++++++++++++
 README.rst                 |  4 +--
 bagit.py                   | 73 +++++++++++++++++++-------------------
 bench.py                   |  2 +-
 pyproject.toml             |  1 -
 test.py                    |  2 +-
 7 files changed, 84 insertions(+), 42 deletions(-)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9185bec..b37ba86 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -7,7 +7,15 @@ on:
         branches: [master]
 
 jobs:
+    ruff: # https://docs.astral.sh/ruff
+        runs-on: ubuntu-latest
+        steps:
+            - uses: actions/checkout@v4
+            - run: pip install --user ruff==0.6.9 # keep in step with the ruff-pre-commit rev in .pre-commit-config.yaml
+            - run: ruff check --output-format=github
+
     test:
+        needs: ruff
         runs-on: ubuntu-latest
         strategy:
             fail-fast: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..9f1dc32
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,36 @@
+exclude: ".*test-data.*" # global filter: no hook runs on paths containing "test-data"
+
+repos:
+    - repo: https://github.com/astral-sh/ruff-pre-commit
+      rev: v0.6.9
+      hooks:
+          - id: ruff
+            args: [--fix, --exit-non-zero-on-fix] # autofix, but exit non-zero whenever a fix was applied
+          - id: ruff-format
+
+    - repo: https://github.com/pre-commit/pre-commit-hooks
+      rev: v5.0.0
+      hooks:
+          - id: check-added-large-files
+            args: ["--maxkb=128"] # reject newly added files larger than 128 kB
+          - id: check-ast
+          - id: fix-byte-order-marker # successor to the deprecated check-byte-order-marker; removes a UTF-8 BOM
+          - id: check-case-conflict
+          - id: check-docstring-first
+          - id: check-executables-have-shebangs
+          - id: check-json
+          - id: check-merge-conflict
+          - id: check-symlinks
+          - id: check-xml
+          - id: check-yaml
+            args: ["--unsafe"] # syntax-only check; does not attempt to load the documents
+          - id: debug-statements
+          - id: detect-aws-credentials
+            args: ["--allow-missing-credentials"] # let the hook pass when no credentials are detected
+          - id: detect-private-key
+          - id: end-of-file-fixer
+          - id: mixed-line-ending
+            args: ["--fix=lf"] # normalise every line ending to LF
+          - id: trailing-whitespace
+          - id: pretty-format-json
+            args: ["--autofix", "--no-sort-keys", "--indent=4"] # rewrite in place, keep key order, 4-space indent
diff --git a/README.rst b/README.rst
index d134c4b..c109c83 100644
--- a/README.rst
+++ b/README.rst
@@ -226,11 +226,11 @@ Contributing to bagit-python development
 Running the tests
 ~~~~~~~~~~~~~~~~~
 
-You can quickly run the tests by having setuptools install dependencies:
+You can quickly run the tests using the built-in unittest framework:
 
 ::
 
-    python setup.py test
+    python -m unittest discover
 
 If you have Docker installed, you can run the tests under Linux inside a
 container:
diff --git a/bagit.py b/bagit.py
index 458fba8..944bf94 100755
--- a/bagit.py
+++ b/bagit.py
@@ -140,7 +140,7 @@ def find_locale_dir():
 open_text_file = partial(codecs.open, encoding="utf-8", errors="strict")
 
 # This is the same as decoding the byte values in codecs.BOM:
-UNICODE_BYTE_ORDER_MARK = "\uFEFF"
+UNICODE_BYTE_ORDER_MARK = "\ufeff"
 
 
 def make_bag(
@@ -422,8 +422,8 @@ def compare_manifests_with_fs(self):
 
     def compare_fetch_with_fs(self):
         """Compares the fetch entries with the files actually
-           in the payload, and returns a list of all the files
-           that still need to be fetched.
+        in the payload, and returns a list of all the files
+        that still need to be fetched.
         """
 
         files_on_fs = set(self.payload_files())
@@ -449,7 +449,7 @@ def payload_files(self):
                 yield rel_path
 
     def payload_entries(self):
-        """Return a dictionary of items """
+        """Return a dictionary of items"""
         # Don't use dict comprehension (compatibility with Python < 2.7)
         return dict(
             (key, value)
@@ -618,7 +618,9 @@ def is_valid(self, processes=1, fast=False, completeness_only=False):
         """
 
         try:
-            self.validate(processes=processes, fast=fast, completeness_only=completeness_only)
+            self.validate(
+                processes=processes, fast=fast, completeness_only=completeness_only
+            )
         except BagError:
             return False
 
@@ -776,7 +778,10 @@ def validate_fetch(self):
 
             # each parsed url must resolve to a scheme and point to a netloc
             # if the scheme is file, netloc is not necessary
-            if not (all((parsed_url.scheme, parsed_url.netloc)) or parsed_url.scheme == "file"):
+            if not (
+                all((parsed_url.scheme, parsed_url.netloc))
+                or parsed_url.scheme == "file"
+            ):
                 raise BagError(_("Malformed URL in fetch.txt: %s") % url)
 
     def _validate_contents(self, processes=1, fast=False, completeness_only=False):
@@ -851,11 +856,11 @@ def _validate_completeness(self):
         only_in_manifests, only_on_fs = self.compare_manifests_with_fs()
         for path in only_in_manifests:
             e = FileMissing(path)
-            LOGGER.warning(force_unicode(e))
+            LOGGER.warning(str(e))
             errors.append(e)
         for path in only_on_fs:
             e = UnexpectedFile(path)
-            LOGGER.warning(force_unicode(e))
+            LOGGER.warning(str(e))
             errors.append(e)
 
         if errors:
@@ -906,7 +911,7 @@ def _validate_entries(self, processes):
                     e = ChecksumMismatch(
                         rel_path, alg, stored_hash.lower(), computed_hash
                     )
-                    LOGGER.warning(force_unicode(e))
+                    LOGGER.warning(str(e))
                     errors.append(e)
 
         if errors:
@@ -963,7 +968,7 @@ def __init__(self, message, details=None):
 
     def __str__(self):
         if len(self.details) > 0:
-            details = "; ".join([force_unicode(e) for e in self.details])
+            details = "; ".join([str(e) for e in self.details])
             return "%s: %s" % (self.message, details)
         return self.message
 
@@ -988,7 +993,7 @@ def __str__(self):
         return _(
             '%(path)s %(algorithm)s validation failed: expected="%(expected)s" found="%(found)s"'
         ) % {
-            "path": force_unicode(self.path),
+            "path": str(self.path),
             "algorithm": self.algorithm,
             "expected": self.expected,
             "found": self.found,
@@ -997,9 +1002,9 @@ def __str__(self):
 
 class FileMissing(ManifestErrorDetail):
     def __str__(self):
-        return _(
-            "%s exists in manifest but was not found on filesystem"
-        ) % force_unicode(self.path)
+        return _("%s exists in manifest but was not found on filesystem") % str(
+            self.path
+        )
 
 
 class UnexpectedFile(ManifestErrorDetail):
@@ -1138,7 +1143,7 @@ def _calc_hashes(args):
     try:
         f_hashes = _calculate_file_hashes(full_path, f_hashers)
     except BagValidationError as e:
-        f_hashes = dict((alg, force_unicode(e)) for alg in f_hashers.keys())
+        f_hashes = dict((alg, str(e)) for alg in f_hashers.keys())
 
     return rel_path, f_hashes, hashes
 
@@ -1161,7 +1166,7 @@ def _calculate_file_hashes(full_path, f_hashers):
     except (OSError, IOError) as e:
         raise BagValidationError(
             _("Could not read %(filename)s: %(error)s")
-            % {"filename": full_path, "error": force_unicode(e)}
+            % {"filename": full_path, "error": str(e)}
         )
 
     return dict((alg, h.hexdigest()) for alg, h in f_hashers.items())
@@ -1187,11 +1192,11 @@ def _load_tag_file(tag_file_name, encoding="utf-8-sig"):
 
 def _parse_tags(tag_file):
     """Parses a tag file, according to RFC 2822.  This
-       includes line folding, permitting extra-long
-       field values.
+    includes line folding, permitting extra-long
+    field values.
 
-       See http://www.faqs.org/rfcs/rfc2822.html for
-       more information.
+    See http://www.faqs.org/rfcs/rfc2822.html for
+    more information.
     """
 
     tag_name = None
@@ -1237,7 +1242,7 @@ def _make_tag_file(bag_info_path, bag_info):
                 values = [values]
             for txt in values:
                 # strip CR, LF and CRLF so they don't mess up the tag file
-                txt = re.sub(r"\n|\r|(\r\n)", "", force_unicode(txt))
+                txt = re.sub(r"\n|\r|(\r\n)", "", str(txt))
                 f.write("%s: %s\n" % (h, txt))
 
 
@@ -1433,19 +1438,6 @@ def _decode_filename(s):
     return s
 
 
-def force_unicode_py2(s):
-    """Reliably return a Unicode string given a possible unicode or byte string"""
-    if isinstance(s, str):
-        return s.decode("utf-8")
-    else:
-        return unicode(s)
-
-
-if sys.version_info > (3, 0):
-    force_unicode = str
-else:
-    force_unicode = force_unicode_py2
-
 # following code is used for command line program
 
 
@@ -1531,7 +1523,10 @@ def _make_parser():
     metadata_args = parser.add_argument_group(_("Optional Bag Metadata"))
     for header in STANDARD_BAG_INFO_HEADERS:
         metadata_args.add_argument(
-            "--%s" % header.lower(), type=str, action=BagHeaderAction, default=argparse.SUPPRESS
+            "--%s" % header.lower(),
+            type=str,
+            action=BagHeaderAction,
+            default=argparse.SUPPRESS,
         )
 
     parser.add_argument(
@@ -1574,7 +1569,9 @@ def main():
         parser.error(_("--fast is only allowed as an option for --validate!"))
 
     if args.completeness_only and not args.validate:
-        parser.error(_("--completeness-only is only allowed as an option for --validate!"))
+        parser.error(
+            _("--completeness-only is only allowed as an option for --validate!")
+        )
 
     _configure_logging(args)
 
@@ -1593,7 +1590,9 @@ def main():
                 if args.fast:
                     LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir)
                 elif args.completeness_only:
-                    LOGGER.info(_("%s is complete and valid according to Payload-Oxum"), bag_dir)
+                    LOGGER.info(
+                        _("%s is complete and valid according to Payload-Oxum"), bag_dir
+                    )
                 else:
                     LOGGER.info(_("%s is valid"), bag_dir)
             except BagError as e:
diff --git a/bench.py b/bench.py
index 37d14f5..06b4796 100755
--- a/bench.py
+++ b/bench.py
@@ -2,7 +2,7 @@
 
 """
 This is a little benchmarking script to exercise bagit.make_bag and
-bagit.validate using 1-8 parallel processes. It will download some images 
+bagit.validate using 1-8 parallel processes. It will download some images
 from NASA for use in bagging the first time it is run.
 """
 
diff --git a/pyproject.toml b/pyproject.toml
index 231d419..49db106 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,4 +34,3 @@ known_first_party = "bagit"
 [tool.coverage.run]
 branch = true
 include = "bagit.py"
-
diff --git a/test.py b/test.py
index 0f32754..16652fb 100644
--- a/test.py
+++ b/test.py
@@ -444,7 +444,7 @@ def test_sha1_tagfile(self):
         bag = bagit.make_bag(self.tmpdir, checksum=["sha1"], bag_info=info)
         self.assertTrue(os.path.isfile(j(self.tmpdir, "tagmanifest-sha1.txt")))
         self.assertEqual(
-            "f69110479d0d395f7c321b3860c2bc0c96ae9fe8",
+            "3f7423acbb8395ff11dfeb16b4172e7ccc2c529e",
             bag.entries["bag-info.txt"]["sha1"],
         )