Skip to content

Commit

Permalink
Merge pull request #118 from LibraryOfCongress/version1
Browse files Browse the repository at this point in the history
updates due to version 1.0 of bagit spec being released
  • Loading branch information
johnscancella authored Jun 19, 2018
2 parents 61c50c8 + 97a6770 commit af00922
Show file tree
Hide file tree
Showing 21 changed files with 190 additions and 45 deletions.
16 changes: 12 additions & 4 deletions src/main/java/gov/loc/repository/bagit/conformance/BagLinter.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import gov.loc.repository.bagit.domain.Version;
import gov.loc.repository.bagit.exceptions.InvalidBagMetadataException;
import gov.loc.repository.bagit.exceptions.InvalidBagitFileFormatException;
import gov.loc.repository.bagit.exceptions.MaliciousPathException;
import gov.loc.repository.bagit.exceptions.UnparsableVersionException;
import gov.loc.repository.bagit.exceptions.UnsupportedAlgorithmException;
import gov.loc.repository.bagit.exceptions.conformance.BagitVersionIsNotAcceptableException;
import gov.loc.repository.bagit.exceptions.conformance.FetchFileNotAllowedException;
import gov.loc.repository.bagit.exceptions.conformance.MetatdataValueIsNotAcceptableException;
Expand All @@ -39,6 +41,8 @@

/**
* Responsible for checking a bag and providing insight into how it may cause problems.
* This class is only to be used on VALID bags, using it on un-validated bags may result in
* exceptions being thrown (like {@link java.io.IOException} )
*/
public final class BagLinter {
private static final Logger logger = LoggerFactory.getLogger(BagLinter.class);
Expand All @@ -57,7 +61,7 @@ private BagLinter(){
* @param jsonProfile the input stream to the json string describing the profile
* @param bag the bag to check against the profile
*
* @throws IOException if there is a problem reading the profile
* @throws IOException if there is a problem reading the profile or some of the bag files
* @throws JsonMappingException if there is a problem mapping the profile to the {@link BagitProfile}
* @throws JsonParseException if there is a problem parsing the json while mapping to java object
*
Expand Down Expand Up @@ -88,8 +92,10 @@ public static void checkAgainstProfile(final InputStream jsonProfile, final Bag
* @throws InvalidBagMetadataException if the bag metadata does not conform to the bagit specification
* @throws UnparsableVersionException if there is an error reading the bagit version
* @throws IOException if there was an error reading a file
* @throws UnsupportedAlgorithmException if there is an error while reading one of the manifests due to the algorithm being unsupported
* @throws MaliciousPathException if the path is crafted to be malicious (overwrite non bag files)
*/
public static Set<BagitWarning> lintBag(final Path rootDir) throws IOException, UnparsableVersionException, InvalidBagMetadataException, InvalidBagitFileFormatException{
public static Set<BagitWarning> lintBag(final Path rootDir) throws IOException, UnparsableVersionException, InvalidBagMetadataException, InvalidBagitFileFormatException, MaliciousPathException, UnsupportedAlgorithmException{
//convenience overload: delegates to lintBag(Path, Collection) with an empty warningsToIgnore collection
return lintBag(rootDir, Collections.emptyList());
}

Expand All @@ -107,8 +113,10 @@ public static Set<BagitWarning> lintBag(final Path rootDir) throws IOException,
* @throws InvalidBagMetadataException if the bag metadata does not conform to the bagit specification
* @throws UnparsableVersionException if there is an error reading the bagit version
* @throws IOException if there was an error reading a file
* @throws UnsupportedAlgorithmException if there is an error while reading one of the manifests due to the algorithm being unsupported
* @throws MaliciousPathException if the path is crafted to be malicious (overwrite non bag files)
*/
public static Set<BagitWarning> lintBag(final Path rootDir, final Collection<BagitWarning> warningsToIgnore) throws IOException, UnparsableVersionException, InvalidBagMetadataException, InvalidBagitFileFormatException{
public static Set<BagitWarning> lintBag(final Path rootDir, final Collection<BagitWarning> warningsToIgnore) throws IOException, UnparsableVersionException, InvalidBagMetadataException, InvalidBagitFileFormatException, MaliciousPathException, UnsupportedAlgorithmException{
final Set<BagitWarning> warnings = new HashSet<>();

//@Incubating
Expand All @@ -128,7 +136,7 @@ public static Set<BagitWarning> lintBag(final Path rootDir, final Collection<Bag
VersionChecker.checkVersion(bagitInfo.getKey(), warnings, warningsToIgnore);

logger.info(messages.getString("checking_manifest_problems"));
ManifestChecker.checkManifests(bagitDir, bagitInfo.getValue(), warnings, warningsToIgnore);
ManifestChecker.checkManifests(bagitInfo.getKey(), bagitDir, bagitInfo.getValue(), warnings, warningsToIgnore);

logger.info(messages.getString("checking_metadata_problems"));
MetadataChecker.checkBagMetadata(bagitDir, bagitInfo.getValue(), warnings, warningsToIgnore);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ public enum BagitWarning {
OS_SPECIFIC_FILES("os_specific_files"),
PAYLOAD_OXUM_MISSING("payload_oxum_missing"),
TAG_FILES_ENCODING("tag_files_encoding"),
WEAK_CHECKSUM_ALGORITHM("weak_checksum_algorithm");
WEAK_CHECKSUM_ALGORITHM("weak_checksum_algorithm"),
MANIFEST_SETS_DIFFER("manifest_file_sets_differ_between_algorithms");

private final String messageBundleKey;
private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");
Expand Down
147 changes: 120 additions & 27 deletions src/main/java/gov/loc/repository/bagit/conformance/ManifestChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,32 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.ResourceBundle;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.helpers.MessageFormatter;

import gov.loc.repository.bagit.domain.Manifest;
import gov.loc.repository.bagit.domain.Version;
import gov.loc.repository.bagit.exceptions.InvalidBagitFileFormatException;
import gov.loc.repository.bagit.exceptions.MaliciousPathException;
import gov.loc.repository.bagit.exceptions.UnsupportedAlgorithmException;
import gov.loc.repository.bagit.hash.StandardBagitAlgorithmNameToSupportedAlgorithmMapping;
import gov.loc.repository.bagit.reader.ManifestReader;
import gov.loc.repository.bagit.util.PathUtils;

/**
* Part of the BagIt conformance suite.
* This checker checks for various problems related to the manifests in a bag.
*/
@SuppressWarnings({"PMD.UseLocaleWithCaseConversions"})
//TODO refactor to remove PMD warnings!
@SuppressWarnings({"PMD.UseLocaleWithCaseConversions", "PMD.TooManyMethods", "PMD.GodClass"})
public final class ManifestChecker {
private static final Logger logger = LoggerFactory.getLogger(ManifestChecker.class);
private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");
Expand All @@ -34,46 +43,79 @@ public final class ManifestChecker {
private static final String TRASHES_FILE = "\\.(_.)?[Tt][Rr][Aa][Ss][Hh][Ee][Ss]";
private static final String FS_EVENTS_FILE = "\\.[Ff][Ss][Ee][Vv][Ee][Nn][Tt][Ss][Dd]";
private static final String OS_FILES_REGEX = ".*data/(" + THUMBS_DB_FILE + "|" + DS_STORE_FILE + "|" + SPOTLIGHT_FILE + "|" + TRASHES_FILE + "|" + FS_EVENTS_FILE + ")";
private static final Version VERSION_1_0 = new Version(1,0);

private ManifestChecker(){
//intentionally left empty
}

/*
/**
* Check for all the manifest specific potential problems
*
* @param version the version of the bag we are checking
* @param bagitDir the directory where the manifests are stored
* @param encoding the encoding of the manifests
* @param warnings the set of warnings that will be appended to while checking
* @param warningsToIgnore the set of warnings to ignore
*
* @throws IOException if there is a problem reading a file (because it doesn't exist)
* @throws InvalidBagitFileFormatException if one (or more) of the files does not match the formatting as specified in the specification
* @throws MaliciousPathException if someone crafted the bag to specifically try and write outside the bag directory
* @throws UnsupportedAlgorithmException if a manifest uses an algorithm that the computer doesn't know how to use
*/
public static void checkManifests(final Path bagitDir, final Charset encoding, final Set<BagitWarning> warnings,
final Collection<BagitWarning> warningsToIgnore) throws IOException, InvalidBagitFileFormatException{
//@SuppressWarnings("PMD.CyclomaticComplexity")
public static void checkManifests(final Version version, final Path bagitDir, final Charset encoding, final Set<BagitWarning> warnings,
final Collection<BagitWarning> warningsToIgnore) throws IOException, InvalidBagitFileFormatException, MaliciousPathException, UnsupportedAlgorithmException{

boolean missingTagManifest = true;
final List<Path> payloadManifests = new ArrayList<>();
final List<Path> tagManifests = new ArrayList<>();
try(final DirectoryStream<Path> files = Files.newDirectoryStream(bagitDir)){
for(final Path file : files){
final String filename = PathUtils.getFilename(file);
if(filename.contains("manifest-")){
if(filename.startsWith("manifest-")){
checkData(file, encoding, warnings, warningsToIgnore, true);
}
else{
checkData(file, encoding, warnings, warningsToIgnore, false);
missingTagManifest = false;
}

final String algorithm = filename.split("[-\\.]")[1];
checkAlgorthm(algorithm, warnings, warningsToIgnore);
}
missingTagManifest = missingTagManifest && checkManifest(file, payloadManifests, tagManifests, encoding, warnings, warningsToIgnore);
}
}

if(!warnings.contains(BagitWarning.MANIFEST_SETS_DIFFER)){
checkManifestSets(version, tagManifests, payloadManifests, warnings, encoding);
}

if(!warningsToIgnore.contains(BagitWarning.MISSING_TAG_MANIFEST) && missingTagManifest){
logger.warn(messages.getString("bag_missing_tag_manifest_warning"), bagitDir);
warnings.add(BagitWarning.MISSING_TAG_MANIFEST);
}
}

/*
 * Inspect one file from the bagit directory. Files whose name does not contain "manifest-" are left alone.
 * A payload manifest (name starts with "manifest-") or tag manifest (any other name containing "manifest-")
 * is added to the matching list, its content is checked and its algorithm is checked.
 * 
 * Returns false only when the file is a tag manifest, true otherwise.
 */
private static boolean checkManifest(final Path file, final List<Path> payloadManifests, final List<Path> tagManifests, 
    final Charset encoding, final Set<BagitWarning> warnings, 
    final Collection<BagitWarning> warningsToIgnore) throws IOException, InvalidBagitFileFormatException{
  final String name = PathUtils.getFilename(file);
  if(!name.contains("manifest-")){
    //not a manifest at all, so it cannot be the tag manifest we are looking for
    return true;
  }
  
  final boolean isPayloadManifest = name.startsWith("manifest-");
  final List<Path> manifestsOfSameType = isPayloadManifest ? payloadManifests : tagManifests;
  manifestsOfSameType.add(file);
  checkManifestPayload(file, encoding, warnings, warningsToIgnore, isPayloadManifest);
  
  //the algorithm name is the token between the first '-' and the following '.' or '-'
  final String algorithm = name.split("[-\\.]")[1];
  checkAlgorthm(algorithm, warnings, warningsToIgnore);
  
  return isPayloadManifest;
}

/*
* Check for a "bag within a bag" and for relative paths in the manifests
* Check for a "bag within a bag", relative paths, and OS specific files in the manifests
*/
private static void checkData(final Path manifestFile, final Charset encoding, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore, final boolean isPayloadManifest) throws IOException, InvalidBagitFileFormatException{
private static void checkManifestPayload(final Path manifestFile, final Charset encoding, final Set<BagitWarning> warnings,
final Collection<BagitWarning> warningsToIgnore, final boolean isPayloadManifest)
throws IOException, InvalidBagitFileFormatException{

try(final BufferedReader reader = Files.newBufferedReader(manifestFile, encoding)){
final Set<String> paths = new HashSet<>();

Expand All @@ -82,28 +124,24 @@ private static void checkData(final Path manifestFile, final Charset encoding, f
String path = parsePath(line);

path = checkForManifestCreatedWithMD5SumTools(path, warnings, warningsToIgnore);

if(!warningsToIgnore.contains(BagitWarning.DIFFERENT_CASE) && paths.contains(path.toLowerCase())){
logger.warn(messages.getString("different_case_warning"), manifestFile, path);
warnings.add(BagitWarning.DIFFERENT_CASE);
}
paths.add(path.toLowerCase());

checkForDifferentCase(path, paths, manifestFile, warnings, warningsToIgnore);
if(encoding.name().startsWith("UTF")){
checkNormalization(path, manifestFile.getParent(), warnings, warningsToIgnore);
}

checkForBagWithinBag(line, warnings, warningsToIgnore, isPayloadManifest);

checkForRelativePaths(line, warnings, warningsToIgnore, manifestFile);

checkForOSSpecificFiles(line, warnings, warningsToIgnore, manifestFile);

line = reader.readLine();
}
}
}

/*
* Check to make sure it conforms to <hash> <path>
*/
static String parsePath(final String line) throws InvalidBagitFileFormatException{
final String[] parts = line.split("\\s+", 2);
if(parts.length < 2){
Expand All @@ -114,6 +152,9 @@ static String parsePath(final String line) throws InvalidBagitFileFormatExceptio
return parts[1];
}

/*
* We allow for MD5sum tools for compatibility but it is not recommended
*/
private static String checkForManifestCreatedWithMD5SumTools(final String path, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore){
String fixedPath = path;
final boolean startsWithStar = path.charAt(0) == '*';
Expand All @@ -130,6 +171,17 @@ private static String checkForManifestCreatedWithMD5SumTools(final String path,
return fixedPath;
}

/*
 * Warn when the lower-cased form of the path is already present in the set of paths.
 * This method only reads the set; it does not record the path in it.
 */
private static void checkForDifferentCase(final String path, final Set<String> paths, final Path manifestFile, 
    final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore){
  if(warningsToIgnore.contains(BagitWarning.DIFFERENT_CASE)){
    return;
  }
  
  final String lowerCasedPath = path.toLowerCase();
  if(paths.contains(lowerCasedPath)){
    logger.warn(messages.getString("different_case_warning"), manifestFile, path);
    warnings.add(BagitWarning.DIFFERENT_CASE);
  }
}

/*
* Check that the file specified has not changed its normalization (i.e. have the bytes changed but it still looks the same?)
*/
Expand Down Expand Up @@ -210,6 +262,47 @@ else if(!warningsToIgnore.contains(BagitWarning.NON_STANDARD_ALGORITHM) && !"SHA
warnings.add(BagitWarning.NON_STANDARD_ALGORITHM);
}
}

/*
 * Edge case, for version 1.0+ all manifests of the same type SHOULD list the same set of files.
 * Tag manifests are compared with each other first, then payload manifests; a type is only
 * compared when there is more than one manifest of it.
 */
static void checkManifestSets(final Version version, final List<Path> tagManifests, final List<Path> payloadManifests, 
    final Set<BagitWarning> warnings, final Charset encoding) 
        throws IOException, MaliciousPathException, UnsupportedAlgorithmException, InvalidBagitFileFormatException{
  final boolean severalTagManifests = tagManifests.size() > 1;
  final boolean severalPayloadManifests = payloadManifests.size() > 1;
  if(!severalTagManifests && !severalPayloadManifests){
    //nothing to compare against
    return;
  }
  
  if(!VERSION_1_0.isSameOrOlder(version)){
    //the recommendation only exists starting with version 1.0
    return;
  }
  
  if(severalTagManifests){
    checkManifestsListSameSetOfFiles(warnings, tagManifests, encoding);
  }
  
  if(severalPayloadManifests){
    checkManifestsListSameSetOfFiles(warnings, payloadManifests, encoding);
  }
}

/*
 * Starting with version 1.0 all manifests of one type (tag, payload) should list the same set of files.
 * The first manifest that can be read becomes the reference. Every later readable manifest whose set of
 * listed files differs from the reference is logged and MANIFEST_SETS_DIFFER is added to the warnings.
 */
@SuppressWarnings("PMD.EmptyCatchBlock")
static void checkManifestsListSameSetOfFiles(final Set<BagitWarning> warnings, final List<Path> manifestPaths, final Charset charset) throws IOException, MaliciousPathException, UnsupportedAlgorithmException, InvalidBagitFileFormatException{
  final StandardBagitAlgorithmNameToSupportedAlgorithmMapping algorithmMapping = new StandardBagitAlgorithmNameToSupportedAlgorithmMapping();
  
  Manifest referenceManifest = null;
  Path referencePath = null;
  for (final Path currentPath : manifestPaths) {
    try {
      final Manifest currentManifest = ManifestReader.readManifest(algorithmMapping, currentPath, currentPath.getParent(), charset);
      if(referenceManifest == null) {
        //first readable manifest: remember it, there is nothing to compare it with yet
        referencePath = currentPath;
        referenceManifest = currentManifest;
      }
      else {
        final Set<?> referenceFiles = referenceManifest.getFileToChecksumMap().keySet();
        final Set<?> currentFiles = currentManifest.getFileToChecksumMap().keySet();
        if(!referenceFiles.equals(currentFiles)) {
          logger.warn(messages.getString("manifest_fileset_differ"), referencePath, currentPath);
          warnings.add(BagitWarning.MANIFEST_SETS_DIFFER);
        }
      }
    }
    catch(UnsupportedAlgorithmException ignored) {
      //ignore an unsupported algorithm as it is caught in checkAlgorthm()
    }
  }
}

//for unit test only
static String getOsFilesRegex() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/
public interface VersionChecker {
Logger logger = LoggerFactory.getLogger(VersionChecker.class);
Version LATEST_BAGIT_VERSION = new Version(0, 97);
Version LATEST_BAGIT_VERSION = Version.LATEST_BAGIT_VERSION();
ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");

/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public final class BagCreator {
private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");
private static final String DATE_FORMAT = "yyyy-MM-dd";
private static final Version DOT_BAGIT_VERSION = new Version(2, 0);
private static final Version LATEST_NON_DOT_BAGIT_VERSION = new Version(0, 97);
private static final Version LATEST_NON_DOT_BAGIT_VERSION = Version.LATEST_BAGIT_VERSION();

private BagCreator(){}

Expand Down
4 changes: 4 additions & 0 deletions src/main/java/gov/loc/repository/bagit/domain/Version.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ public Version(final int major, final int minor){
this.minor = minor;
this.cachedToString = major + "." + minor;
}

/**
 * Creates the version object for bagit version 1.0.
 * A new instance is created on every call.
 * 
 * @return a {@link Version} with major 1 and minor 0
 */
public static Version LATEST_BAGIT_VERSION() {
return new Version(1, 0);
}

@Override
public String toString() {
Expand Down
2 changes: 2 additions & 0 deletions src/main/resources/MessageBundle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ os_specific_files=Files created by the operating system (OS) for its own use. Th
payload_oxum_missing=It is recommended to always include the Payload-Oxum in the bag metadata since it allows for a 'quick verification' of the bag.
tag_files_encoding=It is recommended to always use UTF-8.
weak_checksum_algorithm=The checksum algorithm used is known to be weak. We recommend using SHA-512.
manifest_file_sets_differ_between_algorithms=As of bagit version 1.0 it is recommended that all payload manifests contain the same set of files as other payload manifests. It is also recommended that all tag manifests contain the same set of files as other tag manifests.

#for BagLinter.java
checking_encoding_problems=Checking encoding problems.
Expand Down Expand Up @@ -83,6 +84,7 @@ leading_dot_slash_warning=In manifest [{}] line [{}] is a non-normalized path.
os_specific_files_warning=In manifest [{}] line [{}] contains a OS specific file.
weak_algorithm_warning=Detected a known weak algorithm [{}]. With the great advances in computer hardware there is little penalty to using more bits to calculate the checksum.
non_standard_algorithm_warning=Detected algorithm [{}] which is not included by default in Java. This will make it more difficult to read this bag on some systems. Consider changing it to SHA-512.
manifest_fileset_differ=Manifest [{}] does not contain the same set of files as manifest [{}], it is recommended that they be the same.

#for MetadataChecker.java
missing_payload_oxum_warning=The Payload-Oxum key was not found in the bag metadata. This will prevent a "quick verify".
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ public void testClassIsWellDefined() throws NoSuchMethodException, InvocationTar
public void testLintBag() throws Exception{
Set<BagitWarning> expectedWarnings = new HashSet<>();
expectedWarnings.addAll(Arrays.asList(BagitWarning.values()));
expectedWarnings.remove(BagitWarning.MANIFEST_SETS_DIFFER); //only applies to version 1.0 but need older version for other warnings, so we test this separately
Set<BagitWarning> warnings = BagLinter.lintBag(rootDir);

if(FileSystems.getDefault().getClass().getName() == "sun.nio.fs.MacOSXFileSystem"){
Expand Down
Loading

0 comments on commit af00922

Please sign in to comment.