Skip to content

Commit

Permalink
DD-450: copy some metadata files to data/migration (#42)
Browse files Browse the repository at this point in the history
* copy migration files and update payload manifest
* remove dans-bagit-lib dependency
  • Loading branch information
jo-pol authored May 3, 2021
1 parent d746f64 commit 209f598
Show file tree
Hide file tree
Showing 19 changed files with 111 additions and 53 deletions.
5 changes: 3 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ SYNOPSIS
DESCRIPTION
-----------

Add deposit.properties to directories(s) with a bag.
Add `deposit.properties` to one or more directories that each contain a bag.
These [properties](depositproperties.md) are used in the processing of the deposit.
The bag in each directory should be a bag created with the `get`
subcommand of [easy-bag-store](https://dans-knaw.github.io/easy-bag-store/).
subcommand of [easy-bag-store](https://dans-knaw.github.io/easy-bag-store/)
or created with [easy-fedora-to-bag](https://dans-knaw.github.io/easy-fedora-to-bag/).

The state of a bag is undefined when it has a `deposit.properties` but is not moved to `<output-dir>`:
the `metadata.xml`, `bag-info.txt` and manifests may or may not have been changed.
Expand Down
6 changes: 3 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

<name>EASY Vault convert bag to deposit</name>
<url>https://github.com/DANS-KNAW/easy-convert-bag-to-deposit</url>
<description>Add deposit.properties to directories(s) with a bag</description>
<description>Add deposit.properties to directorie(s) with a bag</description>
<inceptionYear>2020</inceptionYear>

<properties>
Expand All @@ -45,8 +45,8 @@

<dependencies>
<dependency>
<groupId>nl.knaw.dans.lib</groupId>
<artifactId>dans-bag-lib_2.12</artifactId>
<groupId>nl.knaw.dans</groupId>
<artifactId>bagit</artifactId>
</dependency>
<dependency>
<groupId>com.yourmediashelf.fedora.client</groupId>
Expand Down
88 changes: 68 additions & 20 deletions src/main/scala/nl.knaw.dans.easy.bag2deposit/BagFacade.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,28 @@
*/
package nl.knaw.dans.easy.bag2deposit

import java.nio.file.attribute.BasicFileAttributes
import java.nio.file.{ FileVisitResult, Files, Path }

import better.files.File
import gov.loc.repository.bagit.creator.CreateTagManifestsVistor
import gov.loc.repository.bagit.domain.{ Bag, Metadata }
import gov.loc.repository.bagit.creator.{ CreatePayloadManifestsVistor, CreateTagManifestsVistor }
import gov.loc.repository.bagit.domain
import gov.loc.repository.bagit.domain.Bag
import gov.loc.repository.bagit.hash.Hasher
import gov.loc.repository.bagit.reader.BagReader
import gov.loc.repository.bagit.writer.{ ManifestWriter, MetadataWriter }

import java.nio.file.attribute.BasicFileAttributes
import java.nio.file.{ FileVisitResult, Files, Path }
import java.security.MessageDigest
import java.util
import scala.collection.JavaConverters._
import scala.util.{ Failure, Try }

object BagFacade {

// TODO add to dans-bag lib
// variant of https://github.com/DANS-KNAW/easy-ingest-flow/blob/78ea3bec23923adf10c1c0650b019ea51c251ce6/src/main/scala/nl.knaw.dans.easy.ingestflow/BagitFacadeComponent.scala#L133
// These constants duplicate the keys of nl.knaw.dans.bag.v0.DansV0Bag
// (presumably so this module no longer needs that library -- confirm).
// Key in bag-info.txt that refers to the bag this one is a version of.
val IS_VERSION_OF_KEY = "Is-Version-Of"
// Key in bag-info.txt that holds the EASY user account.
val EASY_USER_ACCOUNT_KEY = "EASY-User-Account"

// TODO variant of https://github.com/DANS-KNAW/easy-ingest-flow/blob/78ea3bec23923adf10c1c0650b019ea51c251ce6/src/main/scala/nl.knaw.dans.easy.ingestflow/BagitFacadeComponent.scala#L133

private val bagReader = new BagReader()

Expand All @@ -45,23 +50,66 @@ object BagFacade {
MetadataWriter.writeBagMetadata(bag.getMetadata, bag.getVersion, bag.getRootDir, bag.getFileEncoding)
}

def updateManifest(bag: Bag): Try[Unit] = Try {
def isTagManifest(path: Path): Boolean = {
bag.getRootDir.relativize(path).getNameCount == 1 && path.getFileName.toString.startsWith("tagmanifest-")
private val includeHiddenFiles = true

/**
 * (Re)calculates the checksums of new/changed payload files for every algorithm
 * that has a payload manifest in the bag. Only the manifests held by `bag` are
 * updated; nothing is written by this method.
 *
 * @param bag            changed bag
 * @param payloadEntries directory or file relative to the root of the bag,
 *                       must be (located in) the `data` directory
 * @return a Failure with an IllegalArgumentException when `payloadEntries` is not
 *         located in `data` or when the bag has no payload manifests,
 *         a Failure with any other non-fatal exception thrown while hashing
 */
def updatePayloadManifests(bag: Bag, payloadEntries: Path): Try[Unit] = Try {
  // Compare path elements rather than strings: the string form of a path depends on
  // the separator of the platform. Normalizing first rejects entries such as
  // data/../metadata that would escape the payload directory.
  val normalizedEntries = payloadEntries.normalize()
  if (!normalizedEntries.startsWith("data")) {
    throw new IllegalArgumentException(s"path must start with data, found $payloadEntries")
  }
  if (bag.getPayLoadManifests.isEmpty) {
    throw new IllegalArgumentException(s"No payload manifests found (as DansV0Bag would have created) ${ bag.getRootDir }")
  }
  val payloadManifests = bag.getPayLoadManifests
  val algorithms = payloadManifests.asScala.map(_.getAlgorithm).asJava
  // the keys of this map are fresh manifests that get filled by the visitor
  val map = Hasher.createManifestToMessageDigestMap(algorithms)
  val visitor = new CreatePayloadManifestsVistor(map, includeHiddenFiles)
  Files.walkFileTree(bag.getRootDir.resolve(normalizedEntries), visitor)
  // add/overwrite the new checksums, entries of untouched payload files are kept
  mergeManifests(payloadManifests, map)
}

val algorithms = bag.getTagManifests.asScala.map(_.getAlgorithm).asJava
val tagFilesMap = Hasher.createManifestToMessageDigestMap(algorithms)
val tagVisitor = new CreateTagManifestsVistor(tagFilesMap, true) {
/**
 * Recalculates the checksums for changed metadata files (and payload manifests)
 * for all algorithms that have a tag manifest in the bag. Only the manifests held
 * by `bag` are updated; nothing is written by this method.
 *
 * NOTE(review): the visitor presumably hashes the files as found on disk, so payload
 * manifests changed in memory should be written before calling this -- confirm.
 *
 * @param bag     changed bag
 * @param changed paths of changed tag files, relative to the root of the bag
 * @return a Failure with any non-fatal exception thrown while hashing
 */
def updateTagManifests(bag: Bag, changed: Seq[Path]): Try[Unit] = Try {
  val bagRoot = bag.getRootDir
  val tagManifests = bag.getTagManifests
  val algorithms = tagManifests.asScala.map(_.getAlgorithm).asJava
  // the keys of this map are fresh manifests that get filled by the visitor
  val map = Hasher.createManifestToMessageDigestMap(algorithms)
  val changedPaths = changed.toSet
  val visitor = new CreateTagManifestsVistor(map, includeHiddenFiles) {
    override def visitFile(path: Path, attrs: BasicFileAttributes): FileVisitResult = {
      val relativePath = bagRoot.relativize(path)
      // payload manifests live in the root of the bag, don't match nested files
      val isPayloadManifest = relativePath.getNameCount == 1 &&
        relativePath.toString.startsWith("manifest-")
      if (isPayloadManifest || changedPaths.contains(relativePath)) super.visitFile(path, attrs)
      else FileVisitResult.CONTINUE // keep the existing checksum of untouched files
    }
  }
  Files.walkFileTree(bagRoot, visitor)
  // add/overwrite the new checksums, entries of skipped files are kept
  mergeManifests(tagManifests, map)
}

/**
 * Copies the checksums of the manifests that are the keys of `manifetsToDigest`
 * into the manifest with the same algorithm in `manifests`.
 * Existing entries for the same file are overwritten, other entries are kept.
 */
private def mergeManifests(manifests: util.Set[domain.Manifest], manifetsToDigest: util.Map[domain.Manifest, MessageDigest]): Unit = {
  val checksumsByAlgorithm = manifetsToDigest.keySet().asScala
    .map(calculated => calculated.getAlgorithm -> calculated.getFileToChecksumMap)
    .toMap
  manifests.asScala.foreach { existing =>
    existing.getFileToChecksumMap.putAll(checksumsByAlgorithm(existing.getAlgorithm))
  }
}

/**
 * (Re)writes the manifest files of the bag for all present algorithms:
 * first the payload manifests, then the tag manifests.
 *
 * @param bag the bag with the manifests to write into its root directory
 * @return a Failure with any non-fatal exception thrown while writing
 */
def writeManifests(bag: Bag): Try[Unit] = Try {
  val (root, charset) = (bag.getRootDir, bag.getFileEncoding)
  ManifestWriter.writePayloadManifests(bag.getPayLoadManifests, root, root, charset)
  ManifestWriter.writeTagManifests(bag.getTagManifests, root, root, charset)
}
}
8 changes: 3 additions & 5 deletions src/main/scala/nl.knaw.dans.easy.bag2deposit/BagInfo.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,12 @@
*/
package nl.knaw.dans.easy.bag2deposit

import java.util.UUID

import better.files.File
import gov.loc.repository.bagit.domain.Metadata
import nl.knaw.dans.bag.v0.DansV0Bag
import nl.knaw.dans.lib.error._
import org.apache.commons.configuration.ConfigurationException

import java.util.UUID
import scala.collection.JavaConverters._
import scala.util.{ Failure, Try }

Expand All @@ -41,15 +39,15 @@ object BagInfo {

def getMandatory(key: String) = getMaybe(key).getOrElse(throw notFound(key))

val maybeVersionOf = getMaybe(DansV0Bag.IS_VERSION_OF_KEY).map(uuidFromVersionOf)
val maybeVersionOf = getMaybe(BagFacade.IS_VERSION_OF_KEY).map(uuidFromVersionOf)
val basePids = (getMaybe(baseUrnKey), getMaybe(baseDoiKey)) match {
case (None, None) => None
case (Some(urn), Some(doi)) => Some(BasePids(urn, doi))
case _ => throw new Exception("")
}

new BagInfo(
userId = getMandatory(DansV0Bag.EASY_USER_ACCOUNT_KEY),
userId = getMandatory(BagFacade.EASY_USER_ACCOUNT_KEY),
created = getMandatory("Bagging-Date"),
uuid = uuidFromFile(bagDir.parent),
bagName = bagDir.name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@
*/
package nl.knaw.dans.easy.bag2deposit

import java.nio.file.Path

import better.files.File
import nl.knaw.dans.easy.bag2deposit.BagSource.BagSource
import nl.knaw.dans.easy.bag2deposit.IdType.IdType
import org.rogach.scallop.{ ScallopConf, ScallopOption, ValueConverter, singleArgConverter }

import java.nio.file.Path

class CommandLineOptions(args: Array[String], configuration: Configuration) extends ScallopConf(args) {
appendDefaultToDescription = true
editBuilder(_.setHelpWidth(110))
printedName = "easy-convert-bag-to-deposit"
version(configuration.version)
val description: String = s"""Add deposit.properties to directories(s) with a bag"""
val description: String = s"""Add deposit.properties to directorie(s) with a bag"""
val synopsis: String =
s"""
| $printedName { --dir | --uuid } <directory> -t { URN | DOI } -s { FEDORA | VAULT } [ -o <output-dir> ]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ package nl.knaw.dans.easy.bag2deposit

import better.files.File
import better.files.File.CopyOptions
import nl.knaw.dans.bag.v0.DansV0Bag
import nl.knaw.dans.easy.bag2deposit.Command.FeedBackMessage
import nl.knaw.dans.easy.bag2deposit.ddm.Provenance
import nl.knaw.dans.lib.logging.DebugEnhancedLogging

import java.io.{ FileNotFoundException, IOException }
import java.nio.file.Paths
import scala.collection.mutable.ListBuffer
import scala.util.{ Failure, Success, Try }
import scala.xml.{ Elem, NodeSeq }
Expand Down Expand Up @@ -69,11 +69,14 @@ class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnha
app = getClass.getSimpleName,
version = configuration.version
)

private def addProps(depositPropertiesFactory: DepositPropertiesFactory, maybeOutputDir: Option[File])
(bagParentDir: File): Try[Boolean] = {
logger.debug(s"creating application.properties for $bagParentDir")
val migrationFiles = Seq("provenance.xml", "emd.xml", "dataset.xml", "files.xml")
val changedMetadata = Seq("bag-info.xml", "metadata/amd.xml", "metadata/dataset.xml", "metadata/provenance.xml").map(Paths.get(_))
val bagInfoKeysToRemove = Seq(
DansV0Bag.EASY_USER_ACCOUNT_KEY,
BagFacade.EASY_USER_ACCOUNT_KEY,
BagInfo.baseUrnKey,
BagInfo.baseDoiKey,
)
Expand All @@ -82,8 +85,10 @@ class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnha
bag <- BagFacade.getBag(bagDir)
mutableBagMetadata = bag.getMetadata
bagInfo <- BagInfo(bagDir, mutableBagMetadata)
_ = bagInfoKeysToRemove.foreach(mutableBagMetadata.remove)
_ = logger.info(s"$bagInfo")
ddmFile = bagDir / "metadata" / "dataset.xml"
metadata = bagDir / "metadata"
ddmFile = metadata / "dataset.xml"
ddmIn <- loadXml(ddmFile)
props <- depositPropertiesFactory.create(bagInfo, ddmIn)
datasetId = props.getString("identifier.fedora", "")
Expand All @@ -92,11 +97,12 @@ class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnha
_ = registerMatchedReports(datasetId, ddmOut \\ "reportNumber")
_ = props.save((bagParentDir / "deposit.properties").toJava)
_ = ddmFile.writeText(ddmOut.serialize)
_ = bagInfoKeysToRemove.foreach(mutableBagMetadata.remove)
_ = trace("updating metadata")
migrationDir = (bagDir / "data" / "easy-migration").createDirectories()
_ = migrationFiles.foreach(name => (metadata / name).copyTo(migrationDir / name))
_ <- BagFacade.updateMetadata(bag)
_ = trace("updating manifest")
_ <- BagFacade.updateManifest(bag)
_ <- BagFacade.updatePayloadManifests(bag, Paths.get("data/easy-migration"))
_ <- BagFacade.updateTagManifests(bag, changedMetadata)
_ <- BagFacade.writeManifests(bag)
_ = maybeOutputDir.foreach(move(bagParentDir))
_ = logger.info(s"OK $datasetId ${ bagParentDir.name }/${ bagDir.name }")
} yield true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
d41d8cd98f00b204e9800998ecf8427e data/leeg.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
da39a3ee5e6b4b0d3255bfef95601890afd80709 data/leeg.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
93e40077e897bf876b34584d55c3d0c3418335a0 bag-info.txt
9ab631399d291d404a5410f05ea788c0841bf62e bag-info.txt
e2924b081506bac23f5fffe650ad1848a1c8ac1d bagit.txt
22765423165e02fb051662346de38f41486c01d1 manifest-sha1.txt
083add4cd452b0e6707a7943d76481d0e730f930 metadata/dataset.xml
7ce5c4a27f60ad31ea34a853184536a89899b916 metadata/files.xml
aa06505173e0140717f192f83bc5126110686483 manifest-sha1.txt
d371cfc7fbac8faa2b44a1a7a7783c33dcc581a9 manifest-md5.txt
f7b43884d90e11f27479a99bfddcb83a1f963d53 metadata/dataset.xml
da39a3ee5e6b4b0d3255bfef95601890afd80709 metadata/files.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
d41d8cd98f00b204e9800998ecf8427e data/leeg.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
da39a3ee5e6b4b0d3255bfef95601890afd80709 data/leeg.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
93e40077e897bf876b34584d55c3d0c3418335a0 bag-info.txt
5a67d824e1fa5b861aede867e72ce7b9fc800961 bag-info.txt
e2924b081506bac23f5fffe650ad1848a1c8ac1d bagit.txt
22765423165e02fb051662346de38f41486c01d1 manifest-sha1.txt
083add4cd452b0e6707a7943d76481d0e730f930 metadata/dataset.xml
7ce5c4a27f60ad31ea34a853184536a89899b916 metadata/files.xml
aa06505173e0140717f192f83bc5126110686483 manifest-sha1.txt
d371cfc7fbac8faa2b44a1a7a7783c33dcc581a9 manifest-md5.txt
818325b15b4893cacab4378d8751da6c7d8384c1 metadata/dataset.xml
da39a3ee5e6b4b0d3255bfef95601890afd80709 metadata/files.xml
9 changes: 4 additions & 5 deletions src/test/scala/nl.knaw.dans.easy.bag2deposit/AppSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
package nl.knaw.dans.easy.bag2deposit

import better.files.File
import nl.knaw.dans.bag.v0.DansV0Bag.EASY_USER_ACCOUNT_KEY
import nl.knaw.dans.easy.bag2deposit.BagSource._
import nl.knaw.dans.easy.bag2deposit.Fixture.{ AppConfigSupport, FileSystemSupport }
import nl.knaw.dans.easy.bag2deposit.IdType._
Expand All @@ -30,7 +29,7 @@ class AppSpec extends AnyFlatSpec with Matchers with AppConfigSupport with FileS
private val resourceBags: File = File("src/test/resources/bags/01")
private val validUUID = "04e638eb-3af1-44fb-985d-36af12fccb2d"

"addPropsToBags" should "move valid exports" in {
"addPropsToBags" should "move valid exports" in {
val delegate = mock[MockBagIndex]
val noBaseBagUUID = "87151a3a-12ed-426a-94f2-97313c7ae1f2"
(delegate.execute(_: String)) expects s"bag-sequence?contains=$validUUID" returning
Expand Down Expand Up @@ -68,7 +67,7 @@ class AppSpec extends AnyFlatSpec with Matchers with AppConfigSupport with FileS
movedDirs.foreach(dir => dir.isSameContentAs(resourceBags / dir.name) shouldBe false)

// total number of deposits should not change
movedDirs.size + leftDirs.size shouldBe resourceBags.children.toList.size
movedDirs.size + leftDirs.size shouldBe resourceBags.children.toList.size

movedDirs.size shouldBe 2 // base-bag-not-found is moved together with the valid bag-revision-1
// TODO should addPropsToBags check existence of base-bag in case of versioned bags?
Expand All @@ -91,8 +90,8 @@ class AppSpec extends AnyFlatSpec with Matchers with AppConfigSupport with FileS
movedBag / ".." / "deposit.properties" should exist

// other content changes verified in DepositPropertiesFactorySpec
(validBag / "bag-info.txt").contentAsString should include(EASY_USER_ACCOUNT_KEY)
(movedBag / "bag-info.txt").contentAsString shouldNot include(EASY_USER_ACCOUNT_KEY)
(validBag / "bag-info.txt").contentAsString should include(BagFacade.EASY_USER_ACCOUNT_KEY)
(movedBag / "bag-info.txt").contentAsString shouldNot include(BagFacade.EASY_USER_ACCOUNT_KEY)

// content of provenance verified in ddm.ProvenanceSpec
(validBag / "tagmanifest-sha1.txt").contentAsString shouldNot include("metadata/provenance.xml")
Expand Down

0 comments on commit 209f598

Please sign in to comment.