From 23f0eb93dbc2cfa9cddd78904a0c5b9a1f63eede Mon Sep 17 00:00:00 2001 From: Jo Pol Date: Thu, 20 May 2021 12:30:12 +0200 Subject: [PATCH] DD-443: Associate datasets with the chosen depositor-account (#39) * replace user in amd.xml * provenance for replaced user * group DDM related actions * account-substitutes.csv optional * speed up --help --- .../assembly/dist/cfg/account-substitutes.csv | 2 + .../BagFacade.scala | 8 +- .../Command.scala | 34 ++++- .../CommandLineOptions.scala | 6 +- .../Configuration.scala | 28 +--- .../DepositPropertiesFactory.scala | 1 + .../EasyConvertBagToDepositApp.scala | 24 +++- .../UserTransformer.scala | 61 +++++++++ .../collections/Collections.scala | 5 +- .../collections/FedoraProvider.scala | 5 +- .../collections/Resolver.scala | 1 + .../ddm/DdmTransformer.scala | 3 +- .../ddm/Provenance.scala | 53 +++++--- .../package.scala | 4 +- .../bag-revision-1/metadata/amd.xml | 128 ++++++++++++++++++ .../metadata/depositor-info/agreements.xml | 14 ++ .../base-bag-not-found/metadata/amd.xml | 128 ++++++++++++++++++ .../metadata/depositor-info/agreements.xml | 14 ++ .../debug-config/account-substitutes.csv | 2 + .../AppSpec.scala | 3 + .../ConfigurationSpec.scala | 22 ++- .../Fixture/AppConfigSupport.scala | 5 +- .../Fixture/XmlSupport.scala | 30 ++++ .../ReadmeSpec.scala | 7 +- .../ddm/ProvenanceSpec.scala | 72 ++++++++-- .../ddm/RewriteSpec.scala | 38 ++---- 26 files changed, 588 insertions(+), 110 deletions(-) create mode 100644 src/main/assembly/dist/cfg/account-substitutes.csv create mode 100644 src/main/scala/nl.knaw.dans.easy.bag2deposit/UserTransformer.scala create mode 100644 src/test/resources/bags/01/04e638eb-3af1-44fb-985d-36af12fccb2d/bag-revision-1/metadata/amd.xml create mode 100644 src/test/resources/bags/01/04e638eb-3af1-44fb-985d-36af12fccb2d/bag-revision-1/metadata/depositor-info/agreements.xml create mode 100644 src/test/resources/bags/01/87151a3a-12ed-426a-94f2-97313c7ae1f2/base-bag-not-found/metadata/amd.xml create mode 100644 src/test/resources/bags/01/87151a3a-12ed-426a-94f2-97313c7ae1f2/base-bag-not-found/metadata/depositor-info/agreements.xml create mode 100644 src/test/resources/debug-config/account-substitutes.csv create mode 100644 src/test/scala/nl.knaw.dans.easy.bag2deposit/Fixture/XmlSupport.scala diff --git a/src/main/assembly/dist/cfg/account-substitutes.csv b/src/main/assembly/dist/cfg/account-substitutes.csv new file mode 100644 index 00000000..06f3698d --- /dev/null +++ b/src/main/assembly/dist/cfg/account-substitutes.csv @@ -0,0 +1,2 @@ +removed-account, chosen-account +user001,USer \ No newline at end of file diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/BagFacade.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/BagFacade.scala index 003dec2c..227c39eb 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/BagFacade.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/BagFacade.scala @@ -46,8 +46,11 @@ object BagFacade { case cause: Exception => Failure(InvalidBagException(s"$bagDir, $cause")) } - def updateMetadata(bag: Bag): Try[Unit] = Try { - MetadataWriter.writeBagMetadata(bag.getMetadata, bag.getVersion, bag.getRootDir, bag.getFileEncoding) + def updateMetadata(bag: Bag): Try[Unit] = { + trace(bag.getRootDir) + Try { + MetadataWriter.writeBagMetadata(bag.getMetadata, bag.getVersion, bag.getRootDir, bag.getFileEncoding) + } } private val includeHiddenFiles = true @@ -60,6 +63,7 @@ object BagFacade { * @return */ def updatePayloadManifests(bag: Bag, payloadEntries: Path): Try[Unit] = Try { + trace(bag.getRootDir) if (!payloadEntries.toString.startsWith("data/")) { throw new IllegalArgumentException(s"path must start with data, found $payloadEntries") } diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/Command.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/Command.scala index 4f9d0c2e..2218111f 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/Command.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/Command.scala @@ -16,21 +16,51 @@ package nl.knaw.dans.easy.bag2deposit import better.files.File +import better.files.File.root +import nl.knaw.dans.easy.bag2deposit.collections.Collections.getCollectionsMap +import nl.knaw.dans.easy.bag2deposit.collections.FedoraProvider +import nl.knaw.dans.easy.bag2deposit.ddm.DdmTransformer import nl.knaw.dans.lib.logging.DebugEnhancedLogging +import org.apache.commons.configuration.PropertiesConfiguration +import java.net.URI import scala.language.reflectiveCalls object Command extends App with DebugEnhancedLogging { type FeedBackMessage = String + private val home = File(System.getProperty("app.home")) + val cfgPath = Seq( + root / "etc" / "opt" / "dans.knaw.nl" / "easy-convert-bag-to-deposit", + home / "cfg") + .find(_.exists) + .getOrElse { throw new IllegalStateException("No configuration directory found") } + val properties = { + new PropertiesConfiguration() { + setDelimiterParsingDisabled(true) + load((cfgPath / "application.properties").toJava) + } + } + val version = (home / "bin" / "version").contentAsString.stripLineEnd + val agent = properties.getString("http.agent", s"easy-convert-bag-to-deposit/$version") + logger.info(s"setting http.agent to $agent") + System.setProperty("http.agent", agent) - val configuration = Configuration(File(System.getProperty("app.home"))) - val commandLine: CommandLineOptions = new CommandLineOptions(args, configuration) { + val commandLine: CommandLineOptions = new CommandLineOptions(args, version) { verify() } private val bagParentDirs = commandLine.bagParentDir.map(Iterator(_)) .getOrElse(commandLine.bagGrandParentDir.map(_.children) .getOrElse(Iterator.empty)) + + val configuration = Configuration( + version, + dansDoiPrefixes = properties.getStringArray("dans-doi.prefixes"), + dataverseIdAuthority = properties.getString("dataverse.id-authority"), + bagIndex = BagIndex(new URI(properties.getString("bag-index.url"))), + ddmTransformer = new DdmTransformer(cfgPath, getCollectionsMap(cfgPath, FedoraProvider(properties))), + userTransformer = new UserTransformer(cfgPath) + ) private val propertiesFactory = DepositPropertiesFactory( configuration, commandLine.idType(), diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/CommandLineOptions.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/CommandLineOptions.scala index 6bdee210..74faaca4 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/CommandLineOptions.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/CommandLineOptions.scala @@ -22,18 +22,18 @@ import org.rogach.scallop.{ ScallopConf, ScallopOption, ValueConverter, singleAr import java.nio.file.Path -class CommandLineOptions(args: Array[String], configuration: Configuration) extends ScallopConf(args) { +class CommandLineOptions(args: Array[String], version: String) extends ScallopConf(args) { appendDefaultToDescription = true editBuilder(_.setHelpWidth(110)) printedName = "easy-convert-bag-to-deposit" - version(configuration.version) + version(version) val description: String = s"""Add deposit.properties to directorie(s) with a bag""" val synopsis: String = s""" | $printedName { --dir | --uuid } -t { URN | DOI } -s { FEDORA | VAULT } [ -o ] |""".stripMargin - version(s"$printedName v${ configuration.version }") + version(s"$printedName v$version") banner( s""" | $description diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/Configuration.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/Configuration.scala index 3ef7db44..1c60419c 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/Configuration.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/Configuration.scala @@ -30,31 +30,5 @@ case class Configuration(version: String, dataverseIdAuthority: String, bagIndex: BagIndex, ddmTransformer: DdmTransformer, + userTransformer: UserTransformer, ) - -object Configuration extends DebugEnhancedLogging { - - def apply(home: File): Configuration = { - val cfgPath = Seq( - root / "etc" / "opt" / "dans.knaw.nl" / "easy-convert-bag-to-deposit", - home / "cfg") - .find(_.exists) - .getOrElse { throw new IllegalStateException("No configuration directory found") } - val properties = new PropertiesConfiguration() { - setDelimiterParsingDisabled(true) - load((cfgPath / "application.properties").toJava) - } - val version = (home / "bin" / "version").contentAsString.stripLineEnd - val agent = properties.getString("http.agent", s"easy-convert-bag-to-deposit/$version") - logger.info(s"setting http.agent to $agent") - System.setProperty("http.agent", agent) - - new Configuration( - version, - dansDoiPrefixes = properties.getStringArray("dans-doi.prefixes"), - dataverseIdAuthority = properties.getString("dataverse.id-authority"), - bagIndex = BagIndex(new URI(properties.getString("bag-index.url"))), - ddmTransformer = new DdmTransformer(cfgPath, getCollectionsMap(cfgPath, FedoraProvider(properties))), - ) - } -} diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/DepositPropertiesFactory.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/DepositPropertiesFactory.scala index b6c4e0f5..d1949b38 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/DepositPropertiesFactory.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/DepositPropertiesFactory.scala @@ -30,6 +30,7 @@ case class DepositPropertiesFactory(configuration: Configuration, idType: IdType private val dataverseIdAuthority = configuration.dataverseIdAuthority def create(bagInfo: BagInfo, ddm: Node): Try[PropertiesConfiguration] = Try { + trace(this.getClass) val ddmIds: NodeSeq = ddm \ "dcmiMetadata" \ "identifier" def formatOfPanId = (ddm \ "dcmiMetadata" \ "isFormatOf") diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/EasyConvertBagToDepositApp.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/EasyConvertBagToDepositApp.scala index 9e37c097..f504f659 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/EasyConvertBagToDepositApp.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/EasyConvertBagToDepositApp.scala @@ -19,13 +19,15 @@ import better.files.File import better.files.File.CopyOptions import nl.knaw.dans.easy.bag2deposit.Command.FeedBackMessage import nl.knaw.dans.easy.bag2deposit.ddm.Provenance +import nl.knaw.dans.easy.bag2deposit.ddm.Provenance.compare import nl.knaw.dans.lib.logging.DebugEnhancedLogging import java.io.{ FileNotFoundException, IOException } import java.nio.file.Paths +import java.nio.charset.Charset import scala.collection.mutable.ListBuffer import scala.util.{ Failure, Success, Try } -import scala.xml.{ Elem, NodeSeq } +import scala.xml.NodeSeq class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnhancedLogging { @@ -46,6 +48,7 @@ class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnha ).toMap def registerMatchedReports(urn: String, reports: NodeSeq): Unit = { + trace(urn) reports.foreach { node => val reportUuid = (node \@ "valueURI").replaceAll(".*/", "") Try(reportMatches(reportUuid) += s"\t$urn\t${ node.text }") @@ -69,6 +72,7 @@ class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnha app = getClass.getSimpleName, version = configuration.version ) + implicit val charset: Charset = Charset.forName("UTF-8") private def addProps(depositPropertiesFactory: DepositPropertiesFactory, maybeOutputDir: Option[File]) (bagParentDir: File): Try[Boolean] = { @@ -93,15 +97,26 @@ class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnha props <- depositPropertiesFactory.create(bagInfo, ddmIn) datasetId = props.getString("identifier.fedora", "") ddmOut <- configuration.ddmTransformer.transform(ddmIn, datasetId) - _ = provenance.xml(ddmIn, ddmOut).foreach(writeProvenance(bagDir)) _ = registerMatchedReports(datasetId, ddmOut \\ "reportNumber") _ = props.save((bagParentDir / "deposit.properties").toJava) _ = ddmFile.writeText(ddmOut.serialize) + oldDcmi = (ddmIn \ "dcmiMetadata").headOption.getOrElse() + newDcmi = (ddmOut \ "dcmiMetadata").headOption.getOrElse() + amdChanges <- configuration.userTransformer.transform(metadata / "amd.xml") + _ = provenance.collectChangesInXmls(Map( + "http://easy.dans.knaw.nl/easy/dataset-administrative-metadata/" -> amdChanges, + "http://easy.dans.knaw.nl/schemas/md/ddm/" -> compare(oldDcmi, newDcmi), + )).foreach(xml => (metadata / "provenance.xml").writeText(xml.serialize)) migrationDir = (bagDir / "data" / "easy-migration").createDirectories() _ = migrationFiles.foreach(name => (metadata / name).copyTo(migrationDir / name)) + _ = bagInfoKeysToRemove.foreach(mutableBagMetadata.remove) + _ = trace("updating metadata") _ <- BagFacade.updateMetadata(bag) + _ = trace("updating payload manifest") _ <- BagFacade.updatePayloadManifests(bag, Paths.get("data/easy-migration")) + _ = trace("updating tag manifest") _ <- BagFacade.updateTagManifests(bag, changedMetadata) + _ = trace("writing manifests") _ <- BagFacade.writeManifests(bag) _ = maybeOutputDir.foreach(move(bagParentDir)) _ = logger.info(s"OK $datasetId ${ bagParentDir.name }/${ bagDir.name }") @@ -118,11 +133,6 @@ class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnha Failure(e) } - private def writeProvenance(bagDir: File)(xml: Elem) = { - trace(bagDir) - (bagDir / "metadata" / "provenance.xml").writeText(xml.serialize) - } - private def move(bagParentDir: File)(outputDir: File) = { trace(bagParentDir, outputDir) val target = outputDir / bagParentDir.name diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/UserTransformer.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/UserTransformer.scala new file mode 100644 index 00000000..6d7b9c8c --- /dev/null +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/UserTransformer.scala @@ -0,0 +1,61 @@ +/** + * Copyright (C) 2020 DANS - Data Archiving and Networked Services (info@dans.knaw.nl) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package nl.knaw.dans.easy.bag2deposit + +import better.files.File +import nl.knaw.dans.easy.bag2deposit.ddm.Provenance +import org.apache.commons.csv.CSVFormat.RFC4180 + +import java.nio.charset.Charset +import scala.util.Try +import scala.xml.Node +import scala.xml.transform.{ RewriteRule, RuleTransformer } + +class UserTransformer(cfgDir: File) { + private val csvFile: File = cfgDir / "account-substitutes.csv" + private val userMap = if (!csvFile.exists || csvFile.isEmpty) + Map[String,String]() + else parseCsv( + csvFile, + nrOfHeaderLines = 1, + format = RFC4180.withHeader("old", "new"), + ).map(r => r.get("old") -> r.get("new")).toMap + + private val userRewriteRule: RewriteRule = new RewriteRule { + override def transform(node: Node): Seq[Node] = { + if (!Seq("depositorId", "signerId").contains(node.label)) node + else userMap + .get(node.text).map(id => { id }.copy(label = node.label)) + .getOrElse(node) + } + } + private val transformer = new RuleTransformer(userRewriteRule) + + // The default charset is determined during virtual-machine startup and typically + // depends upon the locale and charset of the underlying operating system. + implicit val charset: Charset = Charset.forName("UTF-8") + + def transform(file: File): Try[Seq[Node]] = { + for { + xmlIn <- loadXml(file) + xmlOut = transformer.transform(xmlIn).headOption + .getOrElse(throw new Exception("programming error: AmdTransformer returned multiple roots")) + _ = file.writeText(xmlOut.serialize) + diff = Provenance.compare(xmlIn, xmlOut) + _ = trace(diff.map(_.serialize)) + } yield diff + } +} diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/Collections.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/Collections.scala index 2884888b..a63e00bc 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/Collections.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/Collections.scala @@ -37,6 +37,7 @@ object Collections extends DebugEnhancedLogging { private val resolver: Resolver = Resolver() private def parseCsv(file: File, format: CSVFormat): Try[Iterable[CSVRecord]] = { + trace(file) managed(CSVParser.parse( file.toJava, Charset.forName("UTF-8"), @@ -63,7 +64,7 @@ object Collections extends DebugEnhancedLogging { } def getCollectionsMap(cfgPath: File, maybeFedoraProvider: Option[FedoraProvider]): Map[String, Elem] = { - trace() + trace(()) val result: Map[String, Elem] = maybeFedoraProvider .map { provider => memberDatasetIdToInCollection(collectionDatasetIdToInCollection(cfgPath), provider) @@ -92,8 +93,6 @@ object Collections extends DebugEnhancedLogging { ) } - logger.info(s"building collections from $cfgDir") - parseCsv(cfgDir / "ThemathischeCollecties.csv", collectionCsvFormat) .unsafeGetOrThrow .map(parseCollectionRecord) diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/FedoraProvider.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/FedoraProvider.scala index 609a1ac0..732f406d 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/FedoraProvider.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/FedoraProvider.scala @@ -63,8 +63,9 @@ case class FedoraProviderException(query: String, cause: Throwable) extends Exce object FedoraProvider extends DebugEnhancedLogging { def apply(properties: PropertiesConfiguration): Option[FedoraProvider] = { - trace() - Option(properties.getString("fcrepo.url")) + val repo = properties.getString("fcrepo.url") + trace(this.getClass, repo) + Option(repo) .toSeq.filter(_.trim.nonEmpty) .map(url => new FedoraProvider(new FedoraClient(new FedoraCredentials( diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/Resolver.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/Resolver.scala index 7465757a..f53014ef 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/Resolver.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/collections/Resolver.scala @@ -34,6 +34,7 @@ case class Resolver() extends DebugEnhancedLogging{ } private def resolve(url: String) = { + trace(url) Try(Http(url).asString).flatMap { case response if response.code == 404 => logger.error(s"not found: $url") diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/ddm/DdmTransformer.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/ddm/DdmTransformer.scala index 8f831bc2..25fc48c4 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/ddm/DdmTransformer.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/ddm/DdmTransformer.scala @@ -27,7 +27,7 @@ import scala.xml.transform.{ RewriteRule, RuleTransformer } import scala.xml.{ Elem, Node, NodeSeq } class DdmTransformer(cfgDir: File, collectionsMap: Map[String, Elem] = Map.empty) extends DebugEnhancedLogging { - trace() + trace(()) val reportRewriteRule: ReportRewriteRule = ReportRewriteRule(cfgDir) private val acquisitionRewriteRule = AcquisitionRewriteRule(cfgDir) private val languageRewriteRule = LanguageRewriteRule(cfgDir / "languages.csv") @@ -71,6 +71,7 @@ class DdmTransformer(cfgDir: File, collectionsMap: Map[String, Elem] = Map.empty } def transform(ddmIn: Node, datasetId: String): Try[Node] = { + trace(datasetId) val newDcmiNodes = collectionsMap.get(datasetId) .toSeq ++ unknownRightsHolder(ddmIn) diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/ddm/Provenance.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/ddm/Provenance.scala index 11a04730..63126adf 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/ddm/Provenance.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/ddm/Provenance.scala @@ -15,23 +15,26 @@ */ package nl.knaw.dans.easy.bag2deposit.ddm +import nl.knaw.dans.lib.logging.DebugEnhancedLogging import org.joda.time.DateTime.now import org.joda.time.format.DateTimeFormat import scala.xml.{ Elem, Node } -class Provenance(app: String, version: String) { +class Provenance(app: String, version: String) extends DebugEnhancedLogging { private val dateFormat = now().toString(DateTimeFormat.forPattern("yyyy-MM-dd")) - def xml(oldDdm: Node, newDdm: Node): Option[Elem] = { - - // children of both profile and dcmiMetadata - val oldNodes = oldDdm.flatMap(_.nonEmptyChildren).flatMap(_.nonEmptyChildren) - val newNodes = newDdm.flatMap(_.nonEmptyChildren).flatMap(_.nonEmptyChildren) - val onlyInOld = oldNodes.diff(newNodes) - val onlyInNew = newNodes.diff(oldNodes) - - if (onlyInOld.isEmpty && onlyInNew.isEmpty) None + /** + * collects differences between old and new versions of XMLs as far as they we + * + * @param changes the key of the map is the schema of the compared XMLs + * the values are an empty list or the content for + * @return + */ + def collectChangesInXmls(changes: Map[String, Seq[Node]]): Option[Elem] = { + trace(this.getClass) + val filtered = changes.filter(_._2.nonEmpty) + if (filtered.isEmpty) None else Some( - - { onlyInOld } - - - { onlyInNew } - + { filtered.map { case (scheme, diff) => + { diff } + }} ) } } +object Provenance { + /** + * Creates the content for a by comparing the direct child elements of each XML. + * @param oldXml the original instance + * @param newXml the modified instance + * @return and empty list if both versions have the same children + * when large/complex elements (like for example authors or polygons) have minor changes + * both versions of the complete element is returned + */ + def compare(oldXml: Node, newXml: Node): Seq[Node] = { + // TODO poor mans solution to call with ddm/dcmiMetadata respective root of amd + val oldNodes = oldXml.flatMap(_.nonEmptyChildren) + val newNodes = newXml.flatMap(_.nonEmptyChildren) + val onlyInOld = oldNodes.diff(newNodes) + val onlyInNew = newNodes.diff(oldNodes) + + if (onlyInOld.isEmpty && onlyInNew.isEmpty) Seq.empty + else { onlyInOld } + { onlyInNew } + } +} diff --git a/src/main/scala/nl.knaw.dans.easy.bag2deposit/package.scala b/src/main/scala/nl.knaw.dans.easy.bag2deposit/package.scala index faf81db1..7caffbe1 100644 --- a/src/main/scala/nl.knaw.dans.easy.bag2deposit/package.scala +++ b/src/main/scala/nl.knaw.dans.easy.bag2deposit/package.scala @@ -24,7 +24,7 @@ import org.joda.time.{ DateTime, DateTimeZone } import resource.managed import java.io.FileNotFoundException -import java.nio.charset.Charset.defaultCharset +import java.nio.charset.Charset import scala.collection.JavaConverters._ import scala.util.{ Failure, Try } import scala.xml._ @@ -41,7 +41,7 @@ package object bag2deposit extends DebugEnhancedLogging { def parseCsv(file: File, nrOfHeaderLines: Int, format: CSVFormat = CSVFormat.RFC4180): Iterable[CSVRecord] = { trace(file) - managed(CSVParser.parse(file.toJava, defaultCharset(), format)) + managed(CSVParser.parse(file.toJava, Charset.forName("UTF-8"), format)) .map(_.asScala.filter(_.asScala.nonEmpty).drop(nrOfHeaderLines)) .tried.unsafeGetOrThrow } diff --git a/src/test/resources/bags/01/04e638eb-3af1-44fb-985d-36af12fccb2d/bag-revision-1/metadata/amd.xml b/src/test/resources/bags/01/04e638eb-3af1-44fb-985d-36af12fccb2d/bag-revision-1/metadata/amd.xml new file mode 100644 index 00000000..cac05721 --- /dev/null +++ b/src/test/resources/bags/01/04e638eb-3af1-44fb-985d-36af12fccb2d/bag-revision-1/metadata/amd.xml @@ -0,0 +1,128 @@ + + + PUBLISHED + DRAFT + 2020-02-02T20:02:00.000+01:00 + user001 + + + DRAFT + PUBLISHED + 2020-02-02T20:02:00.000+01:00 + + + + + NOT_ASSIGNED + + dataset + + + + dataset.sip + true + + + + dataset.sip.files + + + + dataset.sip.files.completeness + true + + + + + dataset.sip.files.accessibility + true + + + + + dataset.sip.files.privacy + true + + + + + + + dataset.sip.file-list + + + + dataset.sip.file-list.file-metadata + + + + + + + dataset.sip.descriptive-metadata + + + + dataset.sip.descriptive-metadata.completeness + true + + + + + dataset.sip.descriptive-metadata.identifiers + + + + + + + + + dataset.aip + true + + + + dataset.aip.file-conversion + + + + + dataset.aip.file-metadata + + + + + dataset.aip.structure + + + + + + + dataset.dip + true + + + + dataset.dip.publish-files + true + + + + + dataset.dip.jumpoff + + + + + dataset.dip.relations + + + + + + + + + diff --git a/src/test/resources/bags/01/04e638eb-3af1-44fb-985d-36af12fccb2d/bag-revision-1/metadata/depositor-info/agreements.xml b/src/test/resources/bags/01/04e638eb-3af1-44fb-985d-36af12fccb2d/bag-revision-1/metadata/depositor-info/agreements.xml new file mode 100644 index 00000000..f7a9a93e --- /dev/null +++ b/src/test/resources/bags/01/04e638eb-3af1-44fb-985d-36af12fccb2d/bag-revision-1/metadata/depositor-info/agreements.xml @@ -0,0 +1,14 @@ + + + + user001 + 2019-05-03T11:54:26.638+02:00 + true + + + user001 + 2019-05-03T11:54:26.638+02:00 + true + + diff --git a/src/test/resources/bags/01/87151a3a-12ed-426a-94f2-97313c7ae1f2/base-bag-not-found/metadata/amd.xml b/src/test/resources/bags/01/87151a3a-12ed-426a-94f2-97313c7ae1f2/base-bag-not-found/metadata/amd.xml new file mode 100644 index 00000000..cac05721 --- /dev/null +++ b/src/test/resources/bags/01/87151a3a-12ed-426a-94f2-97313c7ae1f2/base-bag-not-found/metadata/amd.xml @@ -0,0 +1,128 @@ + + + PUBLISHED + DRAFT + 2020-02-02T20:02:00.000+01:00 + user001 + + + DRAFT + PUBLISHED + 2020-02-02T20:02:00.000+01:00 + + + + + NOT_ASSIGNED + + dataset + + + + dataset.sip + true + + + + dataset.sip.files + + + + dataset.sip.files.completeness + true + + + + + dataset.sip.files.accessibility + true + + + + + dataset.sip.files.privacy + true + + + + + + + dataset.sip.file-list + + + + dataset.sip.file-list.file-metadata + + + + + + + dataset.sip.descriptive-metadata + + + + dataset.sip.descriptive-metadata.completeness + true + + + + + dataset.sip.descriptive-metadata.identifiers + + + + + + + + + dataset.aip + true + + + + dataset.aip.file-conversion + + + + + dataset.aip.file-metadata + + + + + dataset.aip.structure + + + + + + + dataset.dip + true + + + + dataset.dip.publish-files + true + + + + + dataset.dip.jumpoff + + + + + dataset.dip.relations + + + + + + + + + diff --git a/src/test/resources/bags/01/87151a3a-12ed-426a-94f2-97313c7ae1f2/base-bag-not-found/metadata/depositor-info/agreements.xml b/src/test/resources/bags/01/87151a3a-12ed-426a-94f2-97313c7ae1f2/base-bag-not-found/metadata/depositor-info/agreements.xml new file mode 100644 index 00000000..a60cbfea --- /dev/null +++ b/src/test/resources/bags/01/87151a3a-12ed-426a-94f2-97313c7ae1f2/base-bag-not-found/metadata/depositor-info/agreements.xml @@ -0,0 +1,14 @@ + + + + Full Name + 2019-05-03T11:54:26.638+02:00 + true + + + user001 + 2019-05-03T11:54:26.638+02:00 + true + + diff --git a/src/test/resources/debug-config/account-substitutes.csv b/src/test/resources/debug-config/account-substitutes.csv new file mode 100644 index 00000000..06f3698d --- /dev/null +++ b/src/test/resources/debug-config/account-substitutes.csv @@ -0,0 +1,2 @@ +removed-account, chosen-account +user001,USer \ No newline at end of file diff --git a/src/test/scala/nl.knaw.dans.easy.bag2deposit/AppSpec.scala b/src/test/scala/nl.knaw.dans.easy.bag2deposit/AppSpec.scala index 81ccd960..78128078 100644 --- a/src/test/scala/nl.knaw.dans.easy.bag2deposit/AppSpec.scala +++ b/src/test/scala/nl.knaw.dans.easy.bag2deposit/AppSpec.scala @@ -100,5 +100,8 @@ class AppSpec extends AnyFlatSpec with Matchers with AppConfigSupport with FileS // other content changes are verified in ddm.*Spec (validBag / "metadata" / "dataset.xml").contentAsString should include("Example") (movedBag / "metadata" / "dataset.xml").contentAsString shouldNot include("Example") + (validBag / "metadata" / "amd.xml").contentAsString should include("user001") + (movedBag / "metadata" / "amd.xml").contentAsString should + (include("USer") and not include "user001") } } diff --git a/src/test/scala/nl.knaw.dans.easy.bag2deposit/ConfigurationSpec.scala b/src/test/scala/nl.knaw.dans.easy.bag2deposit/ConfigurationSpec.scala index b57d8b60..e84d91e2 100644 --- a/src/test/scala/nl.knaw.dans.easy.bag2deposit/ConfigurationSpec.scala +++ b/src/test/scala/nl.knaw.dans.easy.bag2deposit/ConfigurationSpec.scala @@ -17,18 +17,28 @@ package nl.knaw.dans.easy.bag2deposit import better.files.File import nl.knaw.dans.easy.bag2deposit.Fixture.FileSystemSupport -import nl.knaw.dans.easy.bag2deposit.collections.FedoraProviderException +import nl.knaw.dans.easy.bag2deposit.collections.Collections.getCollectionsMap +import nl.knaw.dans.easy.bag2deposit.collections.FedoraProvider +import nl.knaw.dans.easy.bag2deposit.ddm.DdmTransformer +import org.apache.commons.configuration.PropertiesConfiguration import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers -import java.net.UnknownHostException -import scala.util.{ Failure, Success, Try } +import scala.util.Success class ConfigurationSpec extends AnyFlatSpec with FileSystemSupport with Matchers { + + private val transformer = { + val cfgPath = File("src/main/assembly/dist/cfg") + val properties = new PropertiesConfiguration() { + setDelimiterParsingDisabled(true) + load((cfgPath / "application.properties").toJava) + } + new DdmTransformer(cfgPath, getCollectionsMap(cfgPath, FedoraProvider(properties))) + } + "constructor" should "get past the first transformation when fedora is not configured" in { distDir(fedoraUrl = "") - val transformer = Configuration(home = testDir / "dist").ddmTransformer - transformer.transform( D37000, "easy-dataset:123", @@ -37,8 +47,6 @@ class ConfigurationSpec extends AnyFlatSpec with FileSystemSupport with Matchers it should "no longer fail on the first transformation when fedora is not available" in { distDir(fedoraUrl = "https://does.not.exist.dans.knaw.nl") - - val transformer = Configuration(home = testDir / "dist").ddmTransformer // the lazy constructor argument throws an exception // breaking through the Try of the first call that needs it // this is not handled within the context of a for comprehension diff --git a/src/test/scala/nl.knaw.dans.easy.bag2deposit/Fixture/AppConfigSupport.scala b/src/test/scala/nl.knaw.dans.easy.bag2deposit/Fixture/AppConfigSupport.scala index aa3d91b9..c8a03072 100644 --- a/src/test/scala/nl.knaw.dans.easy.bag2deposit/Fixture/AppConfigSupport.scala +++ b/src/test/scala/nl.knaw.dans.easy.bag2deposit/Fixture/AppConfigSupport.scala @@ -17,7 +17,7 @@ package nl.knaw.dans.easy.bag2deposit.Fixture import better.files.File import nl.knaw.dans.easy.bag2deposit.ddm.DdmTransformer -import nl.knaw.dans.easy.bag2deposit.{ BagIndex, Configuration } +import nl.knaw.dans.easy.bag2deposit.{ UserTransformer, BagIndex, Configuration } trait AppConfigSupport extends BagIndexSupport { def testConfig(bagIndex: BagIndex): Configuration = { @@ -27,7 +27,8 @@ trait AppConfigSupport extends BagIndexSupport { dansDoiPrefixes = Seq("10.17026", "10.5072"), dataverseIdAuthority = "10.80270", bagIndex = bagIndex, - ddmTransformer = new DdmTransformer(cfgFile) + ddmTransformer = new DdmTransformer(cfgFile), + userTransformer = new UserTransformer(cfgFile) ) } } diff --git a/src/test/scala/nl.knaw.dans.easy.bag2deposit/Fixture/XmlSupport.scala b/src/test/scala/nl.knaw.dans.easy.bag2deposit/Fixture/XmlSupport.scala new file mode 100644 index 00000000..f7226abf --- /dev/null +++ b/src/test/scala/nl.knaw.dans.easy.bag2deposit/Fixture/XmlSupport.scala @@ -0,0 +1,30 @@ +/** + * Copyright (C) 2020 DANS - Data Archiving and Networked Services (info@dans.knaw.nl) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package nl.knaw.dans.easy.bag2deposit.Fixture + +import scala.xml.{ Node, PrettyPrinter, Utility } + +trait XmlSupport { + private val nameSpaceRegExp = """ xmlns:[a-z-]+="[^"]*"""" // these attributes have a variable order + private val printer = new PrettyPrinter(160, 2) // Utility.serialize would preserve white space, now tests are better readable + + def normalized(elem: Node): String = printer + .format(Utility.trim(elem)) // this trim normalizes and + .replaceAll(nameSpaceRegExp, "") // the random order would cause differences in actual and expected + .replaceAll(" +\n?", " ") + .replaceAll("\n +<", "\n<") + .trim +} diff --git a/src/test/scala/nl.knaw.dans.easy.bag2deposit/ReadmeSpec.scala b/src/test/scala/nl.knaw.dans.easy.bag2deposit/ReadmeSpec.scala index 52d8d591..13713e92 100644 --- a/src/test/scala/nl.knaw.dans.easy.bag2deposit/ReadmeSpec.scala +++ b/src/test/scala/nl.knaw.dans.easy.bag2deposit/ReadmeSpec.scala @@ -15,17 +15,16 @@ */ package nl.knaw.dans.easy.bag2deposit -import java.io.ByteArrayOutputStream - import better.files.File import nl.knaw.dans.easy.bag2deposit.Fixture.{ CustomMatchers, FixedCurrentDateTimeSupport } import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers +import java.io.ByteArrayOutputStream + class ReadmeSpec extends AnyFlatSpec with Matchers with CustomMatchers with FixedCurrentDateTimeSupport { - private val configuration = Configuration(version = "my-version", Seq.empty, null, null, null) - private val clo = new CommandLineOptions(Array[String](), configuration) { + private val clo = new CommandLineOptions(Array[String](), "my-version") { // avoids System.exit() in case of invalid arguments or "--help" override def verify(): Unit = {} } diff --git a/src/test/scala/nl.knaw.dans.easy.bag2deposit/ddm/ProvenanceSpec.scala b/src/test/scala/nl.knaw.dans.easy.bag2deposit/ddm/ProvenanceSpec.scala index 774c9242..2dfcdaa4 100644 --- a/src/test/scala/nl.knaw.dans.easy.bag2deposit/ddm/ProvenanceSpec.scala +++ b/src/test/scala/nl.knaw.dans.easy.bag2deposit/ddm/ProvenanceSpec.scala @@ -15,12 +15,16 @@ */ package nl.knaw.dans.easy.bag2deposit.ddm -import nl.knaw.dans.easy.bag2deposit.Fixture.FixedCurrentDateTimeSupport +import better.files.File +import nl.knaw.dans.easy.bag2deposit.UserTransformer +import nl.knaw.dans.easy.bag2deposit.Fixture.{ FileSystemSupport, FixedCurrentDateTimeSupport, XmlSupport } import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers -class ProvenanceSpec extends AnyFlatSpec with Matchers with FixedCurrentDateTimeSupport { - "Provenance" should "show diff" in { +import scala.xml.Utility + +class ProvenanceSpec extends AnyFlatSpec with FileSystemSupport with XmlSupport with Matchers with FixedCurrentDateTimeSupport { + "Provenance" should "show ddm diff" in { val ddmIn = { @@ -87,24 +91,76 @@ class ProvenanceSpec extends AnyFlatSpec with Matchers with FixedCurrentDateTime } - new Provenance("EasyConvertBagToDepositApp", "1.0.5").xml(ddmIn, ddmOut) shouldBe Some( + new Provenance("EasyConvertBagToDepositApp", "1.0.5") + .collectChangesInXmls(Map( + "http://easy.dans.knaw.nl/easy/dataset-administrative-metadata/" -> + Seq.empty, + "http://easy.dans.knaw.nl/schemas/md/ddm/" -> + Provenance.compare((ddmIn \ "dcmiMetadata").head, (ddmOut \ "dcmiMetadata").head), + )) + .map(normalized) shouldBe Some(normalized( + Rapport 456VMEAEGVWELA - Rapport 456Transect-rapport 2859Vroege Middeleeuwen Aveenwinning (inclusief zouthoudend veen t.b.v. zoutproductie) - akker / tuin - Rapport 123 - Unknown + Rapport 456Transect-rapport 2859Vroege Middeleeuwen Aveenwinning (inclusief zouthoudend veen t.b.v. zoutproductie) + akker / tuin + Rapport 123 + Unknown + + )) + } + it should "show amd diff" in { + (testDir / "amd.xml").writeText( + """""" + + Utility.serialize( + + PUBLISHED + DRAFT + 2020-02-02T20:02:00.000+01:00 + user001 + + + DRAFT + PUBLISHED + 2020-02-02T20:02:00.000+01:00 + + + + ).toString() ) + + val transformer = new UserTransformer(cfgDir = File("src/main/assembly/dist/cfg")) + new Provenance("EasyConvertBagToDepositApp", "1.0.5").collectChangesInXmls(Map( + "http://easy.dans.knaw.nl/easy/dataset-administrative-metadata/" -> + transformer.transform(testDir / "amd.xml").getOrElse(fail("could not transform")), + )).map(normalized) shouldBe Some(normalized( + + + + + user001 + + + USer + + + + + )) } } diff --git a/src/test/scala/nl.knaw.dans.easy.bag2deposit/ddm/RewriteSpec.scala b/src/test/scala/nl.knaw.dans.easy.bag2deposit/ddm/RewriteSpec.scala index 019895ce..ceec1b61 100644 --- a/src/test/scala/nl.knaw.dans.easy.bag2deposit/ddm/RewriteSpec.scala +++ b/src/test/scala/nl.knaw.dans.easy.bag2deposit/ddm/RewriteSpec.scala @@ -16,9 +16,9 @@ package nl.knaw.dans.easy.bag2deposit.ddm import better.files.File -import nl.knaw.dans.easy.bag2deposit.Fixture.{ DdmSupport, SchemaSupport } +import nl.knaw.dans.easy.bag2deposit.Fixture.{ DdmSupport, SchemaSupport, XmlSupport } import nl.knaw.dans.easy.bag2deposit.ddm.LanguageRewriteRule.logNotMappedLanguages -import nl.knaw.dans.easy.bag2deposit.{ BagIndex, Configuration, EasyConvertBagToDepositApp, InvalidBagException, parseCsv } +import nl.knaw.dans.easy.bag2deposit.{ UserTransformer, BagIndex, Configuration, EasyConvertBagToDepositApp, InvalidBagException, parseCsv } import org.apache.commons.csv.CSVRecord import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers @@ -27,11 +27,10 @@ import java.net.URI import java.nio.charset.Charset import java.util.UUID import scala.util.{ Failure, Success, Try } -import scala.xml.{ Node, PrettyPrinter, Utility } -class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmSupport { +class RewriteSpec extends AnyFlatSpec with XmlSupport with SchemaSupport with Matchers with DdmSupport { private val cfgDir: File = File("src/main/assembly/dist/cfg") - private val transformer: DdmTransformer = new DdmTransformer(cfgDir, Map.empty) + private val ddmTransformer: DdmTransformer = new DdmTransformer(cfgDir, Map.empty) override val schema = "https://easy.dans.knaw.nl/schemas/md/ddm/ddm.xsd" @@ -119,12 +118,13 @@ class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmS dansDoiPrefixes = "10.17026/,10.5072/".split(","), dataverseIdAuthority = "10.80270", bagIndex = BagIndex(new URI("http://localhost:20120/")), - ddmTransformer = transformer, + ddmTransformer = ddmTransformer, + userTransformer = new UserTransformer(cfgDir) )) // a few steps of EasyConvertBagToDepositApp.addPropsToBags val datasetId = "easy-dataset:123" - transformer.transform(ddmIn, datasetId).map(normalized) + ddmTransformer.transform(ddmIn, datasetId).map(normalized) .getOrElse(fail("no DDM returned")) shouldBe normalized(expectedDDM) app.registerMatchedReports(datasetId, expectedDDM \\ "reportNumber") app.logMatchedReports() // once for all datasets @@ -141,7 +141,7 @@ class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmS ) - transformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe + ddmTransformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe Failure(InvalidBagException("temporal rabarbera not found; subject barbapappa not found")) } @@ -174,7 +174,7 @@ class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmS ) val datasetId = "eas-dataset:123" - transformer.transform(ddmIn, datasetId).map(normalized) + ddmTransformer.transform(ddmIn, datasetId).map(normalized) .getOrElse(fail("no DDM returned")) shouldBe normalized(expectedDDM) // TODO manually check logging of not mapped language fields @@ -195,7 +195,7 @@ class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmS ) - transformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe Success(normalized(ddmExpected)) + ddmTransformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe Success(normalized(ddmExpected)) // TODO manually check logging of briefrapport } @@ -216,7 +216,7 @@ class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmS ) - transformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe + ddmTransformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe Success(normalized(expectedDdm)) } @@ -249,7 +249,7 @@ class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmS ) // TODO these titles don't show up in target/test/TitlesSpec/matches-per-rce.txt - transformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe + ddmTransformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe Success(normalized(expectedDdm)) } @@ -280,7 +280,7 @@ class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmS ) - transformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe + ddmTransformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe Success(normalized(expectedDdm)) } @@ -310,7 +310,7 @@ class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmS ) - transformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe + ddmTransformer.transform(ddmIn, "easy-dataset:123").map(normalized) shouldBe Success(normalized(expectedDdm)) } @@ -519,14 +519,4 @@ class RewriteSpec extends AnyFlatSpec with SchemaSupport with Matchers with DdmS , ))) } - - private val nameSpaceRegExp = """ xmlns:[a-z-]+="[^"]*"""" // these attributes have a variable order - private val printer = new PrettyPrinter(160, 2) // Utility.serialize would preserve white space, now tests are better readable - - def normalized(elem: Node): String = printer - .format(Utility.trim(elem)) // this trim normalizes and - .replaceAll(nameSpaceRegExp, "") // the random order would cause differences in actual and expected - .replaceAll(" +\n?", " ") - .replaceAll("\n +<", "\n<") - .trim }