Skip to content

Commit

Permalink
DD-296 : DDM, convert title to report (#11)
Browse files Browse the repository at this point in the history
* attempt to find archeo report numbers
* replace titles in dcmiMetadata
* log changes made to DDM
* add package for transformation
* abr rule agnostic of report rule
* ignore plain briefrapport, report label without title trailer
* add report from profile to dcmiMetadata
* pattern matches to ifs, log missed titles
  • Loading branch information
jo-pol authored Jan 21, 2021
1 parent 7038425 commit 667650b
Show file tree
Hide file tree
Showing 18 changed files with 49,604 additions and 161 deletions.
135 changes: 135 additions & 0 deletions src/main/assembly/dist/cfg/ABR-reports.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/main/scala/nl.knaw.dans.easy.bag2deposit/Command.scala
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ object Command extends App with DebugEnhancedLogging {
commandLine.idType(),
commandLine.bagSource()
)
new EasyConvertBagToDespositApp(configuration)
new EasyConvertBagToDepositApp(configuration)
.addPropsToBags(
bagParentDirs,
commandLine.outputDir.toOption,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,19 @@
*/
package nl.knaw.dans.easy.bag2deposit

import java.net.URI

import better.files.File
import better.files.File.root
import nl.knaw.dans.easy.bag2deposit.ddm.DdmTransformer
import nl.knaw.dans.lib.logging.DebugEnhancedLogging
import org.apache.commons.configuration.PropertiesConfiguration

import scala.xml.transform.RuleTransformer
import java.net.URI

case class Configuration(version: String,
dansDoiPrefixes: Seq[String],
dataverseIdAutority: String,
bagIndex: BagIndex,
ddmTransformer: RuleTransformer,
ddmTransformer: DdmTransformer,
)

object Configuration extends DebugEnhancedLogging {
Expand All @@ -53,7 +52,7 @@ object Configuration extends DebugEnhancedLogging {
dansDoiPrefixes = properties.getStringArray("dans-doi.prefixes"),
dataverseIdAutority = properties.getString("dataverse.id-authority"),
bagIndex = BagIndex(new URI(properties.getString("bag-index.url"))),
ddmTransformer = new RuleTransformer(AbrRewriteRule(cfgPath))
ddmTransformer = DdmTransformer(cfgPath)
)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,17 @@
*/
package nl.knaw.dans.easy.bag2deposit

import java.io.{ FileNotFoundException, IOException }

import better.files.File
import better.files.File.CopyOptions
import nl.knaw.dans.bag.v0.DansV0Bag
import nl.knaw.dans.easy.bag2deposit.BagSource.VAULT
import nl.knaw.dans.easy.bag2deposit.Command.FeedBackMessage
import nl.knaw.dans.lib.logging.DebugEnhancedLogging

import java.io.{ FileNotFoundException, IOException }
import scala.util.{ Failure, Success, Try }
import scala.xml.Node

class EasyConvertBagToDespositApp(configuration: Configuration) extends DebugEnhancedLogging {
class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnhancedLogging {

def addPropsToBags(bagParentDirs: Iterator[File],
maybeOutputDir: Option[File],
Expand All @@ -38,6 +37,24 @@ class EasyConvertBagToDespositApp(configuration: Configuration) extends DebugEnh
.getOrElse(Success(s"No fatal errors")) // TODO show number of false/true values
}

/**
 * Builds a human readable report of the lines that differ between two versions of a DDM.
 *
 * Both documents are normalized and split into lines; the lines that occur in only one
 * of the two versions are listed per version.
 *
 * @param generated the DDM before the transformation
 * @param modified  the DDM after the transformation
 * @return None if the normalized documents have the same lines, otherwise the report
 */
def formatDiff(generated: Node, modified: Node): Option[String] = {
  val oldLines = normalized(generated).split("\n")
  val newLines = normalized(modified).split("\n")
  // Array.diff is a multiset difference: what remains occurs only on the left hand side
  val onlyInOld = oldLines.diff(newLines).mkString("\n").trim
  val onlyInNew = newLines.diff(oldLines).mkString("\n").trim
  if (onlyInOld.isEmpty && onlyInNew.isEmpty) None
  else {
    val report =
      s"""===== only in old DDM
         |
         |$onlyInOld
         |
         |===== only in new DDM by ${ getClass.getSimpleName } ${ configuration.version }
         |
         |$onlyInNew
         |""".stripMargin
    Some(report)
  }
}

private def addProps(depositPropertiesFactory: DepositPropertiesFactory, maybeOutputDir: Option[File])
(bagParentDir: File): Try[Boolean] = {
logger.debug(s"creating application.properties for $bagParentDir")
Expand All @@ -54,8 +71,8 @@ class EasyConvertBagToDespositApp(configuration: Configuration) extends DebugEnh
_ = logger.debug(s"$bagInfo")
ddmFile = bagDir / "metadata" / "dataset.xml"
ddmIn <- loadXml(ddmFile)
ddmOut = configuration.ddmTransformer.transform(ddmIn).headOption
.getOrElse(throw InvalidBagException("DDM transformation returned empty sequence"))
ddmOut <- configuration.ddmTransformer.transform(ddmIn)
_ = formatDiff(ddmIn, ddmOut).foreach(s => logger.info(s))
_ = ddmFile.writeText(ddmOut.serialize)
props <- depositPropertiesFactory.create(bagInfo, ddmOut)
_ = props.save((bagParentDir / "deposit.properties").toJava)
Expand Down
19 changes: 14 additions & 5 deletions src/main/scala/nl.knaw.dans.easy.bag2deposit/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,18 @@
*/
package nl.knaw.dans.easy

import java.io.FileNotFoundException
import java.nio.charset.Charset.defaultCharset
import nl.knaw.dans.lib.error._
import better.files.File
import nl.knaw.dans.lib.error._
import org.apache.commons.csv.{ CSVFormat, CSVParser, CSVRecord }
import org.joda.time.format.{ DateTimeFormatter, ISODateTimeFormat }
import org.joda.time.{ DateTime, DateTimeZone }
import resource.managed

import scala.util.{ Failure, Try }
import scala.xml.{ Elem, Node, PrettyPrinter, SAXParseException, Utility, XML }
import java.io.FileNotFoundException
import java.nio.charset.Charset.defaultCharset
import scala.collection.JavaConverters._
import scala.util.{ Failure, Try }
import scala.xml._

package object bag2deposit {

Expand All @@ -44,6 +44,15 @@ package object bag2deposit {
.tried.unsafeGetOrThrow
}

// name space declarations are attributes with a variable order, so they are dropped when normalizing
private val nameSpaceRegExp = """ xmlns:[a-z-]+="[^"]*""""

/**
 * Renders XML as text in a form suitable for line by line comparison.
 *
 * @param elem the XML to render
 * @return the formatted XML without name space declarations,
 *         with collapsed spaces and without indentation before elements
 */
def normalized(elem: Node): String = {
  // trimming makes <a/> and <a></a> render the same way
  val formatted = printer.format(Utility.trim(elem))
  // the random order of name spaces would cause differences between actual and expected
  val withoutNameSpaces = formatted.replaceAll(nameSpaceRegExp, "")
  val collapsedSpaces = withoutNameSpaces.replaceAll(" +\n?", " ")
  val withoutIndentation = collapsedSpaces.replaceAll("\n +<", "\n<")
  withoutIndentation.trim
}

implicit class RichNode(val left: Node) extends AnyVal {

def hasType(t: String): Boolean = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,39 +13,37 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.knaw.dans.easy.bag2deposit
package nl.knaw.dans.easy.bag2deposit.ddm

import better.files.File
import nl.knaw.dans.easy.bag2deposit.AbrRewriteRule.{ find, isAbr, parse }
import nl.knaw.dans.easy.bag2deposit.ddm.AbrRewriteRule.parse
import nl.knaw.dans.easy.bag2deposit.parseCsv
import nl.knaw.dans.lib.logging.DebugEnhancedLogging
import org.apache.commons.csv.CSVRecord

import scala.xml.transform.RewriteRule
import scala.xml.{ Elem, MetaData, Node, Text }
import scala.xml.{ Elem, Node }

case class AbrRewriteRule(cfgDir: File) extends RewriteRule {
case class AbrRewriteRule(cfgFile: File, oldLabel: String, newLabel: String) extends RewriteRule with DebugEnhancedLogging {
private val map = parse(cfgFile, newLabel)

private val periodFile: File = cfgDir / "ABR-period.csv"
private val periodMap = parse(periodFile, "ddm:temporal")
private val complexFile: File = cfgDir / "ABR-complex.csv"
private val complexMap = parse(complexFile, "ddm:subject")

override def transform(n: Node): Seq[Node] = n match {
case Elem(_, "temporal", attr: MetaData, _, Text(key)) if isAbr(attr) => find(key, periodMap, periodFile)
case Elem(_, "subject", attr: MetaData, _, Text(key)) if isAbr(attr) => find(key, complexMap, complexFile)
case _ => n
override def transform(node: Node): Seq[Node] = {
if (!isAbr(node)) node
else {
val key = node.text
map.getOrElse(key, <notImplemented>{ s"$key not found in ${ cfgFile.name }" }</notImplemented>)
}
}
}
object AbrRewriteRule extends DebugEnhancedLogging {
val nrOfHeaderLines = 2

private def isAbr(attr: MetaData) = {
attr.prefixedKey == "xsi:type" && attr.value.mkString("").startsWith("abr:ABR")
private def isAbr(node: Node) = {
val attr = node.attributes
node.label == oldLabel &&
attr.prefixedKey == "xsi:type" && attr.value.mkString("").startsWith("abr:ABR")
}
}

private def find(key: String, map: Map[String, Node], file: File): Node = {
map.getOrElse(key, throw new Exception(s"$key not found in $file"))
}
object AbrRewriteRule {
val nrOfHeaderLines = 2

private def parse(file: File, label: String): Map[String, Elem] = {
parseCsv(file, nrOfHeaderLines)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/**
* Copyright (C) 2020 DANS - Data Archiving and Networked Services ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.knaw.dans.easy.bag2deposit.ddm

import better.files.File
import nl.knaw.dans.easy.bag2deposit.InvalidBagException
import nl.knaw.dans.lib.logging.DebugEnhancedLogging

import scala.util.{ Failure, Success, Try }
import scala.xml.transform.{ RewriteRule, RuleTransformer }
import scala.xml.{ Node, NodeSeq }

/**
 * Transforms a DDM document: titles that look like archaeological report numbers
 * become reportNumber elements and ABR temporal/subject values are rewritten.
 *
 * @param cfgDir directory with ABR-period.csv and ABR-complex.csv,
 *               the directory is also passed on to [[ReportRewriteRule]]
 */
case class DdmTransformer(cfgDir: File) extends DebugEnhancedLogging {

  private val reportRewriteRule = ReportRewriteRule(cfgDir)

  // applied to the title in the profile, which needs special treatment (see transform)
  private val profileRuleTransformer = new RuleTransformer(reportRewriteRule)

  // applied to the content of dcmiMetadata
  private val dcmiMetadataRuleTransformer = new RuleTransformer(
    reportRewriteRule,
    AbrRewriteRule(cfgDir / "ABR-period.csv", "temporal", "ddm:temporal"),
    AbrRewriteRule(cfgDir / "ABR-complex.csv", "subject", "ddm:subject"),
  )

  /**
   * Replaces dcmiMetadata with a transformed version of its children
   * followed by the report numbers found in the title of the profile.
   * Other nodes are returned as they are.
   */
  private case class DdmRewriteRule(reportNumberFromFirstTitle: NodeSeq) extends RewriteRule {
    override def transform(n: Node): Seq[Node] = {
      if (n.label != "dcmiMetadata") n
      else <dcmiMetadata>
             { dcmiMetadataRuleTransformer(n).nonEmptyChildren }
             { reportNumberFromFirstTitle }
           </dcmiMetadata>.copy(prefix = n.prefix, attributes = n.attributes, scope = n.scope)
    }
  }

  /**
   * @param ddmIn the root of the DDM document to transform
   * @return the transformed document, or a Failure with an InvalidBagException
   *         when one of the rules produced a notImplemented element
   *         or when the transformation returned nothing
   */
  def transform(ddmIn: Node): Try[Node] = {

    // the single title may become a title and/or reportNumber
    val transformedFirstTitle = (ddmIn \ "profile" \ "title").flatMap(profileRuleTransformer)
    val reportNumberFromFirstTitle = transformedFirstTitle.filter(_.label == "reportNumber")
    // all or nothing: empty as soon as the title produced a report number
    val notConvertedFirstTitle = transformedFirstTitle.filter(_ => reportNumberFromFirstTitle.isEmpty)

    // the transformation
    val ddmRuleTransformer = new RuleTransformer(DdmRewriteRule(reportNumberFromFirstTitle))
    val ddmOut = ddmRuleTransformer(ddmIn)

    // logging and error handling
    val notConvertedTitles = (ddmOut \ "dcmiMetadata" \ "title") ++ notConvertedFirstTitle
    logBriefRapportTitles(notConvertedTitles, ddmOut)
    // fail slow trick: rules emit notImplemented elements, so all problems are reported at once
    val problems = ddmOut \\ "notImplemented"
    if (problems.nonEmpty)
      Failure(InvalidBagException(problems.map(_.text).mkString("; ")))
    else ddmOut.headOption.map(Success(_))
      .getOrElse(Failure(InvalidBagException("DDM transformation returned empty sequence")))
  }

  /**
   * Logs the titles that look like a "briefrapport" with a number but were not converted.
   *
   * @param notConvertedTitles the title elements remaining after the transformation
   * @param ddmOut             the transformed DDM, provides context for the log message
   */
  private def logBriefRapportTitles(notConvertedTitles: NodeSeq, ddmOut: Node): Unit = {
    // these titles need a more complex transformation or manual fix before the final export
    // the pattern must end with the number tail: a stray " }" after the interpolation
    // would require titles to end with that literal text, so nothing would ever be logged
    val briefRapportRegexp = s"brief[^a-z]*rapport${ reportRewriteRule.nrTailRegexp }"
    notConvertedTitles.foreach { node =>
      val title = node.text
      if (title.toLowerCase.matches(briefRapportRegexp)) {
        // searched among all descendants: the titles are read from dcmiMetadata,
        // rightsHolder and publisher are presumably nested there too, not direct children of the root
        logger.info(s"briefrapport rightsHolder=[${ ddmOut \\ "rightsHolder" }] publisher=[${ ddmOut \\ "publisher" }] titles=[$title]")
      }
    }
  }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/**
* Copyright (C) 2020 DANS - Data Archiving and Networked Services ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.knaw.dans.easy.bag2deposit.ddm

import better.files.File
import nl.knaw.dans.easy.bag2deposit.parseCsv
import nl.knaw.dans.lib.logging.DebugEnhancedLogging

import scala.xml.transform.RewriteRule
import scala.xml.{ Elem, Node }

/**
 * Rewrite rule that turns title elements matching one of the configured
 * report patterns into ddm:reportNumber elements.
 *
 * @param cfgDir directory containing ABR-reports.csv
 */
case class ReportRewriteRule(cfgDir: File) extends RewriteRule with DebugEnhancedLogging {

  /** One configured report series; the columns are read by position from ABR-reports.csv. */
  case class ReportCfg(uuid: String, label: String, regexp: String)

  private val digit = "[0-9]"

  /** alpha numeric (and a little more) */
  private val an = "[-_/.a-z0-9]"

  /**
   * A report number: non-word characters followed by a token with at least one digit.
   * Just one number, so it does not match
   * easy-dataset:99840 "Arcadis Archeologische Rapporten [2017 - 116]"
   */
  val nrRegexp = s"\\W+$an*$digit$an*"

  /** optional remainder of a title after the number: a dot, or a colon with whatever follows */
  private val trailer = "([.]|:.*)?"
  val nrTailRegexp = s"$nrRegexp$trailer"

  /** titles that might be report numbers but are not covered by the configured patterns */
  private val missedRegExp = s".*(notitie|rapport|bericht|publicat).*$nrRegexp$trailer"

  val reportMap: Seq[ReportCfg] = parseCsv(cfgDir / "ABR-reports.csv", 0)
    .map { record =>
      ReportCfg(
        uuid = record.get(0),
        label = record.get(1),
        regexp = record.get(2).trim + nrTailRegexp,
      )
    }.toSeq

  /**
   * @param n any node, only title elements are changed
   * @return other nodes as they are; for a title: a report number per matching configuration,
   *         followed by the original title unless its text equals the text of the report numbers
   */
  override def transform(n: Node): Seq[Node] = n.label match {
    case "title" =>
      val originalTitle = n.text
      val lowerCased = originalTitle.trim.toLowerCase
      // the value of a report number is the title without the part after a colon
      val titleWithoutTrailer = originalTitle.replaceAll(":.*", "")
      val reportNumbers: Seq[Node] = reportMap.collect {
        case cfg if lowerCased.matches(cfg.regexp) => toReportNr(titleWithoutTrailer, cfg.uuid)
      }
      if (reportNumbers.isEmpty && lowerCased.matches(missedRegExp))
        logger.info(s"potential report number: $originalTitle")
      val convertedText = reportNumbers.map(_.text).mkString
      if (originalTitle == convertedText) reportNumbers // nothing of the title got lost
      else reportNumbers :+ n
    case _ => n
  }

  /**
   * @param titleValue the text for the new element, the report number is extracted from it
   * @param uuid       identifies the report series, becomes the last part of the valueURI
   */
  private def toReportNr(titleValue: String, uuid: String): Elem = {
    val valueUri = s"https://data.cultureelerfgoed.nl/term/id/abr/$uuid"
    val reportNo = titleValue.replaceAll(s".*($nrRegexp)$trailer", "$1").trim
    <ddm:reportNumber
      schemeURI="https://data.cultureelerfgoed.nl/term/id/abr/7a99aaba-c1e7-49a4-9dd8-d295dbcc870e"
      valueURI={ valueUri }
      subjectScheme="ABR Rapporten"
      reportNo={ reportNo }
    >{ titleValue }</ddm:reportNumber>
  }
}
Loading

0 comments on commit 667650b

Please sign in to comment.