Skip to content

Commit

Permalink
DD-1215: Add Series description to collection members (#111)
Browse files Browse the repository at this point in the history
* DD-1215: Add Series description to collection members
* BugFix: Use ddm-v2 xsd
* Remove obsolete code for Thematische collecties

Co-authored-by: Jo Pol <[email protected]>
  • Loading branch information
lindareijnhoudt and jo-pol authored Jan 4, 2023
1 parent c736508 commit 04c9c80
Show file tree
Hide file tree
Showing 9 changed files with 135 additions and 81 deletions.
58 changes: 58 additions & 0 deletions src/main/assembly/dist/cfg/SSH/seriesDatasetIds.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
easy-dataset:45961
easy-dataset:41660
easy-dataset:113720
easy-dataset:64608
easy-dataset:45268
easy-dataset:41081
easy-dataset:64205
easy-dataset:33960
easy-dataset:65105
easy-dataset:61180
easy-dataset:46232
easy-dataset:46233
easy-dataset:61642
easy-dataset:41101
easy-dataset:56307
easy-dataset:41361
easy-dataset:213824
easy-dataset:41068
easy-dataset:65023
easy-dataset:41069
easy-dataset:41860
easy-dataset:121466
easy-dataset:65839
easy-dataset:207751
easy-dataset:41803
easy-dataset:201339
easy-dataset:61453
easy-dataset:41900
easy-dataset:41536
easy-dataset:58798
easy-dataset:44580
easy-dataset:99837
easy-dataset:46099
easy-dataset:41477
easy-dataset:33957
easy-dataset:110362
easy-dataset:77481
easy-dataset:41537
easy-dataset:43537
easy-dataset:41822
easy-dataset:41070
easy-dataset:41231
easy-dataset:44159
easy-dataset:41857
easy-dataset:41898
easy-dataset:76300
easy-dataset:178313
easy-dataset:237102
easy-dataset:41818
easy-dataset:45089
easy-dataset:259930
easy-dataset:61644
easy-dataset:58177
easy-dataset:65866
easy-dataset:41134
easy-dataset:51637
easy-dataset:210154
easy-dataset:249948
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ case class Configuration(version: String,
{
private val targetCfgPath = cfgPath / targetDataStation
val agreementsPath: File = cfgPath / "agreements"
val ddmTransformer = new DdmTransformer(cfgPath, targetDataStation, getCollectionsMap(targetCfgPath), ddmVersion)
val ddmTransformer = new DdmTransformer(cfgPath, targetDataStation, getCollectionsMap(targetCfgPath, maybeFedoraProvider), ddmVersion)
val amdTransformer = new AmdTransformer(targetCfgPath / "account-substitutes.csv")
val remarksConverter = new RemarksConverter(targetCfgPath)
}
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ class EasyConvertBagToDepositApp(configuration: Configuration) extends DebugEnha
}
}



private def checkAgreementsXml(depositorId: String, agreementsFile: File) = {
if (agreementsFile.exists) { //the agreementsfile is created by easy-fedora-to-bag for FEDORA datasets
Success
Expand Down
1 change: 1 addition & 0 deletions src/main/scala/nl.knaw.dans.easy.bag2deposit/FoXml.scala
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class FoXml {
}

/**
 * Looks up the administrative metadata in a FoXml document.
 * Delegates to `getStream`, passing "AMD" and "administrative-md"
 * (presumably the datastream id and the root tag of its content — confirm against getStream).
 *
 * @param foXml the FoXml document to search
 * @return whatever `getStream` yields for these arguments
 */
def getAmd(foXml: Node): Try[Node] = getStream("AMD", "administrative-md", foXml)
/**
 * Looks up the EASY metadata in a FoXml document.
 * Delegates to `getStream`, passing "EMD" and "easymetadata"
 * (presumably the datastream id and the root tag of its content — confirm against getStream).
 *
 * @param foXml the FoXml document to search
 * @return whatever `getStream` yields for these arguments
 */
def getEmd(foXml: Node): Try[Node] = getStream("EMD", "easymetadata", foXml)

private def hasControlGroup(controlGroup: String)(streamRoot: Node): Boolean = {
streamRoot.attribute("CONTROL_GROUP").map(_.text).contains(controlGroup)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,17 @@
package nl.knaw.dans.easy.bag2deposit.collections

import better.files.File
import com.yourmediashelf.fedora.client.FedoraClientException
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._
import nl.knaw.dans.easy.bag2deposit.FoXml.getEmd
import nl.knaw.dans.lib.error._
import nl.knaw.dans.lib.logging.DebugEnhancedLogging
import org.apache.commons.csv.CSVFormat.RFC4180
import org.apache.commons.csv.{ CSVFormat, CSVParser, CSVRecord }
import resource.managed

import java.nio.charset.{ Charset, StandardCharsets }
import scala.collection.JavaConverters._
import scala.util.{ Failure, Try }
import scala.xml.Elem
import scala.xml.{ Elem, Node, Text }

case class Collection(name: String, ids: Seq[String], collectionType: String, comment: String, members: Seq[String])

Expand Down Expand Up @@ -85,89 +82,61 @@ object Collection extends DebugEnhancedLogging {
>{ r.get("prefLabel") }</ddm:inCollection>
}

/**
 * Builds the series description to add to the members of a collection.
 *
 * Only the first id of the collection is examined. When that id is listed in `seriesSet`,
 * the text of the first `description` element found in its EMD is wrapped in
 * `<ddm:description descriptionType="SeriesInformation">`.
 *
 * @param collectionDatasetIds dataset ids of the collection; only the first one is used
 * @param maybeFedoraProvider  provider used to load the FoXml of the first dataset
 * @param seriesSet            ids of the datasets that describe a series
 * @return the description element, or an empty text node when there are no ids,
 *         the first id is not in `seriesSet`, or its EMD has no description
 * @throws Throwable the failure reported while loading the EMD, for example when no fedora is configured
 */
def getSeriesNode(collectionDatasetIds: Seq[String], maybeFedoraProvider: Option[FedoraProvider], seriesSet: Set[String]): Node = {
  val nothing: Node = Text("")
  // headOption: a collection without ids simply has no series description
  collectionDatasetIds.headOption
    .filter(seriesSet.contains)
    .fold(nothing) { seriesId =>
      // .get is kept on purpose: a listed series whose EMD can not be loaded is reported to the caller
      val parentEmd = getCollectionEmdXml(seriesId, maybeFedoraProvider).get
      (parentEmd \\ "description").headOption match {
        case Some(description) =>
          <ddm:description descriptionType="SeriesInformation">{ description.text }</ddm:description>
        case None =>
          logger.warn(s"no description found in the EMD of series $seriesId")
          nothing
      }
    }
}

/**
 * Loads the FoXml of a dataset and extracts its EMD.
 *
 * @param datasetId           the dataset to load
 * @param maybeFedoraProvider the provider to load the FoXml with
 * @return the result of `getEmd` applied to the loaded FoXml,
 *         or a failure with an IllegalStateException when no provider is available
 */
private def getCollectionEmdXml(datasetId: String, maybeFedoraProvider: Option[FedoraProvider]): Try[Node] = maybeFedoraProvider match {
  case Some(fedora) =>
    fedora.loadFoXml(datasetId).flatMap(getEmd)
  case None =>
    Failure(new IllegalStateException(s"could not get EMD for $datasetId because no fedora is configured"))
}

/**
 * Reads the ids of the datasets that describe a series, one id per line.
 *
 * Lines are trimmed and blank lines are dropped, so Windows line endings,
 * accidental padding and a trailing newline do not produce ids that can never match.
 *
 * @param seriesFile the file to read
 * @return the ids found in the file, empty when the file does not exist
 */
private def readSeriesFile(seriesFile: File): Set[String] = {
  if (!seriesFile.exists) Set.empty
  else seriesFile.contentAsString
    .split("\\r?\\n")
    .map(_.trim)
    .filter(_.nonEmpty)
    .toSet
}

/**
 * Builds a lookup table from the configuration files
 * `excel2skos-collecties.csv`, `ThemathischeCollecties.csv` and `seriesDatasetIds.txt` in `cfgDir`.
 *
 * A failure while building the table is logged and results in an empty map.
 *
 * @param cfgDir              directory with the configuration files
 * @param maybeFedoraProvider passed on to `memberToCollections` for the series descriptions
 * @return collection-member-dataset-id -> the nodes produced by `memberToCollections`
 *         for every collection the dataset is a member of
 */
def getCollectionsMap(cfgDir: File, maybeFedoraProvider: Option[FedoraProvider]): Map[String, Seq[Node]] = {
  val skosFile = cfgDir / "excel2skos-collecties.csv"
  val collectionsFile = cfgDir / "ThemathischeCollecties.csv"
  val seriesFile = cfgDir / "seriesDatasetIds.txt"

  trace(skosFile, collectionsFile, seriesFile)
  val tuples = {
    for {
      skosRecords <- parseCsv(skosFile, skosCsvFormat)
      collectionRecords <- parseCsv(collectionsFile, collectionCsvFormat)
      originalCollections = collectionRecords.toList.map(parseCollectionRecord)
      skosMap = skosRecords.map(parseSkosRecord).toMap
      seriesSet = readSeriesFile(seriesFile)
    } yield originalCollections.flatMap { collection =>
      memberToCollections(skosMap, collection, maybeFedoraProvider, seriesSet)
    }
  }.doIfFailure { case e => logger.error(s"could not build CollectionsMap: $cfgDir $e", e) }
    .getOrElse(List.empty)
  // a strict map rather than mapValues: the latter returns a view that repeats the flatMap on every lookup
  tuples.groupBy(_._1).map { case (memberId, pairs) => memberId -> pairs.flatMap(_._2) }
}

/**
 * Pairs every member of a collection with the nodes describing that collection.
 *
 * The nodes are the skos element registered under the name of the collection
 * (a `<notImplemented>` placeholder plus an error in the log when the name is unknown),
 * followed by the result of `getSeriesNode` for the ids of the collection.
 *
 * @param skosMap             collection name -> element from the skos file
 * @param collection          the collection to process
 * @param maybeFedoraProvider passed on to `getSeriesNode`
 * @param seriesSet           passed on to `getSeriesNode`
 * @return one (member id, nodes) pair per member of the collection
 */
private def memberToCollections(skosMap: Map[String, Elem], collection: Collection, maybeFedoraProvider: Option[FedoraProvider], seriesSet: Set[String]): Seq[(String, List[Node])] = {
  val name = collection.name
  lazy val default = <notImplemented>
    {s"$name not found in collections skos"}
  </notImplemented>
  // log on the actual miss, not on a substring of the serialized element
  val elem = skosMap.getOrElse(name, {
    logger.error(s"$name not found in collections skos")
    default
  })
  // The nodes are the same for every member: resolve the series at most once per collection.
  // Lazy, so a collection without members does not trigger a lookup at all.
  lazy val nodes: List[Node] = (elem +: getSeriesNode(collection.ids, maybeFedoraProvider, seriesSet)).toList
  collection.members.map(_ -> nodes)
}

/**
 * Collects the dataset ids that the jumpoff page of a dataset links to.
 *
 * The jumpoff id is looked up with `fedoraProvider.getJumpoff`, its markup is parsed,
 * the `href` values of the anchors are filtered with a regular expression
 * and the remaining ones are converted with `toDatasetId`.
 * Any failure along the way is logged and results in an empty sequence.
 *
 * @param fedoraProvider used to find the jumpoff page and to read its datastream
 * @param datasetId      the dataset whose jumpoff page is examined
 * @return the ids for which `toDatasetId` returned a value
 */
private def membersOf(fedoraProvider: FedoraProvider)(datasetId: String): Seq[String] = {
trace(datasetId)

// reads the requested datastream of the jumpoff object and parses it as UTF-8 with `browser`
def getMu(jumpoffId: String, streamId: String) = {
fedoraProvider
.disseminateDatastream(jumpoffId, streamId)
.map(browser.parseInputStream(_, StandardCharsets.UTF_8.name()))
.tried
}

// tries the HTML_MU stream first, falls back to TXT_MU only when fedora answers with status 404
def getMuAsHtmlDoc(jumpoffId: String) = {
getMu(jumpoffId, "HTML_MU")
.recoverWith {
case e: FedoraClientException if e.getStatus == 404 =>
logger.warn(s"no HTML_MU for $jumpoffId, trying TXT_MU")
getMu(jumpoffId, "TXT_MU")
case e =>
trace(e)
Failure(e)
}
}

// (?s) matches multiline values like https://github.com/DANS-KNAW/easy-convert-bag-to-deposit/blob/57e4ab9513d536c16121ad8916058d4102154138/src/test/resources/sample-jumpoff/3931-for-dataset-34359.html#L168-L169
// looking for links containing either of
// doi.org.*dans
// urn:nbn:nl:ui:13-
val regexp = "(?s).*(doi.org.*dans|urn:nbn:nl:ui:13-).*"
for {
maybeJumpoffId <- fedoraProvider.getJumpoff(datasetId)
// a dataset without a jumpoff page is treated as a failure
jumpoffId = maybeJumpoffId.getOrElse(throw new Exception(s"no jumpoff for $datasetId"))
doc <- getMuAsHtmlDoc(jumpoffId)
items = doc >> elementList("a")
// sorted and de-duplicated, so every distinct link is resolved only once
hrefs = items
.withFilter(_.hasAttr("href"))
.map(_.attr("href"))
.sortBy(identity)
.distinct
maybeIds = hrefs.withFilter(_.matches(regexp)).map(toDatasetId)
} yield maybeIds.withFilter(_.isDefined).map(_.get)
}.doIfFailure { case e => logger.error(s"could not find members of $datasetId: $e", e) }
.getOrElse(Seq.empty)

/**
 * Converts a link into a dataset id with the help of `resolver`.
 *
 * Everything up to and including `doi.org/` and everything up to and including `identifier=`
 * is stripped from the link, the remainder is trimmed and handed to `resolver.getDatasetId`.
 *
 * @param str the link to convert
 * @return the value produced by the resolver; None when the resolver failed,
 *         in which case an error and a warning are logged
 */
private def toDatasetId(str: String): Option[String] = {
  val withoutDoiHost = str.replaceAll(".*doi.org/", "")
  val withoutQuery = withoutDoiHost.replaceAll(".*identifier=", "")
  val resolved = resolver.getDatasetId(withoutQuery.trim)
  resolved
    .doIfFailure { case e => logger.error(s"could not resolve $str: $e", e) }
    .getOrElse {
      logger.warn(s"resolver could not find $str")
      None
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,6 @@ class FedoraProvider(fedoraClient: FedoraClient) extends DebugEnhancedLogging {

// which is a copy of https://github.com/DANS-KNAW/easy-export-dataset/blob/6e656c6e6dad19bdea70694d63ce929ab7b0ad2b/src/main/scala/nl.knaw.dans.easy.export/FedoraProvider.scala
// variant of https://github.com/DANS-KNAW/easy-deposit-agreement-creator/blob/e718655515ad5d597fd227bc29776c074a959f00/src/main/scala/nl/knaw/dans/easy/agreement/datafetch/Fedora.scala#L52
/**
 * Searches for the object that is registered as jumpoff page for a dataset.
 *
 * @param datasetId the dataset to find the jumpoff page for
 * @return the part after the last slash of the second line of the search result,
 *         None when the search result has less than two lines
 */
def getJumpoff(datasetId: String): Try[Option[String]] = {
  val query =
    s"""
       |PREFIX dans: <http://dans.knaw.nl/ontologies/relations#>
       |SELECT ?s WHERE {?s dans:isJumpoffPageFor <info:fedora/$datasetId> . }
       |""".stripMargin
  search(query).map { lines =>
    // the first line of the result is skipped
    val ids = lines.drop(1).map { line => line.split("/").last }
    ids.headOption
  }
}

private def search(query: String): Try[Seq[String]] = {
trace(query)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ import scala.xml.transform.{ RewriteRule, RuleTransformer }

/** Constants for version 2 of the DDM format. */
object DdmTransformer {
  /** Namespace of DDM version 2. */
  val ddmV2namespace = "http://schemas.dans.knaw.nl/dataset/ddm-v2/"
  /** Location of the DDM version 2 schema: the ddm xsd, not the provenance xsd. */
  val ddmV2Location = "https://schemas.dans.knaw.nl/md/ddm/v2/ddm.xsd"
}
class DdmTransformer(cfgDir: File,
target: String,
collectionsMap: Map[String, Seq[Elem]] = Map.empty,
collectionsMap: Map[String, Seq[Node]] = Map.empty,
ddmVersion:DdmVersion = V1,
) extends DebugEnhancedLogging {
trace(())
Expand Down Expand Up @@ -142,7 +142,7 @@ class DdmTransformer(cfgDir: File,
else containsPrivacySensitiveData match {
case "true" => <ddm:personalData present="Yes" />
case "false" => <ddm:personalData present="No" />
case _ => <ddm:personalData present="unknown" />
case _ => <ddm:personalData present="Unknown" />
}
}

Expand Down
38 changes: 35 additions & 3 deletions src/test/scala/nl.knaw.dans.easy.bag2deposit/CollectionSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,46 @@
*/
package nl.knaw.dans.easy.bag2deposit

import nl.knaw.dans.easy.bag2deposit.Fixture.{ DdmSupport, FileSystemSupport, SchemaSupport }
import nl.knaw.dans.easy.bag2deposit.Fixture.{ AppConfigSupport, DdmSupport, FileSystemSupport, SchemaSupport }
import nl.knaw.dans.easy.bag2deposit.collections.Collection
import org.scalamock.scalatest.MockFactory
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class CollectionSpec extends AnyFlatSpec with DdmSupport with SchemaSupport with Matchers with FileSystemSupport with MockFactory {
import scala.util.Try
import scala.xml.Node

class CollectionSpec extends AnyFlatSpec with DdmSupport with SchemaSupport with Matchers with FileSystemSupport with MockFactory with AppConfigSupport{
override val schema = "https://raw.githubusercontent.com/DANS-KNAW/easy-schema/eade34a3c05669d05ec8cdbeb91a085d83c6c030/lib/src/main/resources/md/2021/02/ddm.xsd"

// Checks that getSeriesNode takes the description from the EMD of the first collection id
// when that id is part of the series set.
"getSeriesNode" should "return a description Node" in {
// FoXml returned by the mocked provider: a single EMD stream with a title and a description.
// NOTE(review): LABEL and the version ID of the stream read "Administrative metadata"/"AMD.0"
// although the stream ID is EMD — looks like a copied fixture, confirm it is irrelevant for getEmd.
val loadFoXmlResult =
<foxml:digitalObject xsi:schemaLocation="info:fedora/fedora-system:def/foxml# http://www.fedora.info/definitions/1/0/foxml1-1.xsd" PID="easy-dataset:17" VERSION="1.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:foxml="info:fedora/fedora-system:def/foxml#">
<foxml:datastream VERSIONABLE="false" CONTROL_GROUP="X" STATE="A" ID="EMD">
<foxml:datastreamVersion SIZE="4820" MIMETYPE="text/xml" CREATED="2021-06-04T12:06:56.477Z" LABEL="Administrative metadata for this dataset" ID="AMD.0">
<foxml:xmlContent>
<emd:easymetadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dct="http://purl.org/dc/terms/" xmlns:eas="http://easy.dans.knaw.nl/easy/easymetadata/eas/" xmlns:emd="http://easy.dans.knaw.nl/easy/easymetadata/" emd:version="0.1">
<emd:title>
<dc:title>Getuigen Verhalen, Burgerarbeiders in kamp Vught, interview 05</dc:title>
</emd:title>
<emd:description>
<dc:description>Herinneringen van inwoners uit Vught en Cromvoirt aan het bestaan van een concentratiekamp in hun directe omgeving.</dc:description>
</emd:description>
</emd:easymetadata>
</foxml:xmlContent>
</foxml:datastreamVersion>
</foxml:datastream>
</foxml:digitalObject>

val expected = <ddm:description descriptionType="SeriesInformation">
Herinneringen van inwoners uit Vught en Cromvoirt aan het bestaan van een concentratiekamp in hun directe omgeving.
</ddm:description>
val fedoraProvider = mock[MockFedoraProvider]
// only the first id of the list is expected to be loaded, and only once
(fedoraProvider.loadFoXml _).expects("easy-dataset:33834").returning(Try(loadFoXmlResult)).once()
val series : Node = Collection.getSeriesNode(List("easy-dataset:33834","easy-dataset:33976"), Some(fedoraProvider), Set("easy-dataset:33834"))
// compares the trimmed text only; element name and attributes are not verified here
series.text.trim shouldBe expected.text.trim
}

"getCollectionsMap" should "not stumble over <br> and combine multiple datasets into a single collection" in {
val originalCsv =
"""naam,EASY-dataset-id,type,opmerkingen,members
Expand All @@ -41,6 +72,7 @@ class CollectionSpec extends AnyFlatSpec with DdmSupport with SchemaSupport with
subjectScheme="DANS Collection"
>Diachron bv</ddm:inCollection>

Collection.getCollectionsMap(cfgDir).get("easy-dataset:64188").head.head shouldBe sampleElem

Collection.getCollectionsMap(cfgDir, Some(null)).get("easy-dataset:64188").head.head shouldBe sampleElem
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import scala.xml.{ Utility, XML }
class ProvenanceSpec extends AnyFlatSpec with FileSystemSupport with XmlSupport with Matchers with FixedCurrentDateTimeSupport with DebugEnhancedLogging with SchemaSupport with AppConfigSupport {
// use the raw github location while upgraded schema is not yet published, your own fork if not yet merged.
private val schemaRoot = "https://easy.dans.knaw.nl/schemas"
// schemaRoot has no trailing slash, so the separator has to be part of the relative path;
// without it the location reads ".../schemasbag/metadata/prov/provenance.xsd"
override val schema: String = schemaRoot + "/bag/metadata/prov/provenance.xsd"
// namespace and location of the schema, separated by a space as required for xsi:schemaLocation
private val schemaLocation = s"http://easy.dans.knaw.nl/schemas/bag/metadata/prov/ $schema"

// FixedCurrentDateTimeSupport is not effective for a val
Expand Down

0 comments on commit 04c9c80

Please sign in to comment.