Merge branch 'feature-essdive-suite-#423' into develop

NCEAS · Feb 24, 2022 · 17f2f48 · 17f2f48
2 parents 4e0f89b + b7435cd
commit 17f2f48
Show file tree

Hide file tree

Showing 11 changed files with 4,194 additions and 5 deletions.
diff --git a/build.properties b/build.properties
@@ -1,7 +1,7 @@
 # Ant build properties files for the metadig-checks build
 
 # MetaDIG checks version
-metadig-checks.version=0.2.6
+metadig-checks.version=0.4.0
 
 build=build
 dist=dist
@@ -12,4 +12,5 @@ data=data
 stagescript=stageFiles.py
 # The suites to include in the distribution tar file
 #suites=arctic-data-center.xml,ess-dive.xml,FAIR-suite.xml,knb-suite.xml
-suites=FAIR-suite.xml
+#suites=FAIR-suite.xml
+suites=ess-dive-1.1.0.xml
diff --git a/data/ess-dive-projects.json b/data/ess-dive-projects.json
diff --git a/src/checks/entity.present.check-2.0.0.xml → src/checks/entity.present.check-1.0.1.xml b/src/checks/entity.present.check-2.0.0.xml → src/checks/entity.present.check-1.0.1.xml
diff --git a/src/checks/entity.type.nonpropriety-1.0.0.xml b/src/checks/entity.type.nonpropriety-1.0.0.xml
@@ -0,0 +1,119 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<mdq:check xmlns:mdq="https://nceas.ucsb.edu/mdqe/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://nceas.ucsb.edu/mdqe/v1 ../schemas/schema1.xsd">
+   <id>entity.type.nonproprietary-1.0.0</id>
+   <name>Non proprietary entity format</name>
+   <description>Check that all entities use non-proprietary formats.</description>
+   <type>Reusable</type>
+   <level>REQUIRED</level>
+   <environment>python</environment>
+   <code><![CDATA[
+def call():
+  global output
+  global status
+  global mdq_params
+  global entityTypes
+  global entityNames
+  
+  # Check the data formats for all data entities.
+  # The check fails if the specified data format matches a format marked as proprietary.
+  # This check uses a reformatted copy of the DataONE format list, that is usually kept in the file 
+  # /opt/local/metadig/DataONEformats.csv. This file is manually edited to mark specific formats as proprietary. This file is obtained using the DataONE 'formats'
+  # service, i.e 'https://cn.dataone.org/cn/v2/formats'.
+  
+  # An additional step is made in this check - if any entities with a Microsoft Excel mediaType are found, then an informational message is printed as a tip to 
+  # ensure that CSV files are included that correspond to the Excel file. As it is not possible to know if the dataset author has exported tabs from the
+  # Excel file into more reusable CSV format, this tip is printed.
+  
+  import metadig.variable as mvar
+  import csv
+  maxPrint = 5
+  excelMediaTypes = ("application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+  excelFileFound = False
+  
+  def isProprietary(formats, thisFormat):
+    for row in formats:
+      if (row[4].lower().strip() in ("yes", "y", "true", "t", "1")):
+        if(row[2].lower().strip() == thisFormat.lower().strip()):
+          return True
+        if(row[3].lower().strip() == thisFormat.lower().strip()):
+          return True
+        
+    return False
+  
+  # Are any entity formats present?
+  if ('entityTypes' not in globals() or entityTypes is None):
+    output = "No data entities (files) were found so unable to check for proprietary formats."
+    status = "FAILURE"
+    return False
+    
+  dataFilename = "DataONEformats.csv"
+  formatsFile = ""
+  # The checks data directory is passed via the 'mdq_params' hash
+  # The filename is known only to this check.
+  if('mdq_params' not in globals()):
+    output = "Internal error running check, mdq_params not available to check."
+    status = "ERROR"
+    return False
+  else:
+    formatsFile = "{}/{}".format(mdq_params['metadigDataDir'], dataFilename)
+  
+  # Create list with the DataONE formats
+  formats = []
+  with open(formatsFile, 'rb') as csvfile:
+      fmtreader = csv.reader(csvfile, delimiter=',', quotechar='"')
+      for row in fmtreader:
+        formats.append(row)
+
+  entityTypes = mvar.toUnicode(entityTypes)
+  
+  # If only a single value is returned (vs type "list"), then convert to a list
+  # for easier processing
+  if(isinstance(entityTypes, unicode)):
+    entityTypes = [entityTypes]
+  
+  proprietaryFound = []
+        
+  # Check each entity format and see if it is in the 'proprietary' list, which
+  # is based on all formats from DataONE that have been manually determined t  o be
+  # proprietary
+  for i in range(0, len(entityTypes)):
+    # Check if the entity format is a single string or arrayList
+    thisFormat = entityTypes[i].strip()
+    if(isProprietary(formats, thisFormat)):
+      proprietaryFound.append(thisFormat)
+    # Check if an Excel file was encountered
+    if(thisFormat.lower().strip() in excelMediaTypes):
+      excelFileFound = True
+      
+  if(len(proprietaryFound) > 0):
+    fmts = list(set([f.encode('UTF8') for f in proprietaryFound]))
+    output = u"It is recommended that non-proprietary file formats be used where possible. These {} proprietary data formats (out of {} total formats) were found: {}".format(len(fmts), len(entityTypes), ', '.join(fmts[0:maxPrint]))
+    if(len(fmts) > maxPrint):
+      output += u", ..."
+    else:
+      output += u"."
+      
+    if(excelFileFound):
+      output += u" TIP: If you have not already, upload a csv version for any excel file(s) included."
+    status = "FAILURE"
+    return False
+  else:
+    output = "No proprietary data formats found (out of {} total formats).".format(len(entityTypes))
+    if(excelFileFound):
+      output += u" TIP: If you have not already, upload a csv version for any excel file(s) included."
+    status = "SUCCESS"
+    return True
+      ]]></code>
+    <selector>
+      <name>entityTypes</name>
+      <xpath>/eml/dataset/otherEntity/entityType</xpath>
+    </selector>
+    <selector>
+      <name>entityNames</name>
+      <!-- The EML element holding an entity's name is 'entityName' (singular). -->
+      <xpath>/eml/dataset/otherEntity/entityName</xpath>
+    </selector>
+   <dialect>
+      <name>Ecological Metadata Language</name>
+      <xpath>boolean(/*[local-name() = 'eml'])</xpath>
+   </dialect>
+</mdq:check>
diff --git a/src/checks/identifierIsPresent.xml b/src/checks/identifierIsPresent.xml
@@ -9,10 +9,10 @@
   <code><![CDATA[
 if (length(identifier) > 0) {
 mdq_result <- list(status = "SUCCESS",
-                   output = list(list(value = "An identifier is present.")))
+                   output = list(list(value = "An identifier associated with this metadata document is present.")))
 } else {
 mdq_result <- list(status = "FAILURE",
-                   output = list(list(value = "An identifier is not present.")))
+                   output = list(list(value = "An identifier associated with this metadata document is not present.")))
 }
     ]]>
   </code>

diff --git a/src/checks/metadata.identifier.resolvable-1.1.0.xml b/src/checks/metadata.identifier.resolvable-1.1.0.xml
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<mdq:check xmlns:mdq="https://nceas.ucsb.edu/mdqe/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://nceas.ucsb.edu/mdqe/v1 ../schemas/schema1.xsd">
+   <id>metadata.identifier.resolvable-1.1.0</id>
+   <name>Metadata Identifier Resolvable</name>
+   <description>Check that the metadata identifier exists and is resolvable.</description>
+   <type>Accessible</type>
+   <level>REQUIRED</level>
+   <environment>python</environment>
+   <code><![CDATA[
+def call():
+  global output
+  global status
+
+  import metadig.variable as mvar
+  import metadig.checks as checks
+  import urllib
+  import re
+  global metadataIdentifier
+
+  d1_resolve_service="https://cn.dataone.org/cn/v2/resolve/"
+
+  # check if a metadata identifier is present
+  if 'metadataIdentifier' not in globals() or metadataIdentifier is None:
+    output = "A metadata identifier was not found."
+    status = "FAILURE"
+    return False
+    
+  metadataIdentifier = mvar.toUnicode(metadataIdentifier)
+  
+  # This should only be a single value, but if not (a list is returned) just get the first 
+  # one
+  if(isinstance(metadataIdentifier, list)):
+    metadataIdentifier = metadataIdentifier[0]
+
+  if (mvar.isBlank(metadataIdentifier)):
+    output = "The metadata identifier is blank."
+    status = "FAILURE"
+    return False
+  else:
+    output = u"The metadata identifier '{}' was found ".format(metadataIdentifier)
+    id = metadataIdentifier
+    
+  # Now check if the metadata identifier is a resolvable url. If it doesn't look like a URL, then 
+  # see if DataONE knows about it.
+  usedD1 = False
+  isDOI = False
+  if(re.match("^\s*http.*:\/", id)):
+    resolvable, msg = checks.isResolvable(id)
+  elif(re.match('doi:', id)):
+    isDOI = True
+    # If the identifier is a 'bare' DOI (e.g. "doi:10.18739/A2027H"), then prepend with a DOI resolver link
+    # i.e. https://dx.doi.org
+    resolvable, msg = checks.isResolvable("https://dx.doi.org/{}".format(id.strip()))
+  else:
+    usedD1 = True
+    url = "{}{}".format(d1_resolve_service,urllib.quote(id))
+    resolvable, msg = checks.isResolvable(url)
+        
+  if (resolvable):
+    if(usedD1):
+      output = u'{} and is resolvable using the DataONE resolve service.'.format(output)
+    elif(isDOI):
+      output = u'{} and is resolvable using a DOI resolver.'.format(output)      
+    else:
+      output = u'{} and is resolvable.'.format(output)      
+          
+    status = "SUCCESS"
+    return True
+  else:
+    # If the URL is unresolvable because it is private, and it is a DataONE identifier, then this
+    # special case will pass. Print an appropriate messge explaining this.
+    isPrivate = re.search("unauthorized", msg.lower())
+    if(isPrivate and usedD1):
+      output = u'{} and is resolvable using the DataONE resolve service, but is not publicly readable'.format(output)
+      status = "SUCCESS"
+    else:  
+      output = u"{}, but is not resolvable.".format(output)      
+      status = "FAILURE"
+      return False
+   ]]></code>
+   <selector>
+      <name>metadataIdentifier</name>
+      <xpath>
+             /resource/identifier |
+             /*/fileIdentifier/*/text()[normalize-space()] |
+             /eml/@packageId
+      </xpath>
+   </selector>
+   <dialect>
+      <name>DataCite 4</name>
+      <xpath>boolean(/*[local-name() = 'resource'])</xpath>
+   </dialect>
+   <dialect>
+      <name>Ecological Metadata Language</name>
+      <xpath>boolean(/*[local-name() = 'eml'])</xpath>
+   </dialect>
+   <dialect>
+      <name>ISO 19115 and ISO 19115-2 / ISO 19139 and ISO 19139-2</name>
+      <xpath>boolean(/*[local-name() = 'MI_Metadata' or local-name() = 'MD_Metadata'])</xpath>
+   </dialect>
+</mdq:check>
diff --git a/src/checks/metadata.identifier.resolvable.xml b/src/checks/metadata.identifier.resolvable.xml
@@ -37,7 +37,7 @@ def call():
     status = "FAILURE"
     return False
   else:
-    output = u"The metadata identifier '{}' was found)".format(metadataIdentifier)
+    output = u"The metadata identifier '{}' was found ".format(metadataIdentifier)
     id = metadataIdentifier
     
   # Now check if the metadata identifier is a resolvable url. If it doesn't look like a URL, then 

diff --git a/src/checks/resource.URLs.resolvable-1.0.0.xml b/src/checks/resource.URLs.resolvable-1.0.0.xml
@@ -0,0 +1,127 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<mdq:check xmlns:mdq="https://nceas.ucsb.edu/mdqe/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://nceas.ucsb.edu/mdqe/v1 ../schemas/schema1.xsd">
+   <id>resource.URLs.resolvable-1.0.0</id>
+   <name>Resource URLs Resolvable</name>
+   <description>Check that the URLs found in the metadata text fields are resolvable.</description>
+   <type>Accessible</type>
+   <level>REQUIRED</level>
+   <environment>python</environment>
+   <code><![CDATA[
+def call():
+
+  import metadig.variable as mvar
+  import metadig.checks as checks
+  import urllib
+  import re
+  global output
+  global status
+  global textFields
+  global uniqueUrls
+  global unresolvableUrls
+  maxPrint = 3
+
+  urls = []
+
+  # Get all URLs from these fields abstract, location description, methods, and related references
+
+  # first get all fields and concat into one string
+  # tokenize the string
+  # extract all tokens that match a url pattern, e.g. 'http*://', doi:*, https://doi.org, etc
+  # remove duplicates
+  # 
+
+  if 'textFields' not in globals() or textFields is None:
+    output = "Unable to retrieve required text fields."
+    status = "FAILURE"
+    return False
+
+  if(mvar.isBlank(textFields)):
+    output = "The required text fields are blank."
+    status = "FAILURE"
+    return False
+  
+  # Convert to unicode so that non-ascii characters don't cause decoding errors
+  textFields = mvar.toUnicode(textFields)
+  
+  # The text fields can be a textType element, so it may contain multiple subelements, i.e. <para>, etc 
+  # Since the metadig-engine is stuck at XPath 1.0, we cannot use the xpath to gather these into 
+  # a single string.
+  if(isinstance(textFields, list)):
+    textFields = ' '.join(textFields)
+
+  # If the text spans multiple lines, convert line breaks to spaces
+  textFields = textFields.replace('\n', ' ')
+  textFields = textFields.replace('\r', ' ')
+  # Convert separater characters to spaces to assist parsing
+  textFields = textFields.replace(",", " ")
+  textFields = textFields.replace(";", " ")
+  
+  # Tokenize the string and extract possible URLs
+  textTokens = textFields.split(' ')
+  
+  for token in textTokens:
+    token = token.strip()
+    token = token.strip('.')
+    token = token.strip('(')
+    token = token.strip(')')
+    if(re.match(".*http.*:\/\/", token.strip())):
+      urls.append(token.strip())
+    #elif (re.match("^\s*doi:.*", token)):
+      # If the identifier is a 'bare' DOI (e.g. "doi:10.18739/A2027H"), then prepend with a DOI resolver link
+      # i.e. https://dx.doi.org
+      #urls.append("https://dx.doi.org/{}".format(token.strip()))
+      
+  uniqueUrls = list(set(urls))
+  unresolvableUrls = []
+  
+  # Check each unique URL to see if it is resolvable. The 'isResolvable' function sends an HTTP 'Head'
+  # request to the URL.
+  for url in uniqueUrls:
+       resolvable, msg = checks.isResolvable(url)
+       if (not resolvable):
+         unresolvableUrls.append(url);
+
+  # Print errors message if unresolved URLs were found, and printing the first few unresolved.
+  if (len(unresolvableUrls) > 0):
+    if(len(unresolvableUrls) == 1):
+      output = u'1 of {} URLs provided in the metadata does not resolve correctly: {}'.format(len(uniqueUrls), ', '.join(unresolvableUrls[0:maxPrint]))
+    # If unresolved is more than 'maxPrint' URLs, only print first maxPrint entries
+    elif(len(unresolvableUrls) <= maxPrint):
+      output = u'{} of {} URLS provided in the metadadta do not resolve correctly: {}'.format(len(unresolvableUrls), len(uniqueUrls), ', '.join(unresolvableUrls))
+    else:
+      output = u'{} of {} URLs provided in the metadata do not resolve correctly, here are the first {}: {}'.format(len(unresolvableUrls), len(uniqueUrls), maxPrint, ', '.join(unresolvableUrls[0:maxPrint]))
+      output += u", ..."
+      
+    status = "FAILURE"
+    return False
+  else:
+    # Print out success message.
+    if(len(uniqueUrls) == 0):
+      output = u'No URLs were found in the metadata.'
+      status = "SUCCESS"
+      return True
+    elif (len(uniqueUrls) == 1):
+      output = u'The one URL found in the metadata resolves correctly.'
+      status = "SUCCESS"
+      return True
+    else:
+      output = u'All {} URLs found in the metadata resolve correctly.'.format(len(uniqueUrls))
+      status = "SUCCESS"
+      return True
+   ]]></code>
+   <selector>
+      <name>textFields</name>
+      <!-- Gather the text of the abstract, geographic description, method step descriptions and
+           additional info; the check code extracts the URLs from this text.
+           Note: the predicate must be 'normalize-space()' with parentheses - without them XPath
+           treats it as a child element name test, which never matches a text node. -->
+      <xpath>
+          /eml/dataset/abstract//text()[normalize-space()] |
+          /eml/dataset/abstract//ulink/@url |
+          /eml/dataset/coverage/geographicCoverage/geographicDescription/text()[normalize-space()] |
+          /eml/dataset/methods/methodStep/description//text()[normalize-space()] |
+          /eml/dataset/additionalInfo//text()[normalize-space()]
+      </xpath>
+   </selector>
+   <dialect>
+      <name>Ecological Metadata Language</name>
+      <xpath>boolean(/*[local-name() = 'eml'])</xpath>
+   </dialect>
+</mdq:check>