Skip to content

Commit

Permalink
Merge branch 'feature-essdive-suite-#423' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
gothub committed Feb 24, 2022
2 parents 4e0f89b + b7435cd commit 17f2f48
Show file tree
Hide file tree
Showing 11 changed files with 4,194 additions and 5 deletions.
5 changes: 3 additions & 2 deletions build.properties
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Ant build properties files for the metadig-checks build

# MetaDIG checks version
metadig-checks.version=0.2.6
metadig-checks.version=0.4.0

build=build
dist=dist
Expand All @@ -12,4 +12,5 @@ data=data
stagescript=stageFiles.py
# The suites to include in the distribution tar file
#suites=arctic-data-center.xml,ess-dive.xml,FAIR-suite.xml,knb-suite.xml
suites=FAIR-suite.xml
#suites=FAIR-suite.xml
suites=ess-dive-1.1.0.xml
3,532 changes: 3,532 additions & 0 deletions data/ess-dive-projects.json

Large diffs are not rendered by default.

File renamed without changes.
119 changes: 119 additions & 0 deletions src/checks/entity.type.nonpropriety-1.0.0.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
<?xml version="1.0" encoding="UTF-8"?>
<mdq:check xmlns:mdq="https://nceas.ucsb.edu/mdqe/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://nceas.ucsb.edu/mdqe/v1 ../schemas/schema1.xsd">
<id>entity.type.nonproprietary-1.0.0</id>
<name>Non proprietary entity format</name>
<description>Check that all entities use non-proprietary formats.</description>
<type>Reusable</type>
<level>REQUIRED</level>
<environment>python</environment>
<code><![CDATA[
def call():
    """Check that no data entity uses a format flagged as proprietary.

    Reads the globals 'entityTypes' (entity format values selected from the
    metadata) and 'mdq_params' (must provide 'metadigDataDir'), and writes the
    globals 'output' (message) and 'status' ("SUCCESS", "FAILURE" or "ERROR").

    The list of known formats is read from the file 'DataONEformats.csv' in
    the checks data directory. According to the original author's notes this
    file is a reformatted copy of the DataONE format list (obtained from
    'https://cn.dataone.org/cn/v2/formats') that is manually edited to mark
    specific formats as proprietary.

    If any entity has a Microsoft Excel media type, a tip is appended to the
    message reminding the author to also provide CSV versions, since it is
    not possible to know whether the Excel tabs were exported to CSV.

    Returns True when no proprietary format is found, otherwise False.

    NOTE: this code uses the Python 2 name 'unicode', as the original did.
    """
    global output
    global status
    global mdq_params
    global entityTypes
    global entityNames  # declared by a selector; not otherwise used here

    import csv

    maxPrint = 5
    excelMediaTypes = ("application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
    excelTip = u" TIP: If you have not already, upload a csv version for any excel file(s) included."
    # Values of the 'proprietary' column (fifth column) that mean "yes"
    trueValues = ("yes", "y", "true", "t", "1")

    # Are any entity formats present?
    if 'entityTypes' not in globals() or entityTypes is None:
        output = "No data entities (files) were found so unable to check for proprietary formats."
        status = "FAILURE"
        return False

    # The checks data directory is passed via the 'mdq_params' hash.
    # The filename is known only to this check.
    if 'mdq_params' not in globals():
        output = "Internal error running check, mdq_params not available to check."
        status = "ERROR"
        return False

    formatsFile = "{}/{}".format(mdq_params['metadigDataDir'], "DataONEformats.csv")

    # Collect, once, the normalized identifiers (third and fourth columns) of
    # every format marked as proprietary, instead of re-scanning all rows for
    # each entity. A list (not a set) is used so that membership is decided
    # by '==' exactly as in the row-by-row comparison.
    proprietaryFormats = []
    try:
        with open(formatsFile, 'rb') as csvfile:
            for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
                # Blank or truncated lines have no 'proprietary' column
                if len(row) < 5:
                    continue
                if row[4].lower().strip() in trueValues:
                    proprietaryFormats.append(row[2].lower().strip())
                    proprietaryFormats.append(row[3].lower().strip())
    except (IOError, OSError) as err:
        output = "Internal error running check, unable to read the formats file '{}': {}".format(formatsFile, err)
        status = "ERROR"
        return False

    # Imported here so the guard clauses above do not depend on it
    import metadig.variable as mvar

    entityTypes = mvar.toUnicode(entityTypes)
    # If only a single value is returned (vs type "list"), then convert to a
    # list for easier processing
    if isinstance(entityTypes, unicode):
        entityTypes = [entityTypes]

    # Check each entity format and see if it is in the 'proprietary' list,
    # which is based on all formats from DataONE that have been manually
    # determined to be proprietary
    proprietaryFound = []
    excelFileFound = False
    for entityType in entityTypes:
        thisFormat = entityType.strip()
        normalized = thisFormat.lower()
        if normalized in proprietaryFormats:
            proprietaryFound.append(thisFormat)
        # Check if an Excel file was encountered
        if normalized in excelMediaTypes:
            excelFileFound = True

    if proprietaryFound:
        # Keep the values as text: encoding them to UTF-8 bytes before
        # formatting them into a unicode template fails for non-ASCII values.
        # Sorting makes the message reproducible from run to run.
        fmts = sorted(set(proprietaryFound))
        output = u"It is recommended that non-proprietary file formats be used where possible. These {} proprietary data formats (out of {} total formats) were found: {}".format(len(fmts), len(entityTypes), u', '.join(fmts[0:maxPrint]))
        output += u", ..." if len(fmts) > maxPrint else u"."
        if excelFileFound:
            output += excelTip
        status = "FAILURE"
        return False

    output = u"No proprietary data formats found (out of {} total formats).".format(len(entityTypes))
    if excelFileFound:
        output += excelTip
    status = "SUCCESS"
    return True
]]></code>
<selector>
<name>entityTypes</name>
<xpath>/eml/dataset/otherEntity/entityType</xpath>
</selector>
<selector>
<name>entityNames</name>
<!-- The EML element holding an entity's name is 'entityName' (singular). -->
<xpath>/eml/dataset/otherEntity/entityName</xpath>
</selector>
<dialect>
<name>Ecological Metadata Language</name>
<xpath>boolean(/*[local-name() = 'eml'])</xpath>
</dialect>
</mdq:check>
4 changes: 2 additions & 2 deletions src/checks/identifierIsPresent.xml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
<code><![CDATA[
if (length(identifier) > 0) {
mdq_result <- list(status = "SUCCESS",
output = list(list(value = "An identifier is present.")))
output = list(list(value = "An identifier associated with this metadata document is present.")))
} else {
mdq_result <- list(status = "FAILURE",
output = list(list(value = "An identifier is not present.")))
output = list(list(value = "An identifier associated with this metadata document is not present.")))
}
]]>
</code>
Expand Down
101 changes: 101 additions & 0 deletions src/checks/metadata.identifier.resolvable-1.1.0.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
<?xml version="1.0" encoding="UTF-8"?>
<mdq:check xmlns:mdq="https://nceas.ucsb.edu/mdqe/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://nceas.ucsb.edu/mdqe/v1 ../schemas/schema1.xsd">
<id>metadata.identifier.resolvable-1.1.0</id>
<name>Metadata Identifier Resolvable</name>
<description>Check that the metadata identifier exists and is resolvable.</description>
<type>Accessible</type>
<level>REQUIRED</level>
<environment>python</environment>
<code><![CDATA[
def call():
    """Check that the metadata identifier exists and is resolvable.

    Reads the global 'metadataIdentifier' and writes the globals 'output'
    (message) and 'status' ("SUCCESS" or "FAILURE").

    How the identifier is resolved depends on its form:
      * it looks like an http(s) URL: it is checked directly;
      * it starts with "doi:": it is checked through https://dx.doi.org;
      * anything else: it is checked through the DataONE resolve service.

    An identifier that the DataONE resolve service reports as unauthorized
    is treated as resolvable but not publicly readable, and passes.

    Returns True when status is "SUCCESS", otherwise False.

    NOTE: this code uses the Python 2 function 'urllib.quote', as the
    original did.
    """
    global output
    global status
    global metadataIdentifier

    d1_resolve_service = "https://cn.dataone.org/cn/v2/resolve/"

    # check if a metadata identifier is present
    if 'metadataIdentifier' not in globals() or metadataIdentifier is None:
        output = "A metadata identifier was not found."
        status = "FAILURE"
        return False

    # Imported here so the guard clause above does not depend on them
    import re
    import urllib
    import metadig.checks as checks
    import metadig.variable as mvar

    metadataIdentifier = mvar.toUnicode(metadataIdentifier)
    # This should only be a single value, but if not (a list is returned)
    # just get the first one
    if isinstance(metadataIdentifier, list):
        if not metadataIdentifier:
            # An empty list means nothing was selected
            output = "A metadata identifier was not found."
            status = "FAILURE"
            return False
        metadataIdentifier = metadataIdentifier[0]

    if mvar.isBlank(metadataIdentifier):
        output = "The metadata identifier is blank."
        status = "FAILURE"
        return False

    # Common start of every message below. It has no trailing space so the
    # continuations (" and ...", ", but ...") join cleanly.
    found = u"The metadata identifier '{}' was found".format(metadataIdentifier)

    # Surrounding whitespace is never part of the identifier; remove it once
    # so that every branch below sees the same value.
    identifier = metadataIdentifier.strip()

    # Now check if the metadata identifier is a resolvable url. If it doesn't
    # look like a URL, then see if DataONE knows about it.
    usedD1 = False
    isDOI = False
    if re.match(r"^http.*:/", identifier):
        resolvable, msg = checks.isResolvable(identifier)
    elif re.match(r"doi:", identifier):
        isDOI = True
        # If the identifier is a 'bare' DOI (e.g. "doi:10.18739/A2027H"),
        # then prepend with a DOI resolver link, i.e. https://dx.doi.org
        resolvable, msg = checks.isResolvable("https://dx.doi.org/{}".format(identifier))
    else:
        usedD1 = True
        # Quote the UTF-8 bytes: quoting a unicode value that contains
        # non-ASCII characters raises KeyError in Python 2.
        url = "{}{}".format(d1_resolve_service, urllib.quote(identifier.encode('utf-8')))
        resolvable, msg = checks.isResolvable(url)

    if resolvable:
        if usedD1:
            output = u'{} and is resolvable using the DataONE resolve service.'.format(found)
        elif isDOI:
            output = u'{} and is resolvable using a DOI resolver.'.format(found)
        else:
            output = u'{} and is resolvable.'.format(found)
        status = "SUCCESS"
        return True

    # If the URL is unresolvable because it is private, and it is a DataONE
    # identifier, then this special case will pass. Print an appropriate
    # message explaining this.
    isPrivate = msg is not None and re.search("unauthorized", msg.lower())
    if isPrivate and usedD1:
        output = u'{} and is resolvable using the DataONE resolve service, but is not publicly readable.'.format(found)
        status = "SUCCESS"
        # Return value agrees with the SUCCESS status, like the other
        # success paths
        return True

    output = u"{}, but is not resolvable.".format(found)
    status = "FAILURE"
    return False
]]></code>
<selector>
<name>metadataIdentifier</name>
<xpath>
/resource/identifier |
/*/fileIdentifier/*/text()[normalize-space()] |
/eml/@packageId
</xpath>
</selector>
<dialect>
<name>DataCite 4</name>
<xpath>boolean(/*[local-name() = 'resource'])</xpath>
</dialect>
<dialect>
<name>Ecological Metadata Language</name>
<xpath>boolean(/*[local-name() = 'eml'])</xpath>
</dialect>
<dialect>
<name>ISO 19115 and ISO 19115-2 / ISO 19139 and ISO 19139-2</name>
<xpath>boolean(/*[local-name() = 'MI_Metadata' or local-name() = 'MD_Metadata'])</xpath>
</dialect>
</mdq:check>
2 changes: 1 addition & 1 deletion src/checks/metadata.identifier.resolvable.xml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def call():
status = "FAILURE"
return False
else:
output = u"The metadata identifier '{}' was found)".format(metadataIdentifier)
output = u"The metadata identifier '{}' was found ".format(metadataIdentifier)
id = metadataIdentifier
# Now check if the metadata identifier is a resolvable url. If it doesn't look like a URL, then
Expand Down
127 changes: 127 additions & 0 deletions src/checks/resource.URLs.resolvable-1.0.0.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
<?xml version="1.0" encoding="UTF-8"?>
<mdq:check xmlns:mdq="https://nceas.ucsb.edu/mdqe/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://nceas.ucsb.edu/mdqe/v1 ../schemas/schema1.xsd">
<id>resource.URLs.resolvable-1.0.0</id>
<name>Resource URLs Resolvable</name>
<description>Check that the URLs found in the metadata text fields are resolvable.</description>
<type>Accessible</type>
<level>REQUIRED</level>
<environment>python</environment>
<code><![CDATA[
def call():
    """Check that every URL mentioned in the metadata text fields resolves.

    Reads the global 'textFields' (text selected from the abstract, location
    description, methods and additional information) and writes the globals
    'output' (message), 'status' ("SUCCESS" or "FAILURE"), 'uniqueUrls'
    (the distinct URLs found) and 'unresolvableUrls' (those that failed).

    Steps:
      * join all text fields into one string;
      * split it into tokens, treating commas and semicolons as separators;
      * keep the http(s) URL contained in each token, without surrounding
        punctuation;
      * remove duplicates and test each URL with 'checks.isResolvable'.

    Returns True when no URL is found or all URLs resolve, otherwise False.
    """
    global output
    global status
    global textFields
    global uniqueUrls
    global unresolvableUrls

    maxPrint = 3

    if 'textFields' not in globals() or textFields is None:
        output = "Unable to retrieve required text fields."
        status = "FAILURE"
        return False

    # Imported here so the guard clause above does not depend on them
    import re
    import metadig.checks as checks
    import metadig.variable as mvar

    if mvar.isBlank(textFields):
        output = "The required text fields are blank."
        status = "FAILURE"
        return False

    # Convert to unicode so that non-ascii characters don't cause decoding
    # errors
    textFields = mvar.toUnicode(textFields)

    # The text fields can be a textType element, so it may contain multiple
    # subelements, i.e. <para>, etc. The original author notes that the
    # engine's XPath 1.0 cannot gather these into a single string, so they
    # are joined here.
    if isinstance(textFields, list):
        textFields = ' '.join(textFields)

    urlPattern = re.compile(r"https?://\S+")

    def extractUrls(text):
        """Return the http(s) URLs found in 'text', in order of appearance."""
        # Convert separator characters to spaces to assist parsing
        text = text.replace(",", " ").replace(";", " ")
        found = []
        # split() with no argument also handles tabs and line breaks
        for token in text.split():
            # search() ignores anything stuck in front of the URL, e.g.
            # "(see:https://example.org)"
            match = urlPattern.search(token)
            if match is None:
                continue
            # Drop sentence punctuation and closing brackets or quotes in
            # any order, e.g. "https://example.org.)"
            candidate = match.group(0).rstrip(".()[]<>\"'")
            # Skip tokens that had nothing but punctuation after "://"
            if urlPattern.match(candidate):
                found.append(candidate)
        return found

    # Sorted so that the messages below are reproducible from run to run
    uniqueUrls = sorted(set(extractUrls(textFields)))

    # Check each unique URL to see if it is resolvable. The original author
    # notes that 'isResolvable' sends an HTTP 'Head' request to the URL.
    unresolvableUrls = []
    for url in uniqueUrls:
        resolvable, msg = checks.isResolvable(url)
        if not resolvable:
            unresolvableUrls.append(url)

    # Print an error message if unresolved URLs were found, listing at most
    # the first 'maxPrint' of them.
    if unresolvableUrls:
        if len(unresolvableUrls) == 1:
            output = u'1 of {} URLs provided in the metadata does not resolve correctly: {}'.format(len(uniqueUrls), unresolvableUrls[0])
        elif len(unresolvableUrls) <= maxPrint:
            output = u'{} of {} URLs provided in the metadata do not resolve correctly: {}'.format(len(unresolvableUrls), len(uniqueUrls), ', '.join(unresolvableUrls))
        else:
            output = u'{} of {} URLs provided in the metadata do not resolve correctly, here are the first {}: {}'.format(len(unresolvableUrls), len(uniqueUrls), maxPrint, ', '.join(unresolvableUrls[0:maxPrint]))
            output += u", ..."
        status = "FAILURE"
        return False

    # Print out success message.
    if len(uniqueUrls) == 0:
        output = u'No URLs were found in the metadata.'
    elif len(uniqueUrls) == 1:
        output = u'The one URL found in the metadata resolves correctly.'
    else:
        output = u'All {} URLs found in the metadata resolve correctly.'.format(len(uniqueUrls))
    status = "SUCCESS"
    return True
]]></code>
<selector>
<name>textFields</name>
<!--Get all URLs from these fields abstract, location description, methods, and related references. -->
<!-- Each predicate calls the XPath function normalize-space() to skip whitespace-only text nodes. Without the parentheses the predicate is a test for a child element named 'normalize-space', which never matches a text node. -->
<xpath>
/eml/dataset/abstract//text()[normalize-space()] |
/eml/dataset/abstract//ulink/@url |
/eml/dataset/coverage/geographicCoverage/geographicDescription/text()[normalize-space()] |
/eml/dataset/methods/methodStep/description//text()[normalize-space()] |
/eml/dataset/additionalInfo//text()[normalize-space()]
</xpath>
</selector>
<dialect>
<name>Ecological Metadata Language</name>
<xpath>boolean(/*[local-name() = 'eml'])</xpath>
</dialect>
</mdq:check>
Loading

0 comments on commit 17f2f48

Please sign in to comment.