-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'feature-essdive-suite-#423' into develop
- Loading branch information
Showing
11 changed files
with
4,194 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<mdq:check xmlns:mdq="https://nceas.ucsb.edu/mdqe/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://nceas.ucsb.edu/mdqe/v1 ../schemas/schema1.xsd"> | ||
<id>entity.type.nonproprietary-1.0.0</id> | ||
<name>Non proprietary entity format</name> | ||
<description>Check that all entities use non-proprietary formats.</description> | ||
<type>Reusable</type> | ||
<level>REQUIRED</level> | ||
<environment>python</environment> | ||
<code><![CDATA[ | ||
def call():
    """Check that no data entity uses a proprietary file format.

    Reads the globals ``entityTypes`` (the entity format values selected from
    the metadata) and ``mdq_params`` (must provide ``metadigDataDir``), and
    reports the result through the globals ``output`` and ``status``.

    Returns:
        True when no proprietary format was found, False when at least one was
        found or when the check could not be run.
    """
    global output
    global status
    global mdq_params
    global entityTypes
    # Declared because a selector of this name exists; not used by the check itself.
    global entityNames
    # Check the data formats for all data entities.
    # The check fails if the specified data format matches a format marked as proprietary.
    # This check uses a reformatted copy of the DataONE format list, that is usually kept in the file
    # /opt/local/metadig/DataONEformats.csv. This file is manually edited to mark specific formats as
    # proprietary. This file is obtained using the DataONE 'formats' service,
    # i.e. 'https://cn.dataone.org/cn/v2/formats'.
    # An additional step is made in this check - if any entities with a Microsoft Excel mediaType are
    # found, then an informational message is printed as a tip to ensure that CSV files are included
    # that correspond to the Excel file. As it is not possible to know if the dataset author has
    # exported tabs from the Excel file into more reusable CSV format, this tip is printed.
    import metadig.variable as mvar
    import csv

    maxPrint = 5
    excelMediaTypes = ("application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
    excelTip = u" TIP: If you have not already, upload a csv version for any excel file(s) included."
    proprietaryFlags = ("yes", "y", "true", "t", "1")
    # The filename is known only to this check.
    dataFilename = "DataONEformats.csv"

    def loadProprietaryFormats(path):
        """Return the lower-cased values of columns 2 and 3 for every CSV row flagged proprietary in column 4."""
        proprietary = set()
        # Binary mode is what the Python 2 csv module expects.
        with open(path, 'rb') as csvfile:
            for row in csv.reader(csvfile, delimiter=',', quotechar='"'):
                # Blank or truncated rows cannot carry a flag; skip them instead of raising IndexError.
                if len(row) < 5:
                    continue
                if row[4].lower().strip() in proprietaryFlags:
                    # Decode once so comparisons against the unicode entity types are unicode-to-unicode.
                    proprietary.add(row[2].decode('utf-8', 'replace').lower().strip())
                    proprietary.add(row[3].decode('utf-8', 'replace').lower().strip())
        return proprietary

    # Are any entity formats present?
    if 'entityTypes' not in globals() or entityTypes is None:
        output = "No data entities (files) were found so unable to check for proprietary formats."
        status = "FAILURE"
        return False

    # The checks data directory is passed via the 'mdq_params' hash.
    if 'mdq_params' not in globals():
        output = "Internal error running check, mdq_params not available to check."
        status = "ERROR"
        return False

    formatsFile = "{}/{}".format(mdq_params['metadigDataDir'], dataFilename)
    try:
        proprietaryFormats = loadProprietaryFormats(formatsFile)
    except IOError:
        # Report an unreadable formats file the same way as the other internal errors.
        output = "Internal error running check, unable to read the formats file {}.".format(formatsFile)
        status = "ERROR"
        return False

    entityTypes = mvar.toUnicode(entityTypes)
    # If only a single value is returned (vs type "list"), then convert to a list
    # for easier processing
    if isinstance(entityTypes, unicode):
        entityTypes = [entityTypes]

    # Check each entity format and see if it is in the 'proprietary' set, which
    # is based on all formats from DataONE that have been manually determined to be
    # proprietary
    proprietaryFound = []
    excelFileFound = False
    for entityType in entityTypes:
        thisFormat = entityType.strip()
        normalized = thisFormat.lower()
        if normalized in proprietaryFormats:
            proprietaryFound.append(thisFormat)
        # Check if an Excel file was encountered
        if normalized in excelMediaTypes:
            excelFileFound = True

    if proprietaryFound:
        # Keep the values as unicode: joining UTF-8 byte strings into a unicode
        # message fails on non-ASCII formats. Sorting makes the message stable.
        fmts = sorted(set(proprietaryFound))
        output = u"It is recommended that non-proprietary file formats be used where possible. These {} proprietary data formats (out of {} total formats) were found: {}".format(len(fmts), len(entityTypes), u', '.join(fmts[0:maxPrint]))
        if len(fmts) > maxPrint:
            output += u", ..."
        else:
            output += u"."
        if excelFileFound:
            output += excelTip
        status = "FAILURE"
        return False

    output = "No proprietary data formats found (out of {} total formats).".format(len(entityTypes))
    if excelFileFound:
        output += excelTip
    status = "SUCCESS"
    return True
]]></code> | ||
<selector> | ||
<name>entityTypes</name> | ||
<xpath>/eml/dataset/otherEntity/entityType</xpath> | ||
</selector> | ||
<selector> | ||
<name>entityNames</name> | ||
<xpath>/eml/dataset/otherEntity/entityName</xpath> | ||
</selector> | ||
<dialect> | ||
<name>Ecological Metadata Language</name> | ||
<xpath>boolean(/*[local-name() = 'eml'])</xpath> | ||
</dialect> | ||
</mdq:check> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<mdq:check xmlns:mdq="https://nceas.ucsb.edu/mdqe/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://nceas.ucsb.edu/mdqe/v1 ../schemas/schema1.xsd"> | ||
<id>metadata.identifier.resolvable-1.1.0</id> | ||
<name>Metadata Identifier Resolvable</name> | ||
<description>Check that the metadata identifier exists and is resolvable.</description> | ||
<type>Accessible</type> | ||
<level>REQUIRED</level> | ||
<environment>python</environment> | ||
<code><![CDATA[ | ||
def call():
    """Check that the metadata identifier exists and is resolvable.

    Reads the global ``metadataIdentifier`` and reports the result through the
    globals ``output`` and ``status``. An identifier that looks like a URL is
    checked directly, a bare ``doi:`` identifier is checked through
    https://dx.doi.org, and anything else is checked through the DataONE
    resolve service.

    Returns:
        True when ``status`` is set to "SUCCESS", False when it is "FAILURE".
    """
    global output
    global status
    global metadataIdentifier
    import metadig.variable as mvar
    import metadig.checks as checks
    import urllib
    import re

    d1_resolve_service = "https://cn.dataone.org/cn/v2/resolve/"

    # check if a metadata identifier is present
    if 'metadataIdentifier' not in globals() or metadataIdentifier is None:
        output = "A metadata identifier was not found."
        status = "FAILURE"
        return False

    metadataIdentifier = mvar.toUnicode(metadataIdentifier)
    # This should only be a single value, but if not (a list is returned) just get the first
    # one
    if isinstance(metadataIdentifier, list):
        if not metadataIdentifier:
            # An empty list carries no identifier; indexing it would raise IndexError.
            output = "A metadata identifier was not found."
            status = "FAILURE"
            return False
        metadataIdentifier = metadataIdentifier[0]

    if mvar.isBlank(metadataIdentifier):
        output = "The metadata identifier is blank."
        status = "FAILURE"
        return False

    # No trailing space here: the suffixes appended below supply their own separator.
    output = u"The metadata identifier '{}' was found".format(metadataIdentifier)

    # Strip once so every branch below sees the same value; previously only the
    # URL pattern tolerated leading whitespace.
    identifier = metadataIdentifier.strip()

    # Now check if the metadata identifier is a resolvable url. If it doesn't look like a URL, then
    # see if DataONE knows about it.
    usedD1 = False
    isDOI = False
    if re.match(r"http.*:/", identifier):
        resolvable, msg = checks.isResolvable(identifier)
    elif identifier.startswith('doi:'):
        isDOI = True
        # If the identifier is a 'bare' DOI (e.g. "doi:10.18739/A2027H"), then prepend with a DOI resolver link
        # i.e. https://dx.doi.org
        # A unicode template avoids an implicit ASCII encode of the identifier.
        resolvable, msg = checks.isResolvable(u"https://dx.doi.org/{}".format(identifier))
    else:
        usedD1 = True
        # urllib.quote raises KeyError on non-ASCII unicode input under Python 2,
        # so percent-encode the UTF-8 bytes instead.
        url = "{}{}".format(d1_resolve_service, urllib.quote(identifier.encode('utf-8')))
        resolvable, msg = checks.isResolvable(url)

    if resolvable:
        if usedD1:
            output = u'{} and is resolvable using the DataONE resolve service.'.format(output)
        elif isDOI:
            output = u'{} and is resolvable using a DOI resolver.'.format(output)
        else:
            output = u'{} and is resolvable.'.format(output)
        status = "SUCCESS"
        return True

    # If the URL is unresolvable because it is private, and it is a DataONE identifier, then this
    # special case will pass. Print an appropriate message explaining this.
    isPrivate = re.search("unauthorized", msg.lower())
    if isPrivate and usedD1:
        output = u'{} and is resolvable using the DataONE resolve service, but is not publicly readable'.format(output)
        status = "SUCCESS"
        # Return True so the return value agrees with the SUCCESS status, as on every other path.
        return True

    output = u"{}, but is not resolvable.".format(output)
    status = "FAILURE"
    return False
]]></code> | ||
<selector> | ||
<name>metadataIdentifier</name> | ||
<xpath> | ||
/resource/identifier | | ||
/*/fileIdentifier/*/text()[normalize-space()] | | ||
/eml/@packageId | ||
</xpath> | ||
</selector> | ||
<dialect> | ||
<name>DataCite 4</name> | ||
<xpath>boolean(/*[local-name() = 'resource'])</xpath> | ||
</dialect> | ||
<dialect> | ||
<name>Ecological Metadata Language</name> | ||
<xpath>boolean(/*[local-name() = 'eml'])</xpath> | ||
</dialect> | ||
<dialect> | ||
<name>ISO 19115 and ISO 19115-2 / ISO 19139 and ISO 19139-2</name> | ||
<xpath>boolean(/*[local-name() = 'MI_Metadata' or local-name() = 'MD_Metadata'])</xpath> | ||
</dialect> | ||
</mdq:check> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<mdq:check xmlns:mdq="https://nceas.ucsb.edu/mdqe/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="https://nceas.ucsb.edu/mdqe/v1 ../schemas/schema1.xsd"> | ||
<id>resource.URLs.resolvable-1.0.0</id> | ||
<name>Resource URLs Resolvable</name> | ||
<description>Check that the URLs found in the metadata text fields (abstract, geographic description, methods and additional information) are resolvable.</description> | ||
<type>Accessible</type> | ||
<level>REQUIRED</level> | ||
<environment>python</environment> | ||
<code><![CDATA[ | ||
def call():
    """Check that every URL found in the selected metadata text fields resolves.

    Reads the global ``textFields``, extracts the distinct ``http``/``https``
    URLs from it, and tests each one with ``checks.isResolvable``. The result
    is reported through the globals ``output`` and ``status``; the URLs that
    were examined and the ones that failed are left in the globals
    ``uniqueUrls`` and ``unresolvableUrls``.

    Returns:
        True when no URL was found or all of them resolve, False otherwise.
    """
    import metadig.variable as mvar
    import metadig.checks as checks
    import re
    global output
    global status
    global textFields
    global uniqueUrls
    global unresolvableUrls

    maxPrint = 3
    # Start the URL at the scheme so leading junk in a token (e.g. 'url=' or '<') is dropped.
    urlPattern = re.compile(r"https?://\S+")
    # Punctuation that commonly trails a URL embedded in prose or markup.
    trailingPunctuation = u".:()<>[]\"'"

    # Get all URLs from these fields abstract, location description, methods, and related references
    #   first get all fields and concat into one string
    #   tokenize the string
    #   extract all tokens that contain a url pattern, i.e. 'http://' or 'https://'
    #   remove duplicates
    if 'textFields' not in globals() or textFields is None:
        output = "Unable to retrieve required text fields."
        status = "FAILURE"
        return False

    if mvar.isBlank(textFields):
        output = "The required text fields are blank."
        status = "FAILURE"
        return False

    # Convert to unicode so that non-ascii characters don't cause decoding errors
    textFields = mvar.toUnicode(textFields)

    # The text fields can be a textType element, so it may contain multiple subelements, i.e. <para>, etc
    # Since the metadig-engine is stuck at XPath 1.0, we cannot use the xpath to gather these into
    # a single string.
    if isinstance(textFields, list):
        textFields = u' '.join(textFields)

    # Convert separator characters to spaces to assist parsing
    textFields = textFields.replace(",", " ").replace(";", " ")

    # Tokenize on any whitespace (spaces, tabs, line breaks) and extract possible URLs,
    # keeping first-seen order so the URLs listed in the message are deterministic.
    uniqueUrls = []
    seen = set()
    for token in textFields.split():
        found = urlPattern.search(token)
        if not found:
            continue
        # Strip all trailing punctuation in one pass; stripping one character
        # class at a time is order-dependent and leaves e.g. 'http://x.org.' behind.
        url = found.group(0).rstrip(trailingPunctuation)
        if url not in seen:
            seen.add(url)
            uniqueUrls.append(url)

    # Check each unique URL to see if it is resolvable. The 'isResolvable' function sends an HTTP 'Head'
    # request to the URL.
    unresolvableUrls = []
    for url in uniqueUrls:
        resolvable, msg = checks.isResolvable(url)
        if not resolvable:
            unresolvableUrls.append(url)

    # Print an error message if unresolved URLs were found, listing the first few.
    if unresolvableUrls:
        if len(unresolvableUrls) == 1:
            output = u'1 of {} URLs provided in the metadata does not resolve correctly: {}'.format(len(uniqueUrls), u', '.join(unresolvableUrls[0:maxPrint]))
        elif len(unresolvableUrls) <= maxPrint:
            output = u'{} of {} URLs provided in the metadata do not resolve correctly: {}'.format(len(unresolvableUrls), len(uniqueUrls), u', '.join(unresolvableUrls))
        else:
            # If unresolved is more than 'maxPrint' URLs, only print first maxPrint entries
            output = u'{} of {} URLs provided in the metadata do not resolve correctly, here are the first {}: {}'.format(len(unresolvableUrls), len(uniqueUrls), maxPrint, u', '.join(unresolvableUrls[0:maxPrint]))
            output += u", ..."
        status = "FAILURE"
        return False

    # Print out success message.
    if len(uniqueUrls) == 0:
        output = u'No URLs were found in the metadata.'
    elif len(uniqueUrls) == 1:
        output = u'The one URL found in the metadata resolves correctly.'
    else:
        output = u'All {} URLs found in the metadata resolve correctly.'.format(len(uniqueUrls))
    status = "SUCCESS"
    return True
]]></code> | ||
<selector> | ||
<name>textFields</name> | ||
<!--Get all URLs from these fields abstract, location description, methods, and related references. --> | ||
<xpath> | ||
/eml/dataset/abstract//text()[normalize-space()] | | ||
/eml/dataset/abstract//ulink/@url | | ||
/eml/dataset/coverage/geographicCoverage/geographicDescription/text()[normalize-space()] | ||
/eml/dataset/methods/methodStep/description//text()[normalize-space()] | | ||
/eml/dataset/additionalInfo//text()[normalize-space()] | ||
</xpath> | ||
</selector> | ||
<dialect> | ||
<name>Ecological Metadata Language</name> | ||
<xpath>boolean(/*[local-name() = 'eml'])</xpath> | ||
</dialect> | ||
</mdq:check> |
Oops, something went wrong.