diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..27dfd90 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ + +#hidden libs and cache dirs +.vscode +.pytest_cache +*/pytest_cache/ +*__pycache__ + +# word docs +*.docx \ No newline at end of file diff --git a/VERSIONS.json b/VERSIONS.json index 3fb0226..5427d04 100644 --- a/VERSIONS.json +++ b/VERSIONS.json @@ -1,4 +1,4 @@ { "slmd":"1.0.0", - "vlmd":"0.1.0" + "vlmd":"0.2.0" } \ No newline at end of file diff --git a/variable-level-metadata-schema/README.md b/variable-level-metadata-schema/README.md index 405bc74..1dc264c 100644 --- a/variable-level-metadata-schema/README.md +++ b/variable-level-metadata-schema/README.md @@ -1,31 +1,211 @@ # Variable level metadata -This metadata directory contains the specifications for variable level metadata submissions to the -HEAL platform in addition to variable level metadata templates in CSV format and the associated code -converting this template to its validated json format. +This metadata directory contains the specifications for variable level metadata documents in the HEAL data ecosystem. +## Schemas -## Workflow +❗ Look here for schema specifications. -The `schemas/dictionary` directory contains a comprehensive json schema with fields for +### json data dictionary format specification +1. `schemas/jsonschema/data-dictionary.json`: The "json" json data dictionary schema (ie json template schema) + - Intended to specify the data dictionary representation of json objects available in the HEAL platform metadata-service. + - See here for the markdown rendered version --> [`docs/md-rendered-schemas/jsonschema-jsontemplate-data-dictionary.md`](docs/md-rendered-schemas/jsonschema-jsontemplate-data-dictionary.md) + +### csv field format specifications +- See here for the markdown rendered version --> [`docs/md-rendered-schemas/jsonschema-jsontemplate-data-dictionary.md`](docs/md-rendered-schemas/jsonschema-csvtemplate-fields.md) + + +2. 
`schemas/frictionless/fields.json` Table schema (previously known as "frictionless") standard specification + - This json file is intended to represent csv data dictionary documents following the [Table Schema specification](https://specs.frictionlessdata.io/table-schema/). + - Csv version is intended to make data dictionary creation and discovery available in a more familiar/human readable format, + - The representation of data dictionary field values in a csv file. It's used to facilitate documentation of data dictionary csv + files in addition to input validation. +3. `schemas/jsontemplate/fields.json`The "csv" json schema (ie csv template schema) + - :warning: The "csv" json schema is intended to be an intermediate specification used for documentation and in translation workflows to the json schema template. As fully specifying a tabular file (for example missing value specification) is out of scope here (see the table schema representation in (2)) + +## Document flow chart + +```mermaid + + %%{init: {"flowchart": {"defaultRenderer": "elk","htmlLabels": false}} }%% + + flowchart TD + + subgraph dictionary[Dictionary YAML files] + + defs["schemas/dictionary/definitions.yaml"] + fields["schemas/dictionary/fields.yaml"] + dd["schemas/dictionary/data-dictionary.yaml"] + end + + subgraph Schema specifications + + jsonspec["schema/jsontemplate/data-dictionary.json"] + csvspec["schema/jsontemplate/csvtemplate/fields.json"] + csvtblspec["schema/frictionless/csvtemplate/fields.json"] + end + + subgraph "Rendered schema documentation \n(html also available)" + + csvmd["/docs/\nmd-rendered-schemas/\njsonschema-csvtemplate-fields.md"] + jsonmd["/docs/\nmd-rendered-schemas/\njsonschema-jsontemplate-data-dictionary.md"] + + end + + defs --> fields --> dd + defs --> dd + + fields --> csvspec --> csvtblspec + dd --> jsonspec + + csvspec --> csvmd + jsonspec --> jsonmd + +``` ## Directories - `docs`: See the rendered human readable schemas in a markdown format and an 
interactive html format. -- `schemas/jsonschema`: `data_dictionary.json` contains the final and full specification. -- `schemas/frictionless/csvtemplate`: contains schemas following the frictionless schema specifications. `fields.json` contains the frictionless Table Schema descriptor that validates a tabular heal templated csv data dictionary. See [here](https://specs.frictionlessdata.io/table-schema/) for the specification. **NOTE: the `csvtemplate` is an intermediate format meant to be converted into the final `jsontemplate` format. -- `schemas/dictionary`: the yaml files used to generate json schemas with build.py. Fields with `jsonSpec` and `csvSpec` keys to indicate which property to extract in the `build.py` script. +- `schemas/jsonschema`: contains the final and full specification for schemas following json schema. +- `schemas/frictionless`: contains schemas following the frictionless table schema specifications. See [here](https://specs.frictionlessdata.io/table-schema/) for the specification. +- `schemas/dictionary`: the yaml files used to generate json schemas and documentation with build.py. - `templates`: empty templates in csv spreadsheet format and JSON format. -- `examples`: the ~~(filled out)~~ templates in csv spreadsheet format and JSON format. - TO BE ADDED: for now, see https://github.com/norc-heal/healdata-utils/tree/main/tests/data/valid/output -- `build.py`: This script compiles the yaml files and generates associated jsonschemas and frictionless schemas in addition to the human rendered schemas +- `examples`: exapmles of filled out templates in csv spreadsheet format and JSON format. +- `build.py`: This script compiles the yaml files and generates associated schemas in addition to the human rendered schema + documentation. ## Contributing -To contribute to the variable level metadata, please modify the `dictionary/*.yaml` files directly. 
For example, if you want to add/modify an example, description, etc for either the JSON or CSV spec, then do so here. +To contribute to the variable level metadata specification (and annotations/examples/documentation), please modify the `dictionary/*.yaml` files directly. + +1. Update the dictionary/*.yaml files +2. Run `build.py` script +3. Check output is correct (see above) +4. When satisfied, push to github and ensure it passes validation (ie commit has ✔️ and not ❌) + +❗ Please read the below conventions and principles before contributing and review the existing `dictionary` directory. + + +## Conventions, principles, and rules for annotations and csv <> json translation + +### Annotation/documentation properties +1. `description`: SHOULD be created as markdown syntax without any headers as headers are applied in the templates. + +2. `additionalDescription`: SHOULD be added if there are additional documentation "footer" details. In rendering the documentation, these are appended to the end of rendered markdown document. + +### `type` conversion rules +Given csv field values can only be scalar values with records separated by a new line and each individual field values separated by a comma delimiter, the following rules and restrictions are applied to allow json to csv specification translation. + +1. type `object` + - converted to type `string` with pattern of `^(?:.*?=.*?(?:\||$))+$` to indicate a stringified object with a equal sign (`=`) connecting the key-value pair and a pipe (`|`) delimiter separating unique key-value pairs. +2. 
type `array` + - if type `object` in `items`: flattened to the children property or properties + - if type is a scalar (`string`,`integer`,`number`) in `items`, + translated to type `string` with pattern `^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$` to indicate a string containing a pipe delimiter (i.e., a stringified array with a pipe delimiter) +### `property` name conversion rules +To facilitate the mapping of json spec property names to csv property names, the resulting flattened `property` names from the flattened properties should correspond to the [jsonpath](https://datatracker.ietf.org/doc/id/draft-goessner-dispatch-jsonpath-00.html) representation where: + +1. type `object` + + The json spec type object property below: + ```json + + "constraints": { + "type": "object", + "properties": { + "maxLength": { + "type": "integer"} + } + } + ``` + + translates to the csv stringified type object: + + ```json + + "constraints.maxLength":{"type":"integer"} + + ``` +2. type `array` + + The json spec type array property below: + + ```json + { "..more props..":"...", + "standardsMappings": { + "type": "array", + "items": { + "type": "object", + "properties": { + "instrument": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri" + }, + "..more props..":"..."} + }, + "..more props..":"..."} + }}} + + ``` + translates to the csv stringified type array property: + + ```json + { "..more props..":"...", + "standardsMappings[0].instrument.url": { + "type": "string", + "format": "uri" + } + } + ``` + +### Complex `type` restrictions + +1. Currently, no complex types (`anyOf`,`oneOf`) are supported and the `type` MUST be specified. This is to ensure coverage for all csv to json translation use cases. 
+ - Each json specification schema property type must be a scalar (e.g., `boolean`,`string`,`integer`,`number`), an `array`, or an `object` + - Each csv specification schema property type must be a scalar (e.g., `boolean`,`string`,`integer`,`number`) + +### csv to json and json to csv translations + +There are two rules for conversion from json to csv (or csv to json) specs: + +1. __csv spec field-level property and json spec root-level property match__: If -- in the json schema spec version -- a property is specified at the root-level AND this same property is specified in the field level of the json spec schema + - csv to json: If the same value/instance of a property exists at the field level for ALL records (only one unique value but no missing values) then this unique value -- when translated to the json spec version -- will be moved to the root level data dictionary + - json to csv: All root level properties will be moved to individual field properties BUT field level properties that exist take precedence. + +More concretely, this provides a way to specify root level properties within vlmd csv documents for a few use cases but can generalize to other future additional property matches: + +1. specifying the schema version that represents the vlmd document (`schemaVersion`) +2. specifying other data dictionary level properties such as `standardsMappings[0].instrument` + +### root ("data dictionary level") and field property cascading pattern +Akin to the above json to csv, more generally: + +All root level properties will be applied to individual fields IF this same field level property is not specified (i.e., field-level takes precedence). 
This strategy can be seen in the [data package standard (but with missingValues)](https://specs.frictionlessdata.io/patterns/#missing-values-per-field) + + +### csv and json vlmd document file naming + +File names for json and csv translations of a vlmd document are suggested to +have the same stem name with corresponding "csv" and "json" suffixes (eg `my-heal-dd.csv` and `my-heal-dd.json`) + +## Additional table-level (root) and field-level properties + +Some table-level or field-level properties in other standards (or custom properties in specific use cases) do not map onto +a core HEAL property. To allow these properties to be included, we list these property names under `propertyNames`. + + ❗ For study or use case specific names, it is recommended to put the property under a `custom` namespace (e.g., `"custom":{"myvarname"})`. Adding additional properties here are for well established standards and/or property names used in practice. + + ☝️ At the root level, [`propertyNames`](https://json-schema.org/draft-07/json-schema-validation#rfc.section.6.5.8) was used to: + + 1. allow inclusion and minimal validation of these extra properties (ie of only the existence of property names) without making any assumptions about corresponding property types. + 2. It also provides a clear distinction between "core" properties and "extra" properties. + One consideration, however, is that `propertyNames` was introduced in json schema draft-6. 
## Considerations diff --git a/variable-level-metadata-schema/build.py b/variable-level-metadata-schema/build.py index 7e271c7..8b8eb74 100644 --- a/variable-level-metadata-schema/build.py +++ b/variable-level-metadata-schema/build.py @@ -24,49 +24,11 @@ def load_yaml(filepath): yamlfile = yaml.safe_load(f) return yamlfile - -test = load_yaml("schemas/dictionary/definitions.yaml") # load all yamls def load_all_yamls(directory="schemas/dictionary"): filepaths = Path(directory).glob("*.yaml") return {filepath.stem: load_yaml(filepath) for filepath in filepaths} - -def select_specs(schema, specsuffix="CsvSpec"): - """ - select given specification type and remove other specification types. - These are denoted with the suffix (eg encodingsCsvSpec) in property name - - This function is useful when building multiple versions of schemas - conditional on the type of specificaiton (eg csv tabular data vs. json - for a workflow that may except csv that is translated into the json file.) - - """ - # loop through schema - schema_selected = {} - for key, item in schema.items(): - if re.search(f"{specsuffix}$", key): - newkey = key.replace(specsuffix, "") - schema_selected[newkey] = item - elif re.search("Spec$", key): - pass - elif isinstance(item, MutableMapping): - schema_selected[key] = select_specs(item, specsuffix) - else: - schema_selected[key] = item - return schema_selected - - -# resolve refs (and select type of schema spec) - -def get_ref(path,schema): - pass - -# loop through all iterables in a dictionary -# if key = $ref --> get_ref - - - def resolve_refs(items, schema, parentkey=False): """ resolve pseudo-json references @@ -109,8 +71,45 @@ def resolve_refs(items, schema, parentkey=False): return schema_resolved +def to_csv_properties(schema,**additional_props): + """ + translate complex types (eg arrays and objects) to stringified representations + """ + csv_schema = dict(schema) + csv_schema["properties"] = {} + properties = schema["properties"] + for key, item in 
properties.items(): + typename = item.get("type") + newitem = dict(item) + if typename == "array": + newitem["type"] = "string" + newitem["pattern"] = "^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$" + + if item.get("examples"): + newitem["examples"] = ["|".join(str(_e) for _e in e) for e in item["examples"]] + elif typename == "object": + newitem["type"] = "string" + newitem["pattern"] = "^(?:.*?=.*?(?:\||$))+$" + + if item.get("examples"): + newitem["examples"] = [ + "|".join([f"{key}={val}" for key,val in e.items()]) + for e in item["examples"] + ] + elif typename in ["string","integer","number","boolean"]: + newitem = dict(item) + else: + raise Exception("To convert to csv, the flattened property needs to be", + "of type array,object,boolean,string, integer, or number") + + csv_schema["properties"][key] = newitem + + # add additional properties at the beginning of the schema properties object + csv_schema["properties"] = {**additional_props,**csv_schema["properties"]} + + return csv_schema -def flatten_properties(properties, parentkey="", sep="."): +def flatten_properties(properties, parentkey="", sep=".",itemsep="[0]"): """ flatten schema properties """ @@ -130,7 +129,7 @@ def flatten_properties(properties, parentkey="", sep="."): properties_flattened.update(newprops) elif items: - newprops = flatten_properties(items,parentkey=flattenedkey) + newprops = flatten_properties(items,parentkey=flattenedkey+itemsep) properties_flattened.update(newprops) else: properties_flattened[flattenedkey] = item @@ -263,13 +262,26 @@ def generate_template(schema): if __name__ == "__main__": # compile frictionless schema fields dictionary = load_all_yamls() + + # compile json schema fields + json_pipeline = [ + # recursive fxn so need to grab items from overall dictionary for json paths + (resolve_refs, {"schema": dictionary}), + # no longer need the definitons as they have been resolved + (lambda _schema: _schema["data-dictionary"], None), + (lambda _schema: 
{"version":versions["vlmd"],**_schema},None) + ] + json_data_dictionary = reduce(run_pipeline_step, json_pipeline, dictionary) + Path("schemas/jsonschema/data-dictionary.json").write_text(json.dumps(json_data_dictionary, indent=4)) + + schema_version_prop = {"schemaVersion":json_data_dictionary["properties"]["schemaVersion"]} csv_pipeline = [ - (select_specs, {"specsuffix": "CsvSpec"}), # recursive fxn so need to grab items from overall dictionary for json paths (resolve_refs, {"schema": dictionary}), # no longer need the definitons as they have been resolved (lambda _schema: _schema["fields"], None), (flatten_schema, None), + (to_csv_properties,schema_version_prop), (to_frictionless, None), (lambda _schema: {"version":versions["vlmd"],**_schema},None) ] @@ -281,29 +293,17 @@ def generate_template(schema): # compile json schema fields csv_pipeline = [ - (select_specs, {"specsuffix": "CsvSpec"}), # recursive fxn so need to grab items from overall dictionary for json paths (resolve_refs, {"schema": dictionary}), # no longer need the definitons as they have been resolved (lambda _schema: _schema["fields"], None), (flatten_schema, None), + (to_csv_properties,schema_version_prop), + (lambda _schema: {"version":versions["vlmd"],**_schema},None) ] csvfields = reduce(run_pipeline_step, csv_pipeline, dictionary) Path("schemas/jsonschema/csvtemplate/fields.json").write_text(json.dumps(csvfields, indent=4)) - # compile json schema fields - json_pipeline = [ - # recursive fxn so need to grab items from overall dictionary for json paths - (resolve_refs, {"schema": dictionary}), - (select_specs, {"specsuffix": "JsonSpec"}), - # no longer need the definitons as they have been resolved - (lambda _schema: _schema["data-dictionary"], None), - (lambda _schema: {"version":versions["vlmd"],**_schema},None) - ] - jsonfields = reduce(run_pipeline_step, json_pipeline, dictionary) - Path("schemas/jsonschema/data-dictionary.json").write_text(json.dumps(jsonfields, indent=4)) - - # generate 
json schema versions of field schemas for documentation # generate html using the json-schema for human library @@ -317,14 +317,14 @@ def generate_template(schema): item=csvfields, schema=csvfields, templatefile="csvtemplate.md") - jsonfields_md = render_markdown( - item=jsonfields, - schema=jsonfields, + json_dd_md = render_markdown( + item=json_data_dictionary, + schema=json_data_dictionary, templatefile="jsontemplate.md" ) Path("docs/md-rendered-schemas/jsonschema-csvtemplate-fields.md").write_text(csvfields_md) - Path("docs/md-rendered-schemas/jsonschema-jsontemplate-data-dictionary.md").write_text(jsonfields_md) + Path("docs/md-rendered-schemas/jsonschema-jsontemplate-data-dictionary.md").write_text(json_dd_md) # generate templates - Path("templates/template_submission.json").write_text(json.dumps([generate_template(jsonfields)],indent=4)) + Path("templates/template_submission.json").write_text(json.dumps([generate_template(json_data_dictionary)],indent=4)) Path("templates/template_submission.csv").write_text(",".join((generate_template(csvfields)).keys())) \ No newline at end of file diff --git a/variable-level-metadata-schema/docs/assets/templates/csvtemplate.md b/variable-level-metadata-schema/docs/assets/templates/csvtemplate.md index 306253c..7c2227a 100644 --- a/variable-level-metadata-schema/docs/assets/templates/csvtemplate.md +++ b/variable-level-metadata-schema/docs/assets/templates/csvtemplate.md @@ -1,9 +1,28 @@ -# {{ schema.title }} +# {{ schema.title }} + +_version {{ schema.version }}_ + + + +The aim of this HEAL metadata piece is to track and provide basic information about variables in a tabular data file (i.e. a data file with rows and columns) from your HEAL study. The objective is to list all variables and descriptive information about those variables. This will ensure that potential secondary data users know what data has been collected or calculated and how to use these data. 
Note that a given study can have multiple tabular data files; You should create a data dictionary for each tabular data file. Thus, a study may have multiple data dictionaries. {{ schema.description }} -## Properties +## Properties (i.e., fields or variables) {% for itemname,item in schema.properties.items() %} {% include 'properties.md' %} +{% endfor %} + + +## End of schema - Additional Property information + +{% for itemname,item in schema['properties'].items() %} +{% if 'additionalDescription' in item %} +## `{{ itemname }}` {{ item.additionalDescription }} +{% endif %} {% endfor %} \ No newline at end of file diff --git a/variable-level-metadata-schema/docs/assets/templates/jsontemplate.md b/variable-level-metadata-schema/docs/assets/templates/jsontemplate.md index b5e769a..2afef89 100644 --- a/variable-level-metadata-schema/docs/assets/templates/jsontemplate.md +++ b/variable-level-metadata-schema/docs/assets/templates/jsontemplate.md @@ -1,16 +1,26 @@ -# {{ schema.title }} +# {{ schema.title }} + +_version {{ schema.version }}_ {{ schema.description }} {% for itemname,item in schema.properties.items() %} -### `{{ itemname }}` _({{ item.type }}{{ ',required' if itemname in schema.required }})_ +## `{{ itemname }}` _({{ item.type }}{{ ',required' if itemname in schema.required }})_ {{ item.description }} -{% if itemname == 'data_dictionary' %} +{% if itemname == 'fields' %} {{ item['items']['description'] }} -#### Properties for each record +### Properties for each `fields` record {% set schema = item['items'] %} {% for itemname,item in item['items']['properties'].items() %} {% include 'properties.md' %} {% endfor %} {% endif %} -{% endfor %} \ No newline at end of file +{% endfor %} + +### Additional `fields` property information + +{% for itemname,item in schema["properties"]["fields"]["items"]["properties"].items() %} +{% if 'additionalDescription' in item %} +#### `{{ itemname }}` {{ item.additionalDescription }} +{% endif %} +{% endfor %} diff --git 
a/variable-level-metadata-schema/docs/assets/templates/properties.md b/variable-level-metadata-schema/docs/assets/templates/properties.md index 0ee682c..f3ca54c 100644 --- a/variable-level-metadata-schema/docs/assets/templates/properties.md +++ b/variable-level-metadata-schema/docs/assets/templates/properties.md @@ -34,7 +34,7 @@ __{{ item.title }}__ {{ itemtype }} {{ item.description }} {# #} {# #} {% if item.enum is defined %} -{{ render_type_item('Possible values',item.enum)}} +Must be one of: {{ "`" + "`, `".join(item.enum) + "`" }} {% endif %} {# #} {# #} diff --git a/variable-level-metadata-schema/docs/html-rendered-schemas/jsonschema-csvtemplate-fields.html b/variable-level-metadata-schema/docs/html-rendered-schemas/jsonschema-csvtemplate-fields.html index 6d9bdda..6b62ce8 100644 --- a/variable-level-metadata-schema/docs/html-rendered-schemas/jsonschema-csvtemplate-fields.html +++ b/variable-level-metadata-schema/docs/html-rendered-schemas/jsonschema-csvtemplate-fields.html @@ -1,31 +1,35 @@ - HEAL Variable Level Metadata Fields

HEAL Variable Level Metadata Fields

Type: object

Variable level metadata individual fields integrated into the variable level
metadata object within the HEAL platform metadata service.

!!! note "NOTE"

Only name and description properties are required.
For categorical variables, constraints.enum and encodings (where applicable) properties are highly encouraged.
For studies using HEAL or other common data elements (CDEs), standardsMappings information is highly encouraged.
type and format properties may be particularly useful for some variable types (e.g. date-like variables)

Type: string

The section, form, survey instrument, set of measures or other broad category used
to group variables.


Examples:

"Demographics"
-
"PROMIS"
-
"Substance use"
-
"Medical History"
-
"Sleep questions"
-
"Physical activity"
-

Type: string

The name of a variable (i.e., field) as it appears in the data.

Type: string

The human-readable title or label of the variable.


Examples:

"My Variable"
-
"Gender identity"
+ HEAL Variable Level Metadata Fields 

HEAL Variable Level Metadata Fields

Type: object

!!! note "Highly encouraged"

  • Only name and description properties are required.
  • For categorical variables, constraints.enum and enumLabels (where applicable) properties are highly encouraged.
  • For studies using HEAL or other common data elements (CDEs), standardsMappings information is highly encouraged.
  • type and format properties may be particularly useful for some variable types (e.g. date-like variables)

Type: string

The version of the schema, using the agreed-upon convention of major.minor.patch (e.g., 1.0.2)

NOTE: This is NOT for versioning of each individual data dictionary instance.
Rather, it is the
version of THIS schema document. See version property (below) if specifying the individual data dictionary instance
version.

If generating a vlmd document as a csv file, include this version in
every row/record to indicate this is a schema level property
(not applicable for the json version as this property is already at the schema/root level)

Must match regular expression: \d+\.\d+\.\d+
Examples:

"1.0.0"
+
"0.2.0"
+

Type: string

The section, form, survey instrument, set of measures or other broad category used
to group variables. Previously called "module."


Examples:

"Demographics"
+
"PROMIS"
+
"Medical History"
+

Type: string

The name of a variable (i.e., field) as it appears in the data.


Example:

"gender_id"
+

Type: string

The human-readable title or label of the variable.


Example:

"Gender identity"
 

Type: string

An extended description of the variable. This could be the definition of a variable or the
question text (e.g., if a survey).


Examples:

"The participant's age at the time of study enrollment"
 
"What is the highest grade or level of school you have completed or the highest degree you have received?"
-

Type: enum (of string)

A classification or category of a particular data element or property expected or allowed in the dataset.

Definitions:

  • number (A numeric value with optional decimal places. (e.g., 3.14))
  • integer (A whole number without decimal places. (e.g., 42))
  • string (A sequence of characters. (e.g., \"test\"))
  • any (Any type of data is allowed. (e.g., true))
  • boolean (A binary value representing true or false. (e.g., true))
  • date (A specific calendar date. (e.g., \"2023-05-25\"))
  • datetime (A specific date and time, including timezone information. (e.g., \"2023-05-25T10:30:00Z\"))
  • time (A specific time of day. (e.g., \"10:30:00\"))
  • year (A specific year. (e.g., 2023)
  • yearmonth (A specific year and month. (e.g., \"2023-05\"))
  • duration (A length of time. (e.g., \"PT1H\")
  • geopoint (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278]))

Must be one of:

  • "number"
  • "integer"
  • "string"
  • "any"
  • "boolean"
  • "date"
  • "datetime"
  • "time"
  • "year"
  • "yearmonth"
  • "duration"
  • "geopoint"

Type: string

Indicates the format of the type specified in the type property.
Each format is dependent on the type specified.
For example: If type is "string", then see the String formats.
If type is "date", "datetime", or "time", default format is ISO8601 formatting for those respective types (see details on ISO8601 format for Date,
Datetime,
or Time) - If you want to specify a date-like variable using standard Python/C strptime syntax, see here for details.
See here for more information about appropriate format values by variable type.

[Additional information]

Date Formats (date, datetime, time type variable):

A format for a date variable (date,time,datetime).
default: An ISO8601 format string.
any: Any parsable representation of a date/time/datetime. The implementing library can attempt to parse the datetime via a range of strategies.

{PATTERN}: The value can be parsed according to {PATTERN},
which MUST follow the date formatting syntax of
C / Python strftime such as:

  • "%Y-%m-%d (for date, e.g., 2023-05-25)"
  • "%Y%-%d (for date, e.g., 20230525) for date without dashes"
  • "%Y-%m-%dT%H:%M:%S (for datetime, e.g., 2023-05-25T10:30:45)"
  • "%Y-%m-%dT%H:%M:%SZ (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)"
  • "%Y-%m-%dT%H:%M:%S%z (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)"
  • "%Y-%m-%dT%H:%M (for datetime without seconds, e.g., 2023-05-25T10:30)"
  • "%Y-%m-%dT%H (for datetime without minutes and seconds, e.g., 2023-05-25T10)"
  • "%H:%M:%S (for time, e.g., 10:30:45)"
  • "%H:%M:%SZ (for time with UTC timezone, e.g., 10:30:45Z)"
  • "%H:%M:%S%z (for time with timezone offset, e.g., 10:30:45+0300)"

String formats:

  • "email if valid emails (e.g., test@gmail.com)"
  • "uri if valid uri addresses (e.g., https://example.com/resource123)"
  • "binary if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)"
  • "uuid if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)"

Geopoint formats:

The two types of formats for geopoint (describing a geographic point).

  • array (if 'lat,long' (e.g., 36.63,-90.20))
  • object (if {'lat':36.63,'lon':-90.20})

Type: integer

Indicates the maximum length of an iterable (e.g., array, string, or
object). For example, if 'Hello World' is the longest value of a
categorical variable, this would be a maxLength of 11.

Type: string

Constrains possible values to a set of values.

Must match regular expression: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$
Examples:

"1|2|3|4|5|6|7|8"
-
"White|Black or African American|American Indian or Alaska Native|Native Hawaiian or Other Pacific Islander|Asian|Some other race|Multiracial"
-

Type: string

A regular expression pattern the data MUST conform to.

Type: integer

Specifies the maximum value of a field (e.g., maximum -- or most
recent -- date, maximum integer etc). Note, this is different then
maxLength property.

Type: integer

Specifies the minimum value of a field.

Type: string

Variable value encodings provide a way to further annotate any value within a any variable type,
making values easier to understand.

Many analytic software programs (e.g., SPSS,Stata, and SAS) use numerical encodings and some algorithms
only support numerical values. Encodings (and mappings) allow categorical values to be stored as
numerical values.

Additionally, as another use case, this field provides a way to
store categoricals that are stored as "short" labels (such as
abbreviations).

Must match regular expression: ^(?:.*?=.*?(?:\||$))+$
Examples:

"0=No|1=Yes"
-
"HW=Hello world|GBW=Good bye world|HM=Hi,Mike"
-

Type: boolean

Indicates whether a categorical variable is ordered. This variable is
relevant for variables that have an ordered relationship but not
necessarily a numerical relationship (e.g., Strongly disagree < Disagree
< Neutral < Agree).

Type: string

A list of missing values specific to a variable.

Must match regular expression: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$
Examples:

"Missing|Skipped|No preference"
+

Type: enum (of string)

A classification or category of a particular data element or property expected or allowed in the dataset.

Must be one of:

  • "number"
  • "integer"
  • "string"
  • "any"
  • "boolean"
  • "date"
  • "datetime"
  • "time"
  • "year"
  • "yearmonth"
  • "duration"
  • "geopoint"

Type: string

Indicates the format of the type specified in the type property.
Each format is dependent on the type specified.
See here
for more information about appropriate format values by variable type.

Type: boolean

If this variable is marked as true, then this variable's value must be present
(i.e., not missing; see missingValues). If marked as false or not present, then the
variable CAN be missing.

Type: integer

Indicates the maximum length of an iterable (e.g., array, string, or
object). For example, if 'Hello World' is the longest value of a
categorical variable, this would be a maxLength of 11.

Type: string

Constrains possible values to a set of values.

Must match regular expression: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$
Examples:

"1|2|3|4|5"
+
"Poor|Fair|Good|Very good|Excellent"
+

Type: string

A regular expression pattern the data MUST conform to.

Type: integer

Specifies the maximum value of a field (e.g., maximum -- or most
recent -- date, maximum integer etc). Note, this is different than the
maxLength property.

Type: integer

Specifies the minimum value of a field.

Type: string

Variable value encodings provide a way to further annotate any value within any variable type,
making values easier to understand.

Many analytic software programs (e.g., SPSS, Stata, and SAS) use numerical encodings and some algorithms
only support numerical values. Encodings (and mappings) allow categorical values to be stored as
numerical values.

Additionally, as another use case, this field provides a way to
store categoricals that are stored as "short" labels (such as
abbreviations).

This field is intended to follow this pattern

Must match regular expression: ^(?:.*?=.*?(?:\||$))+$
Examples:

"1=Poor|2=Fair|3=Good|4=Very good|5=Excellent"
+
"HW=Hello world|GBW=Good bye world|HM=Hi, Mike"
+

Type: boolean

Indicates whether a categorical variable is ordered. This variable is
relevant for variables that have an ordered relationship but not
necessarily a numerical relationship (e.g., Strongly disagree < Disagree
< Neutral < Agree).

This field is intended to follow the ordering aspect of [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering).

Type: string

A list of missing values specific to a variable.

Must match regular expression: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$
Examples:

"Missing|Skipped|No preference"
 
"Missing"
-

Type: string

For boolean (true) variable (as defined in type field), this field allows
a physical string representation to be cast as true (increasing
readability of the field). It can include one or more values.

Must match regular expression: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$
Examples:

"Required|REQUIRED"
-
"required|Yes|Y|Checked"
-
"Checked"
-
"Required"
-

Type: string

For boolean (false) variable (as defined in type field), this field allows
a physical string representation to be cast as false (increasing
readability of the field) that is not a standard false value. It can include one or more values.

Must match regular expression: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$

Type: stringFormat: uri

The url that links out to the published, standardized mapping.


Example:

"https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI"
-

Type: string

The type of mapping linked to a published set of standard variables such as the NIH Common Data Elements program


Examples:

"cde"
-
"ontology"
-
"reference_list"
-

Type: string

A free text label of a mapping indicating a mapping(s) to a published set of standard variables such as the NIH Common Data Elements program.


Examples:

"substance use"
-
"chemical compound"
-
"promis"
-

Type: string

The source of the standardized variable.


Example:

"TBD (will have controlled vocabulary)"
-

Type: string

The id locating the individual mapping within the given source.

Type: stringFormat: uri

The url that links out to the published, standardized concept.


Example:

"https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI"
-

Type: string

The type of mapping to a published set of concepts related to the given field such as
ontological information (eg., NCI thesaurus, bioportal etc)

Type: string

A free text label of mapping to a published set of concepts related to the given field such as
ontological information (eg., NCI thesaurus, bioportal etc)

Type: string

The source of the related concept.


Example:

"TBD (will have controlled vocabulary)"
-

Type: string

The id locating the individual mapping within the given source.

Type: number

Type: number

Type: number

Type: number

Type: number

Type: number

Type: integer

Value must be greater or equal to 0

Additional Properties of any type are allowed.

Type: object
\ No newline at end of file +

Type: string

For boolean (true) variable (as defined in type field), this field allows
a physical string representation to be cast as true (increasing
readability of the field). It can include one or more values.

Must match regular expression: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$
Examples:

"required|Yes|Checked"
+
"required"
+

Type: string

For boolean (false) variable (as defined in type field), this field allows
a physical string representation to be cast as false (increasing
readability of the field) that is not a standard false value. It can include one or more values.

Must match regular expression: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$
Examples:

"Not required|NOT REQUIRED"
+
"No"
+

Type: string

Additional properties not included as a core property.

Must match regular expression: ^(?:.*?=.*?(?:\||$))+$

Type: stringFormat: uri

A url (e.g., link, address) to a file or other resource containing the instrument, or
a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level)
or the individual variable (if at the field level).


Example:

"https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx"
+

Type: enum (of string)

An abbreviated name/acronym from a controlled vocabulary referencing the resource (e.g., program or repository)
containing the instrument, or a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level)
or the individual variable (if at the field level).

Must be one of:

  • "heal-cde"

Type: string

Examples:

"Adult demographics"
+
"adult-demographics"
+

Type: string

A code or other string that identifies the instrument within the source.
This should always be from the source's formal, standardized identification system


Example:

"5141"
+

Type: stringFormat: uri

The url that links out to the published, standardized mapping of a variable (e.g., common data element)


Example:

"https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE"
+

Type: string

The source of the standardized variable. Note, this property is required if
an id is specified.


Example:

"CDISC"
+

Type: string

The id locating the individual mapping within the given source.
Note, the standardsMappings[0].source property is required if
this property is specified.


Example:

"C74457"
+

Type: stringFormat: uri

The url that links out to the published, related concept.
The listed examples could both be attached to any variable related to, for example, heroin use.

:point_up: If you are looking to map field values to common data elements or a set of standards, see standardsMappings.


Examples:

"https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId=CHEBI:27808"
+
"http://purl.bioontology.org/ontology/RXNORM/3304"
+

Type: string

A human-readable title (ie label) to a concept related to the given field.
The listed examples could both be attached to any variable related to, for example, heroin use.

:point_up: If you are looking to map field values to common data elements or a set of standards, see standardsMappings.


Examples:

"Heroin Molecular Structure"
+
"Heroin Ontology"
+

Type: string

The source (e.g., a dictionary or vocabulary set) to a concept related to the given field.
The listed examples could both be attached to any variable related to, for example, heroin use.

:point_up: If you are looking to map field values to common data elements or a set of standards, see standardsMappings.


Examples:

"CHEBI"
+
"RXNORM"
+

Type: string

The id locating the individual concept within the source of the given field.
The listed examples could both be attached to any variable related to, for example, heroin use.

:point_up: If you are looking to map field values to common data elements or a set of standards, see standardsMappings.


Examples:

"27808"
+
"3304"
+
\ No newline at end of file diff --git a/variable-level-metadata-schema/docs/html-rendered-schemas/jsonschema-jsontemplate-data-dictionary.html b/variable-level-metadata-schema/docs/html-rendered-schemas/jsonschema-jsontemplate-data-dictionary.html index 20acb18..e159954 100644 --- a/variable-level-metadata-schema/docs/html-rendered-schemas/jsonschema-jsontemplate-data-dictionary.html +++ b/variable-level-metadata-schema/docs/html-rendered-schemas/jsonschema-jsontemplate-data-dictionary.html @@ -1,61 +1,137 @@ - Variable Level Metadata (Data Dictionaries)

Variable Level Metadata (Data Dictionaries)

Type: object

This schema defines the variable level metadata for one data dictionary for a given study. Note that a given study can have multiple data dictionaries.

Type: string

Type: string

Type: array of object

Each item of this array must be:

Type: object

Variable level metadata individual fields integrated into the variable level
metadata object within the HEAL platform metadata service.

!!! note "NOTE"

Only name and description properties are required.
For categorical variables, constraints.enum and encodings (where applicable) properties are highly encouraged.
For studies using HEAL or other common data elements (CDEs), standardsMappings information is highly encouraged.
type and format properties may be particularly useful for some variable types (e.g. date-like variables)

Type: string

The section, form, survey instrument, set of measures or other broad category used
to group variables.


Examples:

"Demographics"
-
"PROMIS"
-
"Substance use"
-
"Medical History"
-
"Sleep questions"
-
"Physical activity"
-

Type: string

The name of a variable (i.e., field) as it appears in the data.

Type: string

The human-readable title or label of the variable.


Examples:

"My Variable"
-
"Gender identity"
-

Type: string

An extended description of the variable. This could be the definition of a variable or the
question text (e.g., if a survey).


Examples:

"The participant's age at the time of study enrollment"
-
"What is the highest grade or level of school you have completed or the highest degree you have received?"
-

Type: enum (of string)

A classification or category of a particular data element or property expected or allowed in the dataset.

Definitions:

  • number (A numeric value with optional decimal places. (e.g., 3.14))
  • integer (A whole number without decimal places. (e.g., 42))
  • string (A sequence of characters. (e.g., \"test\"))
  • any (Any type of data is allowed. (e.g., true))
  • boolean (A binary value representing true or false. (e.g., true))
  • date (A specific calendar date. (e.g., \"2023-05-25\"))
  • datetime (A specific date and time, including timezone information. (e.g., \"2023-05-25T10:30:00Z\"))
  • time (A specific time of day. (e.g., \"10:30:00\"))
  • year (A specific year. (e.g., 2023))
  • yearmonth (A specific year and month. (e.g., \"2023-05\"))
  • duration (A length of time. (e.g., \"PT1H\"))
  • geopoint (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278]))

Must be one of:

  • "number"
  • "integer"
  • "string"
  • "any"
  • "boolean"
  • "date"
  • "datetime"
  • "time"
  • "year"
  • "yearmonth"
  • "duration"
  • "geopoint"

Type: string

Indicates the format of the type specified in the type property.
Each format is dependent on the type specified.
For example: If type is "string", then see the String formats.
If type is "date", "datetime", or "time", default format is ISO8601 formatting for those respective types (see details on ISO8601 format for Date,
Datetime,
or Time) - If you want to specify a date-like variable using standard Python/C strptime syntax, see here for details.
See here for more information about appropriate format values by variable type.

[Additional information]

Date Formats (date, datetime, time type variable):

A format for a date variable (date,time,datetime).
default: An ISO8601 format string.
any: Any parsable representation of a date/time/datetime. The implementing library can attempt to parse the datetime via a range of strategies.

{PATTERN}: The value can be parsed according to {PATTERN},
which MUST follow the date formatting syntax of
C / Python strftime such as:

  • "%Y-%m-%d (for date, e.g., 2023-05-25)"
  • "%Y%m%d (for date without dashes, e.g., 20230525)"
  • "%Y-%m-%dT%H:%M:%S (for datetime, e.g., 2023-05-25T10:30:45)"
  • "%Y-%m-%dT%H:%M:%SZ (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)"
  • "%Y-%m-%dT%H:%M:%S%z (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)"
  • "%Y-%m-%dT%H:%M (for datetime without seconds, e.g., 2023-05-25T10:30)"
  • "%Y-%m-%dT%H (for datetime without minutes and seconds, e.g., 2023-05-25T10)"
  • "%H:%M:%S (for time, e.g., 10:30:45)"
  • "%H:%M:%SZ (for time with UTC timezone, e.g., 10:30:45Z)"
  • "%H:%M:%S%z (for time with timezone offset, e.g., 10:30:45+0300)"

String formats:

  • "email if valid emails (e.g., test@gmail.com)"
  • "uri if valid uri addresses (e.g., https://example.com/resource123)"
  • "binary if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)"
  • "uuid if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)"

Geopoint formats:

The two types of formats for geopoint (describing a geographic point).

  • array (if 'lat,long' (e.g., 36.63,-90.20))
  • object (if {'lat':36.63,'lon':-90.20})

Type: object

Type: integer

Indicates the maximum length of an iterable (e.g., array, string, or
object). For example, if 'Hello World' is the longest value of a
categorical variable, this would be a maxLength of 11.

Type: array

Constrains possible values to a set of values.


Examples:

[
+ Variable Level Metadata (Data Dictionaries) 

Variable Level Metadata (Data Dictionaries)

Type: object

This schema defines the variable level metadata for one data dictionary for a given study. Note that a given study can have multiple data dictionaries.

Type: string

Type: string

Type: string

The version of the schema used in agreed upon convention of major.minor.patch (e.g., 1.0.2)

NOTE: This is NOT for versioning of each individual data dictionary instance.
Rather, it is the
version of THIS schema document. See version property (below) if specifying the individual data dictionary instance
version.

If generating a vlmd document as a csv file, include this version in
every row/record to indicate this is a schema level property
(not applicable for the json version as this property is already at the schema/root level)

Must match regular expression: \d+\.\d+\.\d+
Examples:

"1.0.0"
+
"0.2.0"
+

Type: string

The specified individual data dictionary instance version.

Type: array

A set of standardized instruments linked to all variables within the fields property (but see note).

!!! note "NOTE"

If standardsMappings is present at both the root (this property) and within fields,
then the fields standardsMappings property takes precedence.

Note, only instrument can be mapped to this property, as opposed to the fields standardsMappings.
This property has the same specification as the fields standardsMappings to make the cascading logic
easier to understand in the same way other standards implement cascading
(e.g., missingValues in the frictionless specification)

Each item of this array must be:

Type: object

Type: object

A standardized set of items which encompass
a variable in this variable level metadata document (if at the root level or the document level)
or the individual variable (if at the field level).

!!! note "NOTE"

If information is present at both the root and the field level,
then the information at the field level would take precedence (i.e., it would cascade).

Type: stringFormat: uri

A url (e.g., link, address) to a file or other resource containing the instrument, or
a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level)
or the individual variable (if at the field level).


Example:

"https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx"
+

Type: enum (of string)

An abbreviated name/acronym from a controlled vocabulary referencing the resource (e.g., program or repository)
containing the instrument, or a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level)
or the individual variable (if at the field level).

Must be one of:

  • "heal-cde"

Type: string

Examples:

"Adult demographics"
+
"adult-demographics"
+

Type: string

A code or other string that identifies the instrument within the source.
This should always be from the source's formal, standardized identification system


Example:

"5141"
+

Type: object

Additional properties not included as a core property.

Type: array of object

Each item of this array must be:

Type: object

!!! note "Highly encouraged"

  • Only name and description properties are required.
  • For categorical variables, constraints.enum and enumLabels (where applicable) properties are highly encouraged.
  • For studies using HEAL or other common data elements (CDEs), standardsMappings information is highly encouraged.
  • type and format properties may be particularly useful for some variable types (e.g. date-like variables)

Type: string

The version of the schema used in agreed upon convention of major.minor.patch (e.g., 1.0.2)

NOTE: This is NOT for versioning of each individual data dictionary instance.
Rather, it is the
version of THIS schema document. See version property (below) if specifying the individual data dictionary instance
version.

If generating a vlmd document as a csv file, include this version in
every row/record to indicate this is a schema level property
(not applicable for the json version as this property is already at the schema/root level)

Must match regular expression: \d+\.\d+\.\d+
Examples:

"1.0.0"
+
"0.2.0"
+

Type: string

The section, form, survey instrument, set of measures or other broad category used
to group variables. Previously called "module."


Examples:

"Demographics"
+
"PROMIS"
+
"Medical History"
+

Type: string

The name of a variable (i.e., field) as it appears in the data.


Example:

"gender_id"
+

Type: string

The human-readable title or label of the variable.


Example:

"Gender identity"
+

Type: string

An extended description of the variable. This could be the definition of a variable or the
question text (e.g., if a survey).


Examples:

"The participant's age at the time of study enrollment"
+
"What is the highest grade or level of school you have completed or the highest degree you have received?"
+

Type: enum (of string)

A classification or category of a particular data element or property expected or allowed in the dataset.

Must be one of:

  • "number"
  • "integer"
  • "string"
  • "any"
  • "boolean"
  • "date"
  • "datetime"
  • "time"
  • "year"
  • "yearmonth"
  • "duration"
  • "geopoint"

Type: string

Indicates the format of the type specified in the type property.
Each format is dependent on the type specified.
See here
for more information about appropriate format values by variable type.

Type: object

Type: boolean

If this variable is marked as true, then this variable's value must be present
(ie not missing; see missingValues). If marked as false or not present, then the
variable CAN be missing.

Type: integer

Indicates the maximum length of an iterable (e.g., array, string, or
object). For example, if 'Hello World' is the longest value of a
categorical variable, this would be a maxLength of 11.

Type: array

Constrains possible values to a set of values.


Examples:

[
     1,
     2,
     3,
-    4
-]
-
[
-    "White",
-    "Black or African American",
-    "American Indian or Alaska Native",
-    "Native Hawaiian or Other Pacific Islander",
-    "Asian",
-    "Some other race",
-    "Multiracial"
-]
-

Type: string

A regular expression pattern the data MUST conform to.

Type: integer

Specifies the maximum value of a field (e.g., maximum -- or most
recent -- date, maximum integer etc). Note, this is different than the
maxLength property.

Type: integer

Specifies the minimum value of a field.

Type: object

Variable value encodings provide a way to further annotate any value within any variable type,
making values easier to understand.

Many analytic software programs (e.g., SPSS, Stata, and SAS) use numerical encodings and some algorithms
only support numerical values. Encodings (and mappings) allow categorical values to be stored as
numerical values.

Additionally, as another use case, this field provides a way to
store categoricals that are stored as "short" labels (such as
abbreviations).


Examples:

{
-    "0": "No",
-    "1": "Yes"
+    4,
+    5
+]
+
[
+    "Poor",
+    "Fair",
+    "Good",
+    "Very good",
+    "Excellent"
+]
+

Type: string

A regular expression pattern the data MUST conform to.

Type: integer

Specifies the maximum value of a field (e.g., maximum -- or most
recent -- date, maximum integer etc). Note, this is different than the
maxLength property.

Type: integer

Specifies the minimum value of a field.

Type: object

Variable value encodings provide a way to further annotate any value within any variable type,
making values easier to understand.

Many analytic software programs (e.g., SPSS, Stata, and SAS) use numerical encodings and some algorithms
only support numerical values. Encodings (and mappings) allow categorical values to be stored as
numerical values.

Additionally, as another use case, this field provides a way to
store categoricals that are stored as "short" labels (such as
abbreviations).

This field is intended to follow this pattern


Examples:

{
+    "1": "Poor",
+    "2": "Fair",
+    "3": "Good",
+    "4": "Very good",
+    "5": "Excellent"
 }
-
{
+
{
     "HW": "Hello world",
     "GBW": "Good bye world",
     "HM": "Hi, Mike"
 }
-

Type: boolean

Indicates whether a categorical variable is ordered. This variable is
relevant for variables that have an ordered relationship but not
necessarily a numerical relationship (e.g., Strongly disagree < Disagree
< Neutral < Agree).

Type: array

A list of missing values specific to a variable.


Examples:

[
+

Type: boolean

Indicates whether a categorical variable is ordered. This variable is
relevant for variables that have an ordered relationship but not
necessarily a numerical relationship (e.g., Strongly disagree < Disagree
< Neutral < Agree).

This field is intended to follow the ordering aspect of [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering).

Type: array

A list of missing values specific to a variable.


Examples:

[
     "Missing",
     "Skipped",
     "No preference"
 ]
-
[
+
[
     "Missing"
 ]
-

Type: array of string

For boolean (true) variable (as defined in type field), this field allows
a physical string representation to be cast as true (increasing
readability of the field). It can include one or more values.

Each item of this array must be:


Examples:

[
+

Type: array

For boolean (true) variable (as defined in type field), this field allows
a physical string representation to be cast as true (increasing
readability of the field). It can include one or more values.


Examples:

[
     "required",
     "Yes",
     "Checked"
 ]
-
[
+
[
     "required"
 ]
-

Type: array

For boolean (false) variable (as defined in type field), this field allows
a physical string representation to be cast as false (increasing
readability of the field) that is not a standard false value. It can include one or more values.

Type: array of object

A published set of standard variables such as the NIH Common Data Elements program.

Each item of this array must be:

Type: object

Type: stringFormat: uri

The url that links out to the published, standardized mapping.


Example:

"https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI"
-

Type: string

The type of mapping linked to a published set of standard variables such as the NIH Common Data Elements program


Examples:

"cde"
-
"ontology"
-
"reference_list"
-

Type: string

A free text label of a mapping indicating a mapping(s) to a published set of standard variables such as the NIH Common Data Elements program.


Examples:

"substance use"
-
"chemical compound"
-
"promis"
-

Type: string

The source of the standardized variable.


Example:

"TBD (will have controlled vocabulary)"
-

Type: string

The id locating the individual mapping within the given source.

Type: array of object

Mappings to a published set of concepts related to the given field such as ontological information (eg., NCI thesaurus, bioportal etc)

Each item of this array must be:

Type: object

Type: stringFormat: uri

The url that links out to the published, standardized concept.


Example:

"https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI"
-

Type: string

The type of mapping to a published set of concepts related to the given field such as
ontological information (eg., NCI thesaurus, bioportal etc)

Type: string

A free text label of mapping to a published set of concepts related to the given field such as
ontological information (eg., NCI thesaurus, bioportal etc)

Type: string

The source of the related concept.


Example:

"TBD (will have controlled vocabulary)"
-

Type: string

The id locating the individual mapping within the given source.

Type: object

Univariate statistics inferred from the data about the given variable

Type: integer

Value must be greater or equal to 0

Additional Properties of any type are allowed.

Type: object
\ No newline at end of file +

Type: array

For boolean (false) variable (as defined in type field), this field allows
a physical string representation to be cast as false (increasing
readability of the field) that is not a standard false value. It can include one or more values.


Examples:

[
+    "Not required",
+    "NOT REQUIRED"
+]
+
[
+    "No"
+]
+

Type: object

Additional properties not included as a core property.

Type: array of object

A set of instrument and item references to standardized data elements designed to document
the HEAL common data elements program
and other standardized/common element sources to facilitate cross-study comparison and interoperability
of data. One can map either an individual data element or an instrument of which the field is
a part.

*All Fields Mapped (Both Instrument and Item)*

"standardsMappings": [
+    {
+        "instrument": {
+            "url": "https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx",
+            "source": "heal-cde",
+            "title": "adult-demographics",
+            "id": "5141"
+        },
+        "item": {
+            "url": "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE",
+            "source": "CDISC",
+            "id": "C74457"
+        }
+    }
+]
+

*Only Instrument Title of Form CDE File Mapped*

In this scenario, especially as CDE variables do not have associated CDISC ids listed, only instrument information is given.

"standardsMappings": [
+    {
+        "instrument": {
+            "source": "heal-cde",
+            "title": "Adult demographics"
+        }
+    }
+]
+

*Only Instrument ID of HEAL CDE Mapped*

"standardsMappings": [
+    {
+        "instrument": {
+            "source": "heal-cde",
+            "id": "5141"
+        }
+    }
+]
+

*Other Non-HEAL CDE Use Cases*

Only item matched (for example if found in the NIH (not HEAL) CDE repository). Folks would enter the information in the "Identifier" section. Similar to the above, they could also just enter the "url".

"standardsMappings": [
+    {
+        "item": {
+            "source": "NLM",
+            "id": "Fakc6Jy2x"
+        }
+    }
+]
+

*Multiple CDE Mappings*

Two separate records. If desired, multiple standard mappings can be entered, say from the NIH HEAL CDE repo and the NIH CDE lookup (NLM) by way of two separate records in the list.

"standardsMappings": [
+    {
+        "instrument": {
+            "source": "heal-cde",
+            "title": "Adult demographics"
+        },
+        "item": {
+            "source": "CDISC",
+            "id": "C74457"
+        }
+    },
+    {
+        "item": {
+            "source": "NLM",
+            "id": "Fakc6Jy2x"
+        }
+    }
+]
+

Each item of this array must be:

Type: object

Type: object

A standardized set of items which encompass
a variable in this variable level metadata document (if at the root level or the document level)
or the individual variable (if at the field level).

!!! note "NOTE"

If information is present at both the root and the field level,
then the information at the field level would take precedence (i.e., it would cascade).

Type: stringFormat: uri

A url (e.g., link, address) to a file or other resource containing the instrument, or
a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level)
or the individual variable (if at the field level).


Example:

"https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx"
+

Type: enum (of string)

An abbreviated name/acronym from a controlled vocabulary referencing the resource (e.g., program or repository)
containing the instrument, or a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level)
or the individual variable (if at the field level).

Must be one of:

  • "heal-cde"

Type: string

Examples:

"Adult demographics"
+
"adult-demographics"
+

Type: string

A code or other string that identifies the instrument within the source.
This should always be from the source's formal, standardized identification system


Example:

"5141"
+

Type: object

A standardized item (ie field, variable etc) mapped to this individual variable.

Type: stringFormat: uri

The url that links out to the published, standardized mapping of a variable (e.g., common data element)


Example:

"https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE"
+

Type: string

The source of the standardized variable. Note, this property is required if
an id is specified.


Example:

"CDISC"
+

Type: string

The id locating the individual mapping within the given source.
Note, the standardsMappings[0].source property is required if
this property is specified.


Example:

"C74457"
+

Type: array of object

*[Under development]* Mappings to a published set of concepts related to the given field such as
ontological information (eg., NCI thesaurus, bioportal etc)

Each item of this array must be:

Type: object

Type: stringFormat: uri

The url that links out to the published, related concept.
The listed examples could both be attached to any variable related to, for example, heroin use.

:point_up: If you are looking to map field values to common data elements or a set of standards, see standardsMappings.


Examples:

"https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId=CHEBI:27808"
+
"http://purl.bioontology.org/ontology/RXNORM/3304"
+

Type: string

A human-readable title (ie label) to a concept related to the given field.
The listed examples could both be attached to any variable related to, for example, heroin use.

:point_up: If you are looking to map field values to common data elements or a set of standards, see standardsMappings.


Examples:

"Heroin Molecular Structure"
+
"Heroin Ontology"
+

Type: string

The source (e.g., a dictionary or vocabulary set) to a concept related to the given field.
The listed examples could both be attached to any variable related to, for example, heroin use.

:point_up: If you are looking for mapping field values to common data elements or a set of standards, see standardsMappings.


Examples:

"CHEBI"
+
"RXNORM"
+

Type: string

The id locating the individual concept within the source of the given field.
The listed examples could both be attached to any variable related to, for example, heroin use.

:point_up: If you are looking for mapping field values to common data elements or a set of standards, see standardsMappings.


Examples:

"27808"
+
"3304"
+
\ No newline at end of file diff --git a/variable-level-metadata-schema/docs/md-rendered-schemas/jsonschema-csvtemplate-fields.md b/variable-level-metadata-schema/docs/md-rendered-schemas/jsonschema-csvtemplate-fields.md index 67aed35..b1e871e 100644 --- a/variable-level-metadata-schema/docs/md-rendered-schemas/jsonschema-csvtemplate-fields.md +++ b/variable-level-metadata-schema/docs/md-rendered-schemas/jsonschema-csvtemplate-fields.md @@ -1,71 +1,92 @@ -# HEAL Variable Level Metadata Fields +# HEAL Variable Level Metadata Fields -Variable level metadata individual fields integrated into the variable level -metadata object within the HEAL platform metadata service. +_version 0.2.0_ -!!! note "NOTE" + - Only `name` and `description` properties are required. - For categorical variables, `constraints.enum` and `encodings` (where applicable) properties are highly encouraged. - For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged. - `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables) +The aim of this HEAL metadata piece is to track and provide basic information about variables in a tabular data file (i.e. a data file with rows and columns) from your HEAL study. The objective is to list all variables and descriptive information about those variables. This will ensure that potential secondary data users know what data has been collected or calculated and how to use these data. Note that a given study can have multiple tabular data files; You should create a data dictionary for each tabular data file. Thus, a study may have multiple data dictionaries. -## Properties +!!! note "Highly encouraged" -**`module`** _(string)_ - The section, form, survey instrument, set of measures or other broad category used -to group variables. + - Only `name` and `description` properties are required. 
+ - For categorical variables, `constraints.enum` and `enumLabels` (where applicable) properties are highly encouraged. + - For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged. + - `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables) + + +## Properties (i.e., fields or variables) + + +**`schemaVersion`** _(string)_ + The version of the schema used in agreed upon convention of major.minor.patch (e.g., 1.0.2) + +NOTE: This is NOT for versioning of each individual data dictionary instance. +Rather, it is the +version of THIS schema document. See `version` property (below) if specifying the individual data dictionary instance +version. + +If generating a vlmd document as a csv file, include this version in +every row/record to indicate this is a schema level property +(not applicable for the json version as this property is already at the schema/root level) Examples: ``` - Demographics + 1.0.0 ``` ``` - PROMIS + 0.2.0 ``` -``` - Substance use +**`section`** _(string)_ + The section, form, survey instrument, set of measures or other broad category used +to group variables. Previously called "module." + +Examples: -``` ``` - Medical History + Demographics ``` ``` - Sleep questions + PROMIS ``` ``` - Physical activity + Medical History ``` **`name`** _(string,required)_ The name of a variable (i.e., field) as it appears in the data. - -**`title`** _(string)_ - The human-readable title or label of the variable. - Examples: ``` - My Variable + gender_id ``` +**`title`** _(string)_ + The human-readable title or label of the variable. + +Examples: + + ``` Gender identity @@ -91,131 +112,19 @@ Examples: **`type`** _(string)_ A classification or category of a particular data element or property expected or allowed in the dataset. -Definitions: - -- `number` (A numeric value with optional decimal places. 
(e.g., 3.14)) -- `integer` (A whole number without decimal places. (e.g., 42)) -- `string` (A sequence of characters. (e.g., \"test\")) -- `any` (Any type of data is allowed. (e.g., true)) -- `boolean` (A binary value representing true or false. (e.g., true)) -- `date` (A specific calendar date. (e.g., \"2023-05-25\")) -- `datetime` (A specific date and time, including timezone information. (e.g., \"2023-05-25T10:30:00Z\")) -- `time` (A specific time of day. (e.g., \"10:30:00\")) -- `year` (A specific year. (e.g., 2023) -- `yearmonth` (A specific year and month. (e.g., \"2023-05\")) -- `duration` (A length of time. (e.g., \"PT1H\") -- `geopoint` (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278])) - -Possible values: - -- ``` - - number - - ``` -- ``` - - integer - - ``` -- ``` - - string - - ``` -- ``` - - any - - ``` -- ``` - - boolean - - ``` -- ``` - - date - - ``` -- ``` - - datetime - - ``` -- ``` - - time - - ``` -- ``` - - year - - ``` -- ``` - - yearmonth - - ``` -- ``` - - duration - - ``` -- ``` - - geopoint - - ``` - +Must be one of: `number`, `integer`, `string`, `any`, `boolean`, `date`, `datetime`, `time`, `year`, `yearmonth`, `duration`, `geopoint` **`format`** _(string)_ Indicates the format of the type specified in the `type` property. Each format is dependent on the `type` specified. -For example: If `type` is "string", then see the [String formats](https://specs.frictionlessdata.io/table-schema/#string). -If `type` is "date", "datetime", or "time", default format is ISO8601 formatting for those respective types (see details on ISO8601 format for [Date](https://specs.frictionlessdata.io/table-schema/#date), -[Datetime](https://specs.frictionlessdata.io/table-schema/#datetime), -or [Time](https://specs.frictionlessdata.io/table-schema/#time)) - If you want to specify a date-like variable using standard Python/C strptime syntax, see [here](#format-details-for-date-datetime-time-type-variables) for details. 
-See [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) for more information about appropriate `format` values by variable `type`. - -[Additional information] - -Date Formats (date, datetime, time `type` variable): - -A format for a date variable (`date`,`time`,`datetime`). -**default**: An ISO8601 format string. -**any**: Any parsable representation of a date/time/datetime. The implementing library can attempt to parse the datetime via a range of strategies. +See [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) +for more information about appropriate `format` values by variable `type`. -**{PATTERN}**: The value can be parsed according to `{PATTERN}`, -which `MUST` follow the date formatting syntax of -C / Python [strftime](http://strftime.org/) such as: -- "`%Y-%m-%d` (for date, e.g., 2023-05-25)" -- "`%Y%-%d` (for date, e.g., 20230525) for date without dashes" -- "`%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45)" -- "`%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)" -- "`%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)" -- "`%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30)" -- "`%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10)" -- "`%H:%M:%S` (for time, e.g., 10:30:45)" -- "`%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z)" -- "`%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300)" - -String formats: - -- "`email` if valid emails (e.g., test@gmail.com)" -- "`uri` if valid uri addresses (e.g., https://example.com/resource123)" -- "`binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)" -- "`uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)" - - -Geopoint formats: - -The two types of formats for `geopoint` (describing a geographic point). 
- - `array` (if 'lat,long' (e.g., 36.63,-90.20)) -- `object` (if {'lat':36.63,'lon':-90.20}) +**`constraints.required`** _(boolean)_ + If this variable is marked as true, then this variable's value must be present +(i.e., not missing; see missingValues). If marked as false or not present, then the +variable CAN be missing. **`constraints.maxLength`** _(integer)_ @@ -231,12 +140,12 @@ Examples: ``` - 1|2|3|4|5|6|7|8 + 1|2|3|4|5 ``` ``` - White|Black or African American|American Indian or Alaska Native|Native Hawaiian or Other Pacific Islander|Asian|Some other race|Multiracial + Poor|Fair|Good|Very good|Excellent ``` @@ -254,7 +163,7 @@ maxLength property. Specifies the minimum value of a field. -**`encodings`** _(string)_ +**`enumLabels`** _(string)_ Variable value encodings provide a way to further annotate any value within any variable type, making values easier to understand. @@ -267,25 +176,29 @@ Additionally, as another use case, this field provides a way to store categoricals that are stored as "short" labels (such as abbreviations). +This field is intended to follow [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering) + Examples: ``` - 0=No|1=Yes + 1=Poor|2=Fair|3=Good|4=Very good|5=Excellent ``` ``` - HW=Hello world|GBW=Good bye world|HM=Hi,Mike + HW=Hello world|GBW=Good bye world|HM=Hi, Mike ``` -**`ordered`** _(boolean)_ +**`enumOrdered`** _(boolean)_ Indicates whether a categorical variable is ordered. This variable is relevant for variables that have an ordered relationship but not necessarily a numerical relationship (e.g., Strongly disagree < Disagree < Neutral < Agree). +This field is intended to follow the ordering aspect of [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering) + **`missingValues`** _(string)_ A list of missing values specific to a variable. 
@@ -312,168 +225,241 @@ Examples: ``` - Required|REQUIRED + required|Yes|Checked ``` ``` - required|Yes|Y|Checked + required ``` +**`falseValues`** _(string)_ + For boolean (false) variable (as defined in type field), this field allows +a physical string representation to be cast as false (increasing +readability of the field) that is not a standard false value. It can include one or more values. + +Examples: + + ``` - Checked + Not required|NOT REQUIRED ``` ``` - Required + No ``` -**`falseValues`** _(string)_ - For boolean (false) variable (as defined in type field), this field allows -a physical string representation to be cast as false (increasing -readability of the field) that is not a standard false value. It can include one or more values. +**`custom`** _(string)_ + Additional properties not included as a core property. + +**`standardsMappings[0].instrument.url`** _(string)_ + A url (e.g., link, address) to a file or other resource containing the instrument, or +a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) +or the individual variable (if at the field level). + +Examples: -**`repo_link`** _(string)_ - A link to the variable as it exists on the home repository, if applicable +``` + https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx + +``` -**`standardsMappings.url`** _(string)_ - The url that links out to the published, standardized mapping. +**`standardsMappings[0].instrument.source`** _(string)_ + An abbreviated name/acronym from a controlled vocabulary referencing the resource (e.g., program or repository) +containing the instrument, or a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) +or the individual variable (if at the field level). 
+Must be one of: `heal-cde` + +**`standardsMappings[0].instrument.title`** _(string)_ + Examples: ``` - https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI + Adult demographics + +``` + +``` + adult-demographics ``` -**`standardsMappings.type`** _(string)_ - The **type** of mapping linked to a published set of standard variables such as the NIH Common Data Elements program +**`standardsMappings[0].instrument.id`** _(string)_ + A code or other string that identifies the instrument within the source. +This should always be from the source's formal, standardized identification system Examples: ``` - cde + 5141 ``` +**`standardsMappings[0].item.url`** _(string)_ + The url that links out to the published, standardized mapping of a variable (e.g., common data element) + +Examples: + + ``` - ontology + https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE ``` +**`standardsMappings[0].item.source`** _(string)_ + The source of the standardized variable. Note, this property is required if +an id is specified. + +Examples: + + ``` - reference_list + CDISC ``` -**`standardsMappings.label`** _(string)_ - A free text **label** of a mapping indicating a mapping(s) to a published set of standard variables such as the NIH Common Data Elements program. +**`standardsMappings[0].item.id`** _(string)_ + The id locating the individual mapping within the given source. +Note, the `standardsMappings[0].item.source` property is required if +this property is specified. Examples: ``` - substance use + C74457 ``` +**`relatedConcepts[0].url`** _(string)_ + The url that links out to the published, related concept. +The listed examples could both be attached to any variable related to, for example, heroin use. 
+ +> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings` + +Examples: + + ``` - chemical compound + https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId=CHEBI:27808 ``` ``` - promis + http://purl.bioontology.org/ontology/RXNORM/3304 ``` -**`standardsMappings.source`** _(string)_ - The source of the standardized variable. +**`relatedConcepts[0].title`** _(string)_ + A human-readable title (ie label) to a concept related to the given field. +The listed examples could both be attached to any variable related to, for example, heroin use. + +> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings` Examples: ``` - TBD (will have controlled vocabulary) + Heroin Molecular Structure ``` -**`standardsMappings.id`** _(string)_ - The id locating the individual mapping within the given source. +``` + Heroin Ontology +``` -**`relatedConcepts.url`** _(string)_ - The url that links out to the published, standardized concept. +**`relatedConcepts[0].source`** _(string)_ + The source (e.g., a dictionary or vocabulary set) to a concept related to the given field. +The listed examples could both be attached to any variable related to, for example, heroin use. 
+ +> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings` Examples: ``` - https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI + CHEBI ``` -**`relatedConcepts.type`** _(string)_ - The **type** of mapping to a published set of concepts related to the given field such as -ontological information (eg., NCI thesaurus, bioportal etc) - +``` + RXNORM -**`relatedConcepts.label`** _(string)_ - A free text **label** of mapping to a published set of concepts related to the given field such as -ontological information (eg., NCI thesaurus, bioportal etc) +``` +**`relatedConcepts[0].id`** _(string)_ + The id locating the individual concept within the source of the given field. +The listed examples could both be attached to any variable related to, for example, heroin use. -**`relatedConcepts.source`** _(string)_ - The source of the related concept. +> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings` Examples: ``` - TBD (will have controlled vocabulary) + 27808 ``` -**`relatedConcepts.id`** _(string)_ - The id locating the individual mapping within the given source. +``` + 3304 +``` -**`univarStats.median`** _(number)_ - -**`univarStats.mean`** _(number)_ - +## End of schema - Additional Property information -**`univarStats.std`** _(number)_ - +## `type` enum definitions: -**`univarStats.min`** _(number)_ - +- `number` (A numeric value with optional decimal places. (e.g., 3.14)) +- `integer` (A whole number without decimal places. (e.g., 42)) +- `string` (A sequence of characters. (e.g., \"test\")) +- `any` (Any type of data is allowed. (e.g., true)) +- `boolean` (A binary value representing true or false. (e.g., true)) +- `date` (A specific calendar date. (e.g., \"2023-05-25\")) +- `datetime` (A specific date and time, including timezone information. (e.g., \"2023-05-25T10:30:00Z\")) +- `time` (A specific time of day. 
(e.g., \"10:30:00\")) +- `year` (A specific year. (e.g., 2023)) +- `yearmonth` (A specific year and month. (e.g., \"2023-05\")) +- `duration` (A length of time. (e.g., \"PT1H\")) +- `geopoint` (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278])) -**`univarStats.max`** _(number)_ - +## `format` examples/definitions of patterns and possible values: -**`univarStats.mode`** _(number)_ - +Examples of date time pattern formats -**`univarStats.count`** _(integer)_ - +- `%Y-%m-%d` (for date, e.g., 2023-05-25) +- `%Y%m%d` (for date, e.g., 20230525) for date without dashes +- `%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45) +- `%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z) +- `%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300) +- `%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30) +- `%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10) +- `%H:%M:%S` (for time, e.g., 10:30:45) +- `%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z) +- `%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300) -**`univarStats.twentyFifthPercentile`** _(number)_ - +Examples of string formats -**`univarStats.seventyFifthPercentile`** _(number)_ - +- `email` if valid emails (e.g., test@gmail.com) +- `uri` if valid uri addresses (e.g., https://example.com/resource123) +- `binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=) +- `uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479) -**`univarStats.categoricalMarginals.name`** _(string)_ - -**`univarStats.categoricalMarginals.count`** _(integer)_ - +Examples of geopoint formats + +The two types of formats for `geopoint` (describing a geographic point). 
+ +- `array` (if 'lat,long' (e.g., 36.63,-90.20)) +- `object` (if {'lat':36.63,'lon':-90.20}) + diff --git a/variable-level-metadata-schema/docs/md-rendered-schemas/jsonschema-jsontemplate-data-dictionary.md b/variable-level-metadata-schema/docs/md-rendered-schemas/jsonschema-jsontemplate-data-dictionary.md index 0727a97..c569d2c 100644 --- a/variable-level-metadata-schema/docs/md-rendered-schemas/jsonschema-jsontemplate-data-dictionary.md +++ b/variable-level-metadata-schema/docs/md-rendered-schemas/jsonschema-jsontemplate-data-dictionary.md @@ -1,77 +1,120 @@ -# Variable Level Metadata (Data Dictionaries) +# Variable Level Metadata (Data Dictionaries) -This schema defines the variable level metadata for one data dictionary for a given study.Note a given study can have multiple data dictionaries +_version 0.2.0_ -### `title` _(string,required)_ +This schema defines the variable level metadata for one data dictionary for a given study.Note a given study can have multiple data dictionaries. -### `description` _(string)_ +## `title` _(string,required)_ -### `data_dictionary` _(array,required)_ +## `description` _(string)_ -Variable level metadata individual fields integrated into the variable level -metadata object within the HEAL platform metadata service. +## `schemaVersion` _(string)_ +The version of the schema used in agreed upon convention of major.minor.path (e.g., 1.0.2) + +NOTE: This is NOT for versioning of each indiviual data dictionary instance. +Rather, it is the +version of THIS schema document. See `version` property (below) if specifying the individual data dictionary instance +version. + +If generating a vlmd document as a csv file, include this version in +every row/record to indicate this is a schema level property +(not applicable for the json version as this property is already at the schema/root level) + +## `version` _(string)_ +The specified individual data dictionary instance version. 
+## `standardsMappings` _(array)_ +A set of standardized instruments linked to all variables within the `fields` property (but see note). !!! note "NOTE" - Only `name` and `description` properties are required. - For categorical variables, `constraints.enum` and `encodings` (where applicable) properties are highly encouraged. - For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged. - `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables) + If `standardsMappings` is present at both the root (this property) and within `fields`, + then the `fields` `standardsMappings` property takes precedence. -#### Properties for each record + Note, only instrument can be mapped to this property as opposed to the `fields` `standardsMappings` + This property has the same specification as the `fields` `standardsMappings` to make the cascading logic + easier to understand in the same way other standards implement cascading + (e.g., `missingValues` in the [frictionless specification](https://specs.frictionlessdata.io/patterns/#missing-values-per-field)) -**`module`** _(string)_ - The section, form, survey instrument, set of measures or other broad category used -to group variables. +## `custom` _(object)_ +Additional properties not included as a core property. + +## `fields` _(array,required)_ + + + +!!! note "Highly encouraged" + + - Only `name` and `description` properties are required. + - For categorical variables, `constraints.enum` and `enumLabels` (where applicable) properties are highly encouraged. + - For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged. + - `type` and `format` properties may be particularly useful for some variable types (e.g. 
date-like variables) + +### Properties for each `fields` record + +**`schemaVersion`** _(string)_ + The version of the schema used in agreed upon convention of major.minor.patch (e.g., 1.0.2) + +NOTE: This is NOT for versioning of each individual data dictionary instance. +Rather, it is the +version of THIS schema document. See `version` property (below) if specifying the individual data dictionary instance +version. + +If generating a vlmd document as a csv file, include this version in +every row/record to indicate this is a schema level property +(not applicable for the json version as this property is already at the schema/root level) Examples: ``` - Demographics + 1.0.0 ``` ``` - PROMIS + 0.2.0 ``` -``` - Substance use +**`section`** _(string)_ + The section, form, survey instrument, set of measures or other broad category used +to group variables. Previously called "module." + +Examples: -``` ``` - Medical History + Demographics ``` ``` - Sleep questions + PROMIS ``` ``` - Physical activity + Medical History ``` **`name`** _(string,required)_ The name of a variable (i.e., field) as it appears in the data. - -**`title`** _(string)_ - The human-readable title or label of the variable. - Examples: ``` - My Variable + gender_id ``` +**`title`** _(string)_ + The human-readable title or label of the variable. + +Examples: + + ``` Gender identity @@ -97,135 +140,24 @@ Examples: **`type`** _(string)_ A classification or category of a particular data element or property expected or allowed in the dataset. -Definitions: - -- `number` (A numeric value with optional decimal places. (e.g., 3.14)) -- `integer` (A whole number without decimal places. (e.g., 42)) -- `string` (A sequence of characters. (e.g., \"test\")) -- `any` (Any type of data is allowed. (e.g., true)) -- `boolean` (A binary value representing true or false. (e.g., true)) -- `date` (A specific calendar date. (e.g., \"2023-05-25\")) -- `datetime` (A specific date and time, including timezone information. 
(e.g., \"2023-05-25T10:30:00Z\")) -- `time` (A specific time of day. (e.g., \"10:30:00\")) -- `year` (A specific year. (e.g., 2023) -- `yearmonth` (A specific year and month. (e.g., \"2023-05\")) -- `duration` (A length of time. (e.g., \"PT1H\") -- `geopoint` (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278])) - -Possible values: - -- ``` - - number - - ``` -- ``` - - integer - - ``` -- ``` - - string - - ``` -- ``` - - any - - ``` -- ``` - - boolean - - ``` -- ``` - - date - - ``` -- ``` - - datetime - - ``` -- ``` - - time - - ``` -- ``` - - year - - ``` -- ``` - - yearmonth - - ``` -- ``` - - duration - - ``` -- ``` - - geopoint - - ``` - +Must be one of: `number`, `integer`, `string`, `any`, `boolean`, `date`, `datetime`, `time`, `year`, `yearmonth`, `duration`, `geopoint` **`format`** _(string)_ Indicates the format of the type specified in the `type` property. Each format is dependent on the `type` specified. -For example: If `type` is "string", then see the [String formats](https://specs.frictionlessdata.io/table-schema/#string). -If `type` is "date", "datetime", or "time", default format is ISO8601 formatting for those respective types (see details on ISO8601 format for [Date](https://specs.frictionlessdata.io/table-schema/#date), -[Datetime](https://specs.frictionlessdata.io/table-schema/#datetime), -or [Time](https://specs.frictionlessdata.io/table-schema/#time)) - If you want to specify a date-like variable using standard Python/C strptime syntax, see [here](#format-details-for-date-datetime-time-type-variables) for details. -See [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) for more information about appropriate `format` values by variable `type`. - -[Additional information] - -Date Formats (date, datetime, time `type` variable): - -A format for a date variable (`date`,`time`,`datetime`). -**default**: An ISO8601 format string. -**any**: Any parsable representation of a date/time/datetime. 
The implementing library can attempt to parse the datetime via a range of strategies. - -**{PATTERN}**: The value can be parsed according to `{PATTERN}`, -which `MUST` follow the date formatting syntax of -C / Python [strftime](http://strftime.org/) such as: +See [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) +for more information about appropriate `format` values by variable `type`. -- "`%Y-%m-%d` (for date, e.g., 2023-05-25)" -- "`%Y%-%d` (for date, e.g., 20230525) for date without dashes" -- "`%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45)" -- "`%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)" -- "`%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)" -- "`%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30)" -- "`%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10)" -- "`%H:%M:%S` (for time, e.g., 10:30:45)" -- "`%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z)" -- "`%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300)" -String formats: - -- "`email` if valid emails (e.g., test@gmail.com)" -- "`uri` if valid uri addresses (e.g., https://example.com/resource123)" -- "`binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)" -- "`uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)" - - -Geopoint formats: - -The two types of formats for `geopoint` (describing a geographic point). +**`constraints`** _(object)_ + -- `array` (if 'lat,long' (e.g., 36.63,-90.20)) -- `object` (if {'lat':36.63,'lon':-90.20}) +- **`required`** _(boolean)_ + If this variable is marked as true, then this variable's value must be present + (ie not missing; see missingValues). If marked as false or not present, then the + variable CAN be missing. 
-**`constraints`** _(object)_ - - **`maxLength`** _(integer)_ @@ -243,13 +175,13 @@ The two types of formats for `geopoint` (describing a geographic point). ```json - [1, 2, 3, 4] + [1, 2, 3, 4, 5] ``` ```json - ['White', 'Black or African American', 'American Indian or Alaska Native', 'Native Hawaiian or Other Pacific Islander', 'Asian', 'Some other race', 'Multiracial'] + ['Poor', 'Fair', 'Good', 'Very good', 'Excellent'] ``` @@ -271,7 +203,7 @@ The two types of formats for `geopoint` (describing a geographic point). -**`encodings`** _(object)_ +**`enumLabels`** _(object)_ Variable value encodings provide a way to further annotate any value within a any variable type, making values easier to understand. @@ -284,12 +216,14 @@ Additionally, as another use case, this field provides a way to store categoricals that are stored as "short" labels (such as abbreviations). +This field is intended to follow [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering) + Examples: ```json - {'0': 'No', '1': 'Yes'} + {'1': 'Poor', '2': 'Fair', '3': 'Good', '4': 'Very good', '5': 'Excellent'} ``` @@ -299,12 +233,14 @@ Examples: ``` -**`ordered`** _(boolean)_ +**`enumOrdered`** _(boolean)_ Indicates whether a categorical variable is ordered. This variable is relevant for variables that have an ordered relationship but not necessarily a numerical relationship (e.g., Strongly disagree < Disagree < Neutral < Agree). +This field is intended to follow the ordering aspect of this [this pattern][this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering) + **`missingValues`** _(array)_ A list of missing values specific to a variable. @@ -349,58 +285,171 @@ Examples: a physical string representation to be cast as false (increasing readability of the field) that is not a standard false value. It can include one or more values. 
+Examples: + + +```json + + ['Not required', 'NOT REQUIRED'] + +``` + +```json + + ['No'] + +``` -**`repo_link`** _(string)_ - A link to the variable as it exists on the home repository, if applicable +**`custom`** _(object)_ + Additional properties not included as a core property. **`standardsMappings`** _(array)_ - A published set of standard variables such as the NIH Common Data Elements program. + +A set of instrument and item references to standardized data elements designed to document +the [HEAL common data elements program](https://heal.nih.gov/data/common-data-elements) +and other standardized/common element sources to facilitate cross-study comparison and interoperability +of data. One can either map an individual data element or an instrument in which the field is +a part of. -**`relatedConcepts`** _(array)_ - Mappings to a published set of concepts related to the given field such as ontological information (eg., NCI thesaurus, bioportal etc) +__**All Fields Mapped (Both Instrument and Item)**__ -**`univarStats`** _(object)_ - Univariate statistics inferred from the data about the given variable +```json +"standardsMappings": [ + { + "instrument": { + "url": "https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx", + "source": "heal-cde", + "title": "adult-demographics", + "id": "5141" + }, + "item": { + "url": "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE", + "source": "CDISC", + "id": "C74457" + } + } +] +``` +__**Only Instrument Title of Form CDE File Mapped**__ +In this scenario, especially as CDE variables do not have associated CDISC ids listed, only instrument information is given. 
-- **`median`** _(number)_ - +```json +"standardsMappings": [ + { + "instrument": { + "source": "heal-cde", + "title": "Adult demographics" + } + } +] +``` +__**Only Instrument ID of HEAL CDE Mapped**__ -- **`mean`** _(number)_ - +```json +"standardsMappings": [ + { + "instrument": { + "source": "heal-cde", + "id": "5141" + } + } +] +``` +__**Other Non-HEAL CDE Use Cases**__ -- **`std`** _(number)_ - +Only item matched (for example if found in the NIH (not HEAL) CDE repository). Folks would enter the information in the "Identifier" section. Similar to the above, they could also just enter the "url". +```json +"standardsMappings": [ + { + "item": { + "source": "NLM", + "id": "Fakc6Jy2x" + } + } +] +``` -- **`min`** _(number)_ - +__**Multiple CDE Mappings**__ +Two separate records. If desired, multiple standard mappings can be entered, say from the NIH HEAL CDE repo and the NIH CDE lookup (NLM) by way of two separate records in the list. -- **`max`** _(number)_ - +```json +"standardsMappings": [ + { + "instrument": { + "source": "heal-cde", + "title": "Adult demographics" + }, + "item": { + "source": "CDISC", + "id": "C74457" + }, + }, + { + "item": { + "source": "NLM", + "id": "Fakc6Jy2x" + } + } +] +``` -- **`mode`** _(number)_ - +**`relatedConcepts`** _(array)_ + __**[Under development]**__ Mappings to a published set of concepts related to the given field such as +ontological information (eg., NCI thesaurus, bioportal etc) -- **`count`** _(integer)_ - +### Additional `fields` property information +#### `type` enum definitions: -- **`twentyFifthPercentile`** _(number)_ - +- `number` (A numeric value with optional decimal places. (e.g., 3.14)) +- `integer` (A whole number without decimal places. (e.g., 42)) +- `string` (A sequence of characters. (e.g., \"test\")) +- `any` (Any type of data is allowed. (e.g., true)) +- `boolean` (A binary value representing true or false. (e.g., true)) +- `date` (A specific calendar date. 
(e.g., \"2023-05-25\")) +- `datetime` (A specific date and time, including timezone information. (e.g., \"2023-05-25T10:30:00Z\")) +- `time` (A specific time of day. (e.g., \"10:30:00\")) +- `year` (A specific year. (e.g., 2023)) +- `yearmonth` (A specific year and month. (e.g., \"2023-05\")) +- `duration` (A length of time. (e.g., \"PT1H\")) +- `geopoint` (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278])) +#### `format` examples/definitions of patterns and possible values: -- **`seventyFifthPercentile`** _(number)_ - +Examples of date time pattern formats +- `%Y-%m-%d` (for date, e.g., 2023-05-25) +- `%Y%m%d` (for date, e.g., 20230525) for date without dashes +- `%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45) +- `%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z) +- `%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300) +- `%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30) +- `%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10) +- `%H:%M:%S` (for time, e.g., 10:30:45) +- `%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z) +- `%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300) -- **`categoricalMarginals`** _(array)_ - +Examples of string formats + +- `email` if valid emails (e.g., test@gmail.com) +- `uri` if valid uri addresses (e.g., https://example.com/resource123) +- `binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=) +- `uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479) + + +Examples of geopoint formats + +The two types of formats for `geopoint` (describing a geographic point). 
+ +- `array` (if 'lat,long' (e.g., 36.63,-90.20)) +- `object` (if {'lat':36.63,'lon':-90.20}) diff --git a/variable-level-metadata-schema/examples/valid/template_submission.csv b/variable-level-metadata-schema/examples/valid/template_submission.csv index 3e27439..f199e77 100644 --- a/variable-level-metadata-schema/examples/valid/template_submission.csv +++ b/variable-level-metadata-schema/examples/valid/template_submission.csv @@ -1,8 +1,8 @@ -module,name,title,description,type,format,constraints.maxLength,constraints.enum,constraints.pattern,constraints.maximum,constraints.minimum,encodings,ordered,missingValues,trueValues,falseValues,repo_link,standardsMappings.type,standardsMappings.label,standardsMappings.url,standardsMappings.source,standardsMappings.id,relatedConcepts.type,relatedConcepts.label,relatedConcepts.url,relatedConcepts.source,relatedConcepts.id,univarStats.median,univarStats.mean,univarStats.std,univarStats.min,univarStats.max,univarStats.mode,univarStats.count,univarStats.twentyFifthPercentile,univarStats.seventyFifthPercentile,univarStats.categoricalMarginals.name,univarStats.categoricalMarginals.count -Enrollment,participant_id,Participant Id,Unique identifier for participant,string,,,,[A-Z][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9],,,,,,,,,,,,,,,,,,,,,,,,,,,,, -Demographics,race,Race,Self-reported race,integer,,,1|2|3|4|5|6|7|8,,,,1=White|2=Black or African American|3=American Indian or Alaska Native|4=Native| 5=Hawaiian or Other Pacific Islander|6=Asian|7=Some other race|8=Multiracial|99=Not reported,,99,,,,cde|cde,NLM race,,NLM|NLM,Fakc6Jy2x|m1_atF7L7U,,,,,,,,,,,,,,,, -Demographics,age,Age,What is your age? 
(age at enrollment),integer,,,,,90,0,,,,,,,,,,,,,,,,,,,,,,,,,,, -Demographics,hispanic,"Hispanic, Latino, or Spanish Origin","Are you of Hispanic, Latino, or Spanish origin?",boolean,,,,,,,,,Not reported,No,Yes,,,,,,,,,,,,,,,,,,,,,, -Demographics,sex_at_birth,Sex at Birth,The self-reported sex of the participant/subject at birth,string,,,Male|Female|Intersex|None of these describe me|Prefer not to answer|Unknown,,,,,,Prefer not to answer|Unknown,,,,,,,,,,,,,,,,,,,,,,,, -Substance Use,SU4,Heroin Days Used,During the past 30 days how many days did you use heroin (alone or mixed with other drugs)? ] [Write 0 days if no use],integer,,,,,,,,,,,,,,,,,,ontology|ontology,,https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId=CHEBI:27808|http://purl.bioontology.org/ontology/RXNORM/3304,CHEBI|RXNORM,27808|3304,,,,,,,,,,, -Biomeasures,pulse_rate,Pulse Rate,Heart rate measured at systemic artery,number,,,,,,,,,,,,,,,,,,ontology,SNOMEDCT bioontology,http://purl.bioontology.org/ontology/SNOMEDCT/78564009,SNOMEDCT,78564009,,,,,,,,,,, +section,name,title,description,type,format,constraints.maxLength,constraints.enum,constraints.pattern,constraints.maximum,constraints.minimum,enumLabels,enumOrdered,missingValues,trueValues,falseValues +Enrollment,participant_id,Participant Id,Unique identifier for participant,string,,,,[A-Z][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9],,,,,,, +Demographics,race,Race,Self-reported race,integer,,,1|2|3|4|5|6|7|8,,,,1=White|2=Black or African American|3=American Indian or Alaska Native|4=Native| 5=Hawaiian or Other Pacific Islander|6=Asian|7=Some other race|8=Multiracial|99=Not reported,,99,, +Demographics,age,Age,What is your age? 
(age at enrollment),integer,,,,,90,0,,,,, +Demographics,hispanic,"Hispanic, Latino, or Spanish Origin","Are you of Hispanic, Latino, or Spanish origin?",boolean,,,,,,,,,Not reported,No,Yes +Demographics,sex_at_birth,Sex at Birth,The self-reported sex of the participant/subject at birth,string,,,Male|Female|Intersex|None of these describe me|Prefer not to answer|Unknown,,,,,,Prefer not to answer|Unknown,, +Substance Use,SU4,Heroin Days Used,During the past 30 days how many days did you use heroin (alone or mixed with other drugs)? ] [Write 0 days if no use],integer,,,,,,,,,,, +Biomeasures,pulse_rate,Pulse Rate,Heart rate measured at systemic artery,number,,,,,,,,,,, diff --git a/variable-level-metadata-schema/examples/valid/template_submission.json b/variable-level-metadata-schema/examples/valid/template_submission.json index 3aa31e5..380428f 100644 --- a/variable-level-metadata-schema/examples/valid/template_submission.json +++ b/variable-level-metadata-schema/examples/valid/template_submission.json @@ -1,9 +1,9 @@ { "title": "Example VLMD", "description": "This is an example description", - "data_dictionary": [ + "fields": [ { - "module": "Enrollment", + "section": "Enrollment", "name": "participant_id", "title": "Participant Id", "description": "Unique identifier for participant", @@ -13,7 +13,7 @@ } }, { - "module": "Demographics", + "section": "Demographics", "name": "race", "title": "Race", "description": "Self-reported race", @@ -59,7 +59,7 @@ ] }, { - "module": "Demographics", + "section": "Demographics", "name": "age", "title": "Age", "description": "What is your age? 
(age at enrollment)", @@ -70,7 +70,7 @@ } }, { - "module": "Demographics", + "section": "Demographics", "name": "hispanic", "title": "Hispanic, Latino, or Spanish Origin", "description": "Are you of Hispanic, Latino, or Spanish origin?", @@ -86,7 +86,7 @@ ] }, { - "module": "Demographics", + "section": "Demographics", "name": "sex_at_birth", "title": "Sex at Birth", "description": "The self-reported sex of the participant/subject at birth", @@ -107,7 +107,7 @@ ] }, { - "module": "Substance Use", + "section": "Substance Use", "name": "SU4", "title": "Heroin Days Used", "description": "During the past 30 days how many days did you use heroin (alone or mixed with other drugs)? ] [Write 0 days if no use]", @@ -128,7 +128,7 @@ ] }, { - "module": "Biomeasures", + "section": "Biomeasures", "name": "pulse_rate", "title": "Pulse Rate", "description": "Heart rate measured at systemic artery", diff --git a/variable-level-metadata-schema/examples/valid/template_submission_minimal.csv b/variable-level-metadata-schema/examples/valid/template_submission_minimal.csv index 6815bd3..2d5175f 100644 --- a/variable-level-metadata-schema/examples/valid/template_submission_minimal.csv +++ b/variable-level-metadata-schema/examples/valid/template_submission_minimal.csv @@ -1,8 +1,8 @@ -module,name,title,description,type,format,constraints.maxLength,constraints.enum,constraints.pattern,constraints.maximum,constraints.minimum,encodings,ordered,missingValues,trueValues,falseValues,repo_link,standardsMappings.type,standardsMappings.label,standardsMappings.url,standardsMappings.source,standardsMappings.id,relatedConcepts.type,relatedConcepts.label,relatedConcepts.url,relatedConcepts.source,relatedConcepts.id,univarStats.median,univarStats.mean,univarStats.std,univarStats.min,univarStats.max,univarStats.mode,univarStats.count,univarStats.twentyFifthPercentile,univarStats.seventyFifthPercentile,univarStats.categoricalMarginals.name,univarStats.categoricalMarginals.count -,participant_id,,Unique 
identifier for participant,string,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,race,,Self-reported race,integer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,age,,What is your age? (age at enrollment),integer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,hispanic,,"Are you of Hispanic, Latino, or Spanish origin?",boolean,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,sex_at_birth,,The self-reported sex of the participant/subject at birth,string,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,SU4,,During the past 30 days how many days did you use heroin (alone or mixed with other drugs)? ] [Write 0 days if no use],integer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, -,pulse_rate,,Heart rate measured at systemic artery,number,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, +section,name,title,description,type +Enrollment,participant_id,Participant id,Unique identifier for participant,string +Demographics,race,Race,Self-reported race,integer +Demographics,age,Age,What is your age? (age at enrollment),integer +Demographics,hispanic,Hispanic,"Are you of Hispanic, Latino, or Spanish origin?",boolean +Demographics,sex_at_birth,Sex at Birth,The self-reported sex of the participant/subject at birth,string +Substance Use,SU4,Heroin Days Used,During the past 30 days how many days did you use heroin (alone or mixed with other drugs)? 
] [Write 0 days if no use],integer +Biomeasures,pulse_rate,Pulse rate,Heart rate measured at systemic artery,number diff --git a/variable-level-metadata-schema/examples/valid/template_submission_minimal.json b/variable-level-metadata-schema/examples/valid/template_submission_minimal.json index 21b993d..62c4f4f 100644 --- a/variable-level-metadata-schema/examples/valid/template_submission_minimal.json +++ b/variable-level-metadata-schema/examples/valid/template_submission_minimal.json @@ -1,7 +1,7 @@ { "title": "Minimal Example VLMD", "description": "This is an minimally filled out template", - "data_dictionary": [ + "fields": [ { "name": "participant_id", "description": "Unique identifier for participant", diff --git a/variable-level-metadata-schema/schemas/dictionary/_definitions.yaml b/variable-level-metadata-schema/schemas/dictionary/_definitions.yaml new file mode 100644 index 0000000..e34bae3 --- /dev/null +++ b/variable-level-metadata-schema/schemas/dictionary/_definitions.yaml @@ -0,0 +1,286 @@ +csvArray: + type: string + pattern: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$ + +csvObject: + type: string + pattern: ^(?:.*?=.*?(?:\||$))+$ + +schemaVersion: + type: string + description: | + The version of the schema used in agreed upon convention of major.minor.patch (e.g., 1.0.2) + + NOTE: This is NOT for versioning of each individual data dictionary instance. + Rather, it is the + version of THIS schema document. See `version` property (below) if specifying the individual data dictionary instance + version. 
+ + If generating a vlmd document as a csv file, include this version in + every row/record to indicate this is a schema level property + (not applicable for the json version as this property is already at the schema/root level) + pattern: \d+\.\d+\.\d+ + examples: + - "1.0.0" + - "0.2.0" +standardsMappingsInstrumentObject: + type: object + title: Standard mapping - instrument + description: | + A standardized set of items which encompass + a variable in this variable level metadata document (if at the root level or the document level) + or the individual variable (if at the field level). + + + !!! note "NOTE" + + If information is present at both the root and the field level, + then the information at the field level would take precedence (i.e., it would cascade). + + properties: + url: + title: Url + description: | + A url (e.g., link, address) to a file or other resource containing the instrument, or + a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) + or the individual variable (if at the field level). + type: string + format: uri + examples: + - "https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx" + source: + type: string + title: Source + description: | + An abbreviated name/acronym from a controlled vocabulary referencing the resource (e.g., program or repository) + containing the instrument, or a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) + or the individual variable (if at the field level). + enum: ["heal-cde"] + title: + type: string + title: Title + examples: + - Adult demographics + - adult-demographics + id: + type: string + title: Identifier + description: | + A code or other string that identifies the instrument within the source. 
+ This should always be from the source's formal, standardized identification system + + examples: + - "5141" + +rootStandardsMappingsItem: + type: array + description: | + A set of standardized instruments linked to all variables within the `fields` property (but see note). + + !!! note "NOTE" + + If `standardsMappings` is present at both the root (this property) and within `fields`, + then the `fields` `standardsMappings` property takes precedence. + + Note, only instrument can be mapped to this property as opposed to the `fields` `standardsMappings` + This property has the same specification as the `fields` `standardsMappings` to make the cascading logic + easier to understand in the same way other standards implement cascading + (e.g., `missingValues` in the [frictionless specification](https://specs.frictionlessdata.io/patterns/#missing-values-per-field)) + items: + properties: + type: object + instrument: + $ref: "#/_definitions/standardsMappingsInstrumentObject" + + +fieldStandardsMappingsItem: + type: array + description: | + + A set of instrument and item references to standardized data elements designed to document + the [HEAL common data elements program](https://heal.nih.gov/data/common-data-elements) + and other standardized/common element sources to facilitate cross-study comparison and interoperability + of data. One can either map an individual data element or an instrument in which the field is + a part of. 
+ + __**All Fields Mapped (Both Instrument and Item)**__ + + ```json + "standardsMappings": [ + { + "instrument": { + "url": "https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx", + "source": "heal-cde", + "title": "adult-demographics", + "id": "5141" + }, + "item": { + "url": "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE", + "source": "CDISC", + "id": "C74457" + } + } + ] + ``` + + __**Only Instrument Title of Form CDE File Mapped**__ + + In this scenario, especially as CDE variables do not have associated CDISC ids listed, only instrument information is given. + + ```json + "standardsMappings": [ + { + "instrument": { + "source": "heal-cde", + "title": "Adult demographics" + } + } + ] + ``` + + __**Only Instrument ID of HEAL CDE Mapped**__ + + ```json + "standardsMappings": [ + { + "instrument": { + "source": "heal-cde", + "id": "5141" + } + } + ] + ``` + + __**Other Non-HEAL CDE Use Cases**__ + + Only item matched (for example if found in the NIH (not HEAL) CDE repository). Folks would enter the information in the "Identifier" section. Similar to the above, they could also just enter the "url". + + ```json + "standardsMappings": [ + { + "item": { + "source": "NLM", + "id": "Fakc6Jy2x" + } + } + ] + ``` + + __**Multiple CDE Mappings**__ + + Two separate records. If desired, multiple standard mappings can be entered, say from the NIH HEAL CDE repo and the NIH CDE lookup (NLM) by way of two separate records in the list. 
+ + ```json + "standardsMappings": [ + { + "instrument": { + "source": "heal-cde", + "title": "Adult demographics" + }, + "item": { + "source": "CDISC", + "id": "C74457" + } + }, + { + "item": { + "source": "NLM", + "id": "Fakc6Jy2x" + } + } + ] + ``` + items: + type: object + properties: + instrument: + $ref: "#/_definitions/standardsMappingsInstrumentObject" + + + item: + type: object + title: Standards mappings - Item + description: | + A standardized item (ie field, variable etc) mapped to this individual variable. + properties: + url: + title: Standards mappings - Url + description: | + The url that links out to the published, standardized mapping of a variable (e.g., common data element) + type: string + format: uri + examples: + - "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE" + source: + title: Standards mappings - Source + description: | + The source of the standardized variable. Note, this property is required if + an id is specified. + examples: + - "CDISC" + type: string + id: + title: Standards Mappings - Id + type: string + description: | + The id locating the individual mapping within the given source. + Note, the `standardsMappings[0].source` property is required if + this property is specified. + examples: + - "C74457" + +relatedConcepts: + title: Related Concepts + description: | + __**[Under development]**__ Mappings to a published set of concepts related to the given field such as + ontological information (eg., NCI thesaurus, bioportal etc) + type: array + items: + type: object + properties: + url: + title: Related Concepts - Url + description: | + The url that links out to the published, related concept. + The listed examples could both be attached to any variable related to, for example, heroin use. 
+ + > :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_ + type: string + format: uri + examples: + - https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId=CHEBI:27808 + - http://purl.bioontology.org/ontology/RXNORM/3304 + + title: + title: Related Concepts - Title + description: | + A human-readable title (ie label) to a concept related to the given field. + The listed examples could both be attached to any variable related to, for example, heroin use. + + > :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_ + type: string + examples: + - Heroin Molecular Structure + - Heroin Ontology + source: + title: Related Concepts - Source + description: | + The source (e.g., a dictionary or vocabulary set) to a concept related to the given field. + The listed examples could both be attached to any variable related to, for example, heroin use. + + > :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_ + type: string + examples: + - CHEBI + - RXNORM + id: + title: Related Concepts - Id + type: string + description: | + The id locating the individual concept within the source of the given field. + The listed examples could both be attached to any variable related to, for example, heroin use. 
+ + > :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_ + examples: + - "27808" + - "3304" diff --git a/variable-level-metadata-schema/schemas/dictionary/data-dictionary.yaml b/variable-level-metadata-schema/schemas/dictionary/data-dictionary.yaml index cc83799..b99ab2f 100644 --- a/variable-level-metadata-schema/schemas/dictionary/data-dictionary.yaml +++ b/variable-level-metadata-schema/schemas/dictionary/data-dictionary.yaml @@ -2,17 +2,48 @@ "$id": vlmd title: Variable Level Metadata (Data Dictionaries) description: This schema defines the variable level metadata for one data dictionary - for a given study.Note a given study can have multiple data dictionaries + for a given study. Note a given study can have multiple data dictionaries. type: object required: - title -- data_dictionary +- fields properties: title: type: string description: type: string - data_dictionary: + schemaVersion: + $ref: "#/_definitions/schemaVersion" + + version: # TODO: think about having a version text/message and id (akin to a git commit) + type: string + description: The specified individual data dictionary instance version. + standardsMappings: + $ref: "#/_definitions/rootStandardsMappingsItem" + custom: + type: object + description: | + Additional properties not included as a core property. 
+ fields: type: array items: - $ref: "#/fields" \ No newline at end of file + $ref: "#/fields" +propertyNames: + description: | + To allow additional properties for compatibility with other standards at the "table" , or root, but not included in the core `properties` set: + + [Frictionless Data package table schema standard](https://specs.frictionlessdata.io/table-schema): `missingValues`|`primaryKey`|`foreignKeys` + enum: + # core properties + - title + - description + - schemaVersion + - version + - standardsMappings + - fields + # custom properties + - custom + # custom properties but a part of standards + - missingValues + - primaryKey + - foreignKeys diff --git a/variable-level-metadata-schema/schemas/dictionary/definitions.yaml b/variable-level-metadata-schema/schemas/dictionary/definitions.yaml deleted file mode 100644 index 8ac0d07..0000000 --- a/variable-level-metadata-schema/schemas/dictionary/definitions.yaml +++ /dev/null @@ -1,69 +0,0 @@ -csvArray: - type: string - pattern: ^(?:[^|]+\||[^|]*)(?:[^|]*\|)*[^|]*$ - -csvObject: - type: string - pattern: ^(?:.*?=.*?(?:\||$))+$ - - -# for frictionless types and formats see: -# https://specs.frictionlessdata.io/table-schema/#types-and-formats - -# NOTE: The below was excluded from schema to simplify (10/6/2023) and formats is now just type string isntead of anyOf -stringFormat: - title: String Formats - description: | - A format for a specialized type of string of: - - - "`email` if valid emails (e.g., test@gmail.com)" - - "`uri` if valid uri addresses (e.g., https://example.com/resource123)" - - "`binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)" - - "`uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)" - - enum: - - uri - - email - - binary - - uuid - -dateFormat: - title: Date Formats - type: string - description: | - A format for a date variable (`date`,`time`,`datetime`). 
- **default**: An ISO8601 format string. - **any**: Any parsable representation of a date/time/datetime. The implementing library can attempt to parse the datetime via a range of strategies. - - **{PATTERN}**: The value can be parsed according to `{PATTERN}`, - which `MUST` follow the date formatting syntax of - C / Python [strftime](http://strftime.org/) such as: - - - "`%Y-%m-%d` (for date, e.g., 2023-05-25)" - - "`%Y%-%d` (for date, e.g., 20230525) for date without dashes" - - "`%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45)" - - "`%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)" - - "`%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)" - - "`%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30)" - - "`%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10)" - - "`%H:%M:%S` (for time, e.g., 10:30:45)" - - "`%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z)" - - "`%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300)" - - -geojsonFormat: - title: Geojson Formats - type: string - description: The JSON object according to the geojson spec. - enum: [topojson,default] - -geopointFormat: - title: Geopoint Format - type: string - description: | - The two types of formats for `geopoint` (describing a geographic point). 
- - - `array` (if 'lat,long' (e.g., 36.63,-90.20)) - - `object` (if {'lat':36.63,'lon':-90.20}) - enum: [array,object] - diff --git a/variable-level-metadata-schema/schemas/dictionary/fields.yaml b/variable-level-metadata-schema/schemas/dictionary/fields.yaml index 6053edd..d0c07e9 100644 --- a/variable-level-metadata-schema/schemas/dictionary/fields.yaml +++ b/variable-level-metadata-schema/schemas/dictionary/fields.yaml @@ -1,52 +1,46 @@ -"$schema": http://json-schema.org/draft-04/schema# -"$id": vlmd-fields title: HEAL Variable Level Metadata Fields description: | - Variable level metadata individual fields integrated into the variable level - metadata object within the HEAL platform metadata service. + - !!! note "NOTE" - - Only `name` and `description` properties are required. - For categorical variables, `constraints.enum` and `encodings` (where applicable) properties are highly encouraged. - For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged. - `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables) + !!! note "Highly encouraged" + - Only `name` and `description` properties are required. + - For categorical variables, `constraints.enum` and `enumLabels` (where applicable) properties are highly encouraged. + - For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged. + - `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables) type: object -additionalProperties: true required: - name - description properties: - module: + schemaVersion: + $ref: "#/_definitions/schemaVersion" + section: type: string - title: Module + title: Section description: | The section, form, survey instrument, set of measures or other broad category used - to group variables. + to group variables. Previously called "module." 
examples: - Demographics - PROMIS - - Substance use - Medical History - - Sleep questions - - Physical activity name: type: string title: Variable Name description: | The name of a variable (i.e., field) as it appears in the data. + examples: + - gender_id title: type: string title: Variable Label (ie Title) description: | - The human-readable title or label of the variable. - - examples: - - My Variable + The human-readable title or label of the variable. + examples: - Gender identity description: type: string @@ -63,8 +57,8 @@ properties: type: string description: | A classification or category of a particular data element or property expected or allowed in the dataset. - - Definitions: + additionalDescription: | + enum definitions: - `number` (A numeric value with optional decimal places. (e.g., 3.14)) - `integer` (A whole number without decimal places. (e.g., 42)) @@ -97,44 +91,34 @@ properties: description: | Indicates the format of the type specified in the `type` property. Each format is dependent on the `type` specified. - For example: If `type` is "string", then see the [String formats](https://specs.frictionlessdata.io/table-schema/#string). - If `type` is "date", "datetime", or "time", default format is ISO8601 formatting for those respective types (see details on ISO8601 format for [Date](https://specs.frictionlessdata.io/table-schema/#date), - [Datetime](https://specs.frictionlessdata.io/table-schema/#datetime), - or [Time](https://specs.frictionlessdata.io/table-schema/#time)) - If you want to specify a date-like variable using standard Python/C strptime syntax, see [here](#format-details-for-date-datetime-time-type-variables) for details. - See [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) for more information about appropriate `format` values by variable `type`. 
- - [Additional information] + See [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) + for more information about appropriate `format` values by variable `type`. - Date Formats (date, datetime, time `type` variable): - - A format for a date variable (`date`,`time`,`datetime`). - **default**: An ISO8601 format string. - **any**: Any parsable representation of a date/time/datetime. The implementing library can attempt to parse the datetime via a range of strategies. + additionalDescription: | + examples/definitions of patterns and possible values: - **{PATTERN}**: The value can be parsed according to `{PATTERN}`, - which `MUST` follow the date formatting syntax of - C / Python [strftime](http://strftime.org/) such as: + Examples of date time pattern formats - - "`%Y-%m-%d` (for date, e.g., 2023-05-25)" - - "`%Y%-%d` (for date, e.g., 20230525) for date without dashes" - - "`%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45)" - - "`%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)" - - "`%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)" - - "`%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30)" - - "`%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10)" - - "`%H:%M:%S` (for time, e.g., 10:30:45)" - - "`%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z)" - - "`%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300)" + - `%Y-%m-%d` (for date, e.g., 2023-05-25) + - `%Y%m%d` (for date, e.g., 20230525) for date without dashes + - `%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45) + - `%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z) + - `%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300) + - `%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30) + - `%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 
2023-05-25T10) + - `%H:%M:%S` (for time, e.g., 10:30:45) + - `%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z) + - `%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300) - String formats: + Examples of string formats - - "`email` if valid emails (e.g., test@gmail.com)" - - "`uri` if valid uri addresses (e.g., https://example.com/resource123)" - - "`binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)" - - "`uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)" + - `email` if valid emails (e.g., test@gmail.com) + - `uri` if valid uri addresses (e.g., https://example.com/resource123) + - `binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=) + - `uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479) - Geopoint formats: + Examples of geopoint formats The two types of formats for `geopoint` (describing a geographic point). @@ -144,6 +128,13 @@ properties: constraints: type: object properties: + required: + type: boolean + title: Required variable + description: | + If this variable is marked as true, then this variable's value must be present + (ie not missing; see missingValues). If marked as false or not present, then the + variable CAN be missing. maxLength: type: integer title: Maximum Length @@ -152,24 +143,14 @@ properties: object). For example, if 'Hello World' is the longest value of a categorical variable, this would be a maxLength of 11. - enumJsonSpec: + enum: + type: array title: Variable Possible Values description: | Constrains possible values to a set of values. 
- - type: array examples: - - [1,2,3,4] - - ["White","Black or African American","American Indian or Alaska Native","Native Hawaiian or Other Pacific Islander","Asian","Some other race","Multiracial"] - enumCsvSpec: - title: Variable Possible Values - description: | - Constrains possible values to a set of values. - - $ref: "#/definitions/csvArray" - examples: - - 1|2|3|4|5|6|7|8 - - White|Black or African American|American Indian or Alaska Native|Native Hawaiian or Other Pacific Islander|Asian|Some other race|Multiracial + - [1,2,3,4,5] + - ["Poor","Fair","Good","Very good","Excellent"] pattern: type: string title: Regular Expression Pattern @@ -190,7 +171,7 @@ properties: description: | Specifies the minimum value of a field. - encodingsJsonSpec: + enumLabels: title: 'Variable Value Encodings (i.e., mappings; value labels)' description: | Variable value encodings provide a way to further annotate any value within a any variable type, @@ -205,30 +186,13 @@ properties: store categoricals that are stored as "short" labels (such as abbreviations). + This field is intended to follow [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering) + type: object examples: - - {"0":"No","1":"Yes"} + - {"1":"Poor","2":"Fair","3":"Good","4":"Very good","5":"Excellent"} - {"HW":"Hello world","GBW":"Good bye world","HM":"Hi, Mike"} - encodingsCsvSpec: - title: 'Variable Value Encodings (i.e., mappings; value labels)' - description: | - Variable value encodings provide a way to further annotate any value within a any variable type, - making values easier to understand. - - - Many analytic software programs (e.g., SPSS,Stata, and SAS) use numerical encodings and some algorithms - only support numerical values. Encodings (and mappings) allow categorical values to be stored as - numerical values. - - Additionally, as another use case, this field provides a way to - store categoricals that are stored as "short" labels (such as - abbreviations). 
- - $ref: "#/definitions/csvObject" - examples: - - '0=No|1=Yes' - - 'HW=Hello world|GBW=Good bye world|HM=Hi,Mike' - ordered: + enumOrdered: title: An ordered variable description: | Indicates whether a categorical variable is ordered. This variable is @@ -236,29 +200,20 @@ properties: necessarily a numerical relationship (e.g., Strongly disagree < Disagree < Neutral < Agree). + This field is intended to follow the ordering aspect of [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering) + type: boolean - missingValuesJsonSpec: + missingValues: title: Missing Values description: | A list of missing values specific to a variable. - examples: - ["Missing","Skipped","No preference"] - ["Missing"] type: array - missingValuesCsvSpec: - title: Missing Values - description: | - A list of missing values specific to a variable. - - examples: - - - Missing|Skipped|No preference - - Missing - $ref: "#/definitions/csvArray" - trueValuesJsonSpec: + trueValues: title: Boolean True Value Labels description: | For boolean (true) variable (as defined in type field), this field allows @@ -266,164 +221,24 @@ properties: readability of the field). It can include one or more values. type: array - items: - type: string examples: - ["required","Yes","Checked"] - ["required"] - - trueValuesCsvSpec: - $ref: "#/definitions/csvArray" - description: | - For boolean (true) variable (as defined in type field), this field allows - a physical string representation to be cast as true (increasing - readability of the field). It can include one or more values. - - examples: - - Required|REQUIRED - - required|Yes|Y|Checked - - Checked - - Required - falseValuesJsonSpec: + falseValues: title: Boolean False Value Labels description: | For boolean (false) variable (as defined in type field), this field allows a physical string representation to be cast as false (increasing readability of the field) that is not a standard false value. 
It can include one or more values. type: array - falseValuesCsvSpec: - title: Boolean False Value Labels - description: | - For boolean (false) variable (as defined in type field), this field allows - a physical string representation to be cast as false (increasing - readability of the field) that is not a standard false value. It can include one or more values. - $ref: "#/definitions/csvArray" - repo_link: - type: string - title: Variable Repository Link + examples: + - ["Not required","NOT REQUIRED"] + - ["No"] + custom: + type: object description: | - A link to the variable as it exists on the home repository, if applicable + Additional properties not included as a core property. standardsMappings: - title: Standards Mappings - description: A published set of standard variables such as the NIH Common Data Elements program. - type: array - items: - type: object - properties: - url: - title: Standards Mapping - Url - description: | - The url that links out to the published, standardized mapping. - type: string - format: uri - examples: - - https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI - type: - title: Standards Mapping - Title - description: | - The **type** of mapping linked to a published set of standard variables such as the NIH Common Data Elements program - - examples: - - cde - - ontology - - reference_list - type: string - label: - title: Standards Mapping - Label - description: | - A free text **label** of a mapping indicating a mapping(s) to a published set of standard variables such as the NIH Common Data Elements program. - - type: string - examples: - - substance use - - chemical compound - - promis - source: - title: Standard Mapping - Source - description: | - The source of the standardized variable. - type: string - examples: - - TBD (will have controlled vocabulary) - id: - title: Standard Mapping - Id - type: string - description: | - The id locating the individual mapping within the given source. 
+ $ref: "#/_definitions/fieldStandardsMappingsItem" relatedConcepts: - title: Related Concepts - description: Mappings to a published set of concepts related to the given field such as - ontological information (eg., NCI thesaurus, bioportal etc) - - - type: array - items: - type: object - properties: - url: - title: Related Concepts - Url - description: | - The url that links out to the published, standardized concept. - type: string - format: uri - examples: - - https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI - type: - title: Related concepts - Type - description: | - The **type** of mapping to a published set of concepts related to the given field such as - ontological information (eg., NCI thesaurus, bioportal etc) - type: string - label: - type: string - title: Related Concepts - Label - description: | - A free text **label** of mapping to a published set of concepts related to the given field such as - ontological information (eg., NCI thesaurus, bioportal etc) - - source: - title: Related Concepts - Source - description: | - The source of the related concept. - type: string - examples: - - TBD (will have controlled vocabulary) - id: - title: Related Concepts - Id - type: string - description: | - The id locating the individual mapping within the given source. 
- univarStats: - type: object - description: | - Univariate statistics inferred from the data about the given variable - - properties: - median: - type: number - mean: - type: number - std: - type: number - min: - type: number - max: - type: number - mode: - type: number - count: - type: integer - minimum: 0 - twentyFifthPercentile: - type: number - seventyFifthPercentile: - type: number - categoricalMarginals: - type: array - items: - type: object - properties: - name: - type: string - count: - type: integer + $ref: "#/_definitions/relatedConcepts" \ No newline at end of file diff --git a/variable-level-metadata-schema/schemas/frictionless/csvtemplate/fields.json b/variable-level-metadata-schema/schemas/frictionless/csvtemplate/fields.json index 68a981f..50a83e3 100644 --- a/variable-level-metadata-schema/schemas/frictionless/csvtemplate/fields.json +++ b/variable-level-metadata-schema/schemas/frictionless/csvtemplate/fields.json @@ -1,19 +1,28 @@ { - "version": "0.1.0", - "description": "Variable level metadata individual fields integrated into the variable level\nmetadata object within the HEAL platform metadata service.\n\n!!! note \"NOTE\"\n\n Only `name` and `description` properties are required. \n For categorical variables, `constraints.enum` and `encodings` (where applicable) properties are highly encouraged. \n For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged.\n `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables)\n", + "version": "0.2.0", + "description": "\n\n!!! note \"Highly encouraged\"\n\n - Only `name` and `description` properties are required. \n - For categorical variables, `constraints.enum` and `enumLabels` (where applicable) properties are highly encouraged. 
\n - For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged.\n - `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables)\n", "title": "HEAL Variable Level Metadata Fields", "fields": [ { - "name": "module", - "description": "The section, form, survey instrument, set of measures or other broad category used \nto group variables.\n", - "title": "Module", + "name": "schemaVersion", + "description": "The version of the schema used in agreed upon convention of major.minor.patch (e.g., 1.0.2) \n\nNOTE: This is NOT for versioning of each individual data dictionary instance. \nRather, it is the\nversion of THIS schema document. See `version` property (below) if specifying the individual data dictionary instance\nversion.\n\nIf generating a vlmd document as a csv file, include this version in \nevery row/record to indicate this is a schema level property \n(not applicable for the json version as this property is already at the schema/root level)\n", + "examples": [ + "1.0.0", + "0.2.0" + ], + "type": "string", + "constraints": { + "pattern": "\\d+\\.\\d+\\.\\d+" + } + }, + { + "name": "section", + "description": "The section, form, survey instrument, set of measures or other broad category used \nto group variables. Previously called \"module.\"\n", + "title": "Section", "examples": [ "Demographics", "PROMIS", - "Substance use", - "Medical History", - "Sleep questions", - "Physical activity" + "Medical History" ], "type": "string" }, @@ -21,6 +30,9 @@ "name": "name", "description": "The name of a variable (i.e., field) as it appears in the data. \n", "title": "Variable Name", + "examples": [ + "gender_id" + ], "type": "string", "constraints": { "required": true @@ -28,10 +40,9 @@ }, { "name": "title", - "description": "The human-readable title or label of the variable. 
\n", + "description": "The human-readable title or label of the variable.\n", "title": "Variable Label (ie Title)", "examples": [ - "My Variable", "Gender identity" ], "type": "string" @@ -51,32 +62,38 @@ }, { "name": "type", - "description": "A classification or category of a particular data element or property expected or allowed in the dataset.\n\nDefinitions:\n\n- `number` (A numeric value with optional decimal places. (e.g., 3.14))\n- `integer` (A whole number without decimal places. (e.g., 42))\n- `string` (A sequence of characters. (e.g., \\\"test\\\"))\n- `any` (Any type of data is allowed. (e.g., true))\n- `boolean` (A binary value representing true or false. (e.g., true))\n- `date` (A specific calendar date. (e.g., \\\"2023-05-25\\\"))\n- `datetime` (A specific date and time, including timezone information. (e.g., \\\"2023-05-25T10:30:00Z\\\"))\n- `time` (A specific time of day. (e.g., \\\"10:30:00\\\"))\n- `year` (A specific year. (e.g., 2023)\n- `yearmonth` (A specific year and month. (e.g., \\\"2023-05\\\"))\n- `duration` (A length of time. (e.g., \\\"PT1H\\\")\n- `geopoint` (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278]))\n", + "description": "A classification or category of a particular data element or property expected or allowed in the dataset.\n", "title": "Variable Type", "type": "string", "constraints": { "enum": [ + "time", "number", - "datetime", - "date", - "string", + "geopoint", "any", + "yearmonth", "year", - "geopoint", - "time", + "datetime", + "date", "integer", - "yearmonth", "duration", - "boolean" + "boolean", + "string" ] } }, { "name": "format", - "description": "Indicates the format of the type specified in the `type` property. \nEach format is dependent on the `type` specified. \nFor example: If `type` is \"string\", then see the [String formats](https://specs.frictionlessdata.io/table-schema/#string). 
\nIf `type` is \"date\", \"datetime\", or \"time\", default format is ISO8601 formatting for those respective types (see details on ISO8601 format for [Date](https://specs.frictionlessdata.io/table-schema/#date),\n[Datetime](https://specs.frictionlessdata.io/table-schema/#datetime), \nor [Time](https://specs.frictionlessdata.io/table-schema/#time)) - If you want to specify a date-like variable using standard Python/C strptime syntax, see [here](#format-details-for-date-datetime-time-type-variables) for details. \nSee [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) for more information about appropriate `format` values by variable `type`. \n\n[Additional information]\n\nDate Formats (date, datetime, time `type` variable):\n\nA format for a date variable (`date`,`time`,`datetime`). \n**default**: An ISO8601 format string.\n**any**: Any parsable representation of a date/time/datetime. The implementing library can attempt to parse the datetime via a range of strategies.\n\n**{PATTERN}**: The value can be parsed according to `{PATTERN}`,\nwhich `MUST` follow the date formatting syntax of \nC / Python [strftime](http://strftime.org/) such as:\n\n- \"`%Y-%m-%d` (for date, e.g., 2023-05-25)\"\n- \"`%Y%-%d` (for date, e.g., 20230525) for date without dashes\"\n- \"`%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45)\"\n- \"`%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)\"\n- \"`%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)\"\n- \"`%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30)\"\n- \"`%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10)\"\n- \"`%H:%M:%S` (for time, e.g., 10:30:45)\"\n- \"`%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z)\"\n- \"`%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300)\"\n\nString formats:\n\n- \"`email` if valid emails (e.g., test@gmail.com)\"\n- \"`uri` if valid uri 
addresses (e.g., https://example.com/resource123)\"\n- \"`binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)\"\n- \"`uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)\"\n\n\nGeopoint formats:\n\nThe two types of formats for `geopoint` (describing a geographic point).\n\n- `array` (if 'lat,long' (e.g., 36.63,-90.20))\n- `object` (if {'lat':36.63,'lon':-90.20})\n", + "description": "Indicates the format of the type specified in the `type` property. \nEach format is dependent on the `type` specified. \nSee [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) \nfor more information about appropriate `format` values by variable `type`.\n", "title": "Variable Format", "type": "string" }, + { + "name": "constraints.required", + "description": "If this variable is marked as true, then this variable's value must be present\n(ie not missing; see missingValues). If marked as false or not present, then the \nvariable CAN be missing.\n", + "title": "Required variable", + "type": "boolean" + }, { "name": "constraints.maxLength", "description": "Indicates the maximum length of an iterable (e.g., array, string, or\nobject). For example, if 'Hello World' is the longest value of a\ncategorical variable, this would be a maxLength of 11.\n", @@ -88,8 +105,8 @@ "description": "Constrains possible values to a set of values.\n", "title": "Variable Possible Values", "examples": [ - "1|2|3|4|5|6|7|8", - "White|Black or African American|American Indian or Alaska Native|Native Hawaiian or Other Pacific Islander|Asian|Some other race|Multiracial" + "1|2|3|4|5", + "Poor|Fair|Good|Very good|Excellent" ], "type": "string", "constraints": { @@ -115,12 +132,12 @@ "type": "integer" }, { - "name": "encodings", - "description": "Variable value encodings provide a way to further annotate any value within a any variable type,\nmaking values easier to understand. 
\n\n\nMany analytic software programs (e.g., SPSS,Stata, and SAS) use numerical encodings and some algorithms\nonly support numerical values. Encodings (and mappings) allow categorical values to be stored as\nnumerical values.\n\nAdditionally, as another use case, this field provides a way to\nstore categoricals that are stored as \"short\" labels (such as\nabbreviations).\n", + "name": "enumLabels", + "description": "Variable value encodings provide a way to further annotate any value within a any variable type,\nmaking values easier to understand. \n\n\nMany analytic software programs (e.g., SPSS,Stata, and SAS) use numerical encodings and some algorithms\nonly support numerical values. Encodings (and mappings) allow categorical values to be stored as\nnumerical values.\n\nAdditionally, as another use case, this field provides a way to\nstore categoricals that are stored as \"short\" labels (such as\nabbreviations).\n\nThis field is intended to follow [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering)\n", "title": "Variable Value Encodings (i.e., mappings; value labels)", "examples": [ - "0=No|1=Yes", - "HW=Hello world|GBW=Good bye world|HM=Hi,Mike" + "1=Poor|2=Fair|3=Good|4=Very good|5=Excellent", + "HW=Hello world|GBW=Good bye world|HM=Hi, Mike" ], "type": "string", "constraints": { @@ -128,8 +145,8 @@ } }, { - "name": "ordered", - "description": "Indicates whether a categorical variable is ordered. This variable is\nrelevant for variables that have an ordered relationship but not\nnecessarily a numerical relationship (e.g., Strongly disagree < Disagree\n< Neutral < Agree).\n", + "name": "enumOrdered", + "description": "Indicates whether a categorical variable is ordered. 
This variable is\nrelevant for variables that have an ordered relationship but not\nnecessarily a numerical relationship (e.g., Strongly disagree < Disagree\n< Neutral < Agree).\n\nThis field is intended to follow the ordering aspect of [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering)\n", "title": "An ordered variable", "type": "boolean" }, @@ -149,11 +166,10 @@ { "name": "trueValues", "description": "For boolean (true) variable (as defined in type field), this field allows\na physical string representation to be cast as true (increasing\nreadability of the field). It can include one or more values.\n", + "title": "Boolean True Value Labels", "examples": [ - "Required|REQUIRED", - "required|Yes|Y|Checked", - "Checked", - "Required" + "required|Yes|Checked", + "required" ], "type": "string", "constraints": { @@ -164,142 +180,127 @@ "name": "falseValues", "description": "For boolean (false) variable (as defined in type field), this field allows\na physical string representation to be cast as false (increasing\nreadability of the field) that is not a standard false value. It can include one or more values.\n", "title": "Boolean False Value Labels", + "examples": [ + "Not required|NOT REQUIRED", + "No" + ], "type": "string", "constraints": { "pattern": "^(?:[^|]+\\||[^|]*)(?:[^|]*\\|)*[^|]*$" } }, { - "name": "repo_link", - "description": "A link to the variable as it exists on the home repository, if applicable\n", - "title": "Variable Repository Link", - "type": "string" + "name": "custom", + "description": "Additional properties not included as a core property. 
\n", + "type": "string", + "constraints": { + "pattern": "^(?:.*?=.*?(?:\\||$))+$" + } }, { - "name": "standardsMappings.url", - "description": "The url that links out to the published, standardized mapping.\n", - "title": "Standards Mapping - Url", + "name": "standardsMappings[0].instrument.url", + "description": "A url (e.g., link, address) to a file or other resource containing the instrument, or\na set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). \n", + "title": "Url", "examples": [ - "https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI" + "https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx" ], "type": "string" }, { - "name": "standardsMappings.type", - "description": "The **type** of mapping linked to a published set of standard variables such as the NIH Common Data Elements program\n", - "title": "Standards Mapping - Title", + "name": "standardsMappings[0].instrument.source", + "description": "An abbreviated name/acronym from a controlled vocabulary referencing the resource (e.g., program or repository)\ncontaining the instrument, or a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). 
\n", + "title": "Source", + "type": "string", + "constraints": { + "enum": [ + "heal-cde" + ] + } + }, + { + "name": "standardsMappings[0].instrument.title", + "title": "Title", "examples": [ - "cde", - "ontology", - "reference_list" + "Adult demographics", + "adult-demographics" ], "type": "string" }, { - "name": "standardsMappings.label", - "description": "A free text **label** of a mapping indicating a mapping(s) to a published set of standard variables such as the NIH Common Data Elements program.\n", - "title": "Standards Mapping - Label", + "name": "standardsMappings[0].instrument.id", + "description": "A code or other string that identifies the instrument within the source.\nThis should always be from the source's formal, standardized identification system \n", + "title": "Identifier", "examples": [ - "substance use", - "chemical compound", - "promis" + "5141" ], "type": "string" }, { - "name": "standardsMappings.source", - "description": "The source of the standardized variable.\n", - "title": "Standard Mapping - Source", + "name": "standardsMappings[0].item.url", + "description": "The url that links out to the published, standardized mapping of a variable (e.g., common data element)\n", + "title": "Standards mappings - Url", "examples": [ - "TBD (will have controlled vocabulary)" + "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE" ], "type": "string" }, { - "name": "standardsMappings.id", - "description": "The id locating the individual mapping within the given source.\n", - "title": "Standard Mapping - Id", + "name": "standardsMappings[0].item.source", + "description": "The source of the standardized variable. 
Note, this property is required if \nan id is specified.\n", + "title": "Standards mappings - Source", + "examples": [ + "CDISC" + ], "type": "string" }, { - "name": "relatedConcepts.url", - "description": "The url that links out to the published, standardized concept.\n", - "title": "Related Concepts - Url", + "name": "standardsMappings[0].item.id", + "description": "The id locating the individual mapping within the given source. \nNote, the `standardsMappings[0].item.source` property is required if \nthis property is specified.\n", + "title": "Standards Mappings - Id", "examples": [ - "https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI" + "C74457" ], "type": "string" }, { - "name": "relatedConcepts.type", - "description": "The **type** of mapping to a published set of concepts related to the given field such as \nontological information (eg., NCI thesaurus, bioportal etc)\n", - "title": "Related concepts - Type", + "name": "relatedConcepts[0].url", + "description": "The url that links out to the published, related concept. 
\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", + "title": "Related Concepts - Url", + "examples": [ + "https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId=CHEBI:27808", + "http://purl.bioontology.org/ontology/RXNORM/3304" + ], "type": "string" }, { - "name": "relatedConcepts.label", - "description": "A free text **label** of mapping to a published set of concepts related to the given field such as \nontological information (eg., NCI thesaurus, bioportal etc)\n", - "title": "Related Concepts - Label", + "name": "relatedConcepts[0].title", + "description": "A human-readable title (ie label) to a concept related to the given field.\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", + "title": "Related Concepts - Title", + "examples": [ + "Heroin Molecular Structure", + "Heroin Ontology" + ], "type": "string" }, { - "name": "relatedConcepts.source", - "description": "The source of the related concept.\n", + "name": "relatedConcepts[0].source", + "description": "The source (e.g., a dictionary or vocabulary set) to a concept related to the given field.\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", "title": "Related Concepts - Source", "examples": [ - "TBD (will have controlled vocabulary)" + "CHEBI", + "RXNORM" ], "type": "string" }, { - "name": "relatedConcepts.id", - "description": "The id locating the individual mapping within the given source.\n", + "name": "relatedConcepts[0].id", + "description": "The id locating the 
individual concept within the source of the given field.\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", "title": "Related Concepts - Id", + "examples": [ + "27808", + "3304" + ], "type": "string" - }, - { - "name": "univarStats.median", - "type": "number" - }, - { - "name": "univarStats.mean", - "type": "number" - }, - { - "name": "univarStats.std", - "type": "number" - }, - { - "name": "univarStats.min", - "type": "number" - }, - { - "name": "univarStats.max", - "type": "number" - }, - { - "name": "univarStats.mode", - "type": "number" - }, - { - "name": "univarStats.count", - "type": "integer" - }, - { - "name": "univarStats.twentyFifthPercentile", - "type": "number" - }, - { - "name": "univarStats.seventyFifthPercentile", - "type": "number" - }, - { - "name": "univarStats.categoricalMarginals.name", - "type": "string" - }, - { - "name": "univarStats.categoricalMarginals.count", - "type": "integer" } ], "missingValues": [ diff --git a/variable-level-metadata-schema/schemas/jsonschema/csvtemplate/fields.json b/variable-level-metadata-schema/schemas/jsonschema/csvtemplate/fields.json index ac350ad..325a9c3 100644 --- a/variable-level-metadata-schema/schemas/jsonschema/csvtemplate/fields.json +++ b/variable-level-metadata-schema/schemas/jsonschema/csvtemplate/fields.json @@ -1,39 +1,45 @@ { - "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "vlmd-fields", + "version": "0.2.0", "title": "HEAL Variable Level Metadata Fields", - "description": "Variable level metadata individual fields integrated into the variable level\nmetadata object within the HEAL platform metadata service.\n\n!!! note \"NOTE\"\n\n Only `name` and `description` properties are required. \n For categorical variables, `constraints.enum` and `encodings` (where applicable) properties are highly encouraged. 
\n For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged.\n `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables)\n", + "description": "\n\n!!! note \"Highly encouraged\"\n\n - Only `name` and `description` properties are required. \n - For categorical variables, `constraints.enum` and `enumLabels` (where applicable) properties are highly encouraged. \n - For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged.\n - `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables)\n", "type": "object", - "additionalProperties": true, "required": [ "name", "description" ], "properties": { - "module": { + "schemaVersion": { "type": "string", - "title": "Module", - "description": "The section, form, survey instrument, set of measures or other broad category used \nto group variables.\n", + "description": "The version of the schema used in agreed upon convention of major.minor.patch (e.g., 1.0.2) \n\nNOTE: This is NOT for versioning of each individual data dictionary instance. \nRather, it is the\nversion of THIS schema document. See `version` property (below) if specifying the individual data dictionary instance\nversion.\n\nIf generating a vlmd document as a csv file, include this version in \nevery row/record to indicate this is a schema level property \n(not applicable for the json version as this property is already at the schema/root level)\n", + "pattern": "\\d+\\.\\d+\\.\\d+", + "examples": [ + "1.0.0", + "0.2.0" + ] + }, + "section": { + "type": "string", + "title": "Section", + "description": "The section, form, survey instrument, set of measures or other broad category used \nto group variables. 
Previously called \"module.\"\n", "examples": [ "Demographics", "PROMIS", - "Substance use", - "Medical History", - "Sleep questions", - "Physical activity" + "Medical History" ] }, "name": { "type": "string", "title": "Variable Name", - "description": "The name of a variable (i.e., field) as it appears in the data. \n" + "description": "The name of a variable (i.e., field) as it appears in the data. \n", + "examples": [ + "gender_id" + ] }, "title": { "type": "string", "title": "Variable Label (ie Title)", - "description": "The human-readable title or label of the variable. \n", + "description": "The human-readable title or label of the variable.\n", "examples": [ - "My Variable", "Gender identity" ] }, @@ -49,7 +55,8 @@ "type": { "title": "Variable Type", "type": "string", - "description": "A classification or category of a particular data element or property expected or allowed in the dataset.\n\nDefinitions:\n\n- `number` (A numeric value with optional decimal places. (e.g., 3.14))\n- `integer` (A whole number without decimal places. (e.g., 42))\n- `string` (A sequence of characters. (e.g., \\\"test\\\"))\n- `any` (Any type of data is allowed. (e.g., true))\n- `boolean` (A binary value representing true or false. (e.g., true))\n- `date` (A specific calendar date. (e.g., \\\"2023-05-25\\\"))\n- `datetime` (A specific date and time, including timezone information. (e.g., \\\"2023-05-25T10:30:00Z\\\"))\n- `time` (A specific time of day. (e.g., \\\"10:30:00\\\"))\n- `year` (A specific year. (e.g., 2023)\n- `yearmonth` (A specific year and month. (e.g., \\\"2023-05\\\"))\n- `duration` (A length of time. (e.g., \\\"PT1H\\\")\n- `geopoint` (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278]))\n", + "description": "A classification or category of a particular data element or property expected or allowed in the dataset.\n", + "additionalDescription": "enum definitions:\n\n- `number` (A numeric value with optional decimal places. 
(e.g., 3.14))\n- `integer` (A whole number without decimal places. (e.g., 42))\n- `string` (A sequence of characters. (e.g., \\\"test\\\"))\n- `any` (Any type of data is allowed. (e.g., true))\n- `boolean` (A binary value representing true or false. (e.g., true))\n- `date` (A specific calendar date. (e.g., \\\"2023-05-25\\\"))\n- `datetime` (A specific date and time, including timezone information. (e.g., \\\"2023-05-25T10:30:00Z\\\"))\n- `time` (A specific time of day. (e.g., \\\"10:30:00\\\"))\n- `year` (A specific year. (e.g., 2023)\n- `yearmonth` (A specific year and month. (e.g., \\\"2023-05\\\"))\n- `duration` (A length of time. (e.g., \\\"PT1H\\\")\n- `geopoint` (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278]))\n", "enum": [ "number", "integer", @@ -68,7 +75,13 @@ "format": { "title": "Variable Format", "type": "string", - "description": "Indicates the format of the type specified in the `type` property. \nEach format is dependent on the `type` specified. \nFor example: If `type` is \"string\", then see the [String formats](https://specs.frictionlessdata.io/table-schema/#string). \nIf `type` is \"date\", \"datetime\", or \"time\", default format is ISO8601 formatting for those respective types (see details on ISO8601 format for [Date](https://specs.frictionlessdata.io/table-schema/#date),\n[Datetime](https://specs.frictionlessdata.io/table-schema/#datetime), \nor [Time](https://specs.frictionlessdata.io/table-schema/#time)) - If you want to specify a date-like variable using standard Python/C strptime syntax, see [here](#format-details-for-date-datetime-time-type-variables) for details. \nSee [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) for more information about appropriate `format` values by variable `type`. \n\n[Additional information]\n\nDate Formats (date, datetime, time `type` variable):\n\nA format for a date variable (`date`,`time`,`datetime`). 
\n**default**: An ISO8601 format string.\n**any**: Any parsable representation of a date/time/datetime. The implementing library can attempt to parse the datetime via a range of strategies.\n\n**{PATTERN}**: The value can be parsed according to `{PATTERN}`,\nwhich `MUST` follow the date formatting syntax of \nC / Python [strftime](http://strftime.org/) such as:\n\n- \"`%Y-%m-%d` (for date, e.g., 2023-05-25)\"\n- \"`%Y%-%d` (for date, e.g., 20230525) for date without dashes\"\n- \"`%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45)\"\n- \"`%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)\"\n- \"`%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)\"\n- \"`%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30)\"\n- \"`%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10)\"\n- \"`%H:%M:%S` (for time, e.g., 10:30:45)\"\n- \"`%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z)\"\n- \"`%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300)\"\n\nString formats:\n\n- \"`email` if valid emails (e.g., test@gmail.com)\"\n- \"`uri` if valid uri addresses (e.g., https://example.com/resource123)\"\n- \"`binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)\"\n- \"`uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)\"\n\n\nGeopoint formats:\n\nThe two types of formats for `geopoint` (describing a geographic point).\n\n- `array` (if 'lat,long' (e.g., 36.63,-90.20))\n- `object` (if {'lat':36.63,'lon':-90.20})\n" + "description": "Indicates the format of the type specified in the `type` property. \nEach format is dependent on the `type` specified. 
\nSee [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) \nfor more information about appropriate `format` values by variable `type`.\n", + "additionalDescription": "examples/definitions of patterns and possible values:\n\nExamples of date time pattern formats\n\n- `%Y-%m-%d` (for date, e.g., 2023-05-25)\n- `%Y%m%d` (for date, e.g., 20230525) for date without dashes\n- `%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45)\n- `%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)\n- `%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)\n- `%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30)\n- `%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10)\n- `%H:%M:%S` (for time, e.g., 10:30:45)\n- `%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z)\n- `%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300)\n\nExamples of string formats\n\n- `email` if valid emails (e.g., test@gmail.com)\n- `uri` if valid uri addresses (e.g., https://example.com/resource123)\n- `binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)\n- `uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)\n\n\nExamples of geopoint formats\n\nThe two types of formats for `geopoint` (describing a geographic point).\n\n- `array` (if 'lat,long' (e.g., 36.63,-90.20))\n- `object` (if {'lat':36.63,'lon':-90.20})\n" + }, + "constraints.required": { + "type": "boolean", + "title": "Required variable", + "description": "If this variable is marked as true, then this variable's value must be present\n(ie not missing; see missingValues). If marked as false or not present, then the \nvariable CAN be missing.\n" }, "constraints.maxLength": { "type": "integer", @@ -76,14 +89,14 @@ "description": "Indicates the maximum length of an iterable (e.g., array, string, or\nobject). 
For example, if 'Hello World' is the longest value of a\ncategorical variable, this would be a maxLength of 11.\n" }, "constraints.enum": { + "type": "string", "title": "Variable Possible Values", "description": "Constrains possible values to a set of values.\n", - "type": "string", - "pattern": "^(?:[^|]+\\||[^|]*)(?:[^|]*\\|)*[^|]*$", "examples": [ - "1|2|3|4|5|6|7|8", - "White|Black or African American|American Indian or Alaska Native|Native Hawaiian or Other Pacific Islander|Asian|Some other race|Multiracial" - ] + "1|2|3|4|5", + "Poor|Fair|Good|Very good|Excellent" + ], + "pattern": "^(?:[^|]+\\||[^|]*)(?:[^|]*\\|)*[^|]*$" }, "constraints.pattern": { "type": "string", @@ -100,19 +113,19 @@ "title": "Minimum Value", "description": "Specifies the minimum value of a field.\n" }, - "encodings": { + "enumLabels": { "title": "Variable Value Encodings (i.e., mappings; value labels)", - "description": "Variable value encodings provide a way to further annotate any value within a any variable type,\nmaking values easier to understand. \n\n\nMany analytic software programs (e.g., SPSS,Stata, and SAS) use numerical encodings and some algorithms\nonly support numerical values. Encodings (and mappings) allow categorical values to be stored as\nnumerical values.\n\nAdditionally, as another use case, this field provides a way to\nstore categoricals that are stored as \"short\" labels (such as\nabbreviations).\n", + "description": "Variable value encodings provide a way to further annotate any value within a any variable type,\nmaking values easier to understand. \n\n\nMany analytic software programs (e.g., SPSS,Stata, and SAS) use numerical encodings and some algorithms\nonly support numerical values. 
Encodings (and mappings) allow categorical values to be stored as\nnumerical values.\n\nAdditionally, as another use case, this field provides a way to\nstore categoricals that are stored as \"short\" labels (such as\nabbreviations).\n\nThis field is intended to follow [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering)\n", "type": "string", - "pattern": "^(?:.*?=.*?(?:\\||$))+$", "examples": [ - "0=No|1=Yes", - "HW=Hello world|GBW=Good bye world|HM=Hi,Mike" - ] + "1=Poor|2=Fair|3=Good|4=Very good|5=Excellent", + "HW=Hello world|GBW=Good bye world|HM=Hi, Mike" + ], + "pattern": "^(?:.*?=.*?(?:\\||$))+$" }, - "ordered": { + "enumOrdered": { "title": "An ordered variable", - "description": "Indicates whether a categorical variable is ordered. This variable is\nrelevant for variables that have an ordered relationship but not\nnecessarily a numerical relationship (e.g., Strongly disagree < Disagree\n< Neutral < Agree).\n", + "description": "Indicates whether a categorical variable is ordered. This variable is\nrelevant for variables that have an ordered relationship but not\nnecessarily a numerical relationship (e.g., Strongly disagree < Disagree\n< Neutral < Agree).\n\nThis field is intended to follow the ordering aspect of [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering)\n", "type": "boolean" }, "missingValues": { @@ -126,134 +139,124 @@ "pattern": "^(?:[^|]+\\||[^|]*)(?:[^|]*\\|)*[^|]*$" }, "trueValues": { - "type": "string", - "pattern": "^(?:[^|]+\\||[^|]*)(?:[^|]*\\|)*[^|]*$", + "title": "Boolean True Value Labels", "description": "For boolean (true) variable (as defined in type field), this field allows\na physical string representation to be cast as true (increasing\nreadability of the field). 
It can include one or more values.\n", + "type": "string", "examples": [ - "Required|REQUIRED", - "required|Yes|Y|Checked", - "Checked", - "Required" - ] + "required|Yes|Checked", + "required" + ], + "pattern": "^(?:[^|]+\\||[^|]*)(?:[^|]*\\|)*[^|]*$" }, "falseValues": { "title": "Boolean False Value Labels", "description": "For boolean (false) variable (as defined in type field), this field allows\na physical string representation to be cast as false (increasing\nreadability of the field) that is not a standard false value. It can include one or more values.\n", "type": "string", + "examples": [ + "Not required|NOT REQUIRED", + "No" + ], "pattern": "^(?:[^|]+\\||[^|]*)(?:[^|]*\\|)*[^|]*$" }, - "repo_link": { + "custom": { "type": "string", - "title": "Variable Repository Link", - "description": "A link to the variable as it exists on the home repository, if applicable\n" + "description": "Additional properties not included as a core property. \n", + "pattern": "^(?:.*?=.*?(?:\\||$))+$" }, - "standardsMappings.url": { - "title": "Standards Mapping - Url", - "description": "The url that links out to the published, standardized mapping.\n", + "standardsMappings[0].instrument.url": { + "title": "Url", + "description": "A url (e.g., link, address) to a file or other resource containing the instrument, or\na set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). 
\n", "type": "string", "format": "uri", "examples": [ - "https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI" + "https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx" ] }, - "standardsMappings.type": { - "title": "Standards Mapping - Title", - "description": "The **type** of mapping linked to a published set of standard variables such as the NIH Common Data Elements program\n", + "standardsMappings[0].instrument.source": { + "type": "string", + "title": "Source", + "description": "An abbreviated name/acronym from a controlled vocabulary referencing the resource (e.g., program or repository)\ncontaining the instrument, or a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). \n", + "enum": [ + "heal-cde" + ] + }, + "standardsMappings[0].instrument.title": { + "type": "string", + "title": "Title", "examples": [ - "cde", - "ontology", - "reference_list" - ], - "type": "string" + "Adult demographics", + "adult-demographics" + ] }, - "standardsMappings.label": { - "title": "Standards Mapping - Label", - "description": "A free text **label** of a mapping indicating a mapping(s) to a published set of standard variables such as the NIH Common Data Elements program.\n", + "standardsMappings[0].instrument.id": { "type": "string", + "title": "Identifier", + "description": "A code or other string that identifies the instrument within the source.\nThis should always be from the source's formal, standardized identification system \n", "examples": [ - "substance use", - "chemical compound", - "promis" + "5141" ] }, - "standardsMappings.source": { - "title": "Standard Mapping - Source", - "description": "The source of the standardized variable.\n", + "standardsMappings[0].item.url": { + "title": "Standards mappings - Url", + "description": "The url that links out to the published, standardized mapping of a variable (e.g., common data 
element)\n", "type": "string", + "format": "uri", "examples": [ - "TBD (will have controlled vocabulary)" + "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE" ] }, - "standardsMappings.id": { - "title": "Standard Mapping - Id", + "standardsMappings[0].item.source": { + "title": "Standards mappings - Source", + "description": "The source of the standardized variable. Note, this property is required if \nan id is specified.\n", + "examples": [ + "CDISC" + ], + "type": "string" + }, + "standardsMappings[0].item.id": { + "title": "Standards Mappings - Id", "type": "string", - "description": "The id locating the individual mapping within the given source.\n" + "description": "The id locating the individual mapping within the given source. \nNote, the `standardsMappings[0].source` property is required if \nthis property is specified.\n", + "examples": [ + "C74457" + ] }, - "relatedConcepts.url": { + "relatedConcepts[0].url": { "title": "Related Concepts - Url", - "description": "The url that links out to the published, standardized concept.\n", + "description": "The url that links out to the published, related concept. 
\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", "type": "string", "format": "uri", "examples": [ - "https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI" + "https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId=CHEBI:27808", + "http://purl.bioontology.org/ontology/RXNORM/3304" ] }, - "relatedConcepts.type": { + "relatedConcepts[0].title": { "title": "Related concepts - Type", - "description": "The **type** of mapping to a published set of concepts related to the given field such as \nontological information (eg., NCI thesaurus, bioportal etc)\n", - "type": "string" - }, - "relatedConcepts.label": { + "description": "A human-readable title (ie label) to a concept related to the given field.\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", "type": "string", - "title": "Related Concepts - Label", - "description": "A free text **label** of mapping to a published set of concepts related to the given field such as \nontological information (eg., NCI thesaurus, bioportal etc)\n" + "examples": [ + "Heroin Molecular Structure", + "Heroin Ontology" + ] }, - "relatedConcepts.source": { + "relatedConcepts[0].source": { "title": "Related Concepts - Source", - "description": "The source of the related concept.\n", + "description": "The source (e.g., a dictionary or vocabulary set) to a concept related to the given field.\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", "type": "string", "examples": [ - "TBD (will have controlled vocabulary)" 
+ "CHEBI", + "RXNORM" ] }, - "relatedConcepts.id": { + "relatedConcepts[0].id": { "title": "Related Concepts - Id", "type": "string", - "description": "The id locating the individual mapping within the given source.\n" - }, - "univarStats.median": { - "type": "number" - }, - "univarStats.mean": { - "type": "number" - }, - "univarStats.std": { - "type": "number" - }, - "univarStats.min": { - "type": "number" - }, - "univarStats.max": { - "type": "number" - }, - "univarStats.mode": { - "type": "number" - }, - "univarStats.count": { - "type": "integer", - "minimum": 0 - }, - "univarStats.twentyFifthPercentile": { - "type": "number" - }, - "univarStats.seventyFifthPercentile": { - "type": "number" - }, - "univarStats.categoricalMarginals.name": { - "type": "string" - }, - "univarStats.categoricalMarginals.count": { - "type": "integer" + "description": "The id locating the individual concept within the source of the given field.\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", + "examples": [ + "27808", + "3304" + ] } } } \ No newline at end of file diff --git a/variable-level-metadata-schema/schemas/jsonschema/data-dictionary.json b/variable-level-metadata-schema/schemas/jsonschema/data-dictionary.json index 19d6a51..05c88d5 100644 --- a/variable-level-metadata-schema/schemas/jsonschema/data-dictionary.json +++ b/variable-level-metadata-schema/schemas/jsonschema/data-dictionary.json @@ -1,13 +1,13 @@ { - "version": "0.1.0", + "version": "0.2.0", "$schema": "http://json-schema.org/draft-07/schema#", "$id": "vlmd", "title": "Variable Level Metadata (Data Dictionaries)", - "description": "This schema defines the variable level metadata for one data dictionary for a given study.Note a given study can have multiple data dictionaries", + "description": "This schema defines the variable level 
metadata for one data dictionary for a given study. Note a given study can have multiple data dictionaries.", "type": "object", "required": [ "title", - "data_dictionary" + "fields" ], "properties": { "title": { @@ -16,44 +16,115 @@ "description": { "type": "string" }, - "data_dictionary": { + "schemaVersion": { + "type": "string", + "description": "The version of the schema used in the agreed-upon convention of major.minor.patch (e.g., 1.0.2) \n\nNOTE: This is NOT for versioning of each individual data dictionary instance. \nRather, it is the\nversion of THIS schema document. See `version` property (below) if specifying the individual data dictionary instance\nversion.\n\nIf generating a vlmd document as a csv file, include this version in \nevery row/record to indicate this is a schema level property \n(not applicable for the json version as this property is already at the schema/root level)\n", + "pattern": "\\d+\\.\\d+\\.\\d+", + "examples": [ + "1.0.0", + "0.2.0" + ] + }, + "version": { + "type": "string", + "description": "The specified individual data dictionary instance version." + }, + "standardsMappings": { + "type": "array", + "description": "A set of standardized instruments linked to all variables within the `fields` property (but see note).\n\n!!! 
note \"NOTE\"\n\n If `standardsMappings` is present at both the root (this property) and within `fields`, \n then the `fields` `standardsMappings` property takes precedence.\n\n Note, only instrument can be mapped to this property as opposed to the `fields` `standardsMappings`\n This property has the same specification as the `fields` `standardsMappings` to make the cascading logic\n easier to understand in the same way other standards implement cascading \n (e.g., `missingValues` in the [frictionless specification](https://specs.frictionlessdata.io/patterns/#missing-values-per-field))\n", + "items": { + "properties": { + "type": "object", + "instrument": { + "type": "object", + "title": "Standard mapping - instrument", + "description": "A standardized set of items which encompass \na variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). \n\n\n!!! note \"NOTE\"\n\n If information is present at both the root and the field level, \n then the information at the field level would take precedence (i.e., it would cascade).\n", + "properties": { + "url": { + "title": "Url", + "description": "A url (e.g., link, address) to a file or other resource containing the instrument, or\na set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). 
\n", + "type": "string", + "format": "uri", + "examples": [ + "https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx" + ] + }, + "source": { + "type": "string", + "title": "Source", + "description": "An abbreviated name/acronym from a controlled vocabulary referencing the resource (e.g., program or repository)\ncontaining the instrument, or a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). \n", + "enum": [ + "heal-cde" + ] + }, + "title": { + "type": "string", + "title": "Title", + "examples": [ + "Adult demographics", + "adult-demographics" + ] + }, + "id": { + "type": "string", + "title": "Identifier", + "description": "A code or other string that identifies the instrument within the source.\nThis should always be from the source's formal, standardized identification system \n", + "examples": [ + "5141" + ] + } + } + } + } + } + }, + "custom": { + "type": "object", + "description": "Additional properties not included as a core property. \n" + }, + "fields": { "type": "array", "items": { - "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "vlmd-fields", "title": "HEAL Variable Level Metadata Fields", - "description": "Variable level metadata individual fields integrated into the variable level\nmetadata object within the HEAL platform metadata service.\n\n!!! note \"NOTE\"\n\n Only `name` and `description` properties are required. \n For categorical variables, `constraints.enum` and `encodings` (where applicable) properties are highly encouraged. \n For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged.\n `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables)\n", + "description": "\n\n!!! note \"Highly encouraged\"\n\n - Only `name` and `description` properties are required. 
\n - For categorical variables, `constraints.enum` and `enumLabels` (where applicable) properties are highly encouraged. \n - For studies using HEAL or other common data elements (CDEs), `standardsMappings` information is highly encouraged.\n - `type` and `format` properties may be particularly useful for some variable types (e.g. date-like variables)\n", "type": "object", - "additionalProperties": true, "required": [ "name", "description" ], "properties": { - "module": { + "schemaVersion": { "type": "string", - "title": "Module", - "description": "The section, form, survey instrument, set of measures or other broad category used \nto group variables.\n", + "description": "The version of the schema used in the agreed-upon convention of major.minor.patch (e.g., 1.0.2) \n\nNOTE: This is NOT for versioning of each individual data dictionary instance. \nRather, it is the\nversion of THIS schema document. See `version` property (below) if specifying the individual data dictionary instance\nversion.\n\nIf generating a vlmd document as a csv file, include this version in \nevery row/record to indicate this is a schema level property \n(not applicable for the json version as this property is already at the schema/root level)\n", + "pattern": "\\d+\\.\\d+\\.\\d+", + "examples": [ + "1.0.0", + "0.2.0" + ] + }, + "section": { + "type": "string", + "title": "Section", + "description": "The section, form, survey instrument, set of measures or other broad category used \nto group variables. Previously called \"module.\"\n", "examples": [ "Demographics", "PROMIS", - "Substance use", - "Medical History", - "Sleep questions", - "Physical activity" + "Medical History" ] }, "name": { "type": "string", "title": "Variable Name", - "description": "The name of a variable (i.e., field) as it appears in the data. 
\n", + "examples": [ + "gender_id" + ] }, "title": { "type": "string", "title": "Variable Label (ie Title)", - "description": "The human-readable title or label of the variable. \n", + "description": "The human-readable title or label of the variable.\n", "examples": [ - "My Variable", "Gender identity" ] }, @@ -69,7 +140,8 @@ "type": { "title": "Variable Type", "type": "string", - "description": "A classification or category of a particular data element or property expected or allowed in the dataset.\n\nDefinitions:\n\n- `number` (A numeric value with optional decimal places. (e.g., 3.14))\n- `integer` (A whole number without decimal places. (e.g., 42))\n- `string` (A sequence of characters. (e.g., \\\"test\\\"))\n- `any` (Any type of data is allowed. (e.g., true))\n- `boolean` (A binary value representing true or false. (e.g., true))\n- `date` (A specific calendar date. (e.g., \\\"2023-05-25\\\"))\n- `datetime` (A specific date and time, including timezone information. (e.g., \\\"2023-05-25T10:30:00Z\\\"))\n- `time` (A specific time of day. (e.g., \\\"10:30:00\\\"))\n- `year` (A specific year. (e.g., 2023)\n- `yearmonth` (A specific year and month. (e.g., \\\"2023-05\\\"))\n- `duration` (A length of time. (e.g., \\\"PT1H\\\")\n- `geopoint` (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278]))\n", + "description": "A classification or category of a particular data element or property expected or allowed in the dataset.\n", + "additionalDescription": "enum definitions:\n\n- `number` (A numeric value with optional decimal places. (e.g., 3.14))\n- `integer` (A whole number without decimal places. (e.g., 42))\n- `string` (A sequence of characters. (e.g., \\\"test\\\"))\n- `any` (Any type of data is allowed. (e.g., true))\n- `boolean` (A binary value representing true or false. (e.g., true))\n- `date` (A specific calendar date. (e.g., \\\"2023-05-25\\\"))\n- `datetime` (A specific date and time, including timezone information. 
(e.g., \\\"2023-05-25T10:30:00Z\\\"))\n- `time` (A specific time of day. (e.g., \\\"10:30:00\\\"))\n- `year` (A specific year. (e.g., 2023))\n- `yearmonth` (A specific year and month. (e.g., \\\"2023-05\\\"))\n- `duration` (A length of time. (e.g., \\\"PT1H\\\"))\n- `geopoint` (A pair of latitude and longitude coordinates. (e.g., [51.5074, -0.1278]))\n", "enum": [ "number", "integer", @@ -88,35 +160,40 @@ "format": { "title": "Variable Format", "type": "string", - "description": "Indicates the format of the type specified in the `type` property. \nEach format is dependent on the `type` specified. \nFor example: If `type` is \"string\", then see the [String formats](https://specs.frictionlessdata.io/table-schema/#string). \nIf `type` is \"date\", \"datetime\", or \"time\", default format is ISO8601 formatting for those respective types (see details on ISO8601 format for [Date](https://specs.frictionlessdata.io/table-schema/#date),\n[Datetime](https://specs.frictionlessdata.io/table-schema/#datetime), \nor [Time](https://specs.frictionlessdata.io/table-schema/#time)) - If you want to specify a date-like variable using standard Python/C strptime syntax, see [here](#format-details-for-date-datetime-time-type-variables) for details. \nSee [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) for more information about appropriate `format` values by variable `type`. \n\n[Additional information]\n\nDate Formats (date, datetime, time `type` variable):\n\nA format for a date variable (`date`,`time`,`datetime`). \n**default**: An ISO8601 format string.\n**any**: Any parsable representation of a date/time/datetime. 
The implementing library can attempt to parse the datetime via a range of strategies.\n\n**{PATTERN}**: The value can be parsed according to `{PATTERN}`,\nwhich `MUST` follow the date formatting syntax of \nC / Python [strftime](http://strftime.org/) such as:\n\n- \"`%Y-%m-%d` (for date, e.g., 2023-05-25)\"\n- \"`%Y%-%d` (for date, e.g., 20230525) for date without dashes\"\n- \"`%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45)\"\n- \"`%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)\"\n- \"`%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)\"\n- \"`%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30)\"\n- \"`%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10)\"\n- \"`%H:%M:%S` (for time, e.g., 10:30:45)\"\n- \"`%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z)\"\n- \"`%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300)\"\n\nString formats:\n\n- \"`email` if valid emails (e.g., test@gmail.com)\"\n- \"`uri` if valid uri addresses (e.g., https://example.com/resource123)\"\n- \"`binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)\"\n- \"`uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)\"\n\n\nGeopoint formats:\n\nThe two types of formats for `geopoint` (describing a geographic point).\n\n- `array` (if 'lat,long' (e.g., 36.63,-90.20))\n- `object` (if {'lat':36.63,'lon':-90.20})\n" + "description": "Indicates the format of the type specified in the `type` property. \nEach format is dependent on the `type` specified. 
\nSee [here](https://specs.frictionlessdata.io/table-schema/#types-and-formats) \nfor more information about appropriate `format` values by variable `type`.\n", + "additionalDescription": "examples/definitions of patterns and possible values:\n\nExamples of date time pattern formats\n\n- `%Y-%m-%d` (for date, e.g., 2023-05-25)\n- `%Y%m%d` (for date, e.g., 20230525) for date without dashes\n- `%Y-%m-%dT%H:%M:%S` (for datetime, e.g., 2023-05-25T10:30:45)\n- `%Y-%m-%dT%H:%M:%SZ` (for datetime with UTC timezone, e.g., 2023-05-25T10:30:45Z)\n- `%Y-%m-%dT%H:%M:%S%z` (for datetime with timezone offset, e.g., 2023-05-25T10:30:45+0300)\n- `%Y-%m-%dT%H:%M` (for datetime without seconds, e.g., 2023-05-25T10:30)\n- `%Y-%m-%dT%H` (for datetime without minutes and seconds, e.g., 2023-05-25T10)\n- `%H:%M:%S` (for time, e.g., 10:30:45)\n- `%H:%M:%SZ` (for time with UTC timezone, e.g., 10:30:45Z)\n- `%H:%M:%S%z` (for time with timezone offset, e.g., 10:30:45+0300)\n\nExamples of string formats\n\n- `email` if valid emails (e.g., test@gmail.com)\n- `uri` if valid uri addresses (e.g., https://example.com/resource123)\n- `binary` if a base64 binary encoded string (e.g., authentication token like aGVsbG8gd29ybGQ=)\n- `uuid` if a universal unique identifier also known as a guid (eg., f47ac10b-58cc-4372-a567-0e02b2c3d479)\n\n\nExamples of geopoint formats\n\nThe two types of formats for `geopoint` (describing a geographic point).\n\n- `array` (if 'lat,long' (e.g., 36.63,-90.20))\n- `object` (if {'lat':36.63,'lon':-90.20})\n" }, "constraints": { "type": "object", "properties": { + "required": { + "type": "boolean", + "title": "Required variable", + "description": "If this variable is marked as true, then this variable's value must be present\n(ie not missing; see missingValues). 
If marked as false or not present, then the \nvariable CAN be missing.\n" + }, "maxLength": { "type": "integer", "title": "Maximum Length", "description": "Indicates the maximum length of an iterable (e.g., array, string, or\nobject). For example, if 'Hello World' is the longest value of a\ncategorical variable, this would be a maxLength of 11.\n" }, "enum": { + "type": "array", "title": "Variable Possible Values", "description": "Constrains possible values to a set of values.\n", - "type": "array", "examples": [ [ 1, 2, 3, - 4 + 4, + 5 ], [ - "White", - "Black or African American", - "American Indian or Alaska Native", - "Native Hawaiian or Other Pacific Islander", - "Asian", - "Some other race", - "Multiracial" + "Poor", + "Fair", + "Good", + "Very good", + "Excellent" ] ] }, @@ -137,14 +214,17 @@ } } }, - "encodings": { + "enumLabels": { "title": "Variable Value Encodings (i.e., mappings; value labels)", - "description": "Variable value encodings provide a way to further annotate any value within a any variable type,\nmaking values easier to understand. \n\n\nMany analytic software programs (e.g., SPSS,Stata, and SAS) use numerical encodings and some algorithms\nonly support numerical values. Encodings (and mappings) allow categorical values to be stored as\nnumerical values.\n\nAdditionally, as another use case, this field provides a way to\nstore categoricals that are stored as \"short\" labels (such as\nabbreviations).\n", + "description": "Variable value encodings provide a way to further annotate any value within any variable type,\nmaking values easier to understand. \n\n\nMany analytic software programs (e.g., SPSS, Stata, and SAS) use numerical encodings and some algorithms\nonly support numerical values. 
Encodings (and mappings) allow categorical values to be stored as\nnumerical values.\n\nAdditionally, as another use case, this field provides a way to\nstore categoricals that are stored as \"short\" labels (such as\nabbreviations).\n\nThis field is intended to follow [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering)\n", "type": "object", "examples": [ { - "0": "No", - "1": "Yes" + "1": "Poor", + "2": "Fair", + "3": "Good", + "4": "Very good", + "5": "Excellent" }, { "HW": "Hello world", @@ -153,9 +233,9 @@ } ] }, - "ordered": { + "enumOrdered": { "title": "An ordered variable", - "description": "Indicates whether a categorical variable is ordered. This variable is\nrelevant for variables that have an ordered relationship but not\nnecessarily a numerical relationship (e.g., Strongly disagree < Disagree\n< Neutral < Agree).\n", + "description": "Indicates whether a categorical variable is ordered. This variable is\nrelevant for variables that have an ordered relationship but not\nnecessarily a numerical relationship (e.g., Strongly disagree < Disagree\n< Neutral < Agree).\n\nThis field is intended to follow the ordering aspect of [this pattern](https://specs.frictionlessdata.io/patterns/#table-schema-enum-labels-and-ordering)\n", "type": "boolean" }, "missingValues": { @@ -177,9 +257,6 @@ "title": "Boolean True Value Labels", "description": "For boolean (true) variable (as defined in type field), this field allows\na physical string representation to be cast as true (increasing\nreadability of the field). 
It can include one or more values.\n", "type": "array", - "items": { - "type": "string" - }, "examples": [ [ "required", @@ -194,151 +271,145 @@ "falseValues": { "title": "Boolean False Value Labels", "description": "For boolean (false) variable (as defined in type field), this field allows\na physical string representation to be cast as false (increasing\nreadability of the field) that is not a standard false value. It can include one or more values.\n", - "type": "array" + "type": "array", + "examples": [ + [ + "Not required", + "NOT REQUIRED" + ], + [ + "No" + ] + ] }, - "repo_link": { - "type": "string", - "title": "Variable Repository Link", - "description": "A link to the variable as it exists on the home repository, if applicable\n" + "custom": { + "type": "object", + "description": "Additional properties not included as a core property. \n" }, "standardsMappings": { - "title": "Standards Mappings", - "description": "A published set of standard variables such as the NIH Common Data Elements program.", "type": "array", + "description": "\nA set of instrument and item references to standardized data elements designed to document\nthe [HEAL common data elements program](https://heal.nih.gov/data/common-data-elements)\nand other standardized/common element sources to facilitate cross-study comparison and interoperability\nof data. 
One can either map an individual data element or an instrument in which the field is \na part of.\n\n__**All Fields Mapped (Both Instrument and Item)**__\n\n```json\n\"standardsMappings\": [\n {\n \"instrument\": {\n \"url\": \"https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx\",\n \"source\": \"heal-cde\",\n \"title\": \"adult-demographics\",\n \"id\": \"5141\"\n },\n \"item\": {\n \"url\": \"https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE\",\n \"source\": \"CDISC\",\n \"id\": \"C74457\"\n }\n }\n]\n```\n\n__**Only Instrument Title of Form CDE File Mapped**__\n\nIn this scenario, especially as CDE variables do not have associated CDISC ids listed, only instrument information is given.\n\n```json\n\"standardsMappings\": [\n {\n \"instrument\": {\n \"source\": \"heal-cde\",\n \"title\": \"Adult demographics\"\n }\n }\n]\n```\n\n__**Only Instrument ID of HEAL CDE Mapped**__\n\n```json\n\"standardsMappings\": [\n {\n \"instrument\": {\n \"source\": \"heal-cde\",\n \"id\": \"5141\"\n }\n }\n]\n```\n\n__**Other Non-HEAL CDE Use Cases**__\n\nOnly item matched (for example if found in the NIH (not HEAL) CDE repository). Folks would enter the information in the \"Identifier\" section. Similar to the above, they could also just enter the \"url\".\n\n```json\n\"standardsMappings\": [\n {\n \"item\": {\n \"source\": \"NLM\",\n \"id\": \"Fakc6Jy2x\"\n }\n }\n]\n```\n\n__**Multiple CDE Mappings**__\n\nTwo separate records. 
If desired, multiple standard mappings can be entered, say from the NIH HEAL CDE repo and the NIH CDE lookup (NLM) by way of two separate records in the list.\n\n```json\n\"standardsMappings\": [\n {\n \"instrument\": {\n \"source\": \"heal-cde\",\n \"title\": \"Adult demographics\"\n },\n \"item\": {\n \"source\": \"CDISC\",\n \"id\": \"C74457\"\n },\n },\n {\n \"item\": {\n \"source\": \"NLM\",\n \"id\": \"Fakc6Jy2x\"\n }\n }\n]\n```\n", "items": { "type": "object", "properties": { - "url": { - "title": "Standards Mapping - Url", - "description": "The url that links out to the published, standardized mapping.\n", - "type": "string", - "format": "uri", - "examples": [ - "https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI" - ] - }, - "type": { - "title": "Standards Mapping - Title", - "description": "The **type** of mapping linked to a published set of standard variables such as the NIH Common Data Elements program\n", - "examples": [ - "cde", - "ontology", - "reference_list" - ], - "type": "string" - }, - "label": { - "title": "Standards Mapping - Label", - "description": "A free text **label** of a mapping indicating a mapping(s) to a published set of standard variables such as the NIH Common Data Elements program.\n", - "type": "string", - "examples": [ - "substance use", - "chemical compound", - "promis" - ] - }, - "source": { - "title": "Standard Mapping - Source", - "description": "The source of the standardized variable.\n", - "type": "string", - "examples": [ - "TBD (will have controlled vocabulary)" - ] + "instrument": { + "type": "object", + "title": "Standard mapping - instrument", + "description": "A standardized set of items which encompass \na variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). \n\n\n!!! 
note \"NOTE\"\n\n If information is present at both the root and the field level, \n then the information at the field level would take precedence (i.e., it would cascade).\n", + "properties": { + "url": { + "title": "Url", + "description": "A url (e.g., link, address) to a file or other resource containing the instrument, or\na set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). \n", + "type": "string", + "format": "uri", + "examples": [ + "https://www.heal.nih.gov/files/CDEs/2023-05/adult-demographics-cdes.xlsx" + ] + }, + "source": { + "type": "string", + "title": "Source", + "description": "An abbreviated name/acronym from a controlled vocabulary referencing the resource (e.g., program or repository)\ncontaining the instrument, or a set of items which encompass a variable in this variable level metadata document (if at the root level or the document level) \nor the individual variable (if at the field level). 
\n", + "enum": [ + "heal-cde" + ] + }, + "title": { + "type": "string", + "title": "Title", + "examples": [ + "Adult demographics", + "adult-demographics" + ] + }, + "id": { + "type": "string", + "title": "Identifier", + "description": "A code or other string that identifies the instrument within the source.\nThis should always be from the source's formal, standardized identification system \n", + "examples": [ + "5141" + ] + } + } }, - "id": { - "title": "Standard Mapping - Id", - "type": "string", - "description": "The id locating the individual mapping within the given source.\n" + "item": { + "type": "object", + "title": "Standards mappings - Item", + "description": "A standardized item (ie field, variable etc) mapped to this individual variable.\n", + "properties": { + "url": { + "title": "Standards mappings - Url", + "description": "The url that links out to the published, standardized mapping of a variable (e.g., common data element)\n", + "type": "string", + "format": "uri", + "examples": [ + "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.html#CL.C74457.RACE" + ] + }, + "source": { + "title": "Standards mappings - Source", + "description": "The source of the standardized variable. Note, this property is required if \nan id is specified.\n", + "examples": [ + "CDISC" + ], + "type": "string" + }, + "id": { + "title": "Standards Mappings - Id", + "type": "string", + "description": "The id locating the individual mapping within the given source. 
\nNote, the `standardsMappings[0].item.source` property is required if \nthis property is specified.\n", + "examples": [ + "C74457" + ] + } + } } } } }, "relatedConcepts": { "title": "Related Concepts", - "description": "Mappings to a published set of concepts related to the given field such as ontological information (eg., NCI thesaurus, bioportal etc)", + "description": "__**[Under development]**__ Mappings to a published set of concepts related to the given field such as \nontological information (e.g., NCI thesaurus, bioportal, etc.)\n", "type": "array", "items": { "type": "object", "properties": { "url": { "title": "Related Concepts - Url", - "description": "The url that links out to the published, standardized concept.\n", + "description": "The url that links out to the published, related concept. \nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", "type": "string", "format": "uri", "examples": [ - "https://cde.nlm.nih.gov/deView?tinyId=XyuSGdTTI" + "https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId=CHEBI:27808", + "http://purl.bioontology.org/ontology/RXNORM/3304" ] }, - "type": { + "title": { - "title": "Related concepts - Type", + "title": "Related Concepts - Title", - "description": "The **type** of mapping to a published set of concepts related to the given field such as \nontological information (eg., NCI thesaurus, bioportal etc)\n", - "type": "string" - }, - "label": { + "description": "A human-readable title (i.e., label) of a concept related to the given field.\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", "type": "string", - "title": "Related Concepts - Label", - "description": "A free text **label** of mapping to a published set of 
concepts related to the given field such as \nontological information (eg., NCI thesaurus, bioportal etc)\n" + "examples": [ + "Heroin Molecular Structure", + "Heroin Ontology" + ] }, "source": { "title": "Related Concepts - Source", - "description": "The source of the related concept.\n", + "description": "The source (e.g., a dictionary or vocabulary set) to a concept related to the given field.\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", "type": "string", "examples": [ - "TBD (will have controlled vocabulary)" + "CHEBI", + "RXNORM" ] }, "id": { "title": "Related Concepts - Id", "type": "string", - "description": "The id locating the individual mapping within the given source.\n" - } - } - } - }, - "univarStats": { - "type": "object", - "description": "Univariate statistics inferred from the data about the given variable \n", - "properties": { - "median": { - "type": "number" - }, - "mean": { - "type": "number" - }, - "std": { - "type": "number" - }, - "min": { - "type": "number" - }, - "max": { - "type": "number" - }, - "mode": { - "type": "number" - }, - "count": { - "type": "integer", - "minimum": 0 - }, - "twentyFifthPercentile": { - "type": "number" - }, - "seventyFifthPercentile": { - "type": "number" - }, - "categoricalMarginals": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "count": { - "type": "integer" - } - } + "description": "The id locating the individual concept within the source of the given field.\nThe listed examples could both be attached to any variable related to, for example, heroin use.\n\n> :point_up: if you are looking for mapping field values to common data elements or a set of standards, see `standardsMappings`_\n", + "examples": [ + "27808", + "3304" + ] } } } @@ -346,5 +417,20 @@ } } } + 
}, + "propertyNames": { + "description": "To allow additional properties, for compatibility with other standards at the \"table\" (or root) level, that are not included in the core `properties` set:\n\n[Frictionless Data package table schema standard](https://specs.frictionlessdata.io/table-schema): `missingValues`|`primaryKey`|`foreignKeys`\n", + "enum": [ + "title", + "description", + "schemaVersion", + "version", + "standardsMappings", + "fields", + "custom", + "missingValues", + "primaryKey", + "foreignKeys" + ] } } \ No newline at end of file diff --git a/variable-level-metadata-schema/templates/template_submission.csv b/variable-level-metadata-schema/templates/template_submission.csv index f4a25d6..1e629e3 100644 --- a/variable-level-metadata-schema/templates/template_submission.csv +++ b/variable-level-metadata-schema/templates/template_submission.csv @@ -1 +1 @@ -module,name,title,description,type,format,constraints.maxLength,constraints.enum,constraints.pattern,constraints.maximum,constraints.minimum,encodings,ordered,missingValues,trueValues,falseValues,repo_link,standardsMappings.url,standardsMappings.type,standardsMappings.label,standardsMappings.source,standardsMappings.id,relatedConcepts.url,relatedConcepts.type,relatedConcepts.label,relatedConcepts.source,relatedConcepts.id,univarStats.median,univarStats.mean,univarStats.std,univarStats.min,univarStats.max,univarStats.mode,univarStats.count,univarStats.twentyFifthPercentile,univarStats.seventyFifthPercentile,univarStats.categoricalMarginals.name,univarStats.categoricalMarginals.count \ No newline at end of file 
+schemaVersion,section,name,title,description,type,format,constraints.required,constraints.maxLength,constraints.enum,constraints.pattern,constraints.maximum,constraints.minimum,enumLabels,enumOrdered,missingValues,trueValues,falseValues,custom,standardsMappings[0].instrument.url,standardsMappings[0].instrument.source,standardsMappings[0].instrument.title,standardsMappings[0].instrument.id,standardsMappings[0].item.url,standardsMappings[0].item.source,standardsMappings[0].item.id,relatedConcepts[0].url,relatedConcepts[0].title,relatedConcepts[0].source,relatedConcepts[0].id \ No newline at end of file diff --git a/variable-level-metadata-schema/templates/template_submission.json b/variable-level-metadata-schema/templates/template_submission.json index fe4c050..c8d2524 100644 --- a/variable-level-metadata-schema/templates/template_submission.json +++ b/variable-level-metadata-schema/templates/template_submission.json @@ -2,64 +2,65 @@ { "title": null, "description": null, - "data_dictionary": [ + "schemaVersion": null, + "version": null, + "standardsMappings": [ { - "module": null, + "instrument": { + "url": null, + "source": null, + "title": null, + "id": null + } + } + ], + "custom": {}, + "fields": [ + { + "schemaVersion": null, + "section": null, "name": null, "title": null, "description": null, "type": null, "format": null, "constraints": { + "required": null, "maxLength": null, "enum": [], "pattern": null, "maximum": null, "minimum": null }, - "encodings": {}, - "ordered": null, + "enumLabels": {}, + "enumOrdered": null, "missingValues": [], - "trueValues": [ - {} - ], + "trueValues": [], "falseValues": [], - "repo_link": null, + "custom": {}, "standardsMappings": [ { - "url": null, - "type": null, - "label": null, - "source": null, - "id": null + "instrument": { + "url": null, + "source": null, + "title": null, + "id": null + }, + "item": { + "url": null, + "source": null, + "id": null + } } ], "relatedConcepts": [ { "url": null, - "type": null, - "label": 
null, + "title": null, "source": null, "id": null } - ], - "univarStats": { - "median": null, - "mean": null, - "std": null, - "min": null, - "max": null, - "mode": null, - "count": null, - "twentyFifthPercentile": null, - "seventyFifthPercentile": null, - "categoricalMarginals": [ - { - "name": null, - "count": null - } - ] - } + ] } ] }