From f2dd8fc2a60007e4b73f6352a5951b41e8589c51 Mon Sep 17 00:00:00 2001
From: Damien Irving
Date: Tue, 26 Jun 2018 15:42:17 +1000
Subject: [PATCH] Documentation complete

---
 docs/api_reference.rst |  5 +++
 docs/conf.py           | 10 +++---
 docs/index.rst         | 80 +++++++++++++++++++++++++++++++++---------
 3 files changed, 73 insertions(+), 22 deletions(-)
 create mode 100644 docs/api_reference.rst

diff --git a/docs/api_reference.rst b/docs/api_reference.rst
new file mode 100644
index 0000000..998edfc
--- /dev/null
+++ b/docs/api_reference.rst
@@ -0,0 +1,5 @@
+API reference
+-------------
+
+.. automodule:: cmdline_provenance
+   :members:
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index 368fbcc..27993e9 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -17,9 +17,9 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../cmdline_provenance/'))
 
 # -- General configuration ------------------------------------------------
 
@@ -30,7 +30,7 @@
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = []
+extensions = ['sphinx.ext.autodoc']
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -175,7 +175,7 @@
 # Custom sidebar templates, maps document names to template names.
 #
-# html_sidebars = {}
+html_sidebars = {'**': ['relations.html', 'sourcelink.html', 'searchbox.html']}
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
diff --git a/docs/index.rst b/docs/index.rst
index e0bab7f..92c9054 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,6 +1,13 @@
 Command Line Provenance
 =======================
 
+.. toctree::
+   :maxdepth: 3
+   :hidden:
+
+   index
+   api_reference
+
 Introduction
 ------------
 
@@ -9,9 +16,11 @@ Introduction
 It was inspired by the popular `NCO <http://nco.sourceforge.net/>`_ and
 `CDO <https://code.mpimet.mpg.de/projects/cdo>`_ command line tools,
-which automatically generate a record of what was executed at the command line,
-append that record to the history attribute from the input (netCDF) data file,
-and then set the new extended record as the history attribute of the output (netCDF) data file.
+which are used to manipulate the data and/or metadata contained in netCDF files
+(a self-describing file format that is popular in the weather and climate sciences).
+These tools generate a record of what was executed at the command line,
+append that record to the history attribute from the input data file,
+and then set this command log as the history attribute of the output data file.
 
 For example, after selecting the 2001-2005 time period from a rainfall data file
 and then deleting the ``long_name`` file attribute,
@@ -24,7 +33,7 @@ the command log would look as follows:
 
 Following this simple approach to data provenance,
 it is possible to maintain a record of all data processing steps
-from intial download/creation of your data files to the end result (e.g. a .png image).
+from initial download/creation of the data files to the end result (e.g. a .png image).
 
 ``cmdline_provenance`` contains a series of functions for generating history records in the NCO/CDO format,
 and for combining the current record with previous records to maintain a complete command log.
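+
+For instance, a record of the command behind the currently running script
+can be generated with ``new_log`` (a minimal sketch; the output shown is illustrative):
+
+.. code-block:: python
+
+   >>> import cmdline_provenance as cmdprov
+   >>> my_log = cmdprov.new_log()
+   >>> print(my_log)
+   Tue Jun 26 11:24:46 2018: /Applications/anaconda/bin/python ocean_analysis.py temperature_data.nc salinity_data.nc output.nc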
@@ -82,17 +91,17 @@ and the log entry will specify the precise version of ``ocean_analysis.py`` that
 Each commit in a git repository is associated with a unique 40-character identifier known as a hash.
 The ``new_log`` function has included the first 7 characters of the hash
 associated with the latest commit to the repository,
-which is sufficient information to revert back to that previous version of ``ocean_analysis.py``.
+which is sufficient information to revert to that previous version of ``ocean_analysis.py`` if need be.
 
-outputting a log
-^^^^^^^^^^^^^^^^
+writing a log to file
+^^^^^^^^^^^^^^^^^^^^^
 
 If our output file is a self-describing file format (i.e. a format that carries its metadata with it),
-then we would include our new log in the file metadata.
+then we can include our new log in the file metadata.
 For instance,
 a common convention in weather and climate science is to include the command log
 in the global history attribute of netCDF data files.
 
-If we were using the iris library (for instance)
+If we were using the `iris <https://scitools.org.uk/iris>`_ library (for instance)
 to read and write netCDF files using its cube data structure,
 the process would look something like this:
@@ -107,22 +116,59 @@ the process would look something like this:
 
    >>> output_cube.attributes['history'] = my_log
    >>> iris.save(output_cube, 'output.nc')
 
-If the output file was not a self-describing format (e.g. a .png image),
+If the output file is not a self-describing format (e.g. ``output.png``),
 then we can write a separate log file (i.e. a simple text file with the log in it)
 using the ``write_log`` function.
 
 .. code-block:: python
 
-   >>> outfile = 'output.png'
-   >>> logfile = 'output.log'
-   >>> cmdprov.write_log(logfile, my_log)
+   >>> cmdprov.write_log('output.log', my_log)
 
 While it's not a formal requirement of the ``write_log`` function,
 it's good practice to give the log file exactly the same name as the output file,
-just with a different file extension such as .log or .txt.
+just with a different file extension (such as .log or .txt).
+
+including logs from input files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To capture the complete provenance of the output file,
+we need to include the command logs from the input files in our new log.
+We can do this using the ``infile_history`` keyword argument of the
+``new_log`` function.
+
+If our input files are in a self-describing format,
+then, as in the iris example above,
+we can extract the input file logs from the metadata of the input file(s):
+
+.. code-block:: python
+
+   >>> inlogs = {}
+   >>> inlogs['temperature_data.nc'] = temperature_cube.attributes['history']
+   >>> inlogs['salinity_data.nc'] = salinity_cube.attributes['history']
+   >>> my_log = cmdprov.new_log(infile_history=inlogs, git_repo='/path/to/git/repo/')
+   >>> print(my_log)
+   Tue Jun 26 11:24:46 2018: /Applications/anaconda/bin/python ocean_analysis.py temperature_data.nc salinity_data.nc output.nc (Git hash: 026301f)
+   History of temperature_data.nc: Tue Jun 26 09:24:03 2018: cdo daymean temperature_data.nc
+   History of salinity_data.nc: Tue Jun 26 09:22:10 2018: cdo daymean salinity_data.nc
+   Tue Jun 26 09:21:54 2018: ncatted -O -a standard_name,so,o,c,"ocean_salinity" salinity_data.nc
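+
+The combined log can then be attached to the output file just as before
+(repeating the iris pattern shown above):
+
+.. code-block:: python
+
+   >>> output_cube.attributes['history'] = my_log
+   >>> iris.save(output_cube, 'output.nc')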
+
+If the input files aren't self-describing,
+you can use the ``read_log`` function to read the log files associated with the input data files
+(these log files may have been previously written using the ``write_log`` function):
+
+.. code-block:: python
+
+   >>> inlogs = {}
+   >>> inlogs['temperature_data.csv'] = cmdprov.read_log('temperature_data.log')
+   >>> inlogs['salinity_data.csv'] = cmdprov.read_log('salinity_data.log')
+   >>> my_log = cmdprov.new_log(infile_history=inlogs, git_repo='/path/to/git/repo/')
+
-input file history
-^^^^^^^^^^^^^^^^^^
-
-If we want a complete log...
+For scripts that take many input files,
+the resulting command logs can become very long and unwieldy.
+To keep them manageable, look for ways to eliminate repetition.
+For instance, if you've got one input file that contains data from the years 1999-2003
+and another equivalent file with data from 2004-2008,
+it's probably only necessary to include the log from the 1999-2003 file
+(because essentially identical data processing steps were taken to produce both files).
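+
+A minimal sketch of this selective approach
+(the rainfall file names here are hypothetical) might look like:
+
+.. code-block:: python
+
+   >>> # Both files were produced by essentially identical processing steps,
+   >>> # so only the log from the 1999-2003 file needs to be included
+   >>> inlogs = {}
+   >>> inlogs['rainfall_1999-2003.csv'] = cmdprov.read_log('rainfall_1999-2003.log')
+   >>> my_log = cmdprov.new_log(infile_history=inlogs, git_repo='/path/to/git/repo/')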