diff --git a/.github/workflows/cgatcore_python.yml b/.github/workflows/cgatcore_python.yml index 80a0b09f..00cba1f9 100644 --- a/.github/workflows/cgatcore_python.yml +++ b/.github/workflows/cgatcore_python.yml @@ -19,14 +19,15 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Cache conda uses: actions/cache@v3 env: - # Increase this value to reset cache if conda/environments/cgat-core.yml has not changed CACHE_NUMBER: 0 with: path: ~/conda_pkgs_dir key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('conda/environments/cgat-core.yml') }} + - name: Set installer URL id: set-installer-url run: | @@ -35,6 +36,7 @@ jobs: elif [[ "${{ matrix.os }}" == "macos-latest" ]]; then echo "installer-url=https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh" >> $GITHUB_ENV fi + - uses: conda-incubator/setup-miniconda@v2 with: installer-url: ${{ env.installer-url }} @@ -43,13 +45,45 @@ jobs: channel-priority: true activate-environment: cgat-core environment-file: conda/environments/cgat-core.yml + - name: Configure Conda Paths run: echo "/usr/share/miniconda3/condabin" >> $GITHUB_PATH + - name: Show conda run: | conda info conda list + + - name: Debug Python Environment + run: | + python --version + pip list + openssl version + - name: Test run: | pip install . 
./all-tests.sh + + + deploy_docs: + name: Deploy MkDocs Documentation + runs-on: ubuntu-latest + needs: build + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install MkDocs and Dependencies + run: | + pip install mkdocs mkdocs-material mkdocstrings[python] + + - name: Build and Deploy MkDocs Site + run: mkdocs gh-deploy --force --clean + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/LICENSE b/LICENSE index f669650d..f7c62a85 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 cgat-developers +Copyright (c) 2024 cgat-developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 2a283ecf..9bf17da4 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,10 @@ ![CGAT-core](https://github.com/cgat-developers/cgat-core/blob/master/docs/img/CGAT_logo.png) ---------------------------------------- +![Licence](https://img.shields.io/github/license/cgat-developers/cgat-core.svg) +![Conda](https://img.shields.io/conda/v/bioconda/cgatcore.svg) +![Build Status](https://github.com/cgat-developers/cgat-core/actions/workflows/cgatcore_python.yml/badge.svg) -

- - - - - - - - -

---------------------------------------- @@ -19,11 +12,7 @@ CGAT-core is a workflow management system that allows users to quickly and repro data analysis pipelines. CGAT-core is a set of libraries and helper functions used to enable researchers to design and build computational workflows for the analysis of large-scale data-analysis. -Documentation for CGAT-core can be accessed at [read the docs](http://cgat-core.readthedocs.io/en/latest/) - -Used in combination with CGAT-apps, we have demonstrated the functionality of our -flexible implementation using a set of well documented, easy to install and easy to use workflows, -called [CGAT-flow](https://github.com/cgat-developers/cgat-flow) ([Documentation](https://www.cgat.org/downloads/public/cgatpipelines/documentation/)). +Documentation for CGAT-core can be accessed [here](https://cgat-developers.github.io/cgat-core/) CGAT-core is open-sourced, powerful and user-friendly, and has been continually developed as a Next Generation Sequencing (NGS) workflow management system over the past 10 years. @@ -32,19 +21,7 @@ as a Next Generation Sequencing (NGS) workflow management system over the past 1 Installation ============ -The following sections describe how to install the [cgatcore](https://cgat-core.readthedocs.io/en/latest/index.html) framework. For instructions on how to install -our other repos, CGAT-apps (scripts) and CGAT-flow (workflows/pipelines), please follow these instructions [here](https://www.cgat.org/downloads/public/cgatpipelines/documentation/InstallingPipelines.html). +The following sections describe how to install the [cgatcore](https://cgat-developers.github.io/cgat-core/) framework. The preferred method to install the cgatcore is using conda, by following the instructions on [read the docs](https://cgat-core.readthedocs.io/en/latest/getting_started/Installation.html). However, there are a few other methods to install cgatcore, including pip and our own bash script installer. 
-Linux vs OS X -============= - -* ulimit works as expected in Linux but it does not have an effect on OS X. [Disabled](https://github.com/cgat-developers/cgat-core/commit/d4d9b9fb75525873b291028a622aac70c44a5065) ulimit tests for OS X. - -* ssh.connect times out in OSX. Exception [caught](https://github.com/cgat-developers/cgat-core/commit/d4d9b9fb75525873b291028a622aac70c44a5065) - -* Linux uses /proc/meminfo and OS X uses [vm_stat](https://github.com/cgat-developers/cgat-core/compare/bb1c75df8f42...575f0699b326) - -* Currently our testing framework is broken for OSX, however we are working to fix this. However, we dont envisage any issues running the code at present. - diff --git a/all-tests.sh b/all-tests.sh index 94bf6504..32e22d59 100755 --- a/all-tests.sh +++ b/all-tests.sh @@ -18,3 +18,4 @@ pytest -v tests/test_pipeline_cli.py pytest -v tests/test_pipeline_actions.py pytest -v tests/test_execution_cleanup.py pytest -v tests/test_s3_decorators.py +pytest -v tests/test_container.py diff --git a/cgatcore/pipeline/execution.py b/cgatcore/pipeline/execution.py index 78166dea..f25aec7e 100644 --- a/cgatcore/pipeline/execution.py +++ b/cgatcore/pipeline/execution.py @@ -125,6 +125,60 @@ def _pickle_args(args, kwargs): return (submit_args, args_file) +class ContainerConfig: + """Container configuration for pipeline execution.""" + + def __init__(self, image=None, volumes=None, env_vars=None, runtime="docker"): + """ + Args: + image (str): Container image (e.g., "ubuntu:20.04"). + volumes (list): Volume mappings (e.g., ['/data:/data']). + env_vars (dict): Environment variables for the container. + runtime (str): Container runtime ("docker" or "singularity"). 
+ """ + self.image = image + self.volumes = volumes or [] + self.env_vars = env_vars or {} + self.runtime = runtime.lower() # Normalise to lowercase + + if self.runtime not in ["docker", "singularity"]: + raise ValueError("Unsupported container runtime: {}".format(self.runtime)) + + def get_container_command(self, statement): + """Convert a statement to run inside a container.""" + if not self.image: + return statement + + if self.runtime == "docker": + return self._get_docker_command(statement) + elif self.runtime == "singularity": + return self._get_singularity_command(statement) + else: + raise ValueError("Unsupported container runtime: {}".format(self.runtime)) + + def _get_docker_command(self, statement): + """Generate a Docker command.""" + volume_args = [f"-v {volume}" for volume in self.volumes] + env_args = [f"-e {key}={value}" for key, value in self.env_vars.items()] + + return " ".join([ + "docker", "run", "--rm", + *volume_args, *env_args, self.image, + "/bin/bash", "-c", f"'{statement}'" + ]) + + def _get_singularity_command(self, statement): + """Generate a Singularity command.""" + volume_args = [f"--bind {volume}" for volume in self.volumes] + env_args = [f"--env {key}={value}" for key, value in self.env_vars.items()] + + return " ".join([ + "singularity", "exec", + *volume_args, *env_args, self.image, + "bash", "-c", f"'{statement}'" + ]) + + def start_session(): """start and initialize the global DRMAA session.""" global GLOBAL_SESSION @@ -789,6 +843,13 @@ def get_val(d, v, alt): return benchmark_data + def set_container_config(self, image, volumes=None, env_vars=None, runtime="docker"): + """Set container configuration for all tasks executed by this executor.""" + + if not image: + raise ValueError("An image must be specified for the container configuration.") + self.container_config = ContainerConfig(image=image, volumes=volumes, env_vars=env_vars, runtime=runtime) + def start_job(self, job_info): """Add a job to active_jobs list when it 
starts.""" self.active_jobs.append(job_info) @@ -838,15 +899,63 @@ def cleanup_failed_job(self, job_info): else: self.logger.info(f"Output file not found (already removed or not created): {outfile}") - def run(self, statement_list): - """Run a list of statements and track each job's lifecycle.""" + def run( + self, + statement_list, + job_memory=None, + job_threads=None, + container_runtime=None, + image=None, + volumes=None, + env_vars=None, + **kwargs,): + + """ + Execute a list of statements with optional container support. + + Args: + statement_list (list): List of commands to execute. + job_memory (str): Memory requirements (e.g., "4G"). + job_threads (int): Number of threads to use. + container_runtime (str): Container runtime ("docker" or "singularity"). + image (str): Container image to use. + volumes (list): Volume mappings (e.g., ['/data:/data']). + env_vars (dict): Environment variables for the container. + **kwargs: Additional arguments. + """ + # Validation checks + if container_runtime and container_runtime not in ["docker", "singularity"]: + self.logger.error(f"Invalid container_runtime: {container_runtime}") + raise ValueError("Container runtime must be 'docker' or 'singularity'") + + if container_runtime and not image: + self.logger.error(f"Container runtime specified without an image: {container_runtime}") + raise ValueError("An image must be specified when using a container runtime") + benchmark_data = [] + for statement in statement_list: job_info = {"statement": statement} - self.start_job(job_info) # Add job to active_jobs + self.start_job(job_info) try: - # Execute job + # Prepare containerized execution + if container_runtime: + self.set_container_config(image=image, volumes=volumes, env_vars=env_vars, runtime=container_runtime) + statement = self.container_config.get_container_command(statement) + + # Add memory and thread environment variables + if job_memory: + env_vars = env_vars or {} + env_vars["JOB_MEMORY"] = job_memory + if 
job_threads: + env_vars = env_vars or {} + env_vars["JOB_THREADS"] = job_threads + + # Debugging: Log the constructed command + self.logger.info(f"Executing command: {statement}") + + # Build and execute the statement full_statement, job_path = self.build_job_script(statement) process = subprocess.Popen( full_statement, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE @@ -856,19 +965,22 @@ def run(self, statement_list): if process.returncode != 0: raise OSError( f"Job failed with return code {process.returncode}.\n" - f"stderr: {stderr.decode('utf-8')}\nstatement: {statement}" + f"stderr: {stderr.decode('utf-8')}\ncommand: {statement}" ) - # Collect benchmark data if job was successful + # Collect benchmark data for successful jobs benchmark_data.append( - self.collect_benchmark_data([statement], resource_usage=[{"job_id": process.pid}]) + self.collect_benchmark_data( + statement, resource_usage={"job_id": process.pid} + ) ) - self.finish_job(job_info) # Remove job from active_jobs + self.finish_job(job_info) except Exception as e: self.logger.error(f"Job failed: {e}") self.cleanup_failed_job(job_info) - continue + if not self.ignore_errors: + raise return benchmark_data diff --git a/conda/environments/cgat-core.yml b/conda/environments/cgat-core.yml index 795675dd..5dbf4dae 100644 --- a/conda/environments/cgat-core.yml +++ b/conda/environments/cgat-core.yml @@ -36,3 +36,4 @@ dependencies: - paramiko - pytest - pytest-pep8 +- pyopenssl>=23.2.0 diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index d7751a97..00000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SPHINXPROJ = cgatcore -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". 
-help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 09f2ee49..00000000 --- a/docs/conf.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# cgatcore documentation build configuration file, created by -# sphinx-quickstart on Sat Mar 3 13:24:26 2018. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys - -sys.path.insert(0, os.path.abspath('../')) - - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.imgmath', - 'sphinx.ext.ifconfig', - 'sphinx.ext.inheritance_diagram', - 'sphinx.ext.intersphinx', - 'sphinx.ext.napoleon'] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. 
-# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = 'cgatcore' -copyright = '2018-2019, CGAT Developers' -author = 'CGAT Developers' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '1.0' -# The full version, including alpha/beta/rc tags. -release = '1.0' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = "sphinx_rtd_theme" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". 
-# html_static_path = ['_static'] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# This is required for the alabaster theme -# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars -html_sidebars = { - '**': [ - 'relations.html', # needs 'show_related': True theme option to display - 'searchbox.html', - ] -} - -# Included at the end of each rst file -rst_epilog = ''' -.. _CGAT Training Programme: http://www.cgat.org -.. _CGAT pipeline Collection: https://www.cgat.org/downloads/public/CGATpipelines/documentation/ -.. _CGAT Code Collection: https://www.cgat.org/downloads/public/cgat/documentation/ -.. _pysam: https://github.com/pysam-developers/pysam -.. _samtools: http://samtools.sourceforge.net/ -.. _htslib: http://www.htslib.org/ -.. _tabix: http://samtools.sourceforge.net/tabix.shtml/ -.. _Galaxy: https://main.g2.bx.psu.edu/ -.. _cython: http://cython.org/ -.. _python: http://python.org/ -.. _ipython: http://ipython.org/ -.. _pyximport: http://www.prescod.net/pyximport/ -.. _sphinx: http://sphinx-doc.org/ -.. _ruffus: http://www.ruffus.org.uk/ -.. _cgatreport: https://github.com/AndreasHeger/CGATReport/ -.. _sqlite: http://www.sqlite.org/ -.. _make: http://www.gnu.org/software/make -.. _UCSC: http://genome.ucsc.edu -.. _ENSEMBL: http://www.ensembl.org -.. _GO: http://www.geneontology.org -.. _gwascatalog: http://www.genome.gov/gwastudies/ -.. _distlid: http://distild.jensenlab.org/ -.. _mysql: https://mariadb.org/ -.. _postgres: http://www.postgresql.org/ -.. _bedtools: http://bedtools.readthedocs.org/en/latest/ -.. _UCSC Tools: http://genome.ucsc.edu/admin/git.html -.. _git: http://git-scm.com/ -.. _sge: http://wiki.gridengine.info/wiki/index.php/Main_Page -.. _alignlib: https://github.com/AndreasHeger/alignlib -.. 
_iGenomes: https://support.illumina.com/sequencing/sequencing_software/igenome.html -''' -# -- Options for HTMLHelp output ------------------------------------------ - -# Output file base name for HTML help builder. -htmlhelp_basename = 'cgatcoredoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'cgatcore.tex', 'cgatcore Documentation', - 'CGAT Developers', 'manual'), -] - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'cgatcore', 'cgatcore Documentation', - [author], 1) -] - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'cgatcore', 'cgatcore Documentation', - author, 'cgatcore', 'One line description of project.', - 'Miscellaneous'), -] diff --git a/docs/container/tasks.md b/docs/container/tasks.md new file mode 100644 index 00000000..5d19bc8a --- /dev/null +++ b/docs/container/tasks.md @@ -0,0 +1,164 @@ +# Containerised Execution in `P.run()` + +The `P.run()` method supports executing jobs within container environments using **Docker** or **Singularity**. 
This functionality enables seamless integration of containerisation for computational workflows. + +## Features + +- **Container Runtime Support**: Execute jobs using either Docker or Singularity. +- **Environment Variables**: Pass custom environment variables to the container. +- **Volume Mapping**: Bind directories between the host system and the container. +- **Container-Specific Command Construction**: Automatically builds the appropriate command for Docker or Singularity. + +--- + +## API Documentation + +### `P.run()` + +The `P.run()` method executes a list of commands with optional support for containerisation via Docker or Singularity. + +### Parameters + +| Parameter | Type | Description | Default | +|---------------------|-----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------| +| `statement_list` | `list` | List of commands (statements) to execute. | Required | +| `job_memory` | `str` | Memory requirements for the job (e.g., `"4G"`). | `None` | +| `job_threads` | `int` | Number of threads to use. | `None` | +| `container_runtime` | `str` | Container runtime to use. Must be `"docker"` or `"singularity"`. | `None` | +| `image` | `str` | The container image to use (e.g., `"ubuntu:20.04"` for Docker or `/path/to/image.sif` for Singularity). | `None` | +| `volumes` | `list` | List of volume mappings (e.g., `"/host/path:/container/path"`). | `None` | +| `env_vars` | `dict` | Dictionary of environment variables to pass to the container (e.g., `{"VAR": "value"}`). | `None` | +| `**kwargs` | `dict` | Additional arguments passed to the executor. | `None` | + +### Returns + +- **`list`**: A list of benchmark data collected from executed jobs. 
+ +### Raises + +- **`ValueError`**: If invalid arguments are provided (e.g., container runtime is missing or invalid, or required arguments for container execution are not supplied). +- **`OSError`**: If the job fails during execution. + +--- + +## Examples + +### Running a Job with Docker + +To execute a job using Docker, specify the `container_runtime` as `"docker"` and provide an image. Optionally, bind host directories to container directories using `volumes`, and pass environment variables with `env_vars`. + +```python +P.run( + statement_list=["echo 'Hello from Docker'"], + container_runtime="docker", + image="ubuntu:20.04", + volumes=["/data:/data"], + env_vars={"MY_VAR": "value"} +) +``` + +This will construct and execute the following Docker command: + +```bash +docker run --rm -v /data:/data -e MY_VAR=value ubuntu:20.04 /bin/bash -c 'echo Hello from Docker' +``` + +### Running a Job with Singularity + +To execute a job using Singularity, specify the `container_runtime` as `"singularity"` and provide a Singularity Image File (SIF). Similarly, you can bind host directories and set environment variables. + +```python +P.run( + statement_list=["echo 'Hello from Singularity'"], + container_runtime="singularity", + image="/path/to/image.sif", + volumes=["/data:/data"], + env_vars={"MY_VAR": "value"} +) +``` + +This will construct and execute the following Singularity command: + +```bash +singularity exec --bind /data:/data --env MY_VAR=value /path/to/image.sif /bin/bash -c 'echo Hello from Singularity' +``` + +--- + +## Usage Notes + +1. **Container Runtime Selection**: + - Use `"docker"` for Docker-based container execution. + - Use `"singularity"` for Singularity-based container execution. + - Ensure the appropriate runtime is installed and available on the system. + +2. **Environment Variables**: + - Use the `env_vars` argument to pass environment variables to the container. + +3. 
**Volume Mapping**: + - Use the `volumes` argument to bind directories between the host system and the container. + - Docker: Use `["/host/path:/container/path"]`. + - Singularity: Use `["/host/path:/container/path"]`. + +4. **Validation**: + - If `container_runtime` is not specified, container-specific arguments such as `volumes`, `env_vars`, and `image` cannot be used. + - A valid container image must be provided if `container_runtime` is specified. + +--- + +## Error Handling + +- **Invalid Configurations**: + - Raises `ValueError` for invalid configurations, such as: + - Missing container runtime. + - Missing or invalid container image. + - Incompatible arguments (e.g., volumes provided without a container runtime). + +- **Job Failures**: + - Automatically cleans up failed jobs, including temporary files and job outputs. + +--- + +## Implementation Details + +Internally, `P.run()` constructs the appropriate command based on the specified runtime and arguments: + +### Docker + +For Docker, the command is constructed as follows: +```bash +docker run --rm -v /host/path:/container/path -e VAR=value image /bin/bash -c 'statement' +``` + +### Singularity + +For Singularity, the command is constructed as follows: +```bash +singularity exec --bind /host/path:/container/path --env VAR=value image /bin/bash -c 'statement' +``` + +Both commands ensure proper execution and clean-up after the job completes. + +--- + +## Contributing + +To add or enhance containerisation functionality, ensure: +1. New features or fixes support both Docker and Singularity. +2. Unit tests cover all edge cases for container runtime usage. +3. Updates to this documentation reflect changes in functionality. 
+ +--- + +## Adding to MkDocs + +Save this content in a markdown file (e.g., `docs/container_execution.md`) and add it to the `mkdocs.yml` navigation: + +```yaml +nav: + - Home: index.md + - P.run: + - Docker and Singularity: container_execution.md +``` + +This provides a clear, accessible reference for users leveraging containerisation with `P.run()`. diff --git a/docs/container/whole_pipeline.md b/docs/container/whole_pipeline.md new file mode 100644 index 00000000..06842929 --- /dev/null +++ b/docs/container/whole_pipeline.md @@ -0,0 +1,91 @@ +# Container Configuration for Entire Pipeline + +This document describes how to use the `Pipeline` class from `cgatcore.pipeline` to configure container settings **for the entire pipeline**. Unlike configuring individual jobs with container support, this method allows you to set up a consistent execution environment for all tasks across the entire workflow. This is useful for ensuring reproducibility and simplifying pipeline management. + +## Overview + +The `Pipeline` class from `cgatcore.pipeline` allows you to: +- Configure container support for tasks. +- Set up Docker or Singularity containers with environment variables and volume mappings. +- Seamlessly execute multiple tasks inside containers. +- Configure container settings for the entire pipeline, ensuring consistent execution environments across all tasks. + +By configuring the container support at the pipeline level, all commands that are run through `P.run()` will automatically use the specified container settings. 
+ +--- + +## Usage Examples + +### Setting Docker as the Default Runtime for the Entire Pipeline + +Below is an example of how to use the `Pipeline` class to configure and execute all tasks in the pipeline within a Docker container: + +```python +from cgatcore.pipeline import Pipeline + +# Create a pipeline instance +P = Pipeline() + +# Configure container support for Docker for the entire pipeline +P.set_container_config( + image="ubuntu:20.04", + volumes=["/data:/data", "/reference:/reference"], + env_vars={"THREADS": "4", "PATH": "/usr/local/bin:$PATH"}, + runtime="docker" +) + +# Define and run tasks - these will all run in the specified Docker container +P.run([ + "bwa mem /reference/genome.fa /data/sample1.fastq > /data/sample1.bam", + "bwa mem /reference/genome.fa /data/sample2.fastq > /data/sample2.bam" +]) +``` + +### Setting Singularity as the Default Runtime for the Entire Pipeline + +Similarly, the following example shows how to use Singularity for all tasks in the pipeline: + +```python +from cgatcore.pipeline import Pipeline + +# Create a pipeline instance +P = Pipeline() + +# Configure container support for Singularity for the entire pipeline +P.set_container_config( + image="/path/to/ubuntu.sif", + volumes=["/data:/data", "/reference:/reference"], + env_vars={"THREADS": "4", "PATH": "/usr/local/bin:$PATH"}, + runtime="singularity" +) + +# Define and run tasks - these will all run in the specified Singularity container +P.run([ + "bwa mem /reference/genome.fa /data/sample1.fastq > /data/sample1.bam", + "bwa mem /reference/genome.fa /data/sample2.fastq > /data/sample2.bam" +]) +``` + +## When to Use This Approach + +This configuration approach is ideal when: +- You want **all tasks in the pipeline** to run in the same controlled container environment without having to configure container support repeatedly for each individual command. 
+- Consistency and reproducibility are essential, as this ensures that all tasks use the same software versions, dependencies, and environment. +- You are managing complex workflows where each step depends on a well-defined environment, avoiding any variations that may arise if each step had to be configured separately. + +## Differences from Per-Command Containerisation + +- **Pipeline-Level Configuration**: Use `P.set_container_config()` to set the container settings for the entire pipeline. Every task executed through `P.run()` will use this configuration by default. +- **Per-Command Containerisation**: Use container-specific arguments in `P.run()` for each task individually, which allows different tasks to use different container settings if needed. This is covered in the separate documentation titled **Containerised Execution in `P.run()`**. + +--- + +## Conclusion + +The `Pipeline` class provides an efficient way to standardise the execution environment across all pipeline tasks. By setting container configurations at the pipeline level: +- **All tasks** will use the same Docker or Singularity environment. +- **Configuration is centralised**, reducing redundancy and the risk of errors. +- **Portability** and **reproducibility** are enhanced, making this approach particularly useful for workflows requiring a consistent environment across multiple stages. + +With these examples, users can set up a fully containerised workflow environment for all stages of their pipeline, ensuring robust and repeatable results. + diff --git a/docs/defining_workflow/Tutorial.rst b/docs/defining_workflow/Tutorial.rst deleted file mode 100644 index 29a069a3..00000000 --- a/docs/defining_workflow/Tutorial.rst +++ /dev/null @@ -1,304 +0,0 @@ -.. 
_defining_workflow-Configuration: - - -============================ -Writing a workflow- Tutorial -============================ - -The explicit aim of cgat-core is to allow users to quickly and easily build their own computational pipelines that will speed up your analysis workflow. - -Installation of cgat-core -------------------------- - -In order to begin writing a pipeline you will need to install the cgat-core code -(see :ref:`getting_started-Installation`) for installation instructions. - - -Tutorial start --------------- - -Setting up the pipleine -======================= - -**1.** First navigate to a directory where you want to start building your code:: - - mkdir test && cd test && mkdir configuration && touch configuration/pipeline.yml && touch pipeline_test.py && touch ModuleTest.py - -This will create a directory called test in the current directory with the following layout:: - - |-- configuration - | `-- pipeline.yml - `-- pipeline_test.py - -- ModuleTest.py - - -The layout has the following components:: - -pipeline_test.py - This is the file that will contain all of the ruffus workflows, the file needs - the format pipeline_.py -test/ - Directory containing the configuration yml file. The directory needs to be named - the same as the pipeline_.py file. This folder will contain the `pipeline.yml` - configuration file. -ModuleTest.py - This file will contain functions that will be imported into the main ruffus - workflow file, pipeline_test.py - -**2.** View the source code within pipeline_test.py - -This is where the ruffus tasks will be written. The code begins with a doc -string detailing the pipeline functionality. You should use this section to document your -pipeline. :: - - '''This pipeline is a test and this is where the documentation goes ''' - -The pipeline then needs a few utility functions to help with executing the pipeline. 
- -**Import statements** You will need to import ruffus and cgatcore utilities :: - - from ruffus import * - import cgatcore.experiment as E - from cgatcore import pipeline as P - -Importing ruffus allows ruffus decorators to be used within out pipeline - -Importing experiment from cgatcore is a module that contains ultility functions for argument parsion, logging and record keeping -within scripts. - -Importing pipeline from cgatcore allows users to run utility functions for interfacing with CGAT ruffus pipelines -with an HPC cluster, uploading data to a database, provides paramaterisation and more. - -You'll also need python modules:: - - import os - import sys - -**Config parser:** This code helps with parsing the pipeline.yml file:: - - # load options from the config file - PARAMS = P.get_parameters( - ["%s/pipeline.yml" % os.path.splitext(__file__)[0], - "../pipeline.yml", - "pipeline.yml"]) - -**Pipeline configuration:** We will add configurable variables to our pipeline.yml file -so that we can modify the output of out pipeline. With `pipeline.yml` open, copy and paste the following -into the file. :: - - database: - name: "csvdb" - -When you come to run the pipeline the configuration variables (in this case csvdb) can be accessed in the pipeline -by PARAMS["database_name"]. - - -**Database connection:** This code helps with connecting to a sqlite database:: - - def connect(): - '''utility function to connect to database. - - Use this method to connect to the pipeline database. - Additional databases can be attached here as well. - - Returns an sqlite3 database handle. - ''' - - dbh = sqlite3.connect(PARAMS["database_name"]) - - return dbh - - -**Commandline parser:** This bit of code allows pipeline to parse arguments. 
:: - - def main(argv=None): - if argv is None: - argv = sys.argv - P.main(argv) - - - if __name__ == "__main__": - sys.exit(P.main(sys.argv)) - - -Running test pipeline -===================== - -You now have the bare bones layout of the pipeline and you now need code to execute. Below you will -find example code that you can copy and paste into your pipeline_test.py file. The code -includes two ruffus_ **@transform** tasks that parse the pipeline.yml. The first function -called :code:`countWords` is then called which contains a statement that counts the -number of words in the file. The statement is then ran using :code:`P.run()` function. - -The second ruffus_ **@transform** function called :code:`loadWordCounts` takes as an input the output of -the function countWords and loads the number of words to a sqlite database using :code:`P.load()`. - -The third :code:`def full()` function is a dummy task that is written to run the whole -pipeline. It has an **@follows** function that takes the :code:`loadWordCounts` function. -This helps complete the pipeline chain and the pipeline can be ran with the tak name full to execute the -whole workflow. - -The following code should be pasted just before the **Commandline parser** arguments and after the **database connection** code. -:: - - # --------------------------------------------------- - # Specific pipeline tasks - @transform("pipeline.yml", - regex("(.*)\.(.*)"), - r"\1.counts") - def countWords(infile, outfile): - '''count the number of words in the pipeline configuration files.''' - - # the command line statement we want to execute - statement = '''awk 'BEGIN { printf("word\\tfreq\\n"); } - {for (i = 1; i <= NF; i++) freq[$i]++} - END { for (word in freq) printf "%%s\\t%%d\\n", word, freq[word] }' - < %(infile)s > %(outfile)s''' - - # execute command in variable statement. - # - # The command will be sent to the cluster. 
The statement will be - # interpolated with any options that are defined in in the - # configuration files or variable that are declared in the calling - # function. For example, %(infile)s will we substituted with the - # contents of the variable "infile". - P.run(statement) - - - @transform(countWords, - suffix(".counts"), - "_counts.load") - def loadWordCounts(infile, outfile): - '''load results of word counting into database.''' - P.load(infile, outfile, "--add-index=word") - - # --------------------------------------------------- - # Generic pipeline tasks - @follows(loadWordCounts) - def full(): - pass - -To run the pipeline navigate to the working directory and then run the pipeline. :: - - python /location/to/code/pipeline_test.py config - python /location/to/code/pipeline_test.py show full -v 5 - -This will place the pipeline.yml in the folder. Then run :: - - python /location/to/code/pipeline_test.py make full -v5 --local - -The pipeline will then execute and count the words in the yml file. - - -Modifying the test pipeline to build your own workflows -======================================================= - -The next step is to modify the basic code in the pipeline to fit your particular -NGS workflow needs. For example, say we wanted to convert a sam file into a bam -file then perform flag stats on that output bam file. The code and layout that we just wrote -can be easily modified to perform this. We would remove all of the code from the -specific pipeline tasks and write our own. - -The pipeline will have two steps: -1. Identify all sam files and convert to a bam file. -2. Take the output of step 1 and then perform flagstats on that bam file. - -The first step would involve writing a function to identify all -`sam` files in a `data.dir/` directory. This first function would accept a sam file then -use samtools view to convert it to a bam file. Therefore, we would require an ``@transform`` -function. 
- -The second function would then take the output of the first function, perform samtools -flagstat and then output the results as a flat .txt file. Again, an ``@transform`` function is required -to track the input and outputs. - -This would be written as follows: -:: - @transform("data.dir/*.sam", - regex("data.dir/(\S+).sam"), - r"\1.bam") - def bamConvert(infile, outfile): - 'convert a sam file into a bam file using samtools view' - - statement = ''' samtools view -bT /ifs/mirror/genomes/plain/hg19.fasta - %(infile)s > %(outfile)s''' - - P.run(statement) - - @transform(bamConvert, - suffix(".bam"), - "_flagstats.txt") - def bamFlagstats(infile, outfile): - 'perform flagstats on a bam file' - - statement = '''samtools flagstat %(infile)s > %(outfile)s''' - - P.run(statement) - - -To run the pipeline:: - - python /path/to/file/pipeline_test.py make full -v5 - - -The bam files and flagstats outputs should then be generated. - - -Parameterising the code using the .yml file -=========================================== - -Having written the basic function of our pipleine, as a philosophy, -we try and avoid any hard coded parameters. - -This means that any variables can be easily modified by the user -without having to modify any code. - -Looking at the code above, the hard coded link to the hg19.fasta file -can be added as a customisable parameter. This could allow the user to -specify any fasta file depending on the genome build used to map and -generate the bam file. - -In order to do this the :file:`pipeline.yml` file needs to be modified. This -can be performed in the following way: - -Configuration values are accessible via the :py:data:`PARAMS` -variable. The :py:data:`PARAMS` variable is a dictionary mapping -configuration parameters to values. Keys are in the format -``section_parameter``. 
For example, the key ``genome_fasta`` will -provide the configuration value of:: - - genome: - fasta: /ifs/mirror/genomes/plain/hg19.fasta - -In the pipeline.yml, add the above code to the file. In the pipeline_test.py -code the value can be accessed via ``PARAMS["genome_fasta"]``. - -Therefore the code we wrote before for parsing bam files can be modified to -:: - @transform("data.dir/*.sam", - regex("data.dir/(\S+).sam"), - r"\1.bam") - def bamConvert(infile, outfile): - 'convert a sam file into a bam file using samtools view' - - genome_fasta = PARAMS["genome_fasta"] - - statement = ''' samtools view -bT %(genome_fasta)s - %(infile)s > %(outfile)s''' - - P.run(statement) - - @transform(bamConvert, - suffix(".bam"), - "_flagstats.txt") - def bamFlagstats(infile, outfile): - 'perform flagstats on a bam file' - - statement = '''samtools flagstat %(infile)s > %(outfile)s''' - - P.run(statement) - - -Running the code again should generate the same output. However, if you -had bam files that came from a different genome build then the parameter in the yml file -can be modified easily, the output files deleted and the pipeline ran using the new configuration values. diff --git a/docs/defining_workflow/Writing_workflow.rst b/docs/defining_workflow/Writing_workflow.rst deleted file mode 100644 index 189a403a..00000000 --- a/docs/defining_workflow/Writing_workflow.rst +++ /dev/null @@ -1,611 +0,0 @@ -.. _defining_workflow-Writing_workflow: - -================== -Writing a workflow -================== - - -.. _defining_workflow-philosophy: - -Our workflow philosophy ------------------------ - -The explicit aim of CGAT-core is to allow users to quickly and easily build their own computational pipelines that will speed up your analysis workflow. - -When building pipelines it is often useful to keep in mind the following philosophy: - -Flexibility - There are always new tools and insights that could be incorporated into a pipeline. 
Ultimately, a pipeline should be flexible and the code should not constrain you when implimenting new features. -Scriptability - The pipeline should be scriptable, i.e, the whole pipeline can be run within another pipeline. Similarly, parts of a pipeline can be duplicated to process several data streams in parallel. This is a crucial feature in genome studies as a single analysis will not permit making inferences by itself. When we write a pipeline we usually attempt to write a command line script (and include it in the CGAT-apps repository) and then run this script as a command line statement in the pipeline. -Reproducibility - The pipeline is fully automated. The same inputs and configuration will produce the same outputs. -Reusability - The pipeline should be able to be re-used on similar data, preferably only requiring changes to a configuration file (pipeline.yml). -Archivability - Once finished, the whole project should be able to be archived without too many major dependencies on external data. This should be a simple process and hence all project data should be self-contained. It should not involve going through various directories or databases to figure out which files and tables belong to a project or a project depends on. - -.. _defining_workflow-building: - -Building a pipeline -------------------- - -The best way to build a pipeline is to start from an example. In `cgat-showcase `_ we have a toy example of an RNA-seq -analysis pipeline that aims to show users how simple workflows can be generated with minimal code. `cgat-flow `_ demonstrates a set of complex workflows. - -For a step by step tutorial on how to run the pipelines please refer to our :ref:`getting_started-Tutorial`. - -For help on how to construct pipelines from scratch please continue reading for more information. - -In an empty directory you will need to make a new directory and then a python file -with the same name. 
For example:: - - mkdir test && touch pipeline_test.py - -All pipelines require a yml configuration file that will allow you to add configurable values to modify the behaviour of your code. -This is placed within the test/ directory, which should have the same name as the name of your pipeline_test.py file:: - - touch test/pipeline.yml - -In order to help with debugging and reading our code, our pipelines are written so that -a pipeline task file contains Ruffus tasks and calls functions in an associated module file, -which contains all of the code to transform and analyse the data. - -Therefore, if you wish to create a module file, we usually save this file in the following convention, -``ModuleTest.py`` and it can be imported into the main pipeline task file (``pipeline_test.py``)as: - -.. code-block:: python - - import ModuleTest - -This section describes how pipelines can be constructed using the -:mod:`pipeline` module in cgat-core. The `pipeline `_ module contains a variety of -useful functions for pipeline construction. - -.. _defining_workflow-p-input: - -pipeline input --------------- - -Pipelines are executed within a dedicated working -directory. They usually require the following files within this -directory: - - * a pipeline configuration file :file:`pipeline.yml` - * input data files, usually listed in the documentatuion of each pipeline - -Other files that might be used in a pipeline are: - - * external data files such as genomes that a referred to by they their full path name. - -The pipelines will work from the input files in the working -directory, usually identified by their suffix. For example, a -mapping pipeline might look for any ``*.fastq.gz`` files in the -directory, run QC on these and map the reads to a genome sequence etc. - -.. _defining_workflow-p-output: - -pipeline output ----------------- - -The pipeline will create files and database tables in the -working directory. 
When building a pipeline, you can choose -any file/directory layout that suits your needs. Some prefer flat -hierarchies with many files, while others prefer deep directories. - -.. _defining_workflow-guidelines: - -Guidelines ----------- - -To preserve disk space, we always use compressed files as -much as possible. Most data files compress very well, for example -fastq files often compress by a factor of 80% or more: a 10Gb file -will use just 2Gb. - -Working with compressed files is straight-forward using unix pipes and -the commands ``gzip``, ``gunzip`` or ``zcat``. - -If you require random access to a file, load the file into the -database and index it appropriately. Genomic interval files can be -indexed with tabix to allow random access. - -.. _pipelineCommands: - - -Import statements ------------------ - -In order to run our pipelines you will need to import the cgatcore python -modules into your pipeline. For every CGAT pipeline we recommend importing the -basic modules as follows. Then any additional modules can be imported as required. - -.. code-block:: python - - from ruffus import * - import cgatcore.experiment as E - from cgatcore import pipeline as P - import cgatcore.iotools as iotools - -Selecting the appropriate Ruffus decorator ------------------------------------------- - -Before starting to write a pipeline it is always best to map out -on a whiteboard the the steps and flow of your potential pipeline. This will allow you -to identify the input and outputs of each task. Once you have assessed this then the next step is -to identify which Ruffus decorator you require. 
Documentation on each decorator can be found in the -`ruffus documentation `_ - - - -Running commands within tasks ------------------------------ - -To run a command line program within a pipeline task, build a -statement and call the :meth:`pipeline.run` method:: - - @transform( '*.unsorted', suffix('.unsorted'), '.sorted') - def sortFile( infile, outfile ): - - statement = '''sort %(infile)s > %(outfile)s''' - P.run(statement) - -On calling the :meth:`pipeline.run` method, the environment of the -caller is examined for a variable called ``statement``. The variable -is subjected to string substitution from other variables in the local -namespace. In the example above, ``%(infile)s`` and ``%(outfile)s`` -are substituted with the values of the variables ``infile`` and -``outfile``, respectively. - -The same mechanism also permits setting configuration parameters, for example:: - - @transform( '*.unsorted', suffix('.unsorted'), '.sorted') - def sortFile( infile, outfile ): - - statement = '''sort -t %(tmpdir)s %(infile)s > %(outfile)s''' - P.run(statement) - -will automatically substitute the configuration parameter ``tmpdir`` -into the command. See ConfigurationValues_ for more on using configuration -parameters. - -The pipeline will stop and return an error if the command exits with an error code. - -If you chain multiple commands, only the return value of the last -command is used to check for an error. Thus, if an upstream command -fails, it will go unnoticed. To detect these errors, insert -``&&`` between commands. 
For example:: - - @transform( '*.unsorted.gz', suffix('.unsorted.gz'), '.sorted) - def sortFile( infile, outfile ): - - statement = '''gunzip %(infile)s %(infile)s.tmp && - sort -t %(tmpdir)s %(infile)s.tmp > %(outfile)s && - rm -f %(infile)s.tmp - P.run(statement) - -Of course, the statement aboved could be executed more efficiently -using pipes:: - - @transform( '*.unsorted.gz', suffix('.unsorted.gz'), '.sorted.gz') - def sortFile( infile, outfile ): - - statement = '''gunzip < %(infile)s - | sort -t %(tmpdir)s - | gzip > %(outfile)s''' - P.run(statement) - -The pipeline inserts code automatically to check for error return -codes if multiple commands are combined in a pipe. - -Running commands on the cluster -------------------------------- - -In order to run commands on cluster, use ``to_cluster=True``. - -To run the command from the previous section on the cluster:: - - @files( '*.unsorted.gz', suffix('.unsorted.gz'), '.sorted.gz') - def sortFile( infile, outfile ): - - to_cluster = True - statement = '''gunzip < %(infile)s - | sort -t %(tmpdir)s - | gzip > %(outfile)s''' - P.run(statement) - -The pipeline will automatically create the job submission files, -submit the job to the cluster and wait for its return. - -pipelines will use the command line options ``--cluster-queue``, -``--cluster-priority``, etc. for global job control. For example, to -change the priority when starting the pipeline, use:: - - python --cluster-priority=-20 - -To set job options specific to a task, you can define additional -variables:: - - @files( '*.unsorted.gz', suffix('.unsorted.gz'), '.sorted.gz') - def sortFile( infile, outfile ): - - to_cluster = True - job_queue = 'longjobs.q' - job_priority = -10 - job_options= "-pe dedicated 4 -R y" - - statement = '''gunzip < %(infile)s - | sort -t %(tmpdir)s - | gzip > %(outfile)s''' - P.run(statement) - -The above statement will be run in the queue ``longjobs.q`` at a -priority of ``-10``. 
Additionally, it will be executed in the -parallel environment ``dedicated`` with at least 4 cores. - -Array jobs can be controlled through the ``job_array`` variable:: - - @files( '*.in', suffix('.in'), '.out') - def myGridTask( infile, outfile ): - - job_array=(0, nsnps, stepsize) - - statement = '''grid_task.bash %(infile)s %(outfile)s - > %(outfile)s.$SGE_TASK_ID 2> %(outfile)s.err.$SGE_TASK_ID - ''' - P.run(statement) - - -Note that the :file:`grid_task.bash` file must be grid engine -aware. This means it makes use of the :envvar:`SGE_TASK_ID`, -:envvar:`SGE_TASK_FIRST`, :envvar:`SGE_TASK_LAST` and -:envvar:`SGE_TASK_STEPSIZE` environment variables to select the chunk -of data it wants to work on. - -The job submission files are files called `tmp*` in the :term:`working -directory`. These files will be deleted automatically. However, the -files will remain after aborted runs to be cleaned up manually. - -.. _defining_workflow-databases: - - -Useful information regarding decorators ---------------------------------------- - -To see a full list of ruffus decorators that control the flow of the pipeline please -see the `ruffus documentation `_. - -However, during peer review it was pointed out that it would be helpful to include a few examples of -how you can modify the infile name and transform it to the output filename. There are a few ways of doing this: - -The first way is to capture the suffix so the outfile is placed into the same folder as the infile:: - - # pairs are a tuple of read pairs (read1, read2) - @transform(pairs, - suffix(.fastq.gz), - ("_trimmed.fastq.gz", "_trimmed.fastq.gz")) - -This will transform an input .fastq.gz and result in an output -with a new siffix _trimmed.fastq.gz. 
- -Another way to add a output file into aother filer is to use a regex:: - - @follows(mkdir("new_folder.dir")) - @transform(pairs, - regex((\S+).fastq.gz), - (r"new_folder.dir/\1_trimmed.fastq.gz", r"new_folder.dir/\1_trimmed.fastq.gz")) - -This can also be achieved using the formatter function:: - - @follows(mkdir("new_folder.dir")) - @transform(pairs, - formatter((\S+).fastq.gz), - ("new_folder.dir/{SAMPLE[0]}_trimmed.fastq.gz", r"new_folder.dir/{SAMPLE[0]}_trimmed.fastq.gz")) - - -Combining commands together ---------------------------- - -In order to combine commands together you will need to use `&&` -to make sure your commands are chained correctly. For example:: - - statement = """ - module load cutadapt && - cutadapt .... - """ - - P.run(statement) - -If you didnt have the `&&` then the command will fail because the cutadapt command will be -executed as part of the module load statement. - -Databases ---------- - -Loading data into the database -============================== - -:mod:`pipeline.py` offers various tools for working with databases. By -default, it is configured to use an sqlite3 database in the -:term:`working directory` called :file:`csvdb`. - -Tab-separated output files can be loaded into a table using the -:meth:`pipeline.load` function. For example:: - - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") - @transform('data_*.tsv.gz', suffix('.tsv.gz'), '.load') - def loadTables(infile, outfile): - P.load(infile, outfile) - -The task above will load all tables ending with ``tsv.gz`` into the -database Table names are given by the filenames, i.e, the data in -:file:`data_1.tsv.gz` will be loaded into the table :file:`data_1`. - -The load mechanism uses the script :file:`csv2db.py` and can be -configured using the configuration options in the ``database`` section -of :file:`pipeline.ini`. 
Additional options can be given via the -optional *options* argument:: - - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") - @transform('data_*.tsv.gz', suffix('.tsv.gz'), '.load') - def loadTables( infile, outfile ): - P.load(infile, outfile, "--add-index=gene_id") - -In order for the load mechanism to be transparent, it is best avoided -to call the :file:`csv2db.py` script directly. Instead, use the -:meth:`pipeline.load` function. If the :file:`csv2db.py` needs to -called at the end of a succession of statements, use the -:meth:`pipeline.build_load_statement` method, for example:: - - def loadTranscript2Gene(infile, outfile): - '''build and load a map of transcript to gene from gtf file - ''' - load_statement = P.build_load_statement( - P.toTable(outfile), - options="--add-index=gene_id " - "--add-index=transcript_id ") - - statement = ''' - gunzip < %(infile)s - | python %(scriptsdir)s/gtf2tsv.py --output-map=transcript2gene -v 0 - | %(load_statement)s - > %(outfile)s''' - P.run() - -See also the variants :meth:`pipeline.mergeAndLoad` and -`:meth:`pipeline.concatenateAndLoad` to combine multiple tables and -upload to the database in one go. - -Connecting to a database -======================== - -To use data in the database in your tasks, you need to first connect -to the database. The best way to do this is via the connect() method -in pipeline.py. - -The following example illustrates how to use the connection:: - - @transform( ... ) - def buildCodingTranscriptSet( infile, outfile ): - - dbh = connect() - - statement = '''SELECT DISTINCT transcript_id FROM transcript_info WHERE transcript_biotype = 'protein_coding' ''' - cc = dbh.cursor() - transcript_ids = set( [x[0] for x in cc.execute(statement)] ) - ... - -.. _pipelineReports: - -Reports -------- - -MultiQC -======= - -When using cgat-core to build pipelines we recomend using `MultiQC `_ -as the default reporting tool for generic thrid party computational biology software. 
- -To run multiQC in our pipelines you only need to run a statement as a commanline -task. For example we impliment this in our pipelines as:: - - @follows(mkdir("MultiQC_report.dir")) - @originate("MultiQC_report.dir/multiqc_report.html") - def renderMultiqc(infile): - '''build mulitqc report''' - - statement = '''LANG=en_GB.UTF-8 multiqc . -f; - mv multiqc_report.html MultiQC_report.dir/''' - - P.run(statement) - - -Rmarkdown -========= - -MultiQC is very useful for running third generation computational biology tools. However, currently -it is very difficult to use it as a bespoke reporting tool. Therefore, one was of running -bespoke reports is using the Rmarkdown framework and using the render functionality of knitr. - -Rendering an Rmarkdown document is very easy if you place the .Rmd file in the same test/ directory as the pipeline.yml. -Then the file can easily run using:: - - @follows(mkdir("Rmarkdown.dir")) - @originate("Rmarkdown.dir/report.html") - def render_rmarkdown(outfile): - - NOTEBOOK_ROOT = os.path.join(os.path.dirname(__file__), "test") - - statement = '''cp %(NOTEBOOK_ROOT)s/report.Rmd Rmarkdown.dir && - cd Rmarkdown.dir/ && R -e "rmarkdown::render('report.Rmd',encoding = 'UTF-8')" ''' - - P.run(statement) - -This should generate an html output of whatever report your wrote for your particular task. - - -Jupyter notebook -================ - -Another bespoke reporting that we also perform for our pipelines is to use a Jupyter notebook -implimentation and execture it in using the commandline. 
All that is required is that you -place your jupyter notebook into the same test/ directory as the pipeline.yml and call the following:: - - @follows(mkdir("jupyter_report.dir")) - @originate("jupyter_report.dir/report.html") - def render_jupyter(outfile): - - NOTEBOOK_ROOT = os.path.join(os.path.dirname(__file__), "test") - - statement = '''cp %(NOTEBOOK_ROOT)s/report.ipynb jupyter_report.dir/ && cd jupyter_report.dir/ && - jupyter nbconvert --ExecutePreprocessor.timeout=None --to html --execute *.ipynb --allow-errors; - - P.run(statement) - - -.. _ConfigurationValues: - -Configuration values --------------------- - -Setting up configuration values -=============================== - -There are different ways to pass on configuration values to pipelines. -Here we explain the priority for all the possible options so you can -choose the best one for your requirements. - -The pipeline goes *in order* through different configuration options -to load configuration values and stores them in the :py:data:`PARAMS` -dictionary. This order determines a priority so values read in the first -place can be overwritten by values read in subsequent steps; i.e. values -read lastly have higher priority. - -Here is the order in which the configuration values are read: - -1. Hard-coded values in :file:`cgatcore/pipeline/parameters.py`. -2. Parameters stored in :file:`pipeline.yml` files in different locations. -3. Variables declared in the ruffus tasks calling ``P.run()``; - e.g. ``job_memory=32G`` -4. :file:`.cgat.yml` file in the home directory -5. ``cluster_*`` options specified in the command line; - e.g. ``python pipeline_example.py --cluster-parallel=dedicated make full`` - - -This means that configuration values for the cluster provided as -command-line options will have the highest priority. Therefore:: - - python pipeline_example.py --cluster-parallel=dedicated make full - -will overwrite any ``cluster_parallel`` configuration values given -in :file:`pipeline.yml` files. 
Type:: - - python pipeline_example.py --help - -to check the full list of available command-line options. - -You are encouraged to include the following snippet at the beginning -of your pipeline script to setup proper configuration values for -your analyses:: - - # load options from the config file - from cgatcore import pipeline as P - # load options from the config file - P.get_parameters( - ["%s/pipeline.yml" % os.path.splitext(__file__)[0], - "../pipeline.yml", - "pipeline.yml"]) - -The method :meth:`pipeline.getParameters` reads parameters from -the :file:`pipeline.yml` located in the current :term:`working directory` -and updates :py:data:`PARAMS`, a global dictionary of parameter values. -It automatically guesses the type of parameters in the order of ``int()``, -``float()`` or ``str()``. If a configuration variable is empty (``var=``), -it will be set to ``None``. - -However, as explained above, there are other :file:`pipeline.yml` -files that are read by the pipeline at start up. In order to get the -priority of them all, you can run:: - - python pipeline_example.py printconfig - -to see a complete list of :file:`pipeline.yml` files and their priorities. - - -Using configuration values -========================== - -Configuration values are accessible via the :py:data:`PARAMS` -variable. The :py:data:`PARAMS` variable is a dictionary mapping -configuration parameters to values. Keys are in the format -``section_parameter``. For example, the key ``bowtie_threads`` will -provide the configuration value of:: - - bowtie: - threads: 4 - -In a script, the value can be accessed via -``PARAMS["bowtie_threads"]``. - -Undefined configuration values will throw a :class:`ValueError`. 
To -test if a configuration variable exists, use:: - - if 'bowtie_threads' in PARAMS: pass - -To test, if it is unset, use:: - - if 'bowie_threads' in PARAMS and not PARAMS['botwie_threads']: - pass - -Task specific parameters ------------------------- - -Task specific parameters can be set by creating a task specific section in -the :file:`pipeline.yml`. The task is identified by the output filename. -For example, given the following task:: - - @files( '*.fastq', suffix('.fastq'), '.bam') - def mapWithBowtie( infile, outfile ): - ... - -and the files :file:`data1.fastq` and :file:`data2.fastq` in the -:term:`working directory`, two output files :file:`data.bam` and -:file:`data2.bam` will be created on executing ``mapWithBowtie``. Both -will use the same parameters. To set parameters specific to the -execution of :file:`data1.fastq`, add the following to -:file:`pipeline.yml`:: - - data1.fastq: - bowtie_threads: 16 - -This will set the configuration value ``bowtie_threads`` to 16 when -using the command line substitution method in :meth:`pipeline.run`. To -get an task-specific parameter values in a python task, use:: - - @files( '*.fastq', suffix('.fastq'), '.bam') - def mytask( infile, outfile ): - MY_PARAMS = P.substitute_parameters( locals() ) - -Thus, task specific are implemented generically using the -:meth:`pipeline.run` mechanism, but pipeline authors need to -explicitely code for track specific parameters. - -Using different conda environments ----------------------------------- - -In addition to running a pipeline using your default conda environment, specifying `job_condaenv=""` to the -P.run() function allows you run the statement using a different conda environment. For example:: - - @follows(mkdir("MultiQC_report.dir")) - @originate("MultiQC_report.dir/multiqc_report.html") - def renderMultiqc(infile): - '''build mulitqc report''' - - statement = '''LANG=en_GB.UTF-8 multiqc . 
-f;
-            mv multiqc_report.html MultiQC_report.dir/''' - - P.run(statement, job_condaenv="multiqc") - -This can be extremely useful when you have python 2 only code but are running in a python 3 environment. Or -more importantly, when you have conflicting dependancies in software and you need to seperate them out into -two different environments.xs diff --git a/docs/defining_workflow/run_parameter.md b/docs/defining_workflow/run_parameter.md new file mode 100644 index 00000000..a1eca9d4 --- /dev/null +++ b/docs/defining_workflow/run_parameter.md @@ -0,0 +1,39 @@ +# Setting run parameters + +Our workflows are executed using default settings that specify parameters for requirements such as memory, threads, environment, etc. Each of these parameters can be modified within the pipeline as needed. + +## Modifiable run parameters + +- **`job_memory`**: Amount of memory to allocate for the task (not a thread/core count). Default: "4G". +- **`job_total_memory`**: Total memory to use for a job. +- **`to_cluster`**: Send the job to the cluster. Default: `True`. +- **`without_cluster`**: Run the job locally when set to `True`. Default: `False`. +- **`cluster_memory_ulimit`**: Restrict virtual memory. Default: `False`. +- **`job_condaenv`**: Name of the conda environment to use for each job. Default: will use the one specified in `bashrc`. +- **`job_array`**: If set to `True`, run the statement as an array job. `job_array` should be a tuple with start, end, and increment values. Default: `False`. 
+
+## Specifying parameters to a job
+
+Parameters can be set within a pipeline task as follows:
+
+```python
+@transform('*.unsorted', suffix('.unsorted'), '.sorted')
+def sortFile(infile, outfile):
+    statement = '''sort -T %(tmpdir)s %(infile)s > %(outfile)s'''
+    P.run(statement,
+          job_condaenv="sort_environment",
+          job_memory="30G",
+          job_threads=2,
+          without_cluster=False,
+          job_total_memory="50G")
+```
+
+In this example, the `sortFile` function sorts an unsorted file and saves it as a new sorted file. The `P.run()` statement is used to specify various parameters:
+
+- `job_condaenv="sort_environment"`: This specifies that the task should use the `sort_environment` conda environment.
+- `job_memory="30G"`: This sets the memory requirement for the task to 30GB.
+- `job_threads=2`: The task will use 2 threads.
+- `without_cluster=False`: This ensures the job is sent to the cluster.
+- `job_total_memory="50G"`: The total memory allocated for the job is 50GB.
+
+These parameters allow fine-tuning of job execution to fit specific computational requirements, such as allocating more memory or running on a local machine rather than a cluster.
\ No newline at end of file
diff --git a/docs/defining_workflow/run_parameters.rst b/docs/defining_workflow/run_parameters.rst
deleted file mode 100644
index e3b04f14..00000000
--- a/docs/defining_workflow/run_parameters.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-.. _defining_workflow-run_parameters:
-
-======================
-Setting run parameters
-======================
-
-Our workflows are executed using defaults that specify parameters for
-setting requirements for memory, threads, environment, e.c.t. Each of these
-parameters can be modified within the pipeline.
-
-Modifiable run parameters
--------------------------
-
-- `job_memory`: Number of slots (threads/cores/CPU) to use for the task. Default: "4G"
-- `job_total_memory`: Total memory to use for a job.
-- `to_cluster`: Send the job to the cluster.
Default: True -- `without_cluster`: When this is set to True the job is ran locally. Default: False -- `cluster_memory_ulimit`: Restrict virtual memory. Default: False -- `job_condaenv`: Name of the conda environment to use for each job. Default: will use the one specified in bashrc -- `job_array`: If set True, run statement as an array job. Job_array should be tuple with start, end, and increment. Default: False - -Specifying parameters to job ----------------------------- - -Parameters can be set within a pipeline task as follows:: - - @transform( '*.unsorted', suffix('.unsorted'), '.sorted') - def sortFile( infile, outfile ): - - statement = '''sort -t %(tmpdir)s %(infile)s > %(outfile)s''' - - P.run(statement, - job_condaenv="sort_environment", - job_memory=30G, - job_threads=2, - without_cluster = False, - job_total_memory = 50G) diff --git a/docs/defining_workflow/tutorial.md b/docs/defining_workflow/tutorial.md new file mode 100644 index 00000000..5df41d47 --- /dev/null +++ b/docs/defining_workflow/tutorial.md @@ -0,0 +1,243 @@ +# Writing a workflow - Tutorial + +The explicit aim of cgat-core is to allow users to quickly and easily build their own computational pipelines that will speed up your analysis workflow. + +## Installation of cgat-core + +In order to begin writing a pipeline, you will need to install the cgat-core code (see installation instructions in the "Getting Started" section). 
+
+## Tutorial start
+
+### Setting up the pipeline
+
+**1.** First, navigate to a directory where you want to start building your code:
+
+```bash
+mkdir test && cd test && mkdir configuration && touch configuration/pipeline.yml && touch pipeline_test.py && touch ModuleTest.py
+```
+
+This command will create a directory called `test` in the current directory with the following layout:
+
+```
+|-- configuration
+|   \-- pipeline.yml
+|-- pipeline_test.py
+|-- ModuleTest.py
+```
+
+The layout has the following components:
+
+- **pipeline_test.py**: This is the file that will contain all of the ruffus workflows. The file needs to be named in the format `pipeline_<name>.py`.
+- **configuration/**: Directory containing the configuration `.yml` file (named `configuration` in this tutorial; by convention it is given the same name as the `pipeline_<name>.py` file). This folder will contain the `pipeline.yml` configuration file.
+- **ModuleTest.py**: This file will contain functions that will be imported into the main ruffus workflow file (`pipeline_test.py`).
+
+**2.** View the source code within `pipeline_test.py`
+
+This is where the ruffus tasks will be written. The code begins with a docstring detailing the pipeline functionality. You should use this section to document your pipeline:
+
+```python
+'''This pipeline is a test and this is where the documentation goes '''
+```
+
+The pipeline then needs a few utility functions to help with executing the pipeline.
+
+- **Import statements**: You will need to import ruffus and cgatcore utilities:
+
+```python
+from ruffus import *
+import cgatcore.experiment as E
+from cgatcore import pipeline as P
+```
+
+Importing `ruffus` allows ruffus decorators to be used within the pipeline.
+Importing `experiment` from `cgatcore` provides utility functions for argument parsing, logging, and record-keeping within scripts.
+Importing `pipeline` from `cgatcore` provides utility functions for interfacing CGAT ruffus pipelines with an HPC cluster, uploading data to a database, and parameterisation.
+
+You'll also need some Python modules:
+
+```python
+import os
+import sys
+import sqlite3
+```
+
+- **Config parser**: This code helps with parsing the `pipeline.yml` file:
+
+```python
+# Load options from the config file
+PARAMS = P.get_parameters([
+    "%s/pipeline.yml" % os.path.splitext(__file__)[0],
+    "../pipeline.yml",
+    "pipeline.yml"])
+```
+
+- **Pipeline configuration**: We will add configurable variables to our `pipeline.yml` file so that we can modify the output of our pipeline. Open `pipeline.yml` and add the following:
+
+```yaml
+database:
+    name: "csvdb"
+```
+
+When you run the pipeline, the configuration variables (in this case `csvdb`) can be accessed in the pipeline by `PARAMS["database_name"]`.
+
+- **Database connection**: This code helps with connecting to an SQLite database:
+
+```python
+def connect():
+    '''Utility function to connect to the database.
+
+    Use this method to connect to the pipeline database.
+    Additional databases can be attached here as well.
+
+    Returns an sqlite3 database handle.
+    '''
+    dbh = sqlite3.connect(PARAMS["database_name"])
+    return dbh
+```
+
+- **Commandline parser**: This code allows the pipeline to parse arguments:
+
+```python
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv
+    P.main(argv)
+
+if __name__ == "__main__":
+    sys.exit(P.main(sys.argv))
+```
+
+### Running the test pipeline
+
+You now have the bare bones layout of the pipeline, and you need some code to execute. Below is example code that you can copy and paste into your `pipeline_test.py` file. The code includes two ruffus `@transform` tasks that parse `pipeline.yml`. The first function, called `countWords`, contains a statement that counts the number of words in the file. The statement is then executed using the `P.run()` function.
+ +The second ruffus `@transform` function called `loadWordCounts` takes as input the output of the function `countWords` and loads the number of words into an SQLite database using `P.load()`. + +The third function, `full()`, is a dummy task that runs the entire pipeline. It has an `@follows` decorator that takes the `loadWordCounts` function, completing the pipeline chain. + +The following code should be pasted just before the **Commandline parser** arguments and after the **database connection** code: + +```python +# --------------------------------------------------- +# Specific pipeline tasks +@transform("pipeline.yml", + regex("(.*)\.(.*)"), + r"\1.counts") +def countWords(infile, outfile): + '''Count the number of words in the pipeline configuration files.''' + + # The command line statement we want to execute + statement = '''awk 'BEGIN { printf("word\tfreq\n"); } + {for (i = 1; i <= NF; i++) freq[$i]++} + END { for (word in freq) printf "%s\t%d\n", word, freq[word] }' + < %(infile)s > %(outfile)s''' + + # Execute the command in the variable statement. + P.run(statement) + +@transform(countWords, + suffix(".counts"), + "_counts.load") +def loadWordCounts(infile, outfile): + '''Load results of word counting into database.''' + P.load(infile, outfile, "--add-index=word") + +# --------------------------------------------------- +# Generic pipeline tasks +@follows(loadWordCounts) +def full(): + pass +``` + +To run the pipeline, navigate to the working directory and then run the pipeline: + +```bash +python /location/to/code/pipeline_test.py config +python /location/to/code/pipeline_test.py show full -v 5 +``` + +This will place the `pipeline.yml` in the folder. Then run: + +```bash +python /location/to/code/pipeline_test.py make full -v5 --local +``` + +The pipeline will then execute and count the words in the `yml` file. 
+ +### Modifying the test pipeline to build your own workflows + +The next step is to modify the basic code in the pipeline to fit your particular NGS workflow needs. For example, suppose you want to convert a SAM file into a BAM file, then perform flag stats on that output BAM file. The code and layout that we just wrote can be easily modified to perform this. + +The pipeline will have two steps: +1. Identify all SAM files and convert them to BAM files. +2. Take the output of step 1 and perform flag stats on that BAM file. + +The first step would involve writing a function to identify all `sam` files in a `data.dir/` directory and convert them to BAM files using `samtools view`. The second function would then take the output of the first function, perform `samtools flagstat`, and output the results as a flat `.txt` file. This would be written as follows: + +```python +@transform("data.dir/*.sam", + regex("data.dir/(\S+).sam"), + r"\1.bam") +def bamConvert(infile, outfile): + '''Convert a SAM file into a BAM file using samtools view.''' + + statement = '''samtools view -bT /ifs/mirror/genomes/plain/hg19.fasta \ + %(infile)s > %(outfile)s''' + P.run(statement) + +@transform(bamConvert, + suffix(".bam"), + "_flagstats.txt") +def bamFlagstats(infile, outfile): + '''Perform flagstats on a BAM file.''' + + statement = '''samtools flagstat %(infile)s > %(outfile)s''' + P.run(statement) +``` + +To run the pipeline: + +```bash +python /path/to/file/pipeline_test.py make full -v5 +``` + +The BAM files and flagstats outputs should be generated. + +### Parameterising the code using the `.yml` file + +As a philosophy, we try and avoid any hardcoded parameters, so that any variables can be easily modified by the user without changing the code. + +Looking at the code above, the hardcoded link to the `hg19.fasta` file can be added as a customisable parameter, allowing users to specify any FASTA file depending on the genome build used. 
In the `pipeline.yml`, add: + +```yaml +genome: + fasta: /ifs/mirror/genomes/plain/hg19.fasta +``` + +In the `pipeline_test.py` code, the value can be accessed via `PARAMS["genome_fasta"]`. +Therefore, the code for parsing BAM files can be modified as follows: + +```python +@transform("data.dir/*.sam", + regex("data.dir/(\S+).sam"), + r"\1.bam") +def bamConvert(infile, outfile): + '''Convert a SAM file into a BAM file using samtools view.''' + + genome_fasta = PARAMS["genome_fasta"] + + statement = '''samtools view -bT %(genome_fasta)s \ + %(infile)s > %(outfile)s''' + P.run(statement) + +@transform(bamConvert, + suffix(".bam"), + "_flagstats.txt") +def bamFlagstats(infile, outfile): + '''Perform flagstats on a BAM file.''' + + statement = '''samtools flagstat %(infile)s > %(outfile)s''' + P.run(statement) +``` + +Running the code again will generate the same output. However, if you had BAM files that came from a different genome build, the parameter in the `yml` file can be easily modified, the output files deleted, and the pipeline run again with the new configuration values. + diff --git a/docs/defining_workflow/writing_workflows.md b/docs/defining_workflow/writing_workflows.md new file mode 100644 index 00000000..9e31900c --- /dev/null +++ b/docs/defining_workflow/writing_workflows.md @@ -0,0 +1,220 @@ +# Writing a workflow + +## Our workflow philosophy + +The explicit aim of CGAT-core is to allow users to quickly and easily build their own computational pipelines, speeding up their analysis workflow. + +When building pipelines, it is often useful to keep in mind the following guiding principles: + +### Flexibility + +There are always new tools and insights that could be incorporated into a pipeline. Ultimately, a pipeline should be flexible, and the code should not constrain you when implementing new features. + +### Scriptability + +The pipeline should be scriptable, i.e., the entire pipeline can be run within another pipeline. 
Similarly, parts of a pipeline should be easily duplicated to process several data streams in parallel. This is crucial in genome studies, as a single analysis will not always permit making inferences by itself. When writing a pipeline, we typically create a command line script (included in the CGAT-apps repository) and then run this script as a command line statement in the pipeline. + +### Reproducibility + +The pipeline should be fully automated so that the same inputs and configuration produce the same outputs. + +### Reusability + +The pipeline should be reusable on similar data, preferably requiring only changes to a configuration file (such as `pipeline.yml`). + +### Archivability + +Once finished, the whole project should be archivable without relying heavily on external data. This process should be simple; all project data should be self-contained, without needing to go through various directories or databases to determine dependencies. + +## Building a pipeline + +The best way to build a pipeline is to start from an example. The [CGAT Showcase](https://cgat-showcase.readthedocs.io/en/latest/index.html) contains a toy example of an RNA-seq analysis pipeline, demonstrating how simple workflows can be generated with minimal code. For more complex workflows, you can refer to [CGAT-Flow](https://github.com/cgat-developers/cgat-flow). + +For a step-by-step tutorial on running pipelines, refer to our [Getting Started Tutorial](#). + +To construct a pipeline from scratch, continue reading below. + +In an empty directory, create a new directory and then a Python file with the same name. For example: + +```bash +mkdir test && touch pipeline_test.py +``` + +All pipelines require a `.yml` configuration file that allows you to modify the behaviour of your code. 
This file is placed in the `test/` directory and should have the same name as the pipeline Python file: + +```bash +touch test/pipeline.yml +``` + +To facilitate debugging and reading, our pipelines are designed so that the pipeline task file contains Ruffus tasks, while the code to transform and analyse data is in an associated module file. + +If you wish to create a module file, it is conventionally named using the format `ModuleTest.py`. You can import it into the main pipeline task file (`pipeline_test.py`) as follows: + +```python +import ModuleTest +``` + +The [pipeline module](https://github.com/cgat-developers/cgat-core/tree/master/cgatcore/pipeline) in CGAT-core provides many useful functions for pipeline construction. + +## Pipeline input + +Pipelines are executed within a dedicated working directory, which usually contains: + +- A pipeline configuration file: `pipeline.yml` +- Input data files, typically specified in the pipeline documentation + +Other files that might be used include external data files, such as genomes, referred to by their full path. + +Pipelines work with input files in the working directory, usually identified by their suffix. For instance, a mapping pipeline might look for any `.fastq.gz` files in the directory, perform QC on them, and map the reads to a genome sequence. + +## Pipeline output + +The pipeline will generate files and database tables in the working directory. You can structure your files/directories in any way that fits your needs—some prefer a flat structure with many files, while others use deeper hierarchies. + +To save disk space, compressed files should be used wherever possible. Most data files compress well; for example, `fastq` files often compress by up to 80%. Working with compressed files is straightforward using Unix pipes (`gzip`, `gunzip`, `zcat`). + +If you need random access to a file, load it into a database and index it appropriately. 
Genomic interval files can be indexed with `tabix` to allow random access. + +## Import statements + +To run our pipelines, you need to import the CGAT-core Python modules into your pipeline. We recommend importing the following modules for every CGAT pipeline: + +```python +from ruffus import * +import cgatcore.experiment as E +from cgatcore import pipeline as P +import cgatcore.iotools as iotools +``` + +Additional modules can be imported as needed. + +## Selecting the appropriate Ruffus decorator + +Before starting a pipeline, it is helpful to map out the steps and flow of your potential pipeline on a whiteboard. This helps identify the inputs and outputs of each task. Once you have a clear picture, determine which Ruffus decorator to use for each task. For more information on each decorator, refer to the [Ruffus documentation](http://www.ruffus.org.uk/decorators/decorators.html). + +## Running commands within tasks + +To run a command line program within a pipeline task, build a statement and call the `P.run()` method: + +```python +@transform('*.unsorted', suffix('.unsorted'), '.sorted') +def sortFile(infile, outfile): + statement = '''sort %(infile)s > %(outfile)s''' + P.run(statement) +``` + +In the `P.run()` method, the environment of the caller is examined for a variable called `statement`, which is then subjected to string substitution from other variables in the local namespace. In the example above, `%(infile)s` and `%(outfile)s` are replaced with the values of `infile` and `outfile`, respectively. + +The same mechanism also allows configuration parameters to be set, as shown here: + +```python +@transform('*.unsorted', suffix('.unsorted'), '.sorted') +def sortFile(infile, outfile): + statement = '''sort -t %(tmpdir)s %(infile)s > %(outfile)s''' + P.run(statement) +``` + +In this case, the configuration parameter `tmpdir` is substituted into the command. 
+ +### Chaining commands with error checking + +If you need to chain multiple commands, you can use `&&` to ensure that errors in upstream commands are detected: + +```python +@transform('*.unsorted.gz', suffix('.unsorted.gz'), '.sorted') +def sortFile(infile, outfile): + statement = '''gunzip %(infile)s %(infile)s.tmp && + sort -t %(tmpdir)s %(infile)s.tmp > %(outfile)s && + rm -f %(infile)s.tmp''' + P.run(statement) +``` + +Alternatively, you can achieve this more efficiently using pipes: + +```python +@transform('*.unsorted.gz', suffix('.unsorted.gz'), '.sorted.gz') +def sortFile(infile, outfile): + statement = '''gunzip < %(infile)s | sort -t %(tmpdir)s | gzip > %(outfile)s''' + P.run(statement) +``` + +The pipeline automatically inserts code to check for error return codes when multiple commands are combined in a pipe. + +## Running commands on a cluster + +To run commands on a cluster, set `to_cluster=True`: + +```python +@files('*.unsorted.gz', suffix('.unsorted.gz'), '.sorted.gz') +def sortFile(infile, outfile): + to_cluster = True + statement = '''gunzip < %(infile)s | sort -t %(tmpdir)s | gzip > %(outfile)s''' + P.run(statement) +``` + +Pipelines will use command line options such as `--cluster-queue` and `--cluster-priority` for global job control. For instance, to change the priority when starting the pipeline: + +```bash +python --cluster-priority=-20 +``` + +To set job-specific options, you can define additional variables: + +```python +@files('*.unsorted.gz', suffix('.unsorted.gz'), '.sorted.gz') +def sortFile(infile, outfile): + to_cluster = True + job_queue = 'longjobs.q' + job_priority = -10 + job_options = "-pe dedicated 4 -R y" + statement = '''gunzip < %(infile)s | sort -t %(tmpdir)s | gzip > %(outfile)s''' + P.run(statement) +``` + +The statement above will run in the queue `longjobs.q` with a priority of `-10`. It will also be executed in the parallel environment `dedicated`, using at least four cores. 
+ +## Combining commands + +To combine commands, use `&&` to ensure they execute in the intended order: + +```python +statement = """ +module load cutadapt && +cutadapt ... +""" + +P.run(statement) +``` + +Without `&&`, the command would fail because the `cutadapt` command would execute as part of the `module load` statement. + +## Useful information regarding decorators + +For a full list of Ruffus decorators that control pipeline flow, see the [Ruffus documentation](http://www.ruffus.org.uk/decorators/decorators.html). + +Here are some examples of modifying an input file name to transform it into the output filename: + +### Using Suffix + +```python +@transform(pairs, suffix('.fastq.gz'), ('_trimmed.fastq.gz', '_trimmed.fastq.gz')) +``` + +This will transform an input `.fastq.gz` into an output `_trimmed.fastq.gz`. + +### Using Regex + +```python +@follows(mkdir("new_folder.dir")) +@transform(pairs, regex('(\S+).fastq.gz'), ('new_folder.dir/\1_trimmed.fastq.gz', 'new_folder.dir/\1_trimmed.fastq.gz')) +``` + +### Using Formatter + +```python +@follows(mkdir("new_folder.dir")) +@transform(pairs, formatter('(\S+).fastq.gz'), ('new_folder.dir/{SAMPLE[0]}_trimmed.fastq.gz', 'new_folder.dir/{SAMPLE[0]}_trimmed.fastq.gz')) +``` + +This documentation aims to provide a comprehensive guide to writing your own workflows and pipelines. For more advanced usage, please refer to the original CGAT-core and Ruffus documentation. + diff --git a/docs/function_doc/Core.rst b/docs/function_doc/Core.rst deleted file mode 100644 index 1c5c25fb..00000000 --- a/docs/function_doc/Core.rst +++ /dev/null @@ -1,15 +0,0 @@ - -=================== -Core helper modules -=================== - -Add links to the other core documentation - -.. 
toctree:: - - Core/Experiment.rst - Core/CSV.rst - Core/csv2db.rst - Core/Database.rst - Core/IOTools.rst - Core/Logfile.rst diff --git a/docs/function_doc/Core/CSV.rst b/docs/function_doc/Core/CSV.rst deleted file mode 100644 index b01f7bbb..00000000 --- a/docs/function_doc/Core/CSV.rst +++ /dev/null @@ -1,5 +0,0 @@ - -.. automodule:: cgatcore.csv - :members: - :show-inheritance: - diff --git a/docs/function_doc/Core/Database.rst b/docs/function_doc/Core/Database.rst deleted file mode 100644 index 6a500030..00000000 --- a/docs/function_doc/Core/Database.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. automodule:: cgatcore.database - :members: - :show-inheritance: \ No newline at end of file diff --git a/docs/function_doc/Core/Experiment.rst b/docs/function_doc/Core/Experiment.rst deleted file mode 100644 index 7ee2d027..00000000 --- a/docs/function_doc/Core/Experiment.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. automodule:: cgatcore.experiment - :members: - :show-inheritance: \ No newline at end of file diff --git a/docs/function_doc/Core/IOTools.rst b/docs/function_doc/Core/IOTools.rst deleted file mode 100644 index 6b45809d..00000000 --- a/docs/function_doc/Core/IOTools.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. automodule:: cgatcore.iotools - :members: - :show-inheritance: \ No newline at end of file diff --git a/docs/function_doc/Core/Logfile.rst b/docs/function_doc/Core/Logfile.rst deleted file mode 100644 index 5c3f1c9c..00000000 --- a/docs/function_doc/Core/Logfile.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. automodule:: cgatcore.Logfile - :members: - :show-inheritance: \ No newline at end of file diff --git a/docs/function_doc/Core/csv2db.rst b/docs/function_doc/Core/csv2db.rst deleted file mode 100644 index 80d547a7..00000000 --- a/docs/function_doc/Core/csv2db.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. 
automodule:: cgatcore.csv2db - :members: - :show-inheritance: diff --git a/docs/function_doc/Overview.rst b/docs/function_doc/Overview.rst deleted file mode 100644 index 98d514f2..00000000 --- a/docs/function_doc/Overview.rst +++ /dev/null @@ -1,6 +0,0 @@ -.. function_doc-Overview - - -========================= -Overview of the functions -========================= diff --git a/docs/function_doc/Pipeline.rst b/docs/function_doc/Pipeline.rst deleted file mode 100644 index 87c00eff..00000000 --- a/docs/function_doc/Pipeline.rst +++ /dev/null @@ -1,8 +0,0 @@ - -================ -pipeline modules -================ - -.. automodule:: cgatcore.pipeline - :members: - :show-inheritance: diff --git a/docs/function_doc/Pipeline/Control.rst b/docs/function_doc/Pipeline/Control.rst deleted file mode 100644 index 75ebe0ed..00000000 --- a/docs/function_doc/Pipeline/Control.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. automodule:: cgatcore.pipeline.control - :members: - :show-inheritance: diff --git a/docs/function_doc/Pipeline/Database.rst b/docs/function_doc/Pipeline/Database.rst deleted file mode 100644 index 40c20e2e..00000000 --- a/docs/function_doc/Pipeline/Database.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. automodule:: cgatcore.pipeline.database - :members: - :show-inheritance: diff --git a/docs/function_doc/Pipeline/Execution.rst b/docs/function_doc/Pipeline/Execution.rst deleted file mode 100644 index 8cb31a75..00000000 --- a/docs/function_doc/Pipeline/Execution.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. automodule:: cgatcore.pipeline.execution - :members: - :show-inheritance: diff --git a/docs/function_doc/Pipeline/Files.rst b/docs/function_doc/Pipeline/Files.rst deleted file mode 100644 index 920d95b6..00000000 --- a/docs/function_doc/Pipeline/Files.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. 
automodule:: cgatcore.pipeline.files - :members: - :show-inheritance: diff --git a/docs/function_doc/Pipeline/Utils.rst b/docs/function_doc/Pipeline/Utils.rst deleted file mode 100644 index 942da4e4..00000000 --- a/docs/function_doc/Pipeline/Utils.rst +++ /dev/null @@ -1,4 +0,0 @@ - -.. automodule:: cgatcore.pipeline.utils - :members: - :show-inheritance: diff --git a/docs/function_doc/csv2db.md b/docs/function_doc/csv2db.md new file mode 100644 index 00000000..570382be --- /dev/null +++ b/docs/function_doc/csv2db.md @@ -0,0 +1,5 @@ +# CGATcore CSV2DB Module + +::: cgatcore.csv2db + :members: + :show-inheritance: diff --git a/docs/function_doc/database.md b/docs/function_doc/database.md new file mode 100644 index 00000000..cb3239ed --- /dev/null +++ b/docs/function_doc/database.md @@ -0,0 +1,5 @@ +# CGATcore Database Module + +::: cgatcore.database + :members: + :show-inheritance: diff --git a/docs/function_doc/experiment.md b/docs/function_doc/experiment.md new file mode 100644 index 00000000..4b4c6bb7 --- /dev/null +++ b/docs/function_doc/experiment.md @@ -0,0 +1,5 @@ +# CGATcore Experiment Module + +::: cgatcore.experiment + :members: + :show-inheritance: diff --git a/docs/function_doc/iotools.md b/docs/function_doc/iotools.md new file mode 100644 index 00000000..9dd52fea --- /dev/null +++ b/docs/function_doc/iotools.md @@ -0,0 +1,5 @@ +# CGATcore IOTools Module + +::: cgatcore.iotools + :members: + :show-inheritance: diff --git a/docs/function_doc/logfile.md b/docs/function_doc/logfile.md new file mode 100644 index 00000000..5da51b19 --- /dev/null +++ b/docs/function_doc/logfile.md @@ -0,0 +1,5 @@ +# CGATcore Logfile Module + +::: cgatcore.logfile + :members: + :show-inheritance: diff --git a/docs/function_doc/pipeline.md b/docs/function_doc/pipeline.md new file mode 100644 index 00000000..d9a4af32 --- /dev/null +++ b/docs/function_doc/pipeline.md @@ -0,0 +1,5 @@ +# CGATcore Pipeline Module + +::: cgatcore.pipeline + :members: + :show-inheritance: diff --git 
a/docs/getting_started/Cluster_config.rst b/docs/getting_started/Cluster_config.rst deleted file mode 100644 index d7c10811..00000000 --- a/docs/getting_started/Cluster_config.rst +++ /dev/null @@ -1,36 +0,0 @@ -.. _getting_started-Config: - - -===================== -Cluster configuration -===================== - -Currently SGE, SLURM, Torque and PBSPro workload managers are supported. The default cluster options for -cgatcore are set for SunGrid Engine (SGE). Therefore, if you would like to run an alternative workload manager -then you will need to configure your settings for your cluster. In order to do this you will need to -create a :file:`.cgat.yml` within the user`s home directory. - -This will allow you to overide the default configurations. To view the hardcoded parameters for cgatcore -please see the `parameters.py `_ -file. - -For an example of how to configure a PBSpro workload manager see this link to this `config example `_. - -The .cgat.yml is placed in your home directory and when a pipeline is executed it will automatically prioritise the -:file:`.cgat.yml` parameters over the cgatcore hard coded parameters. For example, adding the following to the -.cgat.yml file will implement cluster settings for PBSpro:: - - - memory_resource: mem - - options: -l walltime=00:10:00 -l select=1:ncpus=8:mem=1gb - - queue_manager: pbspro - - queue: NONE - - parallel_environment: "dedicated" - - - - diff --git a/docs/getting_started/Examples.rst b/docs/getting_started/Examples.rst deleted file mode 100644 index bf38d5f0..00000000 --- a/docs/getting_started/Examples.rst +++ /dev/null @@ -1,324 +0,0 @@ -.. _getting_started-Examples: - - -================== -Running a pipeline -================== - - -This section provides a tutorial-like introduction of how to run CGAT pipelines. As an example of how we build simple -computational pipelines please refer to `cgat-showcase `_. 
As an example of how we use cgatcore to -build more complex computational pipelines, please refer to the code detailed in our `cgat-flow repository `_. - -.. _getting_started-Intro: - -Introduction -============= - -A pipeline takes input data and performs a series of automated steps on it to produce some output data. - -Each pipeline is usually coupled with a report (usually MultiQC or Rmarkdown) document to -summarize and visualize the results. - -It really helps if you are familiar with: - - * the unix command line to run and debug the pipeline - * python_ in order to understand what happens in the pipeline - * ruffus_ in order to understand the pipeline code - * sge_ (or any other workflow manager) in order to monitor your jobs - * git_ in order to up-to-date code - -.. _getting_started-setting-up-pipeline: - -Setting up a pipeline -====================== - -**Step 1**: Install cgat-showcase (our toy example of a cgatcore pipeline): - -Check that your computing environment is appropriate and follow cgat-showcase installation instructions (see `Installation instructions `_). - -**Step2**: Clone the repository - -To inspect the code and the layout clone the repository:: - - git clone https://github.com/cgat-developers/cgat-showcase.git - -When inspecting the respoitory: -The source directory will contain the pipeline master script named -:file:`cgatshowcase/pipeline_.py` - -The default configuration files will be contained in the folder -:file:`cgatshowcase/pipeline/` - -All our pipelines are written to be lightweight. Therefore, a module file -assoaiated with the pipeline master script, typically named -:file:`cgatshowcase/Module.py`, is usually where code required to run the tasks -of the pipleine is located. - -**Step 3**: To run a pipeline you will need to create a working directory -and enter it. For example:: - - mkdir version1 - cd version1/ - -This is where the pipeline will be executed and files will be generated in this -directory. 
- -However, the cgat-showcase example comes with test data and this can be downloaded by running:: - - wget https://www.cgat.org/downloads/public/showcase/showcase_test_data.tar.gz - tar -zxvf showcase_test_data.tar.gz - cd showcase_test_data - -**Step 4**: Configure the cluster - -Running pipelines on a cluster required the drmaa API settings to be configures and passed -to cgatcore. The default cluster engine is SGE, however we also support SLURM and Torque/PBSpro. -In order to execute using a non SGE cluster you will need to setup a `.cgat.yml` file in your -home directory and specify parameters according to the `cluster configuration documentation `_. - -**Step 5**: Our pipelines are written with minimal hard coded options. Therefore, -to run a pipeline an initial configuration file needs to be -generated. A configuration file with all the default values can be obtained by -running:: - - cgatshowcase config - -For example, if you wanted to run the transdiffexprs pipeline you would run:: - - cgatshowcase transdiffexprs config - - -This will create a new :file:`pipeline.yml` file. **YOU MUST EDIT THIS -FILE**. The default values are unlikely to be configured correctly for your data. The -configuration file should be well documented and the format is -simple. The documenation for the `ConfigParser -`_ python module -contains the full specification. - -**Step 6**: Add the input files. The required input is specific for each -pipeline in the documentation string at the; read the pipeline documentation to find out exactly which -files are needed and where they should be put. Commonly, a pipeline -works from input files linked into the working directory and -named following pipeline specific conventions. - -**Step 7**: You can check if all the external dependencies to tools and -R packages are satisfied by running:: - - cgatshowcase check - -.. 
_getting_started-pipelineRunning: - -Running a pipeline -=================== - -pipelines are controlled by a single python script called -:file:`pipeline_.py` that lives in the source directory. Command line usage information is available by running:: - - cgatshowcase --help - -Alternatively, you can call the python script directly:: - - python /path/to/code/cgatshowcase/pipeline_.py --help - -The basic syntax for ``pipeline_.py`` is:: - - cgatshowcase [workflow options] [workflow arguments] - -For example, to run the readqc pipeline you would run the following:: - - cgatshowcase readqc make full - -``workflow options`` can be one of the following: - -make - - run all tasks required to build task - -show - - show tasks required to build task without executing them - -plot - - plot image of workflow (requires `inkscape `_) of - pipeline state for task - -touch - - touch files without running task or its pre-requisites. This sets the - timestamps for files in task and its pre-requisites such that they will - seem up-to-date to the pipeline. - -config - - write a new configuration file :file:`pipeline.ini` with - default values. An existing configuration file will not be - overwritten. - -clone - - clone a pipeline from :file:`srcdir` into the current - directory. Cloning attempts to conserve disk space by linking. - -In case you are running a long pipeline, make sure you start it -appropriately, for example:: - - nice -19 nohup cgatshowcase make full -v5 -c1 - -This will keep the pipeline running if you close the terminal. - -Fastq naming convention ------------------------ - -Most of our pipelines assume that input fastq files follows the following -naming convention (with the read inserted between the fastq and the gz. The reason -for this is so that regular expressions do not have to acount for the read within the name. 
-It is also more explicit:: - - sample1-condition.fastq.1.gz - sample1-condition.fastq.2.gz - - -Additional pipeline options ---------------------------- - -In addition to running the pipeline with default command line options, running a -pipeline with --help will allow you to see additional options for ``workflow arguments`` -when running the pipelines. These will modify the way the pipeline in ran. - -`- -no-cluster` - - This option allows the pipeline to run locally. - -`- -input-validation` - - This option will check the pipeline.ini file for missing values before the - pipeline starts. - -`- -debug` - - Add debugging information to the console and not the logfile - -`- -dry-run` - - Perform a dry run of the pipeline (do not execute shell commands) - -`- -exceptions` - - Echo exceptions immidietly as they occur. - -`-c - -checksums` - - Set the level of ruffus checksums. - -.. _getting_started-Building-reports: - -Building pipeline reports -================================ - -We always associate some for of reporting with our pipelines to display summary information as a set of nicely formatted -html pages. - -Currently in CGAT we have 3 preferred types of report generation. - - * MultiQC report (for general alignment and tool reporting) - * R markdown (for bespoke reporting) - * Jupyter notebook (for bespoke reporting) - -To determine which type of reporting is implimented for each pipeline, refer to -the specific pipeline documentation at the beginning of the script. - -Reports are generated using the following command once a workflow has completed:: - - cgatshowcase make build_report - -MultiQC report --------------- - -MultiQC is a python framework for automating reporting and we have imliemnted it in the -majority of our workflows to generate QC stats for frequently used tools (mostly in our -generic workflows). - - -R markdown ----------- -R markdown report generation is very useful for generating bespoke reports that require user -defined reporting. 
We have implimented this in our bamstats workflow. - -Jupyter notebook ----------------- -Jupyter notebook is a second approach that we use to produce bespoke reports. An example is -also implimented in our bamstats workflow. - -.. _getting_started-Troubleshooting: - -Troubleshooting -=============== - -Many things can go wrong while running the pipeline. Look out for - - * bad input format. The pipeline does not perform sanity checks on the input format. If the input is bad, you might see wrong or missing results or an error message. - * pipeline disruptions. Problems with the cluster, the file system or the controlling terminal might all cause the pipeline to abort. - * bugs. The pipeline makes many implicit assumptions about the input files and the programs it runs. If program versions change or inputs change, the pipeline might not be able to deal with it. The result will be wrong or missing results or an error message. - -If the pipeline aborts, locate the step that caused the error by -reading the logfiles and the error messages on stderr -(:file:`nohup.out`). See if you can understand the error and guess the -likely problem (new program versions, badly formatted input, ...). If -you are able to fix the error, remove the output files of the step in -which the error occured and restart the pipeline. Processing should -resume at the appropriate point. - -.. note:: - - Look out for upstream errors. For example, the pipeline might build - a geneset filtering by a certain set of contigs. If the contig - names do not match, the geneset will be empty, but the geneset - building step might conclude successfully. However, you might get - an error in any of the downstream steps complaining that the gene - set is empty. To fix this, fix the error and delete the files - created by the geneset building step and not just the step that - threw the error. 
- -Common pipeline errors ----------------------- - -One of the most common errors when runnig the pipeline is:: - - GLOBAL_SESSION = drmaa.Session() - NameError: name 'drmaa' is not defined - -This error occurrs because you are not connected to the cluster. Alternatively -you can run the pipleine in local mode by adding `- -no-cluster` as a command line option. - -Updating to the latest code version ------------------------------------ - -To get the latest bugfixes, go into the source directory and type:: - - git pull - -The first command retrieves the latest changes from the master -repository and the second command updates your local version with -these changes. - - -Using qsub commands -------------------- - -We would always recommend using cgat-core to perform the job submission as this -is handled in the background without the need to use qsub commands. However, -if users wish to use qsub then it is perfectly simple to do so. Since our -statements to P.run() are essentially commandline scripts then you can write -the qsub as you would normally do when sending a script to the commandline. For -example:: - - statement = "qsub [commands] echo 'This is where you would put commands you wan ran' " - - P.run(statament) - -When running the pipeline make sure you specify `--no-cluster` as a commandlie option and your -good to go. - -.. _pipelineReporting: diff --git a/docs/getting_started/Installation.rst b/docs/getting_started/Installation.rst deleted file mode 100644 index c3f3271e..00000000 --- a/docs/getting_started/Installation.rst +++ /dev/null @@ -1,112 +0,0 @@ -.. _getting_started-Installation: - - -============ -Installation -============ - -The following sections describe how to install the cgatcore framework. - -.. _getting_started-Conda: - -Conda Installation ------------------- - -The our preffered method of installation is using conda. If you dont have conda installed then -please install conda using `miniconda `_ or `anaconda `_. 
- -cgatcore is currently installed using the bioconda channel and the recipe can be found on `github `_. To install cgatcore:: - - conda install -c conda-forge -c bioconda cgatcore - -.. _getting_started-Automated: - - -Pip installation ----------------- -We recommend installation through conda because it manages the dependancies. However, cgatcore is -generally lightweight and can be installed easily using pip package manager. However, you may also have to -install other dependancies manually:: - - pip install cgatcore - -.. _getting_started-pip: - -Automated installation ----------------------- - -The following sections describe how to install the cgatcore framework. - -The preferred method to install the cgatcore is using conda but we have also created a bash installation script, -which uses `conda `_ under the hood. - -Here are the steps:: - - # download installation script: - curl -O https://raw.githubusercontent.com/cgat-developers/cgat-core/master/install.sh - - # see help: - bash install.sh - - # install the development version (recommended, no production version yet): - bash install.sh --devel [--location ] - - # the code is downloaded in zip format by default. If you want to get a git clone, use: - --git # for an HTTPS clone - --git-ssh # for a SSH clone (you need to be a cgat-developer contributor on GitHub to do this) - - # enable the conda environment as requested by the installation script - # NB: you probably want to automate this by adding the instructions below to your .bashrc - source /conda-install/etc/profile.d/conda.sh - conda activate base - conda activate cgat-c - -The installation script will put everything under the specified location. -The aim of the script is to provide a portable installation that does not interfere with the existing -software. As a result, you will have a conda environment working with the cgat-core which can be enabled -on demand according to your needs. - -.. 
_getting_started-Manual: - -Manual installation -------------------- - -To obtain the latest code, check it out from the public git_ repository and activate it:: - - git clone https://github.com/cgat-developers/cgat-core.git - cd cgat-core - python setup.py develop - -Once checked-out, you can get the latest changes via pulling:: - - git pull - - -.. _getting_started-Additional: - -Installing additonal software ------------------------------ - -When building your own workflows we recomend using conda to install software into your environment where possible. - -This can easily be performed by:: - - conda search - conda install - -Access libdrmaa shared library ------------------------------- - -You may also need access to the libdrmaa.so.1.0 C library, which can often be installed as part of the -libdrmaa-dev package on most Unixes. Once you have installed that, you may need to tell DRMAA Python -where it is installed by setting the DRMAA_LIBRARY_PATH environment variable, if it is not installed -in a location that Python usually looks for libraries. - -In order to set this correctly every time please add the following line to your bashrc, but set the library -path to the location of the libdrmaa.so.1.0:: - - export DRMAA_LIBRARY_PATH=/usr/lib/libdrmaa.so.1.0 - - - -.. _conda: https://conda.io diff --git a/docs/getting_started/Tutorial.rst b/docs/getting_started/Tutorial.rst deleted file mode 100644 index 13edd592..00000000 --- a/docs/getting_started/Tutorial.rst +++ /dev/null @@ -1,72 +0,0 @@ -.. _getting_started-Tutorial: - - -============================= -Running a pipeline - Tutorial -============================= - - -Before beginning this tutorial make sure you have the CGAT-core installed correctly, -please see here (see :ref:`getting_started-Installation`) for installation instructions. - -As a tutorial example of how to run a CGAT workflow we will run the cgat-showcase pipeline. 
Therefore, -you will also need to install the cgat-showcase (see `instructions `_) - -The aim of this pipeline is to perform pseaudoalignment using kallisto. The pipeline can be ran locally or -dirtributed accross a cluster. This tutorial will explain the steps required to run this pipeline. Further documentation -on cgat-showcase can be found `here `_. - -The cgat-showcase highlights some of the functionality of cgat-core. However, we also have our utility -pipelines contained in the cgat-flow repository which demonstrate our advanced pipelines for next-generation -sequencing analysis (see `cgat-flow `_). - -Tutorial start --------------- - - -**1.** First download the tutorial data:: - - mkdir showcase - cd showcase - wget https://www.cgat.org/downloads/public/showcase/showcase_test_data.tar.gz - tar -zxvf showcase_test_data.tar.gz - -**2.** Next we will generate a configuration yml file so the pipeline output can be modified:: - - cd showcase_test_data - cgatshowcase transdiffexpres config - -or you can alternatively call the workflow file directly:: - - python /path/to/file/pipeline_transdiffexpres.py config - -This will generate a **pipeline.yml** file containing the configuration parameters than can be used to modify -the output of the pipleine. However, for this tutorial you do not need to modify the parameters to run the -pipeline. In the :ref:`modify_config` section below I have detailed how you can modify the config file to -change the output of the pipeline. - -**3.** Next we will run the pipleine:: - - cgatshowcase transdiffexpres make full -v5 --no-cluster - -This ``--no-cluster`` will run the pipeline locally if you do not have access to a cluster. Alternatively if you have a -cluster remove the ``--no-cluster`` option and the pipleine will distribute your jobs accross the cluster. - -.. note:: - - There are many commandline options available to run the pipeline. To see available options please run :code:`cgatshowcase --help`. 
- -**4.** Generate a report - -The final step is to generate a report to display the output of the pipeline. We have a preference for using MultiQC -for generate bioinformatics tools (such as mappers and pseudoaligners) and Rmarkdown for generating custom reports. -In order to generate these run the command:: - - cgatshowcase transdiffexprs make build_report -v 5 --no-cluster - -This will generate a MultiQC report in the folder `MultiQC_report.dir/` and an Rmarkdown report in `R_report.dir/`. - - - -This completes the tutorial for running the transdiffexprs pipeline for cgat-showcase, hope you find it as useful as -we do for writing workflows within python. diff --git a/docs/getting_started/examples.md b/docs/getting_started/examples.md new file mode 100644 index 00000000..660e2a0f --- /dev/null +++ b/docs/getting_started/examples.md @@ -0,0 +1,224 @@ +# Running a pipeline + +This section provides a tutorial-like introduction to running CGAT pipelines. For an example of how to build simple computational pipelines, refer to [cgat-showcase](https://github.com/cgat-developers/cgat-showcase). To see how `cgatcore` is used to build more complex computational pipelines, refer to the [cgat-flow repository](https://github.com/cgat-developers/cgat-flow). + +## Introduction + +A pipeline takes input data and performs a series of automated steps to produce output data. Each pipeline is usually coupled with a report (such as MultiQC or Rmarkdown) to summarise and visualise the results. + +It helps if you are familiar with: + +- The Unix command line to run and debug the pipeline +- [Python](https://www.python.org/) to understand what happens in the pipeline +- [Ruffus](http://www.ruffus.org.uk/) to understand the pipeline code +- SGE (or any other workload manager) to monitor jobs +- [Git](https://git-scm.com/) to keep code up-to-date + +## Setting up a pipeline + +**Step 1**: Install `cgat-showcase` (a toy example of a `cgatcore` pipeline). 
+ +Ensure your computing environment is appropriate and follow `cgat-showcase` installation instructions (see [Installation instructions](https://cgat-showcase.readthedocs.io/en/latest/getting_started/Installation.html)). + +**Step 2**: Clone the repository + +To inspect the code and layout, clone the repository: + +```bash +git clone https://github.com/cgat-developers/cgat-showcase.git +``` + +When inspecting the repository: + +- The source directory will contain the pipeline master script named `cgatshowcase/pipeline_.py`. +- The default configuration files are in `cgatshowcase/pipeline/`. +- Associated module files are typically named `cgatshowcase/Module.py` and contain code required to run pipeline tasks. + +**Step 3**: Create a working directory + +To run a pipeline, create a working directory and navigate to it: + +```bash +mkdir version1 +cd version1/ +``` + +The pipeline will execute and generate files in this directory. + +To use test data for `cgat-showcase`, download it with: + +```bash +wget https://www.cgat.org/downloads/public/showcase/showcase_test_data.tar.gz +tar -zxvf showcase_test_data.tar.gz +cd showcase_test_data +``` + +**Step 4**: Configure the cluster + +Running pipelines on a cluster requires configuring DRMAA API settings for `cgatcore`. The default cluster engine is SGE, but SLURM and Torque/PBSpro are also supported. To use a non-SGE cluster, create a `.cgat.yml` file in your home directory and configure the parameters (see the [cluster configuration documentation](https://cgat-core.readthedocs.io/en/latest/getting_started/Cluster_config.html)). + +**Step 5**: Generate a configuration file + +Our pipelines are written with minimal hard-coded options. To run a pipeline, generate an initial configuration file: + +```bash +cgatshowcase config +``` + +For example, to run the `transdiffexprs` pipeline, run: + +```bash +cgatshowcase transdiffexprs config +``` + +This will create a new `pipeline.yml` file. 
**You must edit this file** as the default values are unlikely to suit your data. The configuration file format is simple and well-documented. For more information, see the [ConfigParser documentation](http://docs.python.org/library/configparser.html). + +**Step 6**: Add input files + +The required input is specific to each pipeline. Check the pipeline documentation to determine which files are needed and where to place them. Typically, input files are linked into the working directory and follow pipeline-specific naming conventions. + +**Step 7**: Check dependencies + +Check if all external dependencies, such as tools and R packages, are satisfied by running: + +```bash +cgatshowcase check +``` + +## Running a pipeline + +Pipelines are controlled by a Python script named `pipeline_.py` in the source directory. Command line usage information is available by running: + +```bash +cgatshowcase --help +``` + +Alternatively, call the Python script directly: + +```bash +python /path/to/code/cgatshowcase/pipeline_.py --help +``` + +The basic syntax for `pipeline_.py` is: + +```bash +cgatshowcase [workflow options] [workflow arguments] +``` + +For example, to run the `readqc` pipeline: + +```bash +cgatshowcase readqc make full +``` + +### Workflow options + +- **make ``**: Run all tasks required to build ``. +- **show ``**: Show tasks required to build `` without executing them. +- **plot ``**: Plot an image of workflow (requires [Inkscape](http://inkscape.org/)) of pipeline state for ``. +- **touch ``**: Touch files without running `` or prerequisites, setting timestamps so files appear up-to-date. +- **config**: Write a new configuration file (`pipeline.ini`) with default values (won't overwrite existing files). +- **clone ``**: Clone a pipeline from `` into the current directory. + +To run a long pipeline appropriately: + +```bash +nice -19 nohup cgatshowcase make full -v5 -c1 +``` + +This command will keep the pipeline running if you close the terminal. 
+ +### Fastq naming convention + +Most of our pipelines assume input FASTQ files follow this naming convention: + +``` +sample1-condition.fastq.1.gz +sample1-condition.fastq.2.gz +``` + +This convention ensures regular expressions do not need to account for the read within the name, making it more explicit. + +### Additional pipeline options + +Running the pipeline with `--help` will show additional workflow arguments that modify the pipeline's behaviour: + +- **--no-cluster**: Run the pipeline locally. +- **--input-validation**: Check `pipeline.ini` file for missing values before starting. +- **--debug**: Add debugging information to the console (not the logfile). +- **--dry-run**: Perform a dry run (do not execute shell commands). +- **--exceptions**: Echo exceptions immediately as they occur. +- **-c --checksums**: Set the level of Ruffus checksums. + +## Building pipeline reports + +We associate some form of reporting with our pipelines to display summary information as nicely formatted HTML pages. Currently, CGAT supports three types of reports: + +- **MultiQC**: For general alignment and tool reporting. +- **R Markdown**: For bespoke reporting. +- **Jupyter Notebook**: For bespoke reporting. + +Refer to the specific pipeline documentation at the beginning of the script to determine which type of reporting is implemented. + +Reports are generated using the following command once a workflow has completed: + +```bash +cgatshowcase make build_report +``` + +### MultiQC report + +[MultiQC](https://multiqc.info/) is a Python framework for automating reporting, implemented in most workflows to generate QC stats for commonly used tools. + +### R Markdown + +R Markdown report generation is useful for creating custom reports. This is implemented in the `bamstats` workflow. + +### Jupyter Notebook + +Jupyter Notebook is another approach we use for bespoke reports, also implemented in the `bamstats` workflow. 
+ +## Troubleshooting + +Many things can go wrong while running the pipeline: + +- **Bad input format**: The pipeline does not perform sanity checks on input formats, which may lead to missing or incorrect results. +- **Pipeline disruptions**: Issues with the cluster, file system, or terminal may cause the pipeline to abort. +- **Bugs**: Changes in program versions or inputs can cause unexpected issues. + +If the pipeline aborts, read the log files and error messages (e.g., `nohup.out`) to locate the error. Attempt to fix the error, remove the output files from the step in which the error occurred, and restart the pipeline. + +**Note**: Look out for upstream errors. For example, if a geneset filtering by specific contigs does not match, the geneset may be empty, causing errors in downstream steps. To resolve this, fix the initial error and delete the files from the geneset-building step, not just the step that threw the error. + +### Common pipeline errors + +One common error is: + +```text +GLOBAL_SESSION = drmaa.Session() +NameError: name 'drmaa' is not defined +``` + +This occurs because you are not connected to the cluster. Alternatively, run the pipeline in local mode by adding `--no-cluster` as a command line option. + +### Updating to the latest code version + +To get the latest bug fixes, navigate to the source directory and run: + +```bash +git pull +``` + +This command retrieves the latest changes from the master repository and updates your local version. + +## Using qsub commands + +We recommend using `cgat-core` to perform job submissions, as this is handled automatically. However, if you wish to use `qsub` manually, you can do so. Since the statements passed to `P.run()` are essentially command-line scripts, you can write the `qsub` commands as needed. 
For example: + +```python +statement = "qsub [commands] echo 'This is where you would put commands you want ran' " +P.run(statement) +``` + +When running the pipeline, make sure to specify `--no-cluster` as a command line option. + diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md new file mode 100644 index 00000000..bd02adfd --- /dev/null +++ b/docs/getting_started/installation.md @@ -0,0 +1,90 @@ +# Installation + +The following sections describe how to install the `cgatcore` framework. + +## Conda installation + +The preferred method of installation is using Conda. If you do not have Conda installed, you can install it using [Miniconda](https://conda.io/miniconda.html) or [Anaconda](https://www.anaconda.com/download/#macos). + +`cgatcore` is installed via the Bioconda channel, and the recipe can be found on [GitHub](https://github.com/bioconda/bioconda-recipes/tree/b1a943da5a73b4c3fad93fdf281915b397401908/recipes/cgat-core). To install `cgatcore`, run the following command: + +```bash +conda install -c conda-forge -c bioconda cgatcore +``` + +## Pip installation + +We recommend installation through Conda because it manages dependencies automatically. However, `cgatcore` is generally lightweight and can also be installed using the `pip` package manager. Note that you may need to manually install other dependencies as needed: + +```bash +pip install cgatcore +``` + +## Automated installation + +The preferred method to install `cgatcore` is using Conda. However, we have also created a Bash installation script, which uses [Conda](https://conda.io/docs/) under the hood. 
+ +Here are the steps: + +```bash +# Download the installation script: +curl -O https://raw.githubusercontent.com/cgat-developers/cgat-core/master/install.sh + +# See help: +bash install.sh + +# Install the development version (recommended, as there is no production version yet): +bash install.sh --devel [--location ] + +# To download the code in Git format instead of the default zip format, use: +--git # for an HTTPS clone +--git-ssh # for an SSH clone (you need to be a cgat-developer contributor on GitHub to do this) + +# Enable the Conda environment as instructed by the installation script +# Note: you might want to automate this by adding the following instructions to your .bashrc +source /conda-install/etc/profile.d/conda.sh +conda activate base +conda activate cgat-c +``` + +The installation script will place everything under the specified location. The aim of the script is to provide a portable installation that does not interfere with existing software environments. As a result, you will have a dedicated Conda environment that can be activated as needed to work with `cgatcore`. + +## Manual installation + +To obtain the latest code, check it out from the public Git repository and activate it: + +```bash +git clone https://github.com/cgat-developers/cgat-core.git +cd cgat-core +python setup.py develop +``` + +To update to the latest version, simply pull the latest changes: + +```bash +git pull +``` + +## Installing additional software + +When building your own workflows, we recommend using Conda to install software into your environment where possible. This ensures compatibility and ease of installation. + +To search for and install a package using Conda: + +```bash +conda search +conda install +``` + +## Accessing the libdrmaa shared library + +You may also need access to the `libdrmaa.so.1.0` C library, which can often be installed as part of the `libdrmaa-dev` package on most Unix systems. 
Once installed, you may need to specify the location of the DRMAA library if it is not in a default library path. Set the `DRMAA_LIBRARY_PATH` environment variable to point to the library location. + +To set this variable permanently, add the following line to your `.bashrc` file (adjusting the path as necessary): + +```bash +export DRMAA_LIBRARY_PATH=/usr/lib/libdrmaa.so.1.0 +``` + +[Conda documentation](https://conda.io) + diff --git a/docs/getting_started/run_parameters.md b/docs/getting_started/run_parameters.md new file mode 100644 index 00000000..37834e4a --- /dev/null +++ b/docs/getting_started/run_parameters.md @@ -0,0 +1,23 @@ +# Cluster configuration + +Currently, cgatcore supports the following workload managers: SGE, SLURM and Torque. The default cluster options are set for SunGrid Engine (SGE). If you are using a different workload manager, you need to configure your cluster settings accordingly by creating a `.cgat.yml` file in your home directory. + +This configuration file allows you to override the default settings. To view the hardcoded parameters for cgatcore, refer to the [parameters.py file](https://github.com/cgat-developers/cgat-core/blob/eb6d29e5fe1439de2318aeb5cdfa730f36ec3af4/cgatcore/pipeline/parameters.py#L67). + +For an example of configuring a PBSPro workload manager, see the provided [config example](https://github.com/AntonioJBT/pipeline_example/blob/master/Docker_and_config_file_examples/cgat.yml). + +The `.cgat.yml` file in your home directory will take precedence over the default cgatcore settings. 
For instance, adding the following configuration to `.cgat.yml` will implement cluster settings for PBSPro: + +```yaml +memory_resource: mem + +options: -l walltime=00:10:00 -l select=1:ncpus=8:mem=1gb + +queue_manager: pbspro + +queue: NONE + +parallel_environment: "dedicated" +``` + +This setup specifies memory resource allocation (`mem`), runtime limits (`walltime`), selection of CPU and memory resources, and the use of the PBSPro queue manager, among other settings. Make sure to adjust the parameters according to your cluster environment to optimise the workload manager for your pipeline runs. \ No newline at end of file diff --git a/docs/getting_started/tutorial.md b/docs/getting_started/tutorial.md new file mode 100644 index 00000000..a6e9bf22 --- /dev/null +++ b/docs/getting_started/tutorial.md @@ -0,0 +1,72 @@ +# Running a pipeline - Tutorial + +Before beginning this tutorial, ensure that `cgat-core` is installed correctly. Refer to the [installation instructions](#installation) for guidance. + +As a tutorial example of how to run a CGAT workflow, we will use the `cgat-showcase` pipeline. You will also need to install `cgat-showcase` (see the [instructions](https://cgat-showcase.readthedocs.io/en/latest/getting_started/Tutorial.html)). + +The aim of this pipeline is to perform pseudoalignment using `kallisto`. The pipeline can be run locally or distributed across a cluster. This tutorial will explain the steps required to run the pipeline. Further documentation on `cgat-showcase` can be found [here](https://cgat-showcase.readthedocs.io/en/latest/). + +The `cgat-showcase` pipeline highlights some of the functionality of `cgat-core`. Additionally, more advanced workflows for next-generation sequencing analysis are available in the [cgat-flow repository](https://github.com/cgat-developers/cgat-flow). 
+ +## Tutorial start + +### Step 1: Download the tutorial data + +Create a new directory, navigate to it, and download the test data: + +```bash +mkdir showcase +cd showcase +wget https://www.cgat.org/downloads/public/showcase/showcase_test_data.tar.gz +tar -zxvf showcase_test_data.tar.gz +``` + +### Step 2: Generate a configuration YAML file + +Navigate to the test data directory and generate a configuration file for the pipeline: + +```bash +cd showcase_test_data +cgatshowcase transdiffexpres config +``` + +Alternatively, you can call the workflow file directly: + +```bash +python /path/to/file/pipeline_transdiffexpres.py config +``` + +This will generate a `pipeline.yml` file containing configuration parameters that can be used to modify the pipeline output. For this tutorial, you do not need to modify the parameters to run the pipeline. In the [Modify Config](#modify-config) section below, you will find details on how to adjust the config file to change the pipeline's output. + +### Step 3: Run the pipeline + +Run the pipeline using the following command: + +```bash +cgatshowcase transdiffexpres make full -v5 --no-cluster +``` + +The `--no-cluster` flag will run the pipeline locally if you do not have access to a cluster. If you have access to a cluster, you can remove the `--no-cluster` option, and the pipeline will distribute the jobs across the cluster. + +**Note**: There are many command line options available to run the pipeline. To see the available options, run: + +```bash +cgatshowcase --help +``` + +### Step 4: Generate a report + +The final step is to generate a report to display the output of the pipeline. We recommend using `MultiQC` for generating reports from commonly used bioinformatics tools (such as mappers and pseudoaligners) and `Rmarkdown` for generating custom reports. 
+ +To generate these reports, run the following command: + +```bash +cgatshowcase transdiffexprs make build_report -v 5 --no-cluster +``` + +This will generate a `MultiQC` report in the folder `MultiQC_report.dir/` and an `Rmarkdown` report in `R_report.dir/`. + +## Conclusion + +This completes the tutorial for running the `transdiffexprs` pipeline for `cgat-showcase`. We hope you find it as useful as we do for writing workflows in Python. + diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..8fdc1aec --- /dev/null +++ b/docs/index.md @@ -0,0 +1,92 @@ +# CGAT-core Documentation + +![Licence](https://img.shields.io/github/license/cgat-developers/cgat-core.svg) +![Conda](https://img.shields.io/conda/v/bioconda/cgatcore.svg) +![Build Status](https://github.com/cgat-developers/cgat-core/actions/workflows/cgatcore_python.yml/badge.svg) + +Welcome to the CGAT-core documentation! CGAT-core is a workflow management system designed to support the rapid development of scalable, reproducible data analysis pipelines. It is built upon a flexible and user-friendly set of libraries and functions tailored for large-scale data analysis. + +## Overview + +CGAT-core has been continuously developed over the past decade to serve as a Next Generation Sequencing (NGS) workflow management system. By combining CGAT-core with [CGAT-apps](https://github.com/cgat-developers/cgat-apps), users can create diverse computational workflows. For a practical demonstration, refer to the [cgat-showcase](https://github.com/cgat-developers/cgat-showcase), which features a simple RNA-seq pipeline. + +For advanced usage examples, explore the [cgat-flow](https://github.com/cgat-developers/cgat-flow) repository, which contains production-ready pipelines for automating NGS data analysis. Note that it is under active development and may require additional software dependencies. 
+ +## Citation + +If you use CGAT-core, please cite our publication in F1000 Research: + +**Cribbs AP, Luna-Valero S, George C et al. CGAT-core: a python framework for building scalable, reproducible computational biology workflows [version 1; peer review: 1 approved, 1 approved with reservations].** +F1000Research 2019, 8:377 +[https://doi.org/10.12688/f1000research.18674.1](https://doi.org/10.12688/f1000research.18674.1) + +## Support + +- For frequently asked questions, visit the [FAQ](project_info/faq.md). +- To report bugs or issues, raise an issue on our [GitHub repository](https://github.com/cgat-developers/cgat-core). +- To contribute, see the [contributing guidelines](project_info/contributing.md) and refer to the [GitHub source code](https://github.com/cgat-developers/cgat-core). + +## Example Workflows + +### cgat-showcase +A simple example of workflow development using CGAT-core. Visit the [GitHub page](https://github.com/cgat-developers/cgat-showcase) or view the [documentation](https://cgat-showcase.readthedocs.io/en/latest/). + +### cgat-flow +This repository demonstrates CGAT-core's flexibility through fully tested production pipelines. For details on usage and installation, see the [GitHub page](https://github.com/cgat-developers/cgat-flow). + +### Single-Cell RNA-seq +- **Cribbs Lab**: Uses CGAT-core for pseudoalignment pipelines in single-cell [Drop-seq](https://github.com/Acribbs/single-cell) methods. +- **Sansom Lab**: Develops single-cell sequencing analysis workflows using the CGAT-core workflow engine ([TenX workflows](https://github.com/sansomlab/tenx)). + +## Pipeline Modules Overview + +CGAT-core provides a comprehensive set of modules to facilitate the creation and management of data processing pipelines. These modules offer various functionalities, from pipeline control and execution to database management and file handling. + +### Available Modules + +1. 
[Control](pipeline_modules/control.md): Manages the overall pipeline execution flow. +2. [Database](pipeline_modules/database.md): Handles database operations and uploads. +3. [Files](pipeline_modules/files.md): Provides utilities for file management and temporary file handling. +4. [Cluster](pipeline_modules/cluster.md): Manages job submission and execution on compute clusters. +5. [Execution](pipeline_modules/execution.md): Handles task execution and logging. +6. [Utils](pipeline_modules/utils.md): Offers various utility functions for pipeline operations. +7. [Parameters](pipeline_modules/parameters.md): Manages pipeline parameters and configuration. + +### Integration with Ruffus + +CGAT-core builds upon the Ruffus pipeline library, extending its functionality and providing additional features. It includes the following Ruffus decorators: + +- `@transform` +- `@merge` +- `@split` +- `@originate` +- `@follows` +- `@suffix` + +These decorators can be used to define pipeline tasks and their dependencies. + +### S3 Integration + +CGAT-core also provides S3-aware decorators and functions for seamless integration with AWS S3: + +- `@s3_transform` +- `@s3_merge` +- `@s3_split` +- `@s3_originate` +- `@s3_follows` + +For more information on working with S3, see the [S3 Integration](s3_integration/s3_pipeline.md) section. + +By leveraging these modules and decorators, you can build powerful, scalable, and efficient data processing pipelines using CGAT-core. 
+ +--- + +## Quick Links + +- [Getting Started](getting_started/installation.md) +- [Building a Workflow](defining_workflow/writing_workflow.md) +- [Pipeline Modules Overview](pipeline_modules/overview.md) +- [S3 Integration](s3_integration/s3_pipeline.md) +- [Working with Remote Files](remote/s3.md) +- [Core Functions](function_doc/pipeline.md) +- [Project Info](project_info/contributing.md) diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index c6c330bd..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,138 +0,0 @@ -.. _manual-main: - -======================== -CGAT-core documentation! -======================== - -.. image:: https://img.shields.io/github/license/cgat-developers/cgat-core.svg - :alt: Licence - -.. image:: https://img.shields.io/conda/v/bioconda/cgatcore.svg - :alt: Conda - -.. image:: https://readthedocs.org/projects/cgat-core/badge/?version=latest - :target: http://cgat-core.readthedocs.io/en/latest/?badge=latest - :alt: Documentation Status - -.. image:: https://img.shields.io/travis/cgat-developers/cgat-core.svg - :alt: Travis - -.. image:: https://img.shields.io/twitter/follow/CGAT_Oxford.svg?style=social&logo=twitter&label=Follow - :target: https://twitter.com/cgat_oxford?lang=en - :alt: Twitter Followers - -.. image:: https://img.shields.io/twitter/url/http/shields.io.svg?style=social&logo=twitter - :target: https://twitter.com/cgat_oxford?lang=en - :alt: Twitter URL - - -CGAT-core is a workflow management system that allows users to quickly and reproducibly build scalable -data analysis pipelines. CGAT-core is a set of libraries and helper functions used to enable researchers -to design and build computational workflows for the analysis of large-scale data-analysis. - -Used in combination with `CGAT-apps `_, we have deomonstrated the functionality of our -flexible workflow management system using a simple RNA-seq pipeline in `cgat-showcase `_. 
- -CGAT-core is open-sourced, powerful and user-friendly, and has been continually developed -as a Next Generation Sequencing (NGS) workflow management system over the past 10 years. - -For more advanced examples of cgatcore utilities please refer to our `cgat-flow `_ repository, however -please be aware that this is in constant development and has many software dependancies. - - -.. _manual-quick_example: - --------- -Citation --------- - -Our workflow management system is published in F1000 Research: - -Cribbs AP, Luna-Valero S, George C et al. CGAT-core: a python framework for building scalable, reproducible computational biology workflows [version 1; peer review: 1 approved, 1 approved with reservations]. F1000Research 2019, 8:377 -(`https://doi.org/10.12688/f1000research.18674.1 `_) - -.. _manual-support: - -------- -Support -------- - -- Please refer to our :ref:`FAQ` section -- For bugs and issues, please raise an issue on `github `_ -- For contributions, please refer to our contributor section and `github `_ source code. - --------- -Examples --------- - -**cgat-showcase** - This is a toy example of how to develop a simple workflow. Please refer to the `github page `_ and the `documentation `_. -**cgat-flow** - As an example of the flexibility and functionality of CGAT-core, we have developed a set of fully tested production pipelines for automating the analysis of our NGS data. Please refer to the `github `_ page for information on how to install and use our code. -**Single cell RNA-seq** - The cribbs lab use CGAT-core to develop pseudoalignment pipelines for single cell `dropseq methods `_ - The sansom lab use the CGAT-core workflow engine to develop single cell `sequencing analysis workflows `_. 
- - -------------------------------------- -Selected publications using CGAT-core -------------------------------------- - -CGAT-core has been developed over the past 10 years and as such has been used in many previously published articles - -For a non-comprehensive list of citations please see our :citing and :ref:`project_info-citations` - - - - -.. toctree:: - :caption: Getting started - :name: getting_started - :maxdepth: 1 - :hidden: - - getting_started/Installation.rst - getting_started/Cluster_config.rst - getting_started/Examples.rst - getting_started/Tutorial.rst - -.. toctree:: - :caption: Build a workflow - :name: build - :maxdepth: 1 - :hidden: - - defining_workflow/Writing_workflow.rst - defining_workflow/run_parameters.rst - defining_workflow/Tutorial.rst - -.. toctree:: - :caption: Working with remote files - :name: build - :maxdepth: 1 - :hidden: - - remote/S3.rst - remote/GC.rst - remote/Azure.rst - -.. toctree:: - :caption: cgatcore functions - :name: function_doc - :maxdepth: 1 - :hidden: - - function_doc/Pipeline.rst - function_doc/Core.rst - -.. toctree:: - :caption: Project Info - :name: project-info - :maxdepth: 1 - :hidden: - - project_info/Contributing.rst - project_info/how_to_contribute.rst - project_info/citations.rst - project_info/FAQ.rst - project_info/Licence.rst diff --git a/docs/pipeline_modules/cluster.md b/docs/pipeline_modules/cluster.md new file mode 100644 index 00000000..2e2f7fff --- /dev/null +++ b/docs/pipeline_modules/cluster.md @@ -0,0 +1,185 @@ +# Cluster Module + +`cluster.py` - Cluster utility functions for cgatcore pipelines + +============================================================== + +This module abstracts the DRMAA native specification and provides convenience functions for running DRMAA jobs. It currently supports SGE, SLURM, Torque, and PBSPro cluster environments, enabling users to submit and manage cluster jobs easily within cgatcore pipelines. 
+ +## Reference + +The following documentation details the cluster management utilities provided by the `cluster.py` module. + +## Import Statements + +```python +import re +import math +import collections +import os +import stat +import time +import datetime +import logging +import gevent +import cgatcore.experiment as E +try: + import drmaa +except (ImportError, RuntimeError, OSError): + pass +``` + +## Key Classes and Functions + +### `get_logger()` + +Returns the logger for the CGAT-core pipeline, which is used to handle logging within the cluster management utilities. + +```python +def get_logger(): + return logging.getLogger("cgatcore.pipeline") +``` + +### `DRMAACluster` Class + +This class provides core functionality for managing DRMAA cluster jobs, abstracting cluster specifications for SGE, SLURM, Torque, and PBSPro. + +#### `__init__(self, session, ignore_errors=False)` + +Initialises a DRMAA cluster instance. + +Arguments: +- `session` (drmaa.Session): DRMAA session for interacting with the cluster. +- `ignore_errors` (bool, optional): If `True`, job errors are ignored, allowing the pipeline to continue. + +#### `setup_drmaa_job_template(...)` + +Sets up a DRMAA job template. Supported environments include SGE, SLURM, Torque, and PBSPro. + +Arguments: +- `drmaa_session`: The DRMAA session object. +- `job_name` (string): Name of the job. +- `job_memory` (string): Memory requirements for the job. +- `job_threads` (int): Number of threads to allocate for the job. +- `working_directory` (string): Working directory for the job. + +Raises: +- `ValueError`: If job memory is not specified. + +#### `collect_single_job_from_cluster(...)` + +Collects a single job running on the cluster, waiting for its completion and returning stdout, stderr, and resource usage. + +Arguments: +- `job_id` (string): The job ID. +- `statement` (string): Command executed by the job. +- `stdout_path` (string): Path to the stdout file. +- `stderr_path` (string): Path to the stderr file. 
+- `job_path` (string): Path to the job file. + +#### `get_drmaa_job_stdout_stderr(...)` + +Fetches stdout and stderr for a DRMAA job, allowing for some lag. + +Arguments: +- `stdout_path` (string): Path to the stdout file. +- `stderr_path` (string): Path to the stderr file. +- `tries` (int, optional): Number of attempts to retrieve the files. +- `encoding` (string, optional): Encoding for reading files. + +Returns: +- `tuple`: stdout and stderr as lists of strings. + +#### `set_drmaa_job_paths(job_template, job_path)` + +Adds the job path, stdout path, and stderr path to the job template. + +Arguments: +- `job_template`: DRMAA job template object. +- `job_path` (string): Path to the job script. + +### Cluster-Specific Classes + +The following classes inherit from `DRMAACluster` and implement cluster-specific logic for each cluster type. + +#### `SGECluster` + +Handles SGE-specific cluster job setup. + +- **`get_native_specification(...)`**: Returns native specification parameters for SGE jobs. + +#### `SlurmCluster` + +Handles SLURM-specific cluster job setup. + +- **`get_native_specification(...)`**: Returns native specification parameters for SLURM jobs. +- **`parse_accounting_data(...)`**: Parses SLURM accounting data to retrieve resource usage information. + +#### `TorqueCluster` + +Handles Torque-specific cluster job setup. + +- **`get_native_specification(...)`**: Returns native specification parameters for Torque jobs. + +#### `PBSProCluster` + +Handles PBSPro-specific cluster job setup. + +- **`get_native_specification(...)`**: Returns native specification parameters for PBSPro jobs. +- **`update_template(jt)`**: Updates the DRMAA job template environment. + +### `get_queue_manager(queue_manager, *args, **kwargs)` + +Returns a cluster instance based on the specified queue manager type. + +Arguments: +- `queue_manager` (string): Type of queue manager (`sge`, `slurm`, `torque`, `pbspro`). 
+- `*args, **kwargs`: Additional arguments passed to the cluster class initialiser. + +Raises: +- `ValueError`: If the queue manager type is not supported. + +```python +def get_queue_manager(queue_manager, *args, **kwargs): + qm = queue_manager.lower() + if qm == "sge": + return SGECluster(*args, **kwargs) + elif qm == "slurm": + return SlurmCluster(*args, **kwargs) + elif qm == "torque": + return TorqueCluster(*args, **kwargs) + elif qm == "pbspro": + return PBSProCluster(*args, **kwargs) + else: + raise ValueError("Queue manager {} not supported".format(queue_manager)) +``` + +## Notes + +- This module provides a unified interface for running cluster jobs across different cluster managers, allowing the user to switch between cluster types without rewriting job submission scripts. +- The module includes timeout settings for managing gevent event loops (`GEVENT_TIMEOUT_SACCT` and `GEVENT_TIMEOUT_WAIT`) to ensure that jobs are properly monitored without excessive waiting. +- The `JobInfo` named tuple is used to encapsulate job information, including job ID and resource usage. + +### Supported Clusters + +- **SGE** (Sun Grid Engine) +- **SLURM** (Simple Linux Utility for Resource Management) +- **Torque** +- **PBSPro** + +Each cluster type requires specific configurations and resource definitions, which are managed through the appropriate cluster class. + +## Usage Example + +To use a specific cluster type, you would first initialise the relevant cluster class or use the `get_queue_manager()` function to automatically return an instance: + +```python +from cluster import get_queue_manager + +queue_manager = "slurm" +cluster = get_queue_manager(queue_manager, session=drmaa.Session(), ignore_errors=True) +``` + +Once the cluster is initialised, you can use its methods to create job templates, submit jobs, and manage their execution. 
+ +As you continue to expand the functionality of CGAT-core, ensure that this module is updated with new cluster types, resource mappings, and relevant updates for managing cluster jobs effectively. diff --git a/docs/pipeline_modules/control.md b/docs/pipeline_modules/control.md new file mode 100644 index 00000000..5cc83eeb --- /dev/null +++ b/docs/pipeline_modules/control.md @@ -0,0 +1,60 @@ +# Pipeline Control Module + +The Control module in CGAT-core is responsible for managing the overall execution flow of the pipeline. It provides functions and classes for running the pipeline, handling command-line arguments, and controlling the pipeline's behaviour. + +## Key Functions + +### `run_pipeline()` + +This function is the main entry point for executing a pipeline. It sets up the pipeline, processes command-line arguments, and runs the specified tasks. + +```python +from cgatcore import pipeline as P + +def my_pipeline(): + # Define your pipeline tasks here + pass + +if __name__ == "__main__": + P.run_pipeline(pipeline_func=my_pipeline) +``` + +### `get_parameters()` + +Retrieves pipeline parameters from configuration files and command-line arguments. + +```python +PARAMS = P.get_parameters("pipeline.yml") +``` + +## `Pipeline` Class + +The `Pipeline` class is the core class for managing pipeline execution. It provides methods for adding tasks, running the pipeline, and handling dependencies. + +```python +pipeline = P.Pipeline() +pipeline.add_task(my_task) +pipeline.run() +``` + +## Command-line Interface + +The Control module provides a command-line interface for running pipelines. 
Common options include: + +- `--pipeline-action`: Specify the action to perform (e.g., `show`, `plot`, `run`) +- `--local`: Run the pipeline locally instead of on a cluster +- `--multiprocess`: Specify the number of processes to use for local execution + +### Example usage: + +```sh +python my_pipeline.py --pipeline-action run --local +``` + +For more detailed information on pipeline control and execution, refer to the Pipeline Execution documentation. + +## Next Steps + +These new pages provide more comprehensive documentation for the CGAT-core pipeline modules and S3 integration. You should create similar pages for the other modules (Database, Files, Cluster, Execution, Utils, Parameters) and S3-related topics (S3 Decorators, Configuring S3). + +Remember to include code examples, explanations of key concepts, and links to other relevant parts of the documentation. As you continue to develop and expand the CGAT-core functionality, make sure to update the documentation accordingly. diff --git a/docs/pipeline_modules/database.md b/docs/pipeline_modules/database.md new file mode 100644 index 00000000..2e076e09 --- /dev/null +++ b/docs/pipeline_modules/database.md @@ -0,0 +1,202 @@ +# Database Module + +`database.py` - Database upload for cgatcore pipelines + +========================================================= + +## Reference + +This module contains functions to facilitate data upload into a database using CGAT-core. It is particularly useful for integrating with cgatcore pipelines for automating and managing complex workflows. 
+ +## Import Statements + +```python +import re +import os +import sqlite3 +import sqlalchemy +from cgatcore import database as database +import cgatcore.experiment as E +from cgatcore.iotools import snip, touch_file +from cgatcore.pipeline.files import get_temp_file +from cgatcore.pipeline.execution import run +from cgatcore.pipeline.parameters import get_params +``` + +## Key Functions + +### `tablequote(track)` + +Quotes a track name to make it suitable as a table name. + +```python +def tablequote(track): + '''quote a track name such that it is suitable as a table name.''' + return re.sub(r"[-(),\[\].]", "_", track) +``` + +### `to_table(outfile)` + +Converts a filename from a load statement into a table name. Checks if the filename ends with `.load`, removes the suffix, and then quotes it. + +Arguments: +- `outfile` (string): Filename ending in `.load`. + +Returns: +- `tablename` (string): A suitable table name derived from the file. + +```python +def to_table(outfile): + '''convert a filename from a load statement into a table name.''' + assert outfile.endswith(".load") + name = os.path.basename(outfile[:-len(".load")]) + return tablequote(name) +``` + +### `build_load_statement(tablename, retry=True, options="")` + +Builds a command line statement to upload data to the database via the `csv2db` script. + +Arguments: +- `tablename` (string): Tablename for upload. +- `retry` (bool): If `True`, add the `--retry` option to `csv2db.py`. +- `options` (string): Command line options to be passed on to `csv2db.py`. + +Returns: +- `string`: A command line statement for uploading data. 
+ +```python +def build_load_statement(tablename, retry=True, options=""): + opts = [] + if retry: + opts.append(" --retry ") + params = get_params() + opts.append("--database-url={}".format(params["database"]["url"])) + db_options = " ".join(opts) + load_statement = ( + "python -m cgatcore.csv2db {db_options} {options} --table={tablename}".format(**locals())) + return load_statement +``` + +### `load(...)` + +Imports data from a tab-separated file into the database. + +Arguments: +- `infile` (string): Filename of the input data. +- `outfile` (string): Output filename containing logging information. +- Various additional arguments to control the loading behaviour. + +Typical usage within Ruffus: + +```python +@transform("*.tsv.gz", suffix(".tsv.gz"), ".load") +def loadData(infile, outfile): + P.load(infile, outfile) +``` + +### `concatenate_and_load(...)` + +Concatenates multiple tab-separated files and uploads the result to the database. + +Arguments: +- `infiles` (list): List of input filenames. +- `outfile` (string): Output filename. +- Various additional arguments for concatenation and loading. + +Typical usage within Ruffus: + +```python +@merge("*.tsv.gz", ".load") +def loadData(infiles, outfile): + P.concatenate_and_load(infiles, outfile) +``` + +### `merge_and_load(...)` + +Merges multiple categorical tables and loads them into the database. + +Arguments: +- `infiles` (list): List of input files. +- `outfile` (string): Output filename. +- Various additional arguments to control the merging and loading behaviour. + +### `connect()` + +Connects to the SQLite database used in the pipeline. Currently only implemented for SQLite databases. + +Returns: +- `dbh`: A database handle. + +### `create_view(...)` + +Creates a database view for a list of tables by performing a join across them. + +Arguments: +- `dbhandle`: Database handle. +- `tables`: List of tuples containing table names and fields to join. 
+- `tablename` (string): Name of the view or table to be created. +- `view_type` (string): Type of view, either `VIEW` or `TABLE`. + +### `get_database_name()` + +Returns the database name associated with the pipeline. Implemented for backwards compatibility. + +## Utility Functions + +These functions assist in interacting with a database in various ways: + +- **`load_from_iterator(...)`**: Imports data from an iterator into a database. +- **`apsw_connect(dbname, modname="tsv")`**: Attempts to connect to an APSW database, creating a virtual table from a TSV file. + +## Database Utility Functions + +`database.py` - Database utility functions + +=========================================== + +This module contains convenience functions to work with a relational database. + +### `executewait(dbhandle, statement, regex_error="locked", retries=-1, wait=5)` + +Repeatedly executes an SQL statement until it succeeds. + +Arguments: +- `dbhandle`: A DB-API conform database handle. +- `statement`: SQL statement to execute. +- `regex_error`: Regex to match error messages to ignore. + +Returns: +- `Cursor`: A cursor object for further database operations. + +### `getColumnNames(dbhandle, table)` + +Returns the column names of a table from the database. + +### `getTables(dbhandle)` + +Gets a list of tables in an SQLite database. + +### `toTSV(dbhandle, outfile, statement, remove_none=True)` + +Executes a statement and saves the result as a TSV file to disk. + +### Database Interaction Functions + +- **`connect(dbhandle=None, attach=None, url=None)`**: Attempts to connect to a database, returning a database handle. +- **`execute(queries, dbhandle=None, attach=False)`**: Executes one or more SQL statements against a database. +- **`fetch(query, dbhandle=None, attach=False)`**: Fetches all query results and returns them. +- **`fetch_with_names(query, dbhandle=None, attach=False)`**: Fetches query results and returns them as an array of row arrays, including field names. 
+- **`fetch_DataFrame(query, dbhandle=None, attach=False)`**: Fetches query results and returns them as a pandas DataFrame. +- **`write_DataFrame(dataframe, tablename, dbhandle=None, index=False, if_exists='replace')`**: Writes a pandas DataFrame to an SQLite database. + +### Virtual Table Creation with APSW + +- **`apsw_connect(dbname=None, modname="tsv")`**: Connects to an APSW database and creates a virtual table from a TSV file. +- **`_VirtualTable` and `_Table` classes**: Defines the structure and methods to support virtual tables in APSW. + +## Notes and Recommendations + +- As you continue to expand and develop the CGAT-core functionality, ensure to update the database module documentation accordingly. +- This module heavily utilises `csv2db` to facilitate data upload and management. +- Always consider potential SQL locking issues and use retry mechanisms where applicable. diff --git a/docs/pipeline_modules/execution.md b/docs/pipeline_modules/execution.md new file mode 100644 index 00000000..ef9c0ef7 --- /dev/null +++ b/docs/pipeline_modules/execution.md @@ -0,0 +1,265 @@ +# Execution Module + +`execution.py` - Job control for cgatcore pipelines + +========================================================= + +This module manages the job execution for cgatcore pipelines, particularly using DRMAA sessions for cluster job management. It provides functionality to start and close DRMAA sessions, execute shell commands, manage job submission, and handle different cluster environments. Supported cluster types include SGE, SLURM, Torque, and Kubernetes. + +## Reference + +The following documentation details the execution management utilities provided by the `execution.py` module. 
+ +## Import Statements + +```python +import collections +import importlib +import os +import pickle +import re +import json +import stat +import socket +import logging +import subprocess +import sys +import time +import math +import shutil +import gevent +import signal +import cgatcore.experiment as E +import cgatcore.iotools as iotools +from cgatcore.pipeline.utils import get_caller_locals, get_caller, get_calling_function +from cgatcore.pipeline.files import get_temp_filename, get_temp_dir +from cgatcore.pipeline.parameters import substitute_parameters, get_params +from cgatcore.pipeline.cluster import get_queue_manager, JobInfo +from cgatcore.pipeline.executors import SGEExecutor, SlurmExecutor, TorqueExecutor, LocalExecutor +try: + from cgatcore.pipeline.kubernetes import KubernetesExecutor +except ImportError: + KubernetesExecutor = None # Fallback if Kubernetes is not available +``` + +## Key Functions + +### `start_session()` + +Starts and initializes the global DRMAA session. + +```python +def start_session(): + """Start and initialize the global DRMAA session.""" + global GLOBAL_SESSION + + if HAS_DRMAA and GLOBAL_SESSION is None: + GLOBAL_SESSION = drmaa.Session() + try: + GLOBAL_SESSION.initialize() + except drmaa.errors.InternalException as ex: + get_logger().warn("could not initialize global drmaa session: {}".format(ex)) + GLOBAL_SESSION = None + return GLOBAL_SESSION +``` + +### `close_session()` + +Closes the global DRMAA session. + +```python +def close_session(): + """Close the global DRMAA session.""" + global GLOBAL_SESSION + + if GLOBAL_SESSION is not None: + GLOBAL_SESSION.exit() + GLOBAL_SESSION = None +``` + +### `get_executor(options=None)` + +Returns an executor instance based on the specified queue manager in the options. + +Arguments: +- `options` (dict): Dictionary containing execution options, including `"cluster_queue_manager"`. + +Returns: +- Executor instance appropriate for the specified queue manager. 
+ +This function decides which executor to use depending on the queue manager specified in the options, defaulting to the local executor if no cluster is specified or the cluster is not supported. + +```python +def get_executor(options=None): + if options is None: + options = get_params() + + if options.get("testing", False): + return LocalExecutor(**options) + + if not options.get("to_cluster", True): + return LocalExecutor(**options) + + queue_manager = options.get("cluster_queue_manager", None) + + if queue_manager == "kubernetes" and KubernetesExecutor is not None: + return KubernetesExecutor(**options) + elif queue_manager == "sge" and shutil.which("qsub") is not None: + return SGEExecutor(**options) + elif queue_manager == "slurm" and shutil.which("sbatch") is not None: + return SlurmExecutor(**options) + elif queue_manager == "torque" and shutil.which("qsub") is not None: + return TorqueExecutor(**options) + else: + return LocalExecutor(**options) +``` + +### `execute(statement, **kwargs)` + +Executes a command line statement locally. + +Arguments: +- `statement` (string): Command line statement to be run. + +Returns: +- `stdout` (string): Data sent to standard output by the command. +- `stderr` (string): Data sent to standard error by the command. 
+ +```python +def execute(statement, **kwargs): + if not kwargs: + kwargs = get_caller_locals() + + kwargs = dict(list(get_params().items()) + list(kwargs.items())) + + logger = get_logger() + logger.info("running %s" % (statement % kwargs)) + + if "cwd" not in kwargs: + cwd = get_params()["work_dir"] + else: + cwd = kwargs["cwd"] + + statement = " ".join(re.sub("\t+", " ", statement).split("\n")).strip() + if statement.endswith(";"): + statement = statement[:-1] + + os.environ.update({'BASH_ENV': os.path.join(os.environ['HOME'], '.bashrc')}) + process = subprocess.Popen(statement % kwargs, + cwd=cwd, + shell=True, + stdin=sys.stdin, + stdout=sys.stdout, + stderr=sys.stderr, + env=os.environ.copy(), + executable="/bin/bash") + + stdout, stderr = process.communicate() + + if process.returncode != 0: + raise OSError( + "Child was terminated by signal %i: \n" + "The stderr was: \n%s\n%s\n" % + (-process.returncode, stderr, statement)) + + return stdout, stderr +``` + +### `run(statement, **kwargs)` + +Runs a command line statement, either locally or on a cluster using DRMAA. + +Arguments: +- `statement` (string or list of strings): Command line statement or list of statements to execute. +- `kwargs` (dict): Additional options for job execution. + +This function runs the given statement(s) by selecting the appropriate executor. It handles different types of job submission (local or cluster-based) based on the provided arguments and global configuration. + +```python +def run(statement, **kwargs): + """ + Run a command line statement. 
+    """
+    logger = get_logger()
+
+    options = dict(list(get_params().items()))
+    caller_options = get_caller_locals()
+    options.update(list(caller_options.items()))
+
+    if "self" in options:
+        del options["self"]
+    options.update(list(kwargs.items()))
+
+    if "params" in options:
+        try:
+            options.update(options["params"]._asdict())
+        except AttributeError:
+            pass
+
+    options['cluster']['options'] = options.get(
+        'job_options', options['cluster']['options'])
+    options['cluster']['queue'] = options.get(
+        'job_queue', options['cluster']['queue'])
+    options['without_cluster'] = options.get('without_cluster')
+
+    name_substrate = str(options.get("outfile", "cgatcore"))
+    if os.path.basename(name_substrate).startswith("result"):
+        name_substrate = os.path.basename(os.path.dirname(name_substrate))
+    else:
+        name_substrate = os.path.basename(name_substrate)
+
+    options["job_name"] = re.sub(r'[:"]', "_", name_substrate)
+    try:
+        calling_module = get_caller().__name__
+    except AttributeError:
+        calling_module = "unknown"
+
+    options["task_name"] = calling_module + "." + get_calling_function()
+
+    if isinstance(statement, list):
+        statement_list = [interpolate_statement(stmt, options) for stmt in statement]
+    else:
+        statement_list = [interpolate_statement(statement, options)]
+
+    if options.get("dryrun", False):
+        for statement in statement_list:
+            logger.info("Dry-run: {}".format(statement))
+        return []
+
+    executor = get_executor(options)
+
+    with executor as e:
+        benchmark_data = e.run(statement_list)
+
+    for data in benchmark_data:
+        logger.info(json.dumps(data))
+
+    BenchmarkData = collections.namedtuple('BenchmarkData', sorted(benchmark_data[0]))
+    return [BenchmarkData(**d) for d in benchmark_data]
+```
+
+## Notes
+
+- This module is responsible for the orchestration and execution of jobs either locally or on different cluster environments (e.g., SGE, Slurm, Torque).
+- The global DRMAA session is managed to interface with cluster schedulers, and the `GLOBAL_SESSION` variable maintains the session state. +- Executors are used to control where and how the jobs are executed, allowing for local and cluster-based execution. Depending on the queue manager specified, different executor classes such as `LocalExecutor`, `SGEExecutor`, `SlurmExecutor`, `TorqueExecutor`, or `KubernetesExecutor` are instantiated. + +### Supported Cluster Types + +- **SGE** (Sun Grid Engine) +- **SLURM** (Simple Linux Utility for Resource Management) +- **Torque** +- **Kubernetes** + +## Usage Example + +To use the `run()` function to execute a command either locally or on a cluster: + +```python +from execution import run + +statement = "echo 'Hello, world!'" +run(statement) +``` + +The execution environment is determined by the configuration parameters, and the job is run on the diff --git a/docs/pipeline_modules/executors.md b/docs/pipeline_modules/executors.md new file mode 100644 index 00000000..e17c078f --- /dev/null +++ b/docs/pipeline_modules/executors.md @@ -0,0 +1,190 @@ +# Executors for job scheduling + +## Overview + +This documentation describes several executor classes for job scheduling in computational pipelines. Each of these classes inherits from the `BaseExecutor` and is responsible for submitting jobs to a different type of cluster system or local machine. The following executors are available: + +- `SGEExecutor`: Submits jobs to an SGE (Sun Grid Engine) cluster. +- `SlurmExecutor`: Submits jobs to a Slurm cluster. +- `TorqueExecutor`: Submits jobs to a Torque cluster. +- `LocalExecutor`: Executes jobs locally. +- `KubernetesExecutor`: Submits jobs to a Kubernetes cluster. + +Each executor has specific methods and logging functionality that enable it to handle job submission, monitoring, and error management effectively. + +## `SGEExecutor` + +The `SGEExecutor` is responsible for running jobs on an SGE cluster. 
It extends the `BaseExecutor` class. + +### Methods + +#### `__init__(self, **kwargs)` +Initialises the `SGEExecutor` and sets up a logger for the instance. + +#### `run(self, statement_list)` +Runs the provided list of statements using SGE. + +- **Arguments**: + - `statement_list`: A list of shell command statements to be executed. + +- **Workflow**: + - Builds an SGE job submission command for each statement. + - Uses `subprocess.run()` to submit jobs using the `qsub` command. + - Handles job submission errors, logs relevant information, and monitors job completion. + +#### `build_job_script(self, statement)` +Builds a job script for SGE based on the provided statement. + +- **Overrides**: This method is an override of the `BaseExecutor.build_job_script()`. + +## `SlurmExecutor` + +The `SlurmExecutor` is responsible for running jobs on a Slurm cluster. It also extends the `BaseExecutor` class. + +### Methods + +#### `__init__(self, **kwargs)` +Initialises the `SlurmExecutor` and sets up a logger for the instance. + +#### `run(self, statement_list)` +Runs the provided list of statements using Slurm. + +- **Arguments**: + - `statement_list`: A list of shell command statements to be executed. + +- **Workflow**: + - Builds a Slurm job submission command for each statement using `sbatch`. + - Uses `subprocess.run()` to submit jobs to the Slurm scheduler. + - Monitors the job submission status, logs relevant information, and handles any errors. + +#### `build_job_script(self, statement)` +Builds a job script for submission on Slurm. + +- **Overrides**: This method is an override of the `BaseExecutor.build_job_script()`. + +## `TorqueExecutor` + +The `TorqueExecutor` class runs jobs on a Torque cluster, using `qsub` for job submissions. + +### Methods + +#### `__init__(self, **kwargs)` +Initialises the `TorqueExecutor` and sets up a logger for the instance. + +#### `run(self, statement_list)` +Runs the provided list of statements using Torque. 
+ +- **Arguments**: + - `statement_list`: A list of shell command statements to be executed. + +- **Workflow**: + - Builds a job script and submits it using the `qsub` command. + - Uses `subprocess.run()` to handle the submission and logs all related information. + - Handles job submission errors and monitors job completion. + +#### `build_job_script(self, statement)` +Builds a job script for submission on a Torque cluster. + +- **Overrides**: This method is an override of the `BaseExecutor.build_job_script()`. + +## `LocalExecutor` + +The `LocalExecutor` runs jobs on the local machine without the need for cluster scheduling. This is useful for development, testing, or when the jobs are small enough to run locally. + +### Methods + +#### `__init__(self, **kwargs)` +Initialises the `LocalExecutor` and sets up a logger for the instance. + +#### `run(self, statement_list)` +Runs the provided list of statements locally. + +- **Arguments**: + - `statement_list`: A list of shell command statements to be executed. + +- **Workflow**: + - Builds the job script and runs it locally using `subprocess.Popen()`. + - Monitors the output and logs the job status. + - Handles any runtime errors by logging them and raising exceptions as needed. + +#### `build_job_script(self, statement)` +Builds a job script for local execution. + +- **Overrides**: This method is an override of the `BaseExecutor.build_job_script()`. + +## `KubernetesExecutor` + +The `KubernetesExecutor` is used for running jobs on a Kubernetes cluster. + +### Methods + +#### `__init__(self, **kwargs)` +Initialises the `KubernetesExecutor`. + +- **Workflow**: + - Loads the Kubernetes configuration and sets up both Core and Batch API clients for job management. + - Logs information about successful or failed configuration loads. + +#### `run(self, statement, job_path, job_condaenv)` +Runs a job using Kubernetes. + +- **Arguments**: + - `statement`: The shell command to be executed within a Kubernetes job. 
+ - `job_path`: Path for the job files. + - `job_condaenv`: Conda environment to be used within the job container. + +- **Workflow**: + - Defines the Kubernetes job specification, including container image, command, and job parameters. + - Submits the job using `create_namespaced_job()` and waits for its completion. + - Collects job logs and benchmark data for analysis. + - Cleans up the Kubernetes job once it is complete. + +#### `_wait_for_job_completion(self, job_name)` +Waits for the Kubernetes job to complete. + +- **Arguments**: + - `job_name`: The name of the job. + +- **Workflow**: + - Repeatedly queries the job status using `read_namespaced_job_status()` until it succeeds or fails. + +#### `_get_pod_logs(self, job_name)` +Retrieves the logs of the pod associated with the specified Kubernetes job. + +- **Arguments**: + - `job_name`: The name of the job. + +#### `_cleanup_job(self, job_name)` +Deletes the Kubernetes job and its associated pods. + +- **Arguments**: + - `job_name`: The name of the job to be deleted. + +#### `collect_benchmark_data(self, job_name, resource_usage_file)` +Collects benchmark data such as CPU and memory usage from the job's pod(s). + +- **Arguments**: + - `job_name`: Name of the job for which benchmark data is being collected. + - `resource_usage_file`: Path to a file where resource usage data will be saved. + +#### `collect_metric_data(self, process, start_time, end_time, time_data_file)` +Collects and saves metric data related to job duration. + +- **Arguments**: + - `process`: The name of the process. + - `start_time`: Timestamp when the job started. + - `end_time`: Timestamp when the job ended. + - `time_data_file`: Path to a file where timing data will be saved. + +## Logging and Error Handling + +All executor classes use the Python `logging` module to log different stages of job submission, execution, and monitoring. Logging levels like `INFO`, `ERROR`, and `WARNING` are used to provide information on job progress and errors. 
Executors also make use of exception handling to raise `RuntimeError` when job submission or execution fails. + +## Notes +- The job script generation is handled by the `build_job_script()` function, which is customised per executor but is based on the implementation from `BaseExecutor`. +- Job monitoring and benchmark data collection are placeholder implementations in some of the executors. Users should consider implementing job-specific monitoring and resource management tailored to their requirements. + +## Summary + +The executor classes provide a modular way to submit jobs to different cluster systems or run them locally. Each executor manages the nuances of the corresponding cluster scheduler, allowing seamless integration with cgatcore pipelines. They provide functionalities such as job submission, logging, monitoring, and benchmarking, ensuring a streamlined and customisable workflow for distributed computing environments. + diff --git a/docs/pipeline_modules/farm.md b/docs/pipeline_modules/farm.md new file mode 100644 index 00000000..83c8cc6d --- /dev/null +++ b/docs/pipeline_modules/farm.md @@ -0,0 +1,149 @@ +# farm.py - Documentation + +This document provides an overview of the `farm.py` script, which is used to split a data stream into independent chunks for parallel processing on a cluster. This is particularly useful for large-scale computational tasks where dividing the workload into smaller, independent parts can significantly reduce processing time. 
+ +## Table of Contents +- [Purpose](#purpose) +- [Usage](#usage) +- [Documentation](#documentation) + - [Input and Output Handling](#input-and-output-handling) + - [Chunking Methods](#chunking-methods) + - [Error Handling](#error-handling) + - [Examples](#examples) +- [Classes and Functions](#classes-and-functions) + - [Chunk Iterator Functions](#chunk-iterator-functions) + - [Mapper Classes](#mapper-classes) + - [ResultBuilder Classes](#resultbuilder-classes) + - [build_command Function](#build_command-function) + - [hasFinished Function](#hasfinished-function) + - [get_option_parser Function](#get_option_parser-function) + - [main Function](#main-function) + +## Purpose + +The `farm.py` script is designed to process a data stream in parallel on a cluster by splitting it into smaller, independent chunks. This approach is suitable for "embarrassingly parallel" jobs, where the computations for each chunk can be executed independently without requiring communication between them. + +The script reads data from `stdin`, splits the data, executes user-specified commands on each chunk, and writes the output to `stdout`. The results are returned in the same order as they are submitted. + +## Usage + +The script can be run from the command line. Below are two basic examples: + +- Split the input data by the first column and execute a Perl command on each split: + ```sh + cat go.txt | farm.py --split-at-column=1 perl -p -e "s/GO/gaga/" + ``` + +- Split a fasta file at each sequence entry and compute an approximate sequence length: + ```sh + cat genome.fasta | farm.py --split-at-regex="^>(\S+)" "wc -c" + ``` + +Run `python farm.py --help` to get detailed command-line options. + +## Documentation + +### Input and Output Handling + +The input to `farm.py` is provided via `stdin` and processed in parallel. The output is written to `stdout` with results combined in the same order they were processed. 
The script ensures that duplicate headers are avoided and can also handle jobs that output multiple files. + +### Chunking Methods + +The script provides multiple ways to split (or "chunk") the input data: +- **Split at lines**: Divide the input data by the number of lines. +- **Split by column**: Split based on the unique values in a specified column. +- **Split using regex**: Use a regular expression to define how to split data. +- **Group using regex**: Group entries together if they match a regular expression. + +### Error Handling + +If an error occurs during the execution of a job, the error messages are printed, and the temporary directory used for processing is not deleted, allowing manual recovery. The script also implements a retry mechanism for failed jobs and can log errors into separate files for analysis. + +### Examples + +1. **Basic Example**: Split the file "go" at the first column and replace `GO` with `gaga` in each chunk: + ```sh + cat go | farm.py --split-at-column=1 perl -p -e "s/GO/gaga/" + ``` + +2. **FASTA File Processing**: Split a fasta file at each sequence and calculate length: + ```sh + cat genome.fasta | farm.py --split-at-regex="^>(\S+)" "wc -c" + ``` + +3. **Chunk by Sequence Count**: Split a fasta file at every 10 sequences: + ```sh + cat genome.fasta | farm.py --split-at-regex="^>(\S+)" --chunk-size=10 "wc -c" + ``` + +## Classes and Functions + +### Chunk Iterator Functions + +The script includes various functions to handle chunking of the input data: + +- **`chunk_iterator_lines`**: Splits input data by a specific number of lines. +- **`chunk_iterator_column`**: Splits input based on values in a specified column. +- **`chunk_iterator_regex_group`**: Groups input lines based on a regex match. +- **`chunk_iterator_regex_split`**: Splits input whenever a regex matches. + +These functions yield filenames containing the chunks, which are then processed independently. 
+ +### Mapper Classes + +Mappers are used to rename or manage output IDs: +- **`MapperGlobal`**: Maps IDs globally with a given pattern. +- **`MapperLocal`**: Maps IDs locally, associating a unique ID to each key within a specific file. +- **`MapperEmpty`**: Passes through the original ID without modification. + +### ResultBuilder Classes + +The `ResultBuilder` classes handle the output from processed chunks: + +- **`ResultBuilder`**: Merges results from table-formatted output. +- **`ResultBuilderFasta`**: Handles results from fasta-formatted output. +- **`ResultBuilderBinary`**: Concatenates binary output files. +- **`ResultBuilderCopies`**: Creates indexed copies of output files. +- **`ResultBuilderLog`**: Aggregates log files from multiple jobs. + +### build_command Function +```python +def build_command(data): + # Function code... +``` +This function constructs the shell command to execute each chunk, including logging and managing temporary directories. It replaces placeholders (e.g., `%STDIN%` and `%DIR%`) with appropriate values. + +### hasFinished Function +```python +def hasFinished(retcode, filename, output_tag, logfile): + # Function code... +``` +The `hasFinished()` function checks if a run has finished successfully by inspecting the return code and looking for a completion tag in the log file. + +### get_option_parser Function +```python +def get_option_parser(): + # Function code... +``` +The `get_option_parser()` function sets up and returns an argument parser with various command-line options for specifying how the input should be split, how output should be handled, and other behaviours (e.g., memory requirements, logging). + +### main Function +```python +def main(argv=None): + # Function code... +``` +The `main()` function is the entry point of the script. 
It parses the command-line arguments, prepares the input data for processing, builds commands for each chunk, runs these commands (using a specified method: multiprocessing, threading, etc.), and finally collects and processes the results. + +- **Key Steps in Main**: + - **Argument Parsing**: Uses `get_option_parser()` to parse command-line options. + - **Chunking Input**: Chooses the appropriate chunking method and splits the input accordingly. + - **Job Execution**: Executes each chunk using the specified method (e.g., `multiprocessing`, `threads`, or `drmaa` for cluster management). + - **Result Collection**: Collects and combines the results using `ResultBuilder` classes. + - **Error Handling**: Logs any failed jobs and, if all jobs succeed, cleans up the temporary directories. + +## Conclusion + +The `farm.py` script is a powerful utility for dividing data streams into smaller tasks, running them in parallel on a cluster, and collating the output. It is well-suited for "embarrassingly parallel" tasks, such as processing large tabular datasets or fasta files, and integrates seamlessly with cluster environments for distributed computation. + +With flexible options for chunking data, managing output, and handling errors, this script is a useful tool for bioinformatics pipelines and other data-intensive workflows. + diff --git a/docs/pipeline_modules/files.md b/docs/pipeline_modules/files.md new file mode 100644 index 00000000..c28b33a2 --- /dev/null +++ b/docs/pipeline_modules/files.md @@ -0,0 +1,113 @@ +# Files Module + +`files.py` - Working with files in cgatcore pipelines + +==================================================== + +## Reference + +This module provides a collection of functions to facilitate file handling within cgatcore pipelines, including generating temporary files, temporary directories, and checking for required executables or scripts. 
+ +## Import Statements + +```python +import os +import tempfile + +import cgatcore.iotools as iotools +import cgatcore.experiment as E +from cgatcore.pipeline.parameters import get_params +``` + +## Key Functions + +### `get_temp_file(dir=None, shared=False, suffix="", mode="w+", encoding="utf-8")` + +Gets a temporary file. The file is created and opened in text mode by default (mode `w+`), with UTF-8 encoding. + +Arguments: +- `dir` (string, optional): Directory of the temporary file. If not provided, defaults to the temporary directory defined in the global configuration. +- `shared` (bool, optional): If set to `True`, creates the file in a shared temporary location. +- `suffix` (string, optional): Suffix for the filename. +- `mode` (string, optional): File mode (e.g., `w+` for reading and writing). Defaults to `w+`. +- `encoding` (string, optional): Encoding for the file. Defaults to `utf-8`. + +Returns: +- `file` (File object): A temporary file object that the caller must close and delete once it's no longer needed. + +### `get_temp_filename(dir=None, shared=False, clear=True, suffix="")` + +Returns a temporary filename. The file is created and then optionally deleted if `clear` is set to `True`. + +Arguments: +- `dir` (string, optional): Directory for the temporary file. Defaults to the configuration's temporary directory. +- `shared` (bool, optional): If set to `True`, places the file in a shared temporary location. +- `clear` (bool, optional): If set to `True`, deletes the file after creation. Defaults to `True`. +- `suffix` (string, optional): Suffix for the filename. + +Returns: +- `filename` (string): Absolute path to the temporary file. + +### `get_temp_dir(dir=None, shared=False, clear=False)` + +Creates and returns a temporary directory. + +Arguments: +- `dir` (string, optional): Directory for the temporary directory. Defaults to the configuration's temporary directory. 
+- `shared` (bool, optional): If set to `True`, places the directory in a shared temporary location. +- `clear` (bool, optional): If set to `True`, removes the directory after creation. + +Returns: +- `filename` (string): Absolute path to the temporary directory. + +### `check_executables(filenames)` + +Checks for the presence of required executables in the system's PATH. + +Arguments: +- `filenames` (list of strings): List of executables to check for. + +Raises: +- `ValueError`: If any executable is missing. + +```python +def check_executables(filenames): + missing = [] + for filename in filenames: + if not iotools.which(filename): + missing.append(filename) + if missing: + raise ValueError("missing executables: %s" % ",".join(missing)) +``` + +### `check_scripts(filenames)` + +Checks for the presence of specified scripts in the filesystem. + +Arguments: +- `filenames` (list of strings): List of script filenames to check for. + +Raises: +- `ValueError`: If any script is missing. + +```python +def check_scripts(filenames): + missing = [] + for filename in filenames: + if not os.path.exists(filename): + missing.append(filename) + if missing: + raise ValueError("missing scripts: %s" % ",".join(missing)) +``` + +## Usage + +These functions are useful for managing temporary files and ensuring all necessary executables or scripts are available when running Ruffus pipelines. They help maintain clean temporary storage and facilitate proper error checking to prevent missing dependencies. + +## Notes + +- Temporary files and directories are created in either a shared or a default temporary location, based on the provided arguments. +- The `get_temp_file` function provides a safe way to generate temporary files, with customisable directory, mode, and encoding options. +- Always handle temporary files and directories appropriately, ensuring they are deleted after use to avoid cluttering the filesystem. 
+ +As you continue to expand the functionality of CGAT-core, make sure to keep this module up to date with new helper functions or improvements. diff --git a/docs/pipeline_modules/overview.md b/docs/pipeline_modules/overview.md new file mode 100644 index 00000000..a3363b92 --- /dev/null +++ b/docs/pipeline_modules/overview.md @@ -0,0 +1,40 @@ +# Pipeline Modules Overview + +CGAT-core provides a comprehensive set of modules to facilitate the creation and management of data processing pipelines. These modules offer various functionalities, from pipeline control and execution to database management and file handling. + +## Available Modules + +1. [Control](control.md): Manages the overall pipeline execution flow. +2. [Database](database.md): Handles database operations and uploads. +3. [Files](files.md): Provides utilities for file management and temporary file handling. +4. [Cluster](cluster.md): Manages job submission and execution on compute clusters. +5. [Execution](execution.md): Handles task execution and logging. +6. [Utils](utils.md): Offers various utility functions for pipeline operations. +7. [Parameters](parameters.md): Manages pipeline parameters and configuration. + +## Integration with Ruffus + +CGAT-core builds upon the Ruffus pipeline library, extending its functionality and providing additional features. It includes the following Ruffus decorators: + +- `@transform` +- `@merge` +- `@split` +- `@originate` +- `@follows` +- `@suffix` + +These decorators can be used to define pipeline tasks and their dependencies. + +## S3 Integration + +CGAT-core also provides S3-aware decorators and functions for seamless integration with AWS S3: + +- `@s3_transform` +- `@s3_merge` +- `@s3_split` +- `@s3_originate` +- `@s3_follows` + +For more information on working with S3, see the [S3 Integration](../s3_integration/s3_pipeline.md) section. 
+ +By leveraging these modules and decorators, you can build powerful, scalable, and efficient data processing pipelines using CGAT-core. \ No newline at end of file diff --git a/docs/pipeline_modules/parameters.md b/docs/pipeline_modules/parameters.md new file mode 100644 index 00000000..53b5f0f4 --- /dev/null +++ b/docs/pipeline_modules/parameters.md @@ -0,0 +1,160 @@ +# Parameter handling for cgatcore Pipelines + +This document provides an overview of the `parameters.py` module used in cgatcore pipelines to handle configuration and parameter management. It includes functions for loading, validating, and handling parameters, as well as managing global configurations. This module is essential for customising and controlling cgatcore pipelines' behaviour, allowing the user to flexibly specify parameters via configuration files, command-line arguments, or hard-coded defaults. + +## Table of Contents +- [Overview](#overview) +- [Global Constants and Initial Setup](#global-constants-and-initial-setup) +- [Functions Overview](#functions-overview) + - [get_logger Function](#get_logger-function) + - [get_parameters Function](#get_parameters-function) + - [config_to_dictionary Function](#config_to_dictionary-function) + - [nested_update Function](#nested_update-function) + - [input_validation Function](#input_validation-function) + - [match_parameter Function](#match_parameter-function) + - [substitute_parameters Function](#substitute_parameters-function) + - [as_list Function](#as_list-function) + - [is_true Function](#is_true-function) + - [check_parameter Function](#check_parameter-function) + - [get_params Function](#get_params-function) + - [get_parameters_as_namedtuple Function](#get_parameters_as_namedtuple-function) + - [get_param_section Function](#get_param_section-function) + +## Overview +The `parameters.py` module is designed to facilitate the management of configuration values for cgatcore pipelines. 
The configuration values are read from a variety of sources, including YAML configuration files, hard-coded dictionaries, and user-specific configuration files. The module also provides tools for parameter interpolation, validation, and nested dictionary handling. + +### Global Constants and Initial Setup +The module begins by defining some constants and setting up paths: +- `SCRIPTS_ROOT_DIR` and `SCRIPTS_SCRIPTS_DIR`: Defines the root directory of scripts used within the pipeline. +- `HAVE_INITIALIZED`: A boolean variable used to indicate if the global parameters have been loaded. +- `PARAMS`: A global dictionary for parameter interpolation. This dictionary can be switched between `defaultdict` and standard dictionary behaviour to facilitate handling missing parameters. + +### Functions Overview + +#### get_logger Function +```python +def get_logger(): + return logging.getLogger("cgatcore.pipeline") +``` +This function returns a logger instance for use in the pipeline, allowing consistent logging across the module. + +#### get_parameters Function +```python +def get_parameters(filenames=None, defaults=None, site_ini=True, user=True, only_import=None): + # Function code... +``` +The `get_parameters` function reads one or more configuration files to build the global `PARAMS` dictionary. It can read from various configuration files (e.g., `pipeline.yml`, `cgat.yml`), and merge configurations from user, site-specific, and default sources. + +- **Arguments**: + - `filenames (list or str)`: A list of filenames for configuration files. + - `defaults (dict)`: A dictionary of default values. + - `site_ini (bool)`: If `True`, configuration files from `/etc/cgat/pipeline.yml` are also read. + - `user (bool)`: If `True`, reads configuration from a user's home directory. + - `only_import (bool)`: If set, the parameter dictionary will default to a collection type. + +- **Returns**: + - `dict`: A global configuration dictionary (`PARAMS`). 
+ +#### config_to_dictionary Function +```python +def config_to_dictionary(config): + # Function code... +``` +This function converts the contents of a `ConfigParser` object into a dictionary. Section names are prefixed with an underscore for clarity. + +- **Returns**: + - `dict`: A dictionary containing all configuration values, with nested sections appropriately handled. + +#### nested_update Function +```python +def nested_update(old, new): + # Function code... +``` +The `nested_update` function updates nested dictionaries. If both `old[x]` and `new[x]` are dictionaries, they are recursively merged; otherwise, `old[x]` is updated with `new[x]`. + +#### input_validation Function +```python +def input_validation(PARAMS, pipeline_script=""): + # Function code... +``` +The `input_validation` function inspects the `PARAMS` dictionary to check for problematic values, such as missing or placeholder inputs. + +- **Validations**: + - Checks for missing parameters (`?` placeholders). + - Ensures that all required tools are available on the system PATH. + - Verifies input file paths are readable. + +#### match_parameter Function +```python +def match_parameter(param): + # Function code... +``` +This function attempts to find an exact or prefix match in the global `PARAMS` dictionary for the given parameter. If no match is found, a `KeyError` is raised. + +- **Returns**: + - `str`: The full name of the parameter if found. + +#### substitute_parameters Function +```python +def substitute_parameters(**kwargs): + # Function code... +``` +This function returns a dictionary of parameter values for a specific task. It substitutes global parameter values and task-specific configuration values. + +- **Example**: + - If `PARAMS` has `"sample1.bam.gz_tophat_threads": 6` and `outfile = "sample1.bam.gz"`, it returns `{ "tophat_threads": 6 }`. + +#### as_list Function +```python +def as_list(value): + # Function code... +``` +This function converts a given value to a list. 
If the value is a comma-separated string, it splits the string into a list. + +- **Returns**: + - `list`: The input value as a list. + +#### is_true Function +```python +def is_true(param, **kwargs): + # Function code... +``` +This function checks if a parameter has a truthy value. Values like `0`, `''`, `false`, and `False` are considered as `False`. + +- **Returns**: + - `bool`: Whether the parameter is truthy or not. + +#### check_parameter Function +```python +def check_parameter(param): + # Function code... +``` +The `check_parameter` function checks if the given parameter is set in the global `PARAMS` dictionary. If it is not set, a `ValueError` is raised. + +#### get_params Function +```python +def get_params(): + # Function code... +``` +This function returns a handle to the global `PARAMS` dictionary. + +#### get_parameters_as_namedtuple Function +```python +def get_parameters_as_namedtuple(*args, **kwargs): + # Function code... +``` +The `get_parameters_as_namedtuple` function returns the `PARAMS` dictionary as a namedtuple, allowing for more convenient and attribute-based access to parameters. + +#### get_param_section Function +```python +def get_param_section(section): + # Function code... +``` +This function returns all configuration values within a specific section of the `PARAMS` dictionary. Sections are defined by common prefixes. + +- **Returns**: + - `list`: A list of tuples containing section-specific parameters. + +### Summary +The `parameters.py` module is designed to facilitate flexible and powerful parameter management for cgatcore pipelines. The functions provided allow for seamless integration of configuration from multiple sources, validation, and management of parameters, while also offering tools for introspection and nested dictionary handling. These utilities help create more robust and maintainable cgatcore pipelines, allowing for greater customisation and scalability. 
+ diff --git a/docs/pipeline_modules/run_function.md b/docs/pipeline_modules/run_function.md new file mode 100644 index 00000000..f7abea20 --- /dev/null +++ b/docs/pipeline_modules/run_function.md @@ -0,0 +1,99 @@ +# run_function.py - Documentation + +This document provides an overview of the `run_function.py` script, which is used to execute a function from a specified Python module remotely on a cluster. This utility allows functions from Python modules to be executed with user-defined parameters, input files, and output files, which is useful for running scripts as part of a computational pipeline. + +## Table of Contents +- [Purpose](#purpose) +- [Usage](#usage) +- [Command Line Options](#command-line-options) +- [Workflow](#workflow) + - [Parsing Options](#parsing-options) + - [Module Importing](#module-importing) + - [Function Invocation](#function-invocation) +- [Examples](#examples) +- [Error Handling](#error-handling) + +## Purpose + +The `run_function.py` script allows the execution of a specified function from a Python module with given input and output files, and other parameters. It can be used within a cluster environment to facilitate the remote execution of Python functions for parallel processing tasks or batch jobs. + +## Usage + +The script is typically used in conjunction with other pipeline tools. Here is an example: + +```python +statement = """python %(scriptsdir)s/run_function.py \ + -p infile,outfile,additional_param1 \ + -m modulefile \ + -f function""" + +P.run() +``` + +If the module to be used is within the `$PYTHONPATH`, it can be directly named (e.g., "pipeline" would refer to `pipeline.py`). The script is mainly tested for cases involving single input/output pairs. + +## Command Line Options + +- `-p, --params, --args`: Comma-separated list of additional parameter strings to be passed to the function. +- `-m, --module`: The full path to the module file from which the function will be imported. +- `-i, --input`: Input filename(s). 
Can be specified multiple times for multiple inputs. +- `-o, --output-section`: Output filename(s). Can be specified multiple times for multiple outputs. +- `-f, --function`: The name of the function to be executed from the specified module. + +## Workflow + +The workflow of the `run_function.py` script includes: + +### Parsing Options + +The script begins by parsing command-line arguments using `OptionParser`. The user must specify the module file and the function to run, along with any input and output files or additional parameters. + +- **Mandatory Parameters**: + - Module (`-m`) and Function (`-f`): Both must be specified. +- **Optional Parameters**: + - Input and output files (`-i`, `-o`) are optional depending on the function requirements. + - Additional parameters (`-p`) are optional but can be specified to provide custom arguments. + +### Module Importing + +After parsing the arguments, the script imports the specified module using `importlib`. This is necessary for dynamically loading the module that contains the function to be executed. + +- **Adding Path**: If a full path is provided, the script appends that path to `sys.path` to ensure that Python can locate the module. +- **Module Import**: The `importlib.import_module()` function is used to import the module by its basename. The script also handles cases where the `.py` file extension is included. +- **Function Mapping**: The specified function is retrieved from the module using `getattr()`. If the function cannot be found, an error is raised, indicating the available functions within the module. + +### Function Invocation + +The function is invoked with the appropriate arguments, depending on which input, output, and parameter combinations are specified. + +- **Handling Inputs and Outputs**: + - The script manages cases where there are multiple or single input/output files, converting them into the expected formats for the function. 
+ - The `infiles` and `outfiles` arguments are handled to ensure they are passed appropriately, either as lists or as single file paths. +- **Parameter Parsing**: If additional parameters are provided, they are split into a list and passed as arguments to the function. +- **Function Call**: Based on the presence of inputs, outputs, and parameters, the function is called with different argument combinations. + +## Examples + +1. **Basic Function Execution** + ```sh + python run_function.py -m mymodule.py -f my_function -i input.txt -o output.txt -p param1,param2 + ``` + In this example, `my_function` from `mymodule.py` is executed with `input.txt` as the input file, `output.txt` as the output file, and `param1` and `param2` as additional parameters. + +2. **Executing a Function without Input/Output** + ```sh + python run_function.py -m utilities.py -f simple_function -p param1,param2 + ``` + This runs `simple_function` from `utilities.py` with the specified parameters, but without any input or output files. + +## Error Handling + +- **Missing Module or Function**: If either the module (`-m`) or function (`-f`) options are missing, the script raises a `ValueError`, indicating that both must be provided. +- **Import Errors**: The script checks if the module exists at the specified location, and if the function is present within the module. It provides debug information (`sys.path`) to help locate import issues. +- **Attribute Errors**: If the specified function is not found in the module, an `AttributeError` is raised, and the script lists all available functions within the module. +- **Invalid Argument Combinations**: If the expected combination of input, output, and parameters is not provided, the script raises a `ValueError`, clarifying what is expected. + +## Conclusion + +The `run_function.py` script is a versatile tool for remotely executing functions from Python modules on a cluster. 
It supports input/output file handling and passing of additional parameters, making it suitable for use in complex computational pipelines. With its flexible argument parsing and dynamic module importing, it provides an easy way to run Python functions in distributed environments, aiding in the modularisation and parallelisation of tasks. + diff --git a/docs/pipeline_modules/utils.md b/docs/pipeline_modules/utils.md new file mode 100644 index 00000000..773cfa01 --- /dev/null +++ b/docs/pipeline_modules/utils.md @@ -0,0 +1,158 @@ +# Utilities for cgatcore Pipelines - Documentation + +This document provides an overview of the utility functions and classes defined in `utils.py` for use in cgatcore pipelines. The utilities include functions for context inspection, testing mode checks, and decorators for managing documentation strings. These functions are particularly helpful for debugging and providing context information during the execution of a cgatcore pipeline. + +## Table of Contents +- [EmptyRunner Class](#emptyrunner-class) +- [is_test Function](#is_test-function) +- [get_caller_locals Function](#get_caller_locals-function) +- [get_caller Function](#get_caller-function) +- [get_calling_function Function](#get_calling_function-function) +- [add_doc Decorator](#add_doc-decorator) + +### EmptyRunner Class + +```python +class EmptyRunner: + def __init__(self, name): + self.__name__ = name + + def __call__(self, *args, **kwargs): + pass +``` + +The `EmptyRunner` class is a simple utility that creates an object with a callable interface that does nothing when called. It is primarily useful as a placeholder in situations where a no-operation handler is required. + +- **Attributes**: + - `name`: A name assigned to the instance for identification purposes. + +- **Methods**: + - `__call__(self, *args, **kwargs)`: This method is a no-operation handler. 
+ +### is_test Function + +```python +def is_test(): + return "--is-test" in sys.argv +``` + +The `is_test()` function checks whether the pipeline is being run in testing mode. + +- **Returns**: + - `bool`: Returns `True` if `--is-test` is passed as a command-line argument; otherwise, `False`. + +This function is useful for conditionally enabling or disabling testing-specific behaviour. + +### get_caller_locals Function + +```python +def get_caller_locals(decorators=0): + f = sys._getframe(2 + decorators) + args = inspect.getargvalues(f) + return args.locals +``` + +The `get_caller_locals()` function returns the local variables of the calling function. This is useful for debugging or inspecting the state of the caller. + +- **Parameters**: + - `decorators (int)`: The number of decorator contexts to go up to find the caller of interest. Default is `0`. + +- **Returns**: + - `dict`: A dictionary of local variables defined in the caller's context. + +### get_caller Function + +```python +def get_caller(decorators=0): + frm = inspect.stack() + return inspect.getmodule(frm[2 + decorators].frame) +``` + +The `get_caller()` function returns the calling class/module. This helps identify the caller and can be helpful for logging and debugging. + +- **Parameters**: + - `decorators (int)`: The number of decorator contexts to go up to find the caller of interest. Default is `0`. + +- **Returns**: + - `module`: The calling module or `None` if not found. + +### get_calling_function Function + +```python +def get_calling_function(decorators=0): + frm = inspect.stack() + return frm[2 + decorators].function +``` + +The `get_calling_function()` function returns the name of the calling function. This is useful for introspection and debugging to know which function invoked the current function. + +- **Parameters**: + - `decorators (int)`: The number of decorator contexts to go up to find the caller of interest. Default is `0`. 
+ + - **Returns**: + - `str`: The name of the calling function, or `None` if not found. + +### add_doc Decorator + +```python +def add_doc(value, replace=False): + def _doc(func): + @wraps(func) + def wrapped_func(*args, **kwargs): + return func(*args, **kwargs) + + if func.__doc__: + lines = value.__doc__.split("\n") + for x, line in enumerate(lines): + if line.strip() == "": + break + if not replace: + lines.insert(x + 1, " " * 4 + func.__doc__) + wrapped_func.__doc__ = "\n".join(lines) + else: + wrapped_func.__doc__ = value.__doc__ + else: + wrapped_func.__doc__ = value.__doc__ + + return wrapped_func + + return _doc +``` + +The `add_doc()` function is a decorator that adds or replaces the docstring of the decorated function. + +- **Parameters**: + - `value`: The function (or any other object with a `__doc__` attribute) whose documentation will be used. + - `replace (bool)`: If `True`, the decorated function's documentation is replaced entirely by `value`'s docstring. Otherwise, `value`'s docstring is used as the base and the decorated function's existing docstring is inserted after its first paragraph. Default is `False`. + +- **Returns**: + - A decorated function with the modified or updated docstring. + +This utility is helpful for ensuring that custom decorators or utility functions carry informative and up-to-date documentation, which can help when generating automated docs or maintaining code. + +### General Notes + +- The functions use Python's `inspect` and `sys` libraries for introspection and manipulation of stack frames, which can be useful for debugging complex pipelines. +- The `logging` module is used for error handling to ensure that potential issues (e.g., accessing out-of-range stack frames) are logged rather than silently ignored. 
+ +### Example Usage + +```python +@add_doc(is_test) +def example_function(): + """This is an example function.""" + print("Example function running.") + +if __name__ == "__main__": + if is_test(): + print("Running in testing mode.") + else: + example_function() +``` + +In this example, `example_function()` is decorated with the `add_doc()` decorator, using the documentation from `is_test()`. This effectively appends the `is_test()` docstring to `example_function()`. + +### Conclusion + +These utilities provide helpful functionality for cgatcore pipelines by allowing developers to inspect caller contexts, easily handle testing conditions, and dynamically update function documentation. The use of the `inspect` library allows access to stack frames, making these utilities especially useful for debugging and dynamic analysis during pipeline execution. + diff --git a/docs/project_info/Contributing.rst b/docs/project_info/Contributing.rst deleted file mode 100644 index 607831ef..00000000 --- a/docs/project_info/Contributing.rst +++ /dev/null @@ -1,39 +0,0 @@ -.. project_info-Contributing - -========== -Developers -========== - -The following individuals are the main developers of the cgatcore - -Andreas Heger - -`Adam Cribbs `_ - -Sebastian Luna Valero - -Hania Pavlou - -David Sims - -Charlotte George - -Tom Smith - -Ian Sudbery - -Jakub Scaber - -Mike Morgan - -Katy Brown - -Nick Ilott - -Jethro Johnson - -Katherine Fawcett - -Steven Sansom - -Antonio Berlanga diff --git a/docs/project_info/FAQ.rst b/docs/project_info/FAQ.rst deleted file mode 100644 index fa0af778..00000000 --- a/docs/project_info/FAQ.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. _FAQ: - -==== -FAQs -==== - -As our workflow develops we will add frequently asked questions here. 
- -In the meantime please add issues to the `github page `_ diff --git a/docs/project_info/Licence.rst b/docs/project_info/Licence.rst deleted file mode 100644 index 423a15ee..00000000 --- a/docs/project_info/Licence.rst +++ /dev/null @@ -1,36 +0,0 @@ -.. project_info-Licence - - -======= -Licence -======= - -CGAT-core is an open-source project and we have made the cgat-developers repositor available under the open source permissive free MIT software licence, allowing free and full use of the code for both commercial and non-commercial purposes. A copy of the licence is shown below: - -MIT License ------------ - -Copyright (c) 2018 cgat-developers - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -Contributions by @andreashegergenomics are provided under the -MIT licence and are Copyright (c) 2018 GENOMICS plc. 
diff --git a/docs/project_info/citations.md b/docs/project_info/citations.md new file mode 100644 index 00000000..20ed316e --- /dev/null +++ b/docs/project_info/citations.md @@ -0,0 +1,74 @@ +# Citing and Citations + +`cgatcore` has been developed over the past 10 years and has been used in a number of previously published scientific articles. This page provides information on how to cite `cgatcore` and highlights some of the key publications where it has been used. + +## Citing cgatcore + +When using `cgatcore` for a publication, **please cite the following article** in your paper: + +**ADD CITATION HERE** + +## More references + +The following is a list of publications that have used tools developed by CGAT developers. Note that this list is not exhaustive: + +- **A ChIP-seq defined genome-wide map of vitamin D receptor binding: associations with disease and evolution** + SV Ramagopalan, A Heger, AJ Berlanga, NJ Maugeri, MR Lincoln, ... Genome research 20 (10), 1352-1360 (2010) + +- **Sequencing depth and coverage: key considerations in genomic analyses** + D Sims, I Sudbery, NE Ilott, A Heger, CP Ponting Nature Reviews Genetics 15 (2), 121 (2014) + +- **KDM2B links the Polycomb Repressive Complex 1 (PRC1) to recognition of CpG islands** + AM Farcas, NP Blackledge, I Sudbery, HK Long, JF McGouran, NR Rose, ... eLife (2012) + +- **Targeting polycomb to pericentric heterochromatin in embryonic stem cells reveals a role for H2AK119u1 in PRC2 recruitment** + S Cooper, M Dienstbier, R Hassan, L Schermelleh, J Sharif, ... Cell reports 7 (5), 1456-1470 (2014) + +- **Long non-coding RNAs and enhancer RNAs regulate the lipopolysaccharide-induced inflammatory response in human monocytes** + NE Ilott, JA Heward, B Roux, E Tsitsiou, PS Fenwick, L Lenzi, I Goodhead, ... 
Nature communications 5, 3979 (2014) + +- **Population and single-cell genomics reveal the Aire dependency, relief from Polycomb silencing, and distribution of self-antigen expression in thymic epithelia** + SN Sansom, N Shikama-Dorn, S Zhanybekova, G Nusspaumer, ... Genome research 24 (12), 1918-1931 (2014) + +- **Epigenetic conservation at gene regulatory elements revealed by non-methylated DNA profiling in seven vertebrates** + HK Long, D Sims, A Heger, NP Blackledge, C Kutter, ML Wright, ... eLife 2 (2013) + +- **The long non‐coding RNA Paupar regulates the expression of both local and distal genes** + KW Vance, SN Sansom, S Lee, V Chalei, L Kong, SE Cooper, PL Oliver, ... The EMBO journal 33 (4), 296-311 (2014) + +- **A genome-wide association study implicates the APOE locus in nonpathological cognitive ageing** + G Davies, SE Harris, CA Reynolds, A Payton, HM Knight, DC Liewald, ... Molecular Psychiatry 19 (1), 76 (2014) + +- **Predicting long non-coding RNAs using RNA sequencing** + NE Ilott, CP Ponting Methods 63 (1), 50-59 (2013) + +- **Next-generation sequencing of advanced prostate cancer treated with androgen-deprivation therapy** + P Rajan, IM Sudbery, MEM Villasevil, E Mui, J Fleming, M Davis, I Ahmad, ... European urology 66 (1), 32-39 (2014) + +- **The long non-coding RNA Dali is an epigenetic regulator of neural differentiation** + V Chalei, SN Sansom, L Kong, S Lee, JF Montiel, KW Vance, CP Ponting eLife 3 (2014) + +- **GAT: a simulation framework for testing the association of genomic intervals** + A Heger, C Webber, M Goodson, CP Ponting, G Lunter Bioinformatics 29 (16), 2046-2048 (2013) + +- **De novo point mutations in patients diagnosed with ataxic cerebral palsy** + R Parolin Schnekenberg, EM Perkins, JW Miller, WIL Davies, ... Brain 138 (7), 1817-1832 (2015) + +- **SPG7 mutations are a common cause of undiagnosed ataxia** + G Pfeffer, A Pyle, H Griffin, J Miller, V Wilson, L Turnbull, K Fawcett, ... 
Neurology 84 (11), 1174-1176 (2015) + +- **CDK9 inhibitors define elongation checkpoints at both ends of RNA polymerase II–transcribed genes** + C Laitem, J Zaborowska, NF Isa, J Kufs, M Dienstbier, S Murphy Nature Structural and Molecular Biology 22 (5), 396 (2015) + +- **IRF5: RelA interaction targets inflammatory genes in macrophages** + DG Saliba, A Heger, HL Eames, S Oikonomopoulos, A Teixeira, K Blazek, ... Cell reports 8 (5), 1308-1317 (2014) + +- **UMI-tools: modeling sequencing errors in Unique Molecular Identifiers to improve quantification accuracy** + T Smith, A Heger, I Sudbery Genome research 27 (3), 491-499 (2017) + +- **Long noncoding RNAs in B-cell development and activation** + TF Brazão, JS Johnson, J Müller, A Heger, CP Ponting, VLJ Tybulewicz Blood 128 (7), e10-e19 (2016) + +- **CGAT: computational genomics analysis toolkit** + D Sims, NE Ilott, SN Sansom, IM Sudbery, JS Johnson, KA Fawcett, ... Bioinformatics 30 (9), 1290-1291 (2014) + diff --git a/docs/project_info/citations.rst b/docs/project_info/citations.rst deleted file mode 100644 index 31111f78..00000000 --- a/docs/project_info/citations.rst +++ /dev/null @@ -1,59 +0,0 @@ -.. _project_info-citations: - -==================== -Citing and Citations -==================== - -cgatcore has been developed over the past 10 years and as such has been used in a number of previously published scientific artciles. - -Citing cgatcore ---------------- - -When using cgatcore for a publication, **please cite the following article** in you paper: - -ADD CITATION HERE - -More references ---------------- - -A number of publications that have used CGAT-developer tools are listed below, however this list is not an exhastive list: - -**A ChIP-seq defined genome-wide map of vitamin D receptor binding: associations with disease and evolution** SV Ramagopalan, A Heger, AJ Berlanga, NJ Maugeri, MR Lincoln, ... 
Genome research 20 (10), 1352-1360 2010 - -**Sequencing depth and coverage: key considerations in genomic analyses** D Sims, I Sudbery, NE Ilott, A Heger, CP Ponting Nature Reviews Genetics 15 (2), 121 2014 - -**KDM2B links the Polycomb Repressive Complex 1 (PRC1) to recognition of CpG islands** AM Farcas, NP Blackledge, I Sudbery, HK Long, JF McGouran, NR Rose, ... elife 2012 - -**Targeting polycomb to pericentric heterochromatin in embryonic stem cells reveals a role for H2AK119u1 in PRC2 recruitment** S Cooper, M Dienstbier, R Hassan, L Schermelleh, J Sharif, ... Cell reports 7 (5), 1456-1470 2014 - -**Long non-coding RNAs and enhancer RNAs regulate the lipopolysaccharide-induced inflammatory response in human monocytes** NE Ilott, JA Heward, B Roux, E Tsitsiou, PS Fenwick, L Lenzi, I Goodhead, ... Nature communications 5, 3979 2014 - -**Population and single-cell genomics reveal the Aire dependency, relief from Polycomb silencing, and distribution of self-antigen expression in thymic epithelia** SN Sansom, N Shikama-Dorn, S Zhanybekova, G Nusspaumer, ... Genome research 24 (12), 1918-1931 2014 - -**Epigenetic conservation at gene regulatory elements revealed by non-methylated DNA profiling in seven vertebrates** HK Long, D Sims, A Heger, NP Blackledge, C Kutter, ML Wright, ... Elife 2 2013 - -**The long non‐coding RNA Paupar regulates the expression of both local and distal genes** KW Vance, SN Sansom, S Lee, V Chalei, L Kong, SE Cooper, PL Oliver, ... The EMBO journal 33 (4), 296-311 2014 - -**A genome-wide association study implicates the APOE locus in nonpathological cognitive ageing** G Davies, SE Harris, CA Reynolds, A Payton, HM Knight, DC Liewald, ... 
Molecular Psychiatry 19 (1), 76 2014 - -**Predicting long non-coding RNAs using RNA sequencing** NE Ilott, CP Ponting Methods 63 (1), 50-59 2013 - -**Next-generation sequencing of advanced prostate cancer treated with androgen-deprivation therapy** P Rajan, IM Sudbery, MEM Villasevil, E Mui, J Fleming, M Davis, I Ahmad, ... European urology 66 (1), 32-39 2014 - -**The long non-coding RNA Dali is an epigenetic regulator of neural differentiation** V Chalei, SN Sansom, L Kong, S Lee, JF Montiel, KW Vance, CP Ponting Elife 3 2014 - -**GAT: a simulation framework for testing the association of genomic intervals** A Heger, C Webber, M Goodson, CP Ponting, G Lunter Bioinformatics 29 (16), 2046-2048 2013 - -**De novo point mutations in patients diagnosed with ataxic cerebral palsy** R Parolin Schnekenberg, EM Perkins, JW Miller, WIL Davies, ... Brain 138 (7), 1817-1832 2015 - -**SPG7 mutations are a common cause of undiagnosed ataxia** G Pfeffer, A Pyle, H Griffin, J Miller, V Wilson, L Turnbull, K Fawcett, ... Neurology 84 (11), 1174-1176 2015 - -**CDK9 inhibitors define elongation checkpoints at both ends of RNA polymerase II–transcribed genes** C Laitem, J Zaborowska, NF Isa, J Kufs, M Dienstbier, S Murphy Nature Structural and Molecular Biology 22 (5), 396 2015 - -**IRF5: RelA interaction targets inflammatory genes in macrophages** DG Saliba, A Heger, HL Eames, S Oikonomopoulos, A Teixeira, K Blazek, ... Cell reports 8 (5), 1308-1317 2014 - -**UMI-tools: modeling sequencing errors in Unique Molecular Identifiers to improve quantification accuracy** T Smith, A Heger, I Sudbery Genome research 27 (3), 491-499 2017 - -**Long noncoding RNAs in B-cell development and activation** TF Brazão, JS Johnson, J Müller, A Heger, CP Ponting, VLJ Tybulewicz Blood 128 (7), e10-e19 2016 - -**CGAT: computational genomics analysis toolkit** D Sims, NE Ilott, SN Sansom, IM Sudbery, JS Johnson, KA Fawcett, ... 
Bioinformatics 30 (9), 1290-1291 2014 diff --git a/docs/project_info/contributing.md b/docs/project_info/contributing.md new file mode 100644 index 00000000..6becf594 --- /dev/null +++ b/docs/project_info/contributing.md @@ -0,0 +1,21 @@ +# Developers + +The following individuals are the main developers of `cgatcore`: + +- Andreas Heger +- [Adam Cribbs](https://cribbslab.co.uk/) +- Sebastian Luna Valero +- Hania Pavlou +- David Sims +- Charlotte George +- Tom Smith +- Ian Sudbery +- Jakub Scaber +- Mike Morgan +- Katy Brown +- Nick Ilott +- Jethro Johnson +- Katherine Fawcett +- Steven Sansom +- Antonio Berlanga + diff --git a/docs/project_info/faq.md b/docs/project_info/faq.md new file mode 100644 index 00000000..0460897f --- /dev/null +++ b/docs/project_info/faq.md @@ -0,0 +1,5 @@ +# FAQs + +As our workflow develops, we will add frequently asked questions here. + +In the meantime, please add issues to the [GitHub page](https://github.com/cgat-developers/cgat-core/issues). \ No newline at end of file diff --git a/docs/project_info/how_to_contribute.md b/docs/project_info/how_to_contribute.md new file mode 100644 index 00000000..0d902464 --- /dev/null +++ b/docs/project_info/how_to_contribute.md @@ -0,0 +1,50 @@ +# Contributing + +Contributions are very much encouraged, and we greatly appreciate the time and effort people make to help maintain and support our tools. Every contribution helps, so please don't be shy—we don't bite. + +You can contribute to the development of our software in a number of different ways: + +## Reporting bug fixes + +Bugs are annoying, and reporting them helps us to fix issues quickly. + +Bugs can be reported using the issue section on [GitHub](https://github.com/cgat-developers/cgat-core/issues). + +When reporting issues, please include: + +- Steps in your code/command that led to the bug so it can be reproduced. +- The error message from the log output. 
+- Any other helpful information, such as the system/cluster engine or version details. + +## Proposing a new feature/enhancement + +If you wish to contribute a new feature to the CGAT-core repository, the best way is to raise this as an issue and label it as an enhancement on [GitHub](https://github.com/cgat-developers/cgat-core/issues). + +When proposing a new feature, please: + +- Explain how your enhancement will work. +- Describe, as best as you can, how you plan to implement it. +- If you don't feel you have the necessary skills to implement this on your own, please mention it—we'll try our best to help (or even implement it for you). However, please note that this is community-developed software, and our volunteers have other jobs, so we may not be able to work as quickly as you might hope. + +## Pull Request Guidelines + +Why not contribute to our project? It's a great way of making the project better, and your help is always welcome. We follow the fork/pull request [model](https://guides.github.com/activities/forking/). To update our documentation, fix bugs, or add enhancements, you'll need to create a pull request through GitHub. + +To create a pull request, follow these steps: + +1. Create a GitHub account. +2. Create a personal fork of the project on GitHub. +3. Clone the fork onto your local machine. Your remote repo on GitHub is called `origin`. +4. Add the original repository as a remote called `upstream`. +5. If you made the fork a while ago, make sure you run `git pull upstream` to keep your repository up to date. +6. Create a new branch to work on! We usually name our branches with capital initials followed by a dash and something unique. For example: `git checkout -b AC-new_doc`. +7. Implement your fix/enhancement and make sure your code is effectively documented. +8. Our code has tests, which are run when a pull request is submitted. You can also run the tests beforehand—many of them are in the `tests/` directory. 
To run all tests, use `pytest --pep8 tests`. +9. Add or modify the documentation in the `docs/` directory. +10. Squash all of your commits into a single commit using Git's [interactive rebase](https://help.github.com/articles/about-git-rebase/). +11. Push your branch to your fork on GitHub: `git push origin`. +12. From your fork on GitHub, open a pull request in the correct branch. +13. Someone will review your changes, and they may suggest modifications or approve them. +14. Once the pull request is approved and merged, pull the changes from `upstream` to your local repo and delete your branch. + +> **Note**: Always write your commit messages in the present tense. Your commit messages should describe what the commit does to the code, not what you did to the code. \ No newline at end of file diff --git a/docs/project_info/how_to_contribute.rst b/docs/project_info/how_to_contribute.rst deleted file mode 100644 index cc0ff263..00000000 --- a/docs/project_info/how_to_contribute.rst +++ /dev/null @@ -1,93 +0,0 @@ -.. _contribute: - -============ -Contributing -============ - -Contributions are very much encouraged and we greatly appreciate the time and effort people make to help maintain and support out tools. Every contribution helps, please dont be shy, we dont bite. - -You can contribute to the development of our software in a number of different ways: - -------------------- -Reporting bug fixes -------------------- - -Bugs are annoying and reporting them will help us to fix your issue. - -Bugs can be reported using the issue section in `github `_ - -When reporting issues, please include: - -- Steps in your code/command that led to the bug so it can be reproduced. -- The error message from the log message. -- Any other helpful info, such as the system/cluster engine or version information. 
- ------------------------------------ -Proposing a new feature/enhancement ------------------------------------ - -If you wish to contribute a new feature to the CGAT-core repository then the best way is to raise this as an issue and label it as an enhancement in `github `_ - -If you propose a new feature then please: - -- Explain how your enhancement will work -- Describe as best as you can how you plan to implement this. -- If you dont think you have the necessary skills to implement this on your own then please say and we will try our best to help (or implement this for you). However, please be aware that this is a community developed software and our volunteers have other jobs. Therefore, we may not be able to work as fast as you hoped. - ------------------------ -Pull Request Guidelines ------------------------ - -Why not contribute to our project, its a great way of making the project better, your help is always welcome. We follow the fork/pull request `model `_. To update our documentation, fix bugs or add extra enhancements you will need to create a pull request through github. - -To create a pull request perform these steps: - -1. Create a github account. - -2. Create a personal fork of the project on github. - -3. Clone the fork onto your local machine. Your remote repo on github - is called ``origin``. - -4. Add the orginal repository as a remote called ``upstream``. - -5. If you made the fork a while ago then please make sure you ``git - pull upstream`` to keep your repository up to date - -6. Create a new branch to work on! We usually name our branches with - capital first and last followed by a dash and something unique. For - example: ``git checkout -b AC-new_doc``. - -7. Impliment your fix/enhancement and make sure your code is - effectively documented. - -8. 
Our code has tests and these will be ran when a pull request is - submitted, however you can run our tests before you make the pull - request, we have a number written in the ``tests/`` directory. To - run all tests, type ``pytest --pep8 tests`` - -9. Add or change our documentation in the ``docs/`` directory. - -10. Squash all of your commits into a single commit with gits - `interactive rebase - `_. - -11. Push your branch to your fork on github ``git push origin`` - -12. From your fork in github.com, open a pull request in the correct - branch. - -13. ... This is where someone will review your changes and modify them - or approve them ... - -14. Once the pull request is approved and merged you can pull the - changes from the ``upstream`` to your local repo and delete your - branch. - -.. note:: - - Always write your commit messages in the present tense. Your commit - messages should describe what the commit does to the code and not - what you did to the code. - - diff --git a/docs/project_info/license.md b/docs/project_info/license.md new file mode 100644 index 00000000..f7c62a85 --- /dev/null +++ b/docs/project_info/license.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 cgat-developers + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/docs/release.rst b/docs/release.rst deleted file mode 100644 index cdcf6146..00000000 --- a/docs/release.rst +++ /dev/null @@ -1,24 +0,0 @@ -============= -Release Notes -============= - -Notes on each release are below. - -Release 0.4.0 -============= - -* contributions by Genomics PLC ; https://github.com/cgat-developers/cgat-core/pull/1 -* added installation script and conda environments ; https://github.com/cgat-developers/cgat-core/pull/2 -* make pytest pass on both Linux and OSX ; https://github.com/cgat-developers/cgat-core/pull/4 -* snakefood (py2) does not parse Control.py correctly ; https://github.com/cgat-developers/cgat-core/pull/15 -* do not run P.load in the cluster ; https://github.com/cgat-developers/cgat-core/pull/22 -* fix os.path.relpath cache function ; https://github.com/cgat-developers/cgat-core/pull/24 ; https://github.com/cgat-developers/cgat-core/pull/28 -* migrating https://github.com/CGATOxford/CGATPipelines/pull/406 ; https://github.com/cgat-developers/cgat-core/pull/26 ; https://github.com/cgat-developers/cgat-core/pull/31 -* migrating https://github.com/CGATOxford/CGATPipelines/pull/411 ; https://github.com/cgat-developers/cgat-core/pull/16 -* make enforcing memory restrictions via ulimit optional ; https://github.com/cgat-developers/cgat-core/pull/27 -* change printconfig to work with yml files ; https://github.com/cgat-developers/cgat-core/pull/32 -* changes to work only with yml configuration files (ini files are no longer in use) ; https://github.com/cgat-developers/cgat-core/pull/25 -* update conda from 4.3 to 4.5 (solving "CXXABI_1.3.9 not found" error ; https://github.com/ContinuumIO/anaconda-issues/issues/5191) ; 
https://github.com/cgat-developers/cgat-core/commit/b940e3e1e10e29ad65ce00c346881e05584bfc9b -* migrating https://github.com/CGATOxford/CGATPipelines/pull/399 ; migrating https://github.com/cgat-developers/cgat-core/pull/34 -* new way of activating conda environments ; https://github.com/cgat-developers/cgat-core/pull/35 - diff --git a/docs/remote/Azure.rst b/docs/remote/Azure.rst deleted file mode 100644 index aa45d2c0..00000000 --- a/docs/remote/Azure.rst +++ /dev/null @@ -1,108 +0,0 @@ -.. _Azure: - -======================= -Microsoft Azure Storage -======================= - -This section describes how to interact with Microsoft Azure cloud storage. In order to interact with the -Azure cloud storage resource we use the `azure `_ SDK. - -Like all of our remote connection functionality, this is a work in progress and we are currently in the -process of adding extra features. If you have bug reports or comments then please raise them as an issue -on `github `_ - - -Setting up credentials ----------------------- - -Unlike other remote access providers, the credentials are set up by passing them directly into the initial class -as variables as follows:: - - Azure = AzureRemoteObject(account_name = "firstaccount", account_key = "jbiuebcjubncjklncjkln........") - -These access keys can be found in the Azure portal and locating the storage account. In the settings of the storage account -there is a selection "Access keys". The account name and access keys are listed here. - -Download from Azure -------------------- - -Using remote files with Azure can be achieved easily by using `download`, `upload` and `delete_file` functions that are written into a RemoteClass. 
- -Firstly you will need to initiate the class as follows:: - - from cgatcore.remote.azure import * - Azure = AzureRemoteObject(account_name = "firstaccount", account_key = "jbiuebcjubncjklncjkln........") - -In order to download a file and use it within the decorator you can follows the example:: - - @transform(Azure.download('test-azure',"pipeline.yml", "./pipeline.yml"), - regex("(.*)\.(.*)"), - r"\1.counts") - -This will download the file `pipeline.yml` in the Azure container `test-azure` locally to `./pipeline.yml` -and it will be picked up by the decoratory function as normal. - -Upload to Azure ---------------- - -In order to upload files to Azure you simply need to run:: - - Azure.upload('test-azure',"pipeline2.yml", "./pipeline.yml") - -This will upload to the `test-azure` Azure container the `./pipeline.yml` file and it will be saved as -`pipeline2.yml` in that bucket. - -Delete file from Azure ----------------------- - -In order to delete a file from the Azure container then you simply run:: - - Azure.delete_file('test-azure',"pipeline2.yml") - -This will delete the `pipeline2.yml` file from the `test-azure` container. 
- - -Functional example ------------------- - -As a simple example, the following one function pipeline demonstrates the way you can interact with AWS S3:: - - from ruffus import * - import sys - import os - import cgatcore.experiment as E - from cgatcore import pipeline as P - from cgatcore.remote.azure import * - - # load options from the config file - PARAMS = P.get_parameters( - ["%s/pipeline.yml" % os.path.splitext(__file__)[0], - "../pipeline.yml", - "pipeline.yml"]) - - Azure = AzureRemoteObject(account_name = "firstaccount", account_key = "jbiuebcjubncjklncjkln........") - - - @transform(Azure.download('test-azure',"pipeline.yml", "./pipeline.yml"), - regex("(.*)\.(.*)"), - r"\1.counts") - def countWords(infile, outfile): - '''count the number of words in the pipeline configuration files.''' - - # Upload file to Azure - Azure.upload('test-azure',"pipeline2.yml", "/ifs/projects/adam/test_remote/data/pipeline.yml") - - # the command line statement we want to execute - statement = '''awk 'BEGIN { printf("word\\tfreq\\n"); } - {for (i = 1; i <= NF; i++) freq[$i]++} - END { for (word in freq) printf "%%s\\t%%d\\n", word, freq[word] }' - < %(infile)s > %(outfile)s''' - - P.run(statement) - - # Delete file from Azure - Azure.delete_file('test-azure',"pipeline2.yml") - - @follows(countWords) - def full(): - pass diff --git a/docs/remote/GC.rst b/docs/remote/GC.rst deleted file mode 100644 index fd1377ac..00000000 --- a/docs/remote/GC.rst +++ /dev/null @@ -1,127 +0,0 @@ -.. _GC: - -============== -Google storage -============== - -This section describes how to interact with the google cloud storage -bucket and blob (files). In order to interact with the cloud -resource we use the `google.cloud` API for python. - -This is a work in progress and we would really like feedback for extra features or if there -are any bugs then please report them as `issues on github `_. 
- -Setting up credentials ----------------------- - -In order to use google cloud storage feature you will need to conigure -your credentials. This is quite easy with the `gcloud` tool. This tool -is ran before exectuing a workflow in the following way:: - - gcloud auth application-default login - -This sets up a JSON file with all of the credentiaals on your home -folder, usually in the file `.config/gcloud/application_default_credentials.json` - -Next you will also need to tell the API which project you are using. -Projects are usually set in the google console and all have a unique -ID. This ID needs to be passed into cgat-core. - -This can be achieved in the following ways: - -* passing project_id into the JASON file:: - - { - "client_id": "764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com", - "client_secret": "d-FL95Q19q7MQmFpd7hHD0Ty", - "refresh_token": "1/d8JxxulX84r3jiJVlt-xMrpDLcIp3RHuxLHtieDu8uA", - "type": "authorized_user", - "project_id": "extended-cache-163811" - } - -* project_id can be set in the `bashrc`:: - - export GCLOUD_PROJECT=extended-cache-163811 - -Download from google storage ----------------------------- - -Using remote files with google cloud can be acieved easily by using `download`, `upload` and `delete_file` functions that are written into a RemoteClass. - -Firstly you will need to initiate the class as follows:: - - from cgatcore.remote.google_cloud import * - GC = GCRemoteObject() - -In order to download a file and use it within the decorator you can follows the example:: - - @transform(GC.download('gc-test',"pipeline.yml", "./pipeline.yml"), - regex("(.*)\.(.*)"), - r"\1.counts") - -This will download the file `pipeline.yml` from the google cloud bucket `gc-test` locally to `./pipeline.yml` -and it will be picked up by the decoratory function as normal. 
- -Upload to google cloud ----------------------- - -In order to upload files to google cloud you simply need to run:: - - GC.upload('gc-test',"pipeline2.yml", "./pipeline.yml") - -This will upload to the `gc-test` google cloud bucket the `./pipeline.yml` file and it will be saved as -`pipeline2.yml` in that bucket. - -Delete file from AWS S3 ------------------------ - -In order to delete a file from the AWS S3 bucket then you simply run:: - - S3.delete_file('aws-test-boto',"pipeline2.yml") - -This will delete the `pipeline2.yml` file from the `aws-test-boto` bucket. - -Functional example ------------------- - -As a simple example, the following one function pipeline demonstrates the way you can interact with the google cloud:: - - from ruffus import * - import sys - import os - import cgatcore.experiment as E - from cgatcore import pipeline as P - from cgatcore.remote.google_cloud import * - - # load options from the config file - PARAMS = P.get_parameters( - ["%s/pipeline.yml" % os.path.splitext(__file__)[0], - "../pipeline.yml", - "pipeline.yml"]) - - GC = GCRemoteObject() - - - @transform(GC.download('gc-test',"pipeline.yml", "./pipeline.yml"), - regex("(.*)\.(.*)"), - r"\1.counts") - def countWords(infile, outfile): - '''count the number of words in the pipeline configuration files.''' - - # Upload file to google cloud - GC.upload('gc-test',"pipeline2.yml", "/ifs/projects/adam/test_remote/data/pipeline.yml") - - # the command line statement we want to execute - statement = '''awk 'BEGIN { printf("word\\tfreq\\n"); } - {for (i = 1; i <= NF; i++) freq[$i]++} - END { for (word in freq) printf "%%s\\t%%d\\n", word, freq[word] }' - < %(infile)s > %(outfile)s''' - - P.run(statement) - - # Delete file from google cloud - GC.delete_file('gc-test',"pipeline2.yml") - - @follows(countWords) - def full(): - pass diff --git a/docs/remote/S3.rst b/docs/remote/S3.rst deleted file mode 100644 index 91a8704f..00000000 --- a/docs/remote/S3.rst +++ /dev/null @@ -1,126 +0,0 @@ 
-.. _S3: - -============== -AWS S3 Storage -============== - -This section described how to interact with amazon cloud simple -remote storage (S3). In order to interact with the S3 resource we -use the `boto3 `_ SDK. - -This is a work in progress and we would really like feedback for extra features or if there -are any bugs then please report them as `issues on github `_. - - -Setting up credentials ----------------------- - -In order to use the AWS remote feature you will need to configure -your credentials (The access key and secret key). You can set up -these credentials by adding the keys as environment variables in -a file `~/.aws/credentials` as detailed in the `boto3 configuration page `_. In brief you will need to add the keys as follows:: - - [default] - aws_access_key_id = YOUR_ACCESS_KEY - aws_secret_access_key = YOUR_SECRET_KEY - -These access keys can be found within your S3 AWS console and you can access them by following these steps: -* `Log in to your AWS Management Console. `_ -* Click on your user name at the top right of the page. -* Click My Security Credentials. -* Click Users in left hand menu and select a user. -* Click the Security credentials tab. -* YOUR_ACCESS_KEY is located in the Access key section - -If you have lost YOUR_SECRET_KEY then you will need to Create a new access key, please see `AWS documentation `_ on how to do this. Please not that every 90 days AWS will rotate your access keys. - -In additon, you may also want to configure the default region:: - - [default] - region=us-east-1 - -Once configuration variables have been created then you are ready to interact -with the S3 storage. - -Download from AWS S3 --------------------- - -Using remote files with AWS can be acieved easily by using `download`, `upload` and `delete_file` functions that are written into a RemoteClass. 
- -Firstly you will need to initiate the class as follows:: - - from cgatcore.remote.aws import * - S3 = S3RemoteObject() - -In order to download a file and use it within the decorator you can follows the example:: - - @transform(S3.download('aws-test-boto',"pipeline.yml", "./pipeline.yml"), - regex("(.*)\.(.*)"), - r"\1.counts") - -This will download the file `pipeline.yml` in the AWS bucket `aws-test-boto` locally to `./pipeline.yml` -and it will be picked up by the decoratory function as normal. - -Upload to AWS S3 ----------------- - -In order to upload files to aws S3 you simply need to run:: - - S3.upload('aws-test-boto',"pipeline2.yml", "./pipeline.yml") - -This will upload to the `aws-test-boto` S3 bucket the `./pipeline.yml` file and it will be saved as -`pipeline2.yml` in that bucket. - -Delete file from AWS S3 ------------------------ - -In order to delete a file from the AWS S3 bucket then you simply run:: - - S3.delete_file('aws-test-boto',"pipeline2.yml") - -This will delete the `pipeline2.yml` file from the `aws-test-boto` bucket. 
- -Functional example ------------------- - -As a simple example, the following one function pipeline demonstrates the way you can interact with AWS S3:: - - from ruffus import * - import sys - import os - import cgatcore.experiment as E - from cgatcore import pipeline as P - from cgatcore.remote.aws import * - - # load options from the config file - PARAMS = P.get_parameters( - ["%s/pipeline.yml" % os.path.splitext(__file__)[0], - "../pipeline.yml", - "pipeline.yml"]) - - S3 = S3RemoteObject() - - - @transform(S3.download('aws-test-boto',"pipeline.yml", "./pipeline.yml"), - regex("(.*)\.(.*)"), - r"\1.counts") - def countWords(infile, outfile): - '''count the number of words in the pipeline configuration files.''' - - # Upload file to S3 - S3.upload('aws-test-boto',"pipeline2.yml", "/ifs/projects/adam/test_remote/data/pipeline.yml") - - # the command line statement we want to execute - statement = '''awk 'BEGIN { printf("word\\tfreq\\n"); } - {for (i = 1; i <= NF; i++) freq[$i]++} - END { for (word in freq) printf "%%s\\t%%d\\n", word, freq[word] }' - < %(infile)s > %(outfile)s''' - - P.run(statement) - - # Delete file from S3 - S3.delete_file('aws-test-boto',"pipeline2.yml") - - @follows(countWords) - def full(): - pass diff --git a/docs/remote/SFTP.rst b/docs/remote/SFTP.rst deleted file mode 100644 index da9c8d98..00000000 --- a/docs/remote/SFTP.rst +++ /dev/null @@ -1,53 +0,0 @@ -.. _SFTP: - - -======================== -File transfer using SFTP -======================== - -Cgat-core can access files on a remote server vis SFTP. This functionality is provided -by the `pysftp `_ python library. - -Given that you have already set up your SSH key pairs correctly for your server then -accessing the server is easy::: - - from cgatcore.remote.sftp import * - sftp = SFTPRemoteObject() - - -Download from SFTP ------------------- - -Using remote files with SFTP can be achieved easily by using `download` function that -is written into a RemoteClass. 
- -In order to download a file and use it within the decorator you can follows the example:: - - from cgatcore.remote.SFTP import * - sftp = SFTPRemoteObject() - - @transform(sftp.download('example.com/path/to/file.txt'), - regex("(.*)\.txt"), - r"\1.counts") - - -The remote address must be specified with the host (domain or IP address) and the absolute -path to the file on the remote server. A port may be specified if the SSH daemon on the server -is listening on a port other than 22.:: - - from cgatcore.remote.SFTP import * - sftp = SFTPRemoteObject(port=4040) - - @transform(sftp.download('example.com/path/to/file.txt'), - regex("(.*)\.txt"), - r"\1.counts") - -You can specify standard arguments used by `pysftp `_. For -example:: - - from cgatcore.remote.SFTP import * - sftp = SFTPRemoteObject(username= "cgatpassword", password="cgatpassword") - - @transform(sftp.download('example.com/path/to/file.txt'), - regex("(.*)\.txt"), - r"\1.counts") diff --git a/docs/remote/azure.md b/docs/remote/azure.md new file mode 100644 index 00000000..dcd96ecf --- /dev/null +++ b/docs/remote/azure.md @@ -0,0 +1,123 @@ +# Azure Blob Storage + +This section describes how to interact with Microsoft's Azure Blob Storage, which is used for storing data in containers (similar to buckets in other cloud services). We use the [`azure-storage-blob`](https://pypi.org/project/azure-storage-blob/) package for interacting with Azure storage in Python. + +This documentation is a work in progress. If you find any bugs or want to request extra features, please report them as [issues on GitHub](https://github.com/cgat-developers/cgat-core/issues). + +## Setting up credentials + +To use Azure Blob Storage, you need an **account name** and an **account key** from Azure. These credentials can be found in the Azure Portal under "Access keys" for the storage account. You will need to use these credentials to interact with the Azure containers through the `AzureRemoteObject` class. 
+ +## Using Azure Blob Storage with `AzureRemoteObject` + +The `AzureRemoteObject` class allows you to interact with Azure Blob Storage in your Python scripts or workflows. The operations supported by this class include checking for the existence of containers, downloading blobs, uploading blobs, and deleting blobs. + +First, initiate the class as follows: + +```python +from cgatcore.remote.azure import AzureRemoteObject + +account_name = "your_account_name" +account_key = "your_account_key" + +azure_obj = AzureRemoteObject(account_name=account_name, account_key=account_key) +``` + +### Check if a Container Exists + +To check whether a container exists in Azure Blob Storage: + +```python +azure_obj.exists('my-container') +``` + +If the container does not exist, a `KeyError` will be raised. + +### Download from Azure Blob Storage + +To download a file (blob) from a container, use the `download` method: + +```python +azure_obj.download('my-container', 'my-blob.txt', './local-dir/my-blob.txt') +``` + +This command will download the file named `my-blob.txt` from the container `my-container` and save it locally to `./local-dir/my-blob.txt`. + +### Upload to Azure Blob Storage + +To upload a file to an Azure container, use the `upload` method: + +```python +azure_obj.upload('my-container', 'my-blob.txt', './local-dir/my-blob.txt') +``` + +This will upload the local file `./local-dir/my-blob.txt` to the container `my-container`, where it will be saved as `my-blob.txt`. + +### Delete a File from Azure Blob Storage + +To delete a blob from an Azure container, use the `delete_file` method: + +```python +azure_obj.delete_file('my-container', 'my-blob.txt') +``` + +This command will delete the blob named `my-blob.txt` from the container `my-container`. 
+ +## Functional Example + +Below is an example demonstrating the usage of the `AzureRemoteObject` class within a data processing pipeline: + +```python +from ruffus import * +import sys +import os +import cgatcore.experiment as E +from cgatcore import pipeline as P +from cgatcore.remote.azure import AzureRemoteObject + +# Set up credentials +account_name = "your_account_name" +account_key = "your_account_key" + +azure_obj = AzureRemoteObject(account_name=account_name, account_key=account_key) + +# Load options from the config file +PARAMS = P.get_parameters([ + "%s/pipeline.yml" % os.path.splitext(__file__)[0], + "../pipeline.yml", + "pipeline.yml" +]) + +@transform(azure_obj.download('my-container', 'input.txt', './input.txt'), + regex(r"(.*)\.(.*)"), + r"\1.counts") +def countWords(infile, outfile): + '''Count the number of words in the input file.''' + + # Upload file to Azure Blob Storage + azure_obj.upload('my-container', 'output.txt', './input.txt') + + # The command line statement we want to execute + statement = '''awk 'BEGIN { printf("word\tfreq\n"); } + {for (i = 1; i <= NF; i++) freq[$i]++} + END { for (word in freq) printf "%%s\t%%d\n", word, freq[word] }' + < %(infile)s > %(outfile)s''' + + P.run(statement) + + # Delete file from Azure Blob Storage + azure_obj.delete_file('my-container', 'output.txt') + +@follows(countWords) +def full(): + pass +``` + +In this example: + +1. **Download**: The `countWords` function downloads `input.txt` from the container `my-container` to a local path `./input.txt`. +2. **Word Count**: The function then counts the number of words in the file using the `awk` command. +3. **Upload**: The output is uploaded to Azure Blob Storage. +4. **Delete**: Finally, the uploaded file is deleted from the container. + +This example demonstrates how Azure Blob Storage can be integrated seamlessly into a data pipeline using the `AzureRemoteObject` class. 
\ No newline at end of file diff --git a/docs/remote/gc.md b/docs/remote/gc.md new file mode 100644 index 00000000..8da6a2dd --- /dev/null +++ b/docs/remote/gc.md @@ -0,0 +1,133 @@ +# Google Cloud Storage + +This section describes how to interact with Google Cloud Storage, specifically dealing with buckets and blobs (files). To interact with the cloud resource, we use the [`google.cloud` API](https://googleapis.dev/python/google-api-core/latest/index.html) for Python. + +This documentation is a work in progress. We welcome any feedback for extra features or, if you find any bugs, please report them as [issues on GitHub](https://github.com/cgat-developers/cgat-core/issues). + +## Setting up credentials + +To use Google Cloud Storage features, you need to configure your credentials. This is quite easy with the `gcloud` tool. You need to run the following command before executing a workflow: + +```bash +gcloud auth application-default login +``` + +This command sets up a JSON file with all of the credentials in your home folder, typically located at `.config/gcloud/application_default_credentials.json`. + +Next, you need to specify which Google Cloud project you are using. Projects are created in the Google Cloud Console and each has a unique ID. This ID needs to be passed into CGAT-core. You can achieve this in two ways: + +1. **Passing the project ID into the JSON file**: + + ```json + { + "client_id": "764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com", + "client_secret": "d-FL95Q19q7MQmFpd7hHD0Ty", + "refresh_token": "1/d8JxxulX84r3jiJVlt-xMrpDLcIp3RHuxLHtieDu8uA", + "type": "authorized_user", + "project_id": "extended-cache-163811" + } + ``` + +2. 
**Setting the project ID in `.bashrc`**: + + ```bash + export GCLOUD_PROJECT=extended-cache-163811 + ``` + +## Download from Google Cloud Storage + +Using remote files with Google Cloud can be achieved easily by using the `download`, `upload`, and `delete_file` functions that are part of the `GCRemoteObject` class. + +First, initiate the class as follows: + +```python +from cgatcore.remote.google_cloud import GCRemoteObject + +GC = GCRemoteObject() +``` + +To download a file and use it within the decorator, follow this example: + +```python +@transform(GC.download('gc-test', 'pipeline.yml', './pipeline.yml'), + regex(r"(.*)\.(.*)"), + r"\1.counts") +``` + +This will download the file `pipeline.yml` from the Google Cloud bucket `gc-test` to `./pipeline.yml`, and it will be picked up by the decorator function as normal. + +## Upload to Google Cloud Storage + +To upload files to Google Cloud, run: + +```python +GC.upload('gc-test', 'pipeline2.yml', './pipeline.yml') +``` + +This command will upload the local file `./pipeline.yml` to the `gc-test` Google Cloud bucket, where it will be saved as `pipeline2.yml`. + +## Delete a File from Google Cloud Storage + +To delete a file from a Google Cloud bucket, run: + +```python +GC.delete_file('gc-test', 'pipeline2.yml') +``` + +This command will delete the file `pipeline2.yml` from the `gc-test` bucket. 
+ +## Functional Example + +As a simple example, the following one-function pipeline demonstrates how you can interact with Google Cloud: + +```python +from ruffus import * +import sys +import os +import cgatcore.experiment as E +from cgatcore import pipeline as P +from cgatcore.remote.google_cloud import GCRemoteObject + +# Load options from the config file +PARAMS = P.get_parameters([ + "%s/pipeline.yml" % os.path.splitext(__file__)[0], + "../pipeline.yml", + "pipeline.yml" +]) + +GC = GCRemoteObject() + +@transform(GC.download('gc-test', 'pipeline.yml', './pipeline.yml'), + regex(r"(.*)\.(.*)"), + r"\1.counts") +def countWords(infile, outfile): + '''Count the number of words in the pipeline configuration file.''' + + # Upload file to Google Cloud + GC.upload('gc-test', 'pipeline2.yml', '/ifs/projects/adam/test_remote/data/pipeline.yml') + + # The command line statement we want to execute + statement = '''awk 'BEGIN { printf("word\tfreq\n"); } + {for (i = 1; i <= NF; i++) freq[$i]++} + END { for (word in freq) printf "%%s\t%%d\n", word, freq[word] }' + < %(infile)s > %(outfile)s''' + + P.run(statement) + + # Delete file from Google Cloud + GC.delete_file('gc-test', 'pipeline2.yml') + +@follows(countWords) +def full(): + pass +``` + +In this example: + +1. **Download**: The `countWords` function downloads `pipeline.yml` from the `gc-test` bucket to `./pipeline.yml`. +2. **Word Count**: The function counts the number of words in the file using the `awk` command. +3. **Upload**: The processed file is then uploaded to Google Cloud. +4. **Delete**: Finally, the uploaded file is deleted from the bucket. + +This functional example provides a simple illustration of how Google Cloud integration can be achieved within a CGAT pipeline. 
+ diff --git a/docs/remote/s3.md b/docs/remote/s3.md new file mode 100644 index 00000000..51e0b3fd --- /dev/null +++ b/docs/remote/s3.md @@ -0,0 +1,150 @@ +# AWS S3 Storage + +This section describes how to interact with Amazon's cloud storage service (S3). To interact with the S3 resource, we use the [`boto3`](https://boto3.readthedocs.io) SDK. + +This documentation is a work in progress. We welcome any feedback or requests for extra features. If you find any bugs, please report them as [issues on GitHub](https://github.com/cgat-developers/cgat-core/issues). + +## Setting up credentials + +To use the AWS remote feature, you need to configure your credentials (i.e., the access key and secret key). You can set up these credentials by adding them as environment variables in a file `~/.aws/credentials`, as detailed in the [`boto3`](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html#configuration)[ configuration page](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html#configuration). In brief, you need to add the keys as follows: + +```ini +[default] +aws_access_key_id = YOUR_ACCESS_KEY +aws_secret_access_key = YOUR_SECRET_KEY +``` + +These access keys can be found within your S3 AWS console by following these steps: + +1. [Log in to your AWS Management Console.](http://aws.amazon.com/) +2. Click on your username at the top right of the page. +3. Click "My Security Credentials." +4. Click "Users" in the left-hand menu and select a user. +5. Click the "Security credentials" tab. +6. YOUR\_ACCESS\_KEY is located in the "Access key" section. + +If you have lost YOUR\_SECRET\_KEY, you will need to create a new access key. Please see the [AWS documentation](https://aws.amazon.com/blogs/security/wheres-my-secret-access-key/) for instructions on how to do this. Note that every 90 days, AWS will rotate your access keys. 
+ +Additionally, you may want to configure the default region: + +```ini +[default] +region=us-east-1 +``` + +Once configuration variables have been created, you are ready to interact with the S3 storage. + +## Using S3 with S3Pipeline + +The `S3Pipeline` class in `file_handler.py` is a convenient tool to integrate AWS S3 operations into data processing workflows. The class provides decorators to simplify working with S3 files in different stages of the pipeline. + +First, initiate the class as follows: + +```python +from cgatcore import pipeline as P +from cgatcore.remote.file_handler import S3Pipeline + +pipeline = S3Pipeline(name="MyPipeline", temp_dir="/tmp") +``` + +The `S3Pipeline` class provides several decorators: + +### Download from AWS S3 with `s3_transform` + +To download a file, process it, and save the output, use the `s3_transform` decorator. Here's an example: + +```python +@pipeline.s3_transform('s3://aws-test-boto/pipeline.yml', '_counts', 's3://aws-test-boto/pipeline_counts.yml') +def countWords(input_file, output_file): + """Count the number of words in the file.""" + with open(input_file, 'r') as infile, open(output_file, 'w') as outfile: + content = infile.read() + words = content.split() + outfile.write(f"word\tfreq\n") + word_freq = {word: words.count(word) for word in set(words)} + for word, count in word_freq.items(): + outfile.write(f"{word}\t{count}\n") +``` + +This decorator downloads `pipeline.yml` from the S3 bucket `aws-test-boto` to a local temporary directory, processes it, and saves the results to a new file (`pipeline_counts.yml`) back on S3. + +### Merging Multiple Files with `s3_merge` + +If you need to merge multiple files from S3 into one, use the `s3_merge` decorator. 
Here's how: + +```python +@pipeline.s3_merge(['s3://aws-test-boto/file1.txt', 's3://aws-test-boto/file2.txt'], 's3://aws-test-boto/merged_file.txt') +def mergeFiles(input_files, output_file): + """Merge multiple input files into one.""" + with open(output_file, 'w') as outfile: + for file in input_files: + with open(file, 'r') as infile: + outfile.write(infile.read()) +``` + +### Splitting a File with `s3_split` + +To split a single input file into multiple output files, use the `s3_split` decorator: + +```python +@pipeline.s3_split('s3://aws-test-boto/largefile.txt', ['s3://aws-test-boto/part1.txt', 's3://aws-test-boto/part2.txt']) +def splitFile(input_file, output_files): + """Split the input file into multiple output files.""" + with open(input_file, 'r') as infile: + content = infile.readlines() + mid = len(content) // 2 + with open(output_files[0], 'w') as outfile1: + outfile1.writelines(content[:mid]) + with open(output_files[1], 'w') as outfile2: + outfile2.writelines(content[mid:]) +``` + +This splits the large input file into two separate parts, saving them as different S3 objects. + +### Running the Pipeline + +To run all tasks in the pipeline: + +```python +pipeline.run() +``` + +This will sequentially execute all tasks that have been added to the pipeline through the decorators. 
+ +## Example: Full Pipeline + +Here is an example of a simple pipeline that uses the `S3Pipeline` class to count words in a file, merge two files, and then delete a file: + +```python +from cgatcore.remote.file_handler import S3Pipeline + +pipeline = S3Pipeline(name="ExamplePipeline", temp_dir="/tmp") + +@pipeline.s3_transform('s3://aws-test-boto/pipeline.yml', '_counts', 's3://aws-test-boto/pipeline_counts.yml') +def countWords(input_file, output_file): + with open(input_file, 'r') as infile, open(output_file, 'w') as outfile: + content = infile.read() + words = content.split() + outfile.write(f"word\tfreq\n") + word_freq = {word: words.count(word) for word in set(words)} + for word, count in word_freq.items(): + outfile.write(f"{word}\t{count}\n") + +@pipeline.s3_merge(['s3://aws-test-boto/file1.txt', 's3://aws-test-boto/file2.txt'], 's3://aws-test-boto/merged_file.txt') +def mergeFiles(input_files, output_file): + with open(output_file, 'w') as outfile: + for file in input_files: + with open(file, 'r') as infile: + outfile.write(infile.read()) + +pipeline.run() +``` + +In this example: + +1. **Download and Transform**: The `countWords` function downloads a file from S3, counts the words, and uploads the output back to S3. +2. **Merge**: The `mergeFiles` function merges two files from S3 and writes the merged output back to S3. +3. **Run**: Finally, all the tasks are executed sequentially with `pipeline.run()`. + +This updated documentation provides a more accurate representation of the current capabilities of the `S3Pipeline` class, allowing for an easier and more efficient way to handle AWS S3 resources within your pipelines. 
+ diff --git a/docs/s3_integration/configuring_s3.md b/docs/s3_integration/configuring_s3.md new file mode 100644 index 00000000..75126a7c --- /dev/null +++ b/docs/s3_integration/configuring_s3.md @@ -0,0 +1,90 @@ +# Configuring S3 for Pipeline Execution + +To integrate AWS S3 into your CGAT pipeline, you need to configure S3 access to facilitate file handling for reading and writing data. This document explains how to set up S3 configuration for the CGAT pipelines. + +## Overview + +`configure_s3()` is a utility function provided by the CGATcore pipeline tools to handle authentication and access to AWS S3. This function allows you to provide credentials, specify regions, and set up other configurations that enable seamless integration of S3 into your workflow. + +### Basic Configuration + +To get started, you will need to import and use the `configure_s3()` function. Here is a basic example: + +```python +from cgatcore.pipeline import configure_s3 + +configure_s3(aws_access_key_id="YOUR_AWS_ACCESS_KEY", aws_secret_access_key="YOUR_AWS_SECRET_KEY") +``` + +### Configurable Parameters + +- **`aws_access_key_id`**: Your AWS access key, used to authenticate and identify the user. +- **`aws_secret_access_key`**: Your secret key, corresponding to your access key. +- **`region_name`** (optional): AWS region where your S3 bucket is located. Defaults to the region set in your environment, if available. +- **`profile_name`** (optional): Name of the AWS profile to use if you have multiple profiles configured locally. 
+ +### Using AWS Profiles + +If you have multiple AWS profiles configured locally, you can use the `profile_name` parameter to select the appropriate one without hardcoding the access keys in your code: + +```python +configure_s3(profile_name="my-profile") +``` + +### Configuring Endpoints + +To use custom endpoints, such as when working with MinIO or an AWS-compatible service: + +```python +configure_s3( + aws_access_key_id="YOUR_AWS_ACCESS_KEY", + aws_secret_access_key="YOUR_AWS_SECRET_KEY", + endpoint_url="https://custom-endpoint.com" +) +``` + +### Security Recommendations + +1. **Environment Variables**: Use environment variables to set credentials securely rather than hardcoding them in your scripts. This avoids potential exposure of credentials: + + ```bash + export AWS_ACCESS_KEY_ID=YOUR_AWS_ACCESS_KEY + export AWS_SECRET_ACCESS_KEY=YOUR_AWS_SECRET_KEY + ``` + +2. **AWS IAM Roles**: If you are running the pipeline on AWS infrastructure (such as EC2 instances), it's recommended to use IAM roles. These roles provide temporary security credentials that are automatically rotated by AWS. + +### Example Pipeline Integration + +After configuring S3, you can seamlessly use the S3-aware methods within your pipeline. Below is an example: + +```python +from cgatcore.pipeline import get_s3_pipeline + +# Configure S3 access +configure_s3(profile_name="my-profile") + +# Instantiate the S3 pipeline +s3_pipeline = get_s3_pipeline() + +# Use S3-aware methods in the pipeline +@s3_pipeline.s3_transform("s3://my-bucket/input.txt", suffix(".txt"), ".processed") +def process_s3_file(infile, outfile): + # Processing logic + with open(infile, 'r') as fin: + data = fin.read() + processed_data = data.upper() + with open(outfile, 'w') as fout: + fout.write(processed_data) +``` + +### Summary + +- Use the `configure_s3()` function to set up AWS credentials and S3 access. +- Options are available to use IAM roles, profiles, or custom endpoints. 
+- Use the S3-aware decorators to integrate S3 files seamlessly in your pipeline. + +## Additional Resources + +- [AWS IAM Roles Documentation](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html) +- [AWS CLI Configuration and Credential Files](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html) \ No newline at end of file diff --git a/docs/s3_integration/s3_decorators.md b/docs/s3_integration/s3_decorators.md new file mode 100644 index 00000000..e47171cd --- /dev/null +++ b/docs/s3_integration/s3_decorators.md @@ -0,0 +1,5 @@ +# CGATcore S3 decorators + +::: cgatcore.pipeline + :members: + :show-inheritance: diff --git a/docs/s3_integration/s3_pipeline.md b/docs/s3_integration/s3_pipeline.md new file mode 100644 index 00000000..a5af3d27 --- /dev/null +++ b/docs/s3_integration/s3_pipeline.md @@ -0,0 +1,74 @@ +# S3 Pipeline + +The `S3Pipeline` class is part of the integration with AWS S3, enabling seamless data handling in CGAT pipelines that use both local files and S3 storage. This is particularly useful when working with large datasets that are better managed in cloud storage or when collaborating across multiple locations. + +## Overview + +`S3Pipeline` provides the following functionalities: + +- Integration of AWS S3 into CGAT pipeline workflows +- Lazy-loading of S3-specific classes and functions to avoid circular dependencies +- Facilitating operations on files that reside on S3, making it possible to apply transformations and merges without copying everything locally + +### Example Usage + +The `S3Pipeline` class can be accessed through the `get_s3_pipeline()` function, which returns an instance that is lazy-loaded to prevent issues related to circular imports. Below is an example of how to use it: + +```python +from cgatcore.pipeline import get_s3_pipeline + +# Instantiate the S3 pipeline +s3_pipeline = get_s3_pipeline() + +# Use methods from s3_pipeline as needed +s3_pipeline.s3_transform(...) 
+``` + +### Building a Function Using `S3Pipeline` + +To build a function that utilises `S3Pipeline`, you can follow a few simple steps. Below is a guide on how to create a function that uses the `s3_transform` method to process data from S3: + +1. **Import the required modules**: First, import `get_s3_pipeline` from `cgatcore.pipeline`. +2. **Instantiate the pipeline**: Use `get_s3_pipeline()` to create an instance of `S3Pipeline`. +3. **Define your function**: Use the S3-aware methods like `s3_transform()` to perform the desired operations on your S3 files. + +#### Example Function + +```python +from cgatcore.pipeline import get_s3_pipeline + +# Instantiate the S3 pipeline +s3_pipeline = get_s3_pipeline() + +# Define a function that uses s3_transform +def process_s3_data(input_s3_path, output_s3_path): + @s3_pipeline.s3_transform(input_s3_path, suffix(".txt"), output_s3_path) + def transform_data(infile, outfile): + # Add your processing logic here + with open(infile, 'r') as fin: + data = fin.read() + # Example transformation + processed_data = data.upper() + with open(outfile, 'w') as fout: + fout.write(processed_data) + + # Run the transformation + transform_data() +``` + +### Methods in `S3Pipeline` + +- **`s3_transform(*args, **kwargs)`**: Perform a transformation on data stored in S3, similar to Ruffus `transform()` but adapted for S3 files. +- **`s3_merge(*args, **kwargs)`**: Merge multiple input files into one, allowing the files to be located on S3. +- **`s3_split(*args, **kwargs)`**: Split input data into smaller chunks, enabling parallel processing, even when the input resides on S3. +- **`s3_originate(*args, **kwargs)`**: Create new files directly in S3. +- **`s3_follows(*args, **kwargs)`**: Indicate a dependency on another task, ensuring correct task ordering even for S3 files. + +These methods are intended to be directly equivalent to standard Ruffus methods, allowing pipelines to easily mix and match S3-based and local operations. 
+ +## Why Use `S3Pipeline`? + +- **Scalable Data Management**: Keeps large datasets in the cloud, reducing local storage requirements. +- **Seamless Integration**: Provides a drop-in replacement for standard decorators, enabling hybrid workflows involving both local and cloud files. +- **Lazy Loading**: Optimised to initialise S3 components only when they are needed, minimising overhead and avoiding unnecessary dependencies. + diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..8abf2a6a --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,57 @@ +site_name: CGAT-core Documentation +theme: material +plugins: + - search + - mkdocstrings: + default_handler: python + +markdown_extensions: + - admonition + - codehilite + - toc: + permalink: true +nav: + - Home: index.md + - Getting Started: + - Installation: getting_started/installation.md + - Cluster Configuration: getting_started/run_parameters.md + - Running a Pipeline: getting_started/examples.md + - Running a Pipeline Tutorial: getting_started/tutorial.md + - Run Parameters: defining_workflow/run_parameters.md + - Building a Workflow: + - Workflow Overview: defining_workflow/writing_workflows.md + - Writing a Workflow Tutorial: defining_workflow/tutorial.md + - Pipeline Modules: + - Overview: pipeline_modules/overview.md + - Control: pipeline_modules/control.md + - Database: pipeline_modules/database.md + - Files: pipeline_modules/files.md + - Cluster: pipeline_modules/cluster.md + - Execution: pipeline_modules/execution.md + - Utils: pipeline_modules/utils.md + - Parameters: pipeline_modules/parameters.md + - Container support: + - Individual tasks: container/tasks.md + - Whole pipeline: docs/container/whole_pipeline.md + - S3 Cloud: + - S3 Pipeline: s3_integration/s3_pipeline.md + - S3 Decorators: s3_integration/s3_decorators.md + - Configuring S3: s3_integration/configuring_s3.md + - Working with Remote Files: + - S3: remote/s3.md + - Google Cloud: remote/gc.md + - Azure: remote/azure.md + - Core 
Functions: + - Pipeline: function_doc/pipeline.md + - Core: + - Experiment: function_doc/experiment.md + - csv2db: function_doc/csv2db.md + - Database: function_doc/database.md + - IOTools: function_doc/iotools.md + - Logfile: function_doc/logfile.md + - Project Info: + - Contributing: project_info/contributing.md + - How to Contribute: project_info/how_to_contribute.md + - Citations: project_info/citations.md + - FAQ: project_info/faq.md + - License: project_info/license.md \ No newline at end of file diff --git a/readthedocs.yml b/readthedocs.yml deleted file mode 100644 index 8dd73203..00000000 --- a/readthedocs.yml +++ /dev/null @@ -1,2 +0,0 @@ -conda: - file: conda/environments/readthedocs.yml diff --git a/tests/test_container.py b/tests/test_container.py new file mode 100644 index 00000000..1be9c4aa --- /dev/null +++ b/tests/test_container.py @@ -0,0 +1,140 @@ +import pytest +from unittest.mock import MagicMock, patch +from cgatcore.pipeline.execution import run, Executor + +mocked_params = { + "cluster": { + "options": "", + "queue": None, + "memory_default": "4G", + "tmpdir": "/tmp", + "monitor_interval_queued_default": 30, + "monitor_interval_running_default": 30, + }, + "cluster_tmpdir": "/tmp", + "tmpdir": "/tmp", + "work_dir": "/tmp", + "os": "Linux", +} + + +@patch("cgatcore.pipeline.execution.get_params", return_value=mocked_params) +def test_run_with_container_support(mock_get_params): + """Test running a command with container support.""" + with patch("cgatcore.pipeline.execution.subprocess.Popen") as mock_popen: + mock_process = MagicMock() + mock_process.communicate.return_value = (b"Hello from Docker", b"") + mock_process.returncode = 0 + mock_process.pid = 12345 + mock_popen.return_value = mock_process + + # Use Executor instance + executor = Executor() + + # Mock the method that collects benchmark data + with patch.object(executor, "collect_benchmark_data", return_value=None) as mock_collect_benchmark: + executor.run( + statement_list=["echo 
Hello from Docker"], + container_runtime="docker", + image="ubuntu:20.04", + ) + + mock_popen.assert_called_once() + actual_call = mock_popen.call_args[0][0] + print(f"Actual call to subprocess.Popen: {actual_call}") + assert "docker run --rm" in actual_call + assert "ubuntu:20.04" in actual_call + assert "echo Hello from Docker" in actual_call + + # Validate that collect_benchmark_data was called + mock_collect_benchmark.assert_called_once() + + +@patch("cgatcore.pipeline.execution.get_params", return_value=mocked_params) +def test_run_without_container_support(mock_get_params): + """Test running a command without container support.""" + with patch("cgatcore.pipeline.execution.subprocess.Popen") as mock_popen: + mock_process = MagicMock() + mock_process.communicate.return_value = (b"Hello from local execution", b"") + mock_process.returncode = 0 + mock_process.pid = 12345 + mock_popen.return_value = mock_process + + # Use Executor instance + executor = Executor() + + # Mock the method that collects benchmark data + with patch.object(executor, "collect_benchmark_data", return_value=None) as mock_collect_benchmark: + executor.run(statement_list=["echo Hello from local execution"]) + + mock_popen.assert_called_once() + actual_call = mock_popen.call_args[0][0] + print(f"Actual call to subprocess.Popen: {actual_call}") + assert "echo Hello from local execution" in actual_call + + # Validate that collect_benchmark_data was called + mock_collect_benchmark.assert_called_once() + + +@patch("cgatcore.pipeline.execution.get_params", return_value=mocked_params) +def test_invalid_container_runtime(mock_get_params): + """Test handling of invalid container runtime.""" + with pytest.raises(ValueError, match="Container runtime must be 'docker' or 'singularity'"): + executor = Executor() + executor.run(statement_list=["echo Test"], container_runtime="invalid_runtime") + + +@patch("cgatcore.pipeline.execution.get_params", return_value=mocked_params) +def 
test_missing_required_params(mock_get_params): + """Test handling of missing required parameters.""" + with pytest.raises(ValueError, match="An image must be specified when using a container runtime"): + executor = Executor() + executor.run(statement_list=["echo Test"], container_runtime="docker") + + +@patch("cgatcore.pipeline.execution.get_params", return_value=mocked_params) +@patch("cgatcore.pipeline.execution.Executor.cleanup_failed_job") +def test_cleanup_on_failure(mock_cleanup, mock_get_params): + """Test cleanup logic when a job fails.""" + from cgatcore.pipeline.execution import Executor # Ensure proper import + + # Create an instance of Executor + executor = Executor() + + with patch("cgatcore.pipeline.execution.subprocess.Popen") as mock_popen: + # Mock a process failure + mock_process = MagicMock() + mock_process.communicate.return_value = (b"", b"Some error occurred") + mock_process.returncode = 1 # Simulate failure + mock_popen.return_value = mock_process + + # Attempt to run a failing command + with pytest.raises(OSError, match="Job failed with return code"): + executor.run( + statement_list=["echo This will fail"], + container_runtime="docker", # Pass a valid container_runtime + image="ubuntu:20.04" # Add a valid image since container_runtime is provided + ) + + # Ensure cleanup_failed_job was called + mock_cleanup.assert_called_once() + print(f"Arguments to cleanup_failed_job: {mock_cleanup.call_args}") + + # Check subprocess was invoked + mock_popen.assert_called_once() + print(f"Subprocess call: {mock_popen.call_args_list}") + + +@patch("cgatcore.pipeline.execution.get_params", return_value=mocked_params) +def test_job_tracking(mock_get_params): + """Test job tracking lifecycle.""" + with patch("cgatcore.pipeline.execution.subprocess.Popen") as mock_popen: + mock_process = MagicMock() + mock_process.communicate.return_value = (b"output", b"") + mock_process.returncode = 0 + mock_process.pid = 12345 + mock_popen.return_value = mock_process + + 
run(statement=["echo Job tracking test"]) + + mock_popen.assert_called_once() diff --git a/tests/test_container_config.py b/tests/test_container_config.py new file mode 100644 index 00000000..6258dd63 --- /dev/null +++ b/tests/test_container_config.py @@ -0,0 +1,81 @@ +import unittest +from unittest.mock import patch, MagicMock +from cgatcore.pipeline.execution import Executor, ContainerConfig + + +class TestContainerConfig(unittest.TestCase): + def setUp(self): + """Set up a mock for get_params and an Executor instance.""" + patcher = patch("cgatcore.pipeline.execution.get_params") + self.mock_get_params = patcher.start() + self.addCleanup(patcher.stop) + + # Mock the return value of get_params + self.mock_get_params.return_value = { + "cluster": { + "options": "", + "queue": None, + "memory_default": "4G", + "tmpdir": "/tmp", + "monitor_interval_queued_default": 30, + "monitor_interval_running_default": 30, + }, + "cluster_tmpdir": "/tmp", + "tmpdir": "/tmp", + "work_dir": "/tmp", + "os": "Linux", + } + + self.executor = Executor() + + def test_set_container_config_docker(self): + """Test setting container configuration for Docker.""" + self.executor.set_container_config( + image="ubuntu:20.04", + volumes=["/data:/data", "/reference:/reference"], + env_vars={"TEST_VAR": "value"}, + runtime="docker" + ) + config = self.executor.container_config + self.assertIsInstance(config, ContainerConfig) + self.assertEqual(config.image, "ubuntu:20.04") + self.assertEqual(config.runtime, "docker") + self.assertIn("/data:/data", config.volumes) + self.assertIn("/reference:/reference", config.volumes) + self.assertEqual(config.env_vars["TEST_VAR"], "value") + + def test_set_container_config_singularity(self): + """Test setting container configuration for Singularity.""" + self.executor.set_container_config( + image="/path/to/container.sif", + volumes=["/data:/data", "/reference:/reference"], + env_vars={"TEST_VAR": "value"}, + runtime="singularity" + ) + config = 
self.executor.container_config + self.assertIsInstance(config, ContainerConfig) + self.assertEqual(config.image, "/path/to/container.sif") + self.assertEqual(config.runtime, "singularity") + self.assertIn("/data:/data", config.volumes) + self.assertIn("/reference:/reference", config.volumes) + self.assertEqual(config.env_vars["TEST_VAR"], "value") + + def test_invalid_runtime(self): + """Test setting an invalid container runtime.""" + with self.assertRaises(ValueError) as context: + self.executor.set_container_config( + image="ubuntu:20.04", runtime="invalid_runtime" + ) + self.assertIn("Unsupported container runtime", str(context.exception)) + + def test_missing_image(self): + """Test setting container configuration without an image.""" + with self.assertRaises(ValueError) as context: + self.executor.set_container_config( + image=None, runtime="docker" + ) + self.assertIn("An image must be specified", str(context.exception)) + + +if __name__ == "__main__": + unittest.main()