diff --git a/cgatcore/__init__.py b/cgatcore/__init__.py
index ef099e8b..f0b602b6 100644
--- a/cgatcore/__init__.py
+++ b/cgatcore/__init__.py
@@ -1,5 +1,32 @@
 # cgatcore/__init__.py
 
-import importlib
-pipeline = importlib.import_module('cgatcore.pipeline')
-remote = importlib.import_module('cgatcore.remote')
+class CgatCore:
+    """Main class to encapsulate CGAT core functionality."""
+
+    def __init__(self):
+        self._pipeline = None
+        self._remote = None
+
+    @property
+    def pipeline(self):
+        """Lazy load the pipeline module."""
+        if self._pipeline is None:
+            from cgatcore import pipeline
+            self._pipeline = pipeline
+        return self._pipeline
+
+    @property
+    def remote(self):
+        """Lazy load the remote module."""
+        if self._remote is None:
+            from cgatcore import remote
+            self._remote = remote
+        return self._remote
+
+
+# Create a global instance of the CgatCore class
+cgatcore = CgatCore()
+
+# Expose the pipeline and remote attributes
+pipeline = cgatcore.pipeline
+remote = cgatcore.remote
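Note: the re-export lines above (`pipeline = cgatcore.pipeline`, `remote = cgatcore.remote`) run at package import time, so both submodules are still loaded eagerly whenever `cgatcore` is imported; the lazy properties only pay off for code that goes through the `cgatcore` instance directly. A minimal sketch of a fully lazy alternative using a module-level `__getattr__` (PEP 562, Python 3.7+) is shown below; the `_SUBMODULES` helper is illustrative, not part of cgatcore:

```python
# Hypothetical alternative body for cgatcore/__init__.py (not what this PR
# does): defer both submodule imports until first attribute access.
import importlib

_SUBMODULES = {"pipeline", "remote"}  # illustrative helper name


def __getattr__(name):
    # Invoked only when normal attribute lookup on the package fails, so
    # `cgatcore.pipeline` (or `from cgatcore import pipeline`) triggers the
    # import on first use instead of at package import time.
    if name in _SUBMODULES:
        module = importlib.import_module(f"cgatcore.{name}")
        globals()[name] = module  # cache so __getattr__ runs only once
        return module
    raise AttributeError(f"module 'cgatcore' has no attribute {name!r}")
```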
diff --git a/cgatcore/pipeline/__init__.py b/cgatcore/pipeline/__init__.py
index a75d3db2..10d47cc8 100644
--- a/cgatcore/pipeline/__init__.py
+++ b/cgatcore/pipeline/__init__.py
@@ -1,160 +1,13 @@
-'''
-pipeline.py - Tools for CGAT Ruffus Pipelines
+# cgatcore/pipeline/__init__.py
+
+"""pipeline.py - Tools for CGAT Ruffus Pipelines
 =============================================
 
 This module provides a comprehensive set of tools to facilitate the creation
 and management of data processing pipelines using CGAT Ruffus. It includes
 functionalities for pipeline control, logging, parameterization, task
 execution, database uploads, temporary file management, and integration with AWS S3.
-
-**Features:**
-
-- **Pipeline Control:** Command-line interface for executing, showing, and managing pipeline tasks.
-- **Logging:** Configures logging to files and RabbitMQ for real-time monitoring.
-- **Parameterization:** Loads and manages configuration parameters from various files.
-- **Task Execution:** Manages the execution of tasks, supporting both local and cluster environments.
-- **Database Upload:** Utilities for uploading processed data to databases.
-- **Temporary File Management:** Functions to handle temporary files and directories.
-- **AWS S3 Integration:** Support for processing files stored in AWS S3.
-
-**Example Usage:**
-
-```python
-from cgatcore import pipeline as P
-
-@P.transform("input.txt", suffix(".txt"), ".processed.txt")
-def process_data(infile, outfile):
-    # Processing logic here
-    pass
-
-if __name__ == "__main__":
-    P.main()
-
-Logging
--------
-
-Logging is set up by :func:`main`. Logging messages will be sent to
-the file :file:`pipeline.log` in the current directory. Additionally,
-messages are sent to an RabbitMQ_ message exchange to permit
-monitoring of pipeline progress.
-
-Running tasks
--------------
-
-:mod:`pipeline` provides a :func:`pipeline.run` method to control
-running commandline tools. The :func:`pipeline.run` method takes care
-of distributing these tasks to the cluster. It takes into
-consideration command line options such as ``--cluster-queue``. The
-command line option ``--local`` will run jobs locally for testing
-purposes.
-
-For running Python code that is inside a module in a distributed
-function, use the :func:`submit` function. The :func:`execute` method
-runs a command locally.
-
-Functions such as :func:`shellquote`, :func:`getCallerLocals`,
-:func:`getCaller`, :func:`buildStatement`, :func:`expandStatement`,
-:func:`joinStatements` support the parameter interpolation mechanism
-used in :mod:`pipeline`.
-
-Parameterisation
-----------------
-
-:mod:`pipeline` provides hooks for reading pipeline configuration
-values from :file:`.ini` files and making them available inside ruffus_
-tasks. The fundamental usage is a call to :func:`getParamaters` with
-a list of configuration files, typically::
-
-    # load options from the config file
-    P.get_parameters(
-        ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
-         "../pipeline.yml",
-         "pipeline.yml"])
-
-The :mod:`pipeline` module defines a global variable :data:`PARAMS`
-that provides access the configuration values. To get a handle to
-this variable outside a pipeline script, call :func:`getParams`::
-
-    my_cmd = "%(scripts_dir)s/bam2bam.py" % P.getParams()
-
-Functions such as :func:`configToDictionary`, :func:`loadParameters`
-:func:`matchParameter`, :func:`substituteParameters` support this
-functionality.
-
-Functions such as :func:`asList` and :func:`isTrue` are useful to work
-with parameters.
-
-The method :func:`peekParameters` allows one to programmatically read the
-parameters of another pipeline.
-
-Temporary files
----------------
-
-Tasks containg multiple steps often require temporary memory storage
-locations. The functions :func:`getTempFilename`, :func:`getTempFile`
-and :func:`getTempDir` provide these. These functions are aware of the
-temporary storage locations either specified in configuration files or
-on the command line and distinguish between the ``private`` locations
-that are visible only within a particular compute node, and ``shared``
-locations that are visible between compute nodes and typically on a
-network mounted location.
-
-Requirements
-------------
-
-The methods :func:`checkExecutables`, :func:`checkScripts` and
-:func:`checkParameter` check for the presence of executables, scripts
-or parameters. These methods are useful to perform pre-run checks
-inside a pipeline if a particular requirement is met. But see also the
-``check`` commandline command.
-
-database upload
----------------
-
-To assist with uploading data into a database, :mod:`pipeline` provides
-several utility functions for conveniently uploading data. The :func:`load`
-method uploads data in a tab-separated file::
-
-    @P.transform("*.tsv.gz", suffix(".tsv.gz"), ".load")
-    def loadData(infile, outfile):
-        P.load(infile, outfile)
-
-The methods :func:`mergeAndLoad` and :func:`concatenateAndLoad` upload
-multiple files into same database by combining them first. The method
-:func:`createView` creates a table or view derived from other tables
-in the database. The function :func:`importFromIterator` uploads
-data from a python list or other iterable directly.
-
-The functions :func:`tablequote` and :func:`toTable` translate track
-names derived from filenames into names that are suitable for tables.
-
-The method :func:`build_load_statement` can be used to create an
-upload command that can be added to command line statements to
-directly upload data without storing an intermediate file.
-
-The method :func:`connect` returns a database handle for querying the
-database.
-
-Package layout
---------------
-
-The module is arranged as a python package with several submodules. Functions
-within a submodule to be exported are all imported to the namespace of
-:mod:`pipeline`.
-
-.. toctree::
-
-   cgatcore.pipeline.control
-   cgatcore.pipeline.database
-   cgatcore.pipeline.execution
-   cgatcore.pipeline.files
-   cgatcore.pipeline.parameters
-   cgatcore.pipeline.utils
-
-
-'''
-# cgatcore/pipeline/__init__.py
-
+"""
 
 # Import existing pipeline functionality
 from cgatcore.pipeline.control import *
@@ -200,6 +53,7 @@ def loadData(infile, outfile):
     'S3Pipeline', 'S3Mapper', 's3_path_to_local', 'suffix',
     's3_mapper', 'configure_s3'
 ]
+
 # Add a docstring for the module
 __doc__ = """
 This module provides pipeline functionality for cgat-core, including support for AWS S3.
@@ -226,4 +80,3 @@ def process_s3_file(infile, outfile):
 # Configure S3 credentials if needed
 P.configure_s3(aws_access_key_id="YOUR_KEY", aws_secret_access_key="YOUR_SECRET")
 """
-
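For completeness, a quick usage sketch of the lazy properties added in `cgatcore/__init__.py` (assumes this branch's package is importable; note that merely importing `cgatcore` already loads both submodules via the re-export lines discussed above):

```python
import cgatcore

core = cgatcore.CgatCore()
assert core._pipeline is None   # nothing cached on a fresh instance
p_first = core.pipeline         # first access performs the import
p_second = core.pipeline        # subsequent accesses hit the cache
assert p_first is p_second      # same module object both times
```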