diff --git a/bcftbx/SolidData.py b/bcftbx/SolidData.py index 2c73306..94c80ba 100644 --- a/bcftbx/SolidData.py +++ b/bcftbx/SolidData.py @@ -1,5 +1,5 @@ # SolidData.py: module for handling data about SOLiD sequencer runs -# Copyright (C) University of Manchester 2011-2019 Peter Briggs +# Copyright (C) University of Manchester 2011-2024 Peter Briggs # ######################################################################## # @@ -8,45 +8,15 @@ ######################################################################### """ -Provides classes for extracting data about SOLiD runs from directory -structure, data files and naming conventions. +Legacy module providing classes for extracting data about SOLiD runs +from directory structure, data files and naming conventions. -Typical usage is to create a new SolidRun instance by pointing it at -the top-level output directory produced by the sequencer: +The functionality of the module has been moved to the 'platforms.solid.data' +module, which supersedes this one. This module is now deprecated and +will be removed in a future release. ->>> solid_run = SolidRun('/path/to/solid0123_20141225_FRAG_BC') - -This will automatically attempt to collect the data about the run, which -can then be accessed via other objects linked through the SolidRun -object's properties. - -The most useful are: - -* SolidRun.run_info: a SolidRunInfo object which holds data extracted - from the run name (e.g. instrument, datestamp etc) - -* SolidRun.samples: a list of SolidSample objects which hold data about - each of the samples in the run. - -Each sample in turn holds a list of libraries within that sample -(SolidLibrary objects in 'SolidSample.libraries') and a list of projects -(SolidProject objects in 'SolidSample.projects'). The 'getLibrary' and -'getProject' methods also provide ways to look up specific libraries -or projects. - -Projects are groupings of libraries (based on library names) which are -assumed to form a single experiment. The libraries within a project can -be obtained via the SolidLibrary.projects, or using the 'getLibrary' -method. - -Finally, SolidLibrary objects hold data about the location of the -primary data files. The 'SolidLibrary.csfasta' and 'SolidLibrary.qual' -properties hold the locations of the data for the F3 reads, while for -paired-end runs the 'SolidLibrary.csfasta_f5' and 'SolidLibrary.qual_f5' -properties point to the F5 reads. - -(The 'is_paired_end' function can be used to test whether a SolidRun -object holds data for a paired-end run.) +The legacy classes have been reimplemented as wrappers to the classes +in the newer module, to preserve backwards compatibility. """ @@ -54,1291 +24,232 @@ # Import modules that this module depends on ####################################################################### -from builtins import str -import sys -import os -import io -import string -import logging -from . import utils +from .platforms.solid.data import * ####################################################################### # Class definitions ####################################################################### -class SolidRun: - """Describe a SOLiD run. - - The SolidRun class provides an interface to data about a SOLiD - run. It analyses the SOLiD data directory to look for run - definitions, statistics files and primary data files. - - It uses the same terminology as the SETS interface and the data - files produced by the SOLiD instrument, so a run contains - 'samples' and each sample contains one or more 'libraries'. - - One initialised, access the data about the run via the SolidRun - object's properties: - - - run_dir: directory with the run data - - run_name: name of the run e.g. solid0123_20130426_FRAG_BC - - run_info: a SolidRunInfo object with data derived from the run name - - run_definition: a SolidRunDefinition object with data extracted from - the run_definition.txt file - - samples: a list of SolidSample objects representing the samples in - the run +class SolidRun(Run): + """ + Describe a SOLiD run. """ - def __init__(self,solid_run_dir): """Create and populate a new SolidRun instance. Arguments: - solid_run_dir: path to the top-level directory holding the files - generated by the SOLiD sequencer run e.g. - /path/to/SOLiD/data/solid0123_20130426_FRAG_BC - """ - - # Initialise - self.run_dir = None - self.run_name = None - self.run_info = None - self.run_definition = None - self.samples = [] - - # Basic data the supplied directory name - if not os.path.isdir(os.path.abspath(solid_run_dir)): - # Directory not found - logging.info("SOLiD data dir '%s' not found" % solid_run_dir) - return - self.run_dir = os.path.abspath(solid_run_dir) - - # Locate and process the run definition file - self.run_name = self.run_dir.strip(os.sep).split(os.sep)[-1] - self.run_defn_filn = os.path.join(self.run_dir, - self.run_name+"_run_definition.txt") - if not os.path.isfile(self.run_defn_filn): - # Unable to find run definition - logging.warning("Unable to find run definition file for %s" % self.run_dir) - # Attempt to recover: look for other possible candidates - self.run_defn_filn = None - for f in os.listdir(self.run_dir): - if f.endswith("_run_definition.txt"): - self.run_defn_filn = os.path.join(self.run_dir,f) - logging.warning("%s: using run definition file %s" % - (os.path.basename(self.run_dir),self.run_defn_filn)) - break - - if self.run_defn_filn: - # Populate run definition object - self.run_definition = SolidRunDefinition(self.run_defn_filn) - # Get run name and info - self.run_name = self.run_definition.runName - self.run_info = SolidRunInfo(self.run_name) - # Populate libraries - for i in range(0,self.run_definition.nSamples()): - sample_name = self.run_definition.getDataItem('sampleName',i) - library_name = self.run_definition.getDataItem('library',i) - # Barcoded samples - # - # Look for content in the "barcodes" column for the library - # in the run definition file - # - # There may be several barcoded samples - # Example barcode items: - # --> "1" - # --> "1,2,3,4,5,6,7,8" - # (or could be empty) - try: - barcodes = self.run_definition.getDataItem('barcodes',i) - except IndexError: - barcodes = '' - logging.debug("%s: barcodes: %s" % (library_name,barcodes)) - library_is_barcoded = (barcodes != '' and barcodes) - if library_is_barcoded: - barcodes = barcodes.strip('"').split(',') - - # Look for the directory with the results - # - # There should be a symlink "results" that will - # point to the actual results directory - results = os.path.join(self.run_dir,sample_name,'results') - if os.path.islink(results): - libraries_dir = os.path.join(self.run_dir, - sample_name, - os.readlink(results), - 'libraries') - else: - libraries_dir = None - self.add_library(sample_name,library_name, - libraries_dir,library_is_barcoded) - else: - logging.warning("No run definition file found for %s" % self.run_dir) - # Improvise run name and info - self.run_name = os.path.basename(self.run_dir) - self.run_info = SolidRunInfo(self.run_name) - # Try to guess samples and libraries - samples = [] - for s in utils.list_dirs(self.run_dir): - logging.debug("Examining subdir %s" % s) - # Look for 'results' subdir - results = os.path.join(self.run_dir,s,'results') - if not os.path.isdir(results): - continue - # Look for 'libraries' subdir - if os.path.islink(results): - libraries_dir = os.path.join(self.run_dir,s, - os.readlink(results), - 'libraries') - else: - continue - # Look for possible libraries - for d in utils.list_dirs(libraries_dir): - logging.debug("Examining putative library subdir %s" % d) - self.add_library(s,d,libraries_dir,False) - - def add_library(self,sample_name,library_name,libraries_dir,is_barcoded): - """Add a library to the SolidRun - - Arguments: - sample_name: name of the sample - library_name: name of the library - libraries_dir: path to the 'libraries' directory containing the - library being added - is_barcoded: True if sample is barcoded, False if not - - """ - try: - # Look for existing sample - sample = None - for s in self.samples: - if sample_name == s.name: - sample = s - break - if sample is None: - # New sample - sample = SolidSample(sample_name,parent_run=self) - self.samples.append(sample) - - # Locate and process barcode statistics - if libraries_dir: - for f in os.listdir(libraries_dir): - if f.startswith("BarcodeStatistics"): - barcode_stats_filn = os.path.join(libraries_dir,f) - sample.barcode_stats = \ - SolidBarcodeStatistics(barcode_stats_filn) - break - else: - logging.warning("%s: libraries directory '%s' is missing" % - (self.run_name,libraries_dir)) - # Locate and process 'unassigned' data - # These are csfasta/qual files in the directory - # /results.F1B1/libraries/unassigned - unassigned_dir = os.path.join(libraries_dir,"unassigned") - logging.debug("%s: 'unassigned' dir %s" % (sample.name,unassigned_dir)) - if os.path.isdir(unassigned_dir): - # Collect information on unassigned read data - sample.unassigned = SolidLibrary("unassigned",parent_sample=sample) - for d in os.listdir(unassigned_dir): - reads_dir = os.path.join(unassigned_dir,d,"reads") - logging.debug("%s: reads dir %s" % (sample.name,reads_dir)) - if os.path.isdir(reads_dir): - csfasta,qual = get_primary_data_file_pair(reads_dir) - if csfasta and qual: - sample.unassigned.addPrimaryData(csfasta,qual) - logging.debug("-----> Adding primary data (unassigned)") - - # Store the library - library = sample.addLibrary(library_name) - library.is_barcoded = is_barcoded - - # Locate data files for this library - # - # This is a bit convoluted but essentially we're - # looking for a "primary.XXXXXXX" subdirectory of the - # subdirectory, which contains a "reject" - # subdirectory - # The "reads" subdirectory parallel to the "reject" - # dir has the data we want - # - # For non-paired-end runs there should only be one - # matching directory, containing data files with 'F3' - # in the names. - # For paired-end runs there should be two matching - # directories, one with the 'F3' files and the other - # with the 'F5' files. - - # Check for directory with result files - if libraries_dir: - this_library_dir = os.path.join(libraries_dir,library.name) - if not os.path.isdir(this_library_dir): - this_library_dir = None - else: - this_library_dir = None - - # Locate the primary data - if this_library_dir: - logging.debug("Library dir: %s..." % this_library_dir) - # Iterate over available directories - for d in os.listdir(this_library_dir): - logging.debug("--> Library %s subdir: %s" % (library_name,d)) - reject = os.path.join(this_library_dir,d,"reject") - reads = os.path.join(this_library_dir,d,"reads") - reports = os.path.join(this_library_dir,d,"reports") - # Check that we have 'reject', 'reads' and 'reports' - if os.path.isdir(reject) and \ - os.path.isdir(reads) and \ - os.path.isdir(reports): - logging.debug("---> has all of reads, reject and reports") - # Check for csfasta and qual files - csfasta,qual = get_primary_data_file_pair(reads) - # Add to list of primary data - if csfasta and qual: - library.addPrimaryData(csfasta,qual) - logging.debug("-----> Adding primary data") - # Finished locating primary data - if not len(library.primary_data): - # No primary data stored - logging.warning("%s: unable to locate any primary data for %s" % - (self.run_name,library)) - else: - # Assign "canonical" primary data files - f3_timestamp = None - f5_timestamp = None - for primary_data in library.primary_data: - if primary_data.is_f3(): - if f3_timestamp is None or primary_data.timestamp > f3_timestamp: - # This data has newer timestamp - library.csfasta = primary_data.csfasta - library.qual = primary_data.qual - f3_timestamp = primary_data.timestamp - elif primary_data.is_f5(): - if f5_timestamp is None or primary_data.timestamp > f5_timestamp: - # This data has newer timestamp - library.csfasta_f5 = primary_data.csfasta - library.qual_f5 = primary_data.qual - f5_timestamp = primary_data.timestamp - except Exception as ex: - logging.error("Exception adding sample: %s" % ex) - raise ex - - @property - def is_paired_end(self): - """Return True if run is paired end, False if not + solid_run_dir (str): path to the top-level directory holding + the files generated by the SOLiD sequencer run e.g. + '/path/to/SOLiD/data/solid0123_20130426_FRAG_BC' """ - return is_paired_end(self) + Run.__init__(self, solid_run_dir, + classes=dict(BarcodeStatistics=SolidBarcodeStatistics, + Library=SolidLibrary, + RunDefinition=SolidRunDefinition, + RunInfo=SolidRunInfo, + Sample=SolidSample)) - def verify(self): - """Perform verification checks on the SOLiD run directory - - Checks for the expected sample and library directories, and that - primary data files (csfasta and qual) have been assigned and exist. - - Returns: - True if the checks pass, False if there is a problem. + def fetchLibraries(self, sample_name='*', library_name='*'): """ - if not self: - # Some error processing the basics - return False - # Check basic parameters: should have non-zero numbers of - # samples and libraries - if len(self.samples) == 0: - logging.warning("No sample data") - return False - # Check libraries in each sample - run_ok = True - for sample in self.samples: - if len(sample.libraries) == 0: - logging.warning("No libraries for sample %s" % sample.name) - run_ok = False - for library in sample.libraries: - # Check csfasta was found - if not library.csfasta: - logging.warning("No F3 csfasta for %s/%s" % (sample.name, - library.name)) - run_ok = False - elif not os.path.exists(library.csfasta): - logging.warning("Missing F3 csfasta for %s/%s" % (sample.name, - library.name)) - run_ok = False - # Check qual was found - if not library.qual: - logging.warning("No F3 qual for %s/%s" % (sample.name, - library.name)) - run_ok = False - elif not os.path.exists(library.qual): - logging.warning("Missing F3 qual for %s/%s" % (sample.name, - library.name)) - run_ok = False - # Paired-end run: check F5 reads - if self.is_paired_end: - # Check for F5 csfasta - if not library.csfasta_f5: - logging.warning("No F5 csfasta for %s/%s" % (sample.name, - library.name)) - run_ok = False - elif not os.path.exists(library.csfasta_f5): - logging.warning("Missing F5 csfasta for %s/%s" % (sample.name, - library.name)) - run_ok = False - # Check for F5 qual - if not library.qual_f5: - logging.warning("No F5 qual for %s/%s" % (sample.name, - library.name)) - run_ok = False - elif not os.path.exists(library.qual_f5): - logging.warning("Missing F5 qual for %s/%s" % (sample.name, - library.name)) - run_ok = False - # Completed checks - return run_ok - - def fetchLibraries(self,sample_name='*',library_name='*'): - """Retrieve libraries based on sample and library names - - Supplied names can be exact matches or simple patterns (using trailing - '*'s as wildcards). '*' matches all names. - - Arguments: - sample_name: (optional) a name or pattern to match against sample - names (matches all sample names if not specified) - library_name: (optional) a name or pattern to match against - library names (matches all library names if not specified) - - Returns: - A list of SolidLibrary objects which match the supplied sample and - library names or patterns. + Retrieve libraries based on sample and library names """ - matching_libraries = [] - for sample in self.samples: - if match(sample_name,sample.name): - # Found a matching sample - for library in sample.libraries: - if match(library_name,library.name): - # Found a matching library - logging.debug("Located sample and library: %s/%s" % - (sample.name,library.name)) - matching_libraries.append(library) - if len(matching_libraries) == 0: - logging.debug("No libraries matched to %s/%s in %s" % (sample_name,library_name, - self.run_dir)) - # Finished - return matching_libraries + return Run.fetch_libraries(self, + sample_name=sample_name, + library_name=library_name) def slideLayout(self): - """Return description of the slide layout - - Returns: - A string describing the slide layout for the run based on - the number of samples in the run, e.g. "Whole slide", "Quads", - "Octets" etc. - Returns None if the slide layout can't be determined. """ - return slide_layout(len(self.samples)) - - def __nonzero__(self): - """Implement __nonzero__ built-in - + Return description of the slide layout """ - return self.__bool__() - - def __bool__(self): - """Implement __bool__ built-in - - SolidRun object is False if the source directory doesn't - exist, or if basic data couldn't be loaded.""" - if not self.run_name: - return False - elif not self.run_info: - return False - elif not self.run_definition and len(self.samples) == 0: - return False - else: - return True - -class SolidSample: - """Store information about a sample in a SOLiD run. - - A sample has a name and contains a set of libraries. - The information about the sample can be accessed via the - following properties: - - - name: the sample name - - libraries: a list of SolidLibrary objects representing the libraries - within the sample - - projects: a list of SolidProject objects representing groups of - related libraries within the sample - - unassigned: SolidProject object representing the 'unassigned' data - - barcode_stats: a SolidBarcodeStats object with data extracted from - the BarcodeStatistics file (or None, if no file was available) - - parent_run: the parent SolidRun object, or None. + return Run.slide_layout(self) - The class also provides the following methods: - - addLibrary: to create and append a SolidLibrary object - - getLibrary: fetch an existing SolidLibrary - - getProject: fetch an existing SolidProject - - Typically the calling subprogram calls the 'addLibrary' method to - add a SolidLibrary object, which it then populates itself. - - The SolidSample class automatically creates SolidProject objects - based on the library names to group libraries considered to belong - to the same experiments. +class SolidSample(Sample): """ + Store information about a sample in a SOLiD run. - def __init__(self,name,parent_run=None): - """Create a new SolidSample instance. + Arguments: + name (str): name of the sample (e.g. AS_XC_pool) + parent_run (SolidRun): (optional) the parent SolidRun object + """ + def __init__(self, name, parent_run=None): + Sample.__init__(self, name, parent_run=parent_run, + classes=dict(Library=SolidLibrary, + Project=SolidProject)) - Arguments: - name: name of the sample (e.g. AS_XC_pool) - parent_run: (optional) the parent SolidRun object + def addLibrary(self, library_name): """ - self.name = name - self.libraries = [] - self.libraries_dir = None - self.barcode_stats = None - self.projects = [] - self.unassigned = None - self.parent_run = parent_run - - def __repr__(self): - """Implement __repr__ built-in - - Return string representation for the SolidSample - - i.e. the sample name.""" - return str(self.name) - - def addLibrary(self,library_name): - """Associate a library with the sample - - The supplied library is added to the list of libraries - associated with the sample, if it's not already in the - list. - - Arguments: - library_name: name of the library to add - - Returns: - New or existing SolidLibrary object representing the - library. + Associate a library with the sample """ - # Check if the library is already in the list - library = self.getLibrary(library_name) - if not library: - # Create new library object and add to list - library = SolidLibrary(library_name,parent_sample=self) - self.libraries.append(library) - # Keep libraries in order - self.libraries = sorted(self.libraries, - key=lambda l: (l.prefix,l.index)) - # Deal with projects - project_name = library.initials - project = self.getProject(project_name) - if not project: - # Create new project - project = SolidProject(project_name) - self.projects.append(project) - # Add the library to the project - project.addLibrary(library) - # Return library object - return library - - def getLibrary(self,library_name): - """Look up library object matching a library name + return Sample.add_library(self, library_name) - Arguments: - library_name: name of library to look up - - Returns: - SolidLibrary object with the same name as 'library_name', - or None if no names match. + def getLibrary(self, library_name): """ - for library in self.libraries: - if library.name == library_name: - return library - # Not found - return None - - def getProject(self,project_name): - """Look up project object matching a project name - - Arguments: - project_name: name of the project to look up - - Returns: - SolidProject object with the same name as 'project_name', - or None if no names match. + Look up library object matching a library name """ - for project in self.projects: - if project.name == project_name: - return project - # Not found - return None + return Sample.get_library(self, library_name) -class SolidLibrary: - """Store information about a SOLiD library. - - The following properties hold data about the library: - - - name: the library name - - initials: the experimenter's initials - - prefix: the library name prefix (i.e. name without the trailing - numbers) - - index_as_string: the trailing numbers from the name, as a string - (preserves any leading zeroes) - - index: the trailing numbers from the name as an integer - - csfasta: full path to the csfasta file for the library (F3 reads) - - qual: full path to qual file for the library (F3 reads) - - csfasta_f5: full path to the F5 read (paired-end runs, otherwise - will be None) - - qual_f5: full path to the F5 read (paired-end runs, otherwise will - be None) - - primary_data: list of SolidPrimaryData objects for all possible - primary data file pairs associated with the library - - parent_sample: parent SolidSample object, or None. + def getProject(self, project_name): + """ + Look up project object matching a project name + """ + return Sample.get_project(self, project_name) - The following methods are also available: - - addPrimaryData: creates a new SolidPrimaryData object and appends - to the list in the primary_data property +class SolidLibrary(Library): """ + Store information about a SOLiD library. + Arguments: + name (str): name of the library (e.g. AS_07) + parent_sample (SolidSample): (optional) parent SolidSample + object + """ def __init__(self,name,parent_sample=None): - """Create a new SolidLibrary instance. + Library.__init__(self, name, parent_sample=parent_sample, + classes=dict(PrimaryData=SolidPrimaryData, + Project=SolidProject)) - Inputs: - name: name of the library (e.g. AS_07) - parent_sample: (optional) parent SolidSample object + def addPrimaryData(self, csfasta, qual): """ - # Name - self.name = str(name) - # Name-based information - self.initials = utils.extract_initials(self.name) - self.prefix = utils.extract_prefix(self.name) - self.index_as_string = utils.extract_index_as_string(self.name) - self.index = utils.extract_index(self.name) - # Barcoding - self.is_barcoded = False - # Associated canonical data files - self.csfasta = None - self.qual = None - self.csfasta_f5 = None - self.qual_f5 = None - # References to all primary data - self.primary_data = [] - # Parent sample - self.parent_sample = parent_sample - - def addPrimaryData(self,csfasta,qual): - """Add reference to primary data to the library - - Creates and populates a new SolidPrimaryData instance - and adds it to the list of primary data objects - associated with this library. - - Arguments: - csfasta: full path and name of the csfasta file - qual : full path and name of the qual file - - Returns: - The new SolidPrimaryData object + Add reference to primary data to the library """ - # Create and populate primary data object - primary_data = SolidPrimaryData() - primary_data.csfasta = csfasta - primary_data.qual = qual - # Deal with type (F3 or F5) - try: - csfasta.index('_F5-BC_') - primary_data.type = 'F5' - except ValueError: - pass - try: - csfasta.index('_F3_') - primary_data.type = 'F3' - except ValueError: - pass - # Deal with timestamp - primary_data.timestamp = extract_library_timestamp(csfasta) - if primary_data.timestamp != extract_library_timestamp(qual): - logging.warning("Timestamps differ on CSFASTA/QUAL pair for %s" % self.name) - # Append to the list of primary data files - self.primary_data.append(primary_data) - # Sort into timestamp order (newest to older) - self.primary_data = sorted(self.primary_data, - key=lambda x: x.timestamp) - # Return the SolidPrimaryData object - return primary_data - - def __repr__(self): - """Implement __repr__ built-in + return Library.add_primary_data(self, csfasta, qual) - Return string representation for the SolidLibrary - - i.e. the library name.""" - return str(self.name) - -class SolidPrimaryData: - """Class to store references to primary data files - - This is a convenience class for storing references to csfasta/qual - file pairs within a SolidLibrary instance. - - The class provides the following attributes: - - csfasta: full path to csfasta file - qual: full path to qual file - timestamp: timestamp associated with the file pair - type: string indicating 'F3' or 'F5', or None - - The following methods are provided: - - is_f3: indicates if data is F3 - is_f5: indicates if data is F5 +class SolidPrimaryData(PrimaryData): + """ + Class to store references to primary data files """ def __init__(self): - """Create a new SolidPrimaryData instance. - - """ - self.timestamp = None - self.csfasta = None - self.qual = None - self.type = None - - def is_f3(self): - """Returns True if this is F3 data, False otherwise - """ - return (self.type == 'F3') - - def is_f5(self): - """Returns True if this is F5 data, False otherwise - """ - return (self.type == 'F5') - - def __repr__(self): - return self.timestamp - -class SolidProject: - """Class to hold information about a SOLiD 'project' - - A SolidProject object holds a collection of libraries which - together constitute a 'project'. - - The definition of a 'project' is quite loose in this context: - essentially it's a grouping of libraries within a sample. - Typically the grouping is by the initial letters of the library - name e.g. DR for DR1, EP for EP_NCYC2669 - but this determination - is made at the application level. - - Libraries are added to the project via the addLibrary method. - Data about the project can be accessed via the following - properties: + PrimaryData.__init__(self) - name: the project name (supplied on object creation) - libraries: a list of libraries in the project - Also has the following methods: - - - getSample(): returns the parent SolidSample - - getRun(): returns the parent SolidRun - - isBarcoded(): returns boolean indicating whether the libraries - in the sample are barcoded +class SolidProject(Project): """ + Class to hold information about a SOLiD 'project' + Arguments: + name (str): the name of the project. + run (SolidRun): (optional) the parent SolidRun for the + project + sample (SolidSample): (optional) the parent SolidSample + for the project + """ def __init__(self,name,run=None,sample=None): - """Create a new SolidProject object. - - Arguments: - name: the name of the project. - run: (optional) the parent SolidRun for the project - sample: (optional) the parent SolidSample for the project - """ - self.name = name - self.libraries = [] + Project.__init__(self, name, run=run, sample=sample) def addLibrary(self,library): - """Add a library to the project. - - Arguments: - library: SolidLibrary object to add to the project """ - if not isinstance(library,SolidLibrary): - raise TypeError("addLibrary requires SolidLibrary instance") - self.libraries.append(library) + Add a library to the project. + """ + return Project.add_library(self, library) def getSample(self): - """Return the parent sample for the project. - - Returns the parent SolidSample object, or None if no parent - is defined. """ - if len(self.libraries): - return self.libraries[0].parent_sample - else: - return None + Return the parent sample for the project. + """ + return Project.get_sample(self) def getRun(self): - """Return the parent run for the project. - - Returns the parent SolidRun object for the project, by - looking up the run that the parent sample belongs to. - Returns None if no parent sample is defined, or if the - parent sample doesn't have a parent run. """ - parent_sample = self.getSample() - if parent_sample: - return parent_sample.parent_run - else: - return None + Return the parent run for the project. + """ + return Project.get_run(self) def isBarcoded(self): - """Return boolean indicating if the libraries are barcoded. - - If all libraries in the project are barcoded then return - True, otherwise return False if at least one isn't barcoded - (or if there are no libraries associated with the project). """ - # If any library is not barcoded, return False - for library in self.libraries: - if not library.is_barcoded: - return False - # Will be True as long as there's at least one library - return len(self.libraries) > 0 + Return boolean indicating if the libraries are barcoded. + """ + return Project.is_barcoded(self) def getLibraryNamePattern(self): - """Return wildcard pattern matching all library names in the project. - - Find the longest pattern which matches all the library names in - the project. For example if the project contains four libraries - PB1, PB2, PB3 and PB4 then return 'PB*'. - - If the project only contains one library then the pattern will be - the single name without wildcard characters. """ - pattern = None - for library in self.libraries: - if pattern is None: - pattern = library.name - else: - new_pattern = [] - for i in range(min(len(pattern),len(library.name))): - if pattern[i] != library.name[i]: - if len(new_pattern) < len(library.name): - new_pattern.append('*') - pattern = ''.join(new_pattern) - break - else: - new_pattern.append(pattern[i]) - return pattern + Return wildcard pattern matching all library names in the project. + """ + return Project.get_library_name_pattern(self) def getProjectName(self): - """Return a name for the project. - - Typically this is the same as the project name assigned when - the project was created, unless the project essentially maps - to an entire sample (i.e. all the libraries in the parent - sample are also in the project) - then the project name is - the sample name. """ - if len(self.getSample().libraries) == len(self.libraries): - return self.getSample().name - else: - return self.name + Return a name for the project. + """ + return Project.get_project_name(self) def prettyPrintLibraries(self): - """Return a nicely formatted string describing the library names - - Wraps a call to 'pretty_print_names' function. """ - return utils.pretty_print_names(self.libraries) - - def getTimeStamps(self): - """Return a list of timestamps for primary data - - Returns a list of the unique timestamps associated with all - primary data files within the project. + Return a nicely formatted string describing the library names """ - timestamps = [] - for library in self.libraries: - for primary_data in library.primary_data: - if primary_data.timestamp not in timestamps: - timestamps.append(primary_data.timestamp) - return timestamps - -class SolidRunInfo: - """Extract data about a run from the run name - - Run names are of the form 'solid0123_20130426_FRAG_BC_2' - - This class analyses the name and breaks it down into components - that can be accessed as object properties, specifically: - - name: the supplied run name - instrument: the instrument name e.g. solid0123 - datestamp: e.g. 20130426 - is_fragment_library: True or False - is_barcoded_sample: True or False - flow_cell: 1 or 2 - date: datestamp reformatted as DD/MM/YY - id: the run name without any flow cell identifier - """ - - def __init__(self,run_name): - """Create and initialise a new SolidRunInfo instance + return Project.pretty_print_libraries(self) - Arguments: - run_name: the name of the run, e.g. solid0123_20130426_FRAG_BC_2. - NB this is not a path to a run directory + def getTimeStamps(self): """ - # Initialise - self.name = str(run_name) - self.id = None - self.instrument = None - self.datestamp = None - self.is_paired_end = False - self.is_fragment_library = False - self.is_barcoded_sample = False - self.flow_cell = 1 - self.date = None - # - if self.name.count('_') == 0: - return - # - data = self.name.split('_') - # - # Basic info - self.instrument = data[0] - self.datestamp = data[1] - # - # Paired end - if 'PE' in data: - self.is_paired_end = True - # - # Fragment library - if 'FRAG' in data: - self.is_fragment_library = True - # - # Barcoded sample - if 'BC' in data: - self.is_barcoded_sample = True - # - # Flow cell - if data[-1] == '2': - self.flow_cell = 2 - # - # I.D. - self.id = "%s_%s" % (self.instrument, - self.datestamp) - if self.is_fragment_library: - self.id += "_FRAG" - if self.is_paired_end: - self.id += "_PE" - if self.is_barcoded_sample: - self.id += "_BC" - # - # Date - if len(self.datestamp) == 8: - self.date = "%s/%s/%s" % (self.datestamp[6:8], - self.datestamp[4:6], - self.datestamp[2:4]) - - def __repr__(self): - """Implement __repr__ built in for str etc - - Returns the orginal run name, which encodes all the - information held in the object. + Return a list of timestamps for primary data """ - return self.name - -class SolidRunDefinition: - """Class to store data from a SOLiD run definition file + return Project.get_timestamps(self) - Once the SolidRunDefinition object is populated from a run - definition file, use the 'nSamples' method to find out how - many 'samples' (actually sample/library pairs) are defined, - and the 'fields' method to get a list of column headings for - each. - Data can be extracted for each sample using the 'getDataItem' - method to look up the value for a particular field on a - particular line, e.g.: +class SolidRunInfo(RunInfo): + """ + Extract data about a run from the run name - >>> library = run_defn.getDataItem('library',0) + Arguments: + run_name: the name of the run, e.g. solid0123_20130426_FRAG_BC_2. + NB this is not a path to a run directory + """ + def __init__(self,run_name): + RunInfo.__init__(self, run_name=run_name) - The SolidRunDefinition object also has a number of attributes - populated from the header of the run definition file, - specifically: - - version, userId, runType, isMultiplexing, runName, runDesc, - mask and protocol. - The attributes are strings and can be accessed directly from - the object, e.g.: +class SolidRunDefinition(RunDefinition): + """ + Class to store data from a SOLiD run definition file - >>> version = run_defn.version - >>> isMultiplexing = run_defn.isMultiplexing + Arguments: + run_definition_file (str): name of run definition file (including + any leading path) from which to extract data """ def __init__(self,run_definition_file): - """Create a new SolidRunDefinition object. - - Arguments: - run_definition_file: name of run definition file (including - any leading path) from which to extract data - """ - self.file = run_definition_file - # Header attributes - self.version = None - self.userId = None - self.runType = None - self.isMultiplexing = None - self.runName = None - self.runDesc = None - self.mask = None - self.protocol = None - # Data about specific samples/libraries - self.header_fields = [] - self.data = [] - try: - self.populate() - except IOError as ex: - logging.error("Failed to populate SolidRunDefinition: '%s'" % ex) - - def __nonzero__(self): - """Implement the built-in __nonzero__ method""" - return self.__bool__() - - def __bool__(self): - """Implement the built-in __bool__ method""" - return (len(self.data) != 0) - - def fields(self): - """Return list of fields""" - return self.header_fields + RunDefinition.__init__(self, + run_definition_file=run_definition_file) def nSamples(self): - """Return the number of samples""" - return len(self.data) - - def getDataItem(self,field,i): - """Return data item from specified row - - Arguments: - field: one of the fields read in from the file - i: integer row index. - - Returns: - Value stored for 'field' in row 'i'. Returns None - if the field isn't found, and raises an IndexError - exception if the index doesn't address an existing - row.""" - try: - pos = self.header_fields.index(field) - except ValueError: - logging.error("Field '%s' not found in '%s'" % (field,self.file)) - return None - return self.data[i][pos] - - def populate(self): - """Populate the SolidRunDefiniton object. - - Internal: loads data from the run definition file into - the object.""" - # Initialise - reading_header = False - reading_samples = False - # Open the file - f = io.open(self.file,'rt') - for line in f: - # Look for the header line (first line of the file) describing run attributes - # This looks like: - # version userId runType isMultiplexing runName runDesc mask protocol - if line.startswith("version"): - reading_header = True - elif reading_header: - # Store the data from the header - data = line.strip().split('\t') - self.version = data[0] - self.userId = data[1] - self.runType = data[2] - self.isMultiplexing = data[3] - self.runName = data[4] - self.runDesc = data[5] - self.mask = data[6] - self.protocol = data[7] - reading_header = False - # Look for header line for sample/library data - # This looks like: - # sampleName sampleDesc spotAssignments primarySetting library application secondaryAnalysis multiplexingSeries barcodes - if line.startswith("sampleName"): - for field in line.strip().split('\t'): - self.header_fields.append(field) - reading_samples = True - elif reading_samples: - # Deal with information under the header - data = line.strip().split('\t') - self.data.append(data) - # Finished - f.close() - -class SolidBarcodeStatistics: - """Store data from a SOLiD BarcodeStatistics file""" - - def __init__(self,barcode_statistics_file): - """Create a new SolidBarcodeStatistics object""" - self.file = barcode_statistics_file - self.header = None - self.data = [] - try: - self.populate() - except IOError as ex: - logging.error("Failed to populate SolidBarcodeStatistics: '%s'" % ex) - - def __nonzero__(self): - """Implement __nonzero__ built-in """ - return self.__bool__() - - def __bool__(self): - """Implement the __bool__ built-in""" - return (len(self.data) != 0) - - def populate(self): - """Populate the SolidBarcodeStatistics object. + Return the number of samples """ - got_header = False - f = io.open(self.file,'rt') - for line in f: - if got_header: - data = line.strip().split('\t') - self.data.append(data) - elif line.startswith('##'): - self.header = line.strip().strip('#').split('\t') - got_header = True - f.close() - - def header(self): - """Return list of header fields""" - return self.header - - def nRows(self): - """Return the number of rows""" - return len(self.data) + return RunDefinition.n_samples(self) - def getDataByName(self,name): - """Return a row of data matching 'name' + def getDataItem(self, field, i): """ - for data in self.data: - if data[0] == name: - return data - return None - - def totalReads(self): - """Return the total reads - - The total reads are calculated as the totals for all beads less the - subtotal of unassigned reads. - - If the required data cannot be found from the barcode stats data then - returns None. + Return data item from specified row """ - unassigned_subtotals = None - all_beads_totals = None - for line in self.data: - if line[0] == 'unassigned' and line[1] == 'Subtotals': - # Look for unassigned subtotals - logging.debug(">>> %s" % line) - if unassigned_subtotals is not None: - logging.warning("Multiple unassigned subtotals found") - try: - unassigned_subtotals = int(line[-1]) - except ValueError: - logging.error("Unassigned subtotal '%s' is not an integer" % line[-1]) - elif line[0] == 'All Beads' and line[1] == 'Totals': - # Look for total of all beads - logging.debug(">>> %s" % line) - if all_beads_totals is not None: - logging.warning("Multiple all beads totals found") - try: - all_beads_totals = int(line[-1]) - except ValueError: - logging.error("Unassigned subtotal '%s' is not an integer" % line[-1]) - # Work out the total reads - if unassigned_subtotals is None or all_beads_totals is None: - logging.error("Unable to acquire values for one or both of subtotals or totals") - return None - else: - return all_beads_totals - unassigned_subtotals - -####################################################################### -# Module Functions -####################################################################### - -def list_run_directories(solid_run_dir): - """Return list of matching run directories - - Given the name of a SOLiD run directory, find all the 'matching' - run directories based on the instrument name and date stamp. + return RunDefinition.get_data_item(self, field, i) - For example, 'solid0127_20120123_FRAG_BC' and - 'solid0127_20120123_FRAG_BC_2' would form a matching set, as would - 'solid0127_20120123_PE_BC' etc. - - For "nonstandard" names (e.g. 'solid0127_20120123_PE_BC_COPY', if - no matches are found then just the input is returned. - - Returns a list of matching directories which includes the input. - - """ - # Break up the input - base_dir = os.path.dirname(os.path.abspath(solid_run_dir)) - run_name = os.path.basename(solid_run_dir.rstrip(os.sep)) - # Get the run info from the name - try: - base_run_info = SolidRunInfo(run_name) - except Exception: - # Wrong format for name - logging.error("'%s' not a valid SOLiD run directory name" % solid_run_dir) - return [] - # List all directories in the base dir and look for matches - dirs = [] - for f in os.listdir(base_dir): - if os.path.isdir(os.path.join(base_dir,f)): - try: - # Check if instrument name and datestamp match - run_info = SolidRunInfo(f) - if run_info.instrument != base_run_info.instrument or \ - run_info.datestamp != base_run_info.datestamp: - # Not a match - continue - except Exception: - # Wrong format for name, not a match - continue - # Check for run definition file - if not os.path.exists(os.path.join(base_dir,f,f+'_run_definition.txt')): - continue - # Must be a match, add to the list - dirs.append(os.path.join(base_dir,f)) - # Check that the original run is also included - if os.path.abspath(solid_run_dir) not in dirs: - dirs = [solid_run_dir] - # Sort and return directories - dirs.sort() - return dirs - -def is_paired_end(solid_run): - """Determine if a SolidRun instance is a paired-end run - - Arguments: - solid_run: a populated SolidRun instance - - Returns: - True if this is a paired-end run, False otherwise. +class SolidBarcodeStatistics(BarcodeStatistics): """ - if solid_run.run_definition: - return (solid_run.run_definition.runType == "PAIRED-END") - else: - for sample in solid_run.samples: - for lib in sample.libraries: - if lib.csfasta_f5: return True - return False - -def get_primary_data_file_pair(dirn): - """Return csfasta/qual file pair from specified directory - - Arguments: - dirn: directory to search for csfasta/qual pair - - Returns: - Tuple (csfasta,qual) with full path for each file, or - (None,None) if a pair wasn't located. + Store data from a SOLiD BarcodeStatistics file """ - csfasta = None - qual = None - for filen in os.listdir(dirn): - ext = os.path.splitext(filen)[1] - if ext == ".csfasta": - csfasta = os.path.abspath(os.path.join(dirn,filen)) - elif ext == ".qual": - qual = os.path.abspath(os.path.join(dirn,filen)) - if csfasta and qual: - return (csfasta,qual) - else: - return (None,None) - -def extract_library_timestamp(path): - """Extract the timestamp string from a path - Given a path of the form '/path/to/data/.../primary.1234567/...', - return the timestamp string attached to the 'primary.XXXXXXX' - component of the name. - - Arguments: - path: absolute or relative path to arbitrary directory or - file in the SOLiD data structure - - Returns: - Timestamp string, or None if no timestamp was identified. - """ - # Given the path to a file, extract the timestamp attached to - # the "primary" component of the name - for c in path.split(os.sep): - if c.startswith("primary."): - i = c.index('.') - timestamp = c[i+1:] - return timestamp - # Got to the end without locating the timestamp, return None - return None - -def match(pattern,word): - """Check if a word matches pattern - - Implements a very simple pattern matching algorithm, which allows - only exact matches or glob-like strings (i.e. using a trailing '*' - to indicate a wildcard). - - For example: 'ABC*' matches 'ABC', 'ABC1', 'ABCDEFG' etc, while - 'ABC' only matches itself. - - Arguments: - pattern: simple glob-like pattern - word: string to test against 'pattern' - - Returns: - True if 'word' is a match to 'pattern', False otherwise. - """ - if not pattern or pattern == '*': - # No pattern/wildcard, matches everything - return True - # Only simple patterns considered for now - if pattern.endswith('*'): - # Match the start - return word.startswith(pattern[:-1]) - else: - # Match the whole word exactly - return (word == pattern) - -def slide_layout(nsamples): - """Description of the slide layout based on number of samples - - Arguments: - nsamples: number of samples in the run + def __init__(self,barcode_statistics_file): + BarcodeStatistics.__init__(self, + barcode_statistics_file=\ + barcode_statistics_file) - Returns: - A string describing the slide layout for the run based on the - number of samples in the run, e.g. "Whole slide", "Quads", - "Octets" etc. Returns None if the number of samples doesn't - map to a recognised layout. - """ - if nsamples == 1: - return "Whole slide" - elif nsamples == 4: - return "Quads" - elif nsamples == 8: - return "Octets" - else: - logging.warning("Undefined layout for '%s' samples" % nsamples) - return None + def nRows(self): + """ + Return the number of rows + """ + return BarcodeStatistics.n_rows(self) + def getDataByName(self, name): + """ + Return a row of data matching 'name' + """ + return BarcodeStatistics.get_data_by_name(self, name) + def totalReads(self): + """ + Return the total reads + """ + return BarcodeStatistics.total_reads(self)