diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c8f360e..4bb7172 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,4 +20,4 @@ jobs: pip install -r requirements/dev . - name: Launch tests run: | - nosetests ./tests + python -m unittest discover -s ./tests -t . diff --git a/config/app.config b/config/app.config index d11ff09..6bf17f1 100644 --- a/config/app.config +++ b/config/app.config @@ -8,6 +8,7 @@ runfolder_directory: tests/resources/runfolders general_project_directory: tests/resources/projects staging_directory: /tmp/ project_links_directory: /tmp/ +readme_directory: tests/resources/readme dds_conf: log_path: dds.log port: 9999 diff --git a/delivery/__init__.py b/delivery/__init__.py index 0552768..f5f41e5 100644 --- a/delivery/__init__.py +++ b/delivery/__init__.py @@ -1 +1 @@ -__version__ = "3.0.1" +__version__ = "3.1.0" diff --git a/delivery/app.py b/delivery/app.py index 6b4aa7a..c0349fc 100644 --- a/delivery/app.py +++ b/delivery/app.py @@ -25,7 +25,8 @@ FileSystemBasedUnorganisedRunfolderRepository from delivery.repositories.staging_repository import DatabaseBasedStagingRepository from delivery.repositories.deliveries_repository import DatabaseBasedDeliveriesRepository -from delivery.repositories.project_repository import GeneralProjectRepository, UnorganisedRunfolderProjectRepository +from delivery.repositories.project_repository import GeneralProjectRepository, \ + UnorganisedRunfolderProjectRepository from delivery.repositories.delivery_sources_repository import DatabaseBasedDeliverySourcesRepository from delivery.repositories.sample_repository import RunfolderProjectBasedSampleRepository @@ -120,13 +121,19 @@ def _assert_is_dir(directory): project_links_directory = config["project_links_directory"] _assert_is_dir(project_links_directory) - runfolder_repo = FileSystemBasedRunfolderRepository(runfolder_dir) - project_repository = UnorganisedRunfolderProjectRepository( - sample_repository=RunfolderProjectBasedSampleRepository() + readme_directory = config["readme_directory"] + _assert_is_dir(readme_directory) + + sample_repo = RunfolderProjectBasedSampleRepository() + runfolder_repo = FileSystemBasedRunfolderRepository( + runfolder_dir ) unorganised_runfolder_repo = FileSystemBasedUnorganisedRunfolderRepository( runfolder_dir, - project_repository=project_repository + project_repository=UnorganisedRunfolderProjectRepository( + sample_repository=sample_repo, + readme_directory=readme_directory + ) ) general_project_dir = config['general_project_directory'] diff --git a/delivery/models/project.py b/delivery/models/project.py index 47a69c0..cb2ae09 100644 --- a/delivery/models/project.py +++ b/delivery/models/project.py @@ -35,18 +35,29 @@ def __hash__(self): class RunfolderProject(BaseProject): """ - Model a project directory in a runfolder on disk. Note that this means that this project model only extends - to the idea of projects as subdirectories in a demultiplexed Illumina runfolder. + Model a project directory in a runfolder on disk. Note that this means that this project model + only extends to the idea of projects as subdirectories in a demultiplexed Illumina runfolder. """ - def __init__(self, name, path, runfolder_path, runfolder_name, samples=None, project_files=None): + def __init__( + self, + name, + path, + runfolder_path, + runfolder_name, + samples=None, + project_files=None + ): """ Instantiate a new `RunfolderProject` object + :param name: of the project :param path: path to the project :param runfolder_path: path the runfolder in which this project is stored. :param runfolder_name: name of the runfolder in which this project is stored :param samples: list of instances of Sample, representing samples in the project + :param project_files: list of instances of RunfolderFile, representing files belonging to + the project. These do not include files already accounted for in the `samples` list. """ self.name = name self.path = os.path.abspath(path) diff --git a/delivery/models/runfolder.py b/delivery/models/runfolder.py index 3691132..0896433 100644 --- a/delivery/models/runfolder.py +++ b/delivery/models/runfolder.py @@ -37,7 +37,63 @@ def __hash__(self): class RunfolderFile(object): - def __init__(self, file_path, file_checksum=None): + def __init__( + self, + file_path, + base_path=None, + file_checksum=None + ): + """ + A `RunfolderFile` object representing a file in the runfolder + + If specified, the `base_path` parameter should specify the path that the file will be + considered relative to. For example, if `file_path` is `/path/to/example/file_name` and + `base_path` is `/path/to`, the file object, if moved or symlinked, will be placed under the + intermediate directory, i.e. `example/file_name`. + + :param file_path: the path to the file + :param base_path: a path relative to which the file will be considered + :param file_checksum: a computed checksum for the file + """ self.file_path = os.path.abspath(file_path) self.file_name = os.path.basename(file_path) + self.base_path = base_path or os.path.dirname(self.file_path) self.checksum = file_checksum + + @classmethod + def create_object_from_path( + cls, + file_path, + runfolder_path, + filesystem_service, + metadata_service, + base_path=None, + checksums=None + ): + """ + Factory method that creates and returns a class instance. + + :param file_path: the path to the file + :param runfolder_path: the path to the runfolder containing the file + :param filesystem_service: a service which can access the file system + :param metadata_service: a MetadataService for reading and writing metadata files + :param base_path: a path relative to which the file will be considered + :param checksums: a list of pre-computed checksums that may or may not contain an entry for + the `file_path` relative to the `runfolder_path` + :return: a class object representing a file in the runfolder + """ + checksums = checksums or {} + relative_file_path = filesystem_service.relpath( + file_path, + filesystem_service.dirname( + runfolder_path + ) + ) + checksum = checksums[relative_file_path] \ + if relative_file_path in checksums \ + else metadata_service.hash_file(file_path) + return cls( + file_path, + base_path=base_path, + file_checksum=checksum + ) diff --git a/delivery/models/sample.py b/delivery/models/sample.py index b6ba45a..8187065 100644 --- a/delivery/models/sample.py +++ b/delivery/models/sample.py @@ -46,9 +46,16 @@ def __init__( lane_no=None, read_no=None, is_index=None, - checksum=None): + base_path=None, + checksum=None + ): """ - Instantiate a new `SampleFile` object + A `SampleFile` object + + If specified, the `base_path` parameter should specify the path that the file will be + considered relative to. For example, if `file_path` is `/path/to/example/file_name` and + `base_path` is `/path/to`, the file object, if moved or symlinked, will be placed under the + intermediate directory, i.e. `example/file_name` :param sample_path: the path to the file :param sample_name: the name of the sample @@ -56,9 +63,14 @@ def __init__( :param lane_no: the lane number the sequences in the file were derived from :param read_no: the read number :param is_index: if True, the sequence file contains index sequences + :param base_path: a path relative to which the file will be considered :param checksum: the MD5 checksum for this SampleFile """ - super(SampleFile, self).__init__(sample_path, file_checksum=checksum) + super(SampleFile, self).__init__( + sample_path, + base_path=base_path, + file_checksum=checksum + ) self.sample_name = sample_name self.sample_index = sample_index self.lane_no = lane_no @@ -77,4 +89,5 @@ def __hash__(self): self.lane_no, self.read_no, self.is_index, - self.checksum)) + self.checksum + )) diff --git a/delivery/repositories/project_repository.py b/delivery/repositories/project_repository.py index 1ac7af9..1ede6d3 100644 --- a/delivery/repositories/project_repository.py +++ b/delivery/repositories/project_repository.py @@ -59,22 +59,33 @@ def get_project(self, project_name): class UnorganisedRunfolderProjectRepository(object): """ - Repository for a unorganised project in a runfolder. For this purpose a project is represented by a directory under - the runfolder's PROJECTS_DIR directory, having at least one fastq file beneath it. + Repository for a unorganised project in a runfolder. For this purpose a project is represented + by a directory under the runfolder's PROJECTS_DIR directory, having at least one fastq file + beneath it. """ PROJECTS_DIR = "Unaligned" - def __init__(self, sample_repository, filesystem_service=FileSystemService(), metadata_service=MetadataService()): + def __init__( + self, + sample_repository, + readme_directory, + filesystem_service=FileSystemService(), + metadata_service=MetadataService() + ): """ Instantiate a new UnorganisedRunfolderProjectRepository object :param sample_repository: a RunfolderProjectBasedSampleRepository instance - :param filesystem_service: a FileSystemService instance for accessing the file system + :param readme_directory: the path to the directory containing README files to include when + organising the project + :param filesystem_service: a FileSystemService instance for accessing the file system + :param metadata_service: a MetadataService for reading and writing metadata files """ self.filesystem_service = filesystem_service self.sample_repository = sample_repository self.metadata_service = metadata_service + self.readme_directory = readme_directory def dump_checksums(self, project): """ @@ -89,7 +100,7 @@ def _sample_file_checksum(sample_file): sample_file.checksum, self.filesystem_service.relpath( sample_file.file_path, - project.path)] if sample_file.checksum else None + project.path)] if sample_file.checksum else [None, None] def _sample_checksums(sample): for sample_file in sample.sample_files: @@ -110,11 +121,13 @@ def _sample_checksums(sample): def get_projects(self, runfolder): """ - Returns a list of RunfolderProject instances, representing all projects found in this runfolder. + Returns a list of RunfolderProject instances, representing all projects found in this + runfolder, or None if no project can be found. :param runfolder: a Runfolder instance :return: a list of RunfolderProject instances or None if no projects were found - :raises: ProjectsDirNotfoundException if the Unaligned directory could not be found in the runfolder + :raises: ProjectsDirNotfoundException if the Unaligned directory could not be found in the + runfolder """ def dir_contains_fastq_files(d): return any( @@ -123,116 +136,184 @@ def dir_contains_fastq_files(d): self.filesystem_service.list_files_recursively(d))) def project_from_dir(d): - project = RunfolderProject( - name=os.path.basename(d), + project_path = os.path.join(projects_base_dir, d) + project_name = os.path.basename(d) + project_files = [] + + try: + project_files.extend( + self.get_report_files( + project_path, + project_name, + runfolder, + checksums=runfolder.checksums + ) + ) + except ProjectReportNotFoundException as ex: + log.warning(ex) + + try: + project_files.extend( + self.get_project_readme( + project_name=project_name, + runfolder=runfolder, + with_undetermined=False + ) + ) + except ProjectReportNotFoundException as ex: + log.warning(ex) + + samples = self.sample_repository.get_samples( + project_path, + project_name, + runfolder + ) + + return RunfolderProject( + name=project_name, path=os.path.join(projects_base_dir, d), runfolder_path=runfolder.path, - runfolder_name=runfolder.name + runfolder_name=runfolder.name, + project_files=project_files, + samples=samples ) - try: - project.project_files = self.get_report_files(project, checksums=runfolder.checksums) - except ProjectReportNotFoundException as e: - log.warning(e) - - project.samples = self.sample_repository.get_samples(project, runfolder) - return project try: projects_base_dir = os.path.join(runfolder.path, self.PROJECTS_DIR) # only include directories that have fastq.gz files beneath them + dirs = self.filesystem_service.find_project_directories(projects_base_dir) project_directories = filter( dir_contains_fastq_files, - self.filesystem_service.find_project_directories(projects_base_dir) + [dirs] if type(dirs) is str else dirs ) - return list(map(project_from_dir, project_directories)) or None + return [ + project_from_dir(project_directory) + for project_directory in project_directories + ] or None except FileNotFoundError: - raise ProjectsDirNotfoundException("Did not find Unaligned folder for: {}".format(runfolder.name)) + raise ProjectsDirNotfoundException( + f"Did not find {self.PROJECTS_DIR} folder for: {runfolder.name}" + ) - def get_report_files(self, project, checksums=None): + def get_report_files(self, project_path, project_name, runfolder, checksums=None): """ - Gets the paths to files associated with the supplied project's report. This can be either a MultiQC report or, - if no such report was found, a Sisyphus report. If a pre-calculated checksum cannot be found for a file, it will - be calculated on-the-fly. - - :param project: a RunfolderProject instance - :param checksums: a dict with pre-calculated checksums for files. paths are keys and the corresponding - checksum is the value - :return: a list of RunfolderFile objects - :raises ProjectReportNotFoundException: if no MultiQC or Sisyphus report was found for the project + Gets the paths to files associated with the supplied project's MultiQC report. This report + is fetched from seqreports unless there is a MultiQC report directly under the project's + path. If a pre-calculated checksum cannot be found for a file, it will be calculated + on-the-fly. + + :param project_path: the path to the project folder + :param project_name: the name of the project + :param runfolder: a Runfolder instance representing the runfolder containing the project + :param checksums: a dict with pre-calculated checksums for files. paths are keys and the + corresponding checksum is the value + :return: a list of RunfolderFile objects representing project report files + :raises ProjectReportNotFoundException: if no MultiQC report was found for the project """ - def _file_object_from_path(file_path): - relative_file_path = self.filesystem_service.relpath( - file_path, - self.filesystem_service.dirname(project.runfolder_path)) - checksum = checksums[relative_file_path] \ - if relative_file_path in checksums else self.metadata_service.hash_file(file_path) - return RunfolderFile(file_path, file_checksum=checksum) - - checksums = checksums or {} - if self.filesystem_service.exists(self.multiqc_report_path(project)): - log.info("MultiQC reports found in Unaligned/{}, overriding organisation of seqreports".format(project.name)) - return list(map(_file_object_from_path, self.multiqc_report_files(project))) - for sisyphus_report_path in self.sisyphus_report_path(project): - if self.filesystem_service.exists(sisyphus_report_path): - log.info("Organising sisyphus reports for {}".format(project.name)) - return list(map( - _file_object_from_path, - self.sisyphus_report_files( - self.filesystem_service.dirname(sisyphus_report_path)))) - if self.filesystem_service.exists(self.seqreports_path(project)): - log.info("Organising seqreports for {}".format(project.name)) - return list(map(_file_object_from_path, self.seqreports_files(project))) - raise ProjectReportNotFoundException("No project report found for {}".format(project.name)) - @staticmethod - def sisyphus_report_path(project): - return os.path.join( - project.runfolder_path, "Summary", project.name, "report.html"), \ - os.path.join( - project.path, "report.html") - - def sisyphus_report_files(self, report_dir): - report_files = [ - os.path.join(report_dir, "report.html"), - os.path.join(report_dir, "report.xml"), - os.path.join(report_dir, "report.xsl") + # look for the MultiQC report in these locations (in order of precedence) + report_paths = [ + project_path, + os.path.join( + runfolder.path, + "seqreports", + "projects", + project_name + ) ] - report_files.extend(list( - self.filesystem_service.list_files_recursively( - os.path.join( - report_dir, - "Plots")))) - return report_files - - @staticmethod - def multiqc_report_path(project): - return os.path.join( - project.path, - "{}_multiqc_report.html".format(project.name)) - - def multiqc_report_files(self, project): - report_files = [self.multiqc_report_path(project)] - report_dir = self.filesystem_service.dirname(report_files[0]) - report_files.append( - os.path.join(report_dir, "{}_multiqc_report_data.zip".format(project.name))) - return report_files + # depending on the location of the MultiQC report, use these different prefixes + report_prefixes = [ + project_name, + f"{runfolder.name}_{project_name}" + ] + report_names = [ + "multiqc_report.html", + "multiqc_report_data.zip" + ] + for report_path, report_prefix in zip(report_paths, report_prefixes): + # creating the RunfolderFile for a non-existing file should raise an exception + try: + report_files = [] + for report_name in report_names: + report_file = os.path.join( + report_path, + f"{report_prefix}_{report_name}" + ) + report_files.append( + RunfolderFile.create_object_from_path( + file_path=report_file, + runfolder_path=runfolder.path, + filesystem_service=self.filesystem_service, + metadata_service=self.metadata_service, + base_path=report_path, + checksums=checksums + ) + ) + except FileNotFoundError: + report_files = [] + # if no exception was raised, return the report files and log a message depending on + # where they were found + else: + if report_path == report_paths[0]: + log.info( + f"MultiQC reports found in Unaligned/{project_name}, overriding " + f"organisation of seqreports" + ) + else: + log.info(f"Organising seqreports for {project_name}") + + return report_files + + # raise an exception if no report files at all were found + raise ProjectReportNotFoundException( + f"No project report found for {project_name}" + ) + + def get_project_readme( + self, + project_name, + runfolder, + checksums=None, + with_undetermined=False + ): + """ + Get the README to be included with the project data set. + + :param project_name: the name of the project + :param runfolder: a Runfolder instance representing the runfolder containing the project + :param checksums: a dict with pre-calculated checksums for files. paths are keys and the + corresponding checksum is the value + :param with_undetermined: if True, the README should refer to data that includes + undetermined reads + :return: the path to the README file wrapped in a list + :raises ProjectReportNotFoundException: if the README was not found + """ + log.info(f"Organising README for {project_name}") + readme_file = os.path.join( + self.readme_directory, + "undetermined" if with_undetermined else "", + "README.md" + ) - @staticmethod - def seqreports_path(project): - return os.path.join( - project.runfolder_path, "seqreports", "projects", project.name, - "{}_{}_multiqc_report.html".format(project.runfolder_name, project.name)) - - def seqreports_files(self, project): - report_files = [self.seqreports_path(project)] - report_dir = self.filesystem_service.dirname(report_files[0]) - report_files.append( - os.path.join(report_dir, - "{}_{}_multiqc_report_data.zip".format(project.runfolder_name, project.name))) - return report_files + try: + return [ + RunfolderFile.create_object_from_path( + file_path=readme_file, + runfolder_path=runfolder.path, + filesystem_service=self.filesystem_service, + metadata_service=self.metadata_service, + base_path=self.filesystem_service.dirname(readme_file), + checksums=checksums + ) + ] + except FileNotFoundError: + raise ProjectReportNotFoundException( + f"{os.path.basename(readme_file)} not found at {os.path.dirname(readme_file)} for " + f"{project_name}" + ) def is_sample_in_project(self, project, sample_project, sample_id, sample_lane): """ @@ -249,7 +330,8 @@ def is_sample_in_project(self, project, sample_project, sample_id, sample_lane): return all([ sample_project == project.name, sample_obj, - sample_lane in sample_lanes]) + sample_lane in sample_lanes + ]) @staticmethod def get_sample(project, sample_id): diff --git a/delivery/repositories/runfolder_repository.py b/delivery/repositories/runfolder_repository.py index 54a157b..98d30f2 100644 --- a/delivery/repositories/runfolder_repository.py +++ b/delivery/repositories/runfolder_repository.py @@ -21,11 +21,17 @@ class FileSystemBasedRunfolderRepository(object): CHECKSUM_FILE_PATH = os.path.join("MD5", "checksums.md5") SAMPLESHEET_PATH = "SampleSheet.csv" - def __init__(self, base_path, file_system_service=FileSystemService(), metadata_service=MetadataService()): + def __init__( + self, + base_path, + file_system_service=FileSystemService(), + metadata_service=MetadataService() + ): """ Instantiate a new FileSystemBasedRunfolderRepository :param base_path: the directory where runfolders are stored :param file_system_service: a service which can access the file system. + :param metadata_service: a MetadataService for reading and writing metadata files """ self._base_path = base_path self.file_system_service = file_system_service @@ -156,6 +162,7 @@ def __init__( :param base_path: the directory where runfolders are stored :param project_repository: an instance of UnorganisedRunfolderProjectRepository :param file_system_service: a service which can access the file system + :param metadata_service: a MetadataService for reading and writing metadata files """ super(FileSystemBasedUnorganisedRunfolderRepository, self).__init__( base_path, @@ -220,12 +227,17 @@ def _mask_samplesheet_entry(e): return masked_entry samplesheet_data = self.get_samplesheet(runfolder) - # mask all entries not belonging to the project and write the resulting data to the project-specific location + # mask all entries not belonging to the project and write the resulting data to the + # project-specific location project_samplesheet_data = list(map(_mask_samplesheet_entry, samplesheet_data)) project_samplesheet_file = os.path.join(project.path, runfolder.name, self.SAMPLESHEET_PATH) - self.metadata_service.write_samplesheet_file(project_samplesheet_file, project_samplesheet_data) + self.metadata_service.write_samplesheet_file( + project_samplesheet_file, + project_samplesheet_data + ) return RunfolderFile( project_samplesheet_file, + base_path=self.file_system_service.dirname(project_samplesheet_file), file_checksum=self.metadata_service.hash_file( project_samplesheet_file)) diff --git a/delivery/repositories/sample_repository.py b/delivery/repositories/sample_repository.py index 6616de8..5e0ad71 100644 --- a/delivery/repositories/sample_repository.py +++ b/delivery/repositories/sample_repository.py @@ -23,7 +23,7 @@ class RunfolderProjectBasedSampleRepository(object): def __init__(self, file_system_service=FileSystemService()): self.file_system_service = file_system_service - def get_samples(self, project, runfolder): + def get_samples(self, project_path, project_name, runfolder): """ Parse the supplied project directory and create Sample instances representing the samples in the project. @@ -31,26 +31,26 @@ def get_samples(self, project, runfolder): :param runfolder: a Runfolder instance :return: a list of Sample instances """ - return self._get_samples(project, runfolder) + return self._get_samples(project_path, project_name, runfolder) - def _get_samples(self, project, runfolder): + def _get_samples(self, project_path, project_name, runfolder): def _is_fastq_file(f): return re.match(self.filename_regexp, f) is not None def _name_from_sample_file(s): - subdir = self.file_system_service.relpath(os.path.dirname(s.file_path), project.path) + subdir = self.file_system_service.relpath(os.path.dirname(s.file_path), project_path) return s.sample_name, subdir if subdir != "." else None def _sample_from_name(name_id, sample_files=None): - return Sample(name_id[0], project.name, sample_id=name_id[1], sample_files=sample_files) + return Sample(name_id[0], project_name, sample_id=name_id[1], sample_files=sample_files) def _sample_file_from_path(p): return self.sample_file_from_sample_path(p, runfolder) project_fastq_files = filter( _is_fastq_file, - self.file_system_service.list_files_recursively(project.path)) + self.file_system_service.list_files_recursively(project_path)) # create SampleFile objects from the paths project_sample_files = list(map( @@ -111,13 +111,15 @@ def sample_file_from_sample_path(self, sample_path, runfolder): checksum = None return SampleFile( - sample_path, + sample_path=sample_path, sample_name=sample_name, sample_index=sample_index, lane_no=lane_no, read_no=read_no, is_index=is_index, - checksum=checksum) + base_path=self.file_system_service.dirname(sample_path), + checksum=checksum + ) @staticmethod def sample_lanes(sample): diff --git a/delivery/services/file_system_service.py b/delivery/services/file_system_service.py index 86bb4a6..de2470c 100644 --- a/delivery/services/file_system_service.py +++ b/delivery/services/file_system_service.py @@ -1,6 +1,7 @@ import os import logging +import shutil log = logging.getLogger(__name__) @@ -91,6 +92,19 @@ def symlink(self, source, link_name): self.makedirs(self.dirname(link_name), exist_ok=True) return os.symlink(source, link_name) + @staticmethod + def copy(source, dest): + """ + Shadows shutil.copyfile + :param source: + :param dest: + :return: None + """ + try: + return shutil.copyfile(source, dest) + except IsADirectoryError: + return shutil.copytree(source, dest, symlinks=True) + @staticmethod def mkdir(path): """ diff --git a/delivery/services/organise_service.py b/delivery/services/organise_service.py index 6f874b7..5e128c0 100644 --- a/delivery/services/organise_service.py +++ b/delivery/services/organise_service.py @@ -81,15 +81,14 @@ def check_previously_organised_project(self, project, organised_projects_path, f def organise_project(self, runfolder, project, organised_projects_path, lanes): """ - Organise a project on a runfolder into its own directory and into a standard structure. If the project has - already been organised, a ProjectAlreadyOrganisedException will be raised, unless force is True. If force is - True, the existing project path will be renamed with a unique suffix. + Organise a project on a runfolder into its own directory and into a standard structure. If + the project has already been organised, a ProjectAlreadyOrganisedException will be raised. - :param runfolder: a Runfolder instance representing the runfolder on which the project belongs + :param runfolder: a Runfolder instance representing the runfolder on which the project + belongs :param project: a Project instance representing the project to be organised - :param lanes: if not None, only samples on any of the specified lanes will be organised - :param force: if True, a previously organised project will be renamed with a unique suffix - :raises ProjectAlreadyOrganisedException: if project has already been organised and force is False + :param lanes: if not None, only samples on any of the specified lanes will be organised fix + :raises ProjectAlreadyOrganisedException: if project has already been organised :return: a Project instance representing the project after organisation """ # symlink the samples @@ -105,13 +104,11 @@ def organise_project(self, runfolder, project, organised_projects_path, lanes): # symlink the project files organised_project_files = [] if project.project_files: - project_file_base = self.file_system_service.dirname(project.project_files[0].file_path) for project_file in project.project_files: organised_project_files.append( self.organise_project_file( project_file, - organised_project_runfolder_path, - project_file_base=project_file_base)) + organised_project_runfolder_path)) organised_project = RunfolderProject( project.name, organised_project_path, @@ -128,27 +125,36 @@ def organise_project(self, runfolder, project, organised_projects_path, lanes): return organised_project - def organise_project_file(self, project_file, organised_project_path, project_file_base=None): + def organise_project_file(self, project_file, organised_project_path): """ - Find and symlink the project report to the organised project directory. + Find and symlink or copy the project-associated files to the organised project directory. - :param project: a Project instance representing the project before organisation - :param organised_project: a Project instance representing the project after organisation + :param project_file: a RunfolderFile instance representing the project-associated file + before organisation + :param organised_project_path: path where the project will be organised """ - project_file_base = project_file_base or self.file_system_service.dirname(project_file.file_path) - # the full path to the symlink - link_name = os.path.join( - organised_project_path, - self.file_system_service.relpath( - project_file.file_path, - project_file_base)) - # the relative path from the symlink to the original file - link_path = self.file_system_service.relpath( + # the relative path from the project file base to the project file (e.g. plots/filename.png) + relpath = self.file_system_service.relpath( project_file.file_path, - self.file_system_service.dirname(link_name)) - self.file_system_service.symlink(link_path, link_name) - return RunfolderFile(link_name, file_checksum=project_file.checksum) + project_file.base_path + ) + + # the full path to the organised file + destination = os.path.join( + organised_project_path, + relpath + ) + + # copy the source file to the organised file destination + self.file_system_service.copy(project_file.file_path, destination) + + # return a new RunFolder file object representing the organised file at its new location + return RunfolderFile( + destination, + base_path=organised_project_path, + file_checksum=project_file.checksum + ) def organise_sample(self, sample, organised_project_path, lanes): """ @@ -169,7 +175,13 @@ def organise_sample(self, sample, organised_project_path, lanes): # symlink the sample files using relative paths organised_sample_files = [] for sample_file in sample.sample_files: - organised_sample_files.append(self.organise_sample_file(sample_file, organised_sample_path, lanes)) + organised_sample_files.append( + self.organise_sample_file( + sample_file, + organised_sample_path, + lanes + ) + ) # clean up the list of sample files by removing None elements organised_sample_files = list(filter(None, organised_sample_files)) @@ -182,13 +194,16 @@ def organise_sample(self, sample, organised_project_path, lanes): def organise_sample_file(self, sample_file, organised_sample_path, lanes): """ - Organise a sample file by creating a relative symlink in the supplied directory, pointing back to the supplied - SampleFile's original file path. The Sample file can be excluded from organisation based on the lane it was - derived from. The supplied directory will be created if it doesn't exist. + Organise a sample file by creating a relative symlink in the supplied directory, + pointing back to the supplied SampleFile's original file path. The Sample file can be + excluded from organisation based on the lane it was derived from. The supplied directory + will be created if it doesn't exist. :param sample_file: a SampleFile instance representing the sample file to be organised - :param organised_sample_path: the path to the organised sample directory under which to place the symlink - :param lanes: if not None, only sample files derived from any of the specified lanes will be organised + :param organised_sample_path: the path to the organised sample directory under which to + place the symlink + :param lanes: if not None, only sample files derived from any of the specified lanes will + be organised :return: a new SampleFile instance representing the sample file after organisation """ # skip if the sample file data is derived from a lane that shouldn't be included @@ -196,14 +211,28 @@ def organise_sample_file(self, sample_file, organised_sample_path, lanes): return None # create the symlink in the supplied directory and relative to the file's original location - link_name = os.path.join(organised_sample_path, sample_file.file_name) - relative_path = self.file_system_service.relpath(sample_file.file_path, organised_sample_path) - self.file_system_service.symlink(relative_path, link_name) + destination = os.path.join( + organised_sample_path, + sample_file.file_name + ) + + # get the relative path between the source file and the organised directory + relative_path = self.file_system_service.relpath( + sample_file.file_path, + organised_sample_path + ) + + # create the symlink using the relative path + self.file_system_service.symlink(relative_path, destination) + + # return a new SampleFile instance representing the sample file after organisation return SampleFile( - link_name, + sample_path=destination, sample_name=sample_file.sample_name, sample_index=sample_file.sample_index, lane_no=sample_file.lane_no, read_no=sample_file.read_no, is_index=sample_file.is_index, - checksum=sample_file.checksum) + base_path=organised_sample_path, + checksum=sample_file.checksum + ) diff --git a/tests/integration_tests/__init__.py b/tests/integration_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration_tests/test_integration.py b/tests/integration_tests/test_integration.py index 71d54da..cbf68b4 100644 --- a/tests/integration_tests/test_integration.py +++ b/tests/integration_tests/test_integration.py @@ -93,10 +93,9 @@ def _verify_checksum(file_path, expected_checksum): project_file_base = os.path.dirname(project.project_files[0].file_path) relative_path = os.path.relpath(project_file.file_path, project_file_base) organised_project_file_path = os.path.join(organised_path, relative_path) - self.assertTrue( - os.path.samefile( - organised_project_file_path, - project_file.file_path)) + self.assertEqual( + os.path.basename(organised_project_file_path), + project_file.file_name) _verify_checksum(os.path.join(runfolder.name, relative_path), project_file.checksum) for sample in project.samples: sample_path = os.path.join(organised_path, sample.sample_id) diff --git a/tests/integration_tests/test_integration_dds.py b/tests/integration_tests/test_integration_dds.py index 7527642..2473524 100644 --- a/tests/integration_tests/test_integration_dds.py +++ b/tests/integration_tests/test_integration_dds.py @@ -31,6 +31,9 @@ def test_can_stage_and_delivery_runfolder(self): staging_status_links = response_json.get("staging_order_links") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, link in staging_status_links.items(): self.assertEqual(project, "ABC_123") @@ -79,6 +82,9 @@ def test_can_stage_and_delivery_project_dir(self): staging_status_links = response_json.get("staging_order_links") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, link in staging_status_links.items(): self.assertEqual(project, dir_name) @@ -153,6 +159,7 @@ def test_can_stage_and_deliver_batch_flowcells(self): staging_status_links = response_json.get("staging_order_links") + # Insert a pause to allow staging to complete time.sleep(1) for project, link in staging_status_links.items(): @@ -190,6 +197,9 @@ def test_can_stage_and_deliver_force_flowcells(self): staging_status_links = response_json.get("staging_order_links") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, link in staging_status_links.items(): self.assertEqual(project, 'XYZ_123') @@ -248,7 +258,7 @@ def __init__(self, *args): self.mock_duration = 2 - @gen_test(timeout=5) + @gen_test(timeout=10) def test_mock_duration_is_2(self): with tempfile.TemporaryDirectory( dir='./tests/resources/runfolders/', @@ -268,6 +278,9 @@ def test_mock_duration_is_2(self): staging_order_project_and_id = response_json.get("staging_order_ids") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, staging_id in staging_order_project_and_id.items(): delivery_url = '/'.join([ self.API_BASE, 'deliver', 'stage_id', str(staging_id)]) @@ -290,6 +303,9 @@ def test_mock_duration_is_2(self): delivery_link = delivery_resp_as_json['delivery_order_link'] while True: + # Insert a pause to allow delivery to complete + time.sleep(1) + status_response = yield self.http_client.fetch( delivery_link) if (json.loads(status_response.body)["status"] @@ -299,7 +315,7 @@ def test_mock_duration_is_2(self): stop = time.time() self.assertTrue(stop - start >= self.mock_duration) - @gen_test(timeout=5) + @gen_test(timeout=10) def test_can_delivery_data_asynchronously(self): with tempfile.TemporaryDirectory( dir='./tests/resources/runfolders/', @@ -318,8 +334,18 @@ def test_can_delivery_data_asynchronously(self): response_json = json.loads(response.body) staging_order_project_and_id = response_json.get("staging_order_ids") + staging_status_links = response_json.get("staging_order_links") + + for project, link in staging_status_links.items(): + staging_id = staging_order_project_and_id[project] + while True: + # Insert a pause to allow staging to complete + time.sleep(1) + status_response = yield self.http_client.fetch(link) + if json.loads(status_response.body)["status"] == \ + StagingStatus.staging_successful.name: + break - for project, staging_id in staging_order_project_and_id.items(): delivery_url = '/'.join([ self.API_BASE, 'deliver', 'stage_id', str(staging_id)]) delivery_body = { @@ -339,6 +365,8 @@ def test_can_delivery_data_asynchronously(self): delivery_link = delivery_resp_as_json['delivery_order_link'] while True: + # Insert a pause to allow delivery to complete + time.sleep(1) status_response = yield self.http_client.fetch( delivery_link) if (json.loads(status_response.body)["status"] @@ -379,6 +407,9 @@ def test_can_deliver_and_not_timeout(self): staging_order_project_and_id = response_json.get( "staging_order_ids") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, staging_id in staging_order_project_and_id.items(): delivery_url = '/'.join([ self.API_BASE, 'deliver', 'stage_id', str(staging_id)]) diff --git a/tests/resources/readme/README.md b/tests/resources/readme/README.md new file mode 100644 index 0000000..a31412e --- /dev/null +++ b/tests/resources/readme/README.md @@ -0,0 +1,25 @@ +# DELIVERY OF FASTQ FILES FROM NGI-UPPSALA,THE SNP&SEQ TECHNOLOGY PLATFORM + +This delivery includes sequencing data in FASTQ format, a summary report created with MultiQC, checksums and a copy of the sample sheet used for demultiplexing. + +## Delivery structure +``` + +└── + ├── README.md + ├── checksums.md5 + ├── SampleSheet.csv + ├── __multiqc_report_data.zip + ├── __multiqc_report.html + ├── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + ├── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + : + : + └── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz +``` diff --git a/tests/resources/readme/undetermined/README.md b/tests/resources/readme/undetermined/README.md new file mode 100644 index 0000000..059951a --- /dev/null +++ b/tests/resources/readme/undetermined/README.md @@ -0,0 +1,30 @@ +# DELIVERY OF FASTQ FILES FROM NGI-UPPSALA,THE SNP&SEQ TECHNOLOGY PLATFORM + +This delivery includes sequencing data in FASTQ format, a summary report created with MultiQC, checksums and a copy of the sample sheet used for demultiplexing. + +## Delivery structure +``` + +└── + ├── README.md + ├── checksums.md5 + ├── SampleSheet.csv + ├── __multiqc_report_data.zip + ├── __multiqc_report.html + ├── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + ├── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + : + : + └── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + └── Undetermined + ├── Undetermined___R1.001.fastq.gz + ├── Undetermined___R2.001.fastq.gz + ├── ... + └── checksums.md5 +``` diff --git a/tests/test_utils.py b/tests/test_utils.py index 89f8b88..e508516 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -68,7 +68,7 @@ def lane_generator(): def report_type_generator(): - report_types = ["multiqc", "seqreports", "sisyphus"] + report_types = ["multiqc", "seqreports"] while True: for rt in report_types: yield rt @@ -306,6 +306,7 @@ def project_report_files(project, report_type): FAKE_RUNFOLDERS = [_runfolder1, _runfolder2] UNORGANISED_RUNFOLDER = unorganised_runfolder() +README_DIRECTORY = "/bar" def assert_eventually_equals(self, timeout, f, expected, delay=0.1): diff --git a/tests/unit_tests/repositories/test_project_repository.py b/tests/unit_tests/repositories/test_project_repository.py index 2eb262f..8539879 100644 --- a/tests/unit_tests/repositories/test_project_repository.py +++ b/tests/unit_tests/repositories/test_project_repository.py @@ -13,7 +13,7 @@ from delivery.services.file_system_service import FileSystemService from delivery.services.metadata_service import MetadataService -from tests.test_utils import UNORGANISED_RUNFOLDER +from tests.test_utils import README_DIRECTORY, UNORGANISED_RUNFOLDER class TestGeneralProjectRepository(unittest.TestCase): @@ -43,6 +43,7 @@ def setUp(self) -> None: self.metadata_service = mock.create_autospec(MetadataService) self.project_repository = UnorganisedRunfolderProjectRepository( sample_repository=self.sample_repository, + readme_directory=README_DIRECTORY, filesystem_service=self.filesystem_service, metadata_service=self.metadata_service) self.runfolder = UNORGANISED_RUNFOLDER @@ -51,10 +52,14 @@ def test_get_report_files(self): # missing report files will raise an exception at this point self.filesystem_service.exists.return_value = False + self.metadata_service.hash_file.side_effect = FileNotFoundError self.assertRaises( ProjectReportNotFoundException, self.project_repository.get_report_files, - self.runfolder.projects[0]) + self.runfolder.projects[0].path, + self.runfolder.projects[0].name, + self.runfolder + ) def test_get_projects(self): @@ -78,5 +83,9 @@ def test_override_log_message(self): self.filesystem_service.dirname.return_value = "foo/bar" with self.assertLogs(level='INFO') as log: - self.project_repository.get_report_files(self.runfolder.projects[0]) + self.project_repository.get_report_files( + self.runfolder.projects[0].path, + self.runfolder.projects[0].name, + self.runfolder + ) self.assertIn('overriding organisation of seqreports', log.output[0]) diff --git a/tests/unit_tests/repositories/test_sample_repository.py b/tests/unit_tests/repositories/test_sample_repository.py index bbb9465..9194ae9 100644 --- a/tests/unit_tests/repositories/test_sample_repository.py +++ b/tests/unit_tests/repositories/test_sample_repository.py @@ -27,7 +27,11 @@ def setUp(self): def test_get_samples(self): self.file_system_service.relpath.side_effect = os.path.relpath self.file_system_service.dirname = os.path.dirname - for sample in self.sample_repo.get_samples(self.project, self.runfolder): + for sample in self.sample_repo.get_samples( + self.project.path, + self.project.name, + self.runfolder + ): self.assertIn(sample, self.project.samples) for sample_file in sample.sample_files: sample_file_subdir = os.path.dirname( diff --git a/tests/unit_tests/services/test_organise_service.py b/tests/unit_tests/services/test_organise_service.py index 3be72a0..8f0c8c1 100644 --- a/tests/unit_tests/services/test_organise_service.py +++ b/tests/unit_tests/services/test_organise_service.py @@ -116,8 +116,11 @@ def test_organise_project(self): organise_project_file_mock.assert_has_calls([ mock.call( project_file, - os.path.join(organised_projects_path, self.project.name, self.project.runfolder_name), - project_file_base=os.path.dirname(self.project.project_files[0].file_path) + os.path.join( + organised_projects_path, + self.project.name, + self.project.runfolder_name + ) ) for project_file in self.project.project_files]) @@ -197,23 +200,24 @@ def test_organise_project_file(self): os.path.join( project_file_base, project_file), + base_path=project_file_base, file_checksum="checksum-for-{}".format(project_file)) for project_file in ("a-report-file", os.path.join("report-dir", "another-report-file"))] self.file_system_service.relpath.side_effect = os.path.relpath self.file_system_service.dirname.side_effect = os.path.dirname for project_file in project_files: organised_project_file = self.organise_service.organise_project_file( - project_file, organised_project_path, project_file_base) + project_file, organised_project_path) self.assertEqual( os.path.join( organised_project_path, os.path.relpath(project_file.file_path, project_file_base)), organised_project_file.file_path) self.assertEqual(project_file.checksum, organised_project_file.checksum) - self.file_system_service.symlink.assert_has_calls([ + self.file_system_service.copy.assert_has_calls([ mock.call( - os.path.join("..", "..", "foo", "a-report-file"), + os.path.join(project_files[0].file_path), os.path.join(organised_project_path, "a-report-file")), mock.call( - os.path.join("..", "..", "..", "foo", "report-dir", "another-report-file"), + os.path.join(project_files[1].file_path), os.path.join(organised_project_path, "report-dir", "another-report-file"))])