From 8246380ee3ec5260ad32b0affe13b938744bfc72 Mon Sep 17 00:00:00 2001 From: b97pla Date: Tue, 23 Jan 2024 14:31:29 +0100 Subject: [PATCH] simplify report fetching and file object --- delivery/models/project.py | 17 ++- delivery/models/runfolder.py | 25 ++-- delivery/models/sample.py | 10 +- delivery/repositories/project_repository.py | 123 ++++++++---------- delivery/services/organise_service.py | 78 ++++++----- tests/integration_tests/test_integration.py | 7 +- .../services/test_organise_service.py | 6 +- 7 files changed, 133 insertions(+), 133 deletions(-) diff --git a/delivery/models/project.py b/delivery/models/project.py index 47a69c0..cb2ae09 100644 --- a/delivery/models/project.py +++ b/delivery/models/project.py @@ -35,18 +35,29 @@ def __hash__(self): class RunfolderProject(BaseProject): """ - Model a project directory in a runfolder on disk. Note that this means that this project model only extends - to the idea of projects as subdirectories in a demultiplexed Illumina runfolder. + Model a project directory in a runfolder on disk. Note that this means that this project model + only extends to the idea of projects as subdirectories in a demultiplexed Illumina runfolder. """ - def __init__(self, name, path, runfolder_path, runfolder_name, samples=None, project_files=None): + def __init__( + self, + name, + path, + runfolder_path, + runfolder_name, + samples=None, + project_files=None + ): """ Instantiate a new `RunfolderProject` object + :param name: of the project :param path: path to the project :param runfolder_path: path the runfolder in which this project is stored. :param runfolder_name: name of the runfolder in which this project is stored :param samples: list of instances of Sample, representing samples in the project + :param project_files: list of instances of RunfolderFile, representing files belonging to + the project. These do not include files already accounted for in the `samples` list. """ self.name = name self.path = os.path.abspath(path) diff --git a/delivery/models/runfolder.py b/delivery/models/runfolder.py index 3a95458..0896433 100644 --- a/delivery/models/runfolder.py +++ b/delivery/models/runfolder.py @@ -41,8 +41,7 @@ def __init__( self, file_path, base_path=None, - file_checksum=None, - file_operation="symlink" + file_checksum=None ): """ A `RunfolderFile` object representing a file in the runfolder @@ -55,15 +54,11 @@ def __init__( :param file_path: the path to the file :param base_path: a path relative to which the file will be considered :param file_checksum: a computed checksum for the file - :param file_operation: the file operation to perform, symlink (default) or copy """ self.file_path = os.path.abspath(file_path) self.file_name = os.path.basename(file_path) self.base_path = base_path or os.path.dirname(self.file_path) self.checksum = file_checksum - self.file_operation = file_operation \ - if file_operation and file_operation in ["symlink", "copy"] \ - else "symlink" @classmethod def create_object_from_path( @@ -73,9 +68,20 @@ def create_object_from_path( filesystem_service, metadata_service, base_path=None, - checksums=None, - file_operation="symlink" + checksums=None ): + """ + Factory method that creates and returns a class instance. + + :param file_path: the path to the file + :param runfolder_path: the path to the runfolder containing the file + :param filesystem_service: a service which can access the file system + :param metadata_service: a MetadataService for reading and writing metadata files + :param base_path: a path relative to which the file will be considered + :param checksums: a list of pre-computed checksums that may or may not contain an entry for + the `file_path` relative to the `runfolder_path` + :return: a class object representing a file in the runfolder + """ checksums = checksums or {} relative_file_path = filesystem_service.relpath( file_path, @@ -89,6 +95,5 @@ def create_object_from_path( return cls( file_path, base_path=base_path, - file_checksum=checksum, - file_operation=file_operation + file_checksum=checksum ) diff --git a/delivery/models/sample.py b/delivery/models/sample.py index d823e5c..8187065 100644 --- a/delivery/models/sample.py +++ b/delivery/models/sample.py @@ -47,8 +47,7 @@ def __init__( read_no=None, is_index=None, base_path=None, - checksum=None, - file_operation="symlink" + checksum=None ): """ A `SampleFile` object @@ -66,13 +65,11 @@ def __init__( :param is_index: if True, the sequence file contains index sequences :param base_path: a path relative to which the file will be considered :param checksum: the MD5 checksum for this SampleFile - :param file_operation: the file operation to perform, symlink (default) or copy """ super(SampleFile, self).__init__( sample_path, base_path=base_path, - file_checksum=checksum, - file_operation=file_operation + file_checksum=checksum ) self.sample_name = sample_name self.sample_index = sample_index @@ -92,6 +89,5 @@ def __hash__(self): self.lane_no, self.read_no, self.is_index, - self.checksum, - self.file_operation + self.checksum )) diff --git a/delivery/repositories/project_repository.py b/delivery/repositories/project_repository.py index a89478f..1ede6d3 100644 --- a/delivery/repositories/project_repository.py +++ b/delivery/repositories/project_repository.py @@ -214,81 +214,63 @@ def get_report_files(self, project_path, project_name, runfolder, checksums=None :raises ProjectReportNotFoundException: if no MultiQC report was found for the project """ - report_files = self.project_multiqc_report_files(project_path, project_name) - if self.filesystem_service.exists(report_files[0]): - log.info( - f"MultiQC reports found in Unaligned/{project_name}, overriding organisation of " - f"seqreports" - ) - else: - log.info(f"Organising seqreports for {project_name}") - report_files = self.runfolder_project_multiqc_report_files( - project_name, - runfolder - ) - try: - report_path = self.filesystem_service.dirname(report_files[0]) - return [ - RunfolderFile.create_object_from_path( - file_path=report_file, - runfolder_path=runfolder.path, - filesystem_service=self.filesystem_service, - metadata_service=self.metadata_service, - base_path=report_path, - checksums=checksums - ) - for report_file in report_files - ] - except FileNotFoundError: - raise ProjectReportNotFoundException( - f"No project report found for {project_name}" - ) - - @staticmethod - def project_multiqc_report_files(project_path, project_name): - """ - Return a list of MultiQC report files found under the project directory. - - :param project_path: the path to the project folder - :param project_name: the name of the project - :return: a list of paths to MultiQC-report-related files - """ - return [ - os.path.join( - project_path, - f"{project_name}_multiqc_report.html" - ), + # look for the MultiQC report in these locations (in order of precedence) + report_paths = [ + project_path, os.path.join( - project_path, - f"{project_name}_multiqc_report_data.zip" + runfolder.path, + "seqreports", + "projects", + project_name ) ] + # depending on the location of the MultiQC report, use these different prefixes + report_prefixes = [ + project_name, + f"{runfolder.name}_{project_name}" + ] + report_names = [ + "multiqc_report.html", + "multiqc_report_data.zip" + ] + for report_path, report_prefix in zip(report_paths, report_prefixes): + # creating the RunfolderFile for a non-existing file should raise an exception + try: + report_files = [] + for report_name in report_names: + report_file = os.path.join( + report_path, + f"{report_prefix}_{report_name}" + ) + report_files.append( + RunfolderFile.create_object_from_path( + file_path=report_file, + runfolder_path=runfolder.path, + filesystem_service=self.filesystem_service, + metadata_service=self.metadata_service, + base_path=report_path, + checksums=checksums + ) + ) + except FileNotFoundError: + report_files = [] + # if no exception was raised, return the report files and log a message depending on + # where they were found + else: + if report_path == report_paths[0]: + log.info( + f"MultiQC reports found in Unaligned/{project_name}, overriding " + f"organisation of seqreports" + ) + else: + log.info(f"Organising seqreports for {project_name}") - @staticmethod - def runfolder_project_multiqc_report_files(project_name, runfolder): - """ - Return a list of MultiQC report files for a project found under the seqreports folder. + return report_files - :param project_name: the name of the project - :param runfolder: a Runfolder instance representing the runfolder containing the project - :return: a list of paths to MultiQC-report-related files - """ - report_dir = os.path.join( - runfolder.path, - "seqreports", - "projects", - project_name + # raise an exception if no report files at all were found + raise ProjectReportNotFoundException( + f"No project report found for {project_name}" ) - return [ - os.path.join( - report_dir, - f"{runfolder.name}_{project_name}_multiqc_report.html", - ), - os.path.join( - report_dir, - f"{runfolder.name}_{project_name}_multiqc_report_data.zip" - ) - ] def get_project_readme( self, @@ -324,8 +306,7 @@ def get_project_readme( filesystem_service=self.filesystem_service, metadata_service=self.metadata_service, base_path=self.filesystem_service.dirname(readme_file), - checksums=checksums, - file_operation="copy" + checksums=checksums ) ] except FileNotFoundError: diff --git a/delivery/services/organise_service.py b/delivery/services/organise_service.py index 7a2d169..5e128c0 100644 --- a/delivery/services/organise_service.py +++ b/delivery/services/organise_service.py @@ -81,15 +81,14 @@ def check_previously_organised_project(self, project, organised_projects_path, f def organise_project(self, runfolder, project, organised_projects_path, lanes): """ - Organise a project on a runfolder into its own directory and into a standard structure. If the project has - already been organised, a ProjectAlreadyOrganisedException will be raised, unless force is True. If force is - True, the existing project path will be renamed with a unique suffix. + Organise a project on a runfolder into its own directory and into a standard structure. If + the project has already been organised, a ProjectAlreadyOrganisedException will be raised. - :param runfolder: a Runfolder instance representing the runfolder on which the project belongs + :param runfolder: a Runfolder instance representing the runfolder on which the project + belongs :param project: a Project instance representing the project to be organised - :param lanes: if not None, only samples on any of the specified lanes will be organised - :param force: if True, a previously organised project will be renamed with a unique suffix - :raises ProjectAlreadyOrganisedException: if project has already been organised and force is False + :param lanes: if not None, only samples on any of the specified lanes will be organised fix + :raises ProjectAlreadyOrganisedException: if project has already been organised :return: a Project instance representing the project after organisation """ # symlink the samples @@ -141,23 +140,18 @@ def organise_project_file(self, project_file, organised_project_path): project_file.base_path ) - # the full path to the symlink - link_name = os.path.join( + # the full path to the organised file + destination = os.path.join( organised_project_path, relpath ) - if project_file.file_operation == "copy": - self.file_system_service.copy(project_file.file_path, link_name) - else: - # the relative path from the symlink to the original file - link_path = self.file_system_service.relpath( - project_file.file_path, - self.file_system_service.dirname(link_name) - ) - self.file_system_service.symlink(link_path, link_name) + # copy the source file to the organised file destination + self.file_system_service.copy(project_file.file_path, destination) + + # return a new RunFolder file object representing the organised file at its new location return RunfolderFile( - link_name, + destination, base_path=organised_project_path, file_checksum=project_file.checksum ) @@ -181,7 +175,13 @@ def organise_sample(self, sample, organised_project_path, lanes): # symlink the sample files using relative paths organised_sample_files = [] for sample_file in sample.sample_files: - organised_sample_files.append(self.organise_sample_file(sample_file, organised_sample_path, lanes)) + organised_sample_files.append( + self.organise_sample_file( + sample_file, + organised_sample_path, + lanes + ) + ) # clean up the list of sample files by removing None elements organised_sample_files = list(filter(None, organised_sample_files)) @@ -194,13 +194,16 @@ def organise_sample(self, sample, organised_project_path, lanes): def organise_sample_file(self, sample_file, organised_sample_path, lanes): """ - Organise a sample file by creating a relative symlink in the supplied directory, pointing back to the supplied - SampleFile's original file path. The Sample file can be excluded from organisation based on the lane it was - derived from. The supplied directory will be created if it doesn't exist. + Organise a sample file by creating a relative symlink in the supplied directory, + pointing back to the supplied SampleFile's original file path. The Sample file can be + excluded from organisation based on the lane it was derived from. The supplied directory + will be created if it doesn't exist. :param sample_file: a SampleFile instance representing the sample file to be organised - :param organised_sample_path: the path to the organised sample directory under which to place the symlink - :param lanes: if not None, only sample files derived from any of the specified lanes will be organised + :param organised_sample_path: the path to the organised sample directory under which to + place the symlink + :param lanes: if not None, only sample files derived from any of the specified lanes will + be organised :return: a new SampleFile instance representing the sample file after organisation """ # skip if the sample file data is derived from a lane that shouldn't be included @@ -208,18 +211,23 @@ def organise_sample_file(self, sample_file, organised_sample_path, lanes): return None # create the symlink in the supplied directory and relative to the file's original location - link_name = os.path.join(organised_sample_path, sample_file.file_name) - if sample_file.file_operation == "copy": - self.file_system_service.copy(sample_file.file_path, link_name) - else: - relative_path = self.file_system_service.relpath( - sample_file.file_path, - organised_sample_path - ) - self.file_system_service.symlink(relative_path, link_name) + destination = os.path.join( + organised_sample_path, + sample_file.file_name + ) + + # get the relative path between the source file and the organised directory + relative_path = self.file_system_service.relpath( + sample_file.file_path, + organised_sample_path + ) + + # create the symlink using the relative path + self.file_system_service.symlink(relative_path, destination) + # return a new SampleFile instance representing the sample file after organisation return SampleFile( - sample_path=link_name, + sample_path=destination, sample_name=sample_file.sample_name, sample_index=sample_file.sample_index, lane_no=sample_file.lane_no, diff --git a/tests/integration_tests/test_integration.py b/tests/integration_tests/test_integration.py index 71d54da..cbf68b4 100644 --- a/tests/integration_tests/test_integration.py +++ b/tests/integration_tests/test_integration.py @@ -93,10 +93,9 @@ def _verify_checksum(file_path, expected_checksum): project_file_base = os.path.dirname(project.project_files[0].file_path) relative_path = os.path.relpath(project_file.file_path, project_file_base) organised_project_file_path = os.path.join(organised_path, relative_path) - self.assertTrue( - os.path.samefile( - organised_project_file_path, - project_file.file_path)) + self.assertEqual( + os.path.basename(organised_project_file_path), + project_file.file_name) _verify_checksum(os.path.join(runfolder.name, relative_path), project_file.checksum) for sample in project.samples: sample_path = os.path.join(organised_path, sample.sample_id) diff --git a/tests/unit_tests/services/test_organise_service.py b/tests/unit_tests/services/test_organise_service.py index 5bfe57a..8f0c8c1 100644 --- a/tests/unit_tests/services/test_organise_service.py +++ b/tests/unit_tests/services/test_organise_service.py @@ -214,10 +214,10 @@ def test_organise_project_file(self): os.path.relpath(project_file.file_path, project_file_base)), organised_project_file.file_path) self.assertEqual(project_file.checksum, organised_project_file.checksum) - self.file_system_service.symlink.assert_has_calls([ + self.file_system_service.copy.assert_has_calls([ mock.call( - os.path.join("..", "..", "foo", "a-report-file"), + os.path.join(project_files[0].file_path), os.path.join(organised_project_path, "a-report-file")), mock.call( - os.path.join("..", "..", "..", "foo", "report-dir", "another-report-file"), + os.path.join(project_files[1].file_path), os.path.join(organised_project_path, "report-dir", "another-report-file"))])