From e668dd7cdc99feb431f9c7f6901fc84d1897c372 Mon Sep 17 00:00:00 2001 From: b97pla Date: Thu, 11 Jan 2024 16:32:14 +0100 Subject: [PATCH 1/5] include readme when organising --- config/app.config | 1 + delivery/__init__.py | 2 +- delivery/app.py | 17 ++- delivery/models/runfolder.py | 8 +- delivery/models/sample.py | 8 +- delivery/repositories/project_repository.py | 144 ++++++++++++++---- delivery/repositories/runfolder_repository.py | 16 +- delivery/repositories/sample_repository.py | 1 + delivery/services/organise_service.py | 25 +-- tests/resources/readme/README.md | 25 +++ tests/resources/readme/undetermined/README.md | 30 ++++ tests/test_utils.py | 1 + .../repositories/test_project_repository.py | 3 +- .../services/test_organise_service.py | 10 +- 14 files changed, 238 insertions(+), 53 deletions(-) create mode 100644 tests/resources/readme/README.md create mode 100644 tests/resources/readme/undetermined/README.md diff --git a/config/app.config b/config/app.config index d11ff09..6bf17f1 100644 --- a/config/app.config +++ b/config/app.config @@ -8,6 +8,7 @@ runfolder_directory: tests/resources/runfolders general_project_directory: tests/resources/projects staging_directory: /tmp/ project_links_directory: /tmp/ +readme_directory: tests/resources/readme dds_conf: log_path: dds.log port: 9999 diff --git a/delivery/__init__.py b/delivery/__init__.py index 0552768..f5f41e5 100644 --- a/delivery/__init__.py +++ b/delivery/__init__.py @@ -1 +1 @@ -__version__ = "3.0.1" +__version__ = "3.1.0" diff --git a/delivery/app.py b/delivery/app.py index 6b4aa7a..c0349fc 100644 --- a/delivery/app.py +++ b/delivery/app.py @@ -25,7 +25,8 @@ FileSystemBasedUnorganisedRunfolderRepository from delivery.repositories.staging_repository import DatabaseBasedStagingRepository from delivery.repositories.deliveries_repository import DatabaseBasedDeliveriesRepository -from delivery.repositories.project_repository import GeneralProjectRepository, UnorganisedRunfolderProjectRepository +from delivery.repositories.project_repository import GeneralProjectRepository, \ + UnorganisedRunfolderProjectRepository from delivery.repositories.delivery_sources_repository import DatabaseBasedDeliverySourcesRepository from delivery.repositories.sample_repository import RunfolderProjectBasedSampleRepository @@ -120,13 +121,19 @@ def _assert_is_dir(directory): project_links_directory = config["project_links_directory"] _assert_is_dir(project_links_directory) - runfolder_repo = FileSystemBasedRunfolderRepository(runfolder_dir) - project_repository = UnorganisedRunfolderProjectRepository( - sample_repository=RunfolderProjectBasedSampleRepository() + readme_directory = config["readme_directory"] + _assert_is_dir(readme_directory) + + sample_repo = RunfolderProjectBasedSampleRepository() + runfolder_repo = FileSystemBasedRunfolderRepository( + runfolder_dir ) unorganised_runfolder_repo = FileSystemBasedUnorganisedRunfolderRepository( runfolder_dir, - project_repository=project_repository + project_repository=UnorganisedRunfolderProjectRepository( + sample_repository=sample_repo, + readme_directory=readme_directory + ) ) general_project_dir = config['general_project_directory'] diff --git a/delivery/models/runfolder.py b/delivery/models/runfolder.py index 3691132..065c57c 100644 --- a/delivery/models/runfolder.py +++ b/delivery/models/runfolder.py @@ -37,7 +37,13 @@ def __hash__(self): class RunfolderFile(object): - def __init__(self, file_path, file_checksum=None): + def __init__( + self, + file_path, + base_path=None, + file_checksum=None + ): self.file_path = os.path.abspath(file_path) self.file_name = os.path.basename(file_path) + self.base_path = base_path or os.path.dirname(self.file_path) self.checksum = file_checksum diff --git a/delivery/models/sample.py b/delivery/models/sample.py index b6ba45a..b842609 100644 --- a/delivery/models/sample.py +++ b/delivery/models/sample.py @@ -46,6 +46,7 @@ def __init__( lane_no=None, read_no=None, is_index=None, + base_path=None, checksum=None): """ Instantiate a new `SampleFile` object @@ -56,9 +57,14 @@ def __init__( :param lane_no: the lane number the sequences in the file were derived from :param read_no: the read number :param is_index: if True, the sequence file contains index sequences + :param base_path: the base path that the sample_path may be referenced relative to :param checksum: the MD5 checksum for this SampleFile """ - super(SampleFile, self).__init__(sample_path, file_checksum=checksum) + super(SampleFile, self).__init__( + sample_path, + base_path=base_path, + file_checksum=checksum + ) self.sample_name = sample_name self.sample_index = sample_index self.lane_no = lane_no diff --git a/delivery/repositories/project_repository.py b/delivery/repositories/project_repository.py index 1ac7af9..5e0bdc8 100644 --- a/delivery/repositories/project_repository.py +++ b/delivery/repositories/project_repository.py @@ -65,7 +65,13 @@ class UnorganisedRunfolderProjectRepository(object): PROJECTS_DIR = "Unaligned" - def __init__(self, sample_repository, filesystem_service=FileSystemService(), metadata_service=MetadataService()): + def __init__( + self, + sample_repository, + readme_directory, + filesystem_service=FileSystemService(), + metadata_service=MetadataService() + ): """ Instantiate a new UnorganisedRunfolderProjectRepository object @@ -75,6 +81,7 @@ def __init__(self, sample_repository, filesystem_service=FileSystemService(), me self.filesystem_service = filesystem_service self.sample_repository = sample_repository self.metadata_service = metadata_service + self.readme_directory = readme_directory def dump_checksums(self, project): """ @@ -89,7 +96,7 @@ def _sample_file_checksum(sample_file): sample_file.checksum, self.filesystem_service.relpath( sample_file.file_path, - project.path)] if sample_file.checksum else None + project.path)] if sample_file.checksum else [None, None] def _sample_checksums(sample): for sample_file in sample.sample_files: @@ -129,11 +136,31 @@ def project_from_dir(d): runfolder_path=runfolder.path, runfolder_name=runfolder.name ) - try: - project.project_files = self.get_report_files(project, checksums=runfolder.checksums) - except ProjectReportNotFoundException as e: - log.warning(e) - + project_files = [] + for fn, kw in zip( + [ + self.get_report_files, + self.get_project_readme + ], + [ + { + "project": project, + "checksums": runfolder.checksums + }, + { + "project": project, + "with_undetermined": False + } + ] + ): + try: + project_files.extend( + fn(**kw) + ) + except ProjectReportNotFoundException as e: + log.warning(e) + + project.project_files = project_files project.samples = self.sample_repository.get_samples(project, runfolder) return project @@ -141,15 +168,31 @@ def project_from_dir(d): projects_base_dir = os.path.join(runfolder.path, self.PROJECTS_DIR) # only include directories that have fastq.gz files beneath them + dirs = self.filesystem_service.find_project_directories(projects_base_dir) project_directories = filter( dir_contains_fastq_files, - self.filesystem_service.find_project_directories(projects_base_dir) + [dirs] if type(dirs) is str else dirs ) return list(map(project_from_dir, project_directories)) or None except FileNotFoundError: - raise ProjectsDirNotfoundException("Did not find Unaligned folder for: {}".format(runfolder.name)) + raise ProjectsDirNotfoundException( + f"Did not find {self.PROJECTS_DIR} folder for: {runfolder.name}" + ) + + def file_object_from_path(self, file_path, base_path, project, checksums=None): + checksums = checksums or {} + relative_file_path = self.filesystem_service.relpath( + file_path, + self.filesystem_service.dirname(project.runfolder_path)) + checksum = checksums[relative_file_path] \ + if relative_file_path in checksums else self.metadata_service.hash_file(file_path) + return RunfolderFile( + file_path, + base_path=base_path, + file_checksum=checksum + ) def get_report_files(self, project, checksums=None): """ @@ -163,29 +206,47 @@ def get_report_files(self, project, checksums=None): :return: a list of RunfolderFile objects :raises ProjectReportNotFoundException: if no MultiQC or Sisyphus report was found for the project """ - def _file_object_from_path(file_path): - relative_file_path = self.filesystem_service.relpath( - file_path, - self.filesystem_service.dirname(project.runfolder_path)) - checksum = checksums[relative_file_path] \ - if relative_file_path in checksums else self.metadata_service.hash_file(file_path) - return RunfolderFile(file_path, file_checksum=checksum) + def _file_object_from_path(file_paths): + return self.file_object_from_path(file_paths[0], file_paths[1], project, checksums) checksums = checksums or {} + report_files = [] if self.filesystem_service.exists(self.multiqc_report_path(project)): - log.info("MultiQC reports found in Unaligned/{}, overriding organisation of seqreports".format(project.name)) - return list(map(_file_object_from_path, self.multiqc_report_files(project))) - for sisyphus_report_path in self.sisyphus_report_path(project): - if self.filesystem_service.exists(sisyphus_report_path): - log.info("Organising sisyphus reports for {}".format(project.name)) - return list(map( - _file_object_from_path, - self.sisyphus_report_files( - self.filesystem_service.dirname(sisyphus_report_path)))) - if self.filesystem_service.exists(self.seqreports_path(project)): - log.info("Organising seqreports for {}".format(project.name)) - return list(map(_file_object_from_path, self.seqreports_files(project))) - raise ProjectReportNotFoundException("No project report found for {}".format(project.name)) + log.info( + f"MultiQC reports found in Unaligned/{project.name}, overriding organisation of " + f"seqreports") + report_path = self.filesystem_service.dirname( + self.multiqc_report_path(project) + ) + report_files = [ + (report_file, report_path) + for report_file in self.multiqc_report_files(project) + ] + elif self.filesystem_service.exists(self.seqreports_path(project)): + log.info(f"Organising seqreports for {project.name}") + report_path = self.filesystem_service.dirname( + self.seqreports_path(project) + ) + report_files = [ + (report_file, report_path) + for report_file in self.seqreports_files(project) + ] + else: + for sisyphus_report_path in self.sisyphus_report_path(project): + if self.filesystem_service.exists(sisyphus_report_path): + log.info("Organising sisyphus reports for {}".format(project.name)) + report_path = self.filesystem_service.dirname(sisyphus_report_path) + report_files = [ + (report_file, report_path) + for report_file in self.sisyphus_report_files(report_path) + ] + break + if not report_files: + raise ProjectReportNotFoundException( + f"No project report found for {project.name}" + ) + + return list(map(_file_object_from_path, report_files)) @staticmethod def sisyphus_report_path(project): @@ -234,6 +295,31 @@ def seqreports_files(self, project): "{}_{}_multiqc_report_data.zip".format(project.runfolder_name, project.name))) return report_files + def project_readme_path(self, with_undetermined=False): + return os.path.join( + self.readme_directory, + "undetermined" if with_undetermined else "", + "README.md" + ) + + def get_project_readme(self, project, checksums=None, with_undetermined=False): + log.info("Organising README for {}".format(project.name)) + readme_file = self.project_readme_path(with_undetermined) + + if not os.path.exists(readme_file): + raise ProjectReportNotFoundException( + f"{os.path.basename(readme_file)} not found at {os.path.dirname(readme_file)} for " + f"{project.name}" + ) + return [ + self.file_object_from_path( + readme_file, + self.filesystem_service.dirname(readme_file), + project, + checksums + ) + ] + def is_sample_in_project(self, project, sample_project, sample_id, sample_lane): """ Checks if a matching sample is present in the project. diff --git a/delivery/repositories/runfolder_repository.py b/delivery/repositories/runfolder_repository.py index 54a157b..b6d681d 100644 --- a/delivery/repositories/runfolder_repository.py +++ b/delivery/repositories/runfolder_repository.py @@ -21,7 +21,12 @@ class FileSystemBasedRunfolderRepository(object): CHECKSUM_FILE_PATH = os.path.join("MD5", "checksums.md5") SAMPLESHEET_PATH = "SampleSheet.csv" - def __init__(self, base_path, file_system_service=FileSystemService(), metadata_service=MetadataService()): + def __init__( + self, + base_path, + file_system_service=FileSystemService(), + metadata_service=MetadataService() + ): """ Instantiate a new FileSystemBasedRunfolderRepository :param base_path: the directory where runfolders are stored @@ -220,12 +225,17 @@ def _mask_samplesheet_entry(e): return masked_entry samplesheet_data = self.get_samplesheet(runfolder) - # mask all entries not belonging to the project and write the resulting data to the project-specific location + # mask all entries not belonging to the project and write the resulting data to the + # project-specific location project_samplesheet_data = list(map(_mask_samplesheet_entry, samplesheet_data)) project_samplesheet_file = os.path.join(project.path, runfolder.name, self.SAMPLESHEET_PATH) - self.metadata_service.write_samplesheet_file(project_samplesheet_file, project_samplesheet_data) + self.metadata_service.write_samplesheet_file( + project_samplesheet_file, + project_samplesheet_data + ) return RunfolderFile( project_samplesheet_file, + base_path=self.file_system_service.dirname(project_samplesheet_file), file_checksum=self.metadata_service.hash_file( project_samplesheet_file)) diff --git a/delivery/repositories/sample_repository.py b/delivery/repositories/sample_repository.py index 6616de8..db2a8a2 100644 --- a/delivery/repositories/sample_repository.py +++ b/delivery/repositories/sample_repository.py @@ -117,6 +117,7 @@ def sample_file_from_sample_path(self, sample_path, runfolder): lane_no=lane_no, read_no=read_no, is_index=is_index, + base_path=self.file_system_service.dirname(sample_path), checksum=checksum) @staticmethod diff --git a/delivery/services/organise_service.py b/delivery/services/organise_service.py index 6f874b7..e2e1984 100644 --- a/delivery/services/organise_service.py +++ b/delivery/services/organise_service.py @@ -105,13 +105,11 @@ def organise_project(self, runfolder, project, organised_projects_path, lanes): # symlink the project files organised_project_files = [] if project.project_files: - project_file_base = self.file_system_service.dirname(project.project_files[0].file_path) for project_file in project.project_files: organised_project_files.append( self.organise_project_file( project_file, - organised_project_runfolder_path, - project_file_base=project_file_base)) + organised_project_runfolder_path)) organised_project = RunfolderProject( project.name, organised_project_path, @@ -128,27 +126,35 @@ def organise_project(self, runfolder, project, organised_projects_path, lanes): return organised_project - def organise_project_file(self, project_file, organised_project_path, project_file_base=None): + def organise_project_file(self, project_file, organised_project_path): """ Find and symlink the project report to the organised project directory. :param project: a Project instance representing the project before organisation :param organised_project: a Project instance representing the project after organisation """ - project_file_base = project_file_base or self.file_system_service.dirname(project_file.file_path) + + # the relative path from the project file base to the project file (e.g. plots/filename.png) + relpath = self.file_system_service.relpath( + project_file.file_path, + project_file.base_path + ) # the full path to the symlink link_name = os.path.join( organised_project_path, - self.file_system_service.relpath( - project_file.file_path, - project_file_base)) + relpath + ) # the relative path from the symlink to the original file link_path = self.file_system_service.relpath( project_file.file_path, self.file_system_service.dirname(link_name)) self.file_system_service.symlink(link_path, link_name) - return RunfolderFile(link_name, file_checksum=project_file.checksum) + return RunfolderFile( + link_name, + base_path=organised_project_path, + file_checksum=project_file.checksum + ) def organise_sample(self, sample, organised_project_path, lanes): """ @@ -206,4 +212,5 @@ def organise_sample_file(self, sample_file, organised_sample_path, lanes): lane_no=sample_file.lane_no, read_no=sample_file.read_no, is_index=sample_file.is_index, + base_path=organised_sample_path, checksum=sample_file.checksum) diff --git a/tests/resources/readme/README.md b/tests/resources/readme/README.md new file mode 100644 index 0000000..a31412e --- /dev/null +++ b/tests/resources/readme/README.md @@ -0,0 +1,25 @@ +# DELIVERY OF FASTQ FILES FROM NGI-UPPSALA,THE SNP&SEQ TECHNOLOGY PLATFORM + +This delivery includes sequencing data in FASTQ format, a summary report created with MultiQC, checksums and a copy of the sample sheet used for demultiplexing. + +## Delivery structure +``` + +└── + ├── README.md + ├── checksums.md5 + ├── SampleSheet.csv + ├── __multiqc_report_data.zip + ├── __multiqc_report.html + ├── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + ├── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + : + : + └── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz +``` diff --git a/tests/resources/readme/undetermined/README.md b/tests/resources/readme/undetermined/README.md new file mode 100644 index 0000000..059951a --- /dev/null +++ b/tests/resources/readme/undetermined/README.md @@ -0,0 +1,30 @@ +# DELIVERY OF FASTQ FILES FROM NGI-UPPSALA,THE SNP&SEQ TECHNOLOGY PLATFORM + +This delivery includes sequencing data in FASTQ format, a summary report created with MultiQC, checksums and a copy of the sample sheet used for demultiplexing. + +## Delivery structure +``` + +└── + ├── README.md + ├── checksums.md5 + ├── SampleSheet.csv + ├── __multiqc_report_data.zip + ├── __multiqc_report.html + ├── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + ├── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + : + : + └── + ├── __R1_001.fastq.gz + └── __R2_001.fastq.gz + └── Undetermined + ├── Undetermined___R1.001.fastq.gz + ├── Undetermined___R2.001.fastq.gz + ├── ... + └── checksums.md5 +``` diff --git a/tests/test_utils.py b/tests/test_utils.py index 89f8b88..fc84645 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -306,6 +306,7 @@ def project_report_files(project, report_type): FAKE_RUNFOLDERS = [_runfolder1, _runfolder2] UNORGANISED_RUNFOLDER = unorganised_runfolder() +README_DIRECTORY = "/bar" def assert_eventually_equals(self, timeout, f, expected, delay=0.1): diff --git a/tests/unit_tests/repositories/test_project_repository.py b/tests/unit_tests/repositories/test_project_repository.py index 2eb262f..0dfeb0d 100644 --- a/tests/unit_tests/repositories/test_project_repository.py +++ b/tests/unit_tests/repositories/test_project_repository.py @@ -13,7 +13,7 @@ from delivery.services.file_system_service import FileSystemService from delivery.services.metadata_service import MetadataService -from tests.test_utils import UNORGANISED_RUNFOLDER +from tests.test_utils import README_DIRECTORY, UNORGANISED_RUNFOLDER class TestGeneralProjectRepository(unittest.TestCase): @@ -43,6 +43,7 @@ def setUp(self) -> None: self.metadata_service = mock.create_autospec(MetadataService) self.project_repository = UnorganisedRunfolderProjectRepository( sample_repository=self.sample_repository, + readme_directory=README_DIRECTORY, filesystem_service=self.filesystem_service, metadata_service=self.metadata_service) self.runfolder = UNORGANISED_RUNFOLDER diff --git a/tests/unit_tests/services/test_organise_service.py b/tests/unit_tests/services/test_organise_service.py index 3be72a0..5bfe57a 100644 --- a/tests/unit_tests/services/test_organise_service.py +++ b/tests/unit_tests/services/test_organise_service.py @@ -116,8 +116,11 @@ def test_organise_project(self): organise_project_file_mock.assert_has_calls([ mock.call( project_file, - os.path.join(organised_projects_path, self.project.name, self.project.runfolder_name), - project_file_base=os.path.dirname(self.project.project_files[0].file_path) + os.path.join( + organised_projects_path, + self.project.name, + self.project.runfolder_name + ) ) for project_file in self.project.project_files]) @@ -197,13 +200,14 @@ def test_organise_project_file(self): os.path.join( project_file_base, project_file), + base_path=project_file_base, file_checksum="checksum-for-{}".format(project_file)) for project_file in ("a-report-file", os.path.join("report-dir", "another-report-file"))] self.file_system_service.relpath.side_effect = os.path.relpath self.file_system_service.dirname.side_effect = os.path.dirname for project_file in project_files: organised_project_file = self.organise_service.organise_project_file( - project_file, organised_project_path, project_file_base) + project_file, organised_project_path) self.assertEqual( os.path.join( organised_project_path, From e63249bef7344ed5034abf86238ddd598460e47b Mon Sep 17 00:00:00 2001 From: b97pla Date: Fri, 12 Jan 2024 16:12:23 +0100 Subject: [PATCH 2/5] pause in tests, increase timeout --- .github/workflows/tests.yml | 2 +- tests/integration_tests/__init__.py | 0 .../integration_tests/test_integration_dds.py | 37 +++++++++++++++++-- 3 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 tests/integration_tests/__init__.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c8f360e..4bb7172 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,4 +20,4 @@ jobs: pip install -r requirements/dev . - name: Launch tests run: | - nosetests ./tests + python -m unittest discover -s ./tests -t . diff --git a/tests/integration_tests/__init__.py b/tests/integration_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration_tests/test_integration_dds.py b/tests/integration_tests/test_integration_dds.py index 7527642..2473524 100644 --- a/tests/integration_tests/test_integration_dds.py +++ b/tests/integration_tests/test_integration_dds.py @@ -31,6 +31,9 @@ def test_can_stage_and_delivery_runfolder(self): staging_status_links = response_json.get("staging_order_links") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, link in staging_status_links.items(): self.assertEqual(project, "ABC_123") @@ -79,6 +82,9 @@ def test_can_stage_and_delivery_project_dir(self): staging_status_links = response_json.get("staging_order_links") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, link in staging_status_links.items(): self.assertEqual(project, dir_name) @@ -153,6 +159,7 @@ def test_can_stage_and_deliver_batch_flowcells(self): staging_status_links = response_json.get("staging_order_links") + # Insert a pause to allow staging to complete time.sleep(1) for project, link in staging_status_links.items(): @@ -190,6 +197,9 @@ def test_can_stage_and_deliver_force_flowcells(self): staging_status_links = response_json.get("staging_order_links") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, link in staging_status_links.items(): self.assertEqual(project, 'XYZ_123') @@ -248,7 +258,7 @@ def __init__(self, *args): self.mock_duration = 2 - @gen_test(timeout=5) + @gen_test(timeout=10) def test_mock_duration_is_2(self): with tempfile.TemporaryDirectory( dir='./tests/resources/runfolders/', @@ -268,6 +278,9 @@ def test_mock_duration_is_2(self): staging_order_project_and_id = response_json.get("staging_order_ids") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, staging_id in staging_order_project_and_id.items(): delivery_url = '/'.join([ self.API_BASE, 'deliver', 'stage_id', str(staging_id)]) @@ -290,6 +303,9 @@ def test_mock_duration_is_2(self): delivery_link = delivery_resp_as_json['delivery_order_link'] while True: + # Insert a pause to allow delivery to complete + time.sleep(1) + status_response = yield self.http_client.fetch( delivery_link) if (json.loads(status_response.body)["status"] @@ -299,7 +315,7 @@ def test_mock_duration_is_2(self): stop = time.time() self.assertTrue(stop - start >= self.mock_duration) - @gen_test(timeout=5) + @gen_test(timeout=10) def test_can_delivery_data_asynchronously(self): with tempfile.TemporaryDirectory( dir='./tests/resources/runfolders/', @@ -318,8 +334,18 @@ def test_can_delivery_data_asynchronously(self): response_json = json.loads(response.body) staging_order_project_and_id = response_json.get("staging_order_ids") + staging_status_links = response_json.get("staging_order_links") + + for project, link in staging_status_links.items(): + staging_id = staging_order_project_and_id[project] + while True: + # Insert a pause to allow staging to complete + time.sleep(1) + status_response = yield self.http_client.fetch(link) + if json.loads(status_response.body)["status"] == \ + StagingStatus.staging_successful.name: + break - for project, staging_id in staging_order_project_and_id.items(): delivery_url = '/'.join([ self.API_BASE, 'deliver', 'stage_id', str(staging_id)]) delivery_body = { @@ -339,6 +365,8 @@ def test_can_delivery_data_asynchronously(self): delivery_link = delivery_resp_as_json['delivery_order_link'] while True: + # Insert a pause to allow delivery to complete + time.sleep(1) status_response = yield self.http_client.fetch( delivery_link) if (json.loads(status_response.body)["status"] @@ -379,6 +407,9 @@ def test_can_deliver_and_not_timeout(self): staging_order_project_and_id = response_json.get( "staging_order_ids") + # Insert a pause to allow staging to complete + time.sleep(1) + for project, staging_id in staging_order_project_and_id.items(): delivery_url = '/'.join([ self.API_BASE, 'deliver', 'stage_id', str(staging_id)]) From 6e932c42b1515f384c5bf86d8dfc9fac5bbcc50e Mon Sep 17 00:00:00 2001 From: b97pla Date: Thu, 18 Jan 2024 17:19:14 +0100 Subject: [PATCH 3/5] refactor unorganised project file fetching --- delivery/models/runfolder.py | 38 +++ delivery/models/sample.py | 9 +- delivery/repositories/project_repository.py | 318 +++++++++--------- delivery/repositories/runfolder_repository.py | 2 + delivery/repositories/sample_repository.py | 17 +- delivery/services/organise_service.py | 5 +- tests/test_utils.py | 2 +- .../repositories/test_project_repository.py | 12 +- .../repositories/test_sample_repository.py | 6 +- 9 files changed, 241 insertions(+), 168 deletions(-) diff --git a/delivery/models/runfolder.py b/delivery/models/runfolder.py index 065c57c..0bcf9e5 100644 --- a/delivery/models/runfolder.py +++ b/delivery/models/runfolder.py @@ -43,7 +43,45 @@ def __init__( base_path=None, file_checksum=None ): + """ + A `RunfolderFile` object representing a file in the runfolder + + If specified, the `base_path` parameter should specify the path that the file will be + considered relative to. For example, if `file_path` is `/path/to/example/file_name` and + `base_path` is `/path/to`, the file object, if moved or symlinked, will be placed under the + intermediate directory, i.e. `example/file_name`. + + :param file_path: the path to the file + :param base_path: a path relative to which the file will be considered + :param file_checksum: a computed checksum for the file + """ self.file_path = os.path.abspath(file_path) self.file_name = os.path.basename(file_path) self.base_path = base_path or os.path.dirname(self.file_path) self.checksum = file_checksum + + @classmethod + def create_object_from_path( + cls, + file_path, + runfolder_path, + filesystem_service, + metadata_service, + base_path=None, + checksums=None + ): + checksums = checksums or {} + relative_file_path = filesystem_service.relpath( + file_path, + filesystem_service.dirname( + runfolder_path + ) + ) + checksum = checksums[relative_file_path] \ + if relative_file_path in checksums \ + else metadata_service.hash_file(file_path) + return cls( + file_path, + base_path=base_path, + file_checksum=checksum + ) diff --git a/delivery/models/sample.py b/delivery/models/sample.py index b842609..6f3b2b1 100644 --- a/delivery/models/sample.py +++ b/delivery/models/sample.py @@ -49,7 +49,12 @@ def __init__( base_path=None, checksum=None): """ - Instantiate a new `SampleFile` object + A `SampleFile` object + + If specified, the `base_path` parameter should specify the path that the file will be + considered relative to. For example, if `file_path` is `/path/to/example/file_name` and + `base_path` is `/path/to`, the file object, if moved or symlinked, will be placed under the + intermediate directory, i.e. `example/file_name` :param sample_path: the path to the file :param sample_name: the name of the sample @@ -57,7 +62,7 @@ def __init__( :param lane_no: the lane number the sequences in the file were derived from :param read_no: the read number :param is_index: if True, the sequence file contains index sequences - :param base_path: the base path that the sample_path may be referenced relative to + :param base_path: a path relative to which the file will be considered :param checksum: the MD5 checksum for this SampleFile """ super(SampleFile, self).__init__( diff --git a/delivery/repositories/project_repository.py b/delivery/repositories/project_repository.py index 5e0bdc8..97bbb68 100644 --- a/delivery/repositories/project_repository.py +++ b/delivery/repositories/project_repository.py @@ -59,8 +59,9 @@ def get_project(self, project_name): class UnorganisedRunfolderProjectRepository(object): """ - Repository for a unorganised project in a runfolder. For this purpose a project is represented by a directory under - the runfolder's PROJECTS_DIR directory, having at least one fastq file beneath it. + Repository for a unorganised project in a runfolder. For this purpose a project is represented + by a directory under the runfolder's PROJECTS_DIR directory, having at least one fastq file + beneath it. """ PROJECTS_DIR = "Unaligned" @@ -76,7 +77,10 @@ def __init__( Instantiate a new UnorganisedRunfolderProjectRepository object :param sample_repository: a RunfolderProjectBasedSampleRepository instance - :param filesystem_service: a FileSystemService instance for accessing the file system + :param readme_directory: the path to the directory containing README files to include when + organising the project + :param filesystem_service: a FileSystemService instance for accessing the file system + :param metadata_service: a MetadataService for reading and writing metadata files """ self.filesystem_service = filesystem_service self.sample_repository = sample_repository @@ -117,11 +121,13 @@ def _sample_checksums(sample): def get_projects(self, runfolder): """ - Returns a list of RunfolderProject instances, representing all projects found in this runfolder. + Returns a list of RunfolderProject instances, representing all projects found in this + runfolder, or None if no project can be found. :param runfolder: a Runfolder instance :return: a list of RunfolderProject instances or None if no projects were found - :raises: ProjectsDirNotfoundException if the Unaligned directory could not be found in the runfolder + :raises: ProjectsDirNotfoundException if the Unaligned directory could not be found in the + runfolder """ def dir_contains_fastq_files(d): return any( @@ -130,39 +136,47 @@ def dir_contains_fastq_files(d): self.filesystem_service.list_files_recursively(d))) def project_from_dir(d): - project = RunfolderProject( - name=os.path.basename(d), - path=os.path.join(projects_base_dir, d), - runfolder_path=runfolder.path, - runfolder_name=runfolder.name - ) + project_path = os.path.join(projects_base_dir, d) + project_name = os.path.basename(d) project_files = [] - for fn, kw in zip( - [ - self.get_report_files, - self.get_project_readme - ], - [ - { - "project": project, - "checksums": runfolder.checksums - }, - { - "project": project, - "with_undetermined": False - } - ] - ): - try: - project_files.extend( - fn(**kw) + + try: + project_files.extend( + self.get_report_files( + project_path, + project_name, + runfolder, + checksums=runfolder.checksums + ) + ) + except ProjectReportNotFoundException as ex: + log.warning(ex) + + try: + project_files.extend( + self.get_project_readme( + project_name=project_name, + runfolder=runfolder, + with_undetermined=False ) - except ProjectReportNotFoundException as e: - log.warning(e) + ) + except ProjectReportNotFoundException as ex: + log.warning(ex) + + samples = self.sample_repository.get_samples( + project_path, + project_name, + runfolder + ) - project.project_files = project_files - project.samples = self.sample_repository.get_samples(project, runfolder) - return project + return RunfolderProject( + name=project_name, + path=os.path.join(projects_base_dir, d), + runfolder_path=runfolder.path, + runfolder_name=runfolder.name, + project_files=project_files, + samples=samples + ) try: projects_base_dir = os.path.join(runfolder.path, self.PROJECTS_DIR) @@ -174,151 +188,150 @@ def project_from_dir(d): [dirs] if type(dirs) is str else dirs ) - return list(map(project_from_dir, project_directories)) or None + return [ + project_from_dir(project_directory) + for project_directory in project_directories + ] or None except FileNotFoundError: raise ProjectsDirNotfoundException( f"Did not find {self.PROJECTS_DIR} folder for: {runfolder.name}" ) - def file_object_from_path(self, file_path, base_path, project, checksums=None): - checksums = checksums or {} - relative_file_path = self.filesystem_service.relpath( - file_path, - self.filesystem_service.dirname(project.runfolder_path)) - checksum = checksums[relative_file_path] \ - if relative_file_path in checksums else self.metadata_service.hash_file(file_path) - return RunfolderFile( - file_path, - base_path=base_path, - file_checksum=checksum - ) - - def get_report_files(self, project, checksums=None): + def get_report_files(self, project_path, project_name, runfolder, checksums=None): """ - Gets the paths to files associated with the supplied project's report. This can be either a MultiQC report or, - if no such report was found, a Sisyphus report. If a pre-calculated checksum cannot be found for a file, it will - be calculated on-the-fly. - - :param project: a RunfolderProject instance - :param checksums: a dict with pre-calculated checksums for files. paths are keys and the corresponding - checksum is the value - :return: a list of RunfolderFile objects - :raises ProjectReportNotFoundException: if no MultiQC or Sisyphus report was found for the project + Gets the paths to files associated with the supplied project's MultiQC report. This report + is fetched from seqreports unless there is a MultiQC report directly under the project's + path. If a pre-calculated checksum cannot be found for a file, it will be calculated + on-the-fly. + + :param project_path: the path to the project folder + :param project_name: the name of the project + :param runfolder: a Runfolder instance representing the runfolder containing the project + :param checksums: a dict with pre-calculated checksums for files. paths are keys and the + corresponding checksum is the value + :return: a list of RunfolderFile objects representing project report files + :raises ProjectReportNotFoundException: if no MultiQC report was found for the project """ - def _file_object_from_path(file_paths): - return self.file_object_from_path(file_paths[0], file_paths[1], project, checksums) - checksums = checksums or {} - report_files = [] - if self.filesystem_service.exists(self.multiqc_report_path(project)): + report_files = self.project_multiqc_report_files(project_path, project_name) + if self.filesystem_service.exists(report_files[0]): log.info( - f"MultiQC reports found in Unaligned/{project.name}, overriding organisation of " - f"seqreports") - report_path = self.filesystem_service.dirname( - self.multiqc_report_path(project) + f"MultiQC reports found in Unaligned/{project_name}, overriding organisation of " + f"seqreports" ) - report_files = [ - (report_file, report_path) - for report_file in self.multiqc_report_files(project) - ] - elif self.filesystem_service.exists(self.seqreports_path(project)): - log.info(f"Organising seqreports for {project.name}") - report_path = self.filesystem_service.dirname( - self.seqreports_path(project) + else: + log.info(f"Organising seqreports for {project_name}") + report_files = self.runfolder_project_multiqc_report_files( + project_name, + runfolder ) - report_files = [ - (report_file, report_path) - for report_file in self.seqreports_files(project) + try: + report_path = self.filesystem_service.dirname(report_files[0]) + return [ + RunfolderFile.create_object_from_path( + file_path=report_file, + runfolder_path=runfolder.path, + filesystem_service=self.filesystem_service, + metadata_service=self.metadata_service, + base_path=report_path, + checksums=checksums + ) + for report_file in report_files ] - else: - for sisyphus_report_path in self.sisyphus_report_path(project): - if self.filesystem_service.exists(sisyphus_report_path): - log.info("Organising sisyphus reports for {}".format(project.name)) - report_path = self.filesystem_service.dirname(sisyphus_report_path) - report_files = [ - (report_file, report_path) - for report_file in self.sisyphus_report_files(report_path) - ] - break - if not report_files: + except FileNotFoundError: raise ProjectReportNotFoundException( - f"No project report found for {project.name}" + f"No project report found for {project_name}" ) - return list(map(_file_object_from_path, report_files)) - @staticmethod - def sisyphus_report_path(project): - return os.path.join( - project.runfolder_path, "Summary", project.name, "report.html"), \ - os.path.join( - project.path, "report.html") - - def sisyphus_report_files(self, report_dir): - report_files = [ - os.path.join(report_dir, "report.html"), - os.path.join(report_dir, "report.xml"), - os.path.join(report_dir, "report.xsl") + def project_multiqc_report_files(project_path, project_name): + """ + Return a list of MultiQC report files found under the project directory. + + :param project_path: the path to the project folder + :param project_name: the name of the project + :return: a list of paths to MultiQC-report-related files + """ + return [ + os.path.join( + project_path, + f"{project_name}_multiqc_report.html" + ), + os.path.join( + project_path, + f"{project_name}_multiqc_report_data.zip" + ) ] - report_files.extend(list( - self.filesystem_service.list_files_recursively( - os.path.join( - report_dir, - "Plots")))) - return report_files @staticmethod - def multiqc_report_path(project): - return os.path.join( - project.path, - "{}_multiqc_report.html".format(project.name)) - - def multiqc_report_files(self, project): - report_files = [self.multiqc_report_path(project)] - report_dir = self.filesystem_service.dirname(report_files[0]) - report_files.append( - os.path.join(report_dir, "{}_multiqc_report_data.zip".format(project.name))) - return report_files + def runfolder_project_multiqc_report_files(project_name, runfolder): + """ + Return a list of MultiQC report files for a project found under the seqreports folder. - @staticmethod - def seqreports_path(project): - return os.path.join( - project.runfolder_path, "seqreports", "projects", project.name, - "{}_{}_multiqc_report.html".format(project.runfolder_name, project.name)) - - def seqreports_files(self, project): - report_files = [self.seqreports_path(project)] - report_dir = self.filesystem_service.dirname(report_files[0]) - report_files.append( - os.path.join(report_dir, - "{}_{}_multiqc_report_data.zip".format(project.runfolder_name, project.name))) - return report_files - - def project_readme_path(self, with_undetermined=False): - return os.path.join( + :param project_name: the name of the project + :param runfolder: a Runfolder instance representing the runfolder containing the project + :return: a list of paths to MultiQC-report-related files + """ + report_dir = os.path.join( + runfolder.path, + "seqreports", + "projects", + project_name + ) + return [ + os.path.join( + report_dir, + f"{runfolder.name}_{project_name}_multiqc_report.html", + ), + os.path.join( + report_dir, + f"{runfolder.name}_{project_name}_multiqc_report_data.zip" + ) + ] + + def get_project_readme( + self, + project_name, + runfolder, + checksums=None, + with_undetermined=False + ): + """ + Get the README to be included with the project data set. + + :param project_name: the name of the project + :param runfolder: a Runfolder instance representing the runfolder containing the project + :param checksums: a dict with pre-calculated checksums for files. paths are keys and the + corresponding checksum is the value + :param with_undetermined: if True, the README should refer to data that includes + undetermined reads + :return: the path to the README file wrapped in a list + :raises ProjectReportNotFoundException: if the README was not found + """ + log.info(f"Organising README for {project_name}") + readme_file = os.path.join( self.readme_directory, "undetermined" if with_undetermined else "", "README.md" ) - def get_project_readme(self, project, checksums=None, with_undetermined=False): - log.info("Organising README for {}".format(project.name)) - readme_file = self.project_readme_path(with_undetermined) - - if not os.path.exists(readme_file): + try: + return [ + RunfolderFile.create_object_from_path( + file_path=readme_file, + runfolder_path=runfolder.path, + filesystem_service=self.filesystem_service, + metadata_service=self.metadata_service, + base_path=self.filesystem_service.dirname(readme_file), + checksums=checksums + ) + ] + except FileNotFoundError: raise ProjectReportNotFoundException( f"{os.path.basename(readme_file)} not found at {os.path.dirname(readme_file)} for " - f"{project.name}" + f"{project_name}" ) - return [ - self.file_object_from_path( - readme_file, - self.filesystem_service.dirname(readme_file), - project, - checksums - ) - ] def is_sample_in_project(self, project, sample_project, sample_id, sample_lane): """ @@ -335,7 +348,8 @@ def is_sample_in_project(self, project, sample_project, sample_id, sample_lane): return all([ sample_project == project.name, sample_obj, - sample_lane in sample_lanes]) + sample_lane in sample_lanes + ]) @staticmethod def get_sample(project, sample_id): diff --git a/delivery/repositories/runfolder_repository.py b/delivery/repositories/runfolder_repository.py index b6d681d..98d30f2 100644 --- a/delivery/repositories/runfolder_repository.py +++ b/delivery/repositories/runfolder_repository.py @@ -31,6 +31,7 @@ def __init__( Instantiate a new FileSystemBasedRunfolderRepository :param base_path: the directory where runfolders are stored :param file_system_service: a service which can access the file system. + :param metadata_service: a MetadataService for reading and writing metadata files """ self._base_path = base_path self.file_system_service = file_system_service @@ -161,6 +162,7 @@ def __init__( :param base_path: the directory where runfolders are stored :param project_repository: an instance of UnorganisedRunfolderProjectRepository :param file_system_service: a service which can access the file system + :param metadata_service: a MetadataService for reading and writing metadata files """ super(FileSystemBasedUnorganisedRunfolderRepository, self).__init__( base_path, diff --git a/delivery/repositories/sample_repository.py b/delivery/repositories/sample_repository.py index db2a8a2..5e0ad71 100644 --- a/delivery/repositories/sample_repository.py +++ b/delivery/repositories/sample_repository.py @@ -23,7 +23,7 @@ class RunfolderProjectBasedSampleRepository(object): def __init__(self, file_system_service=FileSystemService()): self.file_system_service = file_system_service - def get_samples(self, project, runfolder): + def get_samples(self, project_path, project_name, runfolder): """ Parse the supplied project directory and create Sample instances representing the samples in the project. @@ -31,26 +31,26 @@ def get_samples(self, project, runfolder): :param runfolder: a Runfolder instance :return: a list of Sample instances """ - return self._get_samples(project, runfolder) + return self._get_samples(project_path, project_name, runfolder) - def _get_samples(self, project, runfolder): + def _get_samples(self, project_path, project_name, runfolder): def _is_fastq_file(f): return re.match(self.filename_regexp, f) is not None def _name_from_sample_file(s): - subdir = self.file_system_service.relpath(os.path.dirname(s.file_path), project.path) + subdir = self.file_system_service.relpath(os.path.dirname(s.file_path), project_path) return s.sample_name, subdir if subdir != "." else None def _sample_from_name(name_id, sample_files=None): - return Sample(name_id[0], project.name, sample_id=name_id[1], sample_files=sample_files) + return Sample(name_id[0], project_name, sample_id=name_id[1], sample_files=sample_files) def _sample_file_from_path(p): return self.sample_file_from_sample_path(p, runfolder) project_fastq_files = filter( _is_fastq_file, - self.file_system_service.list_files_recursively(project.path)) + self.file_system_service.list_files_recursively(project_path)) # create SampleFile objects from the paths project_sample_files = list(map( @@ -111,14 +111,15 @@ def sample_file_from_sample_path(self, sample_path, runfolder): checksum = None return SampleFile( - sample_path, + sample_path=sample_path, sample_name=sample_name, sample_index=sample_index, lane_no=lane_no, read_no=read_no, is_index=is_index, base_path=self.file_system_service.dirname(sample_path), - checksum=checksum) + checksum=checksum + ) @staticmethod def sample_lanes(sample): diff --git a/delivery/services/organise_service.py b/delivery/services/organise_service.py index e2e1984..4d75407 100644 --- a/delivery/services/organise_service.py +++ b/delivery/services/organise_service.py @@ -206,11 +206,12 @@ def organise_sample_file(self, sample_file, organised_sample_path, lanes): relative_path = self.file_system_service.relpath(sample_file.file_path, organised_sample_path) self.file_system_service.symlink(relative_path, link_name) return SampleFile( - link_name, + sample_path=link_name, sample_name=sample_file.sample_name, sample_index=sample_file.sample_index, lane_no=sample_file.lane_no, read_no=sample_file.read_no, is_index=sample_file.is_index, base_path=organised_sample_path, - checksum=sample_file.checksum) + checksum=sample_file.checksum + ) diff --git a/tests/test_utils.py b/tests/test_utils.py index fc84645..e508516 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -68,7 +68,7 @@ def lane_generator(): def report_type_generator(): - report_types = ["multiqc", "seqreports", "sisyphus"] + report_types = ["multiqc", "seqreports"] while True: for rt in report_types: yield rt diff --git a/tests/unit_tests/repositories/test_project_repository.py b/tests/unit_tests/repositories/test_project_repository.py index 0dfeb0d..8539879 100644 --- a/tests/unit_tests/repositories/test_project_repository.py +++ b/tests/unit_tests/repositories/test_project_repository.py @@ -52,10 +52,14 @@ def test_get_report_files(self): # missing report files will raise an exception at this point self.filesystem_service.exists.return_value = False + self.metadata_service.hash_file.side_effect = FileNotFoundError self.assertRaises( ProjectReportNotFoundException, self.project_repository.get_report_files, - self.runfolder.projects[0]) + self.runfolder.projects[0].path, + self.runfolder.projects[0].name, + self.runfolder + ) def test_get_projects(self): @@ -79,5 +83,9 @@ def test_override_log_message(self): self.filesystem_service.dirname.return_value = "foo/bar" with self.assertLogs(level='INFO') as log: - self.project_repository.get_report_files(self.runfolder.projects[0]) + self.project_repository.get_report_files( + self.runfolder.projects[0].path, + self.runfolder.projects[0].name, + self.runfolder + ) self.assertIn('overriding organisation of seqreports', log.output[0]) diff --git a/tests/unit_tests/repositories/test_sample_repository.py b/tests/unit_tests/repositories/test_sample_repository.py index bbb9465..9194ae9 100644 --- a/tests/unit_tests/repositories/test_sample_repository.py +++ b/tests/unit_tests/repositories/test_sample_repository.py @@ -27,7 +27,11 @@ def setUp(self): def test_get_samples(self): self.file_system_service.relpath.side_effect = os.path.relpath self.file_system_service.dirname = os.path.dirname - for sample in self.sample_repo.get_samples(self.project, self.runfolder): + for sample in self.sample_repo.get_samples( + self.project.path, + self.project.name, + self.runfolder + ): self.assertIn(sample, self.project.samples) for sample_file in sample.sample_files: sample_file_subdir = os.path.dirname( From d94875cb200a2d2933c6f813011591f0cc819697 Mon Sep 17 00:00:00 2001 From: b97pla Date: Thu, 18 Jan 2024 18:06:56 +0100 Subject: [PATCH 4/5] copy report file instead of symlink --- delivery/models/runfolder.py | 13 ++++++-- delivery/models/sample.py | 12 ++++++-- delivery/repositories/project_repository.py | 3 +- delivery/services/file_system_service.py | 14 +++++++++ delivery/services/organise_service.py | 33 ++++++++++++++------- 5 files changed, 58 insertions(+), 17 deletions(-) diff --git a/delivery/models/runfolder.py b/delivery/models/runfolder.py index 0bcf9e5..3a95458 100644 --- a/delivery/models/runfolder.py +++ b/delivery/models/runfolder.py @@ -41,7 +41,8 @@ def __init__( self, file_path, base_path=None, - file_checksum=None + file_checksum=None, + file_operation="symlink" ): """ A `RunfolderFile` object representing a file in the runfolder @@ -54,11 +55,15 @@ def __init__( :param file_path: the path to the file :param base_path: a path relative to which the file will be considered :param file_checksum: a computed checksum for the file + :param file_operation: the file operation to perform, symlink (default) or copy """ self.file_path = os.path.abspath(file_path) self.file_name = os.path.basename(file_path) self.base_path = base_path or os.path.dirname(self.file_path) self.checksum = file_checksum + self.file_operation = file_operation \ + if file_operation and file_operation in ["symlink", "copy"] \ + else "symlink" @classmethod def create_object_from_path( @@ -68,7 +73,8 @@ def create_object_from_path( filesystem_service, metadata_service, base_path=None, - checksums=None + checksums=None, + file_operation="symlink" ): checksums = checksums or {} relative_file_path = filesystem_service.relpath( @@ -83,5 +89,6 @@ def create_object_from_path( return cls( file_path, base_path=base_path, - file_checksum=checksum + file_checksum=checksum, + file_operation=file_operation ) diff --git a/delivery/models/sample.py b/delivery/models/sample.py index 6f3b2b1..d823e5c 100644 --- a/delivery/models/sample.py +++ b/delivery/models/sample.py @@ -47,7 +47,9 @@ def __init__( read_no=None, is_index=None, base_path=None, - checksum=None): + checksum=None, + file_operation="symlink" + ): """ A `SampleFile` object @@ -64,11 +66,13 @@ def __init__( :param is_index: if True, the sequence file contains index sequences :param base_path: a path relative to which the file will be considered :param checksum: the MD5 checksum for this SampleFile + :param file_operation: the file operation to perform, symlink (default) or copy """ super(SampleFile, self).__init__( sample_path, base_path=base_path, - file_checksum=checksum + file_checksum=checksum, + file_operation=file_operation ) self.sample_name = sample_name self.sample_index = sample_index @@ -88,4 +92,6 @@ def __hash__(self): self.lane_no, self.read_no, self.is_index, - self.checksum)) + self.checksum, + self.file_operation + )) diff --git a/delivery/repositories/project_repository.py b/delivery/repositories/project_repository.py index 97bbb68..a89478f 100644 --- a/delivery/repositories/project_repository.py +++ b/delivery/repositories/project_repository.py @@ -324,7 +324,8 @@ def get_project_readme( filesystem_service=self.filesystem_service, metadata_service=self.metadata_service, base_path=self.filesystem_service.dirname(readme_file), - checksums=checksums + checksums=checksums, + file_operation="copy" ) ] except FileNotFoundError: diff --git a/delivery/services/file_system_service.py b/delivery/services/file_system_service.py index 86bb4a6..de2470c 100644 --- a/delivery/services/file_system_service.py +++ b/delivery/services/file_system_service.py @@ -1,6 +1,7 @@ import os import logging +import shutil log = logging.getLogger(__name__) @@ -91,6 +92,19 @@ def symlink(self, source, link_name): self.makedirs(self.dirname(link_name), exist_ok=True) return os.symlink(source, link_name) + @staticmethod + def copy(source, dest): + """ + Shadows shutil.copyfile + :param source: + :param dest: + :return: None + """ + try: + return shutil.copyfile(source, dest) + except IsADirectoryError: + return shutil.copytree(source, dest, symlinks=True) + @staticmethod def mkdir(path): """ diff --git a/delivery/services/organise_service.py b/delivery/services/organise_service.py index 4d75407..7a2d169 100644 --- a/delivery/services/organise_service.py +++ b/delivery/services/organise_service.py @@ -128,10 +128,11 @@ def organise_project(self, runfolder, project, organised_projects_path, lanes): def organise_project_file(self, project_file, organised_project_path): """ - Find and symlink the project report to the organised project directory. + Find and symlink or copy the project-associated files to the organised project directory. - :param project: a Project instance representing the project before organisation - :param organised_project: a Project instance representing the project after organisation + :param project_file: a RunfolderFile instance representing the project-associated file + before organisation + :param organised_project_path: path where the project will be organised """ # the relative path from the project file base to the project file (e.g. plots/filename.png) @@ -145,11 +146,16 @@ def organise_project_file(self, project_file, organised_project_path): organised_project_path, relpath ) - # the relative path from the symlink to the original file - link_path = self.file_system_service.relpath( - project_file.file_path, - self.file_system_service.dirname(link_name)) - self.file_system_service.symlink(link_path, link_name) + if project_file.file_operation == "copy": + self.file_system_service.copy(project_file.file_path, link_name) + else: + # the relative path from the symlink to the original file + link_path = self.file_system_service.relpath( + project_file.file_path, + self.file_system_service.dirname(link_name) + ) + self.file_system_service.symlink(link_path, link_name) + return RunfolderFile( link_name, base_path=organised_project_path, @@ -203,8 +209,15 @@ def organise_sample_file(self, sample_file, organised_sample_path, lanes): # create the symlink in the supplied directory and relative to the file's original location link_name = os.path.join(organised_sample_path, sample_file.file_name) - relative_path = self.file_system_service.relpath(sample_file.file_path, organised_sample_path) - self.file_system_service.symlink(relative_path, link_name) + if sample_file.file_operation == "copy": + self.file_system_service.copy(sample_file.file_path, link_name) + else: + relative_path = self.file_system_service.relpath( + sample_file.file_path, + organised_sample_path + ) + self.file_system_service.symlink(relative_path, link_name) + return SampleFile( sample_path=link_name, sample_name=sample_file.sample_name, From 8246380ee3ec5260ad32b0affe13b938744bfc72 Mon Sep 17 00:00:00 2001 From: b97pla Date: Tue, 23 Jan 2024 14:31:29 +0100 Subject: [PATCH 5/5] simplify report fetching and file object --- delivery/models/project.py | 17 ++- delivery/models/runfolder.py | 25 ++-- delivery/models/sample.py | 10 +- delivery/repositories/project_repository.py | 123 ++++++++---------- delivery/services/organise_service.py | 78 ++++++----- tests/integration_tests/test_integration.py | 7 +- .../services/test_organise_service.py | 6 +- 7 files changed, 133 insertions(+), 133 deletions(-) diff --git a/delivery/models/project.py b/delivery/models/project.py index 47a69c0..cb2ae09 100644 --- a/delivery/models/project.py +++ b/delivery/models/project.py @@ -35,18 +35,29 @@ def __hash__(self): class RunfolderProject(BaseProject): """ - Model a project directory in a runfolder on disk. Note that this means that this project model only extends - to the idea of projects as subdirectories in a demultiplexed Illumina runfolder. + Model a project directory in a runfolder on disk. Note that this means that this project model + only extends to the idea of projects as subdirectories in a demultiplexed Illumina runfolder. """ - def __init__(self, name, path, runfolder_path, runfolder_name, samples=None, project_files=None): + def __init__( + self, + name, + path, + runfolder_path, + runfolder_name, + samples=None, + project_files=None + ): """ Instantiate a new `RunfolderProject` object + :param name: of the project :param path: path to the project :param runfolder_path: path the runfolder in which this project is stored. :param runfolder_name: name of the runfolder in which this project is stored :param samples: list of instances of Sample, representing samples in the project + :param project_files: list of instances of RunfolderFile, representing files belonging to + the project. These do not include files already accounted for in the `samples` list. """ self.name = name self.path = os.path.abspath(path) diff --git a/delivery/models/runfolder.py b/delivery/models/runfolder.py index 3a95458..0896433 100644 --- a/delivery/models/runfolder.py +++ b/delivery/models/runfolder.py @@ -41,8 +41,7 @@ def __init__( self, file_path, base_path=None, - file_checksum=None, - file_operation="symlink" + file_checksum=None ): """ A `RunfolderFile` object representing a file in the runfolder @@ -55,15 +54,11 @@ def __init__( :param file_path: the path to the file :param base_path: a path relative to which the file will be considered :param file_checksum: a computed checksum for the file - :param file_operation: the file operation to perform, symlink (default) or copy """ self.file_path = os.path.abspath(file_path) self.file_name = os.path.basename(file_path) self.base_path = base_path or os.path.dirname(self.file_path) self.checksum = file_checksum - self.file_operation = file_operation \ - if file_operation and file_operation in ["symlink", "copy"] \ - else "symlink" @classmethod def create_object_from_path( @@ -73,9 +68,20 @@ def create_object_from_path( filesystem_service, metadata_service, base_path=None, - checksums=None, - file_operation="symlink" + checksums=None ): + """ + Factory method that creates and returns a class instance. + + :param file_path: the path to the file + :param runfolder_path: the path to the runfolder containing the file + :param filesystem_service: a service which can access the file system + :param metadata_service: a MetadataService for reading and writing metadata files + :param base_path: a path relative to which the file will be considered + :param checksums: a list of pre-computed checksums that may or may not contain an entry for + the `file_path` relative to the `runfolder_path` + :return: a class object representing a file in the runfolder + """ checksums = checksums or {} relative_file_path = filesystem_service.relpath( file_path, @@ -89,6 +95,5 @@ def create_object_from_path( return cls( file_path, base_path=base_path, - file_checksum=checksum, - file_operation=file_operation + file_checksum=checksum ) diff --git a/delivery/models/sample.py b/delivery/models/sample.py index d823e5c..8187065 100644 --- a/delivery/models/sample.py +++ b/delivery/models/sample.py @@ -47,8 +47,7 @@ def __init__( read_no=None, is_index=None, base_path=None, - checksum=None, - file_operation="symlink" + checksum=None ): """ A `SampleFile` object @@ -66,13 +65,11 @@ def __init__( :param is_index: if True, the sequence file contains index sequences :param base_path: a path relative to which the file will be considered :param checksum: the MD5 checksum for this SampleFile - :param file_operation: the file operation to perform, symlink (default) or copy """ super(SampleFile, self).__init__( sample_path, base_path=base_path, - file_checksum=checksum, - file_operation=file_operation + file_checksum=checksum ) self.sample_name = sample_name self.sample_index = sample_index @@ -92,6 +89,5 @@ def __hash__(self): self.lane_no, self.read_no, self.is_index, - self.checksum, - self.file_operation + self.checksum )) diff --git a/delivery/repositories/project_repository.py b/delivery/repositories/project_repository.py index a89478f..1ede6d3 100644 --- a/delivery/repositories/project_repository.py +++ b/delivery/repositories/project_repository.py @@ -214,81 +214,63 @@ def get_report_files(self, project_path, project_name, runfolder, checksums=None :raises ProjectReportNotFoundException: if no MultiQC report was found for the project """ - report_files = self.project_multiqc_report_files(project_path, project_name) - if self.filesystem_service.exists(report_files[0]): - log.info( - f"MultiQC reports found in Unaligned/{project_name}, overriding organisation of " - f"seqreports" - ) - else: - log.info(f"Organising seqreports for {project_name}") - report_files = self.runfolder_project_multiqc_report_files( - project_name, - runfolder - ) - try: - report_path = self.filesystem_service.dirname(report_files[0]) - return [ - RunfolderFile.create_object_from_path( - file_path=report_file, - runfolder_path=runfolder.path, - filesystem_service=self.filesystem_service, - metadata_service=self.metadata_service, - base_path=report_path, - checksums=checksums - ) - for report_file in report_files - ] - except FileNotFoundError: - raise ProjectReportNotFoundException( - f"No project report found for {project_name}" - ) - - @staticmethod - def project_multiqc_report_files(project_path, project_name): - """ - Return a list of MultiQC report files found under the project directory. - - :param project_path: the path to the project folder - :param project_name: the name of the project - :return: a list of paths to MultiQC-report-related files - """ - return [ - os.path.join( - project_path, - f"{project_name}_multiqc_report.html" - ), + # look for the MultiQC report in these locations (in order of precedence) + report_paths = [ + project_path, os.path.join( - project_path, - f"{project_name}_multiqc_report_data.zip" + runfolder.path, + "seqreports", + "projects", + project_name ) ] + # depending on the location of the MultiQC report, use these different prefixes + report_prefixes = [ + project_name, + f"{runfolder.name}_{project_name}" + ] + report_names = [ + "multiqc_report.html", + "multiqc_report_data.zip" + ] + for report_path, report_prefix in zip(report_paths, report_prefixes): + # creating the RunfolderFile for a non-existing file should raise an exception + try: + report_files = [] + for report_name in report_names: + report_file = os.path.join( + report_path, + f"{report_prefix}_{report_name}" + ) + report_files.append( + RunfolderFile.create_object_from_path( + file_path=report_file, + runfolder_path=runfolder.path, + filesystem_service=self.filesystem_service, + metadata_service=self.metadata_service, + base_path=report_path, + checksums=checksums + ) + ) + except FileNotFoundError: + report_files = [] + # if no exception was raised, return the report files and log a message depending on + # where they were found + else: + if report_path == report_paths[0]: + log.info( + f"MultiQC reports found in Unaligned/{project_name}, overriding " + f"organisation of seqreports" + ) + else: + log.info(f"Organising seqreports for {project_name}") - @staticmethod - def runfolder_project_multiqc_report_files(project_name, runfolder): - """ - Return a list of MultiQC report files for a project found under the seqreports folder. + return report_files - :param project_name: the name of the project - :param runfolder: a Runfolder instance representing the runfolder containing the project - :return: a list of paths to MultiQC-report-related files - """ - report_dir = os.path.join( - runfolder.path, - "seqreports", - "projects", - project_name + # raise an exception if no report files at all were found + raise ProjectReportNotFoundException( + f"No project report found for {project_name}" ) - return [ - os.path.join( - report_dir, - f"{runfolder.name}_{project_name}_multiqc_report.html", - ), - os.path.join( - report_dir, - f"{runfolder.name}_{project_name}_multiqc_report_data.zip" - ) - ] def get_project_readme( self, @@ -324,8 +306,7 @@ def get_project_readme( filesystem_service=self.filesystem_service, metadata_service=self.metadata_service, base_path=self.filesystem_service.dirname(readme_file), - checksums=checksums, - file_operation="copy" + checksums=checksums ) ] except FileNotFoundError: diff --git a/delivery/services/organise_service.py b/delivery/services/organise_service.py index 7a2d169..5e128c0 100644 --- a/delivery/services/organise_service.py +++ b/delivery/services/organise_service.py @@ -81,15 +81,14 @@ def check_previously_organised_project(self, project, organised_projects_path, f def organise_project(self, runfolder, project, organised_projects_path, lanes): """ - Organise a project on a runfolder into its own directory and into a standard structure. If the project has - already been organised, a ProjectAlreadyOrganisedException will be raised, unless force is True. If force is - True, the existing project path will be renamed with a unique suffix. + Organise a project on a runfolder into its own directory and into a standard structure. If + the project has already been organised, a ProjectAlreadyOrganisedException will be raised. - :param runfolder: a Runfolder instance representing the runfolder on which the project belongs + :param runfolder: a Runfolder instance representing the runfolder on which the project + belongs :param project: a Project instance representing the project to be organised - :param lanes: if not None, only samples on any of the specified lanes will be organised - :param force: if True, a previously organised project will be renamed with a unique suffix - :raises ProjectAlreadyOrganisedException: if project has already been organised and force is False + :param lanes: if not None, only samples on any of the specified lanes will be organised fix + :raises ProjectAlreadyOrganisedException: if project has already been organised :return: a Project instance representing the project after organisation """ # symlink the samples @@ -141,23 +140,18 @@ def organise_project_file(self, project_file, organised_project_path): project_file.base_path ) - # the full path to the symlink - link_name = os.path.join( + # the full path to the organised file + destination = os.path.join( organised_project_path, relpath ) - if project_file.file_operation == "copy": - self.file_system_service.copy(project_file.file_path, link_name) - else: - # the relative path from the symlink to the original file - link_path = self.file_system_service.relpath( - project_file.file_path, - self.file_system_service.dirname(link_name) - ) - self.file_system_service.symlink(link_path, link_name) + # copy the source file to the organised file destination + self.file_system_service.copy(project_file.file_path, destination) + + # return a new RunFolder file object representing the organised file at its new location return RunfolderFile( - link_name, + destination, base_path=organised_project_path, file_checksum=project_file.checksum ) @@ -181,7 +175,13 @@ def organise_sample(self, sample, organised_project_path, lanes): # symlink the sample files using relative paths organised_sample_files = [] for sample_file in sample.sample_files: - organised_sample_files.append(self.organise_sample_file(sample_file, organised_sample_path, lanes)) + organised_sample_files.append( + self.organise_sample_file( + sample_file, + organised_sample_path, + lanes + ) + ) # clean up the list of sample files by removing None elements organised_sample_files = list(filter(None, organised_sample_files)) @@ -194,13 +194,16 @@ def organise_sample(self, sample, organised_project_path, lanes): def organise_sample_file(self, sample_file, organised_sample_path, lanes): """ - Organise a sample file by creating a relative symlink in the supplied directory, pointing back to the supplied - SampleFile's original file path. The Sample file can be excluded from organisation based on the lane it was - derived from. The supplied directory will be created if it doesn't exist. + Organise a sample file by creating a relative symlink in the supplied directory, + pointing back to the supplied SampleFile's original file path. The Sample file can be + excluded from organisation based on the lane it was derived from. The supplied directory + will be created if it doesn't exist. :param sample_file: a SampleFile instance representing the sample file to be organised - :param organised_sample_path: the path to the organised sample directory under which to place the symlink - :param lanes: if not None, only sample files derived from any of the specified lanes will be organised + :param organised_sample_path: the path to the organised sample directory under which to + place the symlink + :param lanes: if not None, only sample files derived from any of the specified lanes will + be organised :return: a new SampleFile instance representing the sample file after organisation """ # skip if the sample file data is derived from a lane that shouldn't be included @@ -208,18 +211,23 @@ def organise_sample_file(self, sample_file, organised_sample_path, lanes): return None # create the symlink in the supplied directory and relative to the file's original location - link_name = os.path.join(organised_sample_path, sample_file.file_name) - if sample_file.file_operation == "copy": - self.file_system_service.copy(sample_file.file_path, link_name) - else: - relative_path = self.file_system_service.relpath( - sample_file.file_path, - organised_sample_path - ) - self.file_system_service.symlink(relative_path, link_name) + destination = os.path.join( + organised_sample_path, + sample_file.file_name + ) + + # get the relative path between the source file and the organised directory + relative_path = self.file_system_service.relpath( + sample_file.file_path, + organised_sample_path + ) + + # create the symlink using the relative path + self.file_system_service.symlink(relative_path, destination) + # return a new SampleFile instance representing the sample file after organisation return SampleFile( - sample_path=link_name, + sample_path=destination, sample_name=sample_file.sample_name, sample_index=sample_file.sample_index, lane_no=sample_file.lane_no, diff --git a/tests/integration_tests/test_integration.py b/tests/integration_tests/test_integration.py index 71d54da..cbf68b4 100644 --- a/tests/integration_tests/test_integration.py +++ b/tests/integration_tests/test_integration.py @@ -93,10 +93,9 @@ def _verify_checksum(file_path, expected_checksum): project_file_base = os.path.dirname(project.project_files[0].file_path) relative_path = os.path.relpath(project_file.file_path, project_file_base) organised_project_file_path = os.path.join(organised_path, relative_path) - self.assertTrue( - os.path.samefile( - organised_project_file_path, - project_file.file_path)) + self.assertEqual( + os.path.basename(organised_project_file_path), + project_file.file_name) _verify_checksum(os.path.join(runfolder.name, relative_path), project_file.checksum) for sample in project.samples: sample_path = os.path.join(organised_path, sample.sample_id) diff --git a/tests/unit_tests/services/test_organise_service.py b/tests/unit_tests/services/test_organise_service.py index 5bfe57a..8f0c8c1 100644 --- a/tests/unit_tests/services/test_organise_service.py +++ b/tests/unit_tests/services/test_organise_service.py @@ -214,10 +214,10 @@ def test_organise_project_file(self): os.path.relpath(project_file.file_path, project_file_base)), organised_project_file.file_path) self.assertEqual(project_file.checksum, organised_project_file.checksum) - self.file_system_service.symlink.assert_has_calls([ + self.file_system_service.copy.assert_has_calls([ mock.call( - os.path.join("..", "..", "foo", "a-report-file"), + os.path.join(project_files[0].file_path), os.path.join(organised_project_path, "a-report-file")), mock.call( - os.path.join("..", "..", "..", "foo", "report-dir", "another-report-file"), + os.path.join(project_files[1].file_path), os.path.join(organised_project_path, "report-dir", "another-report-file"))])