From 7acfe1a2e87d4fe85f400263b576f0ac1131ca4f Mon Sep 17 00:00:00 2001 From: Pontus Larsson Date: Thu, 8 Feb 2024 09:42:40 +0100 Subject: [PATCH] organize project metadata files (#54) * organize project metadata files * ensure text encoding of file --- delivery/repositories/project_repository.py | 68 ++++++++++++++++--- delivery/repositories/runfolder_repository.py | 6 +- delivery/services/file_system_service.py | 8 +++ delivery/services/organise_service.py | 22 ++++-- tests/integration_tests/test_integration.py | 3 +- tests/resources/runfolders/readme/README.md | 1 + tests/test_utils.py | 57 +++++++++++++++- .../repositories/test_project_repository.py | 6 +- 8 files changed, 149 insertions(+), 22 deletions(-) create mode 120000 tests/resources/runfolders/readme/README.md diff --git a/delivery/repositories/project_repository.py b/delivery/repositories/project_repository.py index 1ede6d3..01118c0 100644 --- a/delivery/repositories/project_repository.py +++ b/delivery/repositories/project_repository.py @@ -140,18 +140,19 @@ def project_from_dir(d): project_name = os.path.basename(d) project_files = [] + # gather report files for the project from the runfolder try: project_files.extend( self.get_report_files( project_path, project_name, runfolder, - checksums=runfolder.checksums ) ) except ProjectReportNotFoundException as ex: log.warning(ex) + # gather the README to include with the project try: project_files.extend( self.get_project_readme( @@ -163,6 +164,17 @@ def project_from_dir(d): except ProjectReportNotFoundException as ex: log.warning(ex) + # gather metadata files for the project from the runfolder + try: + project_files.extend( + self.get_metadata_files( + project_name=project_name, + runfolder=runfolder + ) + ) + except ProjectReportNotFoundException as ex: + log.warning(ex) + samples = self.sample_repository.get_samples( project_path, project_name, @@ -198,7 +210,12 @@ def project_from_dir(d): f"Did not find {self.PROJECTS_DIR} folder for: 
{runfolder.name}" ) - def get_report_files(self, project_path, project_name, runfolder, checksums=None): + def get_report_files( + self, + project_path, + project_name, + runfolder + ): """ Gets the paths to files associated with the supplied project's MultiQC report. This report is fetched from seqreports unless there is a MultiQC report directly under the project's @@ -208,8 +225,6 @@ def get_report_files(self, project_path, project_name, runfolder, checksums=None :param project_path: the path to the project folder :param project_name: the name of the project :param runfolder: a Runfolder instance representing the runfolder containing the project - :param checksums: a dict with pre-calculated checksums for files. paths are keys and the - corresponding checksum is the value :return: a list of RunfolderFile objects representing project report files :raises ProjectReportNotFoundException: if no MultiQC report was found for the project """ @@ -249,7 +264,7 @@ def get_report_files(self, project_path, project_name, runfolder, checksums=None filesystem_service=self.filesystem_service, metadata_service=self.metadata_service, base_path=report_path, - checksums=checksums + checksums=runfolder.checksums ) ) except FileNotFoundError: @@ -276,7 +291,6 @@ def get_project_readme( self, project_name, runfolder, - checksums=None, with_undetermined=False ): """ @@ -284,11 +298,9 @@ def get_project_readme( :param project_name: the name of the project :param runfolder: a Runfolder instance representing the runfolder containing the project - :param checksums: a dict with pre-calculated checksums for files. 
paths are keys and the - corresponding checksum is the value :param with_undetermined: if True, the README should refer to data that includes undetermined reads - :return: the path to the README file wrapped in a list + :return: a list containing a RunfolderFile object representing the README :raises ProjectReportNotFoundException: if the README was not found """ log.info(f"Organising README for {project_name}") @@ -306,7 +318,7 @@ def get_project_readme( filesystem_service=self.filesystem_service, metadata_service=self.metadata_service, base_path=self.filesystem_service.dirname(readme_file), - checksums=checksums + checksums=runfolder.checksums ) ] except FileNotFoundError: @@ -315,6 +327,42 @@ def get_project_readme( f"{project_name}" ) + def get_metadata_files( + self, + project_name, + runfolder + ): + """ + Gather the metadata files to be included with the project on delivery + + :param project_name: the name of the project + :param runfolder: a Runfolder instance representing the runfolder containing the project + :return: a list of RunfolderFile objects representing the gathered metadata files + :raises ProjectReportNotFoundException: if no metadata files were found for the project + """ + log.info(f"Fetching metadata files for {project_name}") + metadata_files = [ + RunfolderFile.create_object_from_path( + file_path=metafile, + runfolder_path=runfolder.path, + filesystem_service=self.filesystem_service, + metadata_service=self.metadata_service, + base_path=runfolder.path, + checksums=runfolder.checksums + ) + for metafile in self.filesystem_service.list_files_recursively( + os.path.join( + runfolder.path, + "metadata" + ) + ) if os.path.basename(metafile).startswith(project_name) + ] + if not metadata_files: + raise ProjectReportNotFoundException( + f"metadata files could not be found for {project_name}" + ) + return metadata_files + def is_sample_in_project(self, project, sample_project, sample_id, sample_lane): """ Checks if a matching sample is present in the project. 
diff --git a/delivery/repositories/runfolder_repository.py b/delivery/repositories/runfolder_repository.py index 98d30f2..a89616d 100644 --- a/delivery/repositories/runfolder_repository.py +++ b/delivery/repositories/runfolder_repository.py @@ -249,4 +249,8 @@ def get_project_report_files(self, runfolder, project): :param project: an instance of Project :return: a tuple with the path to the directory containing the report and a list of paths to the report files """ - return self.project_repository.get_report_files(project, checksums=runfolder.checksums) + return self.project_repository.get_report_files( + project_path=project.path, + project_name=project.name, + runfolder=runfolder + ) diff --git a/delivery/services/file_system_service.py b/delivery/services/file_system_service.py index de2470c..d94822a 100644 --- a/delivery/services/file_system_service.py +++ b/delivery/services/file_system_service.py @@ -100,6 +100,14 @@ def copy(source, dest): :param dest: :return: None """ + try: + FileSystemService.makedirs( + FileSystemService.dirname( + dest + ) + ) + except FileExistsError: + pass try: return shutil.copyfile(source, dest) except IsADirectoryError: diff --git a/delivery/services/organise_service.py b/delivery/services/organise_service.py index 5e128c0..49f927b 100644 --- a/delivery/services/organise_service.py +++ b/delivery/services/organise_service.py @@ -55,13 +55,21 @@ def organise_runfolder(self, runfolder_id, lanes, projects, force): # organise the projects and return a new Runfolder instance organised_projects = [] for project in projects_on_runfolder: - organised_projects.append(self.organise_project(runfolder, project, organised_projects_path, lanes)) + organised_projects.append( + self.organise_project( + runfolder, + project, + organised_projects_path, + lanes + ) + ) return Runfolder( runfolder.name, runfolder.path, projects=organised_projects, - checksums=runfolder.checksums) + checksums=runfolder.checksums + ) def 
check_previously_organised_project(self, project, organised_projects_path, force): organised_project_path = os.path.join(organised_projects_path, project.name) @@ -79,7 +87,13 @@ def check_previously_organised_project(self, project, organised_projects_path, f self.file_system_service.mkdir(organised_projects_backup_path) self.file_system_service.rename(organised_project_path, backup_path) - def organise_project(self, runfolder, project, organised_projects_path, lanes): + def organise_project( + self, + runfolder, + project, + organised_projects_path, + lanes + ): """ Organise a project on a runfolder into its own directory and into a standard structure. If the project has already been organised, a ProjectAlreadyOrganisedException will be raised. @@ -127,7 +141,7 @@ def organise_project(self, runfolder, project, organised_projects_path, lanes): def organise_project_file(self, project_file, organised_project_path): """ - Find and symlink or copy the project-associated files to the organised project directory. + Find and copy the project-associated files to the organised project directory. 
:param project_file: a RunfolderFile instance representing the project-associated file before organisation diff --git a/tests/integration_tests/test_integration.py b/tests/integration_tests/test_integration.py index cbf68b4..a5ec4d9 100644 --- a/tests/integration_tests/test_integration.py +++ b/tests/integration_tests/test_integration.py @@ -90,8 +90,7 @@ def _verify_checksum(file_path, expected_checksum): MetadataService.hash_file(samplesheet_file)) for project_file in project.project_files: - project_file_base = os.path.dirname(project.project_files[0].file_path) - relative_path = os.path.relpath(project_file.file_path, project_file_base) + relative_path = os.path.relpath(project_file.file_path, project_file.base_path) organised_project_file_path = os.path.join(organised_path, relative_path) self.assertEqual( os.path.basename(organised_project_file_path), diff --git a/tests/resources/runfolders/readme/README.md b/tests/resources/runfolders/readme/README.md new file mode 120000 index 0000000..d8c3d7e --- /dev/null +++ b/tests/resources/runfolders/readme/README.md @@ -0,0 +1 @@ +../../readme/README.md \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py index e508516..63168fd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -20,8 +20,10 @@ def __init__(self): def spawn_callback(self, f, **args): f(**args) + class TestUtils: DUMMY_CONFIG = {"monitored_directory": "/foo"} + README_DIRECTORY = "/bar" class DummyConfig: @@ -29,6 +31,7 @@ class DummyConfig: def __getitem__(self, key): return TestUtils.DUMMY_CONFIG[key] + fake_directories = ["160930_ST-E00216_0111_BH37CWALXX", "160930_ST-E00216_0112_BH37CWALXX"] fake_projects = ["ABC_123", "DEF_456", "GHI_789"] @@ -120,7 +123,14 @@ def runfolder_project( runfolder_path=runfolder.path, runfolder_name=runfolder.name ) - project.project_files = project_report_files(project, next(report_type)) + project_files = project_report_files(project, next(report_type)) + project_files.append( 
+ project_readme_file() + ) + project_files.extend( + project_metadata_files(project) + ) + project.project_files = project_files sample_names = sample_name_generator() # a straight-forward sample with files on one lane @@ -272,6 +282,50 @@ def project_report_files(project, report_type): ] +def project_readme_file(): + readme_file = os.path.join( + "tests", + "resources", + "readme", + "README.md" + ) + return RunfolderFile( + file_path=readme_file, + base_path=os.path.dirname(readme_file), + file_checksum=f"checksum-for-{readme_file}" + ) + + +def project_metadata_files(project, file_types=None): + file_suffixes = [ + "run", + "experiment" + ] + file_types = file_types or [ + "xml", + "json" + ] + metadata_path = os.path.join( + project.runfolder_path, + "metadata" + ) + metadata_files = [] + for file_type in file_types: + for file_suffix in file_suffixes: + metadata_file = os.path.join( + metadata_path, + f"{project.name}-{file_suffix}.{file_type}" + ) + metadata_files.append( + RunfolderFile( + file_path=metadata_file, + base_path=project.runfolder_path, + file_checksum=f"checksum-for-{metadata_file}" + ) + ) + return metadata_files + + _runfolder1 = Runfolder(name="160930_ST-E00216_0111_BH37CWALXX", path="/foo/160930_ST-E00216_0111_BH37CWALXX") @@ -306,7 +360,6 @@ def project_report_files(project, report_type): FAKE_RUNFOLDERS = [_runfolder1, _runfolder2] UNORGANISED_RUNFOLDER = unorganised_runfolder() -README_DIRECTORY = "/bar" def assert_eventually_equals(self, timeout, f, expected, delay=0.1): diff --git a/tests/unit_tests/repositories/test_project_repository.py b/tests/unit_tests/repositories/test_project_repository.py index 8539879..aee7c49 100644 --- a/tests/unit_tests/repositories/test_project_repository.py +++ b/tests/unit_tests/repositories/test_project_repository.py @@ -13,7 +13,7 @@ from delivery.services.file_system_service import FileSystemService from delivery.services.metadata_service import MetadataService -from tests.test_utils import 
README_DIRECTORY, UNORGANISED_RUNFOLDER +from tests.test_utils import UNORGANISED_RUNFOLDER class TestGeneralProjectRepository(unittest.TestCase): @@ -41,12 +41,12 @@ def setUp(self) -> None: self.sample_repository = mock.create_autospec(RunfolderProjectBasedSampleRepository) self.filesystem_service = mock.create_autospec(FileSystemService) self.metadata_service = mock.create_autospec(MetadataService) + self.runfolder = UNORGANISED_RUNFOLDER self.project_repository = UnorganisedRunfolderProjectRepository( sample_repository=self.sample_repository, - readme_directory=README_DIRECTORY, + readme_directory=self.runfolder.path, filesystem_service=self.filesystem_service, metadata_service=self.metadata_service) - self.runfolder = UNORGANISED_RUNFOLDER def test_get_report_files(self):