From 06e5c02d4416c0ddab97c1b73a276e60872a25fa Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Mon, 30 Mar 2020 19:14:16 -0500 Subject: [PATCH 01/19] moved csv_reader to a new common.py file to decrease dup code --- iridauploader/parsers/common.py | 35 +++++++++++++++++ .../parsers/directory/sample_parser.py | 36 +----------------- iridauploader/parsers/directory/validation.py | 4 +- .../parsers/miniseq/sample_parser.py | 38 ++----------------- iridauploader/parsers/miniseq/validation.py | 4 +- iridauploader/parsers/miseq/sample_parser.py | 38 ++----------------- iridauploader/parsers/miseq/validation.py | 4 +- .../parsers/nextseq/sample_parser.py | 37 ++---------------- iridauploader/parsers/nextseq/validation.py | 4 +- .../parsers/directory/test_sample_parser.py | 4 +- .../parsers/directory/test_validation.py | 6 +-- .../parsers/miniseq/test_sample_parser.py | 9 +++-- .../tests/parsers/miniseq/test_validation.py | 8 ++-- .../tests/parsers/miseq/test_sample_parser.py | 9 +++-- .../tests/parsers/miseq/test_validation.py | 8 ++-- .../parsers/nextseq/test_sample_parser.py | 9 +++-- .../tests/parsers/nextseq/test_validation.py | 8 ++-- 17 files changed, 86 insertions(+), 175 deletions(-) create mode 100644 iridauploader/parsers/common.py diff --git a/iridauploader/parsers/common.py b/iridauploader/parsers/common.py new file mode 100644 index 00000000..78be4620 --- /dev/null +++ b/iridauploader/parsers/common.py @@ -0,0 +1,35 @@ +from os import path +from csv import reader + +from iridauploader.parsers import exceptions + + +def get_csv_reader(sample_sheet_file): + + """ + tries to create a csv.reader object which will be used to + parse through the lines in SampleSheet.csv + raises an error if: + sample_sheet_file is not an existing file + sample_sheet_file contains null byte(s) + + arguments: + data_dir -- the directory that has SampleSheet.csv in it + + returns a csv.reader object + """ + + if path.isfile(sample_sheet_file): + csv_file = open(sample_sheet_file, "r") + # strip any trailing newline characters from the end of the line + # including Windows newline characters (\r\n) + csv_lines = [x.rstrip('\n') for x in csv_file] + csv_lines = [x.rstrip('\r') for x in csv_lines] + + # open and read file in binary then send it to be parsed by csv's reader + csv_reader = reader(csv_lines) + else: + raise exceptions.SampleSheetError("Sample sheet cannot be parsed as a CSV file because it's not a regular file.", + sample_sheet_file) + + return csv_reader diff --git a/iridauploader/parsers/directory/sample_parser.py b/iridauploader/parsers/directory/sample_parser.py index 09d74deb..62639fb5 100644 --- a/iridauploader/parsers/directory/sample_parser.py +++ b/iridauploader/parsers/directory/sample_parser.py @@ -1,11 +1,11 @@ from os import path, walk -from csv import reader from collections import OrderedDict from copy import deepcopy import logging import iridauploader.model as model from iridauploader.parsers import exceptions +from iridauploader.parsers import common def build_sequencing_run_from_samples(sample_sheet_file): @@ -143,7 +143,7 @@ def _parse_samples(sample_sheet_file): logging.info("Reading data from sample sheet {}".format(sample_sheet_file)) - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) # start with an ordered dictionary so that keys are ordered in the same # way that they are inserted. 
sample_dict = OrderedDict() @@ -209,35 +209,3 @@ def _parse_samples(sample_sheet_file): sample_dict_list.append(deepcopy(sample_dict)) return sample_dict_list - - -def get_csv_reader(sample_sheet_file): - - """ - tries to create a csv.reader object which will be used to - parse through the lines in SampleSheet.csv - raises an error if: - sample_sheet_file is not an existing file - sample_sheet_file contains null byte(s) - - arguments: - data_dir -- the directory that has SampleSheet.csv in it - - returns a csv.reader object - """ - - if path.isfile(sample_sheet_file): - csv_file = open(sample_sheet_file, "r") - # strip any trailing newline characters from the end of the line - # including Windows newline characters (\r\n) - csv_lines = [x.rstrip('\n') for x in csv_file] - csv_lines = [x.rstrip('\r') for x in csv_lines] - - # open and read file in binary then send it to be parsed by csv's reader - csv_reader = reader(csv_lines) - else: - raise exceptions.SampleSheetError( - "Sample sheet cannot be parsed as a CSV file because it's not a regular file.", - sample_sheet_file) - - return csv_reader diff --git a/iridauploader/parsers/directory/validation.py b/iridauploader/parsers/directory/validation.py index 1f2e17fe..3897e38b 100644 --- a/iridauploader/parsers/directory/validation.py +++ b/iridauploader/parsers/directory/validation.py @@ -1,5 +1,5 @@ from iridauploader.parsers import exceptions -from iridauploader.parsers.directory.sample_parser import get_csv_reader +from iridauploader.parsers import common import iridauploader.model as model @@ -17,7 +17,7 @@ def validate_sample_sheet(sample_sheet_file): returns ValidationResult object - stores list of string error messages """ - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) v_res = model.ValidationResult() diff --git a/iridauploader/parsers/miniseq/sample_parser.py b/iridauploader/parsers/miniseq/sample_parser.py index 87e3030d..d447bdb7 100644 --- a/iridauploader/parsers/miniseq/sample_parser.py +++ b/iridauploader/parsers/miniseq/sample_parser.py @@ -1,12 +1,12 @@ import re from os import path, walk -from csv import reader from collections import OrderedDict from copy import deepcopy import logging import iridauploader.model as model from iridauploader.parsers import exceptions +from iridauploader.parsers import common def parse_metadata(sample_sheet_file): @@ -25,7 +25,7 @@ def parse_metadata(sample_sheet_file): metadata_dict = {"readLengths": []} - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) metadata_key_translation_dict = { 'Local Run Manager Analysis Id': 'localrunmanager', @@ -231,7 +231,7 @@ def _parse_samples(sample_sheet_file): logging.info("Reading data from sample sheet {}".format(sample_sheet_file)) - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) # start with an ordered dictionary so that keys are ordered in the same # way that they are inserted. 
sample_dict = OrderedDict() @@ -336,35 +336,3 @@ def _parse_out_sequence_file(sample): sequence_file_dict[key] = sample_dict[key] return sequence_file_dict - - -def get_csv_reader(sample_sheet_file): - - """ - tries to create a csv.reader object which will be used to - parse through the lines in SampleSheet.csv - raises an error if: - sample_sheet_file is not an existing file - sample_sheet_file contains null byte(s) - - arguments: - data_dir -- the directory that has SampleSheet.csv in it - - returns a csv.reader object - """ - - if path.isfile(sample_sheet_file): - csv_file = open(sample_sheet_file, "r") - # strip any trailing newline characters from the end of the line - # including Windows newline characters (\r\n) - csv_lines = [x.rstrip('\n') for x in csv_file] - csv_lines = [x.rstrip('\r') for x in csv_lines] - - # open and read file in binary then send it to be parsed by csv's reader - csv_reader = reader(csv_lines) - else: - raise exceptions.SampleSheetError( - "Sample sheet cannot be parsed as a CSV file because it's not a regular file.", - sample_sheet_file) - - return csv_reader diff --git a/iridauploader/parsers/miniseq/validation.py b/iridauploader/parsers/miniseq/validation.py index 49cd0c1a..c2b36149 100644 --- a/iridauploader/parsers/miniseq/validation.py +++ b/iridauploader/parsers/miniseq/validation.py @@ -1,5 +1,5 @@ from iridauploader.parsers import exceptions -from iridauploader.parsers.miniseq.sample_parser import get_csv_reader +from iridauploader.parsers import common import iridauploader.model as model @@ -17,7 +17,7 @@ def validate_sample_sheet(sample_sheet_file): returns ValidationResult object - stores list of string error messages """ - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) v_res = model.ValidationResult() diff --git a/iridauploader/parsers/miseq/sample_parser.py b/iridauploader/parsers/miseq/sample_parser.py index c19e4748..37795f79 100644 --- a/iridauploader/parsers/miseq/sample_parser.py +++ b/iridauploader/parsers/miseq/sample_parser.py @@ -1,12 +1,12 @@ import re from os import path, walk -from csv import reader from collections import OrderedDict from copy import deepcopy import logging import iridauploader.model as model from iridauploader.parsers import exceptions +from iridauploader.parsers import common def parse_metadata(sample_sheet_file): @@ -25,7 +25,7 @@ def parse_metadata(sample_sheet_file): metadata_dict = {"readLengths": []} - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) metadata_key_translation_dict = { 'Assay': 'assay', @@ -226,7 +226,7 @@ def _parse_samples(sample_sheet_file): logging.info("Reading data from sample sheet {}".format(sample_sheet_file)) - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) # start with an ordered dictionary so that keys are ordered in the same # way that they are inserted. 
sample_dict = OrderedDict() @@ -326,35 +326,3 @@ def _parse_out_sequence_file(sample): sequence_file_dict[key] = sample_dict[key] return sequence_file_dict - - -def get_csv_reader(sample_sheet_file): - - """ - tries to create a csv.reader object which will be used to - parse through the lines in SampleSheet.csv - raises an error if: - sample_sheet_file is not an existing file - sample_sheet_file contains null byte(s) - - arguments: - data_dir -- the directory that has SampleSheet.csv in it - - returns a csv.reader object - """ - - if path.isfile(sample_sheet_file): - csv_file = open(sample_sheet_file, "r") - # strip any trailing newline characters from the end of the line - # including Windows newline characters (\r\n) - csv_lines = [x.rstrip('\n') for x in csv_file] - csv_lines = [x.rstrip('\r') for x in csv_lines] - - # open and read file in binary then send it to be parsed by csv's reader - csv_reader = reader(csv_lines) - else: - raise exceptions.SampleSheetError( - "Sample sheet cannot be parsed as a CSV file because it's not a regular file.", - sample_sheet_file) - - return csv_reader diff --git a/iridauploader/parsers/miseq/validation.py b/iridauploader/parsers/miseq/validation.py index 00e5fe01..57a04d2f 100644 --- a/iridauploader/parsers/miseq/validation.py +++ b/iridauploader/parsers/miseq/validation.py @@ -1,5 +1,5 @@ from iridauploader.parsers import exceptions -from iridauploader.parsers.miseq.sample_parser import get_csv_reader +from iridauploader.parsers import common import iridauploader.model as model @@ -17,7 +17,7 @@ def validate_sample_sheet(sample_sheet_file): returns ValidationResult object - stores list of string error messages """ - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) v_res = model.ValidationResult() diff --git a/iridauploader/parsers/nextseq/sample_parser.py b/iridauploader/parsers/nextseq/sample_parser.py index 40dcbba0..eb5f5a15 100644 --- a/iridauploader/parsers/nextseq/sample_parser.py +++ b/iridauploader/parsers/nextseq/sample_parser.py @@ -1,12 +1,12 @@ import re from os import path, walk -from csv import reader from collections import OrderedDict from copy import deepcopy import logging import iridauploader.model as model from iridauploader.parsers import exceptions +from iridauploader.parsers import common def parse_metadata(sample_sheet_file): @@ -25,7 +25,7 @@ def parse_metadata(sample_sheet_file): metadata_dict = {"readLengths": []} - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) metadata_key_translation_dict = { 'Assay': 'assay', @@ -224,7 +224,7 @@ def _parse_samples(sample_sheet_file): logging.info("Reading data from sample sheet {}".format(sample_sheet_file)) - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) # start with an ordered dictionary so that keys are ordered in the same # way that they are inserted. 
sample_dict = OrderedDict() @@ -324,34 +324,3 @@ def _parse_out_sequence_file(sample): sequence_file_dict[key] = sample_dict[key] return sequence_file_dict - - -def get_csv_reader(sample_sheet_file): - - """ - tries to create a csv.reader object which will be used to - parse through the lines in SampleSheet.csv - raises an error if: - sample_sheet_file is not an existing file - sample_sheet_file contains null byte(s) - - arguments: - data_dir -- the directory that has SampleSheet.csv in it - - returns a csv.reader object - """ - - if path.isfile(sample_sheet_file): - csv_file = open(sample_sheet_file, "r") - # strip any trailing newline characters from the end of the line - # including Windows newline characters (\r\n) - csv_lines = [x.rstrip('\n') for x in csv_file] - csv_lines = [x.rstrip('\r') for x in csv_lines] - - # open and read file in binary then send it to be parsed by csv's reader - csv_reader = reader(csv_lines) - else: - raise exceptions.SampleSheetError("Sample sheet cannot be parsed as a CSV file because it's not a regular file.", - sample_sheet_file) - - return csv_reader diff --git a/iridauploader/parsers/nextseq/validation.py b/iridauploader/parsers/nextseq/validation.py index 6cf248a0..57a04d2f 100644 --- a/iridauploader/parsers/nextseq/validation.py +++ b/iridauploader/parsers/nextseq/validation.py @@ -1,5 +1,5 @@ from iridauploader.parsers import exceptions -from iridauploader.parsers.nextseq.sample_parser import get_csv_reader +from iridauploader.parsers import common import iridauploader.model as model @@ -17,7 +17,7 @@ def validate_sample_sheet(sample_sheet_file): returns ValidationResult object - stores list of string error messages """ - csv_reader = get_csv_reader(sample_sheet_file) + csv_reader = common.get_csv_reader(sample_sheet_file) v_res = model.ValidationResult() diff --git a/iridauploader/tests/parsers/directory/test_sample_parser.py b/iridauploader/tests/parsers/directory/test_sample_parser.py index f14fbb7c..d90196b5 100644 --- a/iridauploader/tests/parsers/directory/test_sample_parser.py +++ b/iridauploader/tests/parsers/directory/test_sample_parser.py @@ -123,7 +123,7 @@ def test_get_csv_reader_valid_sheet(self): sheet_file = path.join(path_to_module, "fake_dir_data", "test_csv_reader.csv") - lines = sample_parser.get_csv_reader(sheet_file) + lines = parsers.common.get_csv_reader(sheet_file) # This is a sample of what the miseq sample sheet looks like, but it also makes a good # example for what we want our csv reader to be able to parse. 
correct_lines = [ @@ -165,7 +165,7 @@ def test_get_csv_reader_no_sheet(self): sheet_file = path.join(path_to_module, "fake_dir_data") with self.assertRaises(SampleSheetError): - sample_parser.get_csv_reader(sheet_file) + parsers.common.get_csv_reader(sheet_file) class TestParseSampleList(unittest.TestCase): diff --git a/iridauploader/tests/parsers/directory/test_validation.py b/iridauploader/tests/parsers/directory/test_validation.py index 8295edde..53c74bc2 100644 --- a/iridauploader/tests/parsers/directory/test_validation.py +++ b/iridauploader/tests/parsers/directory/test_validation.py @@ -15,7 +15,7 @@ class TestValidation(unittest.TestCase): def setUp(self): print("\nStarting " + self.__module__ + ": " + self._testMethodName) - @patch("iridauploader.parsers.directory.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_no_data_header(self, mock_csv_reader): """ Given a sample sheet with no header, make sure the correct errors are included in the response @@ -48,7 +48,7 @@ def test_validate_sample_sheet_no_data_header(self, mock_csv_reader): self.assertEqual(type(res.error_list[0]), SampleSheetError) self.assertEqual(type(res.error_list[1]), SampleSheetError) - @patch("iridauploader.parsers.directory.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_no_data(self, mock_csv_reader): """ Given a sample sheet with no data, make sure the correct errors are included in the response @@ -73,7 +73,7 @@ def test_validate_sample_sheet_no_data(self, mock_csv_reader): # Error type should be SampleSheetError self.assertEqual(type(res.error_list[0]), SampleSheetError) - @patch("iridauploader.parsers.directory.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_valid(self, mock_csv_reader): """ Given a valid sample sheet, make sure the response shows as valid diff --git a/iridauploader/tests/parsers/miniseq/test_sample_parser.py b/iridauploader/tests/parsers/miniseq/test_sample_parser.py index ac8f2b37..6b8237c8 100644 --- a/iridauploader/tests/parsers/miniseq/test_sample_parser.py +++ b/iridauploader/tests/parsers/miniseq/test_sample_parser.py @@ -6,6 +6,7 @@ import iridauploader.parsers.miniseq.sample_parser as sample_parser from iridauploader.parsers.exceptions import SampleSheetError, SequenceFileError +from iridauploader.parsers import common import iridauploader.model as model path_to_module = path.abspath(path.dirname(__file__)) @@ -21,7 +22,7 @@ class TestParseMetadata(unittest.TestCase): def setUp(self): print("\nStarting " + self.__module__ + ": " + self._testMethodName) - @patch("iridauploader.parsers.miniseq.sample_parser.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_parse_metadata_paired_valid(self, mock_csv_reader): """ When given a valid directory, ensure valid metadata is built @@ -81,7 +82,7 @@ def test_parse_metadata_paired_valid(self, mock_csv_reader): self.assertEqual(metadata['description'], "12-34") self.assertEqual(metadata['chemistry'], "Yes") - @patch("iridauploader.parsers.miniseq.sample_parser.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_parse_metadata_single_valid(self, mock_csv_reader): """ When given a valid directory, ensure valid metadata is built @@ -222,7 +223,7 @@ def test_get_csv_reader_valid_sheet(self): sheet_file = path.join(path_to_module, "fake_ngs_data", "SampleSheet.csv") - lines = 
sample_parser.get_csv_reader(sheet_file) + lines = common.get_csv_reader(sheet_file) correct_lines = [ ['[Header]'], @@ -260,7 +261,7 @@ def test_get_csv_reader_no_sheet(self): "Alignment_1") with self.assertRaises(SampleSheetError): - sample_parser.get_csv_reader(sheet_file) + common.get_csv_reader(sheet_file) class TestValidatePfList(unittest.TestCase): diff --git a/iridauploader/tests/parsers/miniseq/test_validation.py b/iridauploader/tests/parsers/miniseq/test_validation.py index 6cd6d737..871fc0cb 100644 --- a/iridauploader/tests/parsers/miniseq/test_validation.py +++ b/iridauploader/tests/parsers/miniseq/test_validation.py @@ -15,7 +15,7 @@ class TestValidation(unittest.TestCase): def setUp(self): print("\nStarting " + self.__module__ + ": " + self._testMethodName) - @patch("iridauploader.parsers.miniseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_no_header(self, mock_csv_reader): """ Given a sample sheet with no header, make sure the correct errors are included in the response @@ -60,7 +60,7 @@ def test_validate_sample_sheet_no_header(self, mock_csv_reader): # Error type should be SampleSheetError self.assertEqual(type(res.error_list[0]), SampleSheetError) - @patch("iridauploader.parsers.miniseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_no_data(self, mock_csv_reader): """ Given a sample sheet with no data, make sure the correct errors are included in the response @@ -108,7 +108,7 @@ def test_validate_sample_sheet_no_data(self, mock_csv_reader): self.assertEqual(type(res.error_list[0]), SampleSheetError) self.assertEqual(type(res.error_list[1]), SampleSheetError) - @patch("iridauploader.parsers.miniseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_missing_data_header(self, mock_csv_reader): """ Given a sample sheet with no data header, make sure the correct errors are included in the response @@ -164,7 +164,7 @@ def test_validate_sample_sheet_missing_data_header(self, mock_csv_reader): # Error type should be SampleSheetError self.assertEqual(type(res.error_list[0]), SampleSheetError) - @patch("iridauploader.parsers.miniseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_valid(self, mock_csv_reader): """ Given a valid sample sheet, test that everything shows as valid diff --git a/iridauploader/tests/parsers/miseq/test_sample_parser.py b/iridauploader/tests/parsers/miseq/test_sample_parser.py index ae4e7d3f..d1e5e046 100644 --- a/iridauploader/tests/parsers/miseq/test_sample_parser.py +++ b/iridauploader/tests/parsers/miseq/test_sample_parser.py @@ -4,6 +4,7 @@ from csv import reader from io import StringIO +from iridauploader import parsers import iridauploader.parsers.miseq.sample_parser as sample_parser from iridauploader.parsers.exceptions import SampleSheetError, SequenceFileError import iridauploader.model as model @@ -21,7 +22,7 @@ class TestParseMetadata(unittest.TestCase): def setUp(self): print("\nStarting " + self.__module__ + ": " + self._testMethodName) - @patch("iridauploader.parsers.miseq.sample_parser.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_parse_metadata_paired_valid(self, mock_csv_reader): """ When given a valid directory, ensure valid metadata is built @@ -90,7 +91,7 @@ def test_parse_metadata_paired_valid(self, mock_csv_reader): 
self.assertEqual(metadata['description'], "12-34") self.assertEqual(metadata['chemistry'], "Yes") - @patch("iridauploader.parsers.miseq.sample_parser.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_parse_metadata_single_valid(self, mock_csv_reader): """ When given a valid directory, ensure valid metadata is built @@ -248,7 +249,7 @@ def test_get_csv_reader_valid_sheet(self): sheet_file = path.join(path_to_module, "fake_ngs_data", "SampleSheet.csv") - lines = sample_parser.get_csv_reader(sheet_file) + lines = parsers.common.get_csv_reader(sheet_file) correct_lines = [ ['[Header]'], @@ -290,7 +291,7 @@ def test_get_csv_reader_no_sheet(self): "Data") with self.assertRaises(SampleSheetError): - sample_parser.get_csv_reader(sheet_file) + parsers.common.get_csv_reader(sheet_file) class TestValidatePfList(unittest.TestCase): diff --git a/iridauploader/tests/parsers/miseq/test_validation.py b/iridauploader/tests/parsers/miseq/test_validation.py index 403bc310..2fb866e5 100644 --- a/iridauploader/tests/parsers/miseq/test_validation.py +++ b/iridauploader/tests/parsers/miseq/test_validation.py @@ -15,7 +15,7 @@ class TestValidation(unittest.TestCase): def setUp(self): print("\nStarting " + self.__module__ + ": " + self._testMethodName) - @patch("iridauploader.parsers.miseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_no_header(self, mock_csv_reader): """ Given a sample sheet with no header, make sure the correct errors are included in the response @@ -61,7 +61,7 @@ def test_validate_sample_sheet_no_header(self, mock_csv_reader): # Error type should be SampleSheetError self.assertEqual(type(res.error_list[0]), SampleSheetError) - @patch("iridauploader.parsers.miseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_no_data(self, mock_csv_reader): """ Given a sample sheet with no data, make sure the correct errors are included in the response @@ -112,7 +112,7 @@ def test_validate_sample_sheet_no_data(self, mock_csv_reader): self.assertEqual(type(res.error_list[0]), SampleSheetError) self.assertEqual(type(res.error_list[1]), SampleSheetError) - @patch("iridauploader.parsers.miseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_missing_data_header(self, mock_csv_reader): """ Given a sample sheet with no data header, make sure the correct errors are included in the response @@ -172,7 +172,7 @@ def test_validate_sample_sheet_missing_data_header(self, mock_csv_reader): # Error type should be SampleSheetError self.assertEqual(type(res.error_list[0]), SampleSheetError) - @patch("iridauploader.parsers.miseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_valid(self, mock_csv_reader): """ Given a valid sample sheet, test that everything shows as valid diff --git a/iridauploader/tests/parsers/nextseq/test_sample_parser.py b/iridauploader/tests/parsers/nextseq/test_sample_parser.py index 68de6afa..1140f385 100644 --- a/iridauploader/tests/parsers/nextseq/test_sample_parser.py +++ b/iridauploader/tests/parsers/nextseq/test_sample_parser.py @@ -4,6 +4,7 @@ from csv import reader from io import StringIO +from iridauploader import parsers import iridauploader.parsers.nextseq.sample_parser as sample_parser from iridauploader.parsers.exceptions import SampleSheetError, SequenceFileError import iridauploader.model as model 
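[Editor's note] Because get_csv_reader now lives in parsers.common, every test suite in these patches points its mock at that single shared path instead of a per-parser copy. A minimal sketch of the pattern, using made-up sheet rows rather than the real fixtures on disk:

from csv import reader
from io import StringIO
from unittest.mock import patch

from iridauploader.parsers import common

# Made-up sheet contents; the real tests read much fuller fixture files
fake_rows = reader(StringIO("[Header]\nInvestigator Name,Test\n[Data]\n"))

with patch("iridauploader.parsers.common.get_csv_reader", return_value=fake_rows):
    # Any parser code that calls common.get_csv_reader() now receives the fake rows
    assert next(common.get_csv_reader("ignored/SampleSheet.csv")) == ["[Header]"]
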
@@ -21,7 +22,7 @@ class TestParseMetadata(unittest.TestCase): def setUp(self): print("\nStarting " + self.__module__ + ": " + self._testMethodName) - @patch("iridauploader.parsers.nextseq.sample_parser.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_parse_metadata_paired_valid(self, mock_csv_reader): """ When given a valid directory, ensure valid metadata is built @@ -90,7 +91,7 @@ def test_parse_metadata_paired_valid(self, mock_csv_reader): self.assertEqual(metadata['description'], "12-34") self.assertEqual(metadata['chemistry'], "Yes") - @patch("iridauploader.parsers.nextseq.sample_parser.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_parse_metadata_single_valid(self, mock_csv_reader): """ When given a valid directory, ensure valid metadata is built @@ -251,7 +252,7 @@ def test_get_csv_reader_valid_sheet(self): sheet_file = path.join(path_to_module, "fake_nextseq_run", "SampleSheet.csv") - lines = sample_parser.get_csv_reader(sheet_file) + lines = parsers.common.get_csv_reader(sheet_file) correct_lines = [ ['[Header]'], @@ -293,7 +294,7 @@ def test_get_csv_reader_no_sheet(self): "Data") with self.assertRaises(SampleSheetError): - sample_parser.get_csv_reader(sheet_file) + parsers.common.get_csv_reader(sheet_file) class TestValidatePfList(unittest.TestCase): diff --git a/iridauploader/tests/parsers/nextseq/test_validation.py b/iridauploader/tests/parsers/nextseq/test_validation.py index a5759c0f..cc72cb73 100644 --- a/iridauploader/tests/parsers/nextseq/test_validation.py +++ b/iridauploader/tests/parsers/nextseq/test_validation.py @@ -15,7 +15,7 @@ class TestValidation(unittest.TestCase): def setUp(self): print("\nStarting " + self.__module__ + ": " + self._testMethodName) - @patch("iridauploader.parsers.nextseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_no_header(self, mock_csv_reader): """ Given a sample sheet with no header, make sure the correct errors are included in the response @@ -61,7 +61,7 @@ def test_validate_sample_sheet_no_header(self, mock_csv_reader): # Error type should be SampleSheetError self.assertEqual(type(res.error_list[0]), SampleSheetError) - @patch("iridauploader.parsers.nextseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_no_data(self, mock_csv_reader): """ Given a sample sheet with no data, make sure the correct errors are included in the response @@ -112,7 +112,7 @@ def test_validate_sample_sheet_no_data(self, mock_csv_reader): self.assertEqual(type(res.error_list[0]), SampleSheetError) self.assertEqual(type(res.error_list[1]), SampleSheetError) - @patch("iridauploader.parsers.nextseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_missing_data_header(self, mock_csv_reader): """ Given a sample sheet with no data header, make sure the correct errors are included in the response @@ -172,7 +172,7 @@ def test_validate_sample_sheet_missing_data_header(self, mock_csv_reader): # Error type should be SampleSheetError self.assertEqual(type(res.error_list[0]), SampleSheetError) - @patch("iridauploader.parsers.nextseq.validation.get_csv_reader") + @patch("iridauploader.parsers.common.get_csv_reader") def test_validate_sample_sheet_valid(self, mock_csv_reader): """ Given a valid sample sheet, test that everything shows as valid From c4239cd7c89c1b3dba0922846f500ae86bbc64a5 Mon Sep 17 00:00:00 2001 
From: Jeffrey Thiessen Date: Tue, 31 Mar 2020 17:51:32 -0500 Subject: [PATCH 02/19] Moved directory finding code to common parser code --- iridauploader/parsers/common.py | 25 ++++++++++- iridauploader/parsers/directory/parser.py | 25 +---------- iridauploader/parsers/miniseq/parser.py | 29 ++----------- iridauploader/parsers/miseq/parser.py | 25 +---------- iridauploader/parsers/nextseq/parser.py | 25 +---------- .../tests/parsers/common/__init__.py | 0 .../tests/parsers/common/no_dirs/empty.txt | 0 .../tests/parsers/common/test_common.py | 42 +++++++++++++++++++ .../parsers/common/three_dirs/first/empty.txt | 0 .../common/three_dirs/second/empty.txt | 0 .../parsers/common/three_dirs/third/empty.txt | 0 .../tests/parsers/directory/test_parser.py | 34 --------------- .../tests/parsers/miniseq/test_parser.py | 34 --------------- .../tests/parsers/miseq/test_parser.py | 34 --------------- .../tests/parsers/nextseq/test_parser.py | 34 --------------- 15 files changed, 75 insertions(+), 232 deletions(-) create mode 100644 iridauploader/tests/parsers/common/__init__.py create mode 100644 iridauploader/tests/parsers/common/no_dirs/empty.txt create mode 100644 iridauploader/tests/parsers/common/test_common.py create mode 100644 iridauploader/tests/parsers/common/three_dirs/first/empty.txt create mode 100644 iridauploader/tests/parsers/common/three_dirs/second/empty.txt create mode 100644 iridauploader/tests/parsers/common/three_dirs/third/empty.txt diff --git a/iridauploader/parsers/common.py b/iridauploader/parsers/common.py index 78be4620..50fc8f06 100644 --- a/iridauploader/parsers/common.py +++ b/iridauploader/parsers/common.py @@ -1,4 +1,4 @@ -from os import path +import os from csv import reader from iridauploader.parsers import exceptions @@ -19,7 +19,7 @@ def get_csv_reader(sample_sheet_file): returns a csv.reader object """ - if path.isfile(sample_sheet_file): + if os.path.isfile(sample_sheet_file): csv_file = open(sample_sheet_file, "r") # strip any trailing newline characters from the end of the line # including Windows newline characters (\r\n) @@ -33,3 +33,24 @@ def get_csv_reader(sample_sheet_file): sample_sheet_file) return csv_reader + +def find_directory_list(directory): + """Find and return all directories in the specified directory. + + Arguments: + directory -- the directory to find directories in + + Returns: a list of directories including current directory + """ + + # Checks if we can access to the given directory, return empty and log a warning if we cannot. 
+ if not os.access(directory, os.W_OK): + raise exceptions.DirectoryError("The directory is not writeable, " + "can not upload samples from this directory {}".format(directory), + directory) + + dir_list = next(os.walk(directory))[1] # Gets the list of directories in the directory + full_dir_list = [] + for d in dir_list: + full_dir_list.append(os.path.join(directory, d)) + return full_dir_list diff --git a/iridauploader/parsers/directory/parser.py b/iridauploader/parsers/directory/parser.py index 3f57e309..d0641883 100644 --- a/iridauploader/parsers/directory/parser.py +++ b/iridauploader/parsers/directory/parser.py @@ -4,6 +4,7 @@ import iridauploader.progress as progress from iridauploader.parsers import exceptions +from iridauploader.parsers import common from iridauploader.parsers.directory import sample_parser, validation @@ -19,28 +20,6 @@ def get_required_file_list(): """ return [Parser.SAMPLE_SHEET_FILE_NAME] - @staticmethod - def _find_directory_list(directory): - """Find and return all directories in the specified directory. - - Arguments: - directory -- the directory to find directories in - - Returns: a list of directories including current directory - """ - - # Checks if we can access to the given directory, return empty and log a warning if we cannot. - if not os.access(directory, os.W_OK): - raise exceptions.DirectoryError("The directory is not writeable, " - "can not upload samples from this directory {}".format(directory), - directory) - - dir_list = next(os.walk(directory))[1] # Gets the list of directories in the directory - full_dir_list = [] - for d in dir_list: - full_dir_list.append(os.path.join(directory, d)) - return full_dir_list - @staticmethod def find_runs(directory): """ @@ -52,7 +31,7 @@ def find_runs(directory): logging.info("Looking for runs in {}".format(directory)) runs = [] - directory_list = Parser._find_directory_list(directory) + directory_list = common.find_directory_list(directory) for d in directory_list: runs.append(progress.get_directory_status(d, Parser.get_required_file_list())) diff --git a/iridauploader/parsers/miniseq/parser.py b/iridauploader/parsers/miniseq/parser.py index 5cc35163..b27ffbf2 100644 --- a/iridauploader/parsers/miniseq/parser.py +++ b/iridauploader/parsers/miniseq/parser.py @@ -5,6 +5,7 @@ import iridauploader.progress as progress from iridauploader.parsers import exceptions +from iridauploader.parsers import common from iridauploader.parsers.miniseq import sample_parser, validation @@ -24,28 +25,6 @@ def get_required_file_list(): Parser.UPLOAD_COMPLETE_FILE_NAME ] - @staticmethod - def _find_directory_list(directory): - """Find and return all directories in the specified directory. - - Arguments: - directory -- the directory to find directories in - - Returns: a list of directories including current directory - """ - - # Checks if we can access to the given directory, return empty and log a warning if we cannot. 
- if not os.access(directory, os.W_OK): - raise exceptions.DirectoryError("The directory is not writeable, " - "can not upload samples from this directory {}".format(directory), - directory) - - dir_list = next(os.walk(directory))[1] # Gets the list of directories in the directory - full_dir_list = [] - for d in dir_list: - full_dir_list.append(os.path.join(directory, d)) - return full_dir_list - @staticmethod def find_runs(directory): """ @@ -57,7 +36,7 @@ def find_runs(directory): logging.info("looking for runs in {}".format(directory)) runs = [] - directory_list = Parser._find_directory_list(directory) + directory_list = common.find_directory_list(directory) for d in directory_list: runs.append(progress.get_directory_status(d, Parser.get_required_file_list())) @@ -106,9 +85,9 @@ def get_sample_sheet(directory): @staticmethod def get_sequencing_run(sample_sheet): """ - Does local validation on the integrety of the run directory / sample sheet + Does local validation on the integrity of the run directory / sample sheet - Throws a ValidationError with a valadation result attached if it cannot make a sequencing run + Throws a ValidationError with a validation result attached if it cannot make a sequencing run :param sample_sheet: :return: SequencingRun diff --git a/iridauploader/parsers/miseq/parser.py b/iridauploader/parsers/miseq/parser.py index 47d21a49..2842e8ff 100644 --- a/iridauploader/parsers/miseq/parser.py +++ b/iridauploader/parsers/miseq/parser.py @@ -5,6 +5,7 @@ import iridauploader.progress as progress from iridauploader.parsers import exceptions +from iridauploader.parsers import common from iridauploader.parsers.miseq import sample_parser, validation @@ -24,28 +25,6 @@ def get_required_file_list(): Parser.UPLOAD_COMPLETE_FILE_NAME ] - @staticmethod - def _find_directory_list(directory): - """Find and return all directories in the specified directory. - - Arguments: - directory -- the directory to find directories in - - Returns: a list of directories including current directory - """ - - # Checks if we can access to the given directory, return empty and log a warning if we cannot. - if not os.access(directory, os.W_OK): - raise exceptions.DirectoryError("The directory is not writeable, " - "can not upload samples from this directory {}".format(directory), - directory) - - dir_list = next(os.walk(directory))[1] # Gets the list of directories in the directory - full_dir_list = [] - for d in dir_list: - full_dir_list.append(os.path.join(directory, d)) - return full_dir_list - @staticmethod def find_runs(directory): """ @@ -57,7 +36,7 @@ def find_runs(directory): logging.info("Looking for runs in {}".format(directory)) runs = [] - directory_list = Parser._find_directory_list(directory) + directory_list = common.find_directory_list(directory) for d in directory_list: runs.append(progress.get_directory_status(d, Parser.get_required_file_list())) diff --git a/iridauploader/parsers/nextseq/parser.py b/iridauploader/parsers/nextseq/parser.py index a93ae4ec..6c628457 100644 --- a/iridauploader/parsers/nextseq/parser.py +++ b/iridauploader/parsers/nextseq/parser.py @@ -5,6 +5,7 @@ import iridauploader.progress as progress from iridauploader.parsers import exceptions +from iridauploader.parsers import common from iridauploader.parsers.nextseq import sample_parser, validation @@ -26,28 +27,6 @@ def get_required_file_list(): Parser.UPLOAD_COMPLETE_FILE_NAME ] - @staticmethod - def _find_directory_list(directory): - """Find and return all directories in the specified directory. 
- - Arguments: - directory -- the directory to find directories in - - Returns: a list of directories including current directory - """ - - # Checks if we can access to the given directory, return empty and log a warning if we cannot. - if not os.access(directory, os.W_OK): - raise exceptions.DirectoryError("The directory is not writeable, " - "can not upload samples from this directory {}".format(directory), - directory) - - dir_list = next(os.walk(directory))[1] # Gets the list of directories in the directory - full_dir_list = [] - for d in dir_list: - full_dir_list.append(os.path.join(directory, d)) - return full_dir_list - @staticmethod def find_runs(directory): """ @@ -59,7 +38,7 @@ def find_runs(directory): logging.info("looking for runs in {}".format(directory)) runs = [] - directory_list = Parser._find_directory_list(directory) + directory_list = common.find_directory_list(directory) for d in directory_list: runs.append(progress.get_directory_status(d, Parser.get_required_file_list())) diff --git a/iridauploader/tests/parsers/common/__init__.py b/iridauploader/tests/parsers/common/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/iridauploader/tests/parsers/common/no_dirs/empty.txt b/iridauploader/tests/parsers/common/no_dirs/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/iridauploader/tests/parsers/common/test_common.py b/iridauploader/tests/parsers/common/test_common.py new file mode 100644 index 00000000..27288637 --- /dev/null +++ b/iridauploader/tests/parsers/common/test_common.py @@ -0,0 +1,42 @@ +import unittest +import os + +from iridauploader.parsers import common + +path_to_module = os.path.abspath(os.path.dirname(__file__)) +if len(path_to_module) == 0: + path_to_module = '.' + + +class TestFindDirectoryList(unittest.TestCase): + """ + Test getting the list of directories + """ + + def setUp(self): + print("\nStarting " + self.__module__ + ": " + self._testMethodName) + + def test_find_three(self): + """ + Given a directory with 3 run directories in it, make sure all 3 directories are included in result + :return: + """ + directory = os.path.join(path_to_module, "three_dirs") + dir_1 = os.path.join(directory, "first") + dir_2 = os.path.join(directory, "second") + dir_3 = os.path.join(directory, "third") + res = common.find_directory_list(directory) + + self.assertIn(dir_1, res) + self.assertIn(dir_2, res) + self.assertIn(dir_3, res) + + def test_find_none(self): + """ + Given a directory with no sequencing run directories in it, make sure an empty list is returned + :return: + """ + directory = os.path.join(path_to_module, "no_dirs") + res = common.find_directory_list(directory) + + self.assertEqual(res, []) diff --git a/iridauploader/tests/parsers/common/three_dirs/first/empty.txt b/iridauploader/tests/parsers/common/three_dirs/first/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/iridauploader/tests/parsers/common/three_dirs/second/empty.txt b/iridauploader/tests/parsers/common/three_dirs/second/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/iridauploader/tests/parsers/common/three_dirs/third/empty.txt b/iridauploader/tests/parsers/common/three_dirs/third/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/iridauploader/tests/parsers/directory/test_parser.py b/iridauploader/tests/parsers/directory/test_parser.py index 7ba85b6c..41ae0c65 100644 --- a/iridauploader/tests/parsers/directory/test_parser.py +++ b/iridauploader/tests/parsers/directory/test_parser.py @@ 
-182,37 +182,3 @@ def test_valid_run(self): res = Parser.get_sequencing_run(sample_sheet) self.assertEqual(type(res), model.SequencingRun) - - -class TestFindDirectoryList(unittest.TestCase): - """ - Test getting the list of directories - """ - - def setUp(self): - print("\nStarting " + self.__module__ + ": " + self._testMethodName) - - def test_find_three(self): - """ - Given a directory with 3 run directories in it, make sure all 3 directories are included in result - :return: - """ - directory = path.join(path_to_module, "three_dirs") - dir_1 = path.join(directory, "first") - dir_2 = path.join(directory, "second") - dir_3 = path.join(directory, "third") - res = Parser._find_directory_list(directory) - - self.assertIn(dir_1, res) - self.assertIn(dir_2, res) - self.assertIn(dir_3, res) - - def test_find_none(self): - """ - Given a directory with no sequencing run directories in it, make sure an empty list is returned - :return: - """ - directory = path.join(path_to_module, "no_dirs") - res = Parser._find_directory_list(directory) - - self.assertEqual(res, []) diff --git a/iridauploader/tests/parsers/miniseq/test_parser.py b/iridauploader/tests/parsers/miniseq/test_parser.py index a1b50273..e362d873 100644 --- a/iridauploader/tests/parsers/miniseq/test_parser.py +++ b/iridauploader/tests/parsers/miniseq/test_parser.py @@ -195,37 +195,3 @@ def test_valid_run(self): res = Parser.get_sequencing_run(sample_sheet) self.assertEqual(type(res), model.SequencingRun) - - -class TestFindDirectoryList(unittest.TestCase): - """ - Test getting the list of directories - """ - - def setUp(self): - print("\nStarting " + self.__module__ + ": " + self._testMethodName) - - def test_find_three(self): - """ - Given a directory with 3 run directories in it, make sure all 3 directories are included in result - :return: - """ - directory = path.join(path_to_module, "three_dirs") - dir_1 = path.join(directory, "first") - dir_2 = path.join(directory, "second") - dir_3 = path.join(directory, "third") - res = Parser._find_directory_list(directory) - - self.assertIn(dir_1, res) - self.assertIn(dir_2, res) - self.assertIn(dir_3, res) - - def test_find_none(self): - """ - Given a directory with no sequencing run directories in it, make sure an empty list is returned - :return: - """ - directory = path.join(path_to_module, "no_dirs") - res = Parser._find_directory_list(directory) - - self.assertEqual(res, []) diff --git a/iridauploader/tests/parsers/miseq/test_parser.py b/iridauploader/tests/parsers/miseq/test_parser.py index 649b61f0..c41fecd2 100644 --- a/iridauploader/tests/parsers/miseq/test_parser.py +++ b/iridauploader/tests/parsers/miseq/test_parser.py @@ -195,37 +195,3 @@ def test_valid_run(self): res = Parser.get_sequencing_run(sample_sheet) self.assertEqual(type(res), model.SequencingRun) - - -class TestFindDirectoryList(unittest.TestCase): - """ - Test getting the list of directories - """ - - def setUp(self): - print("\nStarting " + self.__module__ + ": " + self._testMethodName) - - def test_find_three(self): - """ - Given a directory with 3 run directories in it, make sure all 3 directories are included in result - :return: - """ - directory = path.join(path_to_module, "three_dirs") - dir_1 = path.join(directory, "first") - dir_2 = path.join(directory, "second") - dir_3 = path.join(directory, "third") - res = Parser._find_directory_list(directory) - - self.assertIn(dir_1, res) - self.assertIn(dir_2, res) - self.assertIn(dir_3, res) - - def test_find_none(self): - """ - Given a directory with no sequencing run 
directories in it, make sure an empty list is returned - :return: - """ - directory = path.join(path_to_module, "no_dirs") - res = Parser._find_directory_list(directory) - - self.assertEqual(res, []) diff --git a/iridauploader/tests/parsers/nextseq/test_parser.py b/iridauploader/tests/parsers/nextseq/test_parser.py index caf03910..2548c7b0 100644 --- a/iridauploader/tests/parsers/nextseq/test_parser.py +++ b/iridauploader/tests/parsers/nextseq/test_parser.py @@ -195,37 +195,3 @@ def test_valid_run(self): res = Parser.get_sequencing_run(sample_sheet) self.assertEqual(type(res), model.SequencingRun) - - -class TestFindDirectoryList(unittest.TestCase): - """ - Test getting the list of directories - """ - - def setUp(self): - print("\nStarting " + self.__module__ + ": " + self._testMethodName) - - def test_find_three(self): - """ - Given a directory with 3 run directories in it, make sure all 3 directories are included in result - :return: - """ - directory = path.join(path_to_module, "three_dirs") - dir_1 = path.join(directory, "first") - dir_2 = path.join(directory, "second") - dir_3 = path.join(directory, "third") - res = Parser._find_directory_list(directory) - - self.assertIn(dir_1, res) - self.assertIn(dir_2, res) - self.assertIn(dir_3, res) - - def test_find_none(self): - """ - Given a directory with no sequencing run directories in it, make sure an empty list is returned - :return: - """ - directory = path.join(path_to_module, "no_dirs") - res = Parser._find_directory_list(directory) - - self.assertEqual(res, []) From 7b52198ff1c1e2958722beb5573caf9d9c8a2b93 Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Tue, 7 Apr 2020 18:05:20 -0500 Subject: [PATCH 03/19] Refactor miseq get_sequencing_run --- iridauploader/parsers/common.py | 41 +++++++++++++++ iridauploader/parsers/miseq/parser.py | 43 +++++++++++++-- iridauploader/parsers/miseq/sample_parser.py | 55 +++++--------------- 3 files changed, 92 insertions(+), 47 deletions(-) diff --git a/iridauploader/parsers/common.py b/iridauploader/parsers/common.py index 50fc8f06..c472ae02 100644 --- a/iridauploader/parsers/common.py +++ b/iridauploader/parsers/common.py @@ -1,7 +1,9 @@ import os from csv import reader +import logging from iridauploader.parsers import exceptions +from iridauploader import model def get_csv_reader(sample_sheet_file): @@ -54,3 +56,42 @@ def find_directory_list(directory): for d in dir_list: full_dir_list.append(os.path.join(directory, d)) return full_dir_list + +def build_sequencing_run_from_samples(sample_list, metadata): + """ + Create a SequencingRun object with full project/sample/sequence_file structure + + :param sample_list: List of Sample objects + :param metadata: metadata dict to add to the run + :return: SequencingRun + """ + + logging.debug("Building SequencingRun from parsed data") + + # create list of projects and add samples to appropriate project + project_list = [] + for sample in sample_list: + project = None + for p in project_list: + if sample.get('sample_project') == p.id: + project = p + if project is None: + project = model.Project(id=sample.get('sample_project')) + project_list.append(project) + + project.add_sample(sample) + + sequence_run = model.SequencingRun(metadata, project_list) + logging.debug("SequencingRun built") + return sequence_run + +def get_file_list(directory): + """ + Get the list of file names in the data directory + + :param data_directory: directory to search for files + :return: list of file names in data directory + """ + # Create a file list of the directory, only 
hit the os once + file_list = next(os.walk(directory))[2] + return file_list diff --git a/iridauploader/parsers/miseq/parser.py b/iridauploader/parsers/miseq/parser.py index 2842e8ff..a564188f 100644 --- a/iridauploader/parsers/miseq/parser.py +++ b/iridauploader/parsers/miseq/parser.py @@ -25,6 +25,30 @@ def get_required_file_list(): Parser.UPLOAD_COMPLETE_FILE_NAME ] + @staticmethod + def get_relative_data_directory(): + """ + Returns path to the sequence file directory, relative to the Sample Sheet + + This is not used in the application but is useful for scripting and cloud deployment + + :return: a string which represents the concatenated path components, as per os.path.join + """ + data_dir = os.path.join("Data", "Intensities", "BaseCalls") + return data_dir + + @staticmethod + def get_full_data_directory(sample_sheet): + """ + Returns the path to where the sequence data files can be found, including the sample_sheet directory + + :param sample_sheet: Sample sheet acts as the starting point for the data directory + :return: a string which represents the concatenated path components, as per os.path.join + """ + sample_sheet_dir = os.path.dirname(sample_sheet) + data_dir = os.path.join(sample_sheet_dir, Parser.get_relative_data_directory()) + return data_dir + @staticmethod def find_runs(directory): """ @@ -72,7 +96,7 @@ def get_sample_sheet(directory): "can not parse samples from this directory {}".format(directory), directory) sample_sheet_file_name = Parser.SAMPLE_SHEET_FILE_NAME - file_list = next(os.walk(directory))[2] # Gets the list of files in the directory + file_list = common.get_file_list(directory) # Gets the list of files in the directory if sample_sheet_file_name not in file_list: logging.error("No sample sheet file in the MiSeq format found") raise exceptions.DirectoryError("The directory {} has no sample sheet file in the MiSeq format" @@ -83,16 +107,26 @@ def get_sample_sheet(directory): return os.path.join(directory, sample_sheet_file_name) @staticmethod - def get_sequencing_run(sample_sheet): + def get_sequencing_run(sample_sheet, run_data_directory=None, run_data_directory_file_list=None): """ Does local validation on the integrety of the run directory / sample sheet Throws a ValidationError with a valadation result attached if it cannot make a sequencing run - :param sample_sheet: + :param sample_sheet: Sample Sheet File + :param run_data_directory: Optional: Directory (including run directory) to data files. + Can be provided for bypassing os calls when developing on cloud systems + :param run_data_directory_file_list: Optional: List of files in data directory. 
+ Can be provided for bypassing os calls when developing on cloud systems :return: SequencingRun """ + # get data directory and file list + if run_data_directory is None: + run_data_directory = Parser.get_full_data_directory(sample_sheet) + if run_data_directory_file_list is None: + run_data_directory_file_list = common.get_file_list(run_data_directory) + # Try to get the sample sheet, validate that the sample sheet is valid validation_result = validation.validate_sample_sheet(sample_sheet) if not validation_result.is_valid(): @@ -110,7 +144,8 @@ def get_sequencing_run(sample_sheet): # Try to build sequencing run from sample sheet & meta data, raise validation error if errors occur try: - sequencing_run = sample_parser.build_sequencing_run_from_samples(sample_sheet, run_metadata) + sample_list = sample_parser.parse_sample_list(sample_sheet, run_data_directory, run_data_directory_file_list) + sequencing_run = common.build_sequencing_run_from_samples(sample_list, run_metadata) except exceptions.SequenceFileError as error: validation_result.add_error(error) logging.error("Errors occurred while building sequence run from sample sheet") diff --git a/iridauploader/parsers/miseq/sample_parser.py b/iridauploader/parsers/miseq/sample_parser.py index 37795f79..ccb48a5f 100644 --- a/iridauploader/parsers/miseq/sample_parser.py +++ b/iridauploader/parsers/miseq/sample_parser.py @@ -1,5 +1,5 @@ import re -from os import path, walk +from os import path from collections import OrderedDict from copy import deepcopy import logging @@ -90,47 +90,16 @@ def parse_metadata(sample_sheet_file): return metadata_dict -def build_sequencing_run_from_samples(sample_sheet_file, metadata): +def parse_sample_list(sample_sheet_file, run_data_directory, run_data_directory_file_list): """ - Create a SequencingRun object with full project/sample/sequence_file structure + Creates a list of Sample Objects - :param sample_sheet_file: - :param metadata: - :return: SequencingRun - """ - sample_list = _parse_sample_list(sample_sheet_file) - - logging.debug("Building SequencingRun from parsed data") - - # create list of projects and add samples to appropriate project - project_list = [] - for sample in sample_list: - project = None - for p in project_list: - if sample.get('sample_project') == p.id: - project = p - if project is None: - project = model.Project(id=sample.get('sample_project')) - project_list.append(project) - - project.add_sample(sample) - - sequence_run = model.SequencingRun(metadata, project_list) - logging.debug("SequencingRun built") - return sequence_run - - -def _parse_sample_list(sample_sheet_file): - """ - Creates a list of all samples in the sample_sheet_file, with accompanying data/metadata - - :param sample_sheet_file: - :return: list of samples + :param sample_sheet_file: Sample Sheet file + :param run_data_directory: Data directory including run directory (e.g. 
my_run/Data/Intensities/BaseCalls) + :param run_data_directory_file_list: The list of all files in the data directory + :return: list of Sample objects """ sample_list = _parse_samples(sample_sheet_file) - sample_sheet_dir = path.dirname(sample_sheet_file) - data_dir = path.join(sample_sheet_dir, "Data", "Intensities", "BaseCalls") - data_dir_file_list = next(walk(data_dir))[2] # Create a file list of the data directory, only hit the os once for sample in sample_list: properties_dict = _parse_out_sequence_file(sample) @@ -140,7 +109,7 @@ def _parse_sample_list(sample_sheet_file): sample_name=re.escape(sample.sample_name), sample_number=sample.sample_number) logging.info("Looking for files with pattern {}".format(file_pattern)) regex = re.compile(file_pattern) - pf_list = list(filter(regex.search, data_dir_file_list)) + pf_list = list(filter(regex.search, run_data_directory_file_list)) if not pf_list: # OK. So we didn't find any files using the **correct** file name # definition according to Illumina. Let's try again with our deprecated @@ -150,7 +119,7 @@ def _parse_sample_list(sample_sheet_file): logging.info("Looking for files with pattern {}".format(file_pattern)) regex = re.compile(file_pattern) - pf_list = list(filter(regex.search, data_dir_file_list)) + pf_list = list(filter(regex.search, run_data_directory_file_list)) if not pf_list: # we **still** didn't find anything. It's pretty likely, then that @@ -161,18 +130,18 @@ def _parse_sample_list(sample_sheet_file): ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. " "This usually happens when the Illumina MiSeq Reporter tool " "does not generate any FastQ data.").format( - sample.sample_name, data_dir)) + sample.sample_name, run_data_directory)) # List of files may be invalid if directory searching in has been modified by user if not _validate_pf_list(pf_list): raise exceptions.SequenceFileError( ("The following file list {} found in the directory {} is invalid. " "Please verify the folder containing the sequence files matches the SampleSheet file").format( - pf_list, data_dir)) + pf_list, run_data_directory)) # Add the dir to each file to create the full path for i in range(len(pf_list)): - pf_list[i] = path.join(data_dir, pf_list[i]) + pf_list[i] = path.join(run_data_directory, pf_list[i]) sq = model.SequenceFile(file_list=pf_list, properties_dict=properties_dict) sample.sequence_file = deepcopy(sq) From 249b5301b6c854a71f96b1ec0597ebb20cfd2836 Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Tue, 7 Apr 2020 18:13:18 -0500 Subject: [PATCH 04/19] cleanup whitespace pep8 --- iridauploader/parsers/common.py | 33 +++++++++++++++------------ iridauploader/parsers/miseq/parser.py | 4 ++-- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/iridauploader/parsers/common.py b/iridauploader/parsers/common.py index c472ae02..0f7788c2 100644 --- a/iridauploader/parsers/common.py +++ b/iridauploader/parsers/common.py @@ -36,26 +36,28 @@ def get_csv_reader(sample_sheet_file): return csv_reader + def find_directory_list(directory): - """Find and return all directories in the specified directory. + """Find and return all directories in the specified directory. 
+ + Arguments: + directory -- the directory to find directories in - Arguments: - directory -- the directory to find directories in + Returns: a list of directories including current directory + """ - Returns: a list of directories including current directory - """ + # Checks if we can access to the given directory, return empty and log a warning if we cannot. + if not os.access(directory, os.W_OK): + raise exceptions.DirectoryError("The directory is not writeable, " + "can not upload samples from this directory {}".format(directory), + directory) - # Checks if we can access to the given directory, return empty and log a warning if we cannot. - if not os.access(directory, os.W_OK): - raise exceptions.DirectoryError("The directory is not writeable, " - "can not upload samples from this directory {}".format(directory), - directory) + dir_list = next(os.walk(directory))[1] # Gets the list of directories in the directory + full_dir_list = [] + for d in dir_list: + full_dir_list.append(os.path.join(directory, d)) + return full_dir_list - dir_list = next(os.walk(directory))[1] # Gets the list of directories in the directory - full_dir_list = [] - for d in dir_list: - full_dir_list.append(os.path.join(directory, d)) - return full_dir_list def build_sequencing_run_from_samples(sample_list, metadata): """ @@ -85,6 +87,7 @@ def build_sequencing_run_from_samples(sample_list, metadata): logging.debug("SequencingRun built") return sequence_run + def get_file_list(directory): """ Get the list of file names in the data directory diff --git a/iridauploader/parsers/miseq/parser.py b/iridauploader/parsers/miseq/parser.py index a564188f..873e7b78 100644 --- a/iridauploader/parsers/miseq/parser.py +++ b/iridauploader/parsers/miseq/parser.py @@ -114,7 +114,7 @@ def get_sequencing_run(sample_sheet, run_data_directory=None, run_data_directory Throws a ValidationError with a valadation result attached if it cannot make a sequencing run :param sample_sheet: Sample Sheet File - :param run_data_directory: Optional: Directory (including run directory) to data files. + :param run_data_directory: Optional: Directory (including run directory) to data files. Can be provided for bypassing os calls when developing on cloud systems :param run_data_directory_file_list: Optional: List of files in data directory. 
Can be provided for bypassing os calls when developing on cloud systems @@ -125,7 +125,7 @@ def get_sequencing_run(sample_sheet, run_data_directory=None, run_data_directory if run_data_directory is None: run_data_directory = Parser.get_full_data_directory(sample_sheet) if run_data_directory_file_list is None: - run_data_directory_file_list = common.get_file_list(run_data_directory) + run_data_directory_file_list = common.get_file_list(run_data_directory) # Try to get the sample sheet, validate that the sample sheet is valid validation_result = validation.validate_sample_sheet(sample_sheet) From ee588f9b468da7a9b2d8fccd40a1db2888b6aecc Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Fri, 17 Apr 2020 00:47:41 -0500 Subject: [PATCH 05/19] Change sample sheet error to directory error when dir does not exist --- iridauploader/parsers/common.py | 3 +++ iridauploader/parsers/miseq/parser.py | 15 +++++++++++---- iridauploader/tests/parsers/miseq/test_parser.py | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/iridauploader/parsers/common.py b/iridauploader/parsers/common.py index 0f7788c2..5c21c18d 100644 --- a/iridauploader/parsers/common.py +++ b/iridauploader/parsers/common.py @@ -95,6 +95,9 @@ def get_file_list(directory): :param data_directory: directory to search for files :return: list of file names in data directory """ + # verify that directory exists + if not os.path.exists(directory): + raise exceptions.DirectoryError("Could not list files, as directory does not exist.", directory) # Create a file list of the directory, only hit the os once file_list = next(os.walk(directory))[2] return file_list diff --git a/iridauploader/parsers/miseq/parser.py b/iridauploader/parsers/miseq/parser.py index 873e7b78..361a8a88 100644 --- a/iridauploader/parsers/miseq/parser.py +++ b/iridauploader/parsers/miseq/parser.py @@ -122,10 +122,17 @@ def get_sequencing_run(sample_sheet, run_data_directory=None, run_data_directory """ # get data directory and file list - if run_data_directory is None: - run_data_directory = Parser.get_full_data_directory(sample_sheet) - if run_data_directory_file_list is None: - run_data_directory_file_list = common.get_file_list(run_data_directory) + validation_result = model.ValidationResult() + + try: + if run_data_directory is None: + run_data_directory = Parser.get_full_data_directory(sample_sheet) + if run_data_directory_file_list is None: + run_data_directory_file_list = common.get_file_list(run_data_directory) + except exceptions.DirectoryError as error: + validation_result.add_error(error) + logging.error("Errors occurred while parsing files") + raise exceptions.ValidationError("Errors occurred while parsing files", validation_result) # Try to get the sample sheet, validate that the sample sheet is valid validation_result = validation.validate_sample_sheet(sample_sheet) diff --git a/iridauploader/tests/parsers/miseq/test_parser.py b/iridauploader/tests/parsers/miseq/test_parser.py index c41fecd2..f869949f 100644 --- a/iridauploader/tests/parsers/miseq/test_parser.py +++ b/iridauploader/tests/parsers/miseq/test_parser.py @@ -183,7 +183,7 @@ def test_invalid_sample_sheets(self): self.assertEqual(type(validation_result), model.ValidationResult) for error in validation_result.error_list: - self.assertEqual(type(error), SampleSheetError) + self.assertEqual(type(error), DirectoryError) def test_valid_run(self): """ From 35237a76e8d0b0931112ccaa6521f1baa15b0498 Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Fri, 17 Apr 2020 15:06:25 -0500 
Subject: [PATCH 06/19] Refactor miniseq parser for cloud support

---
 iridauploader/parsers/miniseq/parser.py | 65 +++++++++++++++++--
 .../parsers/miniseq/sample_parser.py    | 64 ++++--------------
 2 files changed, 73 insertions(+), 56 deletions(-)

diff --git a/iridauploader/parsers/miniseq/parser.py b/iridauploader/parsers/miniseq/parser.py
index b27ffbf2..04889fe2 100644
--- a/iridauploader/parsers/miniseq/parser.py
+++ b/iridauploader/parsers/miniseq/parser.py
@@ -25,6 +25,45 @@ def get_required_file_list():
             Parser.UPLOAD_COMPLETE_FILE_NAME
         ]
 
+    @staticmethod
+    def get_relative_data_directory():
+        """
+        Returns path to the sequence file directory, relative to the Sample Sheet
+
+        This is not used in the application but is useful for scripting and cloud deployment
+
+        This includes a '*' character to be interpreted as a wildcard symbol,
+        as this sequencer does not make consistent names for data directories, and the * must be resolved from the filesystem
+
+        :return: a string which represents the concatenated path components, as per os.path.join
+        """
+        data_dir = os.path.join("Alignment_1", "*", "Fastq")
+        return data_dir
+
+    @staticmethod
+    def get_full_data_directory(sample_sheet):
+        """
+        Returns the path to where the sequence data files can be found, including the sample_sheet directory
+
+        Note, this hits the os, and as such is not to be used with cloud solutions.
+        For cloud solutions, use get_relative_data_directory() and solve the actual path for your cloud environment
+
+        :param sample_sheet: Sample sheet acts as the starting point for the data directory
+        :return: a string which represents the concatenated path components, as per os.path.join
+        """
+        sample_sheet_dir = os.path.dirname(sample_sheet)
+        partial_data_dir = os.path.join(sample_sheet_dir, "Alignment_1")
+        # Verify the partial path exists, path could not exist if there was a sequencing error
+        # Also, if someone runs the miniseq parser on a miseq directory, this is the failure point
+        if not os.path.exists(partial_data_dir):
+            raise exceptions.DirectoryError(
+                ("The uploader was unable to find the data directory with the path: {}, Verify that the run directory is "
+                 "undamaged, and that it is a MiniSeq sequencing run.").format(partial_data_dir))
+
+        # get the directories [1] get the first directory [0]
+        data_dir = os.path.join(partial_data_dir, next(os.walk(partial_data_dir))[1][0], "Fastq")
+        return data_dir
+
     @staticmethod
     def find_runs(directory):
         """
@@ -72,7 +111,7 @@ def get_sample_sheet(directory):
                                             "can not parse samples from this directory {}".format(directory),
                                             directory)
         sample_sheet_file_name = Parser.SAMPLE_SHEET_FILE_NAME
-        file_list = next(os.walk(directory))[2]  # Gets the list of files in the directory
+        file_list = common.get_file_list(directory)  # Gets the list of files in the directory
         if sample_sheet_file_name not in file_list:
             logging.error("No sample sheet file in the MiniSeq format found")
             raise exceptions.DirectoryError("The directory {} has no sample sheet file in the MiniSeq format"
@@ -83,16 +122,33 @@ def get_sample_sheet(directory):
         return os.path.join(directory, sample_sheet_file_name)
 
     @staticmethod
-    def get_sequencing_run(sample_sheet):
+    def get_sequencing_run(sample_sheet, run_data_directory=None, run_data_directory_file_list=None):
         """
         Does local validation on the integrity of the run directory / sample sheet
 
         Throws a ValidationError with a validation result attached if it cannot make a sequencing run
 
-        :param sample_sheet:
+        :param sample_sheet: Sample Sheet File
+        :param 
run_data_directory: Optional: Directory (including run directory) to data files. + Can be provided for bypassing os calls when developing on cloud systems + :param run_data_directory_file_list: Optional: List of files in data directory. + Can be provided for bypassing os calls when developing on cloud systems :return: SequencingRun """ + # get data directory and file list + validation_result = model.ValidationResult() + + try: + if run_data_directory is None: + run_data_directory = Parser.get_full_data_directory(sample_sheet) + if run_data_directory_file_list is None: + run_data_directory_file_list = common.get_file_list(run_data_directory) + except exceptions.DirectoryError as error: + validation_result.add_error(error) + logging.error("Errors occurred while parsing files") + raise exceptions.ValidationError("Errors occurred while parsing files", validation_result) + # Try to get the sample sheet, validate that the sample sheet is valid validation_result = validation.validate_sample_sheet(sample_sheet) if not validation_result.is_valid(): @@ -110,7 +166,8 @@ def get_sequencing_run(sample_sheet): # Try to build sequencing run from sample sheet & meta data, raise validation error if errors occur try: - sequencing_run = sample_parser.build_sequencing_run_from_samples(sample_sheet, run_metadata) + sample_list = sample_parser.parse_sample_list(sample_sheet, run_data_directory, run_data_directory_file_list) + sequencing_run = common.build_sequencing_run_from_samples(sample_list, run_metadata) except exceptions.SequenceFileError as error: validation_result.add_error(error) logging.error("Errors occurred while building sequence run from sample sheet") diff --git a/iridauploader/parsers/miniseq/sample_parser.py b/iridauploader/parsers/miniseq/sample_parser.py index d447bdb7..73bcd8d5 100644 --- a/iridauploader/parsers/miniseq/sample_parser.py +++ b/iridauploader/parsers/miniseq/sample_parser.py @@ -1,5 +1,5 @@ import re -from os import path, walk +from os import path from collections import OrderedDict from copy import deepcopy import logging @@ -86,56 +86,16 @@ def parse_metadata(sample_sheet_file): return metadata_dict -def build_sequencing_run_from_samples(sample_sheet_file, metadata): +def parse_sample_list(sample_sheet_file, run_data_directory, run_data_directory_file_list): """ - Create a SequencingRun object with full project/sample/sequence_file structure + Creates a list of Sample Objects - :param sample_sheet_file: - :param metadata: - :return: SequencingRun - """ - sample_list = _parse_sample_list(sample_sheet_file) - - logging.debug("Building SequencingRun from parsed data") - - # create list of projects and add samples to appropriate project - project_list = [] - for sample in sample_list: - project = None - for p in project_list: - if sample.get('sample_project') == p.id: - project = p - if project is None: - project = model.Project(id=sample.get('sample_project')) - project_list.append(project) - - project.add_sample(sample) - - sequence_run = model.SequencingRun(metadata, project_list) - logging.debug("SequencingRun built") - return sequence_run - - -def _parse_sample_list(sample_sheet_file): - """ - Creates a list of all samples in the sample_sheet_file, with accompanying data/metadata - - :param sample_sheet_file: - :return: list of samples + :param sample_sheet_file: Sample Sheet file + :param run_data_directory: Data directory including run directory (e.g. 
my_run/Data/Intensities/BaseCalls) + :param run_data_directory_file_list: The list of all files in the data directory + :return: list of Sample objects """ sample_list = _parse_samples(sample_sheet_file) - sample_sheet_dir = path.dirname(sample_sheet_file) - partial_data_dir = path.join(sample_sheet_dir, "Alignment_1") - # Verify the partial path exits, path could not exist if there was a sequencing error - # Also, if someone runs the miniseq parser on a miseq directory, this is the failure point - if not path.exists(partial_data_dir): - raise exceptions.SequenceFileError( - ("The uploader was unable to find the data directory with the path: {}, Verify that the run directory is " - "undamaged, and that it is a MiniSeq sequencing run.").format(partial_data_dir)) - - # get the directories [1] get the first directory [0] - data_dir = path.join(partial_data_dir, next(walk(partial_data_dir))[1][0], "Fastq") - data_dir_file_list = next(walk(data_dir))[2] # Create a file list of the data directory, only hit the os once for sample in sample_list: properties_dict = _parse_out_sequence_file(sample) @@ -145,7 +105,7 @@ def _parse_sample_list(sample_sheet_file): sample_name=re.escape(sample.sample_name), sample_number=sample.sample_number) logging.info("Looking for files with pattern {}".format(file_pattern)) regex = re.compile(file_pattern) - pf_list = list(filter(regex.search, data_dir_file_list)) + pf_list = list(filter(regex.search, run_data_directory_file_list)) if not pf_list: # OK. So we didn't find any files using the **correct** file name # definition according to Illumina. Let's try again with our deprecated @@ -155,7 +115,7 @@ def _parse_sample_list(sample_sheet_file): logging.info("Looking for files with pattern {}".format(file_pattern)) regex = re.compile(file_pattern) - pf_list = list(filter(regex.search, data_dir_file_list)) + pf_list = list(filter(regex.search, run_data_directory_file_list)) if not pf_list: # we **still** didn't find anything. It's pretty likely, then that @@ -166,18 +126,18 @@ def _parse_sample_list(sample_sheet_file): ".fastq.gz for the sample in your sample sheet with name {} in the directory {}. " "This usually happens when the Illumina MiniSeq Reporter tool " "does not generate any FastQ data.").format( - sample.sample_name, data_dir)) + sample.sample_name, run_data_directory)) # List of files may be invalid if directory searching in has been modified by user if not _validate_pf_list(pf_list): raise exceptions.SequenceFileError( ("The following file list {} found in the directory {} is invalid. 
" "Please verify the folder containing the sequence files matches the SampleSheet file").format( - pf_list, data_dir)) + pf_list, run_data_directory)) # Add the dir to each file to create the full path for i in range(len(pf_list)): - pf_list[i] = path.join(data_dir, pf_list[i]) + pf_list[i] = path.join(run_data_directory, pf_list[i]) sq = model.SequenceFile(file_list=pf_list, properties_dict=properties_dict) sample.sequence_file = deepcopy(sq) From 00aea45f5d830f403e19435a5a7a3a0ee70c2b5f Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Fri, 17 Apr 2020 15:11:55 -0500 Subject: [PATCH 07/19] add warning about cloud deployment to miseq parser --- iridauploader/parsers/miseq/parser.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/iridauploader/parsers/miseq/parser.py b/iridauploader/parsers/miseq/parser.py index 361a8a88..56418d4c 100644 --- a/iridauploader/parsers/miseq/parser.py +++ b/iridauploader/parsers/miseq/parser.py @@ -42,6 +42,9 @@ def get_full_data_directory(sample_sheet): """ Returns the path to where the sequence data files can be found, including the sample_sheet directory + Note, this hits the os, and as such is not to be used with cloud solutions. + For cloud solutions, use get_relative_data_directory() and solve the actual path for your cloud environment + :param sample_sheet: Sample sheet acts as the starting point for the data directory :return: a string which represents the concatenated path components, as per os.path.join """ From b559e32becf6ab35d200d00757ac4371165f02cb Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Mon, 27 Apr 2020 15:49:49 -0500 Subject: [PATCH 08/19] fix miseq parser tests --- iridauploader/parsers/__init__.py | 1 + .../tests/parsers/miseq/test_parser.py | 17 +++++-- .../tests/parsers/miseq/test_sample_parser.py | 50 +++++++------------ 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/iridauploader/parsers/__init__.py b/iridauploader/parsers/__init__.py index c399b7cb..f81d0c0c 100644 --- a/iridauploader/parsers/__init__.py +++ b/iridauploader/parsers/__init__.py @@ -1,3 +1,4 @@ from iridauploader.parsers.parsers import Parser from iridauploader.parsers.parsers import supported_parsers from iridauploader.parsers import exceptions +from iridauploader.parsers import common diff --git a/iridauploader/tests/parsers/miseq/test_parser.py b/iridauploader/tests/parsers/miseq/test_parser.py index f869949f..63b04637 100644 --- a/iridauploader/tests/parsers/miseq/test_parser.py +++ b/iridauploader/tests/parsers/miseq/test_parser.py @@ -192,6 +192,17 @@ def test_valid_run(self): """ sample_sheet = path.join(path_to_module, "fake_ngs_data", "SampleSheet.csv") - res = Parser.get_sequencing_run(sample_sheet) - - self.assertEqual(type(res), model.SequencingRun) + sequencing_run = Parser.get_sequencing_run(sample_sheet) + + # Returns a SequencingRun + self.assertEqual(type(sequencing_run), model.SequencingRun) + # Includes a single project + self.assertEqual(len(sequencing_run.project_list), 1) + # is of type Project + self.assertEqual(type(sequencing_run.project_list[0]), model.Project) + # Project has 3 samples + self.assertEqual(len(sequencing_run.project_list[0].sample_list), 3) + # samples are of type Sample + self.assertEqual(type(sequencing_run.project_list[0].sample_list[0]), model.Sample) + # samples have SequenceFile + self.assertEqual(type(sequencing_run.project_list[0].sample_list[0].sequence_file), model.SequenceFile) diff --git a/iridauploader/tests/parsers/miseq/test_sample_parser.py 
b/iridauploader/tests/parsers/miseq/test_sample_parser.py index d1e5e046..f6229963 100644 --- a/iridauploader/tests/parsers/miseq/test_sample_parser.py +++ b/iridauploader/tests/parsers/miseq/test_sample_parser.py @@ -208,31 +208,6 @@ class TestBuildSequencingRunFromSamples(unittest.TestCase): def setUp(self): print("\nStarting " + self.__module__ + ": " + self._testMethodName) - def test_build_valid(self): - """ - When given a valid directory, ensure a valid SequencingRun is built with Projects, Samples, ect - :return: - """ - sheet_file = path.join(path_to_module, "fake_ngs_data", - "SampleSheet.csv") - meta_data = sample_parser.parse_metadata(sheet_file) - - sequencing_run = sample_parser.build_sequencing_run_from_samples(sheet_file, meta_data) - - # Returns a SequencingRun - self.assertEqual(type(sequencing_run), model.SequencingRun) - # Includes a single project - self.assertEqual(len(sequencing_run.project_list), 1) - # is of type Project - self.assertEqual(type(sequencing_run.project_list[0]), model.Project) - # Project has 3 samples - self.assertEqual(len(sequencing_run.project_list[0].sample_list), 3) - # samples are of type Sample - self.assertEqual(type(sequencing_run.project_list[0].sample_list[0]), model.Sample) - # samples have SequenceFile - self.assertEqual(type(sequencing_run.project_list[0].sample_list[0].sequence_file), model.SequenceFile) - - class TestGetCsvReader(unittest.TestCase): """ Test that the csv reader behaves as expected @@ -339,8 +314,11 @@ def test_valid(self): Ensure a a parsed valid directory matches the expected sample list :return: """ - sheet_file = path.join(path_to_module, "fake_ngs_data", - "SampleSheet.csv") + directory = path.join(path_to_module, "fake_ngs_data") + sheet_file = path.join(directory, "SampleSheet.csv") + data_dir = path.join(directory, parsers.miseq.Parser.get_relative_data_directory()) + file_list = parsers.common.get_file_list(data_dir) + sample = model.Sample( "01-1111", @@ -371,15 +349,15 @@ def test_valid(self): "fake_ngs_data", "Data", "Intensities", "BaseCalls", "01-1111_S1_L001_R1_001.fastq.gz") file_path_2 = path.join(path_to_module, "fake_ngs_data", "Data", "Intensities", "BaseCalls", "01-1111_S1_L001_R2_001.fastq.gz") - file_list = [file_path_1, file_path_2] + raw_file_list = [file_path_1, file_path_2] - res = sample_parser._parse_sample_list(sheet_file) + res = sample_parser.parse_sample_list(sample_sheet_file=sheet_file, run_data_directory=data_dir, run_data_directory_file_list=file_list) # Check sample is the same self.assertEqual(res[0].get_uploadable_dict(), sample.get_uploadable_dict()) # Check sequencing file is correct self.assertEqual(res[0].sequence_file.properties_dict, sequence_file_properties) - self.assertEqual(res[0].sequence_file.file_list.sort(), file_list.sort()) + self.assertEqual(res[0].sequence_file.file_list.sort(), raw_file_list.sort()) def test_not_pf_list(self): """ @@ -387,10 +365,12 @@ def test_not_pf_list(self): :return: """ directory = path.join(path_to_module, "ngs_not_pf_list") + data_dir = path.join(directory, parsers.miseq.Parser.get_relative_data_directory()) + file_list = parsers.common.get_file_list(data_dir) file_path = path.join(directory, "SampleSheet.csv") with self.assertRaises(SequenceFileError): - res = sample_parser._parse_sample_list(file_path) + res = sample_parser.parse_sample_list(sample_sheet_file=file_path, run_data_directory=data_dir, run_data_directory_file_list=file_list) def test_not_valid_pf_list(self): """ @@ -398,17 +378,21 @@ def test_not_valid_pf_list(self): 
:return: """ directory = path.join(path_to_module, "ngs_not_valid_pf_list") + data_dir = path.join(directory, parsers.miseq.Parser.get_relative_data_directory()) + file_list = parsers.common.get_file_list(data_dir) file_path = path.join(directory, "SampleSheet.csv") with self.assertRaises(SequenceFileError): - res = sample_parser._parse_sample_list(file_path) + res = sample_parser.parse_sample_list(sample_sheet_file=file_path, run_data_directory=data_dir, run_data_directory_file_list=file_list) def test_space_in_sample_name(self): directory = path.join(path_to_module, "ngs_space_in_sample_name") + data_dir = path.join(directory, parsers.miseq.Parser.get_relative_data_directory()) + file_list = parsers.common.get_file_list(data_dir) file_path = path.join(directory, "SampleSheet.csv") # Just making sure this doesn't thow an error - sample_parser._parse_sample_list(file_path) + sample_parser.parse_sample_list(sample_sheet_file=file_path, run_data_directory=data_dir, run_data_directory_file_list=file_list) class TestParseSamples(unittest.TestCase): From 29d23a4c26cd74d503b571d1b0a2ff8e5fe8f352 Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Mon, 27 Apr 2020 16:06:36 -0500 Subject: [PATCH 09/19] cleanup empty test suite --- iridauploader/tests/parsers/miseq/test_sample_parser.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/iridauploader/tests/parsers/miseq/test_sample_parser.py b/iridauploader/tests/parsers/miseq/test_sample_parser.py index f6229963..b5ba259e 100644 --- a/iridauploader/tests/parsers/miseq/test_sample_parser.py +++ b/iridauploader/tests/parsers/miseq/test_sample_parser.py @@ -200,14 +200,6 @@ def test_parse_metadata_extra_commas(self): self.assertEqual(correct_metadata, meta_data) -class TestBuildSequencingRunFromSamples(unittest.TestCase): - """ - Test building the sequencing run from a sample sheet with a csv reader - """ - - def setUp(self): - print("\nStarting " + self.__module__ + ": " + self._testMethodName) - class TestGetCsvReader(unittest.TestCase): """ Test that the csv reader behaves as expected From 013b75970ee52be5e5f9f02b03719a0f2c5ab43c Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Mon, 27 Apr 2020 16:29:01 -0500 Subject: [PATCH 10/19] fix miniseq tests --- iridauploader/parsers/miniseq/parser.py | 4 +- .../tests/parsers/miniseq/test_parser.py | 19 ++++- .../parsers/miniseq/test_sample_parser.py | 79 ++++++++----------- 3 files changed, 50 insertions(+), 52 deletions(-) diff --git a/iridauploader/parsers/miniseq/parser.py b/iridauploader/parsers/miniseq/parser.py index 04889fe2..6284d49c 100644 --- a/iridauploader/parsers/miniseq/parser.py +++ b/iridauploader/parsers/miniseq/parser.py @@ -57,8 +57,8 @@ def get_full_data_directory(sample_sheet): # Also, if someone runs the miniseq parser on a miseq directory, this is the failure point if not os.path.exists(partial_data_dir): raise exceptions.DirectoryError( - ("The uploader was unable to find the data directory with the path: {}, Verify that the run directory is " - "undamaged, and that it is a MiniSeq sequencing run.").format(partial_data_dir)) + ("The uploader was unable to find the data directory, Verify that the run directory is " + "undamaged, and that it is a MiniSeq sequencing run."), partial_data_dir) # get the directories [1] get the first directory [0] data_dir = os.path.join(partial_data_dir, next(os.walk(partial_data_dir))[1][0], "Fastq") diff --git a/iridauploader/tests/parsers/miniseq/test_parser.py b/iridauploader/tests/parsers/miniseq/test_parser.py index e362d873..22c30f74 
100644 --- a/iridauploader/tests/parsers/miniseq/test_parser.py +++ b/iridauploader/tests/parsers/miniseq/test_parser.py @@ -183,7 +183,7 @@ def test_invalid_sample_sheets(self): self.assertEqual(type(validation_result), model.ValidationResult) for error in validation_result.error_list: - self.assertEqual(type(error), SampleSheetError) + self.assertEqual(type(error), DirectoryError) def test_valid_run(self): """ @@ -192,6 +192,17 @@ def test_valid_run(self): """ sample_sheet = path.join(path_to_module, "fake_ngs_data", "SampleSheet.csv") - res = Parser.get_sequencing_run(sample_sheet) - - self.assertEqual(type(res), model.SequencingRun) + sequencing_run = Parser.get_sequencing_run(sample_sheet) + + # Returns a SequencingRun + self.assertEqual(type(sequencing_run), model.SequencingRun) + # Includes a single project + self.assertEqual(len(sequencing_run.project_list), 1) + # is of type Project + self.assertEqual(type(sequencing_run.project_list[0]), model.Project) + # Project has 3 samples + self.assertEqual(len(sequencing_run.project_list[0].sample_list), 3) + # samples are of type Sample + self.assertEqual(type(sequencing_run.project_list[0].sample_list[0]), model.Sample) + # samples have SequenceFile + self.assertEqual(type(sequencing_run.project_list[0].sample_list[0].sequence_file), model.SequenceFile) diff --git a/iridauploader/tests/parsers/miniseq/test_sample_parser.py b/iridauploader/tests/parsers/miniseq/test_sample_parser.py index 6b8237c8..cb1520e1 100644 --- a/iridauploader/tests/parsers/miniseq/test_sample_parser.py +++ b/iridauploader/tests/parsers/miniseq/test_sample_parser.py @@ -4,6 +4,7 @@ from csv import reader from io import StringIO +from iridauploader import parsers import iridauploader.parsers.miniseq.sample_parser as sample_parser from iridauploader.parsers.exceptions import SampleSheetError, SequenceFileError from iridauploader.parsers import common @@ -174,39 +175,6 @@ def test_parse_metadata_extra_commas(self): self.assertEqual(correct_metadata, meta_data) -class TestBuildSequencingRunFromSamples(unittest.TestCase): - """ - Test building the sequencing run from a sample sheet with a csv reader - """ - - def setUp(self): - print("\nStarting " + self.__module__ + ": " + self._testMethodName) - - def test_build_valid(self): - """ - When given a valid directory, ensure a valid SequencingRun is built with Projects, Samples, ect - :return: - """ - sheet_file = path.join(path_to_module, "fake_ngs_data", - "SampleSheet.csv") - meta_data = sample_parser.parse_metadata(sheet_file) - - sequencing_run = sample_parser.build_sequencing_run_from_samples(sheet_file, meta_data) - - # Returns a SequencingRun - self.assertEqual(type(sequencing_run), model.SequencingRun) - # Includes a single project - self.assertEqual(len(sequencing_run.project_list), 1) - # is of type Project - self.assertEqual(type(sequencing_run.project_list[0]), model.Project) - # Project has 3 samples - self.assertEqual(len(sequencing_run.project_list[0].sample_list), 3) - # samples are of type Sample - self.assertEqual(type(sequencing_run.project_list[0].sample_list[0]), model.Sample) - # samples have SequenceFile - self.assertEqual(type(sequencing_run.project_list[0].sample_list[0].sequence_file), model.SequenceFile) - - class TestGetCsvReader(unittest.TestCase): """ Test that the csv reader behaves as expected @@ -309,8 +277,11 @@ def test_valid(self): Ensure a a parsed valid directory matches the expected sample list :return: """ - sheet_file = path.join(path_to_module, "fake_ngs_data", - "SampleSheet.csv") 
+ directory = path.join(path_to_module, "fake_ngs_data") + sheet_file = path.join(directory, "SampleSheet.csv") + data_dir = path.join(directory, parsers.miniseq.Parser.get_relative_data_directory()) + data_dir = data_dir.replace("*", "some_dir") + file_list = parsers.common.get_file_list(data_dir) sample = model.Sample( "01-1111", @@ -339,15 +310,16 @@ def test_valid(self): "fake_ngs_data", "Alignment_1", "some_dir", "Fastq", "01-1111_S1_L001_R1_001.fastq.gz") file_path_2 = path.join(path_to_module, "fake_ngs_data", "Alignment_1", "some_dir", "Fastq", "01-1111_S1_L001_R2_001.fastq.gz") - file_list = [file_path_1, file_path_2] + raw_file_list = [file_path_1, file_path_2] + + res = sample_parser.parse_sample_list(sample_sheet_file=sheet_file, run_data_directory=data_dir, run_data_directory_file_list=file_list) - res = sample_parser._parse_sample_list(sheet_file) # Check sample is the same self.assertEqual(res[0].get_uploadable_dict(), sample.get_uploadable_dict()) # Check sequencing file is correct self.assertEqual(res[0].sequence_file.properties_dict, sequence_file_properties) - self.assertEqual(res[0].sequence_file.file_list.sort(), file_list.sort()) + self.assertEqual(res[0].sequence_file.file_list.sort(), raw_file_list.sort()) def test_not_pf_list(self): """ @@ -355,10 +327,13 @@ def test_not_pf_list(self): :return: """ directory = path.join(path_to_module, "ngs_not_pf_list") + data_dir = path.join(directory, parsers.miniseq.Parser.get_relative_data_directory()) + data_dir = data_dir.replace("*", "some_dir") + file_list = parsers.common.get_file_list(data_dir) file_path = path.join(directory, "SampleSheet.csv") with self.assertRaises(SequenceFileError): - res = sample_parser._parse_sample_list(file_path) + res = sample_parser.parse_sample_list(sample_sheet_file=file_path, run_data_directory=data_dir, run_data_directory_file_list=file_list) def test_not_valid_pf_list(self): """ @@ -366,17 +341,24 @@ def test_not_valid_pf_list(self): :return: """ directory = path.join(path_to_module, "ngs_not_valid_pf_list") + data_dir = path.join(directory, parsers.miniseq.Parser.get_relative_data_directory()) + data_dir = data_dir.replace("*", "some_dir") + file_list = parsers.common.get_file_list(data_dir) file_path = path.join(directory, "SampleSheet.csv") with self.assertRaises(SequenceFileError): - res = sample_parser._parse_sample_list(file_path) + res = sample_parser.parse_sample_list(sample_sheet_file=file_path, run_data_directory=data_dir, run_data_directory_file_list=file_list) + def test_space_in_sample_name(self): directory = path.join(path_to_module, "ngs_space_in_sample_name") + data_dir = path.join(directory, parsers.miniseq.Parser.get_relative_data_directory()) + data_dir = data_dir.replace("*", "some_dir") + file_list = parsers.common.get_file_list(data_dir) file_path = path.join(directory, "SampleSheet.csv") # Just making sure this doesn't throw an error - sample_parser._parse_sample_list(file_path) + sample_parser.parse_sample_list(sample_sheet_file=file_path, run_data_directory=data_dir, run_data_directory_file_list=file_list) class TestParseSamples(unittest.TestCase): @@ -508,12 +490,17 @@ def test_build_valid_with_description_field(self): When given a valid directory, ensure a valid SequencingRun is built with Projects, Samples, ect :return: """ - - sheet_file = path.join(path_to_module, "iseq_with_desc_field", - "SampleSheet.csv") + directory = path.join(path_to_module, "iseq_with_desc_field") + sheet_file = path.join(directory, "SampleSheet.csv") meta_data = 
sample_parser.parse_metadata(sheet_file) - - sequencing_run = sample_parser.build_sequencing_run_from_samples(sheet_file, meta_data) + data_dir = path.join(directory, parsers.miniseq.Parser.get_relative_data_directory()) + data_dir = data_dir.replace("*", "some_dir") + file_list = parsers.common.get_file_list(data_dir) + + sample_list = sample_parser.parse_sample_list(sample_sheet_file=sheet_file, + run_data_directory=data_dir, + run_data_directory_file_list=file_list) + sequencing_run = parsers.common.build_sequencing_run_from_samples(sample_list, meta_data) # Returns a SequencingRun self.assertEqual(type(sequencing_run), model.SequencingRun) From 398b50464ad467a7dd6d6ec9880475c94cbaaf3d Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Tue, 28 Apr 2020 17:35:24 -0500 Subject: [PATCH 11/19] improved testing for common csv parser --- .../tests/parsers/common/test_common.py | 61 +++++++++++++++++++ .../tests/parsers/common/test_csv_reader.csv | 24 ++++++++ 2 files changed, 85 insertions(+) create mode 100644 iridauploader/tests/parsers/common/test_csv_reader.csv diff --git a/iridauploader/tests/parsers/common/test_common.py b/iridauploader/tests/parsers/common/test_common.py index 27288637..450fb01a 100644 --- a/iridauploader/tests/parsers/common/test_common.py +++ b/iridauploader/tests/parsers/common/test_common.py @@ -2,6 +2,7 @@ import os from iridauploader.parsers import common +from iridauploader.parsers.exceptions import SampleSheetError path_to_module = os.path.abspath(os.path.dirname(__file__)) if len(path_to_module) == 0: @@ -40,3 +41,63 @@ def test_find_none(self): res = common.find_directory_list(directory) self.assertEqual(res, []) + + +class TestGetCsvReader(unittest.TestCase): + """ + Test that the csv reader behaves as expected + """ + + def setUp(self): + print("\nStarting " + self.__module__ + ": " + self._testMethodName) + + def test_get_csv_reader_valid_sheet(self): + """ + Given a valid sample sheet, ensure the parsed sheet matches expected output + :return: + """ + sheet_file = os.path.join(path_to_module, "test_csv_reader.csv") + + lines = common.get_csv_reader(sheet_file) + # This is a sample of what the miseq sample sheet looks like, but it also makes a good + # example for what we want our csv reader to be able to parse. 
+ correct_lines = [ + ['[Header]'], + ['IEMFileVersion', '4'], + ['Investigator Name', 'Some Guy'], + ['Experiment Name', '1'], + ['Date', '10/15/2013'], + ['Workflow', 'GenerateFASTQ'], + ['Application', 'FASTQ Only'], + ['Assay', 'Nextera XT'], + ['Description', 'Superbug'], + ['Chemistry', 'Amplicon'], + [], + ['[Reads]'], + ['251'], + ['250'], + [], + ['[Settings]'], + ['ReverseComplement', '0'], + ['Adapter', 'AAAAGGGGAAAAGGGGAAA'], + [], + ['[Data]'], + ['Sample_ID', 'Sample_Name', 'Sample_Plate', 'Sample_Well', 'I7_Index_ID', 'index', 'I5_Index_ID', 'index2', + 'Sample_Project', 'Description'], + ['01-1111', '01-1111', '1', '01', 'N01', 'AAAAAAAA', 'S01', 'TTTTTTTT', '6', 'Super bug '], + ['02-2222', '02-2222', '2', '02', 'N02', 'GGGGGGGG', 'S02', 'CCCCCCCC', '6', 'Scary bug '], + ['03-3333', '03-3333', '3', '03', 'N03', 'CCCCCCCC', 'S03', 'GGGGGGGG', '6', 'Deadly bug '] + ] + + for line, c_line in zip(lines, correct_lines): + self.assertEqual(line, c_line) + + def test_get_csv_reader_no_sheet(self): + """ + When no sheet is given to parser, throw error + :return: + """ + sheet_file = os.path.join(path_to_module, "fake_dir_data") + + with self.assertRaises(SampleSheetError): + common.get_csv_reader(sheet_file) diff --git a/iridauploader/tests/parsers/common/test_csv_reader.csv b/iridauploader/tests/parsers/common/test_csv_reader.csv new file mode 100644 index 00000000..f66bb864 --- /dev/null +++ b/iridauploader/tests/parsers/common/test_csv_reader.csv @@ -0,0 +1,24 @@ +[Header] +IEMFileVersion,4 +Investigator Name,Some Guy +Experiment Name,1 +Date,10/15/2013 +Workflow,GenerateFASTQ +Application,FASTQ Only +Assay,Nextera XT +Description,Superbug +Chemistry,Amplicon + +[Reads] +251 +250 + +[Settings] +ReverseComplement,0 +Adapter,AAAAGGGGAAAAGGGGAAA + +[Data] +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description +01-1111,01-1111,1,01,N01,AAAAAAAA,S01,TTTTTTTT,6,Super bug +02-2222,02-2222,2,02,N02,GGGGGGGG,S02,CCCCCCCC,6,Scary bug +03-3333,03-3333,3,03,N03,CCCCCCCC,S03,GGGGGGGG,6,Deadly bug From 885c93ed34c93ba2e94109f883a02276ae8eac41 Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Tue, 28 Apr 2020 20:24:44 -0500 Subject: [PATCH 12/19] updated directory parser and tests for cloud deploy --- iridauploader/parsers/directory/parser.py | 21 +- .../parsers/directory/sample_parser.py | 108 ++++----- .../tests/parsers/directory/test_parser.py | 85 +++++++ .../parsers/directory/test_sample_parser.py | 211 +++--------------- 4 files changed, 182 insertions(+), 243 deletions(-) diff --git a/iridauploader/parsers/directory/parser.py b/iridauploader/parsers/directory/parser.py index d0641883..a64fd2d2 100644 --- a/iridauploader/parsers/directory/parser.py +++ b/iridauploader/parsers/directory/parser.py @@ -2,6 +2,7 @@ import os import iridauploader.progress as progress +import iridauploader.model as model from iridauploader.parsers import exceptions from iridauploader.parsers import common @@ -68,7 +69,7 @@ def get_sample_sheet(directory): directory) sample_sheet_file_name = Parser.SAMPLE_SHEET_FILE_NAME - file_list = next(os.walk(directory))[2] # Gets the list of files in the directory + file_list = common.get_file_list(directory) if sample_sheet_file_name not in file_list: logging.error("No sample sheet file in the Directory Upload format found") raise exceptions.DirectoryError("The directory {} has no sample sheet file in the Directory Upload format " @@ -79,7 +80,7 @@ def get_sample_sheet(directory): return 
os.path.join(directory, sample_sheet_file_name) @staticmethod - def get_sequencing_run(sample_sheet): + def get_sequencing_run(sample_sheet, run_data_directory_file_list=None): """ Does local validation on the integrity of the run directory / sample sheet @@ -89,6 +90,18 @@ def get_sequencing_run(sample_sheet): :return: SequencingRun """ + # get file list + validation_result = model.ValidationResult() + + try: + if run_data_directory_file_list is None: + data_dir = os.path.dirname(sample_sheet) + run_data_directory_file_list = common.get_file_list(data_dir) + except exceptions.DirectoryError as error: + validation_result.add_error(error) + logging.error("Errors occurred while parsing files") + raise exceptions.ValidationError("Errors occurred while parsing files", validation_result) + # Try to get the sample sheet, validate that the sample sheet is valid validation_result = validation.validate_sample_sheet(sample_sheet) if not validation_result.is_valid(): @@ -97,7 +110,9 @@ def get_sequencing_run(sample_sheet): # Try to build sequencing run from sample sheet & meta data, raise validation error if errors occur try: - sequencing_run = sample_parser.build_sequencing_run_from_samples(sample_sheet) + sample_list = sample_parser.parse_sample_list(sample_sheet, run_data_directory_file_list) + run_metadata = sample_parser.parse_metadata(sample_list) + sequencing_run = common.build_sequencing_run_from_samples(sample_list, run_metadata) except exceptions.SequenceFileError as error: validation_result.add_error(error) logging.error("Errors occurred while building sequence run from sample sheet") diff --git a/iridauploader/parsers/directory/sample_parser.py b/iridauploader/parsers/directory/sample_parser.py index 62639fb5..a1d26d63 100644 --- a/iridauploader/parsers/directory/sample_parser.py +++ b/iridauploader/parsers/directory/sample_parser.py @@ -8,86 +8,47 @@ from iridauploader.parsers import common -def build_sequencing_run_from_samples(sample_sheet_file): +def parse_metadata(sample_list): """ - Create a SequencingRun object with full project/sample/sequence_file structure + Determine if samples are paired or single end, and return metadata to match - :param sample_sheet_file: - :return: SequencingRun + :param sample_list: List of Sample objects + :return: metadata dictionary """ - sample_list = _parse_sample_list(sample_sheet_file) - - logging.debug("Building SequencingRun from parsed data") - - # create list of projects and add samples to appropriate project - project_list = [] - for sample_number, sample in enumerate(sample_list): - # get data from data dict - sample_name = sample['Sample_Name'] - project_id = sample['Project_ID'] - file_f = sample['File_Forward'] - file_r = sample['File_Reverse'] - - project = None - # see if project exists - for p in project_list: - if project_id == p.id: - project = p - # create project if it doesn't exitt yet - if project is None: - project = model.Project(id=project_id) - project_list.append(project) - - # create sequence file - if len(file_r) > 0: - # paired end read - sq = model.SequenceFile(properties_dict=None, file_list=[file_f, file_r]) - else: - # single end read - sq = model.SequenceFile(properties_dict=None, file_list=[file_f]) - - # create sample - sample_obj = model.Sample(sample_name=sample_name, sample_number=sample_number + 1) - - # add sequence file to sample - sample_obj.sequence_file = deepcopy(sq) - - # add sample to project - project.add_sample(sample_obj) - # add the layout type to the sequencing run so we know if it is paired or 
single end - if project_list[0].sample_list[0].sequence_file.is_paired_end(): + if sample_list[0].sequence_file.is_paired_end(): metadata = {'layoutType': 'PAIRED_END'} else: metadata = {'layoutType': 'SINGLE_END'} - sequence_run = model.SequencingRun(metadata=metadata, project_list=project_list) - logging.debug("SequencingRun built") - return sequence_run + return metadata -def _parse_sample_list(sample_sheet_file): +def parse_sample_list(sample_sheet_file, run_data_directory_file_list): """ Creates a list of all sample data in the sample_sheet_file Verifies data is valid for uploading :param sample_sheet_file: - :return: list of sample data dicts + :param run_data_directory_file_list: list of all files + :return: list of Sample objects """ - sample_dict_list = _parse_samples(sample_sheet_file) + sample_list = _parse_samples(sample_sheet_file) data_dir = path.dirname(sample_sheet_file) - data_dir_file_list = next(walk(data_dir))[2] # Create a file list of the data directory, only hit the os once data_dir_file_list_full_path = [] - for file_name in data_dir_file_list: + for file_name in run_data_directory_file_list: data_dir_file_list_full_path.append(path.join(path.abspath(data_dir), file_name)) + has_paired_end_read = False has_single_end_read = False logging.info("Verifying data parsed from sample sheet {}".format(sample_sheet_file)) - for sample_dict in sample_dict_list: + for sample in sample_list: + + sample_dict = sample.get_uploadable_dict() paired_end_read = len(sample_dict['File_Reverse']) > 0 # keep track if we have both paired and single end reads @@ -97,14 +58,14 @@ def _parse_sample_list(sample_sheet_file): has_single_end_read = True # Check if file names are in the files we found in the directory - if ((sample_dict['File_Forward'] not in data_dir_file_list) and ( + if ((sample_dict['File_Forward'] not in run_data_directory_file_list) and ( sample_dict['File_Forward'] not in data_dir_file_list_full_path)): raise exceptions.SampleSheetError( ("Your sample sheet is malformed. {} Does not match any file in the directory {}" "".format(sample_dict['File_Forward'], data_dir)), sample_sheet_file ) - if ((paired_end_read and sample_dict['File_Reverse'] not in data_dir_file_list) and ( + if ((paired_end_read and sample_dict['File_Reverse'] not in run_data_directory_file_list) and ( paired_end_read and sample_dict['File_Reverse'] not in data_dir_file_list_full_path)): raise exceptions.SampleSheetError( ("Your sample sheet is malformed. 
{} Does not match any file in the directory {}" @@ -112,11 +73,19 @@ def _parse_sample_list(sample_sheet_file): sample_sheet_file ) + # create file list of full paths + file_list = [] # Add the dir to each file to create the full path if sample_dict['File_Forward'] not in data_dir_file_list_full_path: sample_dict['File_Forward'] = path.join(data_dir, sample_dict['File_Forward']) + file_list.append(sample_dict['File_Forward']) if paired_end_read and sample_dict['File_Reverse'] not in data_dir_file_list_full_path: sample_dict['File_Reverse'] = path.join(data_dir, sample_dict['File_Reverse']) + file_list.append(sample_dict['File_Reverse']) + + # Create sequence file object and attach to sample + sq = model.SequenceFile(file_list=file_list) + sample.sequence_file = deepcopy(sq) # Verify we don't have both single end and paired end reads if has_single_end_read and has_paired_end_read: @@ -127,7 +96,7 @@ def _parse_sample_list(sample_sheet_file): sample_sheet_file ) - return sample_dict_list + return sample_list def _parse_samples(sample_sheet_file): @@ -138,7 +107,8 @@ def _parse_samples(sample_sheet_file): arguments: sample_sheet_file -- path to SampleSheet.csv - returns a list containing dictionaries with the properties from the csv file + returns a list containing Sample objects that have been created by a + dictionary from the parsed out key:pair values from .csv file """ logging.info("Reading data from sample sheet {}".format(sample_sheet_file)) @@ -147,7 +117,7 @@ def _parse_samples(sample_sheet_file): # start with an ordered dictionary so that keys are ordered in the same # way that they are inserted. sample_dict = OrderedDict() - sample_dict_list = [] + sample_list = [] sample_key_list = ['Sample_Name', 'Project_ID', 'File_Forward', 'File_Reverse'] @@ -206,6 +176,22 @@ def _parse_samples(sample_sheet_file): sample_dict[key] = value - sample_dict_list.append(deepcopy(sample_dict)) - return sample_dict_list + sample_key_list = ['Sample_Name', 'Project_ID', 'File_Forward', 'File_Reverse'] + + new_sample_dict = deepcopy(sample_dict) + new_sample_name = new_sample_dict['Sample_Name'] + new_sample_project = new_sample_dict['Project_ID'] + new_sample_dict['sample_project'] = new_sample_project + del new_sample_dict['Sample_Name'] + del new_sample_dict['Project_ID'] + + sample = model.Sample( + sample_name=new_sample_name, + description="", + sample_number=sample_number + 1, + samp_dict=new_sample_dict) + + sample_list.append(sample) + + return sample_list diff --git a/iridauploader/tests/parsers/directory/test_parser.py b/iridauploader/tests/parsers/directory/test_parser.py index 41ae0c65..8365799d 100644 --- a/iridauploader/tests/parsers/directory/test_parser.py +++ b/iridauploader/tests/parsers/directory/test_parser.py @@ -182,3 +182,88 @@ def test_valid_run(self): res = Parser.get_sequencing_run(sample_sheet) self.assertEqual(type(res), model.SequencingRun) + + def test_build_valid(self): + """ + When given a valid directory, ensure a valid SequencingRun is built with Projects, Samples, ect + :return: + """ + sheet_file = path.join(path_to_module, "fake_dir_data", + "SampleList.csv") + + sequencing_run = Parser.get_sequencing_run(sheet_file) + + # Returns a SequencingRun + self.assertEqual(type(sequencing_run), model.SequencingRun) + # Includes 2 projects + self.assertEqual(len(sequencing_run.project_list), 2) + # is of type Project + self.assertEqual(type(sequencing_run.project_list[0]), model.Project) + # Project has 2 samples + 
self.assertEqual(len(sequencing_run.project_list[0].sample_list), 2) + # Other Project has 1 sample + self.assertEqual(len(sequencing_run.project_list[1].sample_list), 1) + # samples are of type Sample + self.assertEqual(type(sequencing_run.project_list[0].sample_list[0]), model.Sample) + # samples have SequenceFile + self.assertEqual(type(sequencing_run.project_list[0].sample_list[0].sequence_file), model.SequenceFile) + + def test_build_valid_extra_line_on_sample_list(self): + """ + Ensure a valid SequencingRun is made when extra lines are present in sample list + :return: + """ + sheet_file = path.join(path_to_module, "fake_dir_data", + "SampleList_with_space.csv") + + sequencing_run = Parser.get_sequencing_run(sheet_file) + + # Returns a SequencingRun + self.assertEqual(type(sequencing_run), model.SequencingRun) + # Includes 2 projects + self.assertEqual(len(sequencing_run.project_list), 2) + # is of type Project + self.assertEqual(type(sequencing_run.project_list[0]), model.Project) + # Project has 2 samples + self.assertEqual(len(sequencing_run.project_list[0].sample_list), 2) + # Other Project has 1 sample + self.assertEqual(len(sequencing_run.project_list[1].sample_list), 1) + # samples are of type Sample + self.assertEqual(type(sequencing_run.project_list[0].sample_list[0]), model.Sample) + # samples have SequenceFile + self.assertEqual(type(sequencing_run.project_list[0].sample_list[0].sequence_file), model.SequenceFile) + + def test_parse_samples_valid(self): + """ + Verify samples created from parser match expected samples + :return: + """ + sheet_file = path.join(path_to_module, "fake_dir_data", + "SampleList.csv") + + sample1 = model.Sample( + "my-sample-1", + "", + ) + + sample2 = model.Sample( + "my-sample-2", + "", + ) + + sample3 = model.Sample( + "my-sample-3", + "", + ) + + res = Parser.get_sequencing_run(sheet_file) + + self.assertEqual(res.metadata, {'layoutType': 'PAIRED_END'}) + self.assertEqual(res.project_list[0].id, "75") + self.assertEqual(res.project_list[1].id, "76") + self.assertEqual(res.project_list[0].sample_list[0].sample_name, + sample1.sample_name) + self.assertEqual(res.project_list[0].sample_list[1].sample_name, + sample2.sample_name) + self.assertEqual(res.project_list[1].sample_list[0].sample_name, + sample3.sample_name) diff --git a/iridauploader/tests/parsers/directory/test_sample_parser.py b/iridauploader/tests/parsers/directory/test_sample_parser.py index d90196b5..d338af4a 100644 --- a/iridauploader/tests/parsers/directory/test_sample_parser.py +++ b/iridauploader/tests/parsers/directory/test_sample_parser.py @@ -13,161 +13,6 @@ path_to_module = '.' 
-class TestBuildSequencingRunFromSamples(unittest.TestCase): - """ - Test building the sequencing run from a sample sheet with a csv reader - """ - - def setUp(self): - print("\nStarting " + self.__module__ + ": " + self._testMethodName) - - def test_build_valid(self): - """ - When given a valid directory, ensure a valid SequencingRun is built with Projects, Samples, ect - :return: - """ - sheet_file = path.join(path_to_module, "fake_dir_data", - "SampleList.csv") - - sequencing_run = sample_parser.build_sequencing_run_from_samples(sheet_file) - - # Returns a SequencingRun - self.assertEqual(type(sequencing_run), model.SequencingRun) - # Includes 2 projects - self.assertEqual(len(sequencing_run.project_list), 2) - # is of type Project - self.assertEqual(type(sequencing_run.project_list[0]), model.Project) - # Project has 2 samples - self.assertEqual(len(sequencing_run.project_list[0].sample_list), 2) - # Other Project has 1 sample - self.assertEqual(len(sequencing_run.project_list[1].sample_list), 1) - # samples are of type Sample - self.assertEqual(type(sequencing_run.project_list[0].sample_list[0]), model.Sample) - # samples have SequenceFile - self.assertEqual(type(sequencing_run.project_list[0].sample_list[0].sequence_file), model.SequenceFile) - - def test_build_valid_extra_line_on_sample_list(self): - """ - Ensure a valid SequencingRun is made when extra lines are present in sample list - :return: - """ - sheet_file = path.join(path_to_module, "fake_dir_data", - "SampleList_with_space.csv") - - sequencing_run = sample_parser.build_sequencing_run_from_samples(sheet_file) - - # Returns a SequencingRun - self.assertEqual(type(sequencing_run), model.SequencingRun) - # Includes 2 projects - self.assertEqual(len(sequencing_run.project_list), 2) - # is of type Project - self.assertEqual(type(sequencing_run.project_list[0]), model.Project) - # Project has 2 samples - self.assertEqual(len(sequencing_run.project_list[0].sample_list), 2) - # Other Project has 1 sample - self.assertEqual(len(sequencing_run.project_list[1].sample_list), 1) - # samples are of type Sample - self.assertEqual(type(sequencing_run.project_list[0].sample_list[0]), model.Sample) - # samples have SequenceFile - self.assertEqual(type(sequencing_run.project_list[0].sample_list[0].sequence_file), model.SequenceFile) - - def test_parse_samples_valid(self): - """ - Verify samples created from parser match expected samples - :return: - """ - sheet_file = path.join(path_to_module, "fake_dir_data", - "SampleList.csv") - - sample1 = model.Sample( - "my-sample-1", - "", - ) - - sample2 = model.Sample( - "my-sample-2", - "", - ) - - sample3 = model.Sample( - "my-sample-3", - "", - ) - - res = sample_parser.build_sequencing_run_from_samples(sheet_file) - - self.assertEqual(res.metadata, {'layoutType': 'PAIRED_END'}) - self.assertEqual(res.project_list[0].id, "75") - self.assertEqual(res.project_list[1].id, "76") - self.assertEqual(res.project_list[0].sample_list[0].get_uploadable_dict(), - sample1.get_uploadable_dict()) - self.assertEqual(res.project_list[0].sample_list[1].get_uploadable_dict(), - sample2.get_uploadable_dict()) - self.assertEqual(res.project_list[1].sample_list[0].get_uploadable_dict(), - sample3.get_uploadable_dict()) - - -class TestGetCsvReader(unittest.TestCase): - """ - Test that the csv reader behaves as expected - """ - - def setUp(self): - print("\nStarting " + self.__module__ + ": " + self._testMethodName) - - def test_get_csv_reader_valid_sheet(self): - """ - Given a valid sample sheet, ensure the parsed 
sheet matches expected output - :return: - """ - sheet_file = path.join(path_to_module, "fake_dir_data", - "test_csv_reader.csv") - - lines = parsers.common.get_csv_reader(sheet_file) - # This is a sample of what the miseq sample sheet looks like, but it also makes a good - # example for what we want our csv reader to be able to parse. - correct_lines = [ - ['[Header]'], - ['IEMFileVersion', '4'], - ['Investigator Name', 'Some Guy'], - ['Experiment Name', '1'], - ['Date', '10/15/2013'], - ['Workflow', 'GenerateFASTQ'], - ['Application', 'FASTQ Only'], - ['Assay', 'Nextera XT'], - ['Description', 'Superbug'], - ['Chemistry', 'Amplicon'], - [], - ['[Reads]'], - ['251'], - ['250'], - [], - ['[Settings]'], - ['ReverseComplement', '0'], - ['Adapter', 'AAAAGGGGAAAAGGGGAAA'], - [], - ['[Data]'], - ['Sample_ID', 'Sample_Name', 'Sample_Plate', 'Sample_Well', 'I7_Index_ID', 'index', 'I5_Index_ID', 'index2', - 'Sample_Project', 'Description'], - ['01-1111', '01-1111', '1', '01', 'N01', 'AAAAAAAA', 'S01', 'TTTTTTTT', '6', 'Super bug '], - ['02-2222', '02-2222', '2', '02', 'N02', 'GGGGGGGG', 'S02', 'CCCCCCCC', '6', 'Scary bug '], - ['03-3333', '03-3333', '3', '03', 'N03', 'CCCCCCCC', 'S03', 'GGGGGGGG', '6', 'Deadly bug '] - ] - - for line, c_line in zip(lines, correct_lines): - self.assertEqual(line, c_line) - - def test_get_csv_reader_no_sheet(self): - """ - When no sheet is given to parser, throw error - :return: - """ - sheet_file = path.join(path_to_module, "fake_dir_data") - - with self.assertRaises(SampleSheetError): - parsers.common.get_csv_reader(sheet_file) - - class TestParseSampleList(unittest.TestCase): """ test parsing the list of samples from a sample sheet @@ -189,15 +34,16 @@ def test_valid(self): file_path_2 = path.join(path_to_module, "fake_dir_data", "file_2.fastq.gz") - res = sample_parser._parse_sample_list(sheet_file) + run_data_directory_file_list = ["file_1.fastq.gz", "file_2.fastq.gz"] + res = sample_parser.parse_sample_list(sheet_file, run_data_directory_file_list) # Check we have 1 sample self.assertEqual(len(res), 1) # Check if data is correct - self.assertEqual(res[0]["Sample_Name"], "my-sample-1") - self.assertEqual(res[0]["Project_ID"], "75") - self.assertEqual(res[0]["File_Forward"], file_path_1) - self.assertEqual(res[0]["File_Reverse"], file_path_2) + self.assertEqual(res[0].sample_name, "my-sample-1") + self.assertEqual(res[0].get_uploadable_dict()["sample_project"], "75") + self.assertEqual(res[0].get_uploadable_dict()["File_Forward"], "file_1.fastq.gz") + self.assertEqual(res[0].get_uploadable_dict()["File_Reverse"], "file_2.fastq.gz") @patch("iridauploader.parsers.directory.sample_parser._parse_samples") def test_valid_full_file_path(self, mock_parse_samples): @@ -213,25 +59,32 @@ def test_valid_full_file_path(self, mock_parse_samples): file_path_2 = path.join(path_to_module, "fake_dir_data", "file_2.fastq.gz") - sample_dict_list = [OrderedDict([ - ('Sample_Name', 'my-sample-1'), - ('Project_ID', '75'), - ('File_Forward', path.abspath(file_path_1)), - ('File_Reverse', path.abspath(file_path_2)) - ])] + sample_list = [ + model.Sample( + sample_name='my-sample-1', + description="", + sample_number=0, + samp_dict={ + ('sample_project', '75'), + ('File_Forward', path.abspath(file_path_1)), + ('File_Reverse', path.abspath(file_path_2)) + } + ) + ] - mock_parse_samples.return_value = sample_dict_list + mock_parse_samples.return_value = sample_list - res = parsers.directory.sample_parser._parse_sample_list(sheet_file) + run_data_directory_file_list = ["file_1.fastq.gz", 
"file_2.fastq.gz"] + res = sample_parser.parse_sample_list(sheet_file, run_data_directory_file_list) mock_parse_samples.assert_called_with(sheet_file) # Check we have 1 sample self.assertEqual(len(res), 1) # Check if data is correct - self.assertEqual(res[0]["Sample_Name"], "my-sample-1") - self.assertEqual(res[0]["Project_ID"], "75") - self.assertEqual(res[0]["File_Forward"], path.abspath(file_path_1)) - self.assertEqual(res[0]["File_Reverse"], path.abspath(file_path_2)) + self.assertEqual(res[0].sample_name, "my-sample-1") + self.assertEqual(res[0].get_uploadable_dict()["sample_project"], "75") + self.assertEqual(res[0].get_uploadable_dict()["File_Forward"], path.abspath(file_path_1)) + self.assertEqual(res[0].get_uploadable_dict()["File_Reverse"], path.abspath(file_path_2)) def test_no_forward_read(self): """ @@ -242,7 +95,7 @@ def test_no_forward_read(self): file_path = path.join(directory, "list_no_forward.csv") with self.assertRaises(SampleSheetError): - res = sample_parser._parse_sample_list(file_path) + res = sample_parser.parse_sample_list(file_path, directory) def test_no_reverse_read(self): """ @@ -253,14 +106,14 @@ def test_no_reverse_read(self): file_path = path.join(directory, "list_no_reverse.csv") with self.assertRaises(SampleSheetError): - res = sample_parser._parse_sample_list(file_path) + res = sample_parser.parse_sample_list(file_path, directory) def test_mixed_paired_and_single_reads(self): directory = path.join(path_to_module, "fake_dir_data") file_path = path.join(directory, "list_mixed.csv") with self.assertRaises(SampleSheetError): - res = sample_parser._parse_sample_list(file_path) + res = sample_parser.parse_sample_list(file_path, directory) class TestParseSamples(unittest.TestCase): @@ -287,10 +140,10 @@ def test_valid(self): # Check we have 1 sample self.assertEqual(len(res), 1) # Check if data is correct - self.assertEqual(res[0]["Sample_Name"], "my-sample-1") - self.assertEqual(res[0]["Project_ID"], "75") - self.assertEqual(res[0]["File_Forward"], file_name_1) - self.assertEqual(res[0]["File_Reverse"], file_name_2) + self.assertEqual(res[0].sample_name, "my-sample-1") + self.assertEqual(res[0].get_uploadable_dict()["sample_project"], "75") + self.assertEqual(res[0].get_uploadable_dict()["File_Forward"], file_name_1) + self.assertEqual(res[0].get_uploadable_dict()["File_Reverse"], file_name_2) def test_no_forward_read(self): """ From 4cbdc9c6cee43ff6454fb5fd4def967fa0a0b782 Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Wed, 29 Apr 2020 16:37:16 -0500 Subject: [PATCH 13/19] added page on cloud deployment to dev docs --- docs/developers/cloud.md | 183 +++++++++++++++++++++++++++++++++++++++ docs/index.md | 2 + 2 files changed, 185 insertions(+) create mode 100644 docs/developers/cloud.md diff --git a/docs/developers/cloud.md b/docs/developers/cloud.md new file mode 100644 index 00000000..67d51fe2 --- /dev/null +++ b/docs/developers/cloud.md @@ -0,0 +1,183 @@ + +# Deploying the uploader to the cloud + +While there is not an end to end solution that you can deploy onto the cloud, the iridauploader does allow you to use it's modules to simplify your code for cloud deployment. + + +#### Why can't I just deploy straight to cloud? + +The main difficulty is that each cloud storage solution maintains files differently, and it would not be feasible for us to support every cloud environment available. + +## How to Deploy to cloud + +The simplest way is to incorperate the `iridauploader` modules from `pip` / `PyPi` . 
+ +`pip install iridauploader` + +Example for creating a new instance of the API, and a MiSeq Parser: + +```python +import iridauploader.api as api +import iridauploader.parsers as parsers + +api_instance = api.ApiCalls(client_id, client_secret, base_url, username, password, max_wait_time) +parser_instance = parsers.Parser.factory("miseq") +``` + +## Examples for deployment on Azure Cloud + +In these examples we have the following setup: +* We are using an Azure Function App using Python +* Files are stored in blob storage containers (in our example `myblobcontainer`) +* We use a BlobTrigger to run when a new run is uploaded with the path identifier `myblobcontainer/{name}.csv` + +Example `function.json` file: + +```json +{ + "scriptFile": "__init__.py", + "disabled": false, + "bindings": [ + { + "name": "myblob", + "type": "blobTrigger", + "direction": "in", + "path": "myblobcontainer/{name}.csv", + "connection":"AzureWebJobsStorage" + } + ] +} +``` + +For the following example, we have this simple setup at the top of our `__init__.py` function app file. + +```python +from azure.storage.blob import BlobServiceClient +from azure.storage.blob import BlobClient +from azure.storage.blob import ContainerClient +import azure.functions as func + +from iridauploader import parsers + + +# connect to our blob storage +connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING') +blob_service_client = BlobServiceClient.from_connection_string(connect_str) +# These strings could be fetched somehow, but this works for an example +container_name = "myblobcontainer" +container_client = blob_service_client.get_container_client(container_name) +``` + +### Miseq example + +For this example, we will be getting the entire folder for a miseq run, as a set of blobs. When parsing directly from other sequencers, please consult the parser documentation for file structure differences. + +```python +def main(myblob: func.InputStream): + logging.info('Python blob trigger function %s', myblob.name) + + # download the sample sheet so it can be parsed + download_sample_sheet_file_path = os.path.join(local_path, local_file_name) + with open(download_sample_sheet_file_path, "wb") as download_file: + download_file.write(myblob.read()) + logging.info("done downloading") + + # get run directory (getting the middle portion) + # example 'myblobcontainer/miseq_run/SampleSheet.csv' -> 'miseq_run + run_directory_name = posixpath.split(posixpath.split(myblob.name)[0])[1] + + # we are gonna use miseq for this example + my_parser = parsers.Parser.factory("miseq") + logging.info("built parser") + + # This example was tested locally on a windows machine, so replacing \\ with / was needed for compatibility + relative_data_path = my_parser.get_relative_data_directory().replace("\\", "/") + full_data_dir = posixpath.join( + run_directory_name, + relative_data_path) + + # list the blobs of the run directory + blob_list = list(container_client.list_blobs(full_data_dir)) + file_list = [] + # The file_blob_tuple_list could be useful when moving to the uploading stage in the case where + # you do not want to use the iridauploader.api module to upload to irida, otherwise it can be ignored + file_blob_tuple_list = [] + for file_blob in blob_list: + file_name = remove_prefix(file_blob.name, full_data_dir) + file_list.append(file_name) + file_blob_tuple_list.append({"file_name": file_name, "blob": file_blob}) + + # TODO, put a try catch around this with the parser exceptions. 
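+    # A rough sketch of what that wrapper could look like, kept as a comment so the
+    # working call below stays unchanged. It assumes
+    # `from iridauploader.parsers import exceptions` is imported alongside `parsers`
+    # above; adjust the exception types to the ones your parser can raise
+    # (see the parser documentation):
+    #
+    #     try:
+    #         sequencing_run = my_parser.get_sequencing_run(
+    #             sample_sheet=download_sample_sheet_file_path,
+    #             run_data_directory=full_data_dir,
+    #             run_data_directory_file_list=file_list)
+    #     except (exceptions.SampleSheetError, exceptions.SequenceFileError) as e:
+    #         logging.error("Could not build a sequencing run from %s: %s", myblob.name, e)
+    #         return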
+ # We can catch errors within the samplesheet or missing files here + sequencing_run = my_parser.get_sequencing_run( + sample_sheet=download_sample_sheet_file_path, + run_data_directory=full_data_dir, + run_data_directory_file_list=file_list) + logging.info("built sequencing run") + + # move to upload / error handling when the parser finds an error in the run + + +def remove_prefix(text, prefix): + if text.startswith(prefix): + return text[len(prefix):] + raise Exception("should not happen") +``` + +### Directory example + +In this example we will be using the basic file layout for a directory upload. + +``` +.directory_run +├── file_1.fastq.gz +├── file_2.fastq.gz +└── SampleList.csv +``` + +```python +def main(myblob: func.InputStream): + logging.info('Python blob trigger function %s', myblob.name) + + # download the sample sheet + download_sample_sheet_file_path = os.path.join(local_path, local_file_name) + with open(download_sample_sheet_file_path, "wb") as download_file: + download_file.write(myblob.read()) + logging.info("done downloading") + + # get run directory (getting the middle portion) + # example 'myblobcontainer/directory_run/SampleSheet.csv' -> 'directory_run + run_directory_name = posixpath.split(posixpath.split(myblob.name)[0])[1] + + # we are gonna use directory for this example + my_parser = parsers.Parser.factory("directory") + logging.info("built parser") + + # list the blobs of the run directory + blob_list = list(container_client.list_blobs(run_directory_name)) + file_list = [] + # The file_blob_tuple_list could be useful when moving to the uploading stage in the case where + # you do not want to use the iridauploader.api module to upload to irida, otherwise it can be ignored + file_blob_tuple_list = [] + for file_blob in blob_list: + file_name = remove_prefix(file_blob.name, run_directory_name) + # trim the leading + file_name = file_name.replace("/","") + file_list.append(file_name) + file_blob_tuple_list.append({"file_name": file_name, "blob": file_blob}) + + # TODO, put a try catch around this with the parser exceptions. 
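+    # The try/except sketch shown in the MiSeq example above applies here as well;
+    # for the directory parser the call only takes sample_sheet and
+    # run_data_directory_file_list.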
+ # We can catch errors within the samplesheet or missing files here + sequencing_run = my_parser.get_sequencing_run( + sample_sheet=download_sample_sheet_file_path, + run_data_directory_file_list=file_list) + logging.info("built sequencing run") + + # move to upload / error handling when the parser finds an error in the run + + +def remove_prefix(text, prefix): + if text.startswith(prefix): + return text[len(prefix):] + raise Exception("should not happen") +``` diff --git a/docs/index.md b/docs/index.md index 6a53806e..b065185a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -199,3 +199,5 @@ Want to create a parser for a sequencer that we don't yet support or have an ide [Information on the IRIDA python API](developers/api.md) [Object Model Reference](developers/objects.md) + +[Cloud Deployment](developers/cloud.md) From d453bf1b88c70b8cda1ae2a4e35ad90cb2b11979 Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Wed, 29 Apr 2020 17:34:58 -0500 Subject: [PATCH 14/19] Update version number and add to changelog --- CHANGELOG.md | 6 ++++++ iridauploader/core/cli_entry.py | 2 +- scripts/update_version.sh | 16 ++++++++++++++++ setup.py | 2 +- windows-gui-installer.cfg | 2 +- windows-installer.cfg | 2 +- 6 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 scripts/update_version.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index bee49417..3720d9a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,12 @@ Changes ======= +Beta 0.4.2 +---------- +Developer changes: +* Added support for cloud deployment by using the `iridauploader` available on `pip` +* Added version updater script to the `scripts` directory + Beta 0.4.1 ---------- Developer changes: diff --git a/iridauploader/core/cli_entry.py b/iridauploader/core/cli_entry.py index f5f30923..fe935778 100644 --- a/iridauploader/core/cli_entry.py +++ b/iridauploader/core/cli_entry.py @@ -9,7 +9,7 @@ from . 
import api_handler, parsing_handler, logger, exit_return -VERSION_NUMBER = "0.4.1" +VERSION_NUMBER = "0.4.2" def upload_run_single_entry(directory, force_upload=False): diff --git a/scripts/update_version.sh b/scripts/update_version.sh new file mode 100644 index 00000000..40b7641a --- /dev/null +++ b/scripts/update_version.sh @@ -0,0 +1,16 @@ +# Use example: +# $./scripts/update_version.sh 0.4.2 +# Use the first argument as the new version number +newversion=$1 + +setuppy=setup.py +sed -i "s/version=.*\,/version='$newversion',/" "$setuppy" + +wininstall=windows-installer.cfg +wininstallgui=windows-gui-installer.cfg +sed -i -z "s/version=.\..\../version=$newversion/" "$wininstall" +sed -i -z "s/version=.\..\../version=$newversion/" "$wininstallgui" + +clientry=iridauploader/core/cli_entry.py +sed -i "s/VERSION_NUMBER = \".\..\..\"/VERSION_NUMBER = \"$newversion\"/" "$clientry" + diff --git a/setup.py b/setup.py index 2e059b4f..91be32cb 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name='iridauploader', - version='0.4.1', + version='0.4.2', description='IRIDA uploader: upload NGS data to IRIDA system', url='https://https://github.com/phac-nml/irida-uploader', author='Jeffrey Thiessen', diff --git a/windows-gui-installer.cfg b/windows-gui-installer.cfg index af35651d..24e7ab87 100644 --- a/windows-gui-installer.cfg +++ b/windows-gui-installer.cfg @@ -1,6 +1,6 @@ [Application] name=IRIDA Sequence Uploader GUI -version=0.4.1 +version=0.4.2 entry_point=upload_gui:main icon=iridauploader/gui/images/icon.ico # Uncomment this to have a console show alongside the application diff --git a/windows-installer.cfg b/windows-installer.cfg index 34d201da..ec60e424 100644 --- a/windows-installer.cfg +++ b/windows-installer.cfg @@ -1,6 +1,6 @@ [Application] name=IRIDA Sequence Uploader -version=0.4.1 +version=0.4.2 entry_point=upload_run:main icon=iridauploader/gui/images/icon.ico # We need to set this to get a console: From bb45841cb1475ab2f8302150e1131cbda123e23f Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Thu, 30 Apr 2020 09:43:37 -0500 Subject: [PATCH 15/19] fix pep8 errors --- iridauploader/parsers/directory/sample_parser.py | 3 +-- iridauploader/parsers/miniseq/parser.py | 4 ++-- iridauploader/tests/parsers/miniseq/test_sample_parser.py | 2 -- iridauploader/tests/parsers/miseq/test_sample_parser.py | 1 - 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/iridauploader/parsers/directory/sample_parser.py b/iridauploader/parsers/directory/sample_parser.py index a1d26d63..57365570 100644 --- a/iridauploader/parsers/directory/sample_parser.py +++ b/iridauploader/parsers/directory/sample_parser.py @@ -48,7 +48,7 @@ def parse_sample_list(sample_sheet_file, run_data_directory_file_list): for sample in sample_list: - sample_dict = sample.get_uploadable_dict() + sample_dict = sample.get_uploadable_dict() paired_end_read = len(sample_dict['File_Reverse']) > 0 # keep track if we have both paired and single end reads @@ -176,7 +176,6 @@ def _parse_samples(sample_sheet_file): sample_dict[key] = value - sample_key_list = ['Sample_Name', 'Project_ID', 'File_Forward', 'File_Reverse'] new_sample_dict = deepcopy(sample_dict) diff --git a/iridauploader/parsers/miniseq/parser.py b/iridauploader/parsers/miniseq/parser.py index 6284d49c..5217d6aa 100644 --- a/iridauploader/parsers/miniseq/parser.py +++ b/iridauploader/parsers/miniseq/parser.py @@ -32,7 +32,7 @@ def get_relative_data_directory(): This is not used in the application but is useful for scripting and cloud deployment 
- This includes a '*' character to be interpreted as a wildcard symbol, + This includes a '*' character to be interpreted as a wildcard symbol, as this sequencer does not make consitant names for data directories, and the * must be gotten from the filesystem :return: a string which represents the concatenated path components, as per os.path.join @@ -58,7 +58,7 @@ def get_full_data_directory(sample_sheet): if not os.path.exists(partial_data_dir): raise exceptions.DirectoryError( ("The uploader was unable to find the data directory, Verify that the run directory is " - "undamaged, and that it is a MiniSeq sequencing run."), partial_data_dir) + "undamaged, and that it is a MiniSeq sequencing run."), partial_data_dir) # get the directories [1] get the first directory [0] data_dir = os.path.join(partial_data_dir, next(os.walk(partial_data_dir))[1][0], "Fastq") diff --git a/iridauploader/tests/parsers/miniseq/test_sample_parser.py b/iridauploader/tests/parsers/miniseq/test_sample_parser.py index cb1520e1..d7fb9da2 100644 --- a/iridauploader/tests/parsers/miniseq/test_sample_parser.py +++ b/iridauploader/tests/parsers/miniseq/test_sample_parser.py @@ -314,7 +314,6 @@ def test_valid(self): res = sample_parser.parse_sample_list(sample_sheet_file=sheet_file, run_data_directory=data_dir, run_data_directory_file_list=file_list) - # Check sample is the same self.assertEqual(res[0].get_uploadable_dict(), sample.get_uploadable_dict()) # Check sequencing file is correct @@ -349,7 +348,6 @@ def test_not_valid_pf_list(self): with self.assertRaises(SequenceFileError): res = sample_parser.parse_sample_list(sample_sheet_file=file_path, run_data_directory=data_dir, run_data_directory_file_list=file_list) - def test_space_in_sample_name(self): directory = path.join(path_to_module, "ngs_space_in_sample_name") data_dir = path.join(directory, parsers.miniseq.Parser.get_relative_data_directory()) diff --git a/iridauploader/tests/parsers/miseq/test_sample_parser.py b/iridauploader/tests/parsers/miseq/test_sample_parser.py index b5ba259e..9cb63ab5 100644 --- a/iridauploader/tests/parsers/miseq/test_sample_parser.py +++ b/iridauploader/tests/parsers/miseq/test_sample_parser.py @@ -311,7 +311,6 @@ def test_valid(self): data_dir = path.join(directory, parsers.miseq.Parser.get_relative_data_directory()) file_list = parsers.common.get_file_list(data_dir) - sample = model.Sample( "01-1111", "Super bug", From 6dca4e751b99578cce63688744c0e869171ba7e5 Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Mon, 1 Jun 2020 17:44:53 -0500 Subject: [PATCH 16/19] fix broken div id's for integration tests --- .../tests_integration/integration_data_setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/iridauploader/tests_integration/integration_data_setup.py b/iridauploader/tests_integration/integration_data_setup.py index 44f8c733..cfcd2d3e 100644 --- a/iridauploader/tests_integration/integration_data_setup.py +++ b/iridauploader/tests_integration/integration_data_setup.py @@ -123,12 +123,12 @@ def start_driver(self): def login(self): self.driver.get(self.base_url + "/login") - self.driver.find_element_by_id("emailTF").clear() - self.driver.find_element_by_id("emailTF").send_keys(self.user) - self.driver.find_element_by_id("passwordTF").clear() - self.driver.find_element_by_id("passwordTF").send_keys(self.password) + self.driver.find_element_by_id("loginForm_username").clear() + self.driver.find_element_by_id("loginForm_username").send_keys(self.user) + 
self.driver.find_element_by_id("loginForm_password").clear() + self.driver.find_element_by_id("loginForm_password").send_keys(self.password) with self.wait_for_page_load(timeout=10): - self.driver.find_element_by_id("submitBtn").click() + self.driver.find_element_by_id("t-submit-btn").click() def set_new_admin_pw(self): self.driver.find_element_by_id("password").clear() From 8b98bd3f5ceed01fb8303f154de586c327c19bad Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Mon, 1 Jun 2020 23:27:26 -0500 Subject: [PATCH 17/19] try to fix chromedriver once and for all --- .travis.yml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a869de09..9897f864 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,7 +27,14 @@ matrix: - export PATH=${JAVA_HOME}/bin:$PATH - java -version - echo $JAVA_HOME - - ln -s /usr/lib/chromium-browser/chromedriver ~/bin/chromedriver + - CHROME_DRIVER_VERSION=`curl -sS https://chromedriver.storage.googleapis.com/LATEST_RELEASE` + - echo chromedriverversion + - echo $CHROME_DRIVER_VERSION + - wget -N https://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P ~/ + - unzip ~/chromedriver_linux64.zip -d ~/ + - rm ~/chromedriver_linux64.zip + - sudo mv -f ~/chromedriver ~/bin/chromedriver + - sudo chmod 777 ~/bin/chromedriver - env: "TEST_SUITE=integrationtestsdev" python: 3.6 before_script: @@ -40,7 +47,14 @@ matrix: - export PATH=${JAVA_HOME}/bin:$PATH - java -version - echo $JAVA_HOME - - ln -s /usr/lib/chromium-browser/chromedriver ~/bin/chromedriver + - CHROME_DRIVER_VERSION=`curl -sS https://chromedriver.storage.googleapis.com/LATEST_RELEASE` + - echo chromedriverversion + - echo $CHROME_DRIVER_VERSION + - wget -N https://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P ~/ + - unzip ~/chromedriver_linux64.zip -d ~/ + - rm ~/chromedriver_linux64.zip + - sudo mv -f ~/chromedriver ~/bin/chromedriver + - sudo chmod 777 ~/bin/chromedriver - env: "TEST_SUITE=pep8" python: 3.6 @@ -54,7 +68,6 @@ services: addons: apt: packages: - - chromium-chromedriver - xvfb chrome: stable From b2e67b2c51c91cf1b6c393e4db3b2901a17aad65 Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Thu, 25 Jun 2020 17:11:10 -0500 Subject: [PATCH 18/19] add more comments + touch of cleanup --- iridauploader/parsers/common.py | 9 ++++++++- iridauploader/parsers/directory/sample_parser.py | 2 +- scripts/update_version.sh | 5 +++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/iridauploader/parsers/common.py b/iridauploader/parsers/common.py index 5c21c18d..bbd3b87a 100644 --- a/iridauploader/parsers/common.py +++ b/iridauploader/parsers/common.py @@ -1,3 +1,10 @@ +""" +This file has generic utility methods that can be used by all parsers + +These methods can rely on the os module to function, and therefor not to be used with cloud environments. + +They should be used as generic utilities for any new parser that is added to the project. 
+""" import os from csv import reader import logging @@ -92,7 +99,7 @@ def get_file_list(directory): """ Get the list of file names in the data directory - :param data_directory: directory to search for files + :param directory: directory to search for files :return: list of file names in data directory """ # verify that directory exists diff --git a/iridauploader/parsers/directory/sample_parser.py b/iridauploader/parsers/directory/sample_parser.py index 57365570..7cd388aa 100644 --- a/iridauploader/parsers/directory/sample_parser.py +++ b/iridauploader/parsers/directory/sample_parser.py @@ -121,7 +121,7 @@ def _parse_samples(sample_sheet_file): sample_key_list = ['Sample_Name', 'Project_ID', 'File_Forward', 'File_Reverse'] - # initilize dictionary keys from first line (data headers/attributes) + # initialize dictionary keys from first line (data headers/attributes) set_attributes = False for line in csv_reader: diff --git a/scripts/update_version.sh b/scripts/update_version.sh index 40b7641a..3ec1f32c 100644 --- a/scripts/update_version.sh +++ b/scripts/update_version.sh @@ -1,3 +1,8 @@ +#!/usr/bin/env bash +# This script updates the version number in the various locations they need to be changed including: +# setup.py : for pypi / pip +# windows-installer.cfg / windows-gui-installer.cfg : for pynsist windows builds +# cli_entry.py : for version identifiers in the main code base # Use example: # $./scripts/update_version.sh 0.4.2 # Use the first argument as the new version number From 2193e88951f00baf01586856b817008e857ff2fe Mon Sep 17 00:00:00 2001 From: Jeffrey Thiessen Date: Fri, 26 Jun 2020 11:30:55 -0500 Subject: [PATCH 19/19] fixed seplling --- iridauploader/parsers/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iridauploader/parsers/common.py b/iridauploader/parsers/common.py index bbd3b87a..a231cf39 100644 --- a/iridauploader/parsers/common.py +++ b/iridauploader/parsers/common.py @@ -1,7 +1,7 @@ """ This file has generic utility methods that can be used by all parsers -These methods can rely on the os module to function, and therefor not to be used with cloud environments. +These methods can rely on the os module to function, and therefore not to be used with cloud environments. They should be used as generic utilities for any new parser that is added to the project. """