Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Samplesheet multilevel nesting ick4 #24

Merged
merged 23 commits into from
Jun 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1f5eea9
correct log to logger
Apr 5, 2024
7c3fc4b
accept R_1/R_2 format in addition to R1/R2
Apr 5, 2024
96bd820
remove function no longer in use
Apr 5, 2024
1dd42b4
decided not to add the R_1/R_2 support
Apr 5, 2024
b899629
modify samplesheet creator to handle multilevel file nesting
Apr 11, 2024
cdaf01e
add the file_levels flag to nextflow pipeline
Apr 11, 2024
bc9fdeb
don't pass null as a project_name and don't use the project_name flag…
Apr 11, 2024
c24c2a5
add file_levels param to schema
Apr 12, 2024
d4c8651
create_samplesheet.py mods, check for file levels and run or error out
Apr 18, 2024
dcb4139
adding and correcting logic to file_levels param
Apr 22, 2024
f3280ba
update mpxgfa.sif location to point to staphb dockerhub polkapox repo
Apr 29, 2024
a19cbbf
add missing comma
Apr 30, 2024
2329c31
fix list index error in fix names fx
May 2, 2024
eb9b4d3
fix dividing by strings error
May 6, 2024
d30c041
add reqd module
May 6, 2024
89d359b
fix Index OOR error in get_raw_filt_counts by returning NA if file is…
May 7, 2024
c4c8ded
avoid another index OOR error - get_kraken_stats
May 7, 2024
a9d4723
overhaul the whole summarize_qc.py script
May 8, 2024
e51fc05
correction to the R1 R2 substitute regex
May 9, 2024
c7361f9
correct name of the pipeline
May 9, 2024
25448cc
update documentation for multilevel samplesheet creation
May 30, 2024
68c04f7
remove the file_levels=all option from create_samplesheet.py and docu…
May 31, 2024
7a8e39f
enforce file levels in nextflow
Jun 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
nextflow run polkapox/main.nf --input {SAMPLESHEET.csv OR input_directory} --outdir {OUTDIR} --fasta {REF.fa} -profile sge,singularity --kraken_db {PATH/TO/DB} --gff {ANNOTATION.gff} --workflow {WORKFLOW} --filter {true/false}
```

**note**: If you do not provide `--fasta`, `--gff`, or `--kraken_db`, they will default to the reference and gff in the `assets` folder of this repo, and a kraken db hosted on the SciComp file system, respectively. If you do not specify `--filter` then it will default to `true`. See `nextflow.config` for details.
**note**: If you do not provide `--fasta`, `--gff`, or `--kraken_db`, they will default to the reference and gff in the `assets` folder of this repo, and a kraken db hosted on the SciComp file system, respectively. If you do not specify `--filter` then it will default to `true`. See `nextflow.config` for details. Add `--file_levels {top (default)/nested}` if passing a directory as input. See [usage](/docs/usage.md) for details.

## Pipeline configuration

Expand Down Expand Up @@ -273,4 +273,4 @@ published through the [CDC web site](http://www.cdc.gov).
Please refer to [CDC's Template Repository](https://github.com/CDCgov/template)
for more information about [contributing to this repository](https://github.com/CDCgov/template/blob/master/CONTRIBUTING.md),
[public domain notices and disclaimers](https://github.com/CDCgov/template/blob/master/DISCLAIMER.md),
and [code of conduct](https://github.com/CDCgov/template/blob/master/code-of-conduct.md).
and [code of conduct](https://github.com/CDCgov/template/blob/master/code-of-conduct.md).
172 changes: 91 additions & 81 deletions bin/create_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# Script to generate sample sheet from a directory of fastq files
# Paired-end files must end in (1 or R1) and (2 or R2)
# Sample sheet format is sample,fastq_1,fastq_2 (paired-end) or sample,fastq_1 (single-end)
logger = logging.getLogger()
logger = logging.getLogger(__name__)

def parse_args():
""" Set up arguments
Expand All @@ -24,8 +24,8 @@ def parse_args():
)
parser.add_argument(
'-d', '--indir',
default=None,
required=False,
default=None,
required=False,
metavar="SAMPLES_DIR",
help="Samples directory")
parser.add_argument(
Expand All @@ -36,21 +36,23 @@ def parse_args():
parser.add_argument(
'-o', '--outdir',
default=None,
required=False,
required=False,
metavar="OUTPUT_DIR",
help="Path to project dir where samplesheet will be saved")
parser.add_argument(
'--project_name',
default=None,
required=False,
required=False,
metavar="PROJECT_NAME",
help="Project name which will be the prefix of samplesheet name")
parser.add_argument(
"-l",
"--log-level",
help="The desired log level (default WARNING).",
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
default="WARNING",
'--file_levels',
choices=['nested', 'top'],
default='top',
required=False,
metavar="FILE_LEVELS",
help="Option for creating a sample sheet: 'top' (default) for only top-level files,\n"
"'nested' for only files nested one level down"
)
return parser.parse_args()

Expand All @@ -68,48 +70,32 @@ def check_path(input_path):
input_path = os.path.realpath(input_path) + '/'
return input_path

def check_if_R1(file_path):
""" Return true if file has R1 in the name
:param path: a file path
:returns: true if valid filenames, else false
:rytpe: bool
"""
return "_R1" in file_path

def find_file_depth(samples_dir):
""" Return the file depth
def are_files_nested(samples_dir):
""" Return true if there are nested fastq files, else return false
:param samples_dir: a file path
:returns: dictionary of fastq file levels under samples_dir
:rtype: dict
:returns: true if there are fastq files nested one level under samples_dir
:rtype: boolean
"""
extensions = (".fastq",".fq",".fastq.gz",".fq.gz")
file_depth = {}
for dirpath, _, files in os.walk(samples_dir):
relative_path = os.path.relpath(dirpath, samples_dir)
depth = 0 if relative_path == "." else relative_path.count(os.path.sep) + 1
# Ignore files nested >1 level down because these are not handled by rest of code
if depth > 1:
if dirpath == samples_dir:
continue
for file in files:
if file not in file_depth and file.endswith(extensions):
file_depth[file] = depth
return file_depth

def handle_file_depth(depth_dict):
""" Return whether files are directly under sampls_dir or nested 1 level down
:param depth_dict: a dictionary with filepaths as keys and file level as value
:returns: 0 if directly under samples_dir, 1 if nested one level down
if file.endswith(extensions):
return True
return False

def are_files_top_level(samples_dir):
""" Return true if there are fastq files directly under samples_dir, else return false.
:param samples_dir: a directory path
:returns: true if there are files with specified extensions directly under samples_dir
:rtype: boolean
"""
unique_depths = set(depth_dict.values())
if unique_depths == {0}:
nested = 0 # directly under input dir
elif unique_depths == {1}:
nested = 1 # nested one level down
else:
# Store err val if files are at mixed depths or unexpected val in dict
nested = 9
return nested
extensions = (".fastq",".fq",".fastq.gz",".fq.gz")
for file in os.listdir(samples_dir):
if any(file.endswith(ext) for ext in extensions):
return True
return False

def remove_id(sample_name):
""" Removes the sample identifier common in CDC Core Sequencing Lab samples
Expand All @@ -122,86 +108,110 @@ def remove_id(sample_name):
new_sample_name = re.sub(id_pattern, '', sample_name)
return new_sample_name

def list_samples(samples_dir, single=False):
def list_samples(samples_dir, file_levels, single=False):
""" Return a list of all samples in a directory
:param samples_dir: a file path
:returns: a list of paths to all fastq/fq files in samples_dir
:rtype: list
"""
samples_dir = os.path.realpath(samples_dir)
extensions = (".fastq",".fq",".fastq.gz",".fq.gz")
seqfiles = []
seqfiles = {}
for filename in os.listdir(samples_dir):
if single and "_R2" in filename:
log.error("single flag is set to {single} but input directory contains R2 files")
# Check for fastq files directly under samples_dir
if filename.endswith(extensions) and "_R1_" in filename:
seqfiles.append(os.path.join(samples_dir, filename))
else:
logger.error("single flag is set to {single} but input directory contains R2 files")
if file_levels == 'top':
# Check for fastq files directly under samples_dir
if filename.endswith(extensions) and ("_R1" in filename or "_1." in filename):
#seqfiles.append(os.path.join(samples_dir, filename))
sample_path = os.path.join(samples_dir, filename)
s_name = os.path.splitext(Path(sample_path).stem)[0]
s_name = re.sub(r'_R1_001$', '', s_name) # remove R1_001 at the end of filename, if it exists
s_name = re.sub(r'_R1$', '', s_name) # remove R1 at the end of filename, if it exists
s_name = re.sub(r'_1$', '', s_name) # remove a _1 only if it occurs at end of filename
s_name = remove_id(s_name)
seqfiles[s_name] = sample_path
if file_levels == 'nested':
# Check for fastq files nested one level down
subdir = os.path.join(samples_dir, filename)
if os.path.isdir(subdir):
for subfilename in os.listdir(subdir):
if subfilename.endswith(extensions) and "_R1_" in subfilename:
seqfiles.append(os.path.join(subdir, subfilename))
if subfilename.endswith(extensions) and "_R1" in subfilename:
#seqfiles.append(os.path.join(subdir, subfilename))
sample_path = os.path.join(subdir, subfilename)
s_name = Path(sample_path).parent.name # use the subdirectory name as the sample name
s_name = remove_id(s_name)
seqfiles[s_name] = sample_path
return seqfiles

def create_samplesheet(samples_list, outdir, outfile, nested, single=False):

def create_samplesheet(samples_dict, outdir, outfile, single=False):
""" Create a samplesheet from files in a directory
:param samples_dir: a directory of fastq files
:param samples_list: a list of all samples in samples_dir
:returns:
:rtype:
"""
#todo: Add a date to the samplesheet name?
#todo: Add a date to the samplesheet name?
if single:
# process single-end files
with open(f'{outdir}{outfile}', 'w') as samplesheet:
samplesheet.write('sample,fastq_1,fastq_2\n')
for sample in sorted(samples_list):
if nested == 1:
s_name = Path(sample).parent.name
s_name = remove_id(s_name)
else:
s_name = os.path.splitext(Path(sample).stem)[0].replace('_R1_001','')
samplesheet.write(f'{s_name},{sample},\n')
samplesheet.write('sample,fastq_1\n')
for sample_name, sample_path in sorted(samples_dict.items()):
samplesheet.write(f'{sample_name},{sample_path},\n')
else:
# process as paired-end files
pattern = r'R1(?=\.|_001)|(?<=_)1(?=\.)' # regex for the R2 filepaths
# Perform the replacement
with open(f'{outdir}{outfile}', 'w') as samplesheet:
samplesheet.write('sample,fastq_1,fastq_2\n')
for sample in sorted(samples_list):
if nested == 1:
s_name = s_name = Path(sample).parent.name
s_name = remove_id(s_name)
else:
s_name = os.path.splitext(Path(sample).stem)[0].replace('_R1_001','')
r2 = sample.replace('R1','R2')
samplesheet.write(f'{s_name},{sample},{r2}\n')
for sample_name, sample_path in sorted(samples_dict.items()):
# replaces R1 (when followed by '.' or '_001') with R2, and a 1 between '_' and '.' with 2
r2 = re.sub(pattern, lambda m: 'R2' if m.group().startswith('R') else '2', sample_path)
samplesheet.write(f'{sample_name},{sample_path},{r2}\n')

def main():
args = parse_args()
logfile = f"{os.path.basename(__file__).strip('.py')}.log"
logging.basicConfig(filename=logfile,
level=logging.DEBUG)
input_dir = check_path(args.indir)
output_dir = check_path(args.outdir)
if args.project_name:
outfile = f"{args.project_name}_samplesheet.csv"
else:
outfile = f"samplesheet.csv"

if args.file_levels == 'top':
# check if there are top level files
if are_files_top_level(input_dir):
file_levels = 'top'
# if no top files, check if there are nested files
elif are_files_nested(input_dir):
logger.info(f"--file_levels is set to {args.file_levels} but {input_dir} does not contain any top-level files. Running on nested files.")
file_levels = 'nested'
elif args.file_levels == 'nested':
# check if there are nested files
if are_files_nested(input_dir):
file_levels = 'nested'
else:
logger.error(f"--file_levels is set to {args.file_levels} but {input_dir} doesn't contain nested files with fastq, fq, fastq.gz, or fq.gz extensions.")
sys.exit(1)
else:
logger.error(f"--file_levels must be 'nested' or 'top' ")


# Get the list of samples
if args.single:
samples_list = list_samples(input_dir, single=True)
samples_dict = list_samples(input_dir, file_levels = file_levels, single=True)
else:
samples_list = list_samples(input_dir)
samples_dict = list_samples(input_dir, file_levels = file_levels)

# Find whether fastq files are nested 1 level down or directly under input dir
depths_dict = find_file_depth(input_dir)
nested = handle_file_depth(depths_dict)
# Error if files are nested in a way the script doesn't handle
if nested == 9:
logger.error(f"fastq files in {input_dir} must be either directly under input folder or nested one level down")
sys.exit(1)
if args.single:
create_samplesheet(samples_list,output_dir, outfile, nested, single=True)
create_samplesheet(samples_dict, output_dir, outfile, single=True)
else:
create_samplesheet(samples_list, output_dir, outfile, nested)
create_samplesheet(samples_dict, output_dir, outfile)
logger.info(f"{outfile} created. Please review and be sure it is correct.")

if __name__ == '__main__':
Expand Down
Loading
Loading