Skip to content

Commit

Permalink
Merge pull request #195 from pneerincx/master
Browse files Browse the repository at this point in the history
Added  renameFastQs.bash script
  • Loading branch information
RoanKanninga authored May 24, 2018
2 parents 87aed73 + d83a6c5 commit d2a6a03
Showing 1 changed file with 288 additions and 0 deletions.
288 changes: 288 additions & 0 deletions renameFastQs.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
#!/bin/bash

#
##
### Environment and bash sanity.
##
#
set -u
set -e
umask 0027
SCRIPT_NAME=$(basename $0)
if [[ "${BASH_VERSINFO}" -lt 4 || "${BASH_VERSINFO[0]}" -lt 4 ]]
then
echo "Sorry, you need at least bash 4.x to use ${SCRIPT_NAME}." >&2
exit 1
fi

#
# Make sure dots are used as decimal separator.
#
LANG='en_US.UTF-8'
LC_NUMERIC="${LANG}"

#
# Trap all exit signals: HUP(1), INT(2), QUIT(3), TERM(15), ERR
#
trap '_reportError $LINENO' HUP INT QUIT TERM EXIT ERR

#
##
### Functions.
##
#

function _Usage() {
echo
echo 'Script to rename all FastQs matching a specified pattern in the current directory'
echo 'from GenomeScan FastQ naming conventions to UMCG Genetics dept. FastQ naming conventions.'
echo
echo " ${SCRIPT_NAME} [options]"
echo
echo " Options:"
echo " -v Enable verbose logging (optional)."
echo " -s YYMMDD sequencingStartDate"
echo " -f 'FastQ_filename_regex' Regex pattern to find the FastQ files that must be renamed."
echo " Note: the pattern must be single quoted to prevent expansion by the shell."
echo " E.g. -f 'HTVTYBBXX_103373-003*'"
echo
}

function _reportError() {
local _problematicLine="${1}"
local _exitStatus="${2:-$?}"
local _errorMessage="Unknown error."
_errorMessage="${3:-${_errorMessage}}"
#
# Notify on STDOUT.
#
echo "
$(hostname) - ${SCRIPT_NAME}:${_problematicLine}: FATAL: exit code = ${_exitStatus}
$(hostname) - ${SCRIPT_NAME}:${_problematicLine}: error message = ${_errorMessage}
"
#
# Reset trap and exit.
#
trap - EXIT
exit ${_exitStatus}
}

function _RenameFastQ() {
local _fastqPath="${1}"
local _sequencingStartDate="${2}"
echo "INFO: Processing FastQ ${_fastqPath}..."
#
# Example for FastQs from external lab:
# 1. GenomeScan FastQ file name format:
# Flowcell_Customer-BatchNumber-SampleNumber_Barcode1-Barcode2_Lane_Read.fastq.gz
# E.g.:
# HTVTYBBXX_103373-003-001_AGGCAGAA-AGGCTTAG_L001_R1.fastq.gz
#
# Example of sequence read header line as found in Illumina FastQ files produced with Casava / bcl2fastq >= 1.8:
# @sequencer:run:flowcell:lane:tile:xCoordinateOfClusterInTile:yCoordinateOfClusterInTile sequenceReadOfPair:filtered:barcode[+barcode2]
# E.g.:
# @K00296:315:HVLN7BBXX:7:1101:2940:1173 1:Y:0:GTAGAGGA+TCTACTAT
#
#
local _fastqDir="$(dirname "${_fastqPath}")"
local _fastqFile="$(basename "${_fastqPath}")"

#
# Do NOT parse filenames: get the essential stuff from the sequence read IDs instead
#
#local _regex='^([A-Z0-9]*XX)_(103373-[0-9][0-9]*-[0-9][0-9]*)_([ATCG][ATCG]*-[ATCG][ATCG]*)_L00([1-8])_R([12]).fastq.gz$'
#if [[ "${_fastqFile}" =~ ${_regex} ]]
#then
# local _flowcell="${BASH_REMATCH[1]}"
# local _customerBatchSampleCombi="${BASH_REMATCH[2]}"
# local _barcodes="${BASH_REMATCH[3]}"
# local _lane="${BASH_REMATCH[4]}"
# local _sequenceReadOfPair="${BASH_REMATCH[5]}"
# if [[ "${enableVerboseLogging}" -eq 1 ]]
# then
# echo "DEBUG: Found _flowcell ............... = ${_flowcell}"
# echo "DEBUG: Found _customerBatchSampleCombi = ${_customerBatchSampleCombi}"
# echo "DEBUG: Found _barcodes ............... = ${_barcodes}"
# echo "DEBUG: Found _lane ................... = ${_lane}"
# echo "DEBUG: Found _sequenceReadOfPair ..... = ${_sequenceReadOfPair}"
# fi
#else
# echo "FATAL: Failed to parse filename for ${_fastq}"
# exit 1
#fi

local _firstReadID=$(zcat "${_fastqPath}" | head -1)
if [[ "${enableVerboseLogging}" -eq 1 ]]
then
echo "DEBUG: Found _firstReadID ............ = ${_firstReadID}"
fi
local _regex='^@([A-Z0-9][A-Z0-9]*):([0-9][0-9]*):([A-Z0-9][A-Z0-9]*XX):([1-8]):[0-9]*:[0-9]*:[0-9]* ([1-2]):[YN]:[0-9][0-9]*:[ATCGN][ATCGN+]*'
if [[ "${_firstReadID}" =~ ${_regex} ]]
then
local _sequencer="${BASH_REMATCH[1]}"
local _run="${BASH_REMATCH[2]}"
local _flowcell="${BASH_REMATCH[3]}"
local _lane="${BASH_REMATCH[4]}"
local _sequenceReadOfPair="${BASH_REMATCH[5]}"
#
# Note: we do require the ID of the first read to contain a DNA barcode at the end,
# but we don't parse barcodes here. Due to sequencing errors the barcode may deviate slightly.
# Instead we parse the barcodes from the first reads and determine the most abundant one later on.
#

#
# Sanity check for run number and add leading zero when 3 < run number < 4.
#
if [[ "${#_run}" -lt 3 ]]
then
echo "FATAL: run number detected in ID of first read is too short (< 3): ${_run}"
exit 1
elif [[ "${#_run}" -eq 3 ]]
then
_run="0${_run}"
fi
if [[ "${enableVerboseLogging}" -eq 1 ]]
then
echo "DEBUG: Found _sequencer .............. = ${_sequencer}"
echo "DEBUG: Found _run .................... = ${_run}"
echo "DEBUG: Found _flowcell ............... = ${_flowcell}"
echo "DEBUG: Found _lane ................... = ${_lane}"
echo "DEBUG: Found _sequenceReadOfPair ..... = ${_sequenceReadOfPair}"
fi
else
echo "FATAL: Failed to required meta-data values from ID of first read of ${_fastqPath}"
exit 1
fi

local _mostAbundandBarcode=$(zcat "${_fastqPath}" | head -4000 | awk 'NR % 4 == 1' | awk -F ':' '{print $10}' | sort | uniq -c | sort -k 1,1nr | head -1 | awk '{print $2}' | tr -d '\n')
local _barcodeRegex='^([ATCG][ATCG+]*)$'
if [[ "${_mostAbundandBarcode}" =~ ${_barcodeRegex} ]]
then
local _barcodes="${BASH_REMATCH[1]}"
#
# In case the sample was double barcoded change the separator from '+' to '-'.
#
_barcodes="${_barcodes/+/-}"
if [[ "${enableVerboseLogging}" -eq 1 ]]
then
echo "DEBUG: Found _barcode(s) ............. = ${_barcodes}"
fi
elif [[ "${_mostAbundandBarcode}" =~ N ]]
then
echo "ERROR: Most abundant barcode(s) in max first 1000 reads contains Ns: ${_barcodes}"
echo "ERROR: Skipping discarded FastQ ${_fastqFile} due to poor sequencing quality of barcode(s)."
return
else
echo "ERROR: Failed to determine the most abundant barcode(s) from the max first 1000 reads."
echo "FATAL: Failed to process FastQ ${_fastqFile} due to poor sequencing quality of barcode(s)."
exit 1
fi

local _fastqChecksum=$(cat "${_fastqDir}/"*.md5 | grep "${_fastqFile}" | awk '{print $1}')
local _newFastqDir="${_fastqDir}/${_sequencingStartDate}_${_sequencer}_${_run}_${_flowcell}"
local _newFastqFile="${_sequencingStartDate}_${_sequencer}_${_run}_${_flowcell}_L${_lane}_${_barcodes}_${_sequenceReadOfPair}.fq.gz"

#
# Create sequence run subdir if it did not already exist.
#
mkdir -p "${_newFastqDir}"

#
# Check if new FastQ file path does not already exist to prevent overwriting a sample with another one.
#
if [[ -e "${_newFastqDir}/${_newFastqFile}" ]]
then
echo "FATAL: ${_newFastqDir}/${_newFastqFile} already exists."
echo "FATAL: will NOT move ${_fastqPath} -> ${_newFastqDir}/${_newFastqFile}."
exit 1
fi

#
# Move and rename FastQ + MD5 checksum on the fly.
#
echo "INFO: Moving ${_fastqPath} -> ${_newFastqDir}/${_newFastqFile}"
printf '%s %s\n' "${_fastqChecksum}" "${_newFastqFile}" > "${_newFastqDir}/${_newFastqFile}.md5"
mv "${_fastqPath}" "${_newFastqDir}/${_newFastqFile}"
}

#
##
### Main
##
#

#
# Get commandline arguments.
#
enableVerboseLogging=0 # Disabled by default.
while getopts "s:f:hv" opt
do
case $opt in
h)
_Usage
#
# Reset trap and exit.
#
trap - EXIT
exit 0
;;
s)
sequencingStartDate="${OPTARG}"
;;
f)
fastqFilePattern="${OPTARG}"
;;
v)
enableVerboseLogging=1
;;
\?)
_reportError ${LINENO} '1' "Invalid option -${OPTARG}. Try \"$(basename $0) -h\" for help."
;;
:)
_reportError ${LINENO} '1' "Option -${OPTARG} requires an argument. Try \"$(basename $0) -h\" for help."
;;
esac
done

#
# Make sure there are no extra arguments we did not expect nor need.
#
shift $(($OPTIND - 1))
if [[ ! -z ${1:-} ]]
then
_reportError ${LINENO} '1' "Invalid argument \"$1\". Try \"$(basename $0) -h\" for help."
fi

#
# Check if required args are present.
#
if [[ -z "${sequencingStartDate:-}" || -z "${fastqFilePattern:-}" ]]
then
_reportError ${LINENO} 1 "One ore more required arguments is missing. Try \"$(basename $0) -h\" for help."
fi

#
# Check if sequencingStartDate is in the expected format.
#
ssd_regex='^[1-9][0-9][0-1][1-9][0-3][0-9]$'
if [[ "${sequencingStartDate}" =~ ${ssd_regex} ]]
then
echo "INFO: Using sequencingStartDate ${sequencingStartDate}"
else
_reportError ${LINENO} 1 "sequencingStartDate in unsupported format. Must be YYMMDD, but got ${sequencingStartDate}."
fi

#
# Process FastQ files.
#
for FastQ in $(ls -1 ${fastqFilePattern})
do
_RenameFastQ "${FastQ}" "${sequencingStartDate}"
done

#
# Reset trap and exit.
#
echo "INFO: Finished successfully!"
trap - EXIT
exit 0

0 comments on commit d2a6a03

Please sign in to comment.