diff --git a/makeSamplesheet_external_samples.sh b/makeSamplesheet_external_samples.sh
new file mode 100755
index 0000000..c1670c2
--- /dev/null
+++ b/makeSamplesheet_external_samples.sh
@@ -0,0 +1,308 @@
+#!/bin/bash
+
+set -e
+set -u
+
+
+function showHelp() {
+    #
+    # Display commandline help on STDOUT.
+    #
+    cat <
+
+function _makeSampleInfo() {
+    local _fastqPath="${1}"
+    local _sequencingStartDate="${2}"
+    #
+    # FastQ format >= 1.8:
+    #   @sequencer:run:flowcell:lane:tile:xCoordinateOfClusterInTile:yCoordinateOfClusterInTile sequenceReadOfPair:filtered:barcode[+barcode2]
+    # E.g.:
+    #   @K00296:315:HVLN7BBXX:7:1101:2940:1173 1:Y:0:GTAGAGGA+TCTACTAT
+    #
+    local _fastqDir="$(dirname "${_fastqPath}")"
+    local _fastqFile="$(basename "${_fastqPath}")"
+    #
+    # Get essential meta-data from the sequence read IDs in the FastQ file.
+    # (Do NOT rely on parsing FastQ filenames!)
+    #
+    local _firstReadID="$(zcat "${_fastqPath}" | head -1)"
+    local _regex='^@([A-Z0-9][A-Z0-9]*):([0-9][0-9]*):([A-Z0-9][A-Z0-9]*):([1-8]):[0-9]*:[0-9]*:[0-9]* ([1-2]):[YN]:[0-9][0-9]*:[ATCGN][ATCGN+]*'
+    if [[ "${_firstReadID}" =~ ${_regex} ]]
+    then
+        local _sequencer="${BASH_REMATCH[1]}"
+        local _run="${BASH_REMATCH[2]}"
+        local _flowcell="${BASH_REMATCH[3]}"
+        local _lane="${BASH_REMATCH[4]}"
+        local _sequenceReadOfPair="${BASH_REMATCH[5]}"
+        #
+        # Note: we do require the ID of the first read to contain a DNA barcode at the end,
+        # but we do not parse the barcodes here; see below for why.
+        #
+        # Sanity check for the run number and add leading zeros when it has fewer than 4 digits.
+        #
+        if [[ "${#_run}" -lt 1 ]]
+        then
+            _reportFatalError ${LINENO} '1' "Run number detected in ID of first read is too short (< 1): ${_run:-}."
+        elif [[ "${#_run}" -eq 1 ]]
+        then
+            _run="000${_run}"
+        elif [[ "${#_run}" -eq 2 ]]
+        then
+            _run="00${_run}"
+        elif [[ "${#_run}" -eq 3 ]]
+        then
+            _run="0${_run}"
+        fi
+    else
+        _reportFatalError ${LINENO} '1' "Failed to parse required meta-data values from ID of first read of ${_fastqPath}."
+    fi
+    #
+    # Due to sequencing errors the barcode may deviate slightly, so looking only at the first one won't fly.
+    # In addition the start and end of a FastQ file tend to be enriched for sequencing errors / low quality.
+    # Therefore we:
+    #  A. Use a combination of head and tail commands to skip the first 109,000 reads (436,000 lines)
+    #     and take the next 1,000 reads from the middle of the FastQ file
+    #     (assuming the yield was high enough; otherwise you get the last 1,000 reads of the file).
+    #  B. Parse the barcodes from those read ID lines, then sort, count and determine the most abundant barcode.
+    #
+    local _mostAbundantBarcode="$(zcat "${_fastqPath}" | head -n 440000 | tail -n 4000 | awk 'NR % 4 == 1' | awk -F ':' '{print $10}' | sort | uniq -c | sort -k 1,1nr | head -1 | awk '{print $2}' | tr -d '\n')"
+    local _barcodeRegex='^([ATCG][ATCG+]*)$'
+    if [[ "${_mostAbundantBarcode}" =~ ${_barcodeRegex} ]]
+    then
+        local _barcodes="${BASH_REMATCH[1]}"
+        #
+        # In case the sample was dual barcoded, change the separator from '+' to '-'.
+        #
+        _barcodes="${_barcodes/+/-}"
+    elif [[ "${_mostAbundantBarcode}" =~ N ]]
+    then
+        if [[ "${allowN}" -eq '1' ]]
+        then
+            echo "WARN: Most abundant barcode(s) in at most 1000 reads from the middle of the FastQ file contains Ns: ${_mostAbundantBarcode}."
+            echo "WARN: Will continue processing FastQ ${_fastqFile} despite poor sequencing quality of the barcode(s), because commandline option -n was specified."
+        else
+            qualityControl='failed'
+            echo "ERROR: Most abundant barcode(s) in at most 1000 reads from the middle of the FastQ file contains Ns: ${_mostAbundantBarcode}."
+ echo "ERROR: Skipping discarded FastQ ${_fastqFile} due to poor sequencing quality of barcode(s)." + return + fi + else + qualityControl='failed' + echo "ERROR: Failed to determine the most abundant barcodes from max 1000 reads from middle of FastQ." + echo "ERROR: Failed to parse barcode(s) from read IDs of FastQ file ${_fastqFile}." + return + fi + + local _newFastqFileInfo=",${_sequencingStartDate},${_sequencer},${_run},${_flowcell},L${_lane},${_barcodes}\n" + echo "INFO: new FastQ file information: ${_sequencingStartDate},${_sequencer},${_run},${_flowcell},L${_lane},${_barcodes}" + printf "${_newFastqFileInfo}" >> "${workDir}/${projectName}.csv" + } + + +# +# Make header for your sample sheet +# + +printf "externalSampleID,externalFastQ_1,externalFastQ_2,project,capturingKit,sampleType,seqType,prepKit,species,Gender,arrayFile,sequencingStartDate,sequencer,run,flowcell,lane,barcode\n" > "${workDir}/${projectName}.csv" + +# +# Process FastQ files, and add other necessary information +# + +for FastQ in $(ls "${externalSampleDirectoryPath}/"*"${pairedMateNameOne}"*".gz") +do + if [[ "${seqType}" == "PE" ]] + then + fileName=${FastQ%%.*} + baseNameFile=$(basename ${fileName}) + withoutExtension=${baseNameFile%%.*} + printf "${withoutExtension}" >> "${workDir}/${projectName}.csv" ##externalSampleID + printf ",${FastQ}," >> "${workDir}/${projectName}.csv" ## externalFastQ1 + + pairedMateNameTwo=$(echo "${pairedMateNameOne}" | sed 's|1|2|') + cleanPairedMateNameOne=$(echo $pairedMateNameOne | sed 's|\.|\\.|') + cleanPairedMateNameTwo=$(echo $pairedMateNameTwo | sed 's|\.|\\.|') + + printf "${FastQ}" | sed -r "s|${cleanPairedMateNameOne}|${cleanPairedMateNameTwo}|g" >> "${workDir}/${projectName}.csv" ## externalFastQ2 + printf ",${projectName}" >> "${workDir}/${projectName}.csv" ## project + printf ",${capturingKit}" >> "${workDir}/${projectName}.csv" ## capturingKit + printf ",${sampleType}" >> "${workDir}/${projectName}.csv" ## sampleType + printf ",${seqType}" >> "${workDir}/${projectName}.csv" ## seqType + printf ",${prepKit}" >> "${workDir}/${projectName}.csv" ## prepKit + printf ",${species}" >> "${workDir}/${projectName}.csv" ## species + printf "," >> "${workDir}/${projectName}.csv" ## Gender + printf "," >> "${workDir}/${projectName}.csv" ## arrayFile + _makeSampleInfo "${FastQ}" "${sequencingStartDate}" ## SequencingStartDate,sequencer,run,flowcell,lane,barcode + elif [[ "${seqType}" == "SR" ]] + then + fileName=${FastQ%%.*} + baseNameFile=$(basename ${fileName}) + withoutExtension=${baseNameFile%%.*} + printf "${withoutExtension}" >> "${workDir}/${projectName}.csv" ##externalSampleID + printf ",${FastQ}" >> "${workDir}/${projectName}.csv" ## externalFastQ1 + printf "," >> "${workDir}/${projectName}.csv" ## externalFastQ2 + printf ",${projectName}" >> "${workDir}/${projectName}.csv" ## project + printf ",${capturingKit}" >> "${workDir}/${projectName}.csv" ## capturingKit + printf ",${sampleType}" >> "${workDir}/${projectName}.csv" ## sampleType + printf ",${seqType}" >> "${workDir}/${projectName}.csv" ## seqType + printf ",${prepKit}" >> "${workDir}/${projectName}.csv" ## prepKit + printf ",${species}" >> "${workDir}/${projectName}.csv" ## species + printf "," >> "${workDir}/${projectName}.csv" ## Gender + printf "," >> "${workDir}/${projectName}.csv" ## arrayFile + _makeSampleInfo "${FastQ}" "${sequencingStartDate}" ## SequencingStartDate,sequencer,run,flowcell,lane,barcode + fi +done + +echo "Samplesheet is finished!" 
+
+
+
+
+
+
diff --git a/tortilla.sh b/tortilla.sh
index 48222f7..7da9b43 100644
--- a/tortilla.sh
+++ b/tortilla.sh
@@ -9,7 +9,8 @@ cat <
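The core of the new script is `_makeSampleInfo`: it deliberately ignores FastQ filenames and instead derives sequencer, run, flowcell, lane and barcode from the read IDs, taking the most frequent barcode from a window of reads in the middle of the file so that sequencing errors in individual reads do not matter. The sketch below replays that approach as a standalone snippet, assuming only a gzipped FastQ path as its first argument; the `fastq_path` variable and the error messages are illustrative and not part of the diff, and the zero-padding via `printf '%04d'` is an equivalent alternative to the if/elif padding the script uses.

```bash
#!/bin/bash
set -e
set -u

# Illustrative only: a gzipped FastQ file passed as the first argument.
fastq_path="${1:?Usage: ${0} path/to/sample_R1.fq.gz}"

# Parse sequencer, run, flowcell and lane from the ID of the first read,
# using the same Illumina >= 1.8 read-ID layout the diff relies on.
first_read_id="$(zcat "${fastq_path}" | head -1)"
regex='^@([A-Z0-9]+):([0-9]+):([A-Z0-9]+):([1-8]):[0-9]+:[0-9]+:[0-9]+ [1-2]:[YN]:[0-9]+:[ATCGN][ATCGN+]*'
if [[ "${first_read_id}" =~ ${regex} ]]
then
    sequencer="${BASH_REMATCH[1]}"
    run="${BASH_REMATCH[2]}"
    flowcell="${BASH_REMATCH[3]}"
    lane="${BASH_REMATCH[4]}"
else
    echo "ERROR: cannot parse read ID: ${first_read_id}" >&2
    exit 1
fi

# Zero-pad the run number to 4 digits (same effect as the if/elif chain in the diff).
run="$(printf '%04d' "$((10#${run}))")"

# Take reads 109,001-110,000 (lines 436,001-440,000), keep the read-ID lines,
# cut out the barcode field (10th colon-separated field) and pick the most frequent value.
barcode="$(zcat "${fastq_path}" | head -n 440000 | tail -n 4000 \
    | awk 'NR % 4 == 1' | awk -F ':' '{print $10}' \
    | sort | uniq -c | sort -k 1,1nr | head -1 | awk '{print $2}')"

# For dual barcodes use '-' instead of '+' as separator, as the samplesheet expects.
echo "${sequencer},${run},${flowcell},L${lane},${barcode/+/-}"
```

Sampling a fixed window rather than the whole file keeps the check cheap on large FastQ files, at the cost of falling back to the last 1,000 reads when the yield is lower than roughly 110,000 reads.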