Skip to content

Commit

Permalink
Bug fixes...
Browse files Browse the repository at this point in the history
  • Loading branch information
pneerincx committed Sep 21, 2018
1 parent b660075 commit a118316
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 74 deletions.
99 changes: 48 additions & 51 deletions checkSamplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,38 @@
import os
import csv
import sys
import re
from collections import defaultdict
from os.path import basename
# Command-line handling, input/log file opening and filename parsing for the
# samplesheet sanity check.
# NOTE(review): this span is a rendered diff hunk without +/- markers, so a
# superseded statement and its replacement sit next to each other (two
# ArgumentParser constructions, --logfile and --log, two open() calls, ...).
# Presumably the first of each pair is the removed line -- confirm against the
# repository before treating this as runnable code.
parser = argparse.ArgumentParser(description='Process some integers.')
parser = argparse.ArgumentParser(description='Process commandline opts.')
parser.add_argument("--input")
parser.add_argument("--logfile")
parser.add_argument("--log")
args = parser.parse_args()

columns = defaultdict(list)
f = open(args.input, 'r') # opens the csv file
print("inputfile:" + args.input)
reader = csv.DictReader(f) # creates the reader object
# os.path.splitext() drops only the last extension; the re.sub() variant
# further down drops everything from the first '.' in the base name.
sampleName=(basename(os.path.splitext(args.input)[0]))
f = open(args.input, 'r') # opens the samplesheet file.
print("INFO: input = " + args.input)
reader = csv.DictReader(f) # creates the reader object.
inputFileName=(basename(args.input))
# NOTE(review): the pattern is not a raw string, so '\.' relies on Python
# passing the unrecognised escape through unchanged; r'\..*$' is the
# conventional spelling.
inputFileNameBase = re.sub('\..*$', '', inputFileName)

#
# Parse meta-data from the filename.
#
# The base name is split on '_' and indexes 0-3 are read unconditionally as
# start date, sequencer, run and flowcell; a name with fewer than four
# components raises IndexError here.
splittedFileName = sampleName.split('_')
sequencestartdate = splittedFileName[0]
sequencer= splittedFileName[1]
runid = splittedFileName[2]
flowcell = splittedFileName[3]
inputFileNameComponents = inputFileNameBase.split('_')
sequencingStartDate = inputFileNameComponents[0]
sequencer= inputFileNameComponents[1]
run = inputFileNameComponents[2]
flowcell = inputFileNameComponents[3]

# Any components beyond the fourth are re-joined onto flowcell with '_'.
if len(splittedFileName) > 4:
for i in range(4,len(splittedFileName)):
flowcell+="_"+ str(splittedFileName[i])
if len(inputFileNameComponents) > 4:
for i in range(4,len(inputFileNameComponents)):
flowcell+="_"+ str(inputFileNameComponents[i])

# The log file is opened for writing here; it receives either "OK" or the
# joined error list at the end of the script.
w = open(args.logfile, 'w')
print("logfile:" + args.logfile)
# String flags ("true"/"false") next to boolean flags: the two styles of the
# two interleaved versions.
stopRun="false"
alreadyErrored="false"
w = open(args.log, 'w')
print("INFO: log = " + args.log)
sanityCheckOk=True
alreadyErrored=False
hasRows = False
listOfErrors=[]

Expand All @@ -47,52 +49,47 @@
#
# Per-row validation. For each required column, record an error when the
# column is absent from the row or present but empty.
# NOTE(review): the enclosing loop that defines `row` and `number` is not
# visible in this hunk; `row` is presumably a dict produced by csv.DictReader.
# NOTE(review): 'flowcell' appears twice in the tuple below.
for columnName in ('externalSampleID','project','sequencer','sequencingStartDate','flowcell','run','flowcell','lane','seqType','prepKit','capturingKit','barcode','barcodeType'):
if columnName not in row.keys():
if alreadyErrored == "false":
# NOTE(review): list.extend() with a str argument adds one list element per
# character; every other branch here uses append().
listOfErrors.extend("One required column is missing (or has a trailing space): " + columnName)
print("One required column is missing (or has a trailing space): " + columnName)
alreadyErrored="true"
sanityCheckOk=False
if not alreadyErrored:
# NOTE(review): same extend()-with-str pattern as above.
listOfErrors.extend('ERROR: Required column is missing (or has a trailing space): ' + columnName)
alreadyErrored=True
else:
if row[columnName] == "":
sanityCheckOk=False
# These three columns get a dedicated message asking for an explicit "None".
if columnName in ('capturingKit','barcode','barcodeType'):
if alreadyErrored == "false":
# NOTE(review): `sleutel` is not defined anywhere in the visible source; the
# neighbouring messages use columnName.
listOfErrors.append("The variable " + sleutel + " on line " + str(number) + " is empty! Please fill in None (this to be sure that it is not missing)")
stopRun="true"
alreadyErrored="true"
listOfErrors.append('ERROR on line ' + str(number) + ': Variable ' + columnName + ' is empty! Please fill in "None" (to make sure it is not missing).')
else:
if alreadyErrored == "false":
listOfErrors.append("The variable " + columnName + " on line " + str(number) + " is empty!")
stopRun="true"
alreadyErrored="true"
listOfErrors.append('ERROR on line ' + str(number) + ': Variable ' + columnName + ' is empty!')
#
# Check if the data inside the file matches the expected filename.
#
# NOTE(review): in each condition below row[...] is evaluated before the
# `in row.keys()` guard, so the guard cannot prevent a KeyError when the
# column is missing (assuming `row` is a plain dict).
if row['sequencer'] != sequencer and 'sequencer' in row.keys():
stopRun="true"
listOfErrors.append("the sequencer in the samplesheet is not matching the sequencer in the filename on line: " + str(number + 1))
print("the sequencer in the samplesheet is not matching the sequencer in the filename on line: " + str(number + 1))
if row['sequencingStartDate'] != sequencestartdate and 'sequencingStartDate' in row.keys():
stopRun="true"
listOfErrors.append("the sequencingStartDate in the samplesheet is not matching the sequencingStartDate in the filename on line: " + str(number + 1))
print("the sequencingStartDate in the samplesheet is not matching the sequencingStartDate in the filename on line: " + str(number + 1))
if row['run'] != runid and 'run' in row.keys():
stopRun="true"
listOfErrors.append("the run in the samplesheet is not matching the run in the filename on line: " + str(number + 1))
print("the run in the samplesheet is not matching the run in the filename on line: " + str(number + 1))
sanityCheckOk=False
listOfErrors.append('ERROR on line ' + str(number) + ': sequencer value in samplesheet (' + row['sequencer'] + ') does not match sequencer in filename (' + sequencer + ').')
if row['sequencingStartDate'] != sequencingStartDate and 'sequencingStartDate' in row.keys():
sanityCheckOk=False
listOfErrors.append('ERROR on line ' + str(number) + ': sequencingStartDate value in samplesheet (' + row['sequencingStartDate'] + ') does not match sequencingStartDate in filename (' + sequencingStartDate + ').')
if row['run'] != run and 'run' in row.keys():
sanityCheckOk=False
listOfErrors.append('ERROR on line ' + str(number) + ': run value in samplesheet (' + row['run'] + ') does not match run in filename (' + run + ').')
if row['flowcell'] != flowcell and 'flowcell' in row.keys():
stopRun="true"
listOfErrors.append("the flowcell in the samplesheet is not matching the flowcell in the filename on line: " + str(number + 1))
print("the flowcell in the samplesheet is not matching the flowcell in the filename on line: " + str(number + 1))
sanityCheckOk=False
# NOTE(review): unlike the three messages above, the samplesheet value is not
# wrapped in parentheses in this message.
listOfErrors.append('ERROR on line ' + str(number) + ': flowcell value in samplesheet ' + row['flowcell'] + ' does not match flowcell in filename (' + flowcell + ').')

# Final reporting: close the input, flag an input without rows, then write
# the verdict to the log file and exit.
f.close()

# hasRows is initialised to False earlier in the script; if it is still False
# here the input is reported as empty.
if not hasRows:
print("The complete file is empty?! in ")
listOfErrors.append("The complete file is empty?!")
sanityCheckOk=False
print("File is empty?!")
listOfErrors.append("File is empty?!")

# Success path writes the literal "OK" to the log and exits 0; failure path
# prints the collected errors, writes them newline-joined to the log and
# exits 1.
if stopRun == "true":
if sanityCheckOk:
w.write("OK")
w.close()
sys.exit(0)
else:
print('\n'.join(listOfErrors))
w.write('\n'.join(listOfErrors))
w.close()
sys.exit(1)
# NOTE(review): the remaining lines follow sys.exit(1) and duplicate the
# write/close logic above; they look like the superseded tail of the older
# version shown by this diff rendering -- confirm against the repository.
else:
w.write("OK")

w.close()
f.close()
56 changes: 33 additions & 23 deletions checkSamplesheet.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,42 @@ SCRIPT_NAME="$(basename ${0})"
# Main loop of the samplesheet checker: normalise each new *.csv, run the
# Python sanity check on it and either move it into place or report an error.
# NOTE(review): this span is a rendered diff hunk without +/- markers, so
# superseded commands and their replacements are interleaved (e.g. the
# "--logfile" invocation next to the "--log" one); it is not runnable as shown.
SCRIPT_NAME="${SCRIPT_NAME%.*sh}"

echo "INFO: processing samplesheets from ${sampleSheetsDir}/new/..."

# NOTE(review): iterating over $(ls ...) subjects the file names to word
# splitting, so paths containing whitespace are broken apart; looping over
# the glob directly would avoid that.
for sampleSheet in $(ls -1 "${sampleSheetsDir}/new/"*'.csv')
do
#
# Create a copy of the original, so we preserve the original owner
# and can check who to notify when something is wrong with the samplesheet.
#
cp "${sampleSheet}"{,.converted}

#
# Make sure
# 1. The last line ends with a line end character.
# 2. We have the right line end character: convert any carriage return (\r) to newline (\n).
# 3. We remove empty lines.
#
# The &&-chain below ends by moving the .converted copy back over the
# original; the three stand-alone commands after it leave the original
# untouched and only modify the .converted copy.
cp "${sampleSheet}"{,.converted} \
&& printf '\n' >> "${sampleSheet}.converted" \
&& sed -i 's/\r/\n/g' "${sampleSheet}.converted" \
&& sed -i '/^\s*$/d' "${sampleSheet}.converted" \
&& mv -f "${sampleSheet}"{.converted,}
printf '\n' >> "${sampleSheet}.converted"
sed -i 's/\r/\n/g' "${sampleSheet}.converted"
sed -i '/^\s*$/d' "${sampleSheet}.converted"
#
# Parse content with Python sanity check script.
#
"${sampleSheetsDir}"/"${SCRIPT_NAME}".py --input "${sampleSheet}" --logfile "${sampleSheet}.log"
filename=$(basename "${sampleSheet}")
check=$(cat "${sampleSheet}.log")
if [[ "${check}" == "OK" ]]
# check keeps the value 'failed' unless the Python script exits successfully,
# in which case it is replaced by the content of the .converted.log file.
check='failed' # default.
fileName=$(basename "${sampleSheet}")
if "${sampleSheetsDir}"/"${SCRIPT_NAME}".py --input "${sampleSheet}.converted" --log "${sampleSheet}.converted.log"
then
echo "INFO: Samplesheet is OK, moving ${sampleSheet} to ${sampleSheetsDir}..."
mv "${sampleSheet}" "${sampleSheetsDir}"
rm -f "${sampleSheet}.log.mailed"
rm -f "${sampleSheet}.log"
check=$(cat "${sampleSheet}.converted.log")
fi

if [[ "${check}" == 'OK' ]]
then
echo "INFO: Samplesheet is OK, moving ${sampleSheet}.converted to ${sampleSheetsDir}/${fileName}..."
mv "${sampleSheet}.converted" "${sampleSheetsDir}/${fileName}"
# The trailing glob removes the original samplesheet and every remaining file
# whose path starts with it (such as the .converted.log).
rm -f "${sampleSheet}"* # cleanup.
else
echo "ERROR: Samplesheet ${filename} is not correct, see log."
if [[ -e "${sampleSheet}.log.mailed" ]]
echo "ERROR: Samplesheet ${fileName} is not correct, see ${sampleSheet}.converted.log."
# The *.mailed marker file is created after a notification has been sent, so
# its presence suppresses a repeat mail for the same samplesheet.
if [[ -e "${sampleSheet}.converted.log.mailed" ]]
then
echo "INFO: Notification was already sent."
else
Expand All @@ -49,33 +57,35 @@ do
# Notification branch: collect recipients, build the message and mail it.
# NOTE(review): the opening "if" of this then/else is hidden in the collapsed
# diff context, and superseded/replacement lines are interleaved below.
then
# Join the lines of the mailing list file into one space-separated string.
mailAddress="$(cat "${baseDir}/logs/${SCRIPT_NAME}.mailinglist" | tr '\n' ' ')"
else
# NOTE(review): the echo variant pipes into mail without any recipient
# argument; the printf variant adds one, but the literal '[email protected]'
# looks like an address redacted by the page scrape -- the real value must be
# taken from the repository.
echo -e "ERROR: ${baseDir}/logs/${SCRIPT_NAME}.mailinglist is missing on $(hostname -s)\n" \
| mail -s "Samplesheet is wrong, but we cannot send email to the relevant users."
printf '%s\n' "ERROR: ${baseDir}/logs/${SCRIPT_NAME}.mailinglist is missing on $(hostname -s)." \
| mail -s "Samplesheet is wrong, but we cannot send email to the relevant users." '[email protected]'
fi
#
# Get email address for owner of the samplesheet.
#
# stat -c '%U' prints the owning user name of the original samplesheet; field 5
# of that user's passwd entry is then used as the owner's e-mail address.
fileOwner=$(stat -c "%U" "${sampleSheet}" | tr -d '\n')
fileOwner=$(stat -c '%U' "${sampleSheet}" | tr -d '\n')
mailAddressOwner="$(getent passwd "${fileOwner}" | cut -d ':' -s -f 5)"
if [[ -z "${mailAddressOwner:-}" ]]
then
echo -e "WARN: We do not have an email address for this user: ${fileOwner}\n" \
printf '%s\n' "WARN: We do not have an email address for this user: ${fileOwner}." \
| mail -s "Samplesheet is wrong on $(hostname -s), but we cannot email the owner." "${mailAddress:-}"
else
# NOTE(review): the first assignment below lacks its closing double quote; the
# line after it is the correctly quoted form.
mailAddress="${mailAddress:-} ${mailAddressOwner:-}
mailAddress="${mailAddress:-} ${mailAddressOwner:-}"
fi
#
# Prepare message content.
#
header="Dear ${fileOwner},"
# $(<file) inlines the content of the log file into the message body.
body="${SCRIPT_NAME} detected an error when parsing ${sampleSheet} on $(hostname -s): $(<"${sampleSheet}.log")"
body="${SCRIPT_NAME} detected an error when parsing ${sampleSheet} on $(hostname -s): $(<"${sampleSheet}.converted.log")"
footer='Cheers from the GCC.'
#
# Send email to notify users.
#
printf '%s\n\n%s\n\n%s\n' "${header}" "${body}" "${footer}" \
| mail -s "Samplesheet is wrong on $(hostname -s)" "${mailAddress:-}"
touch "${sampleSheet}.log.mailed"
| mail -s "Samplesheet is wrong on $(hostname -s)." "${mailAddress:-}"
# Create the marker file that the "already sent" test looks for.
touch "${sampleSheet}.converted.log.mailed"
fi
fi
done

echo "INFO: finished processing samplesheets."

0 comments on commit a118316

Please sign in to comment.