Skip to content

Commit

Permalink
Merge pull request #204 from pneerincx/master
Browse files Browse the repository at this point in the history
Improved checkSamplesheet procedure
  • Loading branch information
Gerbenvandervries authored Sep 24, 2018
2 parents f9efe0b + 1dc7cd2 commit c1278d6
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 105 deletions.
88 changes: 0 additions & 88 deletions checkSampleSheet_v2.py

This file was deleted.

95 changes: 95 additions & 0 deletions checkSamplesheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python

import argparse
import os
import csv
import sys
import re
from collections import defaultdict
from os.path import basename
#
# Columns that must be present and non-empty in every row of the samplesheet.
#
REQUIRED_COLUMNS = (
    'externalSampleID', 'project', 'sequencer', 'sequencingStartDate',
    'flowcell', 'run', 'lane', 'seqType', 'prepKit', 'capturingKit',
    'barcode', 'barcodeType',
)
#
# Columns for which the user must explicitly fill in "None" when there is no value.
#
EXPLICIT_NONE_COLUMNS = ('capturingKit', 'barcode', 'barcodeType')
#
# Columns whose values must match the meta-data encoded in the filename.
#
FILENAME_COLUMNS = ('sequencer', 'sequencingStartDate', 'run', 'flowcell')


def _parseFileName(path):
    """Parse the meta-data encoded in the samplesheet filename.

    The filename (minus everything from the first dot onwards) is expected to
    look like <sequencingStartDate>_<sequencer>_<run>_<flowcell>; any extra
    underscore separated components are considered part of the flowcell.

    Returns a dict keyed by column name, or None when the filename has fewer
    than four components.
    """
    inputFileNameBase = re.sub(r'\..*$', '', basename(path))
    components = inputFileNameBase.split('_')
    if len(components) < 4:
        return None
    return {
        'sequencingStartDate': components[0],
        'sequencer': components[1],
        'run': components[2],
        'flowcell': '_'.join(components[3:]),
    }


def _validateRows(rows, expected):
    """Check all rows and return a tuple (listOfErrors, hasRows).

    rows     -- iterable of dicts as produced by csv.DictReader.
    expected -- dict from _parseFileName() or None; when None the comparison
                with the filename is skipped.

    The numbers used in the error messages count data rows starting at 1,
    so the header line is not included in the count.
    """
    listOfErrors = []
    reportedMissing = set()
    hasRows = False
    for number, row in enumerate(rows, 1):
        hasRows = True
        #
        # Check if the required columns are present and have a value.
        #
        for columnName in REQUIRED_COLUMNS:
            if columnName not in row:
                # A missing column affects every row: report each one only once.
                if columnName not in reportedMissing:
                    reportedMissing.add(columnName)
                    listOfErrors.append('ERROR: Required column is missing (or has a trailing space): ' + columnName + '.')
            elif not row[columnName]:
                # Covers both "" and None; DictReader yields None for rows
                # that have fewer fields than the header.
                message = 'ERROR on line ' + str(number) + ': Variable ' + columnName + ' is empty!'
                if columnName in EXPLICIT_NONE_COLUMNS:
                    message += ' Please fill in "None" (to make sure it is not missing).'
                listOfErrors.append(message)
        #
        # Check if the data inside the file matches the expected filename.
        #
        if expected is None:
            continue
        for columnName in FILENAME_COLUMNS:
            # Missing columns were already reported above.
            if columnName not in row:
                continue
            value = row[columnName] or ''
            if value != expected[columnName]:
                listOfErrors.append(
                    'ERROR on line ' + str(number) + ': ' + columnName
                    + ' value in samplesheet (' + value + ') does not match '
                    + columnName + ' in filename (' + expected[columnName] + ').')
    return listOfErrors, hasRows


def checkSamplesheet(inputPath, logPath):
    """Sanity check the samplesheet inputPath and write the result to logPath.

    The log will contain "OK" when no problems were found or else one error
    message per line; errors are also printed to STDOUT.

    Returns True when the samplesheet is OK and False otherwise.
    """
    listOfErrors = []
    expected = _parseFileName(inputPath)
    if expected is None:
        listOfErrors.append(
            'ERROR: Filename ' + basename(inputPath)
            + ' does not match the expected format'
            + ' <sequencingStartDate>_<sequencer>_<run>_<flowcell>.')
    with open(inputPath, 'r') as samplesheet:
        rowErrors, hasRows = _validateRows(csv.DictReader(samplesheet), expected)
    listOfErrors.extend(rowErrors)
    if not hasRows:
        listOfErrors.append("File is empty?!")

    sanityCheckOk = not listOfErrors
    with open(logPath, 'w') as log:
        if sanityCheckOk:
            log.write("OK")
        else:
            log.write('\n'.join(listOfErrors))
    if not sanityCheckOk:
        print('\n'.join(listOfErrors))
    return sanityCheckOk


def main():
    """Parse the commandline and exit with 0 (samplesheet OK) or 1 (errors found)."""
    parser = argparse.ArgumentParser(description='Process commandline opts.')
    parser.add_argument("--input", required=True)
    parser.add_argument("--log", required=True)
    args = parser.parse_args()
    print("INFO: input = " + args.input)
    print("INFO: log = " + args.log)
    if checkSamplesheet(args.input, args.log):
        sys.exit(0)
    sys.exit(1)


if __name__ == '__main__':
    main()

99 changes: 99 additions & 0 deletions checkSamplesheet.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/bin/bash

set -eu

#
# Determine the name of this script without its *sh extension
# and the installation dir, which is the parent of the dir containing this script.
#
SCRIPT_NAME="$(basename "${0}")"
SCRIPT_NAME="${SCRIPT_NAME%.*sh}"
INSTALLATION_DIR="$(cd -P "$(dirname "${0}")/.." && pwd)/"
declare sampleSheetsDir="${INSTALLATION_DIR}"'Samplesheets/'

echo "INFO: Processing samplesheets from ${sampleSheetsDir}/new/..."

#
# Collect the samplesheets NUL-separated, so paths containing whitespace
# are not split into multiple array elements.
#
declare -a sampleSheets=()
while IFS= read -r -d '' foundSampleSheet
do
	sampleSheets+=("${foundSampleSheet}")
done < <(find "${sampleSheetsDir}/new/" -name '*.csv' -print0)

#
# Plain length expansion: combining the length operator with a ':-' default
# is not valid parameter expansion syntax (newer Bash versions reject it).
#
if [[ "${#sampleSheets[@]}" -eq '0' ]]
then
	echo "WARN: No samplesheets found in ${sampleSheetsDir}/new/."
	exit 0
else
	echo "DEBUG: samplesheets found: ${sampleSheets[*]}."
fi

for sampleSheet in "${sampleSheets[@]}"
do
	#
	# Create a copy of the original, so we preserve the original owner
	# and can check who to notify when something is wrong with the samplesheet.
	#
	cp "${sampleSheet}"{,.converted}
	
	#
	# Make sure
	#  1. The last line ends with a line end character.
	#  2. We have the right line end character: convert any carriage return (\r) to newline (\n).
	#  3. We remove empty lines.
	#
	printf '\n'     >> "${sampleSheet}.converted"
	sed -i 's/\r/\n/g' "${sampleSheet}.converted"
	sed -i '/^\s*$/d'  "${sampleSheet}.converted"
	#
	# Parse content with Python sanity check script.
	# The content of the log is only used when the Python script exited with zero.
	#
	check='failed' # default.
	fileName=$(basename "${sampleSheet}")
	if "${sampleSheetsDir}"/"${SCRIPT_NAME}".py --input "${sampleSheet}.converted" --log "${sampleSheet}.converted.log"
	then
		check=$(cat "${sampleSheet}.converted.log")
	fi
	
	if [[ "${check}" == 'OK' ]]
	then
		echo "INFO: Samplesheet is OK, moving ${sampleSheet}.converted to ${sampleSheetsDir}/${fileName}..."
		mv "${sampleSheet}.converted" "${sampleSheetsDir}/${fileName}"
		rm -f "${sampleSheet}"* # cleanup: original, log and *.mailed marker.
	else
		echo "ERROR: Samplesheet ${fileName} is not correct, see ${sampleSheet}.converted.log."
		#
		# The *.mailed marker file prevents sending the same notification over and over again.
		#
		if [[ -e "${sampleSheet}.converted.log.mailed" ]]
		then
			echo "INFO: Notification was already sent."
		else
			echo "INFO: Trying to send email notification ..."
			#
			# Get email addresses for list of users that should always receive mail.
			#
			declare mailAddress=''
			if [[ -e "${INSTALLATION_DIR}/logs/${SCRIPT_NAME}.mailinglist" ]]
			then
				mailAddress="$(tr '\n' ' ' < "${INSTALLATION_DIR}/logs/${SCRIPT_NAME}.mailinglist")"
			else
				#
				# NOTE(review): the recipient below looks like a redacted placeholder - confirm the real address.
				#
				printf '%s\n' "ERROR: ${INSTALLATION_DIR}/logs/${SCRIPT_NAME}.mailinglist is missing on $(hostname -s)." \
					| mail -s "Samplesheet is wrong, but we cannot send email to the relevant users." '[email protected]'
			fi
			#
			# Get email address for owner of the samplesheet.
			# Taken from the 5th field of the owner's passwd entry.
			#
			fileOwner=$(stat -c '%U' "${sampleSheet}" | tr -d '\n')
			mailAddressOwner="$(getent passwd "${fileOwner}" | cut -d ':' -s -f 5)"
			if [[ -z "${mailAddressOwner:-}" ]]
			then
				printf '%s\n' "WARN: We do not have an email address for this user: ${fileOwner}." \
					| mail -s "Samplesheet is wrong on $(hostname -s), but we cannot email the owner." "${mailAddress:-}"
			else
				mailAddress="${mailAddress:-} ${mailAddressOwner:-}"
			fi
			#
			# Prepare message content.
			#
			header="Dear ${fileOwner},"
			body="${SCRIPT_NAME} detected an error when parsing ${sampleSheet} on $(hostname -s): $(<"${sampleSheet}.converted.log")"
			footer='Cheers from the GCC.'
			#
			# Send email to notify users.
			#
			printf '%s\n\n%s\n\n%s\n' "${header}" "${body}" "${footer}" \
				| mail -s "Samplesheet is wrong on $(hostname -s)." "${mailAddress:-}"
			touch "${sampleSheet}.converted.log.mailed"
		fi
	fi
done

echo "INFO: finished processing samplesheets."
17 changes: 0 additions & 17 deletions checkSamplesheetWrapper.sh

This file was deleted.

0 comments on commit c1278d6

Please sign in to comment.