#!/bin/sh
#
# mergeFiles

# Choose the per-user cache root: the first candidate that is an existing,
# writable directory wins.  When no candidate qualifies the loop simply runs
# out and scratch_dir keeps its last value (/tmp).
for scratch_dir in /data /scratch /tmp; do
  [ -d "$scratch_dir" ] || continue
  [ -w "$scratch_dir" ] || continue
  break
done
CACHE_HOME="${scratch_dir}/${USER}"

# Report a fatal error and terminate the script.
# Arguments: the message to print (all arguments are joined with spaces).
# Outputs:   the message, on stderr.
# Exits with status 1; never returns.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}


# Print the usage/help text on stdout and terminate the script.
# Globals: CACHE_HOME (read) - shown as the default location of the cache dir.
# Exits with status 2; never returns.
PrintUsage() {
  cat <<EOF
USAGE: $0 [options] output_file input_directory(s)

OPTIONS:
  --cache-dir=${CACHE_HOME}/<mergeName>
  --match-input-files='*.root'
  --exclude-input-files=pattern
  --reuse-cache-files
  --abort-on-copy-error
  --failed-files=file (where to write names of files that could not be copied)
  --copy-timeout=N  (seconds)
  --parallel=N      (number of files to copy in parallel; default 3)
  --merge-only      (merge whatever files already exist in local cache)
  --use-hadd        (use the root 'hadd' program to do the merge)
  --crab-unique     (filter out duplicate output files)

Note that <mergeName> is taken from the name of the output file.

This script copies files from dCache to a local directory and then
merges them together.  The purpose of copying the files is to better
control what happens when access to a file in dCache hangs due to
the dCache node(s) where the file resides being offline.
By default, this script will hang indefinitely in such cases, but
you can do two things: one is to specify --copy-timeout.  The other
is to kill the hung dccp process.  Unless you specify --abort-on-copy-error
the merge will go ahead even if some files failed to be copied
from dCache.  To know whether any files were skipped, search for
WARNING in the output from this script.
EOF
  exit 2
}

# Normalise the command line with getopt(1).  The -o/-l long-option syntax
# used here needs the enhanced (util-linux) getopt.  On a parse error getopt
# has already printed a diagnostic, so just show the usage text.
OPTS=$(getopt -o "h" -l "help,cache-dir:,match-input-files:,exclude-input-files:,reuse-cache-files,abort-on-copy-error,copy-timeout:,merge-only,use-hadd,parallel:,failed-files:,crab-unique" -- "$@") \
  || PrintUsage

# Replace the positional parameters with getopt's normalised form:
# options first, then "--", then the non-option arguments.
eval set -- "$OPTS"

# Option defaults.
CACHE_DIR=
MATCH_INPUT_FILES='*.root'
EXCLUDE_INPUT_FILES=
REUSE_CACHE_FILES=
ABORT_ON_COPY_ERROR=0
COPY_TIMEOUT=0
MERGE_ONLY=0
USE_HADD=0
PARALLEL=3
CRAB_UNIQUE=0
# Reset explicitly so that a value inherited from the environment can never
# end up on the dccp_many command line.
FAILED_FILES_ARG=

while [ "$#" -gt 0 ]; do
  case "$1" in
    -h|--help)
      PrintUsage
      ;;
    --cache-dir)
      shift
      CACHE_DIR=$1
      ;;
    --match-input-files)
      shift
      MATCH_INPUT_FILES=$1
      ;;
    --exclude-input-files)
      shift
      EXCLUDE_INPUT_FILES=$1
      ;;
    --reuse-cache-files)
      REUSE_CACHE_FILES=1
      ;;
    --abort-on-copy-error)
      ABORT_ON_COPY_ERROR=1
      ;;
    --copy-timeout)
      shift
      COPY_TIMEOUT=$1
      ;;
    --merge-only)
      # Merging from the cache implies that the existing cache is reused.
      MERGE_ONLY=1
      REUSE_CACHE_FILES=1
      ;;
    --use-hadd)
      USE_HADD=1
      ;;
    --parallel)
      shift
      PARALLEL=$1
      ;;
    --failed-files)
      shift
      FAILED_FILES_ARG="--failed_files=$1"
      ;;
    --crab-unique)
      CRAB_UNIQUE=1
      ;;
    --)
      # End of options; what remains are the positional arguments.
      shift
      break
      ;;
    *)
      echo "Unexpected option $1"
      PrintUsage
      ;;
  esac
  shift
done

# An output file and at least one input directory are required.
if [ "$#" -lt 2 ]; then PrintUsage; fi

# The first positional argument is the merge output file; after the shift
# the remaining positional arguments are the input directories.
MERGE_FILE=$1
shift

# Never overwrite an existing merge file.
if [ -f "$MERGE_FILE" ]; then
  die "ERROR: $MERGE_FILE already exists.  Aborting."
fi

# Default cache directory: $CACHE_HOME/<output file name without ".root">.
# MERGE_FILE is quoted so that an output path containing whitespace still
# yields a single, correct directory name.
if [ -z "$CACHE_DIR" ]; then
  CACHE_DIR=$CACHE_HOME/$(basename "$MERGE_FILE" .root)
fi

# An already existing cache directory is only acceptable when the user asked
# for it to be reused (--reuse-cache-files, implied by --merge-only).
if [ -d "$CACHE_DIR" ]; then
  if [ "$REUSE_CACHE_FILES" = "1" ]; then
    echo "Reusing any existing files in $CACHE_DIR."
  else
    echo "ERROR: cache directory already exists: $CACHE_DIR."
    echo "Either remove it or specify --reuse-cache-files."
    exit 1
  fi
fi

# hadd is only a requirement when it will perform the merge.  Start from an
# empty value so that an inherited HADD_EXE cannot add a bogus requirement.
HADD_EXE=
if [ "$USE_HADD" != "0" ]; then
  HADD_EXE=hadd
fi

# Check for some required utilities.  $HADD_EXE is deliberately unquoted so
# that it disappears from the list when it is empty.  "command -v" with a
# POSIX redirection replaces "which ... >& /dev/null": ">&" with a file name
# is a bash extension that fails under a strict /bin/sh.
for exe in root dccp dccp_many $HADD_EXE; do
  if ! command -v "$exe" > /dev/null 2>&1; then
    echo "Cannot find $exe in PATH.  Your environment is not correctly set up."
    exit 1
  fi
done

# Locate the mergeFiles.C macro, which is expected to sit next to this script.
# When $0 does not name a file directly, resolve it through PATH first.
# Everything is quoted so that an install path with whitespace still works.
MERGE_C=$0
if ! [ -f "$0" ]; then
  MERGE_C=$(command -v "$0")
fi
MERGE_C=$(dirname "$MERGE_C")/mergeFiles.C
if ! [ -f "$MERGE_C" ]; then
  echo "ERROR: no such file: $MERGE_C"
  exit 1
fi

# Job list for the copy step.  mktemp creates the file atomically under an
# unpredictable name, which avoids the symlink race of removing and then
# re-creating a guessable /tmp path.
FILE_LIST=$(mktemp /tmp/mergeFiles_file_list.XXXXXX) \
  || die "Failed to create a temporary file list in /tmp."

# Remove the job list and the files derived from it on every exit path.
# Single quotes delay the expansion until the trap actually runs.
trap 'rm -f "$FILE_LIST" "$FILE_LIST.crab" "$FILE_LIST.crab.filtered"' EXIT

mkdir -p "$CACHE_DIR" || die "Failed to create $CACHE_DIR."

# Print, one per line, the regular files below directory $1 whose names match
# $MATCH_INPUT_FILES and, when $EXCLUDE_INPUT_FILES is non-empty, do not match
# that pattern.  Both patterns reach find(1) quoted, so the shell never
# expands them against the contents of the current directory.
list_input_files() {
  if [ -n "$EXCLUDE_INPUT_FILES" ]; then
    find "$1" -type f -name "$MATCH_INPUT_FILES" ! -name "$EXCLUDE_INPUT_FILES"
  else
    find "$1" -type f -name "$MATCH_INPUT_FILES"
  fi
}

# Build the copy job list: one "source destination" line for every input
# file that is not yet present (with non-zero size) in the cache directory.
# With --merge-only nothing is scheduled for copying.
if [ "$MERGE_ONLY" != "1" ]; then
  for dir; do
    list_input_files "$dir" |
    while IFS= read -r fname; do
      cached_name=$(basename "$fname")
      dest_fname=${CACHE_DIR}/${cached_name}
      if [ -s "$dest_fname" ]; then
        echo "Already in local cache: ${cached_name}."
      else
        # The loop runs in a sub-shell, so this exit only ends the loop;
        # the failure is picked up right after "done".
        printf '%s %s\n' "$fname" "$dest_fname" >> "$FILE_LIST" || exit 1
      fi
    done || {
      # Checked per directory, so a write failure is never masked by a
      # later directory that happens to succeed.
      rm -f "$FILE_LIST"
      exit 1
    }
  done
fi

# Optional --crab-unique step: keep only those job-list lines that contain
# one of the file names printed by listUniqueCrabFiles for the input
# directories (fixed-string matching via grep -F).
if [ "$CRAB_UNIQUE" = "1" ]; then
  crab_list=$FILE_LIST.crab
  crab_filtered=$FILE_LIST.crab.filtered

  if ! command -v listUniqueCrabFiles > /dev/null 2>&1; then
    echo "Did not find listUniqueCrabFiles in PATH, so cannot filter redundant CRAB files."
    rm -f "$FILE_LIST"
    exit 1
  fi

  rm -f "$crab_list"
  for dir; do
    if ! listUniqueCrabFiles "$dir" >> "$crab_list"; then
      # Do not leave temporary files behind on the way out.
      rm -f "$FILE_LIST" "$crab_list"
      die "Failed to run listUniqueCrabFiles $dir"
    fi
  done

  # grep exits with 1 when nothing matched, which is a legitimate outcome;
  # only a status above 1 signals a real error.
  grep -F -f "$crab_list" "$FILE_LIST" > "$crab_filtered"
  if [ "$?" -gt 1 ]; then
    rm -f "$FILE_LIST" "$crab_list" "$crab_filtered"
    die "Failed to filter $FILE_LIST with the list of unique CRAB files"
  fi

  # awk prints a bare line count (0 for an empty file).
  lines_before=$(awk 'END{print NR}' "$FILE_LIST")
  lines_after=$(awk 'END{print NR}' "$crab_filtered")
  lines_removed=$((lines_before - lines_after))

  if [ "$lines_removed" -gt 0 ]; then
    echo "Filtered out $lines_removed redundant CRAB output files."
  fi

  mv "$crab_filtered" "$FILE_LIST"
  rm -f "$crab_list"
fi

# Pass --continue_on_error to dccp_many unless --abort-on-copy-error was
# given.  The variable is reset first so that a value inherited from the
# environment cannot override the user's choice.
CONTINUE_ON_ERROR_ARG=
if [ "$ABORT_ON_COPY_ERROR" = 0 ]; then
  CONTINUE_ON_ERROR_ARG=--continue_on_error
fi

# Run the copy.  The two optional arguments expand to nothing when they are
# empty; the failed-files argument is quoted inside the expansion so that a
# path containing whitespace stays a single word.
dccp_many \
  --copyjobfile="$FILE_LIST" \
  --copy_timeout="$COPY_TIMEOUT" \
  $CONTINUE_ON_ERROR_ARG \
  ${FAILED_FILES_ARG:+"$FAILED_FILES_ARG"} \
  --parallel="$PARALLEL"
dccp_rc=$?

echo

# The job list is no longer needed once the copy step has finished.
rm -f "$FILE_LIST"

if [ "$dccp_rc" -ne 0 ]; then
  if [ "$ABORT_ON_COPY_ERROR" = 1 ]; then
    echo "Aborting because the copying of some files failed."
    exit 1
  fi
  echo "WARNING: the copying of some files failed, but --abort-on-copy-error was not specified, so continuing with the merge."
fi

# Count the regular files available in the cache directory (recursively).
# CACHE_DIR is quoted here just as it is in the hadd call below, so both
# steps look at the same directory even when its path contains whitespace.
files=$(find "$CACHE_DIR" -type f -print | awk 'END{print NR}')
if ! [ "$files" -gt 0 ]; then
  echo "WARNING: no files exist in $CACHE_DIR, so nothing to merge."
  exit 1
fi

echo
echo "Merging $files files in $CACHE_DIR into merge file $MERGE_FILE"

# Merge either with the mergeFiles.C ROOT macro (default) or with hadd
# (--use-hadd), remembering the exit status of whichever command ran.
if [ "$USE_HADD" = "0" ]; then
  root -b -l -q "$MERGE_C(\"$MERGE_FILE\",\"$CACHE_DIR\")"
  merge_rc=$?
else
  hadd "$MERGE_FILE" "$CACHE_DIR"/*
  merge_rc=$?
fi

if [ "$merge_rc" -ne 0 ]; then
  echo "WARNING: the merge command exited with non-zero status"
  exit 1
fi

echo "Done merging $files in $CACHE_DIR into merge file $MERGE_FILE"