#!/bin/bash
#
# publishFarmoutput
#
# Prepares the framework job reports of a farmout job for publication and
# publishes the resulting dataset to DBS.
#
# NOTE: the interpreter line must be the very first line of the file to take
# effect.  This script relies on bash features (`source`, `${!name}`, `$UID`,
# `${var/pattern/}`), so it asks for bash explicitly instead of /bin/sh.

# DBS environment setup suitable for use with Wisconsin DBS server
. /cvmfs/cms.hep.wisc.edu/osg/app/cmssoft/cms/slc5_ia32_gcc434/cms/dbs-client/DBS_2_0_9_patch_4-cms8/etc/profile.d/init.sh

#DBS_URL=https://cmsdbsprod.cern.ch:8443/cms_dbs_prod_local_01_writer/servlet/DBSServlet
DBS_URL=http://honey.hep.wisc.edu:8080/DBS/servlet/DBSServlet


# CRAB environment setup

# Start from an empty PYTHONPATH before sourcing the CRAB environment.
export PYTHONPATH=
#export PATH=/cms/sw/python-2.5/bin:$PATH
#source /cms/sw/CRAB_2_3_1/crab.sh
#source /cms/sw/CRAB_2_4_3/crab.sh

source /cms/sw/CRAB_2_7_2_p1/crab.sh

# I failed to use CRAB versions from 2_7_4 and on, because they
# depend on some sqlite Boss db that doesn't exist for farmout jobs
#source /cms/sw/CRAB_2_8_1/crab.sh

# for storing output
SEName=cmssrm.hep.wisc.edu

# for getting input
STORE_USER_URL="/hdfs/store/user/${USER}"

# Minimum remaining grid proxy lifetime (in hours) required to proceed.
MIN_PROXY_HOURS=24

# Pick the first scratch area that exists: /data, then /scratch, then /tmp.
scratch_dir="/data"
if ! [ -d "$scratch_dir" ]; then
  scratch_dir="/scratch"
fi
if ! [ -d "$scratch_dir" ]; then
  scratch_dir="/tmp"
fi
SUBMIT_HOME="${scratch_dir}/${USER}"

# Print the final component of a path, optionally without a trailing suffix.
#
# Arguments:
#   $1 - path
#   $2 - (optional) suffix to remove from the end of the result; it is
#        matched literally, not as a glob pattern
# Outputs:
#   the resulting name on stdout
basename() {
  # This shell function is faster than calling /bin/basename
  local path=$1
  local suffix=${2-}

  path=${path##*/}  # get everything after the final '/'
  if [ -n "$suffix" ]; then
    # Quoting the suffix keeps characters such as '*' or '?' literal.
    path=${path%"$suffix"}
  fi
  # printf with a quoted argument preserves whitespace and never globs.
  printf '%s\n' "$path"
}

# Print the canonical form of a path.
#
# Arguments:
#   $1 - path to resolve
# Outputs:
#   If $1 does not exist it is printed unchanged; otherwise the output of
#   `readlink -f` for it is printed.
realpath() {
  if ! [ -e "$1" ]; then
    printf '%s\n' "$1"
    # Stop here: without this return, readlink below would also run and
    # could print a second line for the same path.
    return 0
  fi
  readlink -f -- "$1"
}

# Write a diagnostic message to stderr.
#
# Arguments:
#   $@ - words of the message; joined with single spaces.  With no
#        arguments an empty line is written.
logerror() {
  # '>&2' sends the message to stderr.  ('2>&1' would do the opposite and
  # leave the message on stdout.)
  printf '%s\n' "$*" >&2
}

# Report a fatal error and terminate the script.
#
# Emits an empty line and then the given message through logerror, and
# exits with status 1.
#
# Arguments:
#   $@ - message words, handed to logerror unchanged
die() {
  logerror; logerror "$@"
  exit 1
}

# Verify that a grid proxy file exists and has enough lifetime left.
#
# Arguments:
#   $1 - minimum acceptable remaining lifetime, in hours
#   $2 - path of the proxy file
# Returns:
#   0 if the proxy has at least the requested lifetime left, or if the
#     remaining lifetime cannot be determined (a warning is logged);
#   1 if the proxy file is missing or expires too soon.
check_proxy() {
  local hours=$1
  local proxy=$2
  local min_proxy_lifetime
  local seconds_left

  if ! [ -f "$proxy" ]; then
    logerror
    logerror "NOTE: No grid proxy found.  (Expecting to find it here: $proxy.)"
    return 1
  fi

  # Issue a warning if less than this many seconds remain:
  min_proxy_lifetime=$((3600 * hours))

  seconds_left=$(voms-proxy-info --timeleft --file="$proxy" 2>/dev/null)

  # Anything that is not a (possibly negative) integer cannot be compared
  # numerically, so it is treated the same as "no answer".
  case "${seconds_left#-}" in
    '' | *[!0-9]*)
      logerror "WARNING: cannot find time remaining for grid proxy."
      # Show the full proxy details for this file to help diagnose.
      voms-proxy-info -all --file="$proxy"
      return 0
      ;;
  esac

  if [ "$seconds_left" -lt "$min_proxy_lifetime" ]; then
    logerror
    logerror "NOTE: grid proxy is about to expire:"
    logerror "voms-proxy-info"
    voms-proxy-info --file="$proxy"
    return 1
  fi

  return 0
}

# Print the command-line help text to stdout and exit with status 2.
#
# Globals read: DBS_URL, STORE_USER_URL, SUBMIT_HOME, SEName (their current
# values are shown as the defaults of the corresponding options).
PrintUsage() {
  cat <<EOF
USAGE: publishFarmoutput [options] <jobName> <CMSSW Ver>

OPTIONS:
  --dbs-url=${DBS_URL} (default is OSG local DBS)
  --output-dir=${STORE_USER_URL}/<jobName>
  --submit-dir=${SUBMIT_HOME}/<jobName>
  --PrimaryDataset=<jobName>
  --ProcessedDataset=name
  --DataTier=tier (default USER)
  --ApplicationFamily=name (default 'output')
  --SEName=${SEName}
  --continue  (proceed even if the <submit-dir>-pub directory already exists)
  --ignore-failed-job-reports
  --strip-input-file-info  (do not publish info about input files)

The published dataset will be <PrimaryDataset>/<ProcessedDataset>/<DataTier>
Example: /PhotonJet_Pt_30_50/CMSSW_2_1_7-more-info/USER
EOF
  exit 2
}

# Convert framework job reports into the form expected for publication.
#
# Reads one input report path per line from stdin and, for each, runs
# ModifyJobReport.py to write ${PUB_DIR}/res/crab_fjr_<name>, where <name>
# is the final path component of the input.  Reports whose output file
# already exists are skipped.
#
# Globals read: PUB_DIR, FARMOUT_HOME, PrimaryDataset, ProcessedDataset,
#   DataTier, ApplicationFamily, ApplicationName, ApplicationVersion,
#   PSetHash, SEName, OUTPUT_DIR, LFN_DIR, STRIP_INPUT_FILE_INFO,
#   IGNORE_FAILED_JOB_REPORTS
# Returns:
#   0 if every report was prepared, or if failures occurred but
#     IGNORE_FAILED_JOB_REPORTS is 1;
#   1 otherwise.
ModifyJobReport() {
  local failed=0
  local input_fjr
  local output_fjr

  # IFS= and -r keep each path exactly as written (leading/trailing
  # whitespace and backslashes included).
  while IFS= read -r input_fjr; do

    # The publication script is stolen from CRAB.
    # It expects framework job reports of form res/crab_fjr*.
    output_fjr="${PUB_DIR}/res/crab_fjr_${input_fjr##*/}"
    if [ -f "${output_fjr}" ]; then
      echo "${output_fjr} already exists; skipping"
      continue
    fi

    # stdin of the child is redirected from /dev/null so that it cannot
    # consume the remaining lines of the list this loop is reading.
    # STRIP_INPUT_FILE_INFO is passed only when it is non-empty.
    if ! python "${FARMOUT_HOME}/ModifyJobReport.py" \
      --input-fjr="${input_fjr}" \
      --output-fjr="${output_fjr}" \
      --PrimaryDataset="${PrimaryDataset}" \
      --ProcessedDataset="${ProcessedDataset}" \
      --DataTier="${DataTier}" \
      --ApplicationFamily="${ApplicationFamily}" \
      --ApplicationName="${ApplicationName}" \
      --ApplicationVersion="${ApplicationVersion}" \
      --PSetHash="${PSetHash}" \
      --SEName="${SEName}" \
      --pfn-path="${OUTPUT_DIR}" \
      --lfn-path="${LFN_DIR}" \
      ${STRIP_INPUT_FILE_INFO:+"${STRIP_INPUT_FILE_INFO}"} \
      </dev/null; then
      logerror "Failed to prepare framework job report for publication: ${input_fjr}"
      # Do not leave a partial report behind; it would be skipped as
      # "already exists" on a later run.
      rm -f -- "${output_fjr}"
      failed=1
    fi

  done

  if [ "$failed" = 1 ]; then
    if [ "$IGNORE_FAILED_JOB_REPORTS" = 1 ]; then
      logerror
      logerror "WARNING: some framework job reports will be ignored, because they could not be prepared for publication."
      failed=0
    else
      logerror
      logerror "Aborting publication to DBS, because some job reports could not be processed."
      logerror "To publish whatever jobs succeeded in the above preparation step, ignoring those that did not, you must specify --ignore-failed-job-reports"
    fi
  fi

  return "$failed"
}

# Normalize the command line with getopt(1); show the usage text if it
# rejects the arguments.
if ! OPTS=$(getopt -o "h" -l "help,dbs-url:,output-dir:,submit-dir:,PrimaryDataset:,ProcessedDataset:,DataTier:,ApplicationFamily:,SEName:,continue,ignore-failed-job-reports,strip-input-file-info" -- "$@"); then
  PrintUsage
fi

# Replace the positional parameters with getopt's normalized list.
eval set -- "$OPTS"

# Option defaults.
IGNORE_FAILED_JOB_REPORTS=0
CONTINUE_PUB=0
STRIP_INPUT_FILE_INFO=""
OUTPUT_DIR=
SUBMIT_DIR=
CFG_EXTENSION=
PrimaryDataset=
ProcessedDataset=
DataTier=USER
# Not sure what ApplicationFamily means.  Using default value same as
# ModuleLabel that appears in fjr for farmout jobs.
ApplicationFamily=output
ApplicationName=cmsRun

# Consume options up to the "--" separator.  Options that take a value
# read it from $2 and shift once here; the shift at the bottom of the loop
# then drops the option name's slot.
while [ -n "$1" ]; do
  case "$1" in
    -h | --help)
      PrintUsage
      ;;
    --dbs-url)
      DBS_URL=$2
      shift
      ;;
    --output-dir)
      OUTPUT_DIR=$2
      shift
      ;;
    --submit-dir)
      SUBMIT_DIR=$2
      shift
      ;;
    --PrimaryDataset)
      PrimaryDataset=$2
      shift
      ;;
    --ProcessedDataset)
      ProcessedDataset=$2
      shift
      ;;
    --DataTier)
      DataTier=$2
      shift
      ;;
    --ApplicationFamily)
      ApplicationFamily=$2
      shift
      ;;
    --SEName)
      SEName=$2
      shift
      ;;
    --continue)
      CONTINUE_PUB=1
      ;;
    --ignore-failed-job-reports)
      IGNORE_FAILED_JOB_REPORTS=1
      ;;
    --strip-input-file-info)
      STRIP_INPUT_FILE_INFO="--strip-input-file-info"
      ;;
    --)
      shift
      break
      ;;
    *)
      die "Unexpected option $1"
      ;;
  esac
  shift
done

# Exactly two positional arguments must remain: <jobName> <CMSSW Ver>.
[ "$#" -eq 2 ] || PrintUsage


# Check for some required utilities
# `command -v` is the shell's built-in lookup, so this does not depend on an
# external `which` program being installed.
for exe in scramv1 readlink voms-proxy-info; do
  if ! command -v "$exe" >/dev/null 2>&1; then
    die "Cannot find $exe in PATH.  Your environment is not correctly set up."
  fi
done

# Additional command-line arguments

jobName=$1
ApplicationVersion=$(basename "$2")

# Explicit --output-dir / --submit-dir win; otherwise derive from jobName.
OUTPUT_DIR=${OUTPUT_DIR:-${STORE_USER_URL}/$jobName}
SUBMIT_DIR=${SUBMIT_DIR:-${SUBMIT_HOME}/$jobName}
# Logical file name directory: "/store/" followed by whatever comes after
# the first "/store/" in OUTPUT_DIR.
LFN_DIR=/store/${OUTPUT_DIR#*/store/}

# choose one of the job config files to represent the config file for the whole set
configTemplate=$(find "${SUBMIT_DIR}" -name '*.py' | sort | head -n 1)
if [ -z "${configTemplate}" ]; then
  # Without a config file the later checksum step has nothing to read.
  die "No job config file (*.py) found under ${SUBMIT_DIR}."
fi

# Text after the last '.' of the config file name.
CFG_EXTENSION="${configTemplate/*./}"
runName=$(basename "$configTemplate" ".${CFG_EXTENSION}")

# Directory holding this script; the helper python scripts are run from it.
FARMOUT_HOME=$(realpath "$0")
if ! [ -f "${FARMOUT_HOME}" ]; then
  # $0 is not a usable file path; look the script name up in PATH instead.
  FARMOUT_HOME=$(basename "$0")
  FARMOUT_HOME=$(command -v "${FARMOUT_HOME}")
fi
FARMOUT_HOME=$(dirname "${FARMOUT_HOME}")

PrimaryDataset=${PrimaryDataset:-${jobName}}
#ProcessedDataset=${ProcessedDataset:-${runName}}

# Every one of these settings must be non-empty; ${!name} reads the
# variable whose name is stored in $name.
for name in DataTier PrimaryDataset ProcessedDataset ApplicationFamily SEName; do
  if [ -z "${!name}" ]; then
    die "You must specify --${name}."
  fi
done

proxy=${X509_USER_PROXY:-/tmp/x509up_u$UID}

if ! check_proxy "$MIN_PROXY_HOURS" "$proxy"; then
  logerror
  logerror "You must create a new grid proxy"
  logerror "and rerun this command.  Example of how to create a grid proxy:"
  logerror
  logerror "voms-proxy-init --voms=cms --valid=48:00"
  exit 1
fi

# CRAB uses edmConfigChecksum to generate the PSetHash.
# Unfortunately, our template is not parsable, so that does not work.
# Therefore, we just run md5sum.  However, this does not produce the
# same hash value that edmConfigChecksum would.
# Perhaps it would be better to generate the hash based on the first
# job.  (NOTE: registering a different hash for each job is not what
# DBS is expecting; this results in errors at publication time.)
#
# The file name is quoted so that an empty value makes md5sum fail
# immediately instead of waiting for data on stdin.
PSetHash=$(md5sum "${configTemplate}") \
  || die "Failed to compute the md5sum of config file '${configTemplate}'."
PSetHash=${PSetHash%% *}  # checksum is first field

# The config file is passed to the publisher after changing directory
# below, so make a relative path absolute while the original working
# directory is still current.
case "${configTemplate}" in
  /*) ;;
  *) configTemplate="${PWD}/${configTemplate}" ;;
esac

PUB_DIR=${SUBMIT_DIR}-pub

if [ "${CONTINUE_PUB}" != "1" ] && [ -d "${PUB_DIR}" ]; then
  logerror
  logerror "The directory ${PUB_DIR} already exists."
  logerror "Either rerun this script with the --continue option or"
  logerror "remove it and rerun this script.."
  exit 1
fi

mkdir -p "${PUB_DIR}/res" || die "Cannot create directory ${PUB_DIR}/res"

# Feed every *.xml file under the submit directory to ModifyJobReport.
find "${SUBMIT_DIR}" -name '*.xml' | ModifyJobReport || exit 1


cd "${PUB_DIR}" || die "Cannot change directory to ${PUB_DIR}"

if [ -z "$DBS_URL" ]; then
  logerror
  logerror "ERROR: no --dbs-url specified; aborting before publication step"
  exit 1
fi

# crab expects a log directory
mkdir -p log

# The exit status of the publisher becomes the exit status of this script.
python "${FARMOUT_HOME}/CallCrabPublisher.py" \
  --dbs-url="${DBS_URL}" \
  --ProcessedDataset="${ProcessedDataset}" \
  --pset="${configTemplate}"