forked from CU-DBMI/smartsheet-notebooks
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_crawl.sh
executable file
·159 lines (133 loc) · 6.02 KB
/
run_crawl.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env bash
# exit on any error
set -e

# create a network in which to run the PMC crawler and reformed
DOCKER_NETWORK="pmc-crawler"
# Only report "already exists" when the network really is there. Any other
# failure of `docker network create` (for example an unreachable daemon) is
# left to abort the script via `set -e`, with docker's own error message,
# instead of being mislabelled as "already exists".
if docker network inspect "${DOCKER_NETWORK}" >/dev/null 2>&1; then
  echo "* Network '${DOCKER_NETWORK}' already exists, skipping creation..."
else
  docker network create "${DOCKER_NETWORK}"
fi

# which docker image to use to run the crawl.
# (you can either build pmc-crawler locally via build_all_images.sh, or use
# a version of the crawler from google's artifact repo. the artifact repo
# version is the default; set CRAWLER_IMAGE in the environment to override it.)
CRAWLER_IMAGE=${CRAWLER_IMAGE:-"us-central1-docker.pkg.dev/cuhealthai-foundations/tools/pmc-crawler:latest"}

# ensure the format converter container is running
# (compare against the container name exactly, so that an unrelated container
# whose image or command merely contains "reformed" is not mistaken for it)
if ! docker ps --format '{{.Names}}' | grep -qx 'reformed'; then
  echo "* Reformed isn't running, booting it now..."
  docker run --rm -d \
    --name reformed \
    --network "${DOCKER_NETWORK}" \
    -p 8088:8000 \
    ghcr.io/davidlougheed/reformed:sha-1b8f46b
fi
# default date range: the first and the last day of the current month,
# both formatted as YYYY/MM/DD
t_first_date="$(date +%Y/%m/01)"
case "${OSTYPE}" in
  darwin*)
    # `date -v` adjustments: go to day 1, forward one month, back one day
    t_last_date="$(date -v1d -v+1m -v-1d +%Y/%m/%d)"
    ;;
  *)
    # `date -d` relative arithmetic, starting from the first of this month
    t_last_date="$(date -d "$(date +%Y%m01) +1 month -1 day" +%Y/%m/%d)"
    ;;
esac
# ---------------------------------------
# --- step 1. prompt for run parameters
# ---------------------------------------

#######################################
# Print the value assigned to a key in a KEY=VALUE style file.
# Everything after the first '=' is kept, so values that themselves contain
# '=' are returned whole. Prints nothing when the key is absent.
# Arguments: $1 - key name, $2 - path to the file
# Outputs:   the value (one line per matching entry) on stdout
# Returns:   always 0, so a missing key never aborts a `set -e` script
#######################################
env_file_value() {
  local key=$1
  local file=$2
  grep -e "^${key}=" -- "${file}" | cut -d'=' -f2- || true
}

# pre-step: extract params from the .env file, if available
ENV_FILE="./app/.env"
if [[ -f "${ENV_FILE}" ]]; then
  ENV_AUTHORS_SHEET_ID=$(env_file_value AUTHORS_SHEET_ID "${ENV_FILE}")
  ENV_AUTHORS_SHEET_PATH=$(env_file_value AUTHORS_SHEET_PATH "${ENV_FILE}")
  ENV_DEPARTMENT=$(env_file_value DEPARTMENT "${ENV_FILE}")
  ENV_DEPARTMENT_NAME=$(env_file_value DEPARTMENT_NAME "${ENV_FILE}")
fi

# load defaults for arguments in the following order:
# 1. from an env var with the same name as the argument
# 2. for authors, dept. args: from the ./app/.env file with the same name as the argument
# 3. for start, end dates, from a precomputed value
AUTHORS_SHEET_ID=${AUTHORS_SHEET_ID:-${ENV_AUTHORS_SHEET_ID:-""}}
AUTHORS_SHEET_PATH=${AUTHORS_SHEET_PATH:-${ENV_AUTHORS_SHEET_PATH:-""}}
DEPARTMENT=${DEPARTMENT:-${ENV_DEPARTMENT:-""}}
DEPARTMENT_NAME=${DEPARTMENT_NAME:-${ENV_DEPARTMENT_NAME:-""}}

# finally, accept the author sheet as an optional positional param
# (it takes precedence over both the environment and the .env file)
if [[ -n "${1:-}" ]]; then
  AUTHORS_SHEET_PATH=$1
fi
# start date: prompt only when START_DATE is not set at all; a START_DATE
# that is set but empty silently takes the precomputed t_first_date default
if [[ -z "${START_DATE+x}" ]]; then
  read -p "- Enter start date [${t_first_date}]: " start_date_reply
  START_DATE=${start_date_reply:-${t_first_date}}
else
  START_DATE=${START_DATE:-${t_first_date}}
fi

# end date: same rules as above, falling back to t_last_date
if [[ -z "${END_DATE+x}" ]]; then
  read -p "- Enter end date [${t_last_date}]: " end_date_reply
  END_DATE=${end_date_reply:-${t_last_date}}
else
  END_DATE=${END_DATE:-${t_last_date}}
fi
# decide where the authors' sheet comes from: a local file given via
# AUTHORS_SHEET_PATH wins, and in that case no smartsheet ID is requested
if [[ -z "${AUTHORS_SHEET_PATH}" ]]; then
  if [[ -n "${AUTHORS_SHEET_ID}" ]]; then
    # an ID was already supplied, so just report it rather than prompting
    echo "* Got AUTHORS_SHEET_ID from .env file: ${AUTHORS_SHEET_ID}"
  else
    # nothing supplied yet; give the user a chance to enter an ID
    read -p "- Enter smartsheet ID for authors' sheet [${AUTHORS_SHEET_ID}]: " sheet_id_reply
    AUTHORS_SHEET_ID=${sheet_id_reply:-${AUTHORS_SHEET_ID}}
  fi
else
  echo "* Using sheet specified in AUTHORS_SHEET_PATH (${AUTHORS_SHEET_PATH})"
fi
#######################################
# Settle the DEPARTMENT filter.
#   - empty or unset: ask the user (a blank answer leaves the filter disabled)
#   - the sentinel 'n/a' (provided up front or typed at the prompt): disable
#     the filter
#   - anything else: keep the provided value untouched
# Globals:   DEPARTMENT (read, written)
# Arguments: none (may read one line from stdin)
#######################################
resolve_department() {
  local reply=""
  if [[ -z "${DEPARTMENT:-}" ]]; then
    read -r -p "- Enter department (a blank value disables this filter): " reply
    DEPARTMENT=${reply}
  fi
  if [[ "${DEPARTMENT}" == "n/a" ]]; then
    DEPARTMENT=""
  fi
}

#######################################
# Settle DEPARTMENT_NAME, the title used to customize the report.
# A value that was already provided is kept; otherwise the user is asked, and
# a blank answer falls back to the default "List of Publications".
# Globals:   DEPARTMENT_NAME (read, written)
# Arguments: none (may read one line from stdin)
#######################################
resolve_department_name() {
  local reply=""
  if [[ -z "${DEPARTMENT_NAME:-}" ]]; then
    read -r -p "- Enter department name, for customizing the report: " reply
    # set it to a reasonable default when nothing was entered
    DEPARTMENT_NAME=${reply:-"List of Publications"}
  fi
}

resolve_department
resolve_department_name
# verify inputs

# a sheet ID of -1 is a placeholder meaning "no sheet ID"; normalize it to
# empty so it is handled exactly like a missing ID. (compared as a string, so
# a non-numeric ID no longer triggers an "integer expression expected" error)
if [[ "${AUTHORS_SHEET_ID}" == "-1" ]]; then
  AUTHORS_SHEET_ID=""
fi

# if neither AUTHORS_SHEET_ID nor AUTHORS_SHEET_PATH is set, raise an error
if [[ -z "${AUTHORS_SHEET_PATH}" && -z "${AUTHORS_SHEET_ID}" ]]; then
  echo "ERROR: either author sheet path or author sheet ID required, but neither were specified" >&2
  exit 1
fi

# if we're using AUTHORS_SHEET_PATH, ensure that it's an accessible file
if [[ -n "${AUTHORS_SHEET_PATH}" && ! -f "${AUTHORS_SHEET_PATH}" ]]; then
  echo "ERROR: author sheet path ('${AUTHORS_SHEET_PATH}') is not accessible" >&2
  exit 1
fi
# -------------------------------------------------------------------
# --- step 2. start the run with the entered params, storing artifacts in ./output
# -------------------------------------------------------------------

# make sure the staging, output, and intermediate directories are present
# (./app/input_sheets should exist already; creating it again is harmless)
for run_dir in ./app/input_sheets output intermediate; do
  mkdir -p "${run_dir}"
done

# when a local sheet is in use, place a copy in the input staging area
if [[ -n "${AUTHORS_SHEET_PATH}" ]]; then
  if ! cp "${AUTHORS_SHEET_PATH}" ./app/input_sheets/; then
    # deliberately non-fatal: report the failure and carry on
    echo "ERROR: failed to copy author sheet to staging area, continuing..."
  fi
  # re-point the path at the staged copy, relative to the staging area
  sheet_filename=$( basename "${AUTHORS_SHEET_PATH}" )
  AUTHORS_SHEET_PATH="/app/input_sheets/${sheet_filename}"
fi
# clean up any old containers before running
# (there may be nothing to remove; never let that abort the run under `set -e`)
docker rm --force pmc-crawler >/dev/null 2>&1 || true

# Build the `docker run` arguments as an array so that every value, including
# a working directory containing spaces, reaches docker as a single word.
crawler_args=(
  --init -it --name pmc-crawler
  --network "${DOCKER_NETWORK}"
  -e "START_DATE=${START_DATE}"
  -e "END_DATE=${END_DATE}"
  -e "AUTHORS_SHEET_ID=${AUTHORS_SHEET_ID}"
  -e "AUTHORS_SHEET_PATH=${AUTHORS_SHEET_PATH}"
  -e "DEPARTMENT=${DEPARTMENT}"
  # NOTE(review): an empty DEPARTMENT_NAME is passed as the two literal
  # characters '' (the quotes are not stripped inside the double quotes).
  # Kept as-is because the crawler may rely on it; confirm whether a truly
  # empty value is acceptable.
  -e "DEPARTMENT_NAME=${DEPARTMENT_NAME:-''}"
  -v "${PWD}/app:/app"
  -v "${PWD}/output:/app/_build"
  -v "${PWD}/intermediate:/app/_output"
)

# the .env file is optional; only hand it to docker when it is present, since
# `docker run` refuses to start if --env-file names a missing file
if [[ -f ./app/.env ]]; then
  crawler_args+=( --env-file ./app/.env )
else
  echo "* No ./app/.env file found, running without --env-file..."
fi

# run the crawl, reporting how long it took
time docker run "${crawler_args[@]}" "${CRAWLER_IMAGE}"