forked from vectara/vectara-ingest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.sh
executable file
·108 lines (94 loc) · 4.11 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# args[1] = config file
# args[2] = secrets profile
# example: bash run.sh <config-dir>/news-bbc.yaml dev
# (run with bash, not sh: the script uses [[ ]], `function` and &>, which are bash-only)
# Validate the command line and local prerequisites before doing any work.
# Exit codes: 1 = fewer than two arguments, 2 = config file not found,
#             3 = secrets.toml not found in the current directory.
if (( $# < 2 )); then
  echo "Missing arguments."
  echo "Usage: $0 <config-file> <secrets-profile>"
  exit 1
fi

# The first argument must name an existing regular file.
[[ -f "$1" ]] || {
  echo "Error: '$1' is not a valid configuration file"
  exit 2
}

# secrets.toml is looked up relative to the current working directory.
[[ -f secrets.toml ]] || {
  echo "Error: secrets.toml file does not exist, please create one following the README instructions"
  exit 3
}
# Retrieve the crawler type (crawling.crawler_type) from the config file,
# lower-cased so the comparisons further down are case-insensitive.
# The config path is handed to Python as an argument (sys.argv[1]) instead of
# being spliced into the Python source, so a file name containing quotes or
# backslashes is read correctly rather than breaking the one-liner.
crawler_type=$(
  python3 -c "import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))['crawling']['crawler_type'])" "$1" \
    | tr '[:upper:]' '[:lower:]'
)
# Stage the files the container needs in ~/tmp/mount; that directory is
# bind-mounted into the container when it is started further down.
mkdir -p ~/tmp/mount
if [ -f secrets.toml ]; then
  cp secrets.toml ~/tmp/mount
fi
cp "$1" ~/tmp/mount/

# The gdrive crawler additionally gets credentials.json, when that file exists.
case "$crawler_type" in
  gdrive)
    if [ -f credentials.json ]; then
      cp credentials.json ~/tmp/mount
    fi
    ;;
esac
# Build docker container
# Pick the Docker target architecture from the host CPU type.
# `uname -m` reports 64-bit ARM as "arm64" on macOS but as "aarch64" on Linux;
# both correspond to Docker's linux/arm64 platform. Any other machine type
# falls back to amd64.
case "$(uname -m)" in
  arm64 | aarch64) ARCH="arm64" ;;
  *) ARCH="amd64" ;;
esac
# Determine the build command based on the availability of Buildx
# has_buildx: returns the exit status of `docker buildx version`
# (0 when the probe succeeds); everything the probe prints is discarded.
has_buildx() {
  docker buildx version &> /dev/null
}
# Select the docker sub-command used for the image build: the classic builder
# when the Buildx probe fails, "buildx build" otherwise.
# BUILD_CMD stays a plain string because the build step expands it unquoted.
if ! has_buildx; then
  BUILD_CMD="build"
  echo "Building for $ARCH"
else
  BUILD_CMD="buildx build"
  echo "Building for $ARCH with buildx"
fi
# Decide which image flavour to build and build it.
# The "full" image (INSTALL_EXTRA=true, tag vectara-ingest-full) is built when
# the config sets vectara.summarize_tables or vectara.mask_pii to true;
# otherwise the plain vectara-ingest image is built.
# The config path is passed to Python as sys.argv[1] instead of being spliced
# into the Python source, so unusual file names cannot break the one-liners.
sum_tables=$(
  python3 -c "import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))['vectara'].get('summarize_tables', 'false'))" "$1" \
    | tr '[:upper:]' '[:lower:]'
)
mask_pii=$(
  python3 -c "import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))['vectara'].get('mask_pii', 'false'))" "$1" \
    | tr '[:upper:]' '[:lower:]'
)

if [[ "$sum_tables" == "true" || "$mask_pii" == "true" ]]; then
  echo "Building with extra features"
  tag="vectara-ingest-full"
  install_extra="true"
else
  tag="vectara-ingest"
  install_extra="false"
fi

# BUILD_CMD is expanded unquoted on purpose: it may hold two words ("buildx build").
# This must remain the last command of this section, because the check that
# follows reads $? to learn whether the build succeeded.
# shellcheck disable=SC2086
docker $BUILD_CMD --build-arg INSTALL_EXTRA="$install_extra" --platform "linux/$ARCH" . --tag="$tag:latest"
# Abort with exit code 4 unless the command that ran immediately before this
# check (the docker build) returned success.
if [ $? -ne 0 ]; then
  echo "Docker build failed. Please check the messages above. Exiting..."
  exit 4
fi
echo "Docker build successful."
# Remove a leftover container named "vingest", if one exists, so that the
# name is free for the container started below.
if docker container inspect vingest > /dev/null 2>&1; then
  docker rm -f vingest
fi
# Run docker container
# The container receives the staged config/secrets under /home/vectara/env and
# is told which config file (CONFIG) and secrets profile (PROFILE) to use.
# Some crawlers read local data, which is bind-mounted under /home/vectara/data.
# Exit codes: 6 = folder missing, 5 = CSV or JSON input file missing.
config_file_name="${1##*/}"

# Arguments common to every crawler type. The tilde is left unquoted so that
# it expands to the home directory.
run_args=(-d -v ~/tmp/mount:/home/vectara/env)

# The config path is passed to Python as sys.argv[1] rather than spliced into
# the Python source, so unusual file names cannot break the one-liners.
case "$crawler_type" in
  folder)
    # special handling of "folder crawler" where we need to mount the folder under /home/vectara/data
    folder=$(python3 -c "import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))['folder_crawler']['path'])" "$1")
    if [[ ! -d "$folder" ]]; then
      echo "Error: Folder '$folder' does not exist."
      exit 6
    fi
    run_args+=(-v "$folder:/home/vectara/data")
    ;;
  csv)
    # special handling of "csv crawler" where we need to mount the csv file under /home/vectara/data
    file_path=$(python3 -c "import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))['csv_crawler']['file_path'])" "$1")
    if [[ ! -f "$file_path" ]]; then
      echo "Error: CSV file '$file_path' does not exist."
      exit 5
    fi
    run_args+=(-v "$file_path:/home/vectara/data/file")
    ;;
  bulkupload)
    # special handling of "bulkupload crawler" where we need to mount the JSON file under /home/vectara/data
    json_path=$(python3 -c "import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))['bulkupload_crawler']['json_path'])" "$1")
    # Check the JSON path itself; the earlier code tested $file_path, which is
    # never set in this branch, so every bulk upload was rejected.
    if [[ ! -f "$json_path" ]]; then
      echo "Error: JSON file '$json_path' does not exist."
      exit 5
    fi
    run_args+=(-v "$json_path:/home/vectara/data/file.json")
    ;;
esac

run_args+=(-e "CONFIG=/home/vectara/env/$config_file_name" -e "PROFILE=$2" --name vingest)

# This must remain the last command of this section, because the check that
# follows reads $? to learn whether the container started.
docker run "${run_args[@]}" "$tag"
# Report the outcome of the command that ran immediately before this check
# (the `docker run` that starts the ingest container).
if [ $? -eq 0 ]; then
  echo "Success! Ingest job is running."
  echo "You can try 'docker logs -f vingest' to see the progress."
else
  echo "Ingest container failed to start. Please check the messages above."
  # Propagate the failure to the caller. Without an explicit exit the script
  # would end with the status of the echo above (0), so automation could not
  # tell a failed start from a successful one.
  exit 7
fi