-
Notifications
You must be signed in to change notification settings - Fork 5
/
scrape-many-articles.sh
executable file
·51 lines (44 loc) · 1.26 KB
/
scrape-many-articles.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash
# scrapes *many* articles from XML on the filesystem.
# see `generate-article-json.sh` to scrape *all* articles from XML.
# see `scrape-article.sh` to scrape a *single* article from XML.
#
# usage: ./scrape-many-articles.sh [scrape-file] [output-dir]
set -euo pipefail

# a plain text file with one filename per line. should look similar to:
# elife-09419-v2.xml
# elife-15192-v2.xml
# elife-16019-v1.xml
scrape_file=${1:-scrape.txt}
output_dir=${2:-.}

if [ ! -f "$scrape_file" ]; then
    echo "input file doesn't exist: $scrape_file" >&2
    exit 1
fi

if [ ! -d "$output_dir" ]; then
    echo "output directory doesn't exist: $output_dir" >&2
    exit 1
fi

# we ingest from the latest on the master branch
project_dir=$(pwd) # bot-lax project, where this script lives
xmlrepodir="$project_dir/article-xml/articles"
(
    . download-elife-xml.sh
    cd "$xmlrepodir"
    # do this because 'download-elife-xml.sh' obeys article-xml repository pin
    git reset --hard
    git checkout master
    git pull
)

source venv/bin/activate

# '|| [ -n "$line" ]' ensures the final line is processed even when the
# scrape file lacks a trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
    # skip blank / whitespace-only lines
    if [[ -z "${line// }" ]]; then
        continue
    fi
    article="$xmlrepodir/$line"
    # skip missing files
    if [ ! -f "$article" ]; then
        echo "file not found: $article" >&2
        continue
    fi
    # scrape a single article to JSON, named after the XML file
    # (e.g. elife-09419-v2.xml -> elife-09419-v2.xml.json)
    python src/main.py "$article" > "$output_dir/$line.json"
done < "$scrape_file"