forked from hadojae/DATA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathslickshoes.sh
34 lines (26 loc) · 1.31 KB
/
slickshoes.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/bin/bash
#
# slickshoes.sh - pull links out of pdf files
# hadojae
#
# Usage: slickshoes.sh <path to pdf files>
#
# Every entry directly inside the given directory that file(1) reports as a
# "PDF document" is searched three ways:
#   1. raw byte scan for anything that starts with "http",
#   2. raw byte scan for "/URI (...)" entries,
#   3. pdf-parser.py stream search (run with --filter) for http(s) links.
# The unique links, minus a list of known false-positive domains, are
# printed to stdout.
#
# Exit status: 0 on success, 1 on configuration / argument / setup errors.

# Full path to Didier Stevens' pdf-parser.py. Must be filled in before use.
pdf_parser_loc=''

if [[ -z "$pdf_parser_loc" ]]; then
  {
    echo -e "\nPlease set the full path location of pdf-parser.py in the first line of the script."
    echo -e "eg. '/home/user/downloads/pdf-parser.py' "
    echo -e "Pdf parser can be downloaded from https://blog.didierstevens.com/programs/pdf-tools/\n"
  } >&2
  exit 1
fi

# Without this guard a missing argument turns 'cd' into 'cd $HOME' and the
# whole home directory would be scanned.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <path to pdf files>\n' "${0##*/}" >&2
  exit 1
fi

# Links are collected outside the scanned directory so that a stale file
# from an interrupted run can never leak into the results, and so the
# directory being examined does not have to be writable.
links_tmp="$(mktemp)" || {
  printf 'Unable to create a temporary file.\n' >&2
  exit 1
}
trap 'rm -f -- "$links_tmp"' EXIT

cd -- "$1" || {
  printf 'Unable to change directory to: %s\n' "$1" >&2
  exit 1
}

# Domains that routinely show up in PDF metadata / producer strings and are
# not interesting. Each entry is an extended regular expression fragment.
false_positive_domains=(
  'DidierStevens\.com'
  'fonts\.com'
  'typoland\.com'
  'monotypeimaging\.com'
  'monotype\.com'
  'dynaforms\.com'
  'youtube\.com'
  'radpdf\.com'
  'igouv\.fr'
  'support\.microsoft\.com'
  'maps\.google\.com'
  'wikipedia\.org'
  'mitre\.org'
  'code\.google\.com'
  'www\.w3\.org'
  'purl\.org'
  'adobe\.com'
  'convertapi\.com'
  'wondershare\.net'
  'iec\.ch'
  'verisign\.com'
  'microsoft\.com'
  'neevia\.com'
  'aiim\.org'
  'pdf-tools\.com'
  'color\.org'
)
# Join the fragments with '|' into a single alternation: (a|b|c...).
false_positive_regex="($(IFS='|'; printf '%s' "${false_positive_domains[*]}"))"

# The './' prefix keeps names that begin with '-' from being read as options.
for i in ./*; do
  # Also skips the literal './*' left behind when the directory is empty.
  [[ -f "$i" ]] || continue

  file_type="$(file -b -- "$i")"
  if [[ "${file_type%%,*}" == "PDF document" ]]; then
    # -a: treat the binary PDF as text; -o: emit only the matched part.
    grep -E -i -a -o --no-filename "http[^)]+" "$i" >> "$links_tmp"
    grep -E -i -a -o --no-filename "/URI\s*\([^)]+\)" "$i" \
      | sed -E 's/.*\(([^)]+)\).*/\1/' >> "$links_tmp"
    python "$pdf_parser_loc" --regex --searchstream="https?:\/" --filter "$i" \
      | grep -E -i -a -o "http[^)\"\']+" >> "$links_tmp"
  fi
done

#if we found some urls, remove fp's and print them (temp file is removed by the EXIT trap)
if [[ -s "$links_tmp" ]]; then
  # grep -v exits 1 when every link was filtered out; that is not an error.
  sort -u -- "$links_tmp" | grep -E -i -v -a "$false_positive_regex" || true
fi