-
Notifications
You must be signed in to change notification settings - Fork 4
/
Makefile
59 lines (48 loc) · 3.37 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
############
# CLEANING #
############
# Output directory for the cleaned datasets.
# 'mkdir -p' succeeds silently when the directory is already there.
clean_data:
	mkdir -p $(@)
# Clean products, product classes, promotions, and transactions.
# All cleaning tasks use the same generic python script: it receives the raw
# CSV on stdin and is given the output path as its only argument.
#
# '$<' is a special variable that stands for "1st input file".
# '$@' is a special variable that stands for "the output file".
#
# 'clean_data' is listed after '|' (an order-only prerequisite): the directory
# must exist before the recipe runs, but its timestamp -- which changes every
# time a file is written into it -- must not make this target look out of date.
clean_data/product.pkl: raw_data/product.csv clean_dataset.py | clean_data
	python clean_dataset.py $@ < $<
# Clean the product class dataset: the raw CSV ('$<') is fed to the cleaning
# script on stdin and the output path ('$@') is passed as its argument.
# 'clean_data' is order-only (after '|') so that writing other files into the
# directory does not make this target look stale.
clean_data/product_class.pkl: raw_data/product_class.csv clean_dataset.py | clean_data
	python clean_dataset.py $@ < $<
# Clean the promotion dataset.
# Clean up media_type values first: sed replaces every ', ' with ' | ' on all
# lines after line 1 (line 1 is left untouched).
# Promotions must go through sed before being piped to the cleaning script, or it will break.
# Note: make interprets '$' as a variable, so '$$' is needed for sed to receive
# its literal '$' ("last line") address.
# 'clean_data' is order-only (after '|') so that writing other files into the
# directory does not make this target look stale.
clean_data/promotion.pkl: raw_data/promotion.csv clean_dataset.py | clean_data
	sed -e '2,$$ s/, / | /g' $< | python clean_dataset.py $@
# Clean the transactions dataset.
# Clean up fact_count values first: sed rewrites every '1);' as '1', stripping
# the stray ');' characters before the data reaches the cleaning script.
# NOTE(review): the pattern only matches when the digit before ');' is 1 --
# confirm that no fact_count value ends in another digit.
# 'clean_data' is order-only (after '|') so that writing other files into the
# directory does not make this target look stale.
clean_data/transactions.pkl: raw_data/transactions.csv clean_dataset.py | clean_data
	sed 's/1);/1/g' $< | python clean_dataset.py $@
# Convenience target: build every cleaned dataset that is missing or out of date.
# 'clean_datasets' is declared phony because it is only a name for this group
# of files -- the rule itself has no recipe and produces no output.
.PHONY: clean_datasets
clean_datasets: \
    clean_data/product.pkl \
    clean_data/product_class.pkl \
    clean_data/promotion.pkl \
    clean_data/transactions.pkl
############
# ANALYSIS #
############
# Run data quality analysis, which lives in a jupyter notebook, and save output to HTML.
#
# '$^' is the variable for "all input files"; 'filter' is a Make function that
# keeps only the words matching a pattern.
#   $(filter %.pkl,$^)    -> the four cleaned dataset filenames
#   $(filter %.ipynb,$^)  -> the notebook to execute
# The dataset filenames are passed to the notebook as a single space-separated
# string in the temporary env var `datasets`.
#
# nbconvert executes the notebook cells and sends the HTML to stdout. The HTML
# is written to '$@.tmp' and renamed onto the target only after jupyter
# succeeds, so a failed run cannot leave behind an empty file that Make would
# consider up to date.
notebooks/data_quality_analysis.html: clean_data/product.pkl clean_data/product_class.pkl clean_data/promotion.pkl clean_data/transactions.pkl notebooks/data_quality_analysis.ipynb
	datasets='$(filter %.pkl,$^)' jupyter nbconvert --execute $(filter %.ipynb,$^) --to html --stdout > $@.tmp
	mv -f $@.tmp $@
# Run complete analysis notebook for question one, save output to HTML.
# The cleaned datasets (the '.pkl' inputs) are passed to the notebook as a
# space-separated string in the temporary env var `datasets`; the notebook to
# execute is the '.ipynb' input.
# Output goes to '$@.tmp' and is renamed onto the target only after jupyter
# succeeds, so a failed run cannot leave an empty "up to date" HTML file behind.
notebooks/question_one.html: clean_data/product.pkl clean_data/product_class.pkl clean_data/promotion.pkl clean_data/transactions.pkl notebooks/question_one.ipynb
	datasets='$(filter %.pkl,$^)' jupyter nbconvert --execute $(filter %.ipynb,$^) --to html --stdout > $@.tmp
	mv -f $@.tmp $@
# Run complete analysis notebook for question two, save output to HTML.
# The notebook prerequisite is question_two.ipynb -- the notebook this rule
# executes -- so that editing it triggers a rebuild of this HTML file.
# The cleaned datasets (the '.pkl' inputs) are passed to the notebook as a
# space-separated string in the temporary env var `datasets`; the notebook to
# execute is the '.ipynb' input.
# Output goes to '$@.tmp' and is renamed onto the target only after jupyter
# succeeds, so a failed run cannot leave an empty "up to date" HTML file behind.
notebooks/question_two.html: clean_data/product.pkl clean_data/product_class.pkl clean_data/promotion.pkl clean_data/transactions.pkl notebooks/question_two.ipynb
	datasets='$(filter %.pkl,$^)' jupyter nbconvert --execute $(filter %.ipynb,$^) --to html --stdout > $@.tmp
	mv -f $@.tmp $@
# Run executive summary notebook, save output to HTML.
# Rebuilt whenever either question report or the summary notebook itself changes.
# NOTE(review): presumably the summary notebook draws on the two question
# reports, which is why they are prerequisites -- confirm against the notebook.
# The notebook to execute is the '.ipynb' input. Output goes to '$@.tmp' and is
# renamed onto the target only after jupyter succeeds, so a failed run cannot
# leave an empty "up to date" HTML file behind.
notebooks/executive_summary.html: notebooks/question_one.html notebooks/question_two.html notebooks/executive_summary.ipynb
	jupyter nbconvert --execute $(filter %.ipynb,$^) --to html --stdout > $@.tmp
	mv -f $@.tmp $@
# Convenience target: render every analysis notebook to HTML.
# Phony because 'all_analyses' names a group of reports, not a file.
.PHONY: all_analyses
all_analyses: \
    notebooks/data_quality_analysis.html \
    notebooks/question_one.html \
    notebooks/question_two.html \
    notebooks/executive_summary.html