generated from coderefinery/word-count
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit a965358
Showing
27 changed files
with
38,180 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
*~ | ||
__pycache__/ | ||
_build/ | ||
venv/ | ||
results/*.png | ||
results/*.txt | ||
processed_data/*.dat |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# .readthedocs.yml | ||
# Read the Docs configuration file | ||
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details | ||
|
||
# Required | ||
version: 2 | ||
|
||
# Build documentation in the doc/ directory with Sphinx | ||
sphinx: | ||
configuration: doc/conf.py | ||
|
||
# Optionally build your docs in additional formats such as PDF and ePub | ||
formats: all |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2019, CodeRefinery | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Directory layout and derived file lists.
# Everything here is simply expanded (:=), so each value is computed exactly
# once when the Makefile is parsed instead of on every reference.
# The directories can still be overridden on the command line,
# e.g. `make SRCDIR=books`.

# directory containing source data
SRCDIR := data

# directory containing intermediate data
TMPDIR := processed_data

# results directory
RESDIR := results

# all source files (book texts)
# $(sort ...) pins the order of the list: some GNU Make releases return
# wildcard matches in directory order, and the order of $(DATA) is the order
# in which files are passed to source/zipf_test.py.
SRCS := $(sort $(wildcard $(SRCDIR)/*.txt))

# all intermediate data files (one .dat per source text)
DATA := $(patsubst $(SRCDIR)/%.txt,$(TMPDIR)/%.dat,$(SRCS))

# all images (one .png per source text)
IMAGES := $(patsubst $(SRCDIR)/%.txt,$(RESDIR)/%.png,$(SRCS))
|
||
# Default goal: build every intermediate data file, every image and the
# summary table.
all: $(DATA) $(IMAGES) $(RESDIR)/results.txt

# Intermediate data for one source text.
# The script itself is a prerequisite, so editing it re-runs the step.
# The output directory is an order-only prerequisite (after `|`): it must
# exist, but its timestamp never triggers a rebuild.
$(TMPDIR)/%.dat: $(SRCDIR)/%.txt source/wordcount.py | $(TMPDIR)
	python source/wordcount.py $< $@

# Image for one intermediate data file.
$(RESDIR)/%.png: $(TMPDIR)/%.dat source/plotcount.py | $(RESDIR)
	python source/plotcount.py $< $@

# Summary over all intermediate data files.
# The recipe writes through a shell redirect, so a failing script would leave
# a truncated file behind; .DELETE_ON_ERROR (below) removes it in that case.
$(RESDIR)/results.txt: $(DATA) source/zipf_test.py | $(RESDIR)
	python source/zipf_test.py $(DATA) > $@

# Create the output directories on demand.
# $(sort ...) de-duplicates in case both variables name the same directory.
$(sort $(TMPDIR) $(RESDIR)):
	mkdir -p $@

# `directories` was already declared phony; it now creates the output
# directories so that `make directories` does something useful.
directories: $(TMPDIR) $(RESDIR)

# Remove everything (non-hidden) inside the intermediate and results
# directories; the directories themselves are kept.
clean:
	@$(RM) $(TMPDIR)/*
	@$(RM) $(RESDIR)/*

# Targets that are commands rather than files.
.PHONY: all clean directories

# Delete a target whose recipe failed, so that a partially written file is
# never mistaken for an up-to-date one.
.DELETE_ON_ERROR:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/coderefinery/word-count/HEAD) | ||
|
||
# Word count example | ||
|
||
These programs will count words in a given text, plot a bar chart of the 10 | ||
most common words, and test [Zipf's | ||
law](https://en.wikipedia.org/wiki/Zipf%27s_law) on the two most common words. | ||
|
||
- Inspired by and derived from https://hpc-carpentry.github.io/hpc-python/ | ||
which is distributed under | ||
[Creative Commons Attribution license (CC-BY 4.0)](https://creativecommons.org/licenses/by/4.0/). | ||
- Documentation: https://word-count.readthedocs.io | ||
|
||
We use this example in two [CodeRefinery](https://coderefinery.org/) lessons: | ||
- https://coderefinery.github.io/reproducible-research/ | ||
- https://coderefinery.github.io/documentation/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Names of all the books we are analyzing: one entry for every file that
# matches data/<book>.txt.  The pattern has a single wildcard, so the result
# unpacks into exactly one list.
DATA, = glob_wildcards("data/{book}.txt")

# Relevant when running on HPC resources: these two rules are declared as
# local rules.
localrules: all, make_archive
|
||
# Default rule: its only input is the final results archive.
rule all:
    input: "zipf_analysis.tar.gz"
|
||
# Count the words in one of our books.
# stdout and stderr of each run are appended to a per-book .log file.
rule count_words:
    input:
        wc="source/wordcount.py",
        book="data/{file}.txt"
    output:
        "processed_data/{file}.dat"
    log:
        "processed_data/{file}.log"
    threads: 4
    shell:
        "python {input.wc} {input.book} {output} >> {log} 2>&1"
|
||
# Create the plot for one book from its processed .dat file.
rule make_plot:
    input:
        plotcount="source/plotcount.py",
        book="processed_data/{file}.dat"
    output:
        "results/{file}.png"
    shell:
        "python {input.plotcount} {input.book} {output}"
|
||
# Generate the summary table from the .dat files of all books in DATA;
# the script's stdout is redirected into the results file.
rule zipf_test:
    input:
        zipf="source/zipf_test.py",
        books=expand("processed_data/{book}.dat", book=DATA)
    output:
        "results/results.txt"
    shell:
        "python {input.zipf} {input.books} > {output}"
|
||
# Bundle all of our results into one gzipped tar archive: every plot,
# every processed .dat file and the summary table.
rule make_archive:
    input:
        expand("results/{book}.png", book=DATA),
        expand("processed_data/{book}.dat", book=DATA),
        "results/results.txt"
    output:
        "zipf_analysis.tar.gz"
    shell:
        "tar -czvf {output} {input}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../environment.yml |
Oops, something went wrong.