Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
KiralyAgi committed May 18, 2021
0 parents commit a965358
Show file tree
Hide file tree
Showing 27 changed files with 38,180 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# backup files whose names end in '~' (as left behind by some editors)
*~
# Python bytecode cache directories
__pycache__/
# NOTE(review): presumably the Sphinx documentation build output - confirm
_build/
# NOTE(review): presumably a local Python virtual environment - confirm
venv/
# generated results: the Makefile writes one .png per book and results.txt here
results/*.png
results/*.txt
# generated intermediate data: one .dat file per book
processed_data/*.dat
13 changes: 13 additions & 0 deletions .readthedocs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the doc/ directory with Sphinx
sphinx:
configuration: doc/conf.py

# Optionally build your docs in additional formats such as PDF and ePub
formats: all
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019, CodeRefinery

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
34 changes: 34 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# directory containing source data (one .txt file per book)
SRCDIR := data

# directory containing intermediate data (one .dat file per book)
TMPDIR := processed_data

# results directory (one .png per book plus results.txt)
RESDIR := results

# all source files (book texts)
# ':=' expands the glob once at parse time instead of at every reference;
# $(sort ...) fixes the order, because $(wildcard) output is not guaranteed to
# be sorted in every GNU Make version and this list ends up, via DATA, on the
# command line that produces results.txt
SRCS := $(sort $(wildcard $(SRCDIR)/*.txt))

# all intermediate data files: $(SRCDIR)/<book>.txt -> $(TMPDIR)/<book>.dat
DATA := $(patsubst $(SRCDIR)/%.txt,$(TMPDIR)/%.dat,$(SRCS))

# all images: $(SRCDIR)/<book>.txt -> $(RESDIR)/<book>.png
IMAGES := $(patsubst $(SRCDIR)/%.txt,$(RESDIR)/%.png,$(SRCS))

# default goal: build every intermediate data file, every image and the
# summary table
all: $(DATA) $(IMAGES) $(RESDIR)/results.txt

# delete a target whose recipe failed, so that a partially written file (for
# example a results.txt already created by the '>' redirection when python
# exits with an error) is not treated as up to date on the next run
.DELETE_ON_ERROR:

# create an output directory on demand; the rules below list these only as
# order-only prerequisites (after '|'), so the directory must exist but its
# changing timestamp never forces a rebuild
$(TMPDIR) $(RESDIR):
	mkdir -p $@

# one .dat per book, produced by wordcount.py; listing the script as a
# prerequisite re-runs the step whenever the script changes
$(TMPDIR)/%.dat: $(SRCDIR)/%.txt source/wordcount.py | $(TMPDIR)
	python source/wordcount.py $< $@

# one .png per book, produced by plotcount.py from the matching .dat file
$(RESDIR)/%.png: $(TMPDIR)/%.dat source/plotcount.py | $(RESDIR)
	python source/plotcount.py $< $@

# summary table: zipf_test.py receives all .dat files and its standard output
# is captured as results.txt
$(RESDIR)/results.txt: $(DATA) source/zipf_test.py | $(RESDIR)
	python source/zipf_test.py $(DATA) > $@

# remove the non-hidden files in the intermediate-data and results
# directories ($(RM) is 'rm -f', so missing files are not an error)
clean:
	@$(RM) $(TMPDIR)/*
	@$(RM) $(RESDIR)/*

# create the output directories without building anything; this target was
# already declared phony below but had no rule
directories:
	mkdir -p $(TMPDIR) $(RESDIR)

# these targets are commands, not files: declaring them phony keeps a stray
# file named 'all', 'clean' or 'directories' from silently disabling them
.PHONY: all clean directories
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/coderefinery/word-count/HEAD)

# Word count example

These programs will count words in a given text, plot a bar chart of the 10
most common words, and test [Zipf's
law](https://en.wikipedia.org/wiki/Zipf%27s_law) on the two most common words.

- Inspired by and derived from https://hpc-carpentry.github.io/hpc-python/
which is distributed under
[Creative Commons Attribution license (CC-BY 4.0)](https://creativecommons.org/licenses/by/4.0/).
- Documentation: https://word-count.readthedocs.io

We use this example in two [CodeRefinery](https://coderefinery.org/) lessons:
- https://coderefinery.github.io/reproducible-research/
- https://coderefinery.github.io/documentation/
49 changes: 49 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# names of all the books we are analyzing, collected from data/<book>.txt
DATA = glob_wildcards('data/{book}.txt').book

# this is for running on HPC resources:
# the rules named here are declared as local rules
localrules:
    all,
    make_archive

# the default rule: ask for the final archive
rule all:
    input: 'zipf_analysis.tar.gz'

# count words in one of our books
# stdout and stderr of each run are appended to a per-book .log file
rule count_words:
    input:
        wc='source/wordcount.py',
        book='data/{file}.txt'
    output:
        'processed_data/{file}.dat'
    log:
        'processed_data/{file}.log'
    threads: 4
    shell:
        '''
        python {input.wc} {input.book} {output} >> {log} 2>&1
        '''

# create a plot for each book from its processed .dat file
rule make_plot:
    input:
        plotcount='source/plotcount.py',
        book='processed_data/{file}.dat'
    output:
        'results/{file}.png'
    shell:
        'python {input.plotcount} {input.book} {output}'

# generate summary table from the .dat files of all books
rule zipf_test:
    input:
        zipf='source/zipf_test.py',
        books=expand('processed_data/{book}.dat', book=DATA)
    output:
        'results/results.txt'
    shell:
        'python {input.zipf} {input.books} > {output}'

# create an archive with all of our results:
# every plot, every .dat file and the summary table
rule make_archive:
    input:
        expand('results/{book}.png', book=DATA),
        expand('processed_data/{book}.dat', book=DATA),
        'results/results.txt'
    output:
        'zipf_analysis.tar.gz'
    shell:
        'tar -czvf {output} {input}'
1 change: 1 addition & 0 deletions binder/environment.yml
Loading

0 comments on commit a965358

Please sign in to comment.