From 8e89677822879ae7c7b16dbf148afa17ca086db6 Mon Sep 17 00:00:00 2001 From: Husni Almoubayyed Date: Tue, 2 Oct 2018 16:06:29 -0400 Subject: [PATCH 01/11] task-01 completed --- task-01/completed.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task-01/completed.md b/task-01/completed.md index bca9187..baec4d1 100644 --- a/task-01/completed.md +++ b/task-01/completed.md @@ -1,2 +1,2 @@ ## Those who have completed this task: - +hsnee From e0d2f4caf2cf2fe2647470d669a325546de10020 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Tue, 2 Oct 2018 14:10:25 -0400 Subject: [PATCH 02/11] bug fix --- task-03/get_top_names.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py index b77911c..4535204 100644 --- a/task-03/get_top_names.py +++ b/task-03/get_top_names.py @@ -16,7 +16,7 @@ def extract_data_lines(filename, start_text, end_text): # use `yield line` to return desired lines but keep the function going -if name == '__main__': +if __name__ == '__main__': filename = 'top5names.html' start_text = '2017' end_text = '' From 8a160151249a8cccea5f1d6ae8fcd7648ed7f822 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Tue, 2 Oct 2018 19:35:40 -0400 Subject: [PATCH 03/11] add task 4 --- task-04/README.md | 68 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 task-04/README.md diff --git a/task-04/README.md b/task-04/README.md new file mode 100644 index 0000000..1336400 --- /dev/null +++ b/task-04/README.md @@ -0,0 +1,68 @@ +# Task 4: [database] Preparing for data scraping: design a data model for top baby names + +## Background + +Before we start to scrape the top baby names from the webpage, we need to design +a data model that we will use to store the data. + +The term "data model" has different meanings in different contexts. +We can ask what kind of object the data will be stored in. +A python list? A python dictionary? A pandas data frame? 
+For a given type, we can further ask how the data is stored. +For example, if we store the data in a pandas data frame, we can ask what +are the columns and rows. + +Let's look at some examples. +The original webpage stores the names as a table, with columns being +`year`, `female_rank1`, `female_rank2`, `male_rank1`, `male_rank2`..., and +each row corresponds to one single year. + +A more extreme example would be storing the names as a sequence (say a python list), +the content of the sequence will be the names, while the indices of the sequence encode +year, ranking, and gender altogether. A possible way to encode the information is +```python +year = 2017 - index // 10 +rank = index % 5 + 1 +gender = 'female' if index % 10 < 5 else 'male' +``` +While this data model preserves all the information, it is unlikely that this +model will be very convenient when it comes to data exploration. + +Yet another totally different data model is to group the data by names. +Let's say we'll store the data in a python dictionary. A possible way is: +```python +{ + 'Emma':{ + 'gender': 'female', + 'years_ranked_1': [2017, 2016, 2015, 2014, ...], + 'years_ranked_2': [2013, 2012, 2009, ...], + 'years_ranked_3': [...], + }, + 'Noah':{ + ..., + }, + ..., +} +``` + +Note that the form (object) in which the data is stored and how the data is structured +are two different things. (*Food for thought: why? can you give an example?*) + +Clearly, the choice of data model heavily depends on the questions that we would +like to answer with the data. +If the amount of data is very large, we will also need to consider the available +computing resources like memory usage and I/O speed when designing the data model. +For now, we don't yet need to worry about the limitation due to computing resources. + + +## Task + +Try to come up with a data model that is good for answering each of the following questions. 
+Think about the code you'll need to write to interact with the data model to answer +these questions. + +1. In which years was Emma the most chosen name? +2. Which name has been the most chosen name for the longest run of consecutive years? +3. How many unique male names have been in the top 5 between the years 1980 and 2000? +4. Are there more unique male names or more unique female names that are in the top 5? +5. What is the distribution of the numbers of consecutive years that a male name remains the most chosen name? From 458086869b9d18d5cd9df2d8a164c58bcd0a1e65 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Wed, 3 Oct 2018 14:07:26 -0400 Subject: [PATCH 04/11] add task-01 solution --- task-01/solution.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 task-01/solution.md diff --git a/task-01/solution.md b/task-01/solution.md new file mode 100644 index 0000000..a194ab8 --- /dev/null +++ b/task-01/solution.md @@ -0,0 +1,43 @@ +# Solution to Task 1: [git] Fork a repo and submit a pull request + +## Steps + +1. Fork the `astropgh/learning-by-doing` repository + +> Click the "fork" button in the upper right corner on GitHub. + +2. Clone your fork + +```bash +git clone git@github.com:yourusername/learning-by-doing.git +``` + +3. Checkout a new branch called `task/01` + +```bash +cd learning-by-doing +git checkout -b task/01 +``` + +4. Add your GitHub username to `task-01/completed.md` + +```bash +echo "yourusername" >> task-01/completed.md +``` + +5. Commit your change to `task/01` + +```bash +git add task-01/completed.md +git commit -m "add my username to complete task 01" +``` + +6. Push `task/01` to your fork + +```bash +git push origin task/01 +``` + +7. 
Submit a pull request + +> Click "Create pull request" button on GitHub From b033c2b5288f1c32fb569ee813d3bbb7e2879e79 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Wed, 3 Oct 2018 22:39:39 -0400 Subject: [PATCH 05/11] fix typo --- task-02/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/task-02/README.md b/task-02/README.md index 4b418ae..ab38dd1 100644 --- a/task-02/README.md +++ b/task-02/README.md @@ -16,8 +16,7 @@ - https://help.github.com/articles/configuring-a-remote-for-a-fork/ - https://help.github.com/articles/syncing-a-fork/ -## Food for thoughts +## Food for thought - What's the difference between a fork and a clone? - What's the difference between `origin` and `upstream` in this case? - What's the benefit to work on new branches like `task/01` and `task/02`, rather than on `master` directly? - From 633554a795eb51162cf65cd1172c45e0289be15a Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Wed, 3 Oct 2018 22:42:52 -0400 Subject: [PATCH 06/11] add task 5 --- task-05/README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 task-05/README.md diff --git a/task-05/README.md b/task-05/README.md new file mode 100644 index 0000000..8c91522 --- /dev/null +++ b/task-05/README.md @@ -0,0 +1,27 @@ +# Task 5: [git] Merge and rebase + +*prerequisites*: [Task 1](../task-01), [Task 2](../task-02) + +We will now learn two basic operations of git branches: merge and rebase. +As always, you can find lots of information about this on the Internet, +and here we will go ahead to learn by trying them out. + +## Part 1 +Complete Level 1 through 4 on https://learngitbranching.js.org/ + +## Part 2 +1. Go back to your clone of `learning-by-doing`. Make sure you've completed Tasks [1](../task-01) and [2](../task-02). +2. Do **only** Step 2 of [Task 2](../task-02) again. +3. Now the `master` branch and your `task/01` branch have diverged, and you will rebase `task/01` onto `master`. +4. 
Go to see your PR at https://github.com/astropgh/learning-by-doing/pulls, does it somehow change? Why? + +## Part 3 +*Note: Do Part 2 first!* + +1. Checkout a new branch called `task/05` from `master` (*What does this mean?*) +2. Add a new file `task-05/test` and commit it to `task/05` +3. Merge `task/05` into `task/01` +4. Go to see your PR at https://github.com/astropgh/learning-by-doing/pulls, does it somehow change? Why? + +## Food for thought +- What's the difference between "rebase" and "merge"? From fc6cc2ad5db2982c198cb6f3286d7ef315c8f2a1 Mon Sep 17 00:00:00 2001 From: Yao-Yuan Mao Date: Wed, 3 Oct 2018 22:43:00 -0400 Subject: [PATCH 07/11] add task 6 --- task-06/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 task-06/README.md diff --git a/task-06/README.md b/task-06/README.md new file mode 100644 index 0000000..d86075f --- /dev/null +++ b/task-06/README.md @@ -0,0 +1,9 @@ +# Task 6: [database] Basic SQL + +Complete "Basic SQL" Lessons 1 through 6 on https://community.modeanalytics.com/sql/ + +## Extension +Complete "Basic SQL" Lessons 7 through 15 on https://community.modeanalytics.com/sql/ + +## Food for thought +- After learning the basic SQL operation, would you change your answers to [Task 4](../task-04)? 
From 36658e5d0eafbd28d2d72d37739e89fcfcf294a8 Mon Sep 17 00:00:00 2001 From: Husni Almoubayyed Date: Sun, 7 Oct 2018 15:16:58 -0400 Subject: [PATCH 08/11] completing get_top_names generator --- task-03/get_top_names.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py index 4535204..f7c4c2c 100644 --- a/task-03/get_top_names.py +++ b/task-03/get_top_names.py @@ -9,12 +9,24 @@ def extract_data_lines(filename, start_text, end_text): open `filename`, and yield the lines between the line that contains `start_text` and the line that contains `end_text` """ - # fill in code as needed + turn_on = False with open(filename) as fh: - for line in fh: - # fill in code as needed - # use `yield line` to return desired lines but keep the function going + for i,line in enumerate(fh): + if turn_on=='done': break + if end_text in line: + if include_end: + turn_on = 'done' + yield line + break + + if turn_on: yield line + + if start_text in line: + if include_start: + turn_on = True + yield line + turn_on = True if __name__ == '__main__': filename = 'top5names.html' From 36d4b885a8918b2ebc11c3a240daa3f53675a263 Mon Sep 17 00:00:00 2001 From: Husni Almoubayyed Date: Tue, 9 Oct 2018 17:12:37 -0400 Subject: [PATCH 09/11] didn't need to enumerate --- task-03/get_top_names.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py index f7c4c2c..54aaf9a 100644 --- a/task-03/get_top_names.py +++ b/task-03/get_top_names.py @@ -11,21 +11,17 @@ def extract_data_lines(filename, start_text, end_text): """ turn_on = False with open(filename) as fh: - for i,line in enumerate(fh): + for line in fh: if turn_on=='done': break if end_text in line: - if include_end: - turn_on = 'done' - yield line + if include_end: turn_on = 'done'; yield line break if turn_on: yield line if start_text in line: - if include_start: - turn_on = True - yield 
line + if include_start: turn_on = True; yield line turn_on = True if __name__ == '__main__': From 679ee5adea6058892f8bf86a6c133fcc9fec7f9f Mon Sep 17 00:00:00 2001 From: Husni Almoubayyed Date: Tue, 9 Oct 2018 17:13:54 -0400 Subject: [PATCH 10/11] slightly shorter --- task-03/get_top_names.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py index 54aaf9a..dc59efb 100644 --- a/task-03/get_top_names.py +++ b/task-03/get_top_names.py @@ -21,8 +21,9 @@ def extract_data_lines(filename, start_text, end_text): if turn_on: yield line if start_text in line: - if include_start: turn_on = True; yield line turn_on = True + if include_start: yield line + if __name__ == '__main__': filename = 'top5names.html' From 2f7a0229d589b261e606b6410e96bd0fbd4b2d59 Mon Sep 17 00:00:00 2001 From: Husni Almoubayyed Date: Tue, 9 Oct 2018 17:15:48 -0400 Subject: [PATCH 11/11] slightly shorterer --- task-03/get_top_names.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py index dc59efb..85d915a 100644 --- a/task-03/get_top_names.py +++ b/task-03/get_top_names.py @@ -15,8 +15,8 @@ def extract_data_lines(filename, start_text, end_text): if turn_on=='done': break if end_text in line: - if include_end: turn_on = 'done'; yield line - break + turn_on = 'done' + if include_end: yield line if turn_on: yield line