astropgh · hsnee · Oct 2, 2018 · Oct 2, 2018 · Oct 2, 2018 · Oct 3, 2018
diff --git a/task-01/completed.md b/task-01/completed.md
@@ -1,2 +1,2 @@
 ## Those who have completed this task:
-
+hsnee
diff --git a/task-03/get_top_names.py b/task-03/get_top_names.py
@@ -9,12 +9,24 @@ def extract_data_lines(filename, start_text, end_text):
     open `filename`, and yield the lines between
     the line that contains `start_text` and the line that contains `end_text`
     """
-    # fill in code as needed
+    turn_on = False
     with open(filename) as fh:
-        for line in fh:
-            # fill in code as needed
-            # use `yield line` to return desired lines but keep the function going
+        for i,line in enumerate(fh):
+            if turn_on=='done': break
 
+            if end_text in line:
+                if include_end:
+                    turn_on = 'done'
+                    yield line
+                break
+
+            if turn_on: yield line
+
+            if start_text in line:
+                if include_start:
+                    turn_on = True
+                    yield line
+                turn_on = True
 
 if __name__ == '__main__':
     filename = 'top5names.html'

diff --git a/task-04/README.md b/task-04/README.md
@@ -23,7 +23,7 @@ year, ranking, and gender altogether. A possible way to encode the information i
 ```python
 year = 2017 - index // 10
 rank = index % 5 + 1
-gender = 'female' if index % 10 < 5 else 'male`
+gender = 'female' if index % 10 < 5 else 'male'
 ```
 While this data model preserves all the information, it is unlikely that this
 model will be very convenient when it comes to data exploration.

diff --git a/task-07/README.md b/task-07/README.md
@@ -0,0 +1,33 @@
+# Task 7: Data scraping
+
+*prerequisites*: [Task 3](../task-03), [Task 4](../task-04)
+
+Finally, we will now actually do the data scraping!
+
+The data model we will use for this task is
+a table with 4 columns: year, gender, rank, name.
+Each row in this table corresponds to one cell that contains one name
+in the original table on the website.
+
+Each year will result in 10 rows. The first two rows of this table would look like:
+
+| year | gender | rank | name |
+| ---- |--------| -----| ---- |
+| 2017 | female | 1 | Emma |
+| 2017 | female | 2 | Olivia |
+
+
+## Steps
+
+*(Do Steps 1-3 in `task-07/get_top_names.py`)*
+
+1. Copy the answer from Task 3 to complete the function `extract_data_lines`
+2. Complete the data scraping: add a for loop that iterates over `data_lines` and
+   appends a tuple to `records` for each name (each tuple corresponds to one row in the new table).
+3. Use the table we constructed (stored as a pandas data frame) to answer the questions in Task 4.
+4. Submit a pull request for your solution.
+
+## Food for thought:
+- Do you think this is a good data model? Why or why not?
+- What assumptions did you make when you implemented Step 2?
+  How likely is your scraping method to fail if the underlying webpage source changes?
diff --git a/task-07/get_top_names.py b/task-07/get_top_names.py
@@ -0,0 +1,76 @@
+"""
+get_top_names.py
+For astropgh/learning-by-doing: Task 7
+https://github.com/astropgh/learning-by-doing/tree/master/task-07
+"""
+
+import pandas as pd
+import re
+from itertools import groupby
+
+def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False):
+    """
+    Open `filename` and yield the lines between the first line that contains
+    `start_text` and the next later line that contains `end_text`.
+
+    Parameters
+    ----------
+    filename : path of the text file to read.
+    start_text : substring that marks the line where the region begins.
+    end_text : substring that marks the line where the region ends.
+    include_start : if True, the line containing `start_text` is yielded too.
+    include_end : if True, the line containing `end_text` is yielded too.
+
+    Yields
+    ------
+    Each line of the region, unmodified (trailing newline included), exactly
+    once and in file order.
+
+    Notes
+    -----
+    * `end_text` is only looked for on lines *after* the start line, so an
+      occurrence of `end_text` earlier in the file does not stop the scan.
+    * Later lines that happen to contain `start_text` again are ordinary
+      region lines and are yielded once.
+    * If `start_text` never occurs, nothing is yielded; if `end_text` never
+      occurs after the start, the region runs to the end of the file.
+    """
+    started = False
+    with open(filename) as fh:
+        for line in fh:
+            if not started:
+                # Still searching for the start marker: skip everything else.
+                if start_text in line:
+                    started = True
+                    if include_start:
+                        yield line
+                continue
+
+            if end_text in line:
+                if include_end:
+                    yield line
+                # The region is closed; stop reading the file.
+                return
+
+            yield line
+
+
+def _parse_records(data_lines):
+    """
+    Convert the extracted HTML table lines into a list of
+    (year, gender, rank, name) tuples.
+
+    Each line is split on the table markup; what remains are the cell
+    contents. An all-digit cell starts a new year; every other cell is a
+    name. The position of a name within its year decides gender and rank
+    (first five cells: female ranks 1-5, following cells: male ranks 1-5),
+    so the result does not depend on how the names are wrapped across lines.
+    Names that appear before any year cell are ignored.
+    """
+    records = []
+    year = None
+    position = 0  # index of the next name cell within the current year
+    for line in data_lines:
+        cells = re.split(' |align="center"|<td|>|<td>|<tr>|</td>|</tr>|\n', line)
+        for cell in filter(None, cells):
+            if cell.isdigit():
+                year = int(cell)
+                position = 0
+            elif year is not None:
+                gender = 'female' if position < 5 else 'male'
+                records.append((year, gender, position % 5 + 1, cell))
+                position += 1
+    return records
+
+
+def _run_lengths(names):
+    """Return [(name, length), ...] for each run of identical adjacent names."""
+    return [(name, sum(1 for _ in run)) for name, run in groupby(names)]
+
+
+if __name__ == '__main__':
+    filename = '../task-03/top5names.html'
+    start_text = '<tr><td align="center">2017</td>'
+    end_text = '</table></center></div><!-- end #content -->'
+
+    data_lines = extract_data_lines(filename, start_text, end_text, include_start=True)
+    records = _parse_records(data_lines)
+    data = pd.DataFrame.from_records(records, columns=['year', 'gender', 'rank', 'name'])
+
+    # Rows holding the most chosen name of each year/gender, and per-gender views.
+    top_ranked = data[data['rank'] == 1]
+    male = data[data['gender'] == 'male']
+    female = data[data['gender'] == 'female']
+
+    # Question 1: Which years is Emma the most chosen name?
+    print(top_ranked[top_ranked['name'] == 'Emma']['year'].tolist())
+
+    # Question 2: Which name has been the most chosen name for the longest consecutive years?
+    for gender in ['male', 'female']:
+        # Sort by year so that adjacent entries are consecutive years.
+        yearly_top = top_ranked[top_ranked['gender'] == gender].sort_values('year')['name'].tolist()
+        runs = _run_lengths(yearly_top)
+        if runs:
+            longest_name, _ = max(runs, key=lambda run: run[1])
+            print(gender + ' name with the longest run of consecutive years at rank 1 is', longest_name)
+
+    # Question 3: How many unique male names have been in the top 5 between 1980 and 2000?
+    # Every row of `data` is already a top-5 entry, so no rank filter is needed.
+    print(male[male['year'].between(1980, 2000)]['name'].nunique())
+
+    # Question 4: Are there more unique male names or more unique female names in the top 5?
+    # Prints True if there are more unique male names.
+    print(male['name'].nunique() > female['name'].nunique())
+
+    # Question 5: What is the distribution of the numbers of consecutive years
+    # that a male name remains the most chosen name?
+    male_top = top_ranked[top_ranked['gender'] == 'male'].sort_values('year')['name'].tolist()
+    streaks = [length for _, length in _run_lengths(male_top)]
+    # Maps streak length (in years) -> number of streaks of that length.
+    print(pd.Series(streaks, dtype=int).value_counts().sort_index().to_dict())