diff --git a/README.md b/README.md index 5a7d2c1..a8f1660 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,56 @@ -# learning-by-doing - -a set of tasks for people to learn basic software and data science skills by completing the tasks - - -## License - -Except as otherwise noted, the tutorial content of this `astropgh/learning-by-doing` is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-sa/4.0/) [![CC BY-SA 4.0](https://i.creativecommons.org/l/by-sa/4.0/80x15.png)](http://creativecommons.org/licenses/by-sa/4.0/), and the code samples are licensed under the [MIT License](https://opensource.org/licenses/MIT). +# Task 4: [database] Preparing for data scraping: design a data model for top baby names +## Background +Before we start to scrape the top baby names from the webpage, we need to design +a data model that we will use to store the data. +The term "data model" has different meanings in different contexts. +We can ask what kind of object the data will be stored in. +A python list? A python dictionary? A pandas data frame? +For a given type, we can further ask how the data is stored. +For example, if we store the data in a pandas data frame, we can ask what +are the columns and rows. +Let's look at some examples. +The original webpage stores the names as a table, with columns being +`year`, `female_rank1`, `female_rank2`, `male_rank1`, `male_rank2`..., and +each row corresponds to one single year. +A more extreme example would be storing the names as a sequence (say a python list), +the content of the sequence will be the names, while the indices of the sequence encode +year, ranking, and gender altogether. 
A possible way to encode the information is +```python +year = 2017 - index // 10 +rank = index % 5 + 1 +gender = 'female' if index % 10 < 5 else 'male' +``` +While this data model preserves all the information, it is unlikely that this +model will be very convenient when it comes to data exploration. +Yet another totally different data model is to group the data by names. +Let's say we'll store the data in a python dictionary. A possible way is: +```python +{ + 'Emma':{ + 'gender': 'female', + 'years_ranked_1': [2017, 2016, 2015, 2014, ...], + 'years_ranked_2': [2013, 2012, 2009, ...], + 'years_ranked_3': [...], + }, + 'Noah':{ + ..., + }, + ..., +} +``` +Note that the form (object) in which the data is stored and how the data is structured +are two different things. (*Food for thought: why? Can you give an example?*) +Clearly, the choice of data model heavily depends on the questions that we would +like to answer with the data. +If the amount of data is very large, we will also need to consider the available +computing resources like memory usage and I/O speed when designing the data model. +For now, we don't yet need to worry about limitations due to computing resources. +## Task +Try to come up with a data model that is good for answering each of the following questions. +Think about the code you'll need to write to interact with the data model to answer +these questions. +1. In which years was Emma the most chosen name? +2. Which name has been the most chosen name for the most consecutive years? +3. How many unique male names have been in the top 5 between years 1980 and 2000? +4. Are there more unique male names or more unique female names in the top 5? +5. What is the distribution of the numbers of consecutive years that a male name remains the most chosen name? 
diff --git a/data_structure.ipynb b/data_structure.ipynb new file mode 100755 index 0000000..32792f2 --- /dev/null +++ b/data_structure.ipynb @@ -0,0 +1,1237 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:b99c50f5345e8fd1ce020369618bbc3cabda5bdfdff85e2a1cce6f7592d1a2cb" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#\"\"\"\n", + "#get_top_names.py\n", + "#For astrophg/learning-by-doing: Task 3\n", + "#https://github.com/astropgh/learning-by-doing/tree/master/task-03\n", + "#\"\"\"\n", + "\n", + "def extract_data_lines(filename, start_text, end_text, include_start = False, include_end = False):\n", + " \"\"\"\n", + " open `filename`, and yield the lines between\n", + " the line that contains `start_text` and the line that contains `end_text`\n", + " \"\"\"\n", + "\n", + " # Needed to record the text in between\n", + " parsing = False\n", + " \n", + " # use `yield line` to return desired lines but keep the function going\n", + " with open(filename) as fh:\n", + " \n", + " for line in fh:\n", + " \n", + " ######################################################################\n", + " \n", + " if start_text in line:\n", + " \n", + " parsing = True\n", + " \n", + " if not include_start:\n", + " \n", + " continue\n", + " \n", + " ###################################################################### \n", + " \n", + " elif end_text in line:\n", + " \n", + " if include_end:\n", + " \n", + " #parsing = True\n", + " yield line\n", + " break\n", + " \n", + " else:\n", + " \n", + " parsing = False\n", + " \n", + " ######################################################################\n", + " \n", + " \n", + " if parsing: # Do stuff 
with the data\n", + " \n", + " yield line" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "Years = []\n", + "Names = []\n", + "\n", + "if __name__ == '__main__':\n", + " filename = 'top5names.html'\n", + " start_text = '2017'\n", + " end_text = ''\n", + " \n", + "\n", + " for line in extract_data_lines(filename, start_text, end_text, include_start = True, include_end = False):\n", + " \n", + " if '\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
1918191919201921192219231924192519261927...2008200920102011201220132014201520162017
F1MaryMaryMaryMaryMaryMaryMaryMaryMaryMary...EmmaIsabellaIsabellaSophiaSophiaSophiaEmmaEmmaEmmaEmma
F2HelenHelenDorothyDorothyDorothyDorothyDorothyDorothyDorothyDorothy...IsabellaEmmaSophiaIsabellaEmmaEmmaOliviaOliviaOliviaOlivia
F3DorothyDorothyHelenHelenHelenHelenHelenBettyBettyBetty...EmilyOliviaEmmaEmmaIsabellaOliviaSophiaSophiaAvaAva
F4MargaretMargaretMargaretMargaretMargaretMargaretBettyHelenHelenHelen...OliviaSophiaOliviaOliviaOliviaIsabellaIsabellaAvaSophiaIsabella
F5RuthRuthRuthRuthRuthBettyMargaretMargaretMargaretMargaret...AvaAvaAvaAvaAvaAvaAvaIsabellaIsabellaSophia
M1JohnJohnJohnJohnJohnJohnRobertRobertRobertRobert...JacobJacobJacobJacobJacobNoahNoahNoahNoahLiam
M2WilliamWilliamWilliamRobertRobertRobertJohnJohnJohnJohn...MichaelEthanEthanMasonMasonJacobLiamLiamLiamNoah
M3JamesJamesRobertWilliamWilliamWilliamWilliamWilliamJamesJames...EthanMichaelMichaelWilliamEthanLiamMasonMasonWilliamWilliam
M4RobertRobertJamesJamesJamesJamesJamesJamesWilliamWilliam...JoshuaAlexanderJaydenJaydenNoahMasonJacobJacobMasonJames
M5CharlesCharlesCharlesCharlesCharlesCharlesCharlesCharlesCharlesCharles...DanielWilliamWilliamNoahWilliamWilliamWilliamWilliamJamesLogan
\n", + "

10 rows \u00d7 100 columns

\n", + "" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 9, + "text": [ + " 1918 1919 1920 1921 1922 1923 1924 \\\n", + "F1 Mary Mary Mary Mary Mary Mary Mary \n", + "F2 Helen Helen Dorothy Dorothy Dorothy Dorothy Dorothy \n", + "F3 Dorothy Dorothy Helen Helen Helen Helen Helen \n", + "F4 Margaret Margaret Margaret Margaret Margaret Margaret Betty \n", + "F5 Ruth Ruth Ruth Ruth Ruth Betty Margaret \n", + "M1 John John John John John John Robert \n", + "M2 William William William Robert Robert Robert John \n", + "M3 James James Robert William William William William \n", + "M4 Robert Robert James James James James James \n", + "M5 Charles Charles Charles Charles Charles Charles Charles \n", + "\n", + " 1925 1926 1927 ... 2008 2009 2010 \\\n", + "F1 Mary Mary Mary ... Emma Isabella Isabella \n", + "F2 Dorothy Dorothy Dorothy ... Isabella Emma Sophia \n", + "F3 Betty Betty Betty ... Emily Olivia Emma \n", + "F4 Helen Helen Helen ... Olivia Sophia Olivia \n", + "F5 Margaret Margaret Margaret ... Ava Ava Ava \n", + "M1 Robert Robert Robert ... Jacob Jacob Jacob \n", + "M2 John John John ... Michael Ethan Ethan \n", + "M3 William James James ... Ethan Michael Michael \n", + "M4 James William William ... Joshua Alexander Jayden \n", + "M5 Charles Charles Charles ... 
Daniel William William \n", + "\n", + " 2011 2012 2013 2014 2015 2016 2017 \n", + "F1 Sophia Sophia Sophia Emma Emma Emma Emma \n", + "F2 Isabella Emma Emma Olivia Olivia Olivia Olivia \n", + "F3 Emma Isabella Olivia Sophia Sophia Ava Ava \n", + "F4 Olivia Olivia Isabella Isabella Ava Sophia Isabella \n", + "F5 Ava Ava Ava Ava Isabella Isabella Sophia \n", + "M1 Jacob Jacob Noah Noah Noah Noah Liam \n", + "M2 Mason Mason Jacob Liam Liam Liam Noah \n", + "M3 William Ethan Liam Mason Mason William William \n", + "M4 Jayden Noah Mason Jacob Jacob Mason James \n", + "M5 Noah William William William William James Logan \n", + "\n", + "[10 rows x 100 columns]" + ] + } + ], + "prompt_number": 9 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "QUESTION 1: Which years Emma is the most chosen names?" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "dic_names['1918']" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 10, + "text": [ + "array(['Mary', 'Helen', 'Dorothy', 'Margaret', 'Ruth', 'John', 'William',\n", + " 'James', 'Robert', 'Charles'], \n", + " dtype='|S11')" + ] + } + ], + "prompt_number": 10 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df_names.get_value('F1','1918')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 11, + "text": [ + "'Mary'" + ] + } + ], + "prompt_number": 11 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "df_names.apply(lambda row: row.astype(str).str.contains('Emma').any(), axis=0)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 12, + "text": [ + "1918 False\n", + "1919 False\n", + "1920 False\n", + "1921 False\n", + "1922 False\n", + "1923 False\n", + "1924 False\n", + "1925 False\n", + "1926 
False\n", + "1927 False\n", + "1928 False\n", + "1929 False\n", + "1930 False\n", + "1931 False\n", + "1932 False\n", + "1933 False\n", + "1934 False\n", + "1935 False\n", + "1936 False\n", + "1937 False\n", + "1938 False\n", + "1939 False\n", + "1940 False\n", + "1941 False\n", + "1942 False\n", + "1943 False\n", + "1944 False\n", + "1945 False\n", + "1946 False\n", + "1947 False\n", + " ... \n", + "1988 False\n", + "1989 False\n", + "1990 False\n", + "1991 False\n", + "1992 False\n", + "1993 False\n", + "1994 False\n", + "1995 False\n", + "1996 False\n", + "1997 False\n", + "1998 False\n", + "1999 False\n", + "2000 False\n", + "2001 False\n", + "2002 True\n", + "2003 True\n", + "2004 True\n", + "2005 True\n", + "2006 True\n", + "2007 True\n", + "2008 True\n", + "2009 True\n", + "2010 True\n", + "2011 True\n", + "2012 True\n", + "2013 True\n", + "2014 True\n", + "2015 True\n", + "2016 True\n", + "2017 True\n", + "dtype: bool" + ] + } + ], + "prompt_number": 12 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print df_names[df_names=='Emma'].index" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "Index([u'F1', u'F2', u'F3', u'F4', u'F5', u'M1', u'M2', u'M3', u'M4', u'M5'], dtype='object')\n" + ] + } + ], + "prompt_number": 13 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ranking_Emma = []\n", + "year_Emma = []\n", + "\n", + "\n", + "#for row in range(df_names.shape[0]):\n", + "for row in list(df_names.index): # The column labels are strings, not integers\n", + " \n", + " #for col in range(df_names.shape[1]):\n", + " for col in list(df_names.columns.values): # The column labels are strings, not integers\n", + " \n", + " if df_names.get_value(row,col) == 'Emma':\n", + " \n", + " print(row, col)\n", + " #break\n", + " \n", + " ranking_Emma.append(int(row[-1]))\n", + " year_Emma.append(int(col))" + ], + "language": "python", + "metadata": {}, + 
"outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "('F1', '2008')\n", + "('F1', '2014')\n", + "('F1', '2015')\n", + "('F1', '2016')\n", + "('F1', '2017')\n", + "('F2', '2003')\n", + "('F2', '2004')\n", + "('F2', '2005')\n", + "('F2', '2006')\n", + "('F2', '2009')\n", + "('F2', '2012')\n", + "('F2', '2013')\n", + "('F3', '2007')\n", + "('F3', '2010')\n", + "('F3', '2011')\n", + "('F4', '2002')\n" + ] + } + ], + "prompt_number": 14 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fig, ax = plt.subplots()\n", + "\n", + "ax.hist2d(year_Emma, ranking_Emma);\n", + "ax.set_yticks([1, 2, 3, 4]);\n", + "ax.tick_params(axis='both', which='major', pad=10, labelsize=12)\n", + "\n", + "\n", + "ax.set_title(r'$\\rm{Years \\, Emma \\, most \\, popular \\, female \\, name}$' + '\\n', fontsize=18);\n", + "\n", + "ax.set_xlabel(r'$\\rm{Year}$', fontsize=18, labelpad=5);\n", + "ax.set_ylabel(r'$\\rm{Ranking}$', fontsize=18, labelpad=10);" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "display_data", + "png": 
"iVBORw0KGgoAAAANSUhEUgAAAYsAAAFFCAYAAAAdAsFPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHFpJREFUeJzt3Xm4LHV95/H3927IjgtCBOEI6CDogAhmXOEKjoiSGEki\nqKCOC2KuW+KMKCDgRQVHHCUqgqAYNIiOcYRoHCNwMQ64xAX0gjIOHDCIT1QQWS4i937nj6ojbdPn\n/Ho9Vfec9+t5+jmna/nVt6ur+tO1dFVkJpIkzWVJ0wVIktrPsJAkFRkWkqQiw0KSVGRYSJKKDAtJ\nUpFhIUkqMiwkSUWGhSSpaFnTBUiaXxGxJbAr8BDg+5l5a8MlzTvnweAW9ZZFRJwWEVdFxIb6cXFE\n7NI1zOqIuDci7o6IL0bEI5uqdzYRcWhEXBQR6+rX8YWI+Hj9uDAivhER90XE55uuVa3wGOC/AV8F\nHjcfE4yI7evl8YSI+H5E7Dcf053DvM+DjV14bSiIiH8Ang/sl5nf6dH/EuCYzLxu3osbQER8AnhJ\nZi7t0e9JwN9m5h/Pf2ULQ0S8Efh8Zt7YdC2jioglwH3AAZn5tXmY3uXA/wZOBT4NfHA+pluoaV7n\nwcZuUW9ZdHgdcCdwVr0A/V5EHA5c0PagqCUQPXtkfgv44fyWs+CspJrHG73M3DDPk3wqcEVmbsjM\nv2zDh3MD82CjZlgAmXkzcAKwD7BqpntEbAO8IDPPaaq2UUXEMR1PL6/31WoAEbFZRLwWeB6zhLGK\nlrBAgnax8gD3/f4WOBJYHRGfzcxbgHcAb+8eMCKWAscB2wK/AB4PnJSZa+v+OwJvAH4NbAo8Eji2\nbpOIeGE9/i5U31YPAV4GHJ6Z36w/mDbUj+2B52Tmkwd9QXWdhwFnAmTm39Xdn0m1O+A/AEcAj6Va\nFvYFvgacRxWaG4D9gK9l5hmjjtvPvJnjtYw03bqNVfX4NwM7AD/MzLM6+s8234+gep8CeG9E3An8\nIDPfN0e9hwNvo3qPX0G1X3wpsDfwj5n54X5r67etiDgEOL1u5+WZ+YmIeBbVsv0YYGVmXj5HzSMt\nt7O0+VTglfXTYyPiZcAZmfm9udajBpezOdftWcaZrdZ9gMs73+s+5vGor3vg+vuWmT7qR/3m3gd8\nhmqz+aRZhjsPOLfj+V7ALcDm9fO/B67u6P8W4KquNp5Qv+EnUa343wcOrad7Rtewl/ZZ/3l1mx8H\nPgn8CFg/y7AProf9ErB13W1nYD3Vh8uD6m5Tdbc/GtO4xXkzx+sbZbqnd75ndbePAu+u/59zvtfT\n2QDsNMDy9Nh6nNd1dNsGuAl4e7+1DdjWpvVwR/Xo9oyuafxBt1GW2z7mRa/pn0fv9WiL+VrOesyD\nOWsaYtn8HbDdgPN4lNc9VP19Lc+jNrDQHsD76zfqcmBFj/571/2f0NX9BuCI+v8jgVM7+s2s6Nt3\ndJuqux3S1c6fAD8G9uf+ExD+tM/azwM2dDxfSvXNY64VuPODZknd7aU9uj11HOP2M28Kr3Hg6QK7\n1yvVPl1tPaHuvltpvne8X4OERc9xgLcCvwUe2k9tfbb1sK55dFTXcP2ExdDLbZ/vW+e0iuvRfCxn\nnXX1W9MQy+ZThqxr0OV8pPpLD3dDPdAZwOupNu/v7dF/Zf33kIh4Rkf3K4B1AJl5fkQ8PCJeDezY\nMcyKHu1d3/X8S8DhwGXAXRGxBjhl4FdR1bE+Ir7e2S0ijsnMMzs6TXcMvyEioFq4urs94AyrYcYd\ncN7MZtDpPptqF9JPu9r5Wd39PwNnM6b53ocrgeXA06g+fEu1/aSPtp4CXDRKUSMut4Mqrkcdpjtq\nnORyNkhNs+lV67KOboPU1autuV73OOqflWHxQDNnSKwv9L8gM3uuMPWCcBywKjPPjoidgeNnae/u\nruebA0cBJ1J9yz0IuCQinpaZ3+/zNfxeZr6to65daPikhgHnz
bjMrEybU+3HnbFZ/Xc5A873iHh6\nZv7LkPXMfDCs77O2ftoa+cyeEZfbQRXXo1EMuZxNtKYR6urXROv3bKjB/XP9d+/OjvUZM/tGxA5U\nB5RXZ+bFde/lHcMdVmj/+VT7gP9vZp6TmYfX7R3UZ305R79DqI5jNGIM82ZYMwd1d+rqvmv99zLK\n833my8PM2VAr6V/3GVRPofqm93/6rK3ftmb8QXBExHbFAuf/vZlzPRql4RFey1cnVdOIdfVrovUb\nFg80szL2nDeZeQ31rqqu32S8meoMh63qNjo3+55HtV95M2CPrul0794J4E1Rb1/W7gG+3WftPU/t\njIjdqc6oubZ+/oDXOQ/d+p03vV/ckNPN6oeWZ3P/WTkzXgWcmZlXU57vP6N6f2c+xGfb8uzlRR21\nPRJ4DfC2zLytz9r6aqtjmB9RHQ+Z8WdUB1of3jHusO/NbMvtrDqm1bmbaK716PZZahzrctZjOVk7\nR02/7vM1jr2ufruNUn8//AV3h6h+Ab0P1Zt2K7AGeFdmfq/HsKuovtVNU+3Ouygzv173ezHVyn8Z\n1eb6VVT7zXen+mDYFPgb4IlUH95fycy/rsd9EfBoYEuq3RKbALdk5kfnqPvQenoHAQ8CLgZmPjw2\noTqT4knA3Zm5dX163gnAM+r6z6trXV13uwY4B1hLtck80+2sutvbBxz3WqoPvg+V5k1m9tzvHhEr\nR5lu3cYqqtNOb+f+awLN9CvO94h4HtUPOK8GvpiZa3rV2jH8FNW+/b+i+qD+HdV7/unM/GzXsLPW\nNkRb+1Lt2vgG1ZbmlcD/pFqmPwz8oGNeXgN8ODPPHGW5nWMe7A+cDDyd6n37DtVVBu7teN0PWI9G\nfb/7eC13dLS/FvhI3n8K8qzr9iyvsVTrsHUN/LqHqb9fhoU0IR0f8FOZeVNb2pKG4W4oaXLm3KXZ\nYFvSwFzwpAmI6tfOF1LtBrowIo5oQ1vSsNwNJUkqcstCklRkWEiSigwLSVKRYSFJKjIsJElFhoUk\nqciwkCQVGRaSpCLDQpJUZFhIkooMC0lSkWEhSSoyLCRJRYaFJKnIsJAkFRkWkqQiw0KSVNT6sIiI\nR0fEPRFxftO1SNJi1fqwAD4EfIvq/sOSpAa0Oiwi4nDgNuASIBouR5IWrdaGRURsBZwMvAmDQpIa\ntazpAuawGjgnM38WEQPvgjqBtzW+2+qUWNF0CZI0kMyTen45j8zGP1MfICL2Bj4JPCEzfxcRJwG7\nZuaRA7TRvhcmSS2XmT3Doq1bFvsDU8BNEQGwBbA0Ih6bmfv228jx+dbJVNenwbYs1gAHTKaQiVnD\nxlXzGjauesGa58MaNq56YXI1nzxrn7aGxdnABfX/AbyZKjxe01RBkrSYtTIsMnMdsG7meUTcCazL\nzF81V5UkLV6tDItumTn7ttGCMdV0AUOYarqAAU01XcAQppouYAhTTRcwoKmmCxjC1LxPsbWnzi4+\nU00XMISppgsY0FTTBQxhqukChjDVdAEDmmq6gCFMzfsUDQtJUpFhIUkqMiwkSUWGhSSpyLCQJBUZ\nFpKkIsNCklRkWEiSigwLSVKRYSFJKjIsJElFhoUkqciwkCQVGRaSpCLDQpJUtFHc/GhYg90DW5I0\nG7csJElFhoUkqciwkCQVGRaSpCLDQpJUZFhIkooMC0lSkWEhSSoyLCRJRYaFJKnIsJAkFRkWkqQi\nw0KSVGRYSJKKDAtJUpFhIUkqMiwkSUWGhSSpyLCQJBUZFpKkIsNCklRkWEiSigwLSVKRYSFJKjIs\nJElFhoUkqciwkCQVGRaSpCLDQpJUZFhIkooMC0lSkWEhSSoyLCRJRYaFJKnIsJAkFRkWkqQiw0KS\nVGRYSJKKDAtJUpFhIUkqMiwkSUWGhSSpyLCQJBUZFpKkomVNF6DF4fi8t+kSADglVjRdQmvmRRu0\n4f1Qf9yykCQVGRaSpKJWh
0VEfDIibomI30TE9RFxXNM1SdJi1OqwAN4NPCoztwKeA7wuIg5uuCZJ\nWnRafYA7M9d2dboP+PcmapGkxaztWxZExIcj4i5gLXBKZn636ZokabFpfVhk5muBLYCDgFMi4kkN\nlyRJi06rd0PNyMwE1kTEZ4EjgG/1N+aajv+n6ockqTJdP8o2irDosBz4Vf+DHzCpOiRpAZjiD79E\nXz7rkK3dDRUR20bE4RGxeUQsjYhnA38BfKHp2iRpsWnzlkUCrwHOBAK4DjgyM7/daFWStAi1Niwy\n85e4H0mSWqG1u6EkSe1hWEiSioYKi4h4X0QcOEu/JRFxZET8xWilSZLaYthjFpsDP5il36nAs4Ab\nImJDZn5uyGlIklpi2N1QPwNeHxFXRcTbu/odSXUW058Dzx6lOElSOwwbFlsCuwD/BKyMiBcBRMRy\nYDvgh5m5AbhpLFVKkho17G6oezJzJiCWAKvr7tsAZOZd9fMNo5UnSWqDYbcs8vf/VFsQv62fLu0a\nLoZsX5LUIsNuWWwdEecD/wY8FfhcRGwKHAWsj4idqXZBPWo8ZUqSmjTslsWxwDrgYOAi4AbgNGBT\nYE/gXOCzwKVjqFGS1LCorv495kYj/hjYLTM/NfbG+68h4cSmJi9JG6GTycyehw/G/gvuiPirzPxm\nk0EhSRqvoS8kGBHbAbt2tbEEOAb40Ih1SZJaZKiwiIjXAh/ggWc/QceZUpKkhWHY3VD7A7sByzJz\nycyDKjy+NrbqJEmtMOxuqO9m5o3dHTMzI+KEEWuSJLXMsFsW90XE1rP0WzlsMZKkdhp2y+Ja4IKI\n+AYw3dF9CfAy7r/8hyRpARg2LC6gupjgwT36eYBbkhaYYXdD/Rh4aOfBbQ9wS9LCNfTlPjLztu6O\nWf0c/B2jlSRJapuhwiIz57rm0w5D1iJJaqm+jllExOOoNhzW1s+fMsu4S4D/CnxybBVKkhrX7wHu\nNcA9wI71808BO88yrAe4JWmB6Tcs3gb8ruP5T4H9MvOXnQNFROBlySVpwekrLDLz7K5Ob+wOinq4\njIjzxlGYJKk9hj0b6gVz9HvxkG1Kklpq2LB4VUQ8qLNDRCyNiHcCB45eliSpTYYNi22B9888iYg9\ngG8Cr6K63aokaQEZ9nIfJwDXRMSbgaD6Id7fA88Enjim2iRJLTFUWGTmOwEi4jDgLODPM/OLdbet\nxleeJKkNimFRH5t4+Cy9/xX4CLBlRDyEarfWW4AvjK1CSVLj+tmyeBrwlQHa9Ed5krTA9BMWtwJf\nBlYBGwrDBnDhqEVJktqln7CYBk7PzOv7aTAiTh+pIklS6xTDIjNvBS4ZoM1thy9HktRGw546S0Rs\nB+za1cYS4GjggyPWJUlqkaHCIiJeC3yA6s543TzALUkLzLC/4N4f2A1Y5m1VJWnhG3Y31Hcz88bu\njvVVZ08YsSZJUssMu2VxX0RsPUu/lcMWI0lqp2G3LK4FLoiIb1CdWjtjCfAyYPVoZUmS2mTYsLgA\n2BI4uEc/D3BL0gIz7G6oHwMP7Ty47QFuSVq4hg2LYzPztu6OmZlUlyuXJC0gw16i/NLubvXlyvcA\nrhq1KElSuwz9C26AiNgeWFE//TawFngvcNGIdUmSWmTYX3D/EdWVaB/f1eu3wIdGLUqS1C7Dblmc\nSnUb1YuAI4BzgeXA86i2LiRJC8iwB7h/mpmnZea1AJl5Y2b+JDPfD+w1vvIkSW0wbFis6/j/3yLi\n0I7nm41QjySphYbdDbVNRFwNfAl4H/Dt+pLl66gu9+Hps5K0gAwbFh8AdgDWZua/R8Q7gbOobqt6\n5LiKkyS1Q1S/oxtDQxE7UP2q++qxNDiiiEg4sekyJGkjcjKZGb36jPQ7i06ZeXNE7BMRz83Md4+r\n3VEcn/c2XUIrLI3m346TWxLcbVgmTokV5YHmQRvmRVu05T1ps2EPcPeUmRePu01JUvP6/mC
PiJdE\nxC0R8eOI+E8d3beIiKdHxNER8TGq31pIkhaQvnZDRcTewMeofqG9DfC5iNgNeCnwP4BNOgY/atxF\nSpKa1e8xi9cBr83McyJiKXA88C7gvwCfBn4F3ANclpmXTKRSSVJj+g2LbTPzHIDMXF+fKnsjcEBm\nfm9i1UmSWqHfYxZ3dD7JzPuAzxgUkrQ49BsWvc6x+3mvASPio8OXI0lqo353Qz0iIh7S8TyAzbq6\nzbT35LFUJklqjX7D4lnAL3t0P6FHt/H8JFyS1Br9hsXtwBmUgyCozpwaWUSsAM4EDgQeAvw/4K2Z\n+eVxtC9J6l+/YXFpZvZ1vYaI2HOEejotA24CnpGZN0XEc4HPRMTjM/PGMU1DktSHfsPivQO0efow\nhXTLzLuBkzuefzEibgD2oTptV5I0T/o6Gyozr+y3wUGGHUR9v4zH4G1bJWnebRQX/YuI5cCngPMy\n87qm65GkxWZslyiflIhYApxPdTmRVYOMe/lJ//L7/3c+YCemDth5vMVJ0kZtun6UtTosIiKAc4Ft\ngUMyc/0g4+9/0tMnUpckLQxT9WPG5bMO2eqwoDp1dnfgoMz8bdPFSNJi1dpjFhGxM/BqYC/g5xFx\nR/04ouHSJGnRae2WRf1bitaGmSQtJn4YS5KKDAtJUpFhIUkqisyFeZHYiEjo63JWkiQATiYzo1cf\ntywkSUWGhSSpyLCQJBUZFpKkIsNCklRkWEiSigwLSVKRYSFJKjIsJElFhoUkqciwkCQVGRaSpCLD\nQpJUZFhIkooMC0lSkWEhSSoyLCRJRYaFJKnIsJAkFRkWkqQiw0KSVGRYSJKKDAtJUpFhIUkqMiwk\nSUWGhSSpyLCQJBUZFpKkIsNCklRkWEiSigwLSVKRYSFJKjIsJElFhoUkqciwkCQVGRaSpCLDQpJU\nZFhIkooMC0lSkWEhSSoyLCRJRYaFJKnIsJAkFRkWkqSiZU0XIM2n4/PepkvglFjRdAnqciInN11C\nK8w1F9yykCQVGRaSpCLDQpJUZFhIkooMC0lSkWEhSSoyLCRJRYaFJKnIsJAkFRkWkqQiw0KSVGRY\nSJKKDAtJUpFhIUkqMiwkSUWGhSSpqLVhERGrIuJfI+KeiPh40/VI0mLW2rAAbgZWAx9rupD5Md10\nAUOYbrqAAU03XcAQppsuYAjTTRcwoOmmCxjYdAPTbG1YZObnM/MLwK+armV+TDddwBCmmy5gQNNN\nFzCE6aYLGMJ00wUMaLrpAgY23cA0WxsWHaLpAiRpsdsYwiKbLkCSFrvIbPdncUScAuyQmS8fcLx2\nvzBJaqHM7Lk3Z9l8FzKEoT70Z3vBkqTBtTYsImIpsJyqxqURsQlwX2aub7YySVp82nzM4gTgbuAt\nwEuAdcBxjVYkSYtU649ZSJKa1+YtC0lSSxgWkqQiw0KSVGRYSJKKDAtJUpFhIUkqanVYRMSKiDg3\nIqYj4jcR8b2IOLij/4ER8aOIuCsiLo2InbrGPy0iflk/Tu3ovm1EXBARN0fEryPi6xHxpDbX3DXM\n/hGxISJWt73eiHhDRFwfEXdGxDUR8eg21xwRe0bEmnq5+GlEHD9qvaPWHBErI+KyuqYberQ9Vfe/\nKyKujYgD21zzpNa/Sc7jjuHGtu7NR81jXf8ys7UPYDPgRGCn+vlzgd8AOwEPA24HDgNWAO8BruwY\n92jgR8Aj6sda4Oi636OANwLbUV3V9lXAL4DN21pzxzDLge8DVwDvaHO9wCuBq4DdO+b7g1te83ep\n7qMSwC7Az4BDG655P+DF9XJ6Q4+2rwTeC2wCvAC4DXhYW2tmQuvfJOfxJNa9eVguxrr+jfxi5/tR\nv/gXAK8Gvt410+8GHlM/vwJ4ZUf/l3fO6B7t3g48oe01A8cCpwIfB1a3tV6qrdafAis3puUCuGdm\n5aqffwZ4S5M1d3Q/qPtDAXhMXfPmHd0up+tLRptqnqX
diax/46x3Pta9MS4XY1//Wr0bqltEbEe1\ncvwQ2JNqpgKQmXcDP6m7A+zR2R+4uqNfd7t7UyX3T9pcc0TsTPXhNvPNd+zGWO+OwA7A4yPipnpT\n+KSIGHvdY14uvgK8NCKWRcTuwJOBrzZU8+P6aGpP4PrMvKuj21XMsqyPYow1d7c7kfVvnPXOx7pX\nT2dcNY99/dtowiIilgOfAs7LzOuAzak21zr9Btiy/n8Lqm8rnf226NHuVsD5wEmZeUfLaz4DOL7+\nYEjGfK+PMde7Y/33WVQL90rgCOAVLa4Z4E3AC6muRXYNcE5mfqehmh+wvPbQ/Xpmxt2yx7BDG3PN\nne1OZP2bQL0TXfdg7DWPff3bKMIiIpZQLVD3AKvqzncCW3UNujVwxyz9t667dba7KXAxcEVmntbm\nmiPiUGCLzPzszCQY4zecCczjdfXf92TmbzLzRuAs4JC21hwRmwGXAm+n2v//SODgiDim4Zrn0mvc\nbXjgh8zQJlDzTLsTWf/GXe+k1716GuOex2Nf/1ofFvVm07nAtsBhef8lytcCe3UMtzmwa919pv/e\nHU3tRbVpNzP8JsD/Am7KzKM3gpqfCewbEbdExC3AXwJvjIjPt7TeHwP39pjcWL6RTajmPYEtM/OT\nmbkhM28GLmRMATdCzXNZC+wSEZ3fNvfqc9ymap7Y+jeheie27k2w5vGvf5M6SDPGgz0foTrbY/Ou\n7g8Dfk11IOhBVGcKXNHR/2iq3QiPoNp3txZ4dd1vOdU3ms8DSzeSmrcAHl4/tgM+DZwObNPGeuv+\nn6jn8xZUm8XXAi9v8Tx+MNW3uSOovkhtX0/jlIZrjrr7c4Bpqq2eFR39rwT+ez3MzNlQD21rzZNc\n/yZU78TWvQkvF2Nd/8b2Jk3iAewMbKA6A+COjscRdf8D6xlwN9Xug526xj8N+FX9OLWj+/51u3d2\ntfvUttbcYzofZzynzk6sXqr95hdQ7RK5iWqfb2uXi7rfc6hOn70duIVq0/1BTdYMHFCPuwFYX/+9\ntKvty+pxrwWe2fR8nqtmJrT+TXIeT2Ldm4flYqzrn/ezkCQVtf6YhSSpeYaFJKnIsJAkFRkWkqQi\nw0KSVGRYSJKKDAtJUpFhIUkqMiwkSUWGhdSHqG7FelV9S80NEXFxROzSNczqiLg3Iu6OiC9GxCOb\nqlcaNy/3IQ0gIv4BeD6wX/a4z0VEXAIck9X9CKQFw7CQBhARO1Bd2O064EmZuaGj3+FU9z04p6n6\npElxN5Q0gKzucXECsA/336SGiNgGeIFBoYXKLQtpQPVdzb4FPBrYPTNviYgzgA9n5o+6hl0KHEd1\nY5tfAI+nuoXo2rr/jsAbqO5bsCnV3fmOzcxb6v4vrMffherWmIcALwMOz8xvTvilSr+3rOkCpI1N\nZm6IiFdTBcYHIuIDwK3dQVE7F1ifma8AiIi9gK9GxG5Z3c/5PcDjMvM/1v3fAnyZ+g5pmXlhRFwH\nfAd4LrAa+DOqG/FI88awkIaQmd+NiA8Cr6e6e9qzuoeJiL2Bo4Andox3VUTcA/wJ1Y1p/onqxjQz\nLgLeHRHbZ+bP62631X+/ldUtNztvCyvNC8NCGt4ZVGHxj5nZ637HK+u/h0TEMzq6XwGsA8jM8yPi\n4fWWyo4dw6zo0d71Y6hZGophIQ1v5kyo9YX+F2Rmzw/6OiSOA1Zl5tkRsTNw/Czt3T10pdKIPBtK\nmpx/rv/+wW6jiNgsIvatT8M9E1idmRfXvZd3DHfY/JQplRkW0vCi/ttzPcrMa6h3VdVnUM14M9XZ\nT1vVbazr6Pc84LfAZsAeXdNZOp6ypcF56qw0hIj4BNVvLfYAbgXWAO/KzO/1GHYV8BRgmmrX70WZ\n+fW634uBVwKXUe1mugp4NrA7cDbV6bR/Q3WQ/FrgK5n51xN8aVJPhoUkqcjdUJKkIsNCklRkWEiS\nigwLSVKRYSFJKjI
sJElFhoUkqciwkCQVGRaSpCLDQpJU9P8Bv+cRR6qYvXwAAAAASUVORK5CYII=\n", + "text": [ + "" + ] + } + ], + "prompt_number": 15 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "QUESTION 2: Which name had been the most chosen name for the longest consecutive years?" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def count_max_dups_numbers(L):\n", + " ans = []\n", + " if not L:\n", + " return ans\n", + " running_count = 1\n", + " for i in range(len(L)-1):\n", + " if L[i] == L[i+1]:\n", + " running_count += 1\n", + " else:\n", + " ans.append(running_count)\n", + " running_count = 1\n", + " ans.append(running_count)\n", + " # return maximum ocurrence and number of times. Remember that Python starts at 0, which explains the -1 in cumsum\n", + " return [L[np.cumsum(ans)[np.argmax(ans)-1]], np.max(ans)]\n", + "\n", + "\n", + "\n", + "def count_max_dups_str(L):\n", + " ans = []\n", + " if not L:\n", + " return ans\n", + " running_count = 1\n", + " for i in range(len(L)-1):\n", + " if L[i] in L[i+1]:\n", + " running_count += 1\n", + " else:\n", + " ans.append(running_count)\n", + " running_count = 1\n", + " ans.append(running_count)\n", + " # return maximum ocurrence and number of times\n", + " #print np.cumsum(ans)[np.max(ans)-1]\n", + " return [L[np.cumsum(ans)[np.argmax(ans)-1]], np.max(ans)]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 16 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "li = ['aaa','bbb','aaa','abb','abb','bbb','bbb','bbb','aaa','aaa']\n", + "\n", + "print count_max_dups_str(li)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "['bbb', 3]\n" + ] + } + ], + "prompt_number": 17 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for i in range(np.shape(Names)[1]):\n", + " \n", + " # Print maximum consecutive duplicates by rank (F1--F5, M1--M5), and 
also age ranges (inverted, as Years goes from 2017 to 1918)\n", + " print count_max_dups_str(list(Names[:,i])), '\\b',\\\n", + " Years[np.where(array([label == count_max_dups_str(list(Names[:,i]))[0] for label in Names[:,i]]) == True)[0]][::-1]\n", + " print" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "['Mary', 29] \b ['1918' '1919' '1920' '1921' '1922' '1923' '1924' '1925' '1926' '1927'\n", + " '1928' '1929' '1930' '1931' '1932' '1933' '1934' '1935' '1936' '1937'\n", + " '1938' '1939' '1940' '1941' '1942' '1943' '1944' '1945' '1946' '1953'\n", + " '1954' '1955' '1956' '1957' '1958' '1959' '1960' '1961']\n", + "\n", + "['Barbara', 8] \b ['1937' '1938' '1939' '1940' '1941' '1942' '1943' '1944']\n", + "\n", + "['Patricia', 7] \b ['1937' '1938' '1939' '1940' '1941' '1942' '1943' '1946' '1947' '1949'\n", + " '1950' '1951' '1952']\n", + "\n", + "['Helen', 6] \b ['1925' '1926' '1927' '1928' '1929' '1930']\n", + "\n", + "['Ava', 7] \b ['2006' '2008' '2009' '2010' '2011' '2012' '2013' '2014']\n", + "\n", + "['Michael', 38] \b ['1954' '1955' '1956' '1957' '1958' '1959' '1961' '1962' '1963' '1964'\n", + " '1965' '1966' '1967' '1968' '1969' '1970' '1971' '1972' '1973' '1974'\n", + " '1975' '1976' '1977' '1978' '1979' '1980' '1981' '1982' '1983' '1984'\n", + " '1985' '1986' '1987' '1988' '1989' '1990' '1991' '1992' '1993' '1994'\n", + " '1995' '1996' '1997' '1998']\n", + "\n", + "['Christopher', 16] \b ['1972' '1973' '1979' '1980' '1981' '1982' '1983' '1984' '1985' '1986'\n", + " '1987' '1988' '1989' '1990' '1991' '1992' '1993' '1994']\n", + "\n", + "['John', 24] \b ['1929' '1930' '1931' '1932' '1933' '1934' '1935' '1936' '1937' '1938'\n", + " '1939' '1940' '1941' '1942' '1943' '1944' '1945' '1946' '1947' '1948'\n", + " '1949' '1950' '1951' '1952' '1961' '1962' '1968']\n", + "\n", + "['William', 24] \b " + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "['1926' 
'1927' '1928' '1929' '1930' '1931' '1932' '1933' '1934' '1935'\n", + " '1936' '1937' '1938' '1939' '1940' '1941' '1942' '1943' '1944' '1945'\n", + " '1946' '1947' '1948' '1949']\n", + "\n", + "['Richard', 18] \b ['1930' '1931' '1932' '1933' '1934' '1935' '1936' '1937' '1938' '1939'\n", + " '1940' '1941' '1942' '1943' '1944' '1945' '1946' '1947']\n", + "\n" + ] + } + ], + "prompt_number": 18 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "QUESTION 3: How many unique male names have be on top 5 between years 1980 and 2000?" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "wh_1980_2000 = np.arange(np.where(array([label == '2000' for label in Years]) == True)[0][0], \\\n", + " np.where(array([label == '1980' for label in Years]) == True)[0][0])" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 19 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "Female_names = Names[np.where(Gender == 1)].reshape( len(Years), len( np.where( array(gndr)==0 )[0] ))\n", + "Male_names = Names[np.where(Gender == 0)].reshape( len(Years), len( np.where( array(gndr)==0 )[0] ))" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 20 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for i in range(np.shape(Male_names)[1]):\n", + " \n", + " print len(np.unique(Male_names[:,i][wh_1980_2000])), np.unique(Male_names[:,i][wh_1980_2000])" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "2 ['Jacob' 'Michael']\n", + "4 ['Christopher' 'Jacob' 'Matthew' 'Michael']\n", + "3 ['Christopher' 'Jacob' 'Matthew']\n", + "5 ['Christopher' 'David' 'Jacob' 'Jason' 'Joshua']\n", + "7 ['Andrew' 'Christopher' 'Daniel' 'David' 'Joshua' 'Nicholas' 'Tyler']\n" + ] + } + ], + "prompt_number": 21 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + 
"QUESTION 4: Are there more unique male names or more unique female names that are on top 5?" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print len(np.unique(Female_names))\n", + "print len(np.unique(Male_names))" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "43\n", + "24\n" + ] + } + ], + "prompt_number": 22 + }, + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "QUESTION 5: What is the distribution of the numbers of consecutive years that a male name remains the most chosen name?" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def count_dups_numbers(L):\n", + " ans = []\n", + " if not L:\n", + " return ans\n", + " running_count = 1\n", + " for i in range(len(L)-1):\n", + " if L[i] == L[i+1]:\n", + " running_count += 1\n", + " else:\n", + " ans.append(running_count)\n", + " running_count = 1\n", + " ans.append(running_count)\n", + " # return maximum ocurrence and number of times. 
Remember that Python starts at 0, which explains the -1 in cumsum\n", + " return ans\n", + "\n", + "\n", + "\n", + "def count_dups_str(L):\n", + " ans = []\n", + " if not L:\n", + " return ans\n", + " running_count = 1\n", + " for i in range(len(L)-1):\n", + " if L[i] in L[i+1]:\n", + " running_count += 1\n", + " else:\n", + " ans.append(running_count)\n", + " running_count = 1\n", + " ans.append(running_count)\n", + " # return maximum ocurrence and number of times\n", + " #print np.cumsum(ans)[np.max(ans)-1]\n", + " return ans" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 23 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "li = ['aaa','bbb','aaa','abb','abb','bbb','bbb','bbb','aaa','aaa']\n", + "\n", + "print count_dups_str(li)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "[1, 1, 1, 2, 3, 2]\n" + ] + } + ], + "prompt_number": 24 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "num_con_years_male_M1 = []\n", + "\n", + "#for i in range(np.shape(Names)[1]/2, np.shape(Names)[1]): # this prints M1--M5, but I just want M1\n", + "for i in [5]:\n", + " \n", + " # Print consecutive duplicates for M1, and also age ranges (inverted, as Years goes from 2017 to 1918)\n", + " num_con_years_male_M1 = count_dups_str(list(Names[:,i]))[::-1]\n", + " \n", + " \n", + "num_con_years_male_M1 = np.asarray(num_con_years_male_M1)\n", + "\n", + "print num_con_years_male_M1" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "[ 6 16 13 1 6 1 38 14 4 1]\n" + ] + } + ], + "prompt_number": 25 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "fig, ax = plt.subplots()\n", + "\n", + "plt.scatter(np.arange(len(num_con_years_male_M1)), num_con_years_male_M1, color='blue', s = 50);\n", + "plt.plot(np.arange(len(num_con_years_male_M1)), 
num_con_years_male_M1, color='blue', lw=1.5);\n", + "\n", + "ax.set_xlim(-1., 10.);\n", + "ax.set_ylim(-3., 42.);\n", + "\n", + "ax.tick_params(labelbottom=False);\n", + "ax.tick_params(axis='y', which='major', pad=10, labelsize=12)\n", + "\n", + "\n", + "ax.set_title(r'$\\rm{Number \\, consecutive \\, years \\, male \\, name \\, top}$' + '\\n', fontsize=18);\n", + "\n", + "#ax.set_xlabel(r'$\\rm{Year}$', fontsize=18, labelpad=5);\n", + "#ax.set_ylabel(r'$\\rm{Number \\, consecutive \\, years \\, male \\, name \\, top}$', fontsize=18, labelpad=10);" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "display_data", + "png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAAEXCAYAAABSwdSZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XmcW2XZ//HPPdN9aEuhUNm6sBSR8qItVRG6TAZQwZXl\n0SI/wAUQkPUpD/gI2mFTdkTZBXEDZRNRH1yBKS1lb8EyQ6EUWixgoQXsvs1cvz+uxIYwk2UmyTlJ\nvu/Xa14zOUlOrpNzcs2d+1znvoOZISIi1aEu6gBERKR4lNRFRKqIkrqISBVRUhcRqSJK6iIiVURJ\nXUSkiiipi4hUESV1EZEqoqQuIlJFlNRzCCF8KYTw+xDCzBDCh6KORyRqIYSBIYSxIYSmEMJWUccj\n7xebpB5C+FwI4dchhCUhhI4QwtSM+w8OITySvG9xCOGKcsRlZncBNwH7A33K8ZrSuRDCCSGEZSGE\nMVHHUuNGA2cDfwe0L2ImNkndzP5gZkcCfwMeAK4LIWyfdv+fzGwy8EdgrJmdVcbwWsv4WjUvhHBG\nCGFEJ3dtBNYBGrAoQmb2DPD/oo4jTrIcs2UXm6Se4RtAB3BbJ/ctN7N3yxyPlFeCThK3md1mZjua\nmf7JRszMOqKOIWY6PWajEMukbmZLgZOAg0II34o6HimPEMKAEMLJwGeBEHU8IrnE8ZjtFXUAXTGz\ne0IIdwCXhRD+ZmYvZT4mhHAIcCWwO/A1M/t5COEg4Md4v1+jmT0SQmgCLkk+7khgD3zbJwCPAD8D\nTsG/HXwUeMTMftRJWI0hhG3wf4bjgVlmdl1aPPXAucA2wNvAXkCzmbWGEL6cvG9n/L/6IcBXgalm\n9kS29yKE8FXgIODVZIzvmtnVafefkty214EdgOfN7KbkfV1t+3hghpldn7aek5Pr7wA+BBxsZp/I\ntW254gwhfAa4ghz7KRlfAv9wXBFCWAXMM7OrQgjjgR8COwFHmdnsEMJpwJnACKAFOMLM3gkh3AUc\nAcwGDjWzt/OJP+M9Px44B99fTwJfMLOlIYSfAccAjwFfMrPXc607hLAjcDrwHtA/uQ3fNrM3k/dn\nPTay7ZcuYu/R8Z4r3iyvW9B7nCPWzo7PXO9jT7e74PjJcsymrTfb53Mq8B18338DP0dRD4wF/pi+\n/Xkzs1j9ALel/T0YeA14HKjPvD95u39yJx3TybLJacuGJJc9AAxOLhsBtOPJpV9y2cjksu3Snjsy\n+dzT05bVA7OAs9OW/Qy4Ne323sCbwBbJ2+OS62lOPv9Z4HM53o//ST6uV/L20cl1fCR5+
8r010wu\n+wnwgzy2fSMwLHl7f+BHGet5qIBtyxVnvvtpRHLZ8E7ei6GdPH7H5HYkMtb7i4znZo2/i/d+KLAG\n/8eQWjYAuCfPdTckb98B/CPt/nOA5zLW0emxkWu/ZIm9J8d7zniTyzP3RcHvcb7HZwHvY0+2u7vx\nZztm8/l87pF8/qlpy7bEc9/3cu3rD7xmoU8o9Q8fTNoHJt/873Z2f9rBdUwnyyZ3sux7abfrksuO\n7WTZ/mnLRpKWoNKWHw2sxj/8Y5OPGZfxmFeBIzPWc0ie78VWwFrgpIwD6BygN/Dh5HszPuN545LL\nd81j2/dL3v488CIwBQjJZV9I/s62bVNzxNmnkP2U9h594AOSZb/+Hvh12u2j8JPp5BH/kTn2wa3A\nw2m3vwGMKWTdyePkkrT7Uh/iD3Wy3YdkrKfL/ZLH8dPd4z1nvJn7oifvcT7HZzfiKmi7e3iMdHrM\nkufnM8vz/xdYDwzNZ3+nfmLb/ZJiZn8PIVwHfDeE8EARVrkobd0dIQTwHZe5rD7PdfUHJuE7BuCQ\nEMLktMfMxhNeulfyjHUi0Bf4T9eTmS0GLgUIIXwK/9r3z4znvZFc/kng5Yx4U+tJbWfqGHgAT9AP\nA6tDCC3ARcn7Elm2bV2uOMvgJ8DdIYStzOwdPKHfnnZ/tvgz902mHwFzQwh7mn8NH2NmtxaybjP7\nZQhh2xDCCfg3i5TOSmQzj41s+yUfi1J/5Hu8FxhvSk/e42yx9kpbVkhcna0r23YXI/5MhX4+Mz2G\nN972wxsueYl9Uk86G38Dfgn8I+JY0vVP/t6I/+cFbzHmStpr8lx/6oBbl+P+BrwPMGVA8nfvPF8n\ntY5jgOl4q/BA4MEQwkS8FQFdbFsI4dAccXZbCGGSmc3M8bAHgGXAV0MIf8Bbtumyxp+NmT0XQpgF\nnB5C+AWQGUvOdSeT0LnAKWZ2c7L07bwuXjLz2Ohyv5jZs4VsS74KjDel2+9xiePKV9HiTztme/r5\nTP2zKqjSKJbVL5nMbB3+1WtX4NBOHvK+jQ4hDCtHXPjJj7XAo8CDyWVjM2IZEEKY0M31z8b/YWSu\ns3/yApyW5KLhGc/bJfn74QJe64t4//4CM7vFzKYCN+BJ5G/Jx3S1bY/miDMln/2U+ueYqiRIdPKY\n9zGzdrz89Ti8VfubjIfkij+XH+NdOkcDvytk3SGEHfD38UIz+0Py7t5pjzs8x2tn2y9F14N4/578\nXczjvxhx5asn8Xd1zM5I/s7385lZPbMfm/NL3uKY1LcOIQzJXGhmTwE/oPP/bvOBrdNuH4onmW1T\nC0LyuxZp25zvsqRNpCWYEEJfvH/1O2b2bvKr+Y+A00II6c89Cz9bD5t3Wj5dO5iXdl6MtxIHpt31\nLWC1mc0BbsaTWbrjgRvM7B8FbGcAzkxbDt7yfsrM2rJtm5m9lSXO9JZnzv2EfzV9j80HfuoDk23f\ngPd97w5sbWar0u/IFX8n68r0W+AdYKFl1Gfnse5B+Hub/hX+s3hf6QDgI6nNS/7OPDa63C/ZAu7B\n8Z5XvJnPy/P470ms3Yor32U9iZ8ujlnzi7Ryfj7TfCUtvp2AE0nmlxyv/z6pEy+RCyF8Hn8DJ+Jn\nfe80s3MyHtMLmGkZ5VzJ/6Tn4VUyhvdF3YN/EK8H5gHfAybjfW0/w/9LXphc1gbcgl85em5y2Qv4\nG39d8mteM3At3g0UgN2AB8zs7oxYTsH/wy7Cu7d+b2azkmVr04B9kuv+q5n9d57vzXF4i60VT4IP\nmtnDafefgpdC/Rs/afmsJUstQwiJPLb9puR7tRswEP+q2Bd408x+kmvbCogz2366zsx+nHzcZ4FT\n8a62/zOzlhDCFHwfTAaeBy7L6DcnhHAfcIGZze3ifcwafzYhhB8B07v6gGVbdwjhKPyD/TD+T+45\nvL/1w/iHvj9dHBshhK+QY790Eks++zzb8Z4r3pVp6
28FbrRk6V2h73EesXY3roK3uzvxp23HB47Z\ntPu6/Hwm7x+Jn0v5Ft7A2YgfC7/JzC/5iE1SF4mTEEIfYBvzOvR+eEL/36jjkuqTltRHmtlrPV1f\nHLtfROLgYjb3iZ6ItwRFSiFbt2LBlNRFOncXMCuEcB4w38xezfUEkUIlu2XvxLsj7wwhHNnjdar7\nRUSkeqilLiJSRZTURUSqiJK6iEgVUVIXEakiSuoiIlVESV1EpIooqYuIVBEldRGRKqKkLiJSRZTU\nRUSqiJK6iEgVUVIXEakiSuoiIlVESV1EpIooqYuIVBEldRGRKqKkLiJSRZTURUSqiJK6iEgVUVIX\nEakiSuoiIlVESV1EpIr0ivLFQwgW5euLiFQqMwudLY+8pW5mZf+ZPn16JK8b5Y+2uTZ+tM218ZNN\n5EldRESKR0ldRKSK1GRSb2xsjDqEstM21wZts4Rc/TP/eWAIuwHzgLvN7OjksgOA64CdgCeAr5rZ\na3m/eAiW7+uLiIgLIWBFOFF6HfAkYMmVDgXuBc4FhgBPA3f2LFQREemJvJJ6CGEq8C7wIJD673AY\n8LyZ3WtmG4BmYO8QwuhSBCoiIrnlTOohhEHA+cCZbE7oAHsCz6VumNka4GVgTJFjFBGRPOXTUr8Q\nuMXM3sC7XlKd4A3AiozHrgC2KF54IiJSiKxXlIYQxgIHAONSi9jcWl8FDMp4ymBgZSEBNDc3/+fv\nxsZGnckWEcnQ0tJCS0tLXo/NWv0SQjgduJjNiXoLoB54AbgRONbMJiYf2wC8DYw1s5fyenFVv4iI\nFCxb9UuupN4fGJi6CZwFjAROTN5+Gfg68ABwATDRzPYrIDAldRGRAmVL6lm7X8xsLbA2bUWrgLVm\ntjx5+3DgWuBXwOPA1GIFLSIihcv74qOSvLha6iJF097uv+vro41DSq9YFx+JSAzNmQOJBPTp4z+J\nBMydG3VUEhW11EUq2Jw5MHkyrF79/uUNDfDIIzB+fDRxSWmppS5SpaZN+2BCB1921lnlj0eip5a6\nSIVqb/fulo6Ozu+vq4MNG9THXo3UUhcRqRFK6iIVqr7e+9O7MmWKWum1SN0vIhVs7lyYNKnzE6Uz\nZ8K4cZ0/Tyqbul9EqtS4cV7lMmHC5mXDhimh1zIldZEKN348nHmm/73XXt7lMnZstDFJdJTURapA\nayv06gUnnABvvAELFkQdkURFSV2kCrS1wW67wac+5bcfeijaeCQ6SuoiVaC1FfbcE3bdFXbYAR5+\nOOqIJCpK6iIVbt06WLgQPvIRCAGamjypq7CsNimpi1S4F1/0q0o/8hG/nUjA2297611qj5K6SIVr\na/Pfe+7pv5ua/Le6YGqTkrpIhWtt9TLG3Xbz2yNGwKhROllaq5TURSpcqvKlb9/Ny5qaYMaMzRNn\nSO1QUhepcG1tm/vTUxIJePddeO65aGKS6Cipi1Sw9evh5Zc396enJBL+W/3qtUdJXaSCvfSSd7Fk\nttS33x5231396rVISV2kgqXKFjNb6uCt9UcegY0byxuTREtJXaSCtbV55cvo0R+8r6kJVq2CZ54p\nf1wSHSV1kQrW2upDA6RXvqQ0Nvpv9avXFiV1kQrWWeVLyjbb+FC86levLUrqIhVq/XofYrerpA7e\nr/7oo/5YqQ1K6iIVasECr3zp7CRpSlMTrF0LTzxRvrgkWkrqIhUqVfmSraU+ebKP3Kh+9dqhpC5S\nodraoK7O69G7MmSIT3enpF47lNRFKlRrK+yyC/Trl/1xiQQ89ph3w0j1U1IXqVBtbdn701MSCdiw\nAWbPLn1MEj0ldZEKtGFD7sqXlEmT/AIllTbWBiV1kQq0YAFs2pRfS33gQPjoR9WvXiuU1EUqUGq2\no3xa6uCljU8+CStXli4miQcldZEK1Nqau/IlXSLhNe2zZpU2LomekrpIBWprg513hv7983v8fvtB\nnz7qV68FSuoiF
ai1Nf+uF4ABA2DffdWvXguU1EUqzMaNPjlGPidJ0zU1wZw5Ps2dVC8ldZEKk6p8\nKaSlDt6vbuYTZ0j1UlIXqTCpypdCW+of/7j3wasLprrlTOohhF+FEN4MIawIIbwSQjg37b4DQgjz\nQwirQwgPhRCGlzZcEWlt9UG68q18SenbF/bfXydLq10+LfUfAKPMbBBwMHBqCOFTIYShwG+Bc4Eh\nwNPAnSWLVESAzZUvAwYU/txEAubNg7ffLn5cEg85k7qZtZrZurRFG4G3gcOAeWZ2r5ltAJqBvUMI\nncyWKCLFUmjlS7qmJv/d0lK0cCRm8upTDyFcH0JYDbQCF5vZHGBP4LnUY8xsDfAyMKYUgYpI9ytf\nUvbZB7bYQv3q1SyvpG5mJwNbAAcCF4UQPgY0ACsyHroi+TgRKYGFCz2xd7el3ru3T5yhfvXq1Svf\nB5qZAS0hhLuBI4FVwKCMhw0GChpdorm5+T9/NzY20piaAl1EPiA121F3W+rg/eoPPABvvAHbb1+c\nuKS0WlpaaMmzzyx4rs5fCOEW4F/AYuBYM5uYXN6A97WPNbOX8lyXFfr6IrXswgth+nRYtap7J0rB\nL0DaZx/41a/gqKOKG5+URwgBMwud3Ze1+yWEsE0IYWoIoSGEUB9C+BTwX8D9wH3AmBDCYSGEfsB0\n4Nl8E7qIFK61FUaO7H5CB9h7b9hyS/WrV6tcfeoGnAgsAZYDFwJHm9lTZrYMOBy4GHgHmABMLWGs\nIjUv39mOsqmvh8ZG9atXq6x96snE3Zjl/geBPYock4h0YtMmePFFOPjgnq8rkYDf/Q4WLfKWv1QP\nDRMgUiEWLvRp7HraUofN9erqgqk+SuoiFSJV+dLdcsZ0e+4J22yjpF6NlNRFKkRqIK89itDhGYJ3\nwTz8sI/cKNVDSV2kQqQqXxoairO+RAKWLIGXXy7O+iQelNRFKkQxKl/SJRL+W10w1UVJXaQCpCpf\nitGfnjJ6tF9RqtLG6qKkLlIBXnkF1q8vbktd/erVSUldpAKkTpIWs6UOXtr41lub1y+VT0ldpAKk\nyhmLUfmSTv3q1UdJXaQCtLXBiBE+FnoxjRrlFTXqV68eSuoiFaAnsx3lkkj4TEgdHaVZv5SXkrpI\nzLW3w/z5xT1Jmq6pCd59F557LvdjJf6U1EViLlX5UsqWOqhfvVooqYvEXKoypVQt9R128Jp1JfXq\noKQuEnOlqnxJl0jAjBl+kZNUNiV1kZhra4Phw2HgwNK9RiIBK1f6VHdS2ZTURWKulJUvKan53lXa\nWPmU1EVirNSVLynDhvlrqF+98impi8TYokWwbl3pW+rgpY2zZvnsSlK5lNRFYqyYsx3lkkjAmjXw\n5JOlfy0pHSV1kRgr1UBenZkyxUduVL96ZVNSF4mx1lbYcUcYNKj0r7XVVjB2rPrVK52SukiMFXu2\no1yammD2bFi7tnyvKcWlpC4SUx0d8MIL5el6SUkk/ETpY4+V7zWluJTURWJq0SJvMZezpT5pEtTX\nq1+9kimpi8RUOStfUgYNggkT1K9eyZTURWKqnJUv6ZqavKxx1aryvq4Uh5K6SEy1tvoIioMHl/d1\nEwkf2GvWrPK+rhSHkrpITJW78iVl//2hd291wVQqJXWRGIqi8iVlwADYd1+dLK1USuoiMbR4sV+y\nH0VSB++CmTMH3nsvmteX7lNSF4mhUs92lEtTk39beOSRaF5fuk9JXSSGoihnTLfvvtCvn/rVK5GS\nukgMtbXB9tvDlltG8/p9+/oJU/WrVx4ldZEYKsdsR7kkEvCPf8CyZdHGIYVRUheJmVTlS1T96SlN\nTf67pSXSMKRASuoiMfPaa7B6dfQt9QkToKFB/eqVRkldJGairnxJ6d0bJk9Wv3qlUVIXiZmoK1/S\nJRI+8fWbb0YdieRLSV0kZtraYLvtYMiQqCPxpA7qV68kOZN6CKFPCOHWEMKiEMK
KEMLcEMKn0+4/\nIIQwP4SwOoTwUAhheGlDFqlucah8SRk3zgcUUxdM5cinpd4LeA2YbGaDgPOAu0IIw0MIQ4HfAucC\nQ4CngTtLFaxItTPzlnpcknp9vU9IrZOllSNnUjezNWZ2vpm9lrz9f8CrwATgMGCemd1rZhuAZmDv\nEMLoEsYsUrX++U+vfIn6JGm6piZYuNCrciT+Cu5TDyEMA0YDzwN7As+l7jOzNcDLwJhiBShSS+J0\nkjQl1a+u1nplKCiphxB6A7cDPzOzl4AGYEXGw1YAWxQnPJHaEtVsR9mMGQNDh6pfvVL0yveBIYQ6\n4JfAOuCU5OJVwKCMhw4GVua73ubm5v/83djYSGNjY75PFak6ra0wbBhsvXXUkWxWVweNjd5SN4MQ\noo6o9rS0tNCSZwlSMLPcDwohAD8FhgOHmNn65PLjgWPNbGLydgPwNjA22ZLPtV7L5/VFasW++/pV\nnA8+GHUk73fDDXDyybBgAey6a9TRSAgBM+v032u+3S83AB8GPp9K6En3AWNCCIeFEPoB04Fn80no\nIvJ+cat8Sad+9cqRT536COAEYG/gXyGElcmfI81sGXA4cDHwDl4RM7WUAYtUqyVLYOXKeFW+pOy+\nu18QpX71+MvZp25mi8mS/M3sQWCPYgYlUoviWPmSEoK31h98UP3qcadhAkRiIi4DeXWlqQmWLvWx\nYCS+lNRFYqK1FbbdNl6VL+lS/erqgok3JXWRmIjrSdKUUaNg+HCdLI07JXWRGEhVvsS16wW8H72p\nyZN6R0fU0UhXlNRFYuD112HFini31MG7YN55B+bNizoS6YqSukgMxP0kaYr61eNPSV0kBuJczphu\np538ilL1q8eXkrpIDLS1wTbb+E/cNTXBjBmwaVPUkUhnlNRFYiBOsx3lkkh4///cuVFHIp1RUheJ\nWCVUvqRTv3q8KamXUXu7/4ike+MN+Pe/K6elPmyYx6p+9XhSUi+DOXO8ddOnj/8kEvrqKptVSuVL\nuqYmmDkTNmyIOhLJpKReYnPmwOTJ0NLiF2x0dPjfkyb5fSKVUvmSLpGANWvgqaeijkQyKamX2LRp\nPpFwptWr4ayzyh+PxE9bm4/3UgmVLylTpvgVpuqCiZ+8Zj4q2YtX+cxH7e3e3dLVJdV1df71tb6+\nvHFJvOy/P/Tq5WWClWTcOBgyRCdMo1CMmY+kBDo6YNmyqKOQKMV5tqNcEgmYPRvWrYs6EkmnpF5C\n9fXen57NLrvA2Wf7ONVSe/71L3jvvco6SZrS1ATr18Njj0UdiaRTUi+xK67wbpZMDQ1wzz3wxS/C\nlVf6sKZnnOEDO0ntqMSTpCmTJvmxrX71eFFSL7FnnvFulj328A9AXZ1/bZ05Ew4/HH71K59J5stf\nhmuvhZ139lnbX3st6silHCqxnDFl8GCYMEF96nGjpF5Cb70F55wDjY3eItuwwX8eeshPMqXsthvc\ndhssWABf/SrccosPmnT88fDKK1FFL+XQ2gpbbeUzHlWiRAKeeMKHDdCFdfGgpF5C//M/Xrp4/fVe\n/lVfn73SZdQouOkmWLgQTjgBfvlLGD0ajj0WXnyxfHFL+aSGB6jUiZx32skH9tpyS11YFxdK6iUy\nYwb84hee2PfYo7Dn7rSTd8W8+iqcdhrcfbev48gjN/fBSuUzq6yBvDLNmeMn+cG3RRfWxYOSegls\n2AAnnQQjR8K553Z/PdttB1ddBYsW+Yfnj3+EMWPgiCPg2WeLFa1EZelSePfdyuxPB7+wbs2aDy7X\nhXXRUlIvgauughde8Nb2gAE9X9+228Ill3hy/+534e9/9z75z39el2lXskqufGlvh0ce6fr+GTPU\nxx4VJfUiW7QILrgADj0UPvOZ4q5766193anXmDULPvYx+PSn4dFHu36eRoeMp1TlSyUmdYkvJfUi\nO+00L1u85prSvcaWW3qLffFib8HPmQMTJ/r
FIC0t3r8JGh0y7lpb/TL7D30o6kgKl+vCuilTNPxF\nVDT2SxHdf79fTHT55eXtU1y9Gm6+GS67zK9QnDgRpk71fvjMPs+GBv/aPH58+eKTzk2e7CcXZ82K\nOpLumTvXT4pmDljX0ODXYaSX7UpxZRv7RUm9SFav9q/RgwZ5C7l37/LHsHYt3HorXHopLFnS9eMS\nCV0wEjUzGDrUT3rfdFPU0XTfnDnegJkxY/PAdTfeCN/8ZrRxVTsl9TI45xxvKc+a5aPuRWnNGthi\ni83dMJk0OmT0li71bpcf/hBOPz3qaHquvd0bFXvv7Q2a556Dvn2jjqp6aZTGEnv+ea94+frXo0/o\n4B+mSr2YpVZU8vAAnamv94bEtdf6hXJXXBF1RLVLSb2HOjq8Jn3QIO/2iAOdxIq/Si5nzObgg31M\no4su0hAXUVFS76Gf/9y7XC6/3PtI4+Kqq/yEVaZ+/XxUSIlWW5tXMW23XdSRFN8Pf+iTfpx6atdd\ngFI6Suo9sHy5DwOw//4+EFecjBvnVS6JhPehh+BljWPGqCohDlLDA1RjN9mOO8L558MDD8B990Ud\nTe1RUu+Bb3/bJzi44YbOx0yP2vjxXuWyYQNs3OjfJp5+WuNfx0FqIK9qddppftL09NNh5cqoo6kt\nMUxFlWH2bB8i98wzYa+9oo4mu9TokCecADvs4Bcu6WtxdN56y6cxrLb+9HS9enljZ8kSaG6OOpra\noqTeDZs2+cnRnXaC6dOjjiZ//fr5AGOPPgp//WvU0dSuWhke4BOf8DkBrrkG/vGPqKOpHUrq3fCj\nH/lBes01XsZVSb7xDRgxQq31KKUqX6q5+yXlkkt8KIQTT9x8cZKUlpJ6gZYs8db5Zz7jQwJUmj59\nPKE/9ZQP5Svl19bmJbDbbx91JKW31VZ+Luexx+CnP406mtqgK0oLdMQRfla/tdVnKqpEGzf6pBsD\nB/ocqnE8yVvNGhv95PXs2VFHUh5mfm1Ea6tfmBSn0t9K1aMrSkMIp4QQng4hrAsh3JZx3wEhhPkh\nhNUhhIdCCMOLFXQc/elPcO+9cN55lZvQwS/jnj7dJ9pQyVn5tbVVf396uhD8pOmKFZtnSpLSydlS\nDyEcCnQAnwL6m9nXksuHAi8D3wD+AFwETDKzT+T94hXUUl+71vtA+/b1cS369Ik6op5pb/ea9fp6\n3x5dYVoeb7/tk55cdZVXTtWS1PhIM2f6SKLSfT1qqZvZfWZ2P7A8467DgOfN7F4z2wA0A3uHEEb3\nNOA4uvhinzP0+usrP6GDJ/HmZv9KfNddUUdTO2ql8qUz3/seDB/ulWMbN0YdTfUqpDc187/CnsBz\nqRtmtgZvuY8pQlyxMn++tzCOPtqv0KwW//VfXmPf3OxlmlJ61TaQVyEaGrxy7PnnfSgBKY1Cknpm\nP0kDsCJj2Qqgwor8sjODk0/2A7LaRp6rq/PLuV96Ce64I+poakNrq1e+7LBD1JFE4wtfgM99zhsS\nr70WdTTVqVcBj81sqa8CBmUsGwwUdFFwc9rlZo2NjTQ2Nhby9JK74w6/rP6GG7wvtNp88Ys+Fsz5\n58ORR0YzuUctSZ0krcYxX/L14x/7e3D66TpRn6+WlhZaWlryemzeJY0hhAuBHdNOlB4PHGtmE5O3\nG4C3gbFm9lKe64z1idL33oPdd4eRI73OtlpL//74R289/eQncNxxUUdT3YYNg89+1meoqmWXXupj\nJ/3+937sSWF6WtJYH0Loh7fq60MIfUMI9cB9wJgQwmHJ+6cDz+ab0CvBuef6GB033li9CR38QqqP\nfxwuvBDWr486muq1bJmP+1KLJ0kznXmmvw+nnvrBOU6lZ/JJVd8F1gDnAP8PWAuca2bLgMOBi4F3\ngAnA1BLFWXZPPeVdLqecUv1D1YYAF1zgfZy13oIspVo+SZqpTx//fC1e7BNqSPHoitJOtLd7y/WN\nN7zyZVC
BIvxrAAAMKElEQVTmmYMqZOazJb3yCrz8MvTvH3VE1efGG72cb/FiL+0Tn4fg9tv9Wgl9\ng8mf5igt0A03+OXzV19dGwkdvLV+4YX+j6ySZ7ePs9ZWHwBup52ijiQ+Lr/ch6s4+WQNMFcsaqln\nePNN+PCHvaX+l7/UXpXCAQd4HfErr3Q+HZ503wEHwKpV8MQTUUcSLzffDN/8pk8NecwxUUdTGdRS\nL8C0aX6y8Lrrai+hg7fW33rLt1+Kq7VV/emdOe442HdfOOsseOedqKOpfErqaf72N/j1r73Uarfd\noo4mGvvtB5/+tF9Bq2nIimf5cli6VP3Gnamr8y7P5cvhO9+JOprKp6SetG4dfOtbsOuuntRr2QUX\n+AfsmmuijqR6vPCC/1ZLvXNjx/q8pjffrO6pnlJST7rsMliwwLsd+vWLOppoffSj8PnPw5VX+gVY\n0nOp2Y7UUu/aBRf4xCEnnqixiHpCSR0v4fv+9+HLX4ZPfjLqaOLh/PM9oV99ddSRVIe2Nq98USlj\n1wYO9IG+nn1W53R6ouarX8zg4IN9Fpr582tjirF8HXGET1D96quw9dZRR1PZDjzQJ4l48smoI4k3\nMzjkEJ8c/YUXanfgs1xU/ZLFPfd46eJFFymhZzr/fC/Bq7bRKaNQa7MddVcIcO21Pt56rU0iUiw1\nndRXrIAzzvBhAE4+Oepo4mfPPWHqVB8D+623oo6mcr37rl//oJOk+dllF6+Cuftub3BJYWouqbe3\n+w/4PJ1vvumXb/cqZBDiGjJ9ulcGXXpp1JEUJn0/R62WZzvqrrPPhtGjvSJt7dqoo6ksNZPU58zx\nWYv69PGfCRO8BfrNb8LHPhZ1dPG1++4+49P11/sQAnGXuZ8TCZg7N9qYUpUvaqnnr29fP+YWLoRL\nLok6mspSEydK58zxwao6G+Lz4YchZvNyxM4rr3hyP/FEn+Agrrrazw0N8MgjMH58NHGdcYaPVb9y\nZXUP4VwKX/kK3HsvzJvnLXdxNX+idNq0rsdsvuCC8sZSiXbeGb72Nb8wJM5TkHW1n1ev9kvQo9La\n6l0vSuiFu+oqv27kW9/SgF/5qvrDrL3dW2ldmTEjPn2vcXbeef774oujjaMrcd7Pqnzpvg99yI+5\nv/8d7rwz6mgqQ9UndSmO4cPh+OPhpz/17hjJz3vv+bkI9ad330knwT77eInjv/8ddTTxV/VJvb7e\n+1m7MmWKP0Zy+853/L268MKoI/mg+nqYOLHr+/fZJ5r9rMqXnquv9wq1pUvhu9+NOpr4q/qkDt4v\n19nY4A0NPr6J5Gf77b3V9Itf+Dg5cbJ0qV930JkQfIz43/62vDGBprArlgkT/FqS667zE+LStZpI\n6uPGeX9rIuEnq+rq/O+ZM6t//tFi+/a3/cTV+edHHclmzzzjH/oXX4Qf/OCD+/nPf4a99oLDD4fm\nZujoKF9sra0+NeCIEeV7zWp10UWwzTZehaXzYFmYWWQ//vLltWmT/0j3nX22WQhmra1RR2J2xx1m\n/fqZ7bST2Zw5m5dn7ue1a82OPdYMzA491GzlyvLE98lPmo0fX57XqgW33+778Lrroo4kWsnc2Wle\nrYk6dSmuZctg1CgfCO2uu6KJob0dzj3Xr3SdONFrmbfdNvtzzHyM+GnTvI/7/vu9XLOUdtwRmpq8\ny0p6zgwOOgieftoH4NtmG19ea+fFar5OXYpr6FC/oObuu30W+HL79799vPdLL4UTToAHH8yd0MH7\n1s84w7tjXn/dx41/6KHSxvn66zpJWkwheL/6mjWw997xunI4LpTUpVv++79h8GAfG6acXnrJJwX/\n61/9MvKbbvIPdSEOOsiHwB02zMfPv/ba0lzYopOkpZG6wOytt/z8SEcHtLTApEk6iQpK6tJNQ4Z4\nN8b99/tX4XL48599nJ7ly/1ilJNO6v66dt0VHn/cx+4+9VSvwV+/vnixg
soZS2XaNB+aN1PUVw7H\nhZK6dNvpp8NWW8H3vlfa1zGDyy+Hz3wGRo6Ep57y6wt6atAg+N3vvG/+1lu97/tf/+r5elNSlS8j\nRxZvnbUuzlcOx4WSunTboEE+ROqf/gSPPVaa11i71keJPPtsL0l89NHiJsm6Oi+Vu/NOn0btox8t\n3jePtjb48Idr7yRelFR3oaQuPXTKKV6BUIor/ZYs8auBb7/dr2K9887OLyIrhi99yf9h1NV53+wd\nd/R8na2t6k8vtlxXiNfV+ZXPtTypi5K69EhDg1+Q9OCD/tW3WGbP9guK5s/3LpLzzvPKh1IaO9Zb\n6R/7GBx1FJxzTve/yq9Y4f+U1J9efF1dId6/v58Ev/xy/zZ35pmVMQdA0XVVwF6OHyK4+EiKb80a\ns+22M5s82ayjo+fru/VWsz59zHbZxez553u+vkKtX2924ol+kcvBB5u9+27h63j8cX/+/fcXPz4x\ne+YZs0TCrK7OfxKJzRefzZ9vdswxZvX1Zn37mp18stnixdHGW2xkufhISV2K4sc/9qPpb3/r/jo2\nbjQ77TRfz4EHmi1fXrz4uuOGG8x69TLbfXdPFIW49VbfjgULShObuGxXiC9caHbccWa9e/vP8cf7\nsmqgpC4lt26dX6q/777da60vW2bW1ORH5BlneIKPgxkzzIYONRs82OyBB/J/3rRpPnyBhqSI3uLF\n3lrv08db78cea/bii1FH1TPZkrr61KUo+vb1fu/HH/dqmEI8/7z3Y8+aBbfdBldfHZ+JwCdP9n72\nUaO8pPKyy/KrsGhtVeVLXAwf7lehvvKKn9i/807YYw+fKi81f2w10dgvUjQbN/pcpltt5bXk+ZzY\nvO8+L1kcOND/3nff0sfZHatXw9e/7mPdfOUrcMstfmKuKyNG+Jg0t99evhglP0uX+pDb11/vww0c\nfrg3SPbeO+rI8qexX6Qsevf2C5GeecavNM2mo8Pnhz3sMC/7e/rp+CZ08GqL3/zGp1b79a+97HHJ\nks4fu3Klz+WqcsZ4GjbMv3EtWuTlj3/9q1c+feEL5bs6uqS66pcpxw/qU686GzeajR5tttdeZu3t\nnT9m5Uqzww/3/vOjj/ZhcSvJ739vNnCg2bBhZo8++sH7n3jCt+2++8ofmxTunXfMmpvNttxyc8XT\n7NlRR5Ud6lOXcunVywf5mjcP7rnH67zTa71ffRX239+7Wq68En7+c590o5J87nN+7mCLLaCx0YcY\nSDdvnv9WS70yDBnix+zixfD973vX4X77wYEHdn3tReZxHStdZfty/KCWelXatMls553NBgzwyTTq\n6swaG81uusls6629RfTnP0cdZc8tX2520EHeujv1VK9Nb2z022A2Zcr7J+6QyrBqldkVV/g3MTCb\nNMlLdTs6vD6+sXFzfXxjYzT7GE2SIeU0Z463dDob9XDUKPjLX2C33cofVyls2uRXnl51lV+injlV\nXkODD0A1fnw08Un3rV3rJ8QvvdTHxd9rLx/6OfO4jmIfZztRqqQuRZdI+PjWnZk0Kfsoe5Vqjz18\nSIPOJBKlnYxDSmv9ei+1PeOMrodnLvc+VlKXsmlv90kruprcua4ONmyorvrtWtzmWtPe7tVdXaWr\ncu/jkpY0hhC2CiHcF0JYFUJYFEI4sqfrFBGJm1IPKFcsxah+uQ5YB2wLHAXcEELQ2HQ1KtfQqFOm\nVF+LtRa3udZU0j7uUfdLCKEBeAfY08xeTi77OfCGmf1vHs9X90sVmjvX+85Tc0mmNDTAzJkwblw0\ncZVSLW5zrYnTPi5l98toYFMqoSc9B6hCt4aNG+cnQxMJ72usq/O/qzm51eI215pK2cc9balPAu4y\ns+3Slh0PfMXMEnk8Xy31Kpe6QCMuX03LoRa3udZEvY+ztdR7OhbeKmBQxrLBwMp8V9Dc3Pyfvxsb\nG2lsbOxhSBIntZjYanGba02593FLS
wstXdUJZyhFn/ovgX+a2XfyeL5a6iIiBSppnXoI4deAAccB\n44E/Ap8wsxfyeK6SuohIgUo99O7JQH/gLeBXwIn5JHQRESk+XVEqIlJhNEmGiEiNUFIXEakiSuoi\nIlVESV1EpIooqYuIVBEldRGRKlKTST3fy22riba5NmibRUm9Rmiba4O2WWoyqYuIVCsldRGRKhL5\nMAGRvbiISAUr2SiNIiISH+p+ERGpIkrqIiJVREldRKSKKKmLiFQRJXURkSry/wG5e+lTvNCXPgAA\nAABJRU5ErkJggg==\n", + "text": [ + "" + ] + } + ], + "prompt_number": 26 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 26 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/task-05/test b/task-05/test new file mode 100644 index 0000000..e69de29 diff --git a/task-09/get_top_names.py b/task-09/get_top_names.py index cccd0ad..b12f829 100644 --- a/task-09/get_top_names.py +++ b/task-09/get_top_names.py @@ -5,6 +5,7 @@ """ import pandas as pd +import sqlite3 def extract_data_lines(filename, start_text, end_text, include_start=False, include_end=False): @@ -31,17 +32,56 @@ class NameRecorder: def __init__(self): self.records = [] self.year = None + def add(self, name, is_female, rank): + + if self.year is None: + raise ValueError('One must set year first') + + + if is_female: + + self.gender = 'Female' + + else: + + self.gender = 'Male' - # complete this member function - raise NotImplementedError + + self.records.append((self.year, self.gender, rank, name)) + def to_pandas(self): - # complete this member function - raise NotImplementedError + + if self.records == []: + + raise ValueError('Empty data base') + + return pd.DataFrame.from_records(self.records, columns=['year', 'gender', 'rank', 'name']) + + + def to_sql(self, filename = None): + + # https://www.dataquest.io/blog/python-pandas-databases/ + # Check output by typing: + # recorder.to_sql("names") + # import sqlite3 + # conn = sqlite3.connect("names.db") + # pd.read_sql_query("select * from names;", conn) + + if filename is None: + + raise ValueError('Please, insert file name') + + else: # From pandas to SQL 
+ + conn = sqlite3.connect("%s.db" % (filename)) + df = recorder.to_pandas() + return df.to_sql(filename, conn, if_exists="replace") + def clear(self): self.records.clear() @@ -66,4 +106,4 @@ def clear(self): data = recorder.to_pandas() - print(data.query('name == "Emma"').query('rank == 1')['year'].tolist()) + print(data.query('name == "Emma"').query('rank == 1')['year'].tolist()) \ No newline at end of file diff --git a/task-10/GitHub_API.ipynb b/task-10/GitHub_API.ipynb new file mode 100755 index 0000000..5fd5aa1 --- /dev/null +++ b/task-10/GitHub_API.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[u'Hack-Hour', u'AoT', u'learning-by-doing']\n" + ] + } + ], + "source": [ + "import json\n", + "import requests\n", + "\n", + "repos = json.loads(requests.get('https://api.github.com/orgs/astropgh/repos').text)\n", + "print([repo['name'] for repo in repos])" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[u'2017-11-14T21:47:05Z', u'2017-11-14T22:12:02Z', u'2018-09-27T20:56:58Z']\n" + ] + } + ], + "source": [ + "print([repo['created_at'] for repo in repos])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hector-mr Task 09\n", + "kuanweih Task 09\n", + "hector-mr Task 08\n", + "troyraen Task/07\n", + "kuanweih Task/07\n", + "bretthandrews Completes Task/07\n", + "djperrefort Completes task-03\n", + "djperrefort Adds djperrefort to task-02/completed.md\n", + "troyraen Task/03\n", + "bretthandrews Completes Task/03\n", + "hector-mr Task 07\n", + "hsnee Task/07\n", + "hector-mr Data structure for task-04\n", + "KuanWang-Astro task03 completed\n", + "AlanPearl Completed task 1 by adding my username to 
completed.md\n", + "kuanweih completed Task/03\n", + "davidjsetton add my username to complete task 01\n", + "cfielder Task/02\n", + "troyraen Troy name in task-01/completed.md\n", + "kevindwilk added my username to task-02/completed.md\n", + "kevindwilk added my name Kevin Wilk\n", + "lizehan2008 I added my username\n", + "lizehan2008 I added my username \"lizehan2008\"\n", + "hector-mr Task 03\n", + "bretthandrews Completes Task/02\n", + "hsnee completed Task/03\n", + "hsnee finishing task 02\n", + "hsnee task-01 completed\n", + "KuanWang-Astro task/02 completed\n", + "KuanWang-Astro Task/01 completed, task/05 completed\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "pulls_learndo = json.loads(requests.get('https://api.github.com/repos/astropgh/learning-by-doing/pulls').text)\n", + "\n", + "\n", + "\n", + "for p, pp in enumerate(pulls_learndo):\n", + " \n", + " print pp['user']['login'], pp['title']" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is the user with the highest number of pull requests: hector-mr\n" + ] + } + ], + "source": [ + "pulls_learndo_users = []\n", + "\n", + "for p, pp in enumerate(pulls_learndo):\n", + " \n", + " pulls_learndo_users.append(p)\n", + " pulls_learndo_users[p] = pp['user']['login']\n", + " \n", + " \n", + " \n", + " \n", + "#word_counter = {}\n", + "\n", + "#for word in pulls_learndo_users:\n", + " \n", + "# if word in word_counter:\n", + " \n", + "# word_counter[word] += 1\n", + " \n", + "# else:\n", + " \n", + "# word_counter[word] = 1\n", + " \n", + " \n", + "#popular_words = sorted(word_counter, key = word_counter.get, reverse = True)\n", + "\n", + "#print \"This is the user with the highest number of pull requests: \", popular_words[0]\n", + "\n", + "\n", + "\n", + "\n", + "def most_common(lst):\n", + " return max(set(lst), key=lst.count)\n", + "\n", + "print \"This is the user with the 
highest number of pull requests: \", most_common(pulls_learndo_users)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "numpy_branches = json.loads(requests.get('https://api.github.com/repos/numpy/numpy/branches').text) " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "maintenance/1.0.3.x\n", + "maintenance/1.1.x\n", + "maintenance/1.2.x\n", + "maintenance/1.3.x\n", + "maintenance/1.4.x\n", + "maintenance/1.5.x\n", + "maintenance/1.6.x\n", + "maintenance/1.7.x\n", + "maintenance/1.8.x\n", + "maintenance/1.9.x\n", + "maintenance/1.10.x\n", + "maintenance/1.11.x\n", + "maintenance/1.12.x\n", + "maintenance/1.13.x\n", + "maintenance/1.14.x\n", + "maintenance/1.15.x\n", + "master\n", + "-----------------------------------------------------------------------------------------------\n", + "Charles Harris 2018-11-25T22:42:32Z\n", + "revert-11693-accept-nep18\n" + ] + } + ], + "source": [ + "for b, bb in enumerate(numpy_branches):\n", + " \n", + " print bb['name']\n", + " \n", + " if bb['name'] == 'master':\n", + " \n", + " print '-----------------------------------------------------------------------------------------------'\n", + " \n", + " numpy_master_comm_auth = json.loads(requests.get(bb['commit']['url']).text)['commit']['author']['name']\n", + " numpy_master_comm_date = json.loads(requests.get(bb['commit']['url']).text)['commit']['author']['date']\n", + " \n", + " print numpy_master_comm_auth, numpy_master_comm_date" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.15" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}