From f37acf35aaaf8ee957ad5ec5cf46178afae02e03 Mon Sep 17 00:00:00 2001
From: Anisa Hawes <87070441+anisa-hawes@users.noreply.github.com>
Date: Thu, 19 Oct 2023 10:51:38 +0100
Subject: [PATCH] Upload corpus-analysis-with-spacy.ipynb

Upload Python notebook into /assets directory
---
 .../corpus-analysis-with-spacy.ipynb | 1225 +++++++++++++++++
 1 file changed, 1225 insertions(+)
 create mode 100644 assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb

diff --git a/assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb b/assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb
new file mode 100644
index 000000000..5b8574860
--- /dev/null
+++ b/assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb
@@ -0,0 +1,1225 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "view-in-github",
+    "colab_type": "text"
+   },
+   "source": [
+    "\"Open In Colab\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "wVeD4Ik7D43F"
+   },
+   "source": [
+    "## Imports, Uploads, and Preprocessing\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Import Packages"
+   ],
+   "metadata": {
+    "id": "n_BVwGof2vi9"
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Tyh52tgEA_HA"
+   },
+   "outputs": [],
+   "source": [
+    "# Import spacy\n",
+    "import spacy\n",
+    "\n",
+    "# Load spaCy visualizer\n",
+    "from spacy import displacy\n",
+    "\n",
+    "# Import pandas DataFrame packages\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Import graphing package\n",
+    "import plotly.graph_objects as go\n",
+    "import plotly.express as px"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "IbNN7-PIBcW_"
+   },
+   "source": [
+    "### Upload Text Files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "-r9zTBIAlLsI"
+   },
+   "outputs": [],
+   "source": [
+    "# Import the files module to facilitate file uploads\n",
+    "from google.colab import files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "XaVUPnFIE_kS"
+   },
+   "outputs": [],
+   "source": [
+    "# Select multiple text files to upload from local folder\n",
+    "uploaded_files = files.upload()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "N3f8cxLrgzUq"
+   },
+   "outputs": [],
+   "source": [
+    "type(uploaded_files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "s2w09XuhKqOq"
+   },
+   "outputs": [],
+   "source": [
+    "# Add files into DataFrame\n",
+    "paper_df = pd.DataFrame.from_dict(uploaded_files, orient='index')\n",
+    "paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "BJJPgl5FL9qY"
+   },
+   "outputs": [],
+   "source": [
+    "# Reset index and add column names to make wrangling easier\n",
+    "paper_df = paper_df.reset_index()\n",
+    "paper_df.columns = [\"Filename\", \"Text\"]\n",
+    "paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Pre-process Text Files"
+   ],
+   "metadata": {
+    "id": "uXI3nyVQ2-sf"
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2QTKf4DV00Aa"
+   },
+   "outputs": [],
+   "source": [
+    "# Convert papers from bytes to strings\n",
+    "paper_df['Text'] = paper_df['Text'].str.decode('utf-8')\n",
+    "paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "k633V0JbBInq"
+   },
+   "outputs": [],
+   "source": [
+    "# Remove extra spaces from papers\n",
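+    "# (\\s+ matches any run of whitespace; str.strip() then trims leading/trailing spaces)\n",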
+    "paper_df['Text'] = paper_df['Text'].str.replace('\\s+', ' ', regex=True).str.strip()\n",
+    "paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Upload and Merge Metadata Files"
+   ],
+   "metadata": {
+    "id": "oBCoNFow2W4U"
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ZCASvLyJcq7C"
+   },
+   "outputs": [],
+   "source": [
+    "# Upload csv with essay metadata\n",
+    "metadata = files.upload()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "gby6n4lzcq-c"
+   },
+   "outputs": [],
+   "source": [
+    "metadata_df = pd.read_csv('metadata.csv')\n",
+    "metadata_df = metadata_df.dropna(axis=1, how='all')\n",
+    "metadata_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "T5s_ZEt-BcXP"
+   },
+   "outputs": [],
+   "source": [
+    "# Remove .txt from title of each paper\n",
+    "paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=False)\n",
+    "\n",
+    "# Rename the PAPER ID column to Filename so the two DataFrames can be merged\n",
+    "metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "s_Ta2d77BcXP"
+   },
+   "outputs": [],
+   "source": [
+    "# Merge metadata and papers into new DataFrame\n",
+    "# Will only keep rows where both essay and metadata are present\n",
+    "final_paper_df = metadata_df.merge(paper_df, on='Filename')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zOMIXCyOMOKl"
+   },
+   "outputs": [],
+   "source": [
+    "# Print DataFrame\n",
+    "final_paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "zQ8ve667EvxG"
+   },
+   "source": [
+    "### Alternate Code: Installs, Imports and Preprocessing in a Jupyter Notebook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# # Install and import spacy\n",
+    "# !pip install spacy\n",
+    "\n",
+    "# # Import spacy\n",
+    "# import spacy\n",
+    "\n",
+    "# # Install English language model\n",
+    "# !spacy download en_core_web_sm\n",
+    "\n",
+    "# # Import os to upload documents and metadata\n",
+    "# import os\n",
+    "\n",
+    "# # Load spaCy visualizer\n",
+    "# from spacy import displacy\n",
+    "\n",
+    "# # Import pandas DataFrame packages\n",
+    "# import pandas as pd\n",
+    "\n",
+    "# # Import graphing package\n",
+    "# import plotly.graph_objects as go\n",
+    "# import plotly.express as px"
+   ],
+   "metadata": {
+    "id": "KlvUa2oX1645"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ub8MflGfA_HB"
+   },
+   "outputs": [],
+   "source": [
+    "# # Create empty lists for file names and contents\n",
+    "# texts = []\n",
+    "# file_names = []\n",
+    "# # Iterate through each file in the path\n",
+    "# for _file_name in os.listdir('path_to_directory'):\n",
+    "#     # Look for only text files\n",
+    "#     if _file_name.endswith('.txt'):\n",
+    "#         # Append contents of each text file to text list\n",
+    "#         texts.append(open('path_to_directory' + '/' + _file_name, 'r').read())\n",
+    "#         # Append name of each file to file name list\n",
+    "#         file_names.append(_file_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Z7BmHGFBA_HB",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# # Create dictionary object associating each file name with its text\n",
+    "# d = {'Filename': file_names, 'Text': texts}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "yC5-sbOPA_HB"
+   },
+   "outputs": [],
"source": [ + "# # Turn dictionary into a dataframe\n", + "# paper_df = pd.DataFrame(d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cK3PvJkcA_HC" + }, + "outputs": [], + "source": [ + "# paper_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kax8Ecu7A_HC" + }, + "outputs": [], + "source": [ + "# # Remove extra spaces from papers\n", + "# paper_df['Text'] = paper_df['Text'].str.replace('\\s+', ' ', regex=True).str.strip()\n", + "# paper_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zGbZVIrFA_HC" + }, + "outputs": [], + "source": [ + "# metadata_df = pd.read_csv('path_to_directory/metadata.csv')\n", + "# metadata_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RO4lwuwJcrID" + }, + "outputs": [], + "source": [ + "# # Remove .txt from title of each paper\n", + "# paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=True)\n", + "\n", + "# # Rename column from paper ID to Title\n", + "# metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2eCYbDExeuqM", + "scrolled": true + }, + "outputs": [], + "source": [ + "# # Merge metadata and papers into new DataFrame\n", + "# # Will only keep rows where both essay and metadata are present\n", + "# final_paper_df = metadata_df.merge(paper_df,on='Filename')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C-8mOPIZA_HD" + }, + "outputs": [], + "source": [ + "# # Print DataFrame\n", + "# final_paper_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xdlHoqlBA_HD" + }, + "source": [ + "## Text Enrichment with spaCy" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Creating Doc Objects" + ], + "metadata": { + "id": "ejyC6xvA3w9q" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2P1TUR3oA_HD" + }, + "outputs": [], + "source": [ + "# Load nlp pipeline\n", + "nlp = spacy.load('en_core_web_sm')\n", + "\n", + "# Check what functions it performs\n", + "print(nlp.pipe_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ceknDmJZA_HE" + }, + "outputs": [], + "source": [ + "#Define example sentence\n", + "sentence = \"This is 'an' example? 
sentence\"\n", + "\n", + "# Call the nlp model on the sentence\n", + "doc = nlp(sentence)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KWX1hxoBA_HE" + }, + "outputs": [], + "source": [ + "# Loop through each token in doc object\n", + "for token in doc:\n", + " # Print text and part of speech for each\n", + " print(token.text, token.pos_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5pE_8YJ_A_HE" + }, + "outputs": [], + "source": [ + "# Define a function that runs the nlp pipeline on any given input text\n", + "def process_text(text):\n", + " return nlp(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "N3O8qhn1A_HE", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Apply the function to the \"Text\" column, so that the nlp pipeline is called on each student essay\n", + "final_paper_df['Doc'] = final_paper_df['Text'].apply(process_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BSNTL5msA_HE" + }, + "source": [ + "### Text Reduction" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#### Tokenization" + ], + "metadata": { + "id": "5tgNt7NO35I0" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SBEb3CErA_HE" + }, + "outputs": [], + "source": [ + "# Define a function to retrieve tokens from a doc object\n", + "def get_token(doc):\n", + " # Loop through each token in the doc object\n", + " for token in doc:\n", + " # Retrieve the text of each token\n", + " return token.text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GOQY6IYAA_HE" + }, + "outputs": [], + "source": [ + "# Define a function to retrieve tokens from a doc object\n", + "def get_token(doc):\n", + " return [(token.text) for token in doc]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7-qpJ-peA_HF" + }, + "outputs": [], + "source": [ + "# Run the token retrieval function on the doc objects in the dataframe\n", + "final_paper_df['Tokens'] = final_paper_df['Doc'].apply(get_token)\n", + "final_paper_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YLzPDKXqA_HF" + }, + "source": [ + "#### Lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "50hlrH_KA_HF" + }, + "outputs": [], + "source": [ + "# Define a function to retrieve lemmas from a doc object\n", + "def get_lemma(doc):\n", + " return [(token.lemma_) for token in doc]\n", + "\n", + "# Run the lemma retrieval function on the doc objects in the dataframe\n", + "final_paper_df['Lemmas'] = final_paper_df['Doc'].apply(get_lemma)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WbrgpAIjA_HF", + "scrolled": true + }, + "outputs": [], + "source": [ + "print(f'\"Write\" appears in the text tokens column ' + str(final_paper_df['Tokens'].apply(lambda x: x.count('write')).sum()) + ' times.')\n", + "print(f'\"Write\" appears in the lemmas column ' + str(final_paper_df['Lemmas'].apply(lambda x: x.count('write')).sum()) + ' times.')" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Text Annotation" + ], + "metadata": { + "id": "GvH1xTvZ3-MW" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FLeLfyvVA_HF" + }, + "source": [ + "#### Part of Speech Tagging" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "T24RkcuTA_HF" + }, + 
"outputs": [], + "source": [ + "# Define a function to retrieve lemmas from a doc object\n", + "def get_pos(doc):\n", + " #Return the coarse- and fine-grained part of speech text for each token in the doc\n", + " return [(token.pos_, token.tag_) for token in doc]\n", + "\n", + "# Define a function to retrieve parts of speech from a doc object\n", + "final_paper_df['POS'] = final_paper_df['Doc'].apply(get_pos)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_1fZXUynA_HF", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Create a list of part of speech tags\n", + "list(final_paper_df['POS'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2lhKj9xGA_HG" + }, + "outputs": [], + "source": [ + "spacy.explain(\"IN\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L-k8yoClA_HG" + }, + "outputs": [], + "source": [ + "# Define function to extract proper nouns from Doc object\n", + "def extract_proper_nouns(doc):\n", + " return [token.text for token in doc if token.pos_ == 'PROPN']\n", + "\n", + "# Apply function to Doc column and store resulting proper nouns in new column\n", + "final_paper_df['Proper_Nouns'] = final_paper_df['Doc'].apply(extract_proper_nouns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "P2r_x9neA_HG" + }, + "outputs": [], + "source": [ + "list(final_paper_df['Proper_Nouns'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e-8878WiA_HG" + }, + "source": [ + "#### Dependency Parsing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Kka-2u7eA_HG", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Extract the first sentence from the fifth Doc object\n", + "doc = final_paper_df['Doc'][5]\n", + "\n", + "# Create a list of sentence from the doc object\n", + "sentences = list(doc.sents)\n", + "\n", + "# Retrieve the first sentence\n", + "sentence = sentences[0]\n", + "\n", + "# Create dependency visualization for the first sentence of the 5th essay\n", + "displacy.render(sentence, style=\"dep\", jupyter=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TqwjCcjDA_HG" + }, + "outputs": [], + "source": [ + "#Define function to extract parts of speech of all non-stopwords\n", + "def extract_stopwords(doc):\n", + " return [token.text for token in doc if token.text not in nlp.Defaults.stop_words]\n", + "\n", + "#Create list of tokens without stopwords\n", + "final_paper_df['Tokens_NoStops'] = final_paper_df['Doc'].apply(extract_stopwords)\n", + "\n", + "#Turn list of stopwords into a string\n", + "final_paper_df['Text_NoStops'] = [' '.join(map(str, l)) for l in final_paper_df['Tokens_NoStops']]\n", + "\n", + "#Create new doc object from texts without stopwords\n", + "final_paper_df['Doc_NoStops'] = final_paper_df['Text_NoStops'].apply(process_text)\n", + "\n", + "# extract the first sentence from the first Doc object\n", + "doc = final_paper_df['Doc_NoStops'][5]\n", + "sentences = list(doc.sents)\n", + "sentence = sentences[0]\n", + "\n", + "# visualize the dependency parse tree for the sentence\n", + "displacy.render(sentence, style='dep', jupyter=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yhnSUEC3A_HH" + }, + "outputs": [], + "source": [ + "# Define function to extract noun phrases from Doc object\n", + "def extract_noun_phrases(doc):\n", + " return 
+    "def extract_noun_phrases(doc):\n",
+    "    return [chunk.text for chunk in doc.noun_chunks]\n",
+    "\n",
+    "# Apply function to Doc column and store resulting noun phrases in new column\n",
+    "final_paper_df['Noun_Phrases'] = final_paper_df['Doc'].apply(extract_noun_phrases)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "wod3AHAXA_HH",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "final_paper_df['Noun_Phrases'][0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "39utqaRyh_M1"
+   },
+   "source": [
+    "#### Named Entity Recognition\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "wRffhMlUA_HI",
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# Get all NE labels and assign to variable\n",
+    "labels = nlp.get_pipe(\"ner\").labels\n",
+    "\n",
+    "# Print each label and its description\n",
+    "for label in labels:\n",
+    "    print(label + ' : ' + spacy.explain(label))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "mow27zbdA_HI"
+   },
+   "outputs": [],
+   "source": [
+    "# Define function to extract named entity labels from doc objects\n",
+    "def extract_named_entities(doc):\n",
+    "    return [ent.label_ for ent in doc.ents]\n",
+    "\n",
+    "# Apply function to Doc column and store resulting named entity labels in new column\n",
+    "final_paper_df['Named_Entities'] = final_paper_df['Doc'].apply(extract_named_entities)\n",
+    "final_paper_df['Named_Entities']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "XQYCg6fkA_HJ"
+   },
+   "outputs": [],
+   "source": [
+    "# Define function to extract the text tagged with named entities from doc objects\n",
+    "def extract_named_entity_words(doc):\n",
+    "    return [ent for ent in doc.ents]\n",
+    "\n",
+    "# Apply function to Doc column and store resulting text in new column\n",
+    "final_paper_df['NE_Words'] = final_paper_df['Doc'].apply(extract_named_entity_words)\n",
+    "final_paper_df['NE_Words']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "7ovPPNn4A_HJ"
+   },
+   "outputs": [],
+   "source": [
+    "# Extract the Doc object at index 1\n",
+    "doc = final_paper_df['Doc'][1]\n",
+    "\n",
+    "# Visualize named entity tagging in a single paper\n",
+    "displacy.render(doc, style='ent', jupyter=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rKiNOMoDB2ta"
+   },
+   "source": [
+    "### Download Enriched Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "S7USU7ipB9YK"
+   },
+   "outputs": [],
+   "source": [
+    "# Save DataFrame as csv in the Colab session's working directory\n",
+    "final_paper_df.to_csv('MICUSP_papers_with_spaCy_tags.csv')\n",
+    "\n",
+    "# Download csv from the Colab session to your computer\n",
+    "files.download('MICUSP_papers_with_spaCy_tags.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "XUGnN4BX4C4c"
+   },
+   "source": [
+    "## Analysis of Linguistic Annotations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "9QTJrt6byTlf"
+   },
+   "source": [
+    "### Part of Speech Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "T5RAD_t_SJt2"
+   },
+   "outputs": [],
+   "source": [
+    "# Create doc object from single sentence\n",
+    "doc = nlp(\"This is 'an' example? sentence\")\n",
+    "\n",
+    "# Print counts of each part of speech in sentence\n",
+    "print(doc.count_by(spacy.attrs.POS))"
+   ]
+  },
sentence\")\n", + "\n", + "# Print counts of each part of speech in sentence\n", + "print(doc.count_by(spacy.attrs.POS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QGPdo6FtS4bq" + }, + "outputs": [], + "source": [ + "# Store dictinoary with indexes and POS counts in a variable\n", + "num_pos = doc.count_by(spacy.attrs.POS)\n", + "\n", + "dictionary = {}\n", + "\n", + "# Create a new dictionary which replaces the index of each part of speech for its label (NOUN, VERB, ADJECTIVE)\n", + "for k,v in sorted(num_pos.items()):\n", + " dictionary[doc.vocab[k].text] = v\n", + "\n", + "dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TnDn9MpTA_HK" + }, + "outputs": [], + "source": [ + "# Create list to store each dictionary\n", + "num_list = []\n", + "\n", + "# Define a function to get part of speech tags and counts and append them to a new dictionary\n", + "def get_pos_tags(doc):\n", + " dictionary = {}\n", + " num_pos = doc.count_by(spacy.attrs.POS)\n", + " for k,v in sorted(num_pos.items()):\n", + " dictionary[doc.vocab[k].text] = v\n", + " num_list.append(dictionary)\n", + "\n", + "# Apply function to each doc object in DataFrame\n", + "final_paper_df['C_POS'] = final_paper_df['Doc'].apply(get_pos_tags)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DQ0UCcqMA_HK" + }, + "outputs": [], + "source": [ + "# Create new dataframe with part of speech counts\n", + "pos_counts = pd.DataFrame(num_list)\n", + "columns = list(pos_counts.columns)\n", + "\n", + "# Add discipline of each paper as new column to dataframe\n", + "idx = 0\n", + "new_col = final_paper_df['DISCIPLINE']\n", + "pos_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)\n", + "\n", + "pos_counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rbK1VH5lZ7T6" + }, + "outputs": [], + "source": [ + "# Get average part of speech counts used in papers of each discipline\n", + "average_pos_df = pos_counts.groupby(['DISCIPLINE']).mean()\n", + "\n", + "# Round calculations to the nearest whole number\n", + "average_pos_df = average_pos_df.round(0)\n", + "\n", + "# Reset index to improve DataFrame readability\n", + "average_pos_df = average_pos_df.reset_index()\n", + "\n", + "# Show dataframe\n", + "average_pos_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F0NVs2tojM9A" + }, + "outputs": [], + "source": [ + "# Use plotly to plot proper noun use per genre\n", + "fig = px.bar(average_pos_df, x=\"DISCIPLINE\", y=[\"ADJ\", 'VERB', \"NUM\"], title=\"Average Part-of-Speech Use in Papers Written by Biology and English Students\", barmode='group')\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Fine-Grained Part of Speech Analysis" + ], + "metadata": { + "id": "ZzJrF2Pv4YQO" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KO4hSnXxZdqR", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Create list to store each dictionary\n", + "tag_num_list = []\n", + "\n", + "# Define a function to get part of speech tags and counts and append them to a new dictionary\n", + "def get_fine_pos_tags(doc):\n", + " dictionary = {}\n", + " num_tag = doc.count_by(spacy.attrs.TAG)\n", + " for k,v in sorted(num_tag.items()):\n", + " dictionary[doc.vocab[k].text] = v\n", + " tag_num_list.append(dictionary)\n", + "\n", + "# Apply function to each doc object in DataFrame\n", 
+ "final_paper_df['F_POS'] = final_paper_df['Doc'].apply(get_fine_pos_tags)\n", + "\n", + "# Create new dataframe with part of speech counts\n", + "tag_counts = pd.DataFrame(tag_num_list)\n", + "columns = list(tag_counts.columns)\n", + "\n", + "# Add discipline of each paper as new column to dataframe\n", + "idx = 0\n", + "new_col = final_paper_df['DISCIPLINE']\n", + "tag_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W7NknlyqA_HL" + }, + "outputs": [], + "source": [ + "# Get average fine-grain part of speech counts used in papers of each discipline\n", + "average_tag_df = tag_counts.groupby(['DISCIPLINE']).mean()\n", + "\n", + "# Round calculations to the nearest whole number\n", + "average_tag_df = average_tag_df.round(0)\n", + "\n", + "# Reset index to improve DataFrame readability\n", + "average_tag_df = average_tag_df.reset_index()\n", + "\n", + "# Show dataframe\n", + "average_tag_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IIt9zHCmA_HL" + }, + "outputs": [], + "source": [ + "# Use plotly to plot proper noun use per genre\n", + "fig = px.bar(average_tag_df, x=\"DISCIPLINE\", y=[\"VBD\", 'VBP', 'VBZ'], title=\"Average Verb Tense Usage Differences in Biology and English Student Writing\", barmode='group')\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xC6eZIpIyb8k" + }, + "source": [ + "### Named Entity Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "U_WJ6sFTifpA", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Create new DataFrame for analysis purposes\n", + "ner_analysis_df = final_paper_df[['Filename','PAPER TYPE', 'Named_Entities', 'NE_Words']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2MgWXgOL4K2E" + }, + "outputs": [], + "source": [ + "# Convert named entity lists to strings so we can count specific entities\n", + "ner_analysis_df['Named_Entities'] = ner_analysis_df['Named_Entities'].apply(lambda x: ' '.join(x))\n", + "\n", + "# Get the number of each type of entity in each paper\n", + "person_counts = ner_analysis_df['Named_Entities'].str.count('PERSON')\n", + "loc_counts = ner_analysis_df['Named_Entities'].str.count('LOC')\n", + "date_counts = ner_analysis_df['Named_Entities'].str.count('DATE')\n", + "woa_counts = ner_analysis_df['Named_Entities'].str.count('WORK_OF_ART')\n", + "\n", + "# Append named entity counts to new DataFrame\n", + "ner_counts_df = pd.DataFrame()\n", + "ner_counts_df['Genre'] = ner_analysis_df[\"PAPER TYPE\"]\n", + "ner_counts_df['PERSON_Counts'] = person_counts\n", + "ner_counts_df['LOC_Counts'] = loc_counts\n", + "ner_counts_df['DATE_Counts'] = date_counts\n", + "ner_counts_df['WORK_OF_ART_Counts'] = woa_counts\n", + "\n", + "ner_counts_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OXUlrCXhh_0f" + }, + "outputs": [], + "source": [ + "# Calculate average usage of each named entity type\n", + "average_ner_df = ner_counts_df.groupby(['Genre']).mean()\n", + "average_ner_df = average_ner_df.round(0)\n", + "average_ner_df = average_ner_df.reset_index()\n", + "average_ner_df\n", + "\n", + "# Use plotly to plot proper noun use per genre\n", + "fig = px.bar(average_ner_df, x=\"Genre\", y=[\"PERSON_Counts\", 'LOC_Counts', \"DATE_Counts\", 'WORK_OF_ART_Counts'], title=\"Average Named Entity Usage Across Student Paper 
Genres\", barmode='group')\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Analysis of ```DATE``` Named Entities" + ], + "metadata": { + "id": "n8nl8w084fo4" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6RvvclvkA_HM", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Define function to extract words tagged as \"date\" named entities from doc objects\n", + "def extract_date_named_entities(doc):\n", + " return [ent for ent in doc.ents if ent.label_ == 'DATE']\n", + "\n", + "# Get all date entity words and apply to new column of DataFrame\n", + "ner_analysis_df['Date_Named_Entities'] = final_paper_df['Doc'].apply(extract_date_named_entities)\n", + "\n", + "\n", + "# Make list of date entities a string so we can count their frequencies\n", + "ner_analysis_df['Date_Named_Entities'] = [', '.join(map(str, l)) for l in ner_analysis_df['Date_Named_Entities']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ELHvnBtNvLSH", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Search for only date words in proposal papers\n", + "date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Proposal').any(axis=1)]\n", + "\n", + "# Count the frequency of each word in these essays and append to list\n", + "date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()\n", + "\n", + "# Get top 10 most common words and their frequencies\n", + "date_word_frequencies[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2iKKPP-swqe2" + }, + "outputs": [], + "source": [ + "# Search for only date words in critique/evaluation papers\n", + "date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Critique/Evaluation').any(axis=1)]\n", + "\n", + "# Count the frequency of each word in these essays and append to list\n", + "date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()\n", + "\n", + "# Get top 10 most common words and their frequencies\n", + "date_word_frequencies[:10]" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file