From f37acf35aaaf8ee957ad5ec5cf46178afae02e03 Mon Sep 17 00:00:00 2001
From: Anisa Hawes <87070441+anisa-hawes@users.noreply.github.com>
Date: Thu, 19 Oct 2023 10:51:38 +0100
Subject: [PATCH] Upload corpus-analysis-with-spacy.ipynb

Upload Python notebook into /assets directory
---
 .../corpus-analysis-with-spacy.ipynb | 1225 +++++++++++++++++
 1 file changed, 1225 insertions(+)
 create mode 100644 assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb

diff --git a/assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb b/assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb
new file mode 100644
index 000000000..5b8574860
--- /dev/null
+++ b/assets/corpus-analysis-with-spacy/corpus-analysis-with-spacy.ipynb
@@ -0,0 +1,1225 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "view-in-github",
+    "colab_type": "text"
+   },
+   "source": [
+    "\"Open In Colab\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "wVeD4Ik7D43F"
+   },
+   "source": [
+    "## Imports, Uploads, and Preprocessing\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Import Packages"
+   ],
+   "metadata": {
+    "id": "n_BVwGof2vi9"
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Tyh52tgEA_HA"
+   },
+   "outputs": [],
+   "source": [
+    "# Import spacy\n",
+    "import spacy\n",
+    "\n",
+    "# Load spaCy visualizer\n",
+    "from spacy import displacy\n",
+    "\n",
+    "# Import pandas DataFrame packages\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Import graphing package\n",
+    "import plotly.graph_objects as go\n",
+    "import plotly.express as px"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "IbNN7-PIBcW_"
+   },
+   "source": [
+    "### Upload Text Files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "-r9zTBIAlLsI"
+   },
+   "outputs": [],
+   "source": [
+    "# Import the files module to facilitate file uploads\n",
+    "from google.colab import files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "XaVUPnFIE_kS"
+   },
+   "outputs": [],
+   "source": [
+    "# Select multiple text files to upload from local folder\n",
+    "uploaded_files = files.upload()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "N3f8cxLrgzUq"
+   },
+   "outputs": [],
+   "source": [
+    "type(uploaded_files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "s2w09XuhKqOq"
+   },
+   "outputs": [],
+   "source": [
+    "# Add files into DataFrame\n",
+    "paper_df = pd.DataFrame.from_dict(uploaded_files, orient='index')\n",
+    "paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "BJJPgl5FL9qY"
+   },
+   "outputs": [],
+   "source": [
+    "# Reset index and add column names to make wrangling easier\n",
+    "paper_df = paper_df.reset_index()\n",
+    "paper_df.columns = [\"Filename\", \"Text\"]\n",
+    "paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Pre-process Text Files"
+   ],
+   "metadata": {
+    "id": "uXI3nyVQ2-sf"
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "2QTKf4DV00Aa"
+   },
+   "outputs": [],
+   "source": [
+    "# Convert papers from bytes to strings\n",
+    "paper_df['Text'] = paper_df['Text'].str.decode('utf-8')\n",
+    "paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "k633V0JbBInq"
+   },
+   "outputs": [],
+   "source": [
+    "# Remove extra spaces from papers\n",
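+    "# (\\s+ matches any run of whitespace; str.strip() then trims leading/trailing spaces)\n",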
+    "paper_df['Text'] = paper_df['Text'].str.replace('\\s+', ' ', regex=True).str.strip()\n",
+    "paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "### Upload and Merge Metadata Files"
+   ],
+   "metadata": {
+    "id": "oBCoNFow2W4U"
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ZCASvLyJcq7C"
+   },
+   "outputs": [],
+   "source": [
+    "# Upload csv with essay metadata\n",
+    "metadata = files.upload()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "gby6n4lzcq-c"
+   },
+   "outputs": [],
+   "source": [
+    "metadata_df = pd.read_csv('metadata.csv')\n",
+    "metadata_df = metadata_df.dropna(axis=1, how='all')\n",
+    "metadata_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "T5s_ZEt-BcXP"
+   },
+   "outputs": [],
+   "source": [
+    "# Remove .txt from title of each paper\n",
+    "paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=False)\n",
+    "\n",
+    "# Rename the PAPER ID column to Filename so the two DataFrames can be merged\n",
+    "metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "s_Ta2d77BcXP"
+   },
+   "outputs": [],
+   "source": [
+    "# Merge metadata and papers into new DataFrame\n",
+    "# Will only keep rows where both essay and metadata are present\n",
+    "final_paper_df = metadata_df.merge(paper_df, on='Filename')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "zOMIXCyOMOKl"
+   },
+   "outputs": [],
+   "source": [
+    "# Print DataFrame\n",
+    "final_paper_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "zQ8ve667EvxG"
+   },
+   "source": [
+    "### Alternate Code: Installs, Imports and Preprocessing in a Jupyter Notebook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "# # Install and import spacy\n",
+    "# !pip install spacy\n",
+    "\n",
+    "# # Import spacy\n",
+    "# import spacy\n",
+    "\n",
+    "# # Install English language model\n",
+    "# !spacy download en_core_web_sm\n",
+    "\n",
+    "# # Import os to upload documents and metadata\n",
+    "# import os\n",
+    "\n",
+    "# # Load spaCy visualizer\n",
+    "# from spacy import displacy\n",
+    "\n",
+    "# # Import pandas DataFrame packages\n",
+    "# import pandas as pd\n",
+    "\n",
+    "# # Import graphing package\n",
+    "# import plotly.graph_objects as go\n",
+    "# import plotly.express as px"
+   ],
+   "metadata": {
+    "id": "KlvUa2oX1645"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ub8MflGfA_HB"
+   },
+   "outputs": [],
+   "source": [
+    "# # Create empty lists for file names and contents\n",
+    "# texts = []\n",
+    "# file_names = []\n",
+    "# # Iterate through each file in the path\n",
+    "# for _file_name in os.listdir('path_to_directory'):\n",
+    "#     # Look for only text files\n",
+    "#     if _file_name.endswith('.txt'):\n",
+    "#         # Append contents of each text file to text list\n",
+    "#         texts.append(open('path_to_directory' + '/' + _file_name, 'r').read())\n",
+    "#         # Append name of each file to file name list\n",
+    "#         file_names.append(_file_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Z7BmHGFBA_HB",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# # Create dictionary object associating each file name with its text\n",
+    "# d = {'Filename': file_names, 'Text': texts}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "yC5-sbOPA_HB"
+   },
+   "outputs": [],
"source": [ + "# # Turn dictionary into a dataframe\n", + "# paper_df = pd.DataFrame(d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cK3PvJkcA_HC" + }, + "outputs": [], + "source": [ + "# paper_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kax8Ecu7A_HC" + }, + "outputs": [], + "source": [ + "# # Remove extra spaces from papers\n", + "# paper_df['Text'] = paper_df['Text'].str.replace('\\s+', ' ', regex=True).str.strip()\n", + "# paper_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zGbZVIrFA_HC" + }, + "outputs": [], + "source": [ + "# metadata_df = pd.read_csv('path_to_directory/metadata.csv')\n", + "# metadata_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RO4lwuwJcrID" + }, + "outputs": [], + "source": [ + "# # Remove .txt from title of each paper\n", + "# paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=True)\n", + "\n", + "# # Rename column from paper ID to Title\n", + "# metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2eCYbDExeuqM", + "scrolled": true + }, + "outputs": [], + "source": [ + "# # Merge metadata and papers into new DataFrame\n", + "# # Will only keep rows where both essay and metadata are present\n", + "# final_paper_df = metadata_df.merge(paper_df,on='Filename')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C-8mOPIZA_HD" + }, + "outputs": [], + "source": [ + "# # Print DataFrame\n", + "# final_paper_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xdlHoqlBA_HD" + }, + "source": [ + "## Text Enrichment with spaCy" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Creating Doc Objects" + ], + "metadata": { + "id": "ejyC6xvA3w9q" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2P1TUR3oA_HD" + }, + "outputs": [], + "source": [ + "# Load nlp pipeline\n", + "nlp = spacy.load('en_core_web_sm')\n", + "\n", + "# Check what functions it performs\n", + "print(nlp.pipe_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ceknDmJZA_HE" + }, + "outputs": [], + "source": [ + "#Define example sentence\n", + "sentence = \"This is 'an' example? 
sentence\"\n", + "\n", + "# Call the nlp model on the sentence\n", + "doc = nlp(sentence)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KWX1hxoBA_HE" + }, + "outputs": [], + "source": [ + "# Loop through each token in doc object\n", + "for token in doc:\n", + " # Print text and part of speech for each\n", + " print(token.text, token.pos_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5pE_8YJ_A_HE" + }, + "outputs": [], + "source": [ + "# Define a function that runs the nlp pipeline on any given input text\n", + "def process_text(text):\n", + " return nlp(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "N3O8qhn1A_HE", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Apply the function to the \"Text\" column, so that the nlp pipeline is called on each student essay\n", + "final_paper_df['Doc'] = final_paper_df['Text'].apply(process_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BSNTL5msA_HE" + }, + "source": [ + "### Text Reduction" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#### Tokenization" + ], + "metadata": { + "id": "5tgNt7NO35I0" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SBEb3CErA_HE" + }, + "outputs": [], + "source": [ + "# Define a function to retrieve tokens from a doc object\n", + "def get_token(doc):\n", + " # Loop through each token in the doc object\n", + " for token in doc:\n", + " # Retrieve the text of each token\n", + " return token.text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GOQY6IYAA_HE" + }, + "outputs": [], + "source": [ + "# Define a function to retrieve tokens from a doc object\n", + "def get_token(doc):\n", + " return [(token.text) for token in doc]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7-qpJ-peA_HF" + }, + "outputs": [], + "source": [ + "# Run the token retrieval function on the doc objects in the dataframe\n", + "final_paper_df['Tokens'] = final_paper_df['Doc'].apply(get_token)\n", + "final_paper_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YLzPDKXqA_HF" + }, + "source": [ + "#### Lemmatization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "50hlrH_KA_HF" + }, + "outputs": [], + "source": [ + "# Define a function to retrieve lemmas from a doc object\n", + "def get_lemma(doc):\n", + " return [(token.lemma_) for token in doc]\n", + "\n", + "# Run the lemma retrieval function on the doc objects in the dataframe\n", + "final_paper_df['Lemmas'] = final_paper_df['Doc'].apply(get_lemma)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WbrgpAIjA_HF", + "scrolled": true + }, + "outputs": [], + "source": [ + "print(f'\"Write\" appears in the text tokens column ' + str(final_paper_df['Tokens'].apply(lambda x: x.count('write')).sum()) + ' times.')\n", + "print(f'\"Write\" appears in the lemmas column ' + str(final_paper_df['Lemmas'].apply(lambda x: x.count('write')).sum()) + ' times.')" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Text Annotation" + ], + "metadata": { + "id": "GvH1xTvZ3-MW" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FLeLfyvVA_HF" + }, + "source": [ + "#### Part of Speech Tagging" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "T24RkcuTA_HF" + }, + 
"outputs": [], + "source": [ + "# Define a function to retrieve lemmas from a doc object\n", + "def get_pos(doc):\n", + " #Return the coarse- and fine-grained part of speech text for each token in the doc\n", + " return [(token.pos_, token.tag_) for token in doc]\n", + "\n", + "# Define a function to retrieve parts of speech from a doc object\n", + "final_paper_df['POS'] = final_paper_df['Doc'].apply(get_pos)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_1fZXUynA_HF", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Create a list of part of speech tags\n", + "list(final_paper_df['POS'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2lhKj9xGA_HG" + }, + "outputs": [], + "source": [ + "spacy.explain(\"IN\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L-k8yoClA_HG" + }, + "outputs": [], + "source": [ + "# Define function to extract proper nouns from Doc object\n", + "def extract_proper_nouns(doc):\n", + " return [token.text for token in doc if token.pos_ == 'PROPN']\n", + "\n", + "# Apply function to Doc column and store resulting proper nouns in new column\n", + "final_paper_df['Proper_Nouns'] = final_paper_df['Doc'].apply(extract_proper_nouns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "P2r_x9neA_HG" + }, + "outputs": [], + "source": [ + "list(final_paper_df['Proper_Nouns'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e-8878WiA_HG" + }, + "source": [ + "#### Dependency Parsing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Kka-2u7eA_HG", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Extract the first sentence from the fifth Doc object\n", + "doc = final_paper_df['Doc'][5]\n", + "\n", + "# Create a list of sentence from the doc object\n", + "sentences = list(doc.sents)\n", + "\n", + "# Retrieve the first sentence\n", + "sentence = sentences[0]\n", + "\n", + "# Create dependency visualization for the first sentence of the 5th essay\n", + "displacy.render(sentence, style=\"dep\", jupyter=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TqwjCcjDA_HG" + }, + "outputs": [], + "source": [ + "#Define function to extract parts of speech of all non-stopwords\n", + "def extract_stopwords(doc):\n", + " return [token.text for token in doc if token.text not in nlp.Defaults.stop_words]\n", + "\n", + "#Create list of tokens without stopwords\n", + "final_paper_df['Tokens_NoStops'] = final_paper_df['Doc'].apply(extract_stopwords)\n", + "\n", + "#Turn list of stopwords into a string\n", + "final_paper_df['Text_NoStops'] = [' '.join(map(str, l)) for l in final_paper_df['Tokens_NoStops']]\n", + "\n", + "#Create new doc object from texts without stopwords\n", + "final_paper_df['Doc_NoStops'] = final_paper_df['Text_NoStops'].apply(process_text)\n", + "\n", + "# extract the first sentence from the first Doc object\n", + "doc = final_paper_df['Doc_NoStops'][5]\n", + "sentences = list(doc.sents)\n", + "sentence = sentences[0]\n", + "\n", + "# visualize the dependency parse tree for the sentence\n", + "displacy.render(sentence, style='dep', jupyter=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yhnSUEC3A_HH" + }, + "outputs": [], + "source": [ + "# Define function to extract noun phrases from Doc object\n", + "def extract_noun_phrases(doc):\n", + " return 
+    "def extract_noun_phrases(doc):\n",
+    "    return [chunk.text for chunk in doc.noun_chunks]\n",
+    "\n",
+    "# Apply function to Doc column and store resulting noun phrases in new column\n",
+    "final_paper_df['Noun_Phrases'] = final_paper_df['Doc'].apply(extract_noun_phrases)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "wod3AHAXA_HH",
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "final_paper_df['Noun_Phrases'][0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "39utqaRyh_M1"
+   },
+   "source": [
+    "#### Named Entity Recognition\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "wRffhMlUA_HI",
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# Get all NE labels and assign to variable\n",
+    "labels = nlp.get_pipe(\"ner\").labels\n",
+    "\n",
+    "# Print each label and its description\n",
+    "for label in labels:\n",
+    "    print(label + ' : ' + spacy.explain(label))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "mow27zbdA_HI"
+   },
+   "outputs": [],
+   "source": [
+    "# Define function to extract named entity labels from doc objects\n",
+    "def extract_named_entities(doc):\n",
+    "    return [ent.label_ for ent in doc.ents]\n",
+    "\n",
+    "# Apply function to Doc column and store resulting named entity labels in new column\n",
+    "final_paper_df['Named_Entities'] = final_paper_df['Doc'].apply(extract_named_entities)\n",
+    "final_paper_df['Named_Entities']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "XQYCg6fkA_HJ"
+   },
+   "outputs": [],
+   "source": [
+    "# Define function to extract the text tagged with named entities from doc objects\n",
+    "def extract_named_entity_words(doc):\n",
+    "    return [ent for ent in doc.ents]\n",
+    "\n",
+    "# Apply function to Doc column and store resulting text in new column\n",
+    "final_paper_df['NE_Words'] = final_paper_df['Doc'].apply(extract_named_entity_words)\n",
+    "final_paper_df['NE_Words']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "7ovPPNn4A_HJ"
+   },
+   "outputs": [],
+   "source": [
+    "# Extract the Doc object at index 1\n",
+    "doc = final_paper_df['Doc'][1]\n",
+    "\n",
+    "# Visualize named entity tagging in a single paper\n",
+    "displacy.render(doc, style='ent', jupyter=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rKiNOMoDB2ta"
+   },
+   "source": [
+    "### Download Enriched Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "S7USU7ipB9YK"
+   },
+   "outputs": [],
+   "source": [
+    "# Save DataFrame as csv in the Colab session's working directory\n",
+    "final_paper_df.to_csv('MICUSP_papers_with_spaCy_tags.csv')\n",
+    "\n",
+    "# Download csv from the Colab session to your computer\n",
+    "files.download('MICUSP_papers_with_spaCy_tags.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "XUGnN4BX4C4c"
+   },
+   "source": [
+    "## Analysis of Linguistic Annotations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "9QTJrt6byTlf"
+   },
+   "source": [
+    "### Part of Speech Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "T5RAD_t_SJt2"
+   },
+   "outputs": [],
+   "source": [
+    "# Create doc object from single sentence\n",
+    "doc = nlp(\"This is 'an' example? sentence\")\n",
+    "\n",
+    "# Print counts of each part of speech in sentence\n",
+    "print(doc.count_by(spacy.attrs.POS))"
+   ]
+  },
sentence\")\n", + "\n", + "# Print counts of each part of speech in sentence\n", + "print(doc.count_by(spacy.attrs.POS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QGPdo6FtS4bq" + }, + "outputs": [], + "source": [ + "# Store dictinoary with indexes and POS counts in a variable\n", + "num_pos = doc.count_by(spacy.attrs.POS)\n", + "\n", + "dictionary = {}\n", + "\n", + "# Create a new dictionary which replaces the index of each part of speech for its label (NOUN, VERB, ADJECTIVE)\n", + "for k,v in sorted(num_pos.items()):\n", + " dictionary[doc.vocab[k].text] = v\n", + "\n", + "dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TnDn9MpTA_HK" + }, + "outputs": [], + "source": [ + "# Create list to store each dictionary\n", + "num_list = []\n", + "\n", + "# Define a function to get part of speech tags and counts and append them to a new dictionary\n", + "def get_pos_tags(doc):\n", + " dictionary = {}\n", + " num_pos = doc.count_by(spacy.attrs.POS)\n", + " for k,v in sorted(num_pos.items()):\n", + " dictionary[doc.vocab[k].text] = v\n", + " num_list.append(dictionary)\n", + "\n", + "# Apply function to each doc object in DataFrame\n", + "final_paper_df['C_POS'] = final_paper_df['Doc'].apply(get_pos_tags)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DQ0UCcqMA_HK" + }, + "outputs": [], + "source": [ + "# Create new dataframe with part of speech counts\n", + "pos_counts = pd.DataFrame(num_list)\n", + "columns = list(pos_counts.columns)\n", + "\n", + "# Add discipline of each paper as new column to dataframe\n", + "idx = 0\n", + "new_col = final_paper_df['DISCIPLINE']\n", + "pos_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)\n", + "\n", + "pos_counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rbK1VH5lZ7T6" + }, + "outputs": [], + "source": [ + "# Get average part of speech counts used in papers of each discipline\n", + "average_pos_df = pos_counts.groupby(['DISCIPLINE']).mean()\n", + "\n", + "# Round calculations to the nearest whole number\n", + "average_pos_df = average_pos_df.round(0)\n", + "\n", + "# Reset index to improve DataFrame readability\n", + "average_pos_df = average_pos_df.reset_index()\n", + "\n", + "# Show dataframe\n", + "average_pos_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F0NVs2tojM9A" + }, + "outputs": [], + "source": [ + "# Use plotly to plot proper noun use per genre\n", + "fig = px.bar(average_pos_df, x=\"DISCIPLINE\", y=[\"ADJ\", 'VERB', \"NUM\"], title=\"Average Part-of-Speech Use in Papers Written by Biology and English Students\", barmode='group')\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Fine-Grained Part of Speech Analysis" + ], + "metadata": { + "id": "ZzJrF2Pv4YQO" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KO4hSnXxZdqR", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Create list to store each dictionary\n", + "tag_num_list = []\n", + "\n", + "# Define a function to get part of speech tags and counts and append them to a new dictionary\n", + "def get_fine_pos_tags(doc):\n", + " dictionary = {}\n", + " num_tag = doc.count_by(spacy.attrs.TAG)\n", + " for k,v in sorted(num_tag.items()):\n", + " dictionary[doc.vocab[k].text] = v\n", + " tag_num_list.append(dictionary)\n", + "\n", + "# Apply function to each doc object in DataFrame\n", 
+ "final_paper_df['F_POS'] = final_paper_df['Doc'].apply(get_fine_pos_tags)\n", + "\n", + "# Create new dataframe with part of speech counts\n", + "tag_counts = pd.DataFrame(tag_num_list)\n", + "columns = list(tag_counts.columns)\n", + "\n", + "# Add discipline of each paper as new column to dataframe\n", + "idx = 0\n", + "new_col = final_paper_df['DISCIPLINE']\n", + "tag_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W7NknlyqA_HL" + }, + "outputs": [], + "source": [ + "# Get average fine-grain part of speech counts used in papers of each discipline\n", + "average_tag_df = tag_counts.groupby(['DISCIPLINE']).mean()\n", + "\n", + "# Round calculations to the nearest whole number\n", + "average_tag_df = average_tag_df.round(0)\n", + "\n", + "# Reset index to improve DataFrame readability\n", + "average_tag_df = average_tag_df.reset_index()\n", + "\n", + "# Show dataframe\n", + "average_tag_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IIt9zHCmA_HL" + }, + "outputs": [], + "source": [ + "# Use plotly to plot proper noun use per genre\n", + "fig = px.bar(average_tag_df, x=\"DISCIPLINE\", y=[\"VBD\", 'VBP', 'VBZ'], title=\"Average Verb Tense Usage Differences in Biology and English Student Writing\", barmode='group')\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xC6eZIpIyb8k" + }, + "source": [ + "### Named Entity Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "U_WJ6sFTifpA", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Create new DataFrame for analysis purposes\n", + "ner_analysis_df = final_paper_df[['Filename','PAPER TYPE', 'Named_Entities', 'NE_Words']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2MgWXgOL4K2E" + }, + "outputs": [], + "source": [ + "# Convert named entity lists to strings so we can count specific entities\n", + "ner_analysis_df['Named_Entities'] = ner_analysis_df['Named_Entities'].apply(lambda x: ' '.join(x))\n", + "\n", + "# Get the number of each type of entity in each paper\n", + "person_counts = ner_analysis_df['Named_Entities'].str.count('PERSON')\n", + "loc_counts = ner_analysis_df['Named_Entities'].str.count('LOC')\n", + "date_counts = ner_analysis_df['Named_Entities'].str.count('DATE')\n", + "woa_counts = ner_analysis_df['Named_Entities'].str.count('WORK_OF_ART')\n", + "\n", + "# Append named entity counts to new DataFrame\n", + "ner_counts_df = pd.DataFrame()\n", + "ner_counts_df['Genre'] = ner_analysis_df[\"PAPER TYPE\"]\n", + "ner_counts_df['PERSON_Counts'] = person_counts\n", + "ner_counts_df['LOC_Counts'] = loc_counts\n", + "ner_counts_df['DATE_Counts'] = date_counts\n", + "ner_counts_df['WORK_OF_ART_Counts'] = woa_counts\n", + "\n", + "ner_counts_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OXUlrCXhh_0f" + }, + "outputs": [], + "source": [ + "# Calculate average usage of each named entity type\n", + "average_ner_df = ner_counts_df.groupby(['Genre']).mean()\n", + "average_ner_df = average_ner_df.round(0)\n", + "average_ner_df = average_ner_df.reset_index()\n", + "average_ner_df\n", + "\n", + "# Use plotly to plot proper noun use per genre\n", + "fig = px.bar(average_ner_df, x=\"Genre\", y=[\"PERSON_Counts\", 'LOC_Counts', \"DATE_Counts\", 'WORK_OF_ART_Counts'], title=\"Average Named Entity Usage Across Student Paper 
Genres\", barmode='group')\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Analysis of ```DATE``` Named Entities" + ], + "metadata": { + "id": "n8nl8w084fo4" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6RvvclvkA_HM", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Define function to extract words tagged as \"date\" named entities from doc objects\n", + "def extract_date_named_entities(doc):\n", + " return [ent for ent in doc.ents if ent.label_ == 'DATE']\n", + "\n", + "# Get all date entity words and apply to new column of DataFrame\n", + "ner_analysis_df['Date_Named_Entities'] = final_paper_df['Doc'].apply(extract_date_named_entities)\n", + "\n", + "\n", + "# Make list of date entities a string so we can count their frequencies\n", + "ner_analysis_df['Date_Named_Entities'] = [', '.join(map(str, l)) for l in ner_analysis_df['Date_Named_Entities']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ELHvnBtNvLSH", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Search for only date words in proposal papers\n", + "date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Proposal').any(axis=1)]\n", + "\n", + "# Count the frequency of each word in these essays and append to list\n", + "date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()\n", + "\n", + "# Get top 10 most common words and their frequencies\n", + "date_word_frequencies[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2iKKPP-swqe2" + }, + "outputs": [], + "source": [ + "# Search for only date words in critique/evaluation papers\n", + "date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Critique/Evaluation').any(axis=1)]\n", + "\n", + "# Count the frequency of each word in these essays and append to list\n", + "date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()\n", + "\n", + "# Get top 10 most common words and their frequencies\n", + "date_word_frequencies[:10]" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file