Skip to content

Commit

Permalink
Keyword search in text from reference dictionary (#46)
Browse files Browse the repository at this point in the history
* init file and pull sample data

* add helper functions and reference dict

* clean up text, tokenize, add new labels from regex function

* move example into its own file, separate out functions to use

* clean functions and make generic names

* clean up example

* move reference dict to example_data

* remove pdb

* move dict out of functions file

* missed import

* add metrics to measure accuracy

* rename other category to unknown for fair comparison

* move keyword search to preprocess step, util

* move add label function to classify

* fix imports, add comment for viz

* remove temp file

* change arg to dict

* add keyword util

* update formatting

* whitespace

* update args to match

* remove whitespace

* function rename

* clean up dict

* add sample data

* update docstring and change col dict name

* clean up and use sample data in examples

* add plot function

* add in visualization function for confusion matrix

* add more detail to docstring

* clean up function to expect predicted_col

* clean up example

* remove whitespace

* change col name from new to predicted

* clean up sample data

* add notebook example

* add functionality to replace text in preprocessing

* add text replacement step

* linter

* remove example .py file in favor of jupyter notebook

* remove reference dict

* add new csv mapping rfiles

* add csv

* update jupyternotebook example

* use df instead of dict

* remove dict

* remove unused imports and functions

* update jupyter notebook and remove module

* remove plot

* update docstrings

* linter

* remove whitespace
  • Loading branch information
charity-kwha authored Jul 7, 2023
1 parent 2526d49 commit a4de9a4
Show file tree
Hide file tree
Showing 7 changed files with 420 additions and 0 deletions.
25 changes: 25 additions & 0 deletions examples/example_data/mappings_equipment.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
in,out_
combiner,combiner
comb,combiner
cb,combiner
battery,battery
bess,battery
inverter,inverter
invert,inverter
inv,inverter
met,met
meter,meter
module,module
mod,module
recloser,recloser
reclose,recloser
relay,relay
substation,substation
switchgear,switchgear
switch,switchgear
tracker,tracker
transformer,transformer
xfmr,transformer
wiring,wiring
wire,wiring
wires,wiring
20 changes: 20 additions & 0 deletions examples/example_data/mappings_pv_terms.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
in,out_
comm,communication
energy,energy
kwh,energy
mwh,energy
grid,grid
curtailment,grid
curtail,grid
poi,grid
offline,outage
solar,solar
pv,solar
photovoltaic,solar
system,system
site,system
farm,system
project,system
sma,make_model
cm,corrective_maintenance
pm,preventative_maintenance
187 changes: 187 additions & 0 deletions examples/tutorial_text_classify_regex_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Adding keyword labels to O&M data\n",
"This notebook demonstrates the use of the `pvops.classify.get_attributes_from_keywords` module for adding asset labels based on O&M notes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"from pvops.text import utils, preprocess\n",
"from pvops.text.classify import get_attributes_from_keywords\n",
"from pvops.text.visualize import visualize_classification_confusion_matrix"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 0: Get sample data, remap assets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pull in sample data and remap assets for ease of comparison\n",
"\n",
"om_df = pd.read_csv('example_data/example_ML_ticket_data.csv')\n",
"col_dict = {\n",
" \"data\" : \"CompletionDesc\",\n",
" \"eventstart\" : \"Date_EventStart\",\n",
" \"save_data_column\" : \"processed_data\",\n",
" \"save_date_column\" : \"processed_date\",\n",
" \"attribute_col\" : \"Asset\",\n",
" \"predicted_col\" : \"Keyword_Asset\",\n",
" \"remapping_col_from\": \"in\",\n",
" \"remapping_col_to\": \"out_\"\n",
"}\n",
"\n",
"# remap assets\n",
"remapping_df = pd.read_csv('example_data/remappings_asset.csv')\n",
"remapping_df['out_'] = remapping_df['out_'].replace({'met station': 'met',\n",
" 'energy storage': 'battery',\n",
" 'energy meter': 'meter'})\n",
"om_df = utils.remap_attributes(om_df, remapping_df, col_dict, allow_missing_mappings=True)\n",
"om_df.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 1: Text preprocessing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# preprocessing steps\n",
"om_df[col_dict['attribute_col']] = om_df.apply(lambda row: row[col_dict['attribute_col']].lower(), axis=1)\n",
"om_df = preprocess.preprocessor(om_df, lst_stopwords=[], col_dict=col_dict, print_info=False, extract_dates_only=False)\n",
"\n",
"DATA_COL = col_dict['data']\n",
"om_df[DATA_COL] = om_df['processed_data']\n",
"\n",
"# replace terms\n",
"equipment_df = pd.read_csv('~/pvOps/examples/example_data/mappings_equipment.csv')\n",
"pv_terms_df = pd.read_csv('~/pvOps/examples/example_data/mappings_pv_terms.csv')\n",
"pv_reference_df = pd.concat([equipment_df, pv_terms_df])\n",
"om_df = utils.remap_words_in_text(om_df=om_df, remapping_df=pv_reference_df, remapping_col_dict=col_dict)\n",
"\n",
"om_df.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 2: Search for keywords to use as labels"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# add asset labels from keyword reference dict\n",
"om_df = get_attributes_from_keywords(om_df=om_df,\n",
"                                     col_dict=col_dict,\n",
"                                     reference_df=equipment_df,\n",
"                                     reference_col_dict={'reference_col_from': 'in',\n",
"                                                         'reference_col_to': 'out_'})\n",
"om_df.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 3: Metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get accuracy measures and count metrics\n",
"PREDICT_COL = col_dict['predicted_col']\n",
"LABEL_COL = col_dict['attribute_col']\n",
"\n",
"# entries with some keyword over interest, over all entries\n",
"label_count = om_df[PREDICT_COL].count() / len(om_df)\n",
"\n",
"# replace 'Other' values with 'Unknown'\n",
"om_df[LABEL_COL] = om_df[LABEL_COL].replace('other', 'unknown')\n",
"# replace NaN values to use accuracy score\n",
"om_df[[LABEL_COL, PREDICT_COL]] = om_df[[LABEL_COL, PREDICT_COL]].fillna('unknown')\n",
"acc_score = accuracy_score(y_true=om_df[LABEL_COL], y_pred=om_df[PREDICT_COL])\n",
"\n",
"msg = f'{label_count:.2%} of entries had a keyword of interest, with {acc_score:.2%} accuracy.'\n",
"print(msg)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 4: Visualization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plot confusion matrix\n",
"title = 'Confusion Matrix of Actual and Predicted Asset Labels'\n",
"visualize_classification_confusion_matrix(om_df, col_dict, title)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
51 changes: 51 additions & 0 deletions pvops/text/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas as pd
import copy

from pvops.text.preprocess import get_keywords_of_interest

def classification_deployer(
X,
Expand Down Expand Up @@ -187,3 +188,53 @@ def classification_deployer(
best_gs_instance = gs_clf

return pd.concat(rows, axis=1).T, best_gs_instance.best_estimator_

def get_attributes_from_keywords(om_df, col_dict, reference_df, reference_col_dict):
    """Find keywords of interest in specified column of dataframe, return as new column value.

    If keywords of interest given in a reference dataframe are in the specified
    column of the dataframe, return the keyword category, or categories.
    For example, if the string 'inverter' is in the list of text, return ['inverter'].

    Parameters
    ----------
    om_df : pd.DataFrame
        Dataframe to search for keywords of interest, must include the column
        named by ``col_dict['data']``.
    col_dict : dict of {str : str}
        A dictionary that contains the column names needed:

        - data : string, should be assigned to associated column which stores
          the tokenized text logs
        - predicted_col : string, will be used to create the keyword search
          label column
    reference_df : DataFrame
        Holds columns that define the reference dictionary to search for keywords
        of interest.
        Note: This function can currently only handle single words, no n-gram
        functionality.
    reference_col_dict : dict of {str : str}
        A dictionary that contains the column names that describes how
        referencing is going to be done:

        - reference_col_from : string, should be assigned to associated column
          name in reference_df that are possible input reference values
          Example: pd.Series(['inverter', 'invert', 'inv'])
        - reference_col_to : string, should be assigned to associated column
          name in reference_df that are the output reference values of interest
          Example: pd.Series(['inverter', 'inverter', 'inverter'])

    Returns
    -------
    om_df : pd.DataFrame
        Input df with the ``col_dict['predicted_col']`` column added, where each
        found keyword is its own row; may result in duplicate rows if more than
        one keyword of interest was found in the text column.
    """
    DATA_COL = col_dict['data']
    PREDICT_COL = col_dict['predicted_col']

    # one list of matched keyword categories per row of tokenized text
    om_df[PREDICT_COL] = om_df[DATA_COL].apply(get_keywords_of_interest,
                                               reference_df=reference_df,
                                               reference_col_dict=reference_col_dict)

    # each multi-category now in its own row, some logs have multiple equipment issues
    multiple_keywords_df = om_df[om_df[PREDICT_COL].str.len() > 1]
    om_df = om_df.explode(PREDICT_COL)

    msg = f'{len(multiple_keywords_df)} entries had multiple keywords of interest. Reference: {multiple_keywords_df.index} in original dataframe.'
    print(msg)

    return om_df
44 changes: 44 additions & 0 deletions pvops/text/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,3 +449,47 @@ def text_remove_numbers_stopwords(document, lst_stopwords):
document = " ".join(document)

return document


def get_keywords_of_interest(document_tok, reference_df, reference_col_dict):
    """Find keywords of interest in a tokenized document using a reference dataframe.

    If keywords of interest given in ``reference_df`` are in the list of
    strings, return the keyword category, or categories. For example,
    if the string 'inverter' is in the list of text, return ['inverter'].

    Parameters
    ----------
    document_tok : list of str
        Tokenized text, functionally a list of string values.
    reference_df : DataFrame
        Holds columns that define the reference dictionary to search for keywords
        of interest.
        Note: This function can currently only handle single words, no n-gram
        functionality.
    reference_col_dict : dict of {str : str}
        A dictionary that contains the column names that describes how
        referencing is going to be done:

        - reference_col_from : string, should be assigned to associated column
          name in reference_df that are possible input reference values
          Example: pd.Series(['inverter', 'invert', 'inv'])
        - reference_col_to : string, should be assigned to associated column
          name in reference_df that are the output reference values of interest
          Example: pd.Series(['inverter', 'inverter', 'inverter'])

    Returns
    -------
    included_keywords : list of str
        List of unique keyword categories from reference_df found in
        document_tok; can contain more than one value. Order is unspecified
        (derived from a set).
    """
    REFERENCE_COL_FROM = reference_col_dict["reference_col_from"]
    REFERENCE_COL_TO = reference_col_dict["reference_col_to"]

    # map each possible input token to its output category
    reference_dict = dict(
        zip(reference_df[REFERENCE_COL_FROM], reference_df[REFERENCE_COL_TO])
    )

    # tokens present in both the document and the reference dictionary;
    # set comprehension deduplicates categories mapped from several tokens
    overlap_keywords = reference_dict.keys() & document_tok
    included_keywords = list({reference_dict[x] for x in overlap_keywords})
    return included_keywords
Loading

0 comments on commit a4de9a4

Please sign in to comment.