From df8a52d51a53b38a8cdaee33bf5e722056867bee Mon Sep 17 00:00:00 2001 From: Matt Hall Date: Mon, 18 Sep 2023 22:34:25 +0200 Subject: [PATCH] More stuff for pandas etc. --- docs/notebooks/_Pandas_accessor.ipynb | 132 ++++++++++++++++++-------- src/redflag/__init__.py | 1 + src/redflag/pandas.py | 67 ++++++++++++- tests/test_sklearn.py | 2 +- 4 files changed, 154 insertions(+), 48 deletions(-) diff --git a/docs/notebooks/_Pandas_accessor.ipynb b/docs/notebooks/_Pandas_accessor.ipynb index 7620114..8026941 100644 --- a/docs/notebooks/_Pandas_accessor.ipynb +++ b/docs/notebooks/_Pandas_accessor.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "75bb8303", "metadata": {}, "outputs": [ @@ -90,7 +90,7 @@ "4 4237.5 2448.6 2.472231 sandstone" ] }, - "execution_count": 2, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -105,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "39832c6c", "metadata": {}, "outputs": [ @@ -115,7 +115,7 @@ "1.4130434782602501" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -129,20 +129,12 @@ { "cell_type": "code", "execution_count": 4, - "id": "92e29966", - "metadata": {}, - "outputs": [], - "source": [ - "from pandas.api.extensions import register_dataframe_accessor" - ] - }, - { - "cell_type": "code", - "execution_count": 5, "id": "372a6bf1", "metadata": {}, "outputs": [], "source": [ + "from pandas.api.extensions import register_dataframe_accessor\n", + "\n", "@register_dataframe_accessor(\"redflag\")\n", "class RedflagAccessor:\n", " def __init__(self, pandas_obj):\n", @@ -157,7 +149,28 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 14, + "id": "b110936f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf.dummy_re([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "7c3963ec", "metadata": {}, "outputs": [ @@ -167,7 +180,7 @@ "-1.0" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -186,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "94f7c2cd", "metadata": {}, "outputs": [ @@ -196,7 +209,7 @@ "array([], dtype=float64)" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -207,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "62ea78b5", "metadata": {}, "outputs": [ @@ -217,7 +230,7 @@ "array([], dtype=int64)" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -231,17 +244,17 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "84c883db", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([1.3, 1.1, 1.8, 1.6, 1.5, 1.2, 1.7, 1.9, 1.4, 1. ])" + "array([1.8, 1. , 1.2, 1.6, 1.4, 1.5, 1.1, 1.9, 1.3, 1.7])" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -262,17 +275,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "6427e5ee", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([1.3, 1.1, 1.8, 1.6, 1.5, 1.2, 1.7, 1.9, 1.4, 1. ])" + "array([1.8, 1. , 1.2, 1.6, 1.4, 1.5, 1.1, 1.9, 1.3, 1.7])" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -283,26 +296,26 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "6e912a70", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[3],\n", - " [1],\n", - " [8],\n", + "array([[8],\n", + " [0],\n", + " [2],\n", " [6],\n", + " [4],\n", " [5],\n", - " [2],\n", - " [7],\n", + " [1],\n", " [9],\n", - " [4],\n", - " [0]])" + " [3],\n", + " [7]])" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -321,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "7ec28d7f", "metadata": {}, "outputs": [], @@ -338,12 +351,24 @@ " return rf.imbalance_degree(self._obj)\n", "\n", " def minority_classes(self):\n", - " return rf.minority_classes(self._obj)" + " return rf.minority_classes(self._obj)\n", + " \n", + "\n", + " def dummy_scores(self, task=None, random_state=None):\n", + " if task is None:\n", + " task = 'regression' if rf.is_continuous(self._obj) else 'classification'\n", + " if task == 'classification':\n", + " return rf.dummy_classification_scores(self._obj, random_state=random_state)\n", + " elif task == 'regression':\n", + " return rf.dummy_regression_scores(self._obj)\n", + " else:\n", + " raise ValueError(\"`task` must be 'classification' or 'regression', or None to decide automatically.\")\n", + " " ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "88447a57", "metadata": {}, "outputs": [ @@ -353,7 +378,7 @@ "-1.0" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -362,6 +387,29 @@ "df['Lithology'].redflag.imbalance_degree()" ] }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5f89c66d", + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'redflag' has no attribute 'dummy_classification_scores'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mLithology\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mredflag\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdummy_scores\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn [11], line 20\u001b[0m, in \u001b[0;36mSeriesAccessor.dummy_scores\u001b[0;34m(self, task, random_state)\u001b[0m\n\u001b[1;32m 18\u001b[0m task \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mregression\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m rf\u001b[38;5;241m.\u001b[39mis_continuous(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_obj) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m task \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mclassification\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdummy_classification_scores\u001b[49m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_obj, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m task \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mregression\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m rf\u001b[38;5;241m.\u001b[39mdummy_regression_scores(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_obj)\n", + "\u001b[0;31mAttributeError\u001b[0m: module 'redflag' has no attribute 'dummy_classification_scores'" + ] + } + ], + "source": [ + "df['Lithology'].redflag.dummy_scores()" + ] + }, { "cell_type": "markdown", "id": "369cf2f6", @@ -645,7 +693,7 @@ { "cell_type": "code", "execution_count": 72, - "id": "154b6ed5", + "id": "2add677d", "metadata": {}, "outputs": [], "source": [ @@ -669,7 +717,7 @@ { "cell_type": "code", "execution_count": 73, - "id": "978b8ba6", + "id": "41775588", "metadata": {}, "outputs": [ { @@ -691,7 +739,7 @@ { "cell_type": "code", "execution_count": null, - "id": "072db9c3", + "id": "f537e306", "metadata": {}, "outputs": [], "source": [] diff --git a/src/redflag/__init__.py b/src/redflag/__init__.py index c482b84..3ca8a81 100644 --- a/src/redflag/__init__.py +++ b/src/redflag/__init__.py @@ -1,5 +1,6 @@ from .utils import * from .sklearn import * +from .pandas import * # Targets from .target import * diff --git a/src/redflag/pandas.py b/src/redflag/pandas.py index c5c887a..d954457 100644 --- a/src/redflag/pandas.py +++ b/src/redflag/pandas.py @@ -1,8 +1,28 @@ -from sklearn.dummy import DummyClassifier, DummyRegressor +""" +Pandas accessors. + +Author: Matt Hall, scienxlab.org +Licence: Apache 2.0 + +Copyright 2023 Redflag contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import warnings from .imbalance import imbalance_degree, imbalance_ratio, minority_classes from .outliers import get_outliers -from .target import is_continuous +from .target import is_continuous, dummy_classification_scores, dummy_regression_scores from .independence import is_correlated @@ -30,19 +50,39 @@ def __init__(self, pandas_obj): self._obj = pandas_obj def imbalance_degree(self): + if is_continuous(self._obj): + warnings.warn('The Series does not seem categorical.') return imbalance_degree(self._obj) def minority_classes(self): + if is_continuous(self._obj): + warnings.warn('The Series does not seem categorical.') return minority_classes(self._obj) - - def check(self): + + def dummy_scores(self, task=None, random_state=None): + if task is None: + task = 'regression' if is_continuous(self._obj) else 'classification' + + if task == 'classification': + scores = dummy_classification_scores(self._obj, random_state=random_state) + elif task == 'regression': + scores = dummy_regression_scores(self._obj) + else: + raise ValueError("`task` must be 'classification' or 'regression', or None to decide automatically.") + + return scores + + def check(self, random_state=None): results = {} if is_continuous(self._obj): results['outliers'] = get_outliers(self._obj) results['correlated'] = is_correlated(self._obj) + results['dummy_scores'] = dummy_regression_scores(self._obj) + else: + # Categorical. results['imbalance'] = imbalance_degree(self._obj) - + results['dummy_scores'] = dummy_classification_scores(self._obj, random_state=random_state) @register_dataframe_accessor("redflag") @@ -51,7 +91,24 @@ def __init__(self, pandas_obj): self._obj = pandas_obj def imbalance_degree(self, target=None): + if is_continuous(self._obj): + warnings.warn('The column does not seem categorical.') return imbalance_degree(self._obj[target]) def minority_classes(self, target=None): + if is_continuous(self._obj): + warnings.warn('The column does not seem categorical.') return minority_classes(self._obj[target]) + + def dummy_scores(self, target=None, task=None, random_state=None): + if task is None: + task = 'regression' if is_continuous(self._obj[target]) else 'classification' + + if task == 'classification': + scores = dummy_classification_scores(self._obj[target], random_state=random_state) + elif task == 'regression': + scores = dummy_regression_scores(self._obj[target]) + else: + raise ValueError("`task` must be 'classification' or 'regression', or None to decide automatically.") + + return scores diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 2f0c8d9..f3dd234 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -1,4 +1,4 @@ -"""Test redflag""" +"""Test sklearn classes.""" import pytest import numpy as np from sklearn.pipeline import make_pipeline