Skip to content

Commit

Permalink
include new summary fact panel
Browse files Browse the repository at this point in the history
  • Loading branch information
willeppy committed Feb 17, 2023
1 parent 9ee7ad8 commit 961eb2e
Show file tree
Hide file tree
Showing 15 changed files with 603 additions and 106 deletions.
86 changes: 71 additions & 15 deletions diginlineprofiler/profile_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype

from .utils import convertDescribe

####### type checks #######
def isNumeric(colData: pd.Series):
Expand Down Expand Up @@ -36,14 +36,6 @@ def getColumns(dfName: pd.DataFrame):
def getShape(dfName: pd.DataFrame):
    """Return the ``shape`` tuple of ``dfName`` (rows, columns)."""
    dimensions = dfName.shape
    return dimensions

def getQuantMeta(dfName: pd.DataFrame, colName: str, isIndex=False):
    """Return the ``describe()`` summary of a column or of the index.

    When ``isIndex`` is true the frame's index (converted to a Series) is
    summarized and ``colName`` is not used; otherwise ``dfName[colName]`` is.
    """
    source = dfName.index.to_series() if isIndex else dfName[colName]
    return source.describe()

def getColMeta(dfName: pd.DataFrame, colName: str, isIndex=False):
if isIndex:
colData = dfName.index.to_series()
Expand Down Expand Up @@ -87,10 +79,74 @@ def getTempInterval(dfName: pd.DataFrame, colName: str, isIndex=False):
timerange = colData.max() - colData.min()
return {"months": 0, "days": timerange.days, "micros": 0}

# def getVariableNamesInPythonStr(codeString: str):
# import tokenize, io
# print(set([ t.string for t in tokenize.generate_tokens(io.StringIO(codeString).readline) if t.type == 1]))
def getStringMeta(dfName: pd.DataFrame, colName: str, isIndex=False):
    """Summarize the string lengths of a column or of the index.

    When ``isIndex`` is true the lengths come from the frame's index
    (converted to a Series) and ``colName`` is not used; otherwise they come
    from ``dfName[colName]``.

    Returns a dict with the keys ``minLength``, ``maxLength`` and
    ``meanLength``.
    """
    source = dfName.index.to_series() if isIndex else dfName[colName]
    charCounts = source.str.len()

    shortest = charCounts.min()
    longest = charCounts.max()
    average = charCounts.mean()
    return {"minLength": shortest, "maxLength": longest, "meanLength": average}

def getStringStats(dfName: pd.DataFrame, colName: str):
    """Return ``(min, max, mean)`` of the string lengths in ``dfName[colName]``."""
    charCounts = dfName[colName].str.len()
    shortest = charCounts.min()
    longest = charCounts.max()
    average = charCounts.mean()
    return shortest, longest, average
def getQuantMeta(dfName: pd.DataFrame, colName: str, isIndex=False):
    """Build the summary statistics for a numeric column or for the index.

    Args:
        dfName: Frame holding the data.
        colName: Column to summarize; not used when ``isIndex`` is true.
        isIndex: Summarize ``dfName.index`` (as a Series) instead of a column.

    Returns:
        The dict produced by ``convertDescribe`` for the ``describe()``
        output, extended with the keys ``sd_outlier``, ``iqr_outlier``,
        ``sortedness``, ``n_zero``, ``n_positive`` and ``n_negative``.
        All counts are plain Python ints.
    """
    colData = dfName.index.to_series() if isIndex else dfName[colName]

    describe = colData.describe()
    sd = describe.loc['std']
    mean = describe.loc['mean']
    q3 = describe.loc['75%']
    q1 = describe.loc['25%']

    # Count values more than 3 standard deviations away from the mean.
    # The vectorized ``.sum()`` avoids walking the Series element by element
    # in Python; ``int()`` turns the NumPy scalar into a plain int.
    normalized = (colData - mean) / sd
    sd_num_outliers = int((normalized.abs() > 3).sum())

    # Count values more than 1.5 * IQR below q1 or above q3.
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    iqr_num_outliers = int(((colData < lower) | (colData > upper)).sum())

    # Sortedness: the increasing check runs first, so a column that is both
    # non-decreasing and non-increasing is reported as "ascending".
    if colData.is_monotonic_increasing:
        sortedness = "ascending"
    elif colData.is_monotonic_decreasing:
        sortedness = "descending"
    else:
        sortedness = "noSort"

    n_zero = int((colData == 0).sum())
    n_negative = int((colData < 0).sum())
    n_positive = int((colData > 0).sum())

    # Start from the converted describe() output and add the extra facts.
    statistics = convertDescribe(describe)
    statistics["sd_outlier"] = sd_num_outliers
    statistics["iqr_outlier"] = iqr_num_outliers
    statistics["sortedness"] = sortedness
    statistics["n_zero"] = n_zero
    statistics["n_positive"] = n_positive
    statistics["n_negative"] = n_negative

    return statistics

def getTemporalMeta(dfName:pd.DataFrame, colName:str, isIndex=False):
    """Report whether a column (or the index) is sorted.

    When ``isIndex`` is true ``dfName.index`` is inspected and ``colName`` is
    not used; otherwise ``dfName[colName]`` is inspected.

    Returns a dict with the single key ``sortedness`` whose value is
    ``"ascending"``, ``"descending"`` or ``"noSort"``. The increasing check
    runs first.
    """
    values = dfName.index if isIndex else dfName[colName]

    if values.is_monotonic_increasing:
        return {"sortedness": "ascending"}
    if values.is_monotonic_decreasing:
        return {"sortedness": "descending"}
    return {"sortedness": "noSort"}
2 changes: 1 addition & 1 deletion diginlineprofiler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ def convertVC(vc: pd.Series, colName: str):
columns={"index": "value",
colName: "count"}).to_dict('records')

def convertQMeta(statistics: pd.Series):
def convertDescribe(statistics: pd.Series):
s = statistics.to_dict()

return {
Expand Down
22 changes: 10 additions & 12 deletions diginlineprofiler/visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
from ipylab import JupyterFrontEnd

from ._frontend import module_name, module_version
from .profile_lib import isNumeric, isTimestamp, isCategorical, isBoolean, getColumns, getShape, \
getQuantMeta, getColMeta, getValueCounts, getQuantBinnedData, getTempBinnedData, getTempInterval, \
getStringStats
from .utils import convertVC, convertQMeta, convertBinned
from .profile_lib import isNumeric, isTimestamp, isCategorical, getShape, \
getColMeta, getValueCounts, getQuantBinnedData, getTempBinnedData, getTempInterval, \
getQuantMeta, getStringMeta, getTemporalMeta
from .utils import convertVC, convertBinned

class Visualizer(DOMWidget):
# boilerplate for ipywidgets syncing
Expand Down Expand Up @@ -79,31 +79,29 @@ def calculateChartData(self, dfName: str):
# get data
chartData = getQuantBinnedData(df, cName, isIndex=False)
statistics = getQuantMeta(df, cName, isIndex=False)

# convert to JSON serializable
statistics = convertQMeta(statistics)
chartData = convertBinned(chartData, statistics["min"])

cd["summary"]["statistics"] = statistics
cd["summary"]["quantMeta"] = statistics
cd["summary"]["histogram"] = chartData
elif isTimestamp(df[cName]):
# get data
vc, true_min = getTempBinnedData(df, cName, isIndex=False)
interval = getTempInterval(df, cName, isIndex=False)
temporalMeta = getTemporalMeta(df, cName, isIndex=False)

# convert to JSON serializable
histogram = convertBinned(vc, true_min)

cd["summary"]["histogram"] = histogram
cd["summary"]["timeInterval"] = interval
cd["summary"]["temporalMeta"] = temporalMeta

elif isCategorical(df[cName]):
minLength, maxLength, meanLength = getStringStats(df, cName)
stringMeta = getStringMeta(df, cName)

cd["summary"]["stringSummary"] = {
"minLength": minLength,
"maxLength": maxLength,
"meanLength": meanLength
}
cd["summary"]["stringMeta"] = stringMeta

colProfiles.append(cd)

Expand Down
15 changes: 13 additions & 2 deletions src/common/exchangeInterfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ export type IQuantMeta = {
q75: number;
max: number;
mean: number;
sd_outlier: number;
iqr_outlier: number;
sortedness: string;
n_zero: number;
n_positive: number;
n_negative: number;
};

export type IColMeta = {
Expand All @@ -63,6 +69,9 @@ export type IStringMeta = {
meanLength: number;
};

// Extra facts reported for a temporal column.
// `sortedness` carries the sort-order label; the Python `getTemporalMeta`
// in this same commit emits "ascending", "descending" or "noSort".
export type ITemporalMeta = {
sortedness: string;
}
export type ColumnProfileData = {
name: string;
type: string;
Expand All @@ -76,9 +85,11 @@ export interface ColumnSummary {
cardinality: number; // num unique
topK: ValueCount[];
histogram?: IHistogram;
statistics?: IQuantMeta;
timeInterval?: Interval;
stringSummary?: IStringMeta;
statistics?: IQuantMeta;
quantMeta?: IQuantMeta;
stringMeta?: IStringMeta;
temporalMeta?: ITemporalMeta;
}


Expand Down
85 changes: 11 additions & 74 deletions src/components/ColumnProfile.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
import ColumnEntry from './ColumnEntry.svelte';
import DataTypeIcon from './data-types/DataTypeIcon.svelte';
import BarAndLabel from './viz//BarAndLabel.svelte';
import TopKSummary from './viz/TopKSummary.svelte';
import FormattedDataType from './data-types/FormattedDataType.svelte';
import { config, getSummarySize } from './utils/sizes';
import {
formatInteger,
formatCompactInteger,
formatPercentage
} from './utils/formatters';
import { convertToTimeBin } from './utils/convertTypes';
import {
CATEGORICALS,
NUMERICS,
Expand All @@ -22,12 +20,9 @@
import Tooltip from './tooltip/Tooltip.svelte';
import TooltipContent from './tooltip/TooltipContent.svelte';
import Histogram from './viz/histogram/SmallHistogram.svelte';
import NumericHistogram from './viz/histogram/NumericHistogram.svelte';
import TimestampDetail from './viz/timestamp/TimestampDetail.svelte';
import ExportChartButton from './export-code/ExportChartButton.svelte';
import type { ColumnSummary } from '../common/exchangeInterfaces';
import StringStats from './viz/categorical/StringStats.svelte';
import { showIndex } from '../stores';
import VizOrText from './fact-panel/VizOrStats.svelte';
// props
export let dfName: string;
Expand Down Expand Up @@ -189,74 +184,16 @@
>
<div bind:clientWidth={wrapperDivWidth}>
{#if totalRows !== 0 && nullCount !== totalRows}
{#if (CATEGORICALS.has(type) || BOOLEANS.has(type)) && summary?.topK}
<TopKSummary
color={DATA_TYPE_COLORS[type].bgClass}
{totalRows}
topK={summary.topK}
/>
{#if CATEGORICALS.has(type)}
<StringStats
stats={summary.stringSummary}
/>
{/if}
<div class="mt-1">
<ExportChartButton
chartType={'cat'}
{dfName}
{colName}
{isIndex}
/>
</div>
{:else if NUMERICS.has(type) && summary?.statistics && summary?.histogram?.length}
<NumericHistogram
{dfName}
{colName}
{type}
{isIndex}
width={wrapperDivWidth}
height={65}
data={summary.histogram}
min={summary.statistics.min}
qlow={summary.statistics.q25}
median={summary.statistics.q50}
qhigh={summary.statistics.q75}
mean={summary.statistics.mean}
max={summary.statistics.max}
/>
<div class="mt-1">
<ExportChartButton
chartType={'quant'}
{dfName}
{colName}
exportOptions={{
numBins: summary.histogram.length
}}
{isIndex}
/>
</div>
{:else if TIMESTAMPS.has(type) && summary?.histogram?.length}
<TimestampDetail
data={convertToTimeBin(summary?.histogram)}
xAccessor="ts_end"
yAccessor="count"
height={160}
width={wrapperDivWidth}
interval={summary?.timeInterval}
/>
<div class="mt-1">
<ExportChartButton
chartType={'temporal'}
{dfName}
{colName}
exportOptions={{
shouldDisableMaxRows:
totalRows > 5000
}}
{isIndex}
/>
</div>
{/if}
<VizOrText
{dfName}
{colName}
{totalRows}
{type}
{summary}
{isIndex}
{wrapperDivWidth}
{nullCount}
/>
{:else}
<p>No values to show for this column</p>
{/if}
Expand Down
42 changes: 42 additions & 0 deletions src/components/export-code/ExportFactButton.svelte
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<script lang="ts">
import { getContext } from 'svelte';
import ExportIcon from '../icons/ExportIcon.svelte';
import Tooltip from '../tooltip/Tooltip.svelte';
import TooltipContent from '../tooltip/TooltipContent.svelte';
import type { Writable } from 'svelte/store';
import { DUPLICATES, IQR_OUTLIERS, SD_OUTLIERS } from './ExportableCode';
export let type: 'outliers_iqr' | 'outliers_sd' | 'duplicates';
export let dfName: string;
export let colName: string;
export let isIndex = false;
export let tooltipText = 'Export text fact to code';
const exportedCode: Writable<string> = getContext(
'inlineprofiler:exportedCode'
);
// Generate the code snippet matching `type` and publish it to the
// `exportedCode` store.
function addCode() {
    let snippet: string;
    switch (type) {
        case 'outliers_iqr':
            snippet = IQR_OUTLIERS(dfName, colName, isIndex);
            break;
        case 'outliers_sd':
            snippet = SD_OUTLIERS(dfName, colName, isIndex);
            break;
        case 'duplicates':
            snippet = DUPLICATES(dfName, colName, isIndex);
            break;
    }
    $exportedCode = snippet;
}
</script>

<Tooltip location="bottom" alignment="center" distance={8}>
<button
class="grid place-items-center rounded hover:bg-gray-100 text-gray-500"
style="width: 20px; height: 20px;"
on:click={addCode}
>
<ExportIcon size="14px" />
</button>

<TooltipContent slot="tooltip-content">{tooltipText}</TooltipContent>
</Tooltip>
Loading

0 comments on commit 961eb2e

Please sign in to comment.