Skip to content

Commit

Permalink
Improve plot_day for smaller data sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
BaksiLi committed Apr 29, 2022
1 parent 14bc603 commit 8009e78
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 13 deletions.
Binary file modified assets/example-day.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified assets/example-kde.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified assets/example-stats.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
45 changes: 32 additions & 13 deletions sex_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from matplotlib.gridspec import GridSpec
from pandas import DataFrame, read_csv, to_datetime

__version__ = '1.2 (i-miss-you)'
__version__ = '1.3 (i-miss-you)'


class Regex(str, Enum):
Expand Down Expand Up @@ -130,21 +130,42 @@ def plot_freq_bar(df, offset_alias: str = 'M', ax=None, legend: bool = True):
ax.set_xlabel(f'Period ({offset_alias})')


def time_function(df: DataFrame):
    """Build a key function mapping a timestamp to a fractional hour of day.

    The rounding granularity is chosen once, from the number of rows in
    ``df``, so that small data sets are grouped into coarser time slots:

    * up to 50 rows   -> nearest whole hour
    * up to 100 rows  -> nearest half hour (30-minute steps)
    * up to 200 rows  -> nearest quarter hour (15-minute steps)
    * more than 200   -> one decimal place of an hour (6-minute steps)

    Steps finer than 6 minutes are deliberately not offered: the original
    author judged sub-minute resolution meaningless and 6 minutes sufficient.

    Args:
        df: The data set whose row count (``df.shape[0]``) selects the
            granularity. Its contents are not inspected.

    Returns:
        A function taking any object with ``hour`` and ``minute`` attributes
        and returning ``hour`` plus the rounded fraction of the hour. Because
        the minutes are rounded to the nearest step, the result can reach
        ``hour + 1`` for minutes close to the end of the hour.
    """
    row_count = df.shape[0]

    # The row count is fixed once the closure is built, so the rounding rule
    # is selected here instead of being re-tested on every call.
    if row_count <= 50:
        def hour_fraction(minute):
            # Whole hour; round() without ndigits yields an int.
            return round(minute/60)
    elif row_count <= 100:
        def hour_fraction(minute):
            # Half-hour division (every 30 mins).
            return round(minute/60*2)/2
    elif row_count <= 200:
        def hour_fraction(minute):
            # Quarter-hour division (every 15 mins).
            return round(minute/60*4)/4
    else:
        def hour_fraction(minute):
            # Tenth of an hour (every 6 mins).
            return round(minute/60, 1)

    def fn(x):
        fraction = hour_fraction(x.minute)
        return x.hour + fraction

    return fn


def plot_density(df, ax=None, legend: bool = True):
"""Plot KDE (Kernel Density Estimation).
"""
if not ax:
ax = plt.subplot()

time_function = lambda x: (x.hour + round(x.minute/60, 1))

grouped = df.set_index('TimeStamp')\
.groupby(time_function)['Kind'].value_counts().unstack()
.groupby(time_function(df))['Kind'].value_counts().unstack()

grouped.plot.kde(ax=ax, legend=legend)

ax.set_title('Kernel Density Estimation')
ax.set_ylabel('Density')
ax.set_xlabel('Repeated Times')


def plot_day_hour(df, ax=None, legend: bool = True):
Expand All @@ -153,14 +174,12 @@ def plot_day_hour(df, ax=None, legend: bool = True):
if not ax:
ax = plt.subplot()

time_function = lambda x: (x.hour + round(x.minute/60, 1))

grouped = df.set_index('TimeStamp')\
.groupby(time_function)['Kind'].value_counts().unstack()
.groupby(time_function(df))['Kind'].value_counts().unstack()

# Plot mean
grouped_mean = grouped.mean(axis=1) # .fillna(0)
ax.plot(grouped_mean.index, grouped_mean, color='grey')
ax.plot(grouped_mean.index, grouped_mean, color='grey', linestyle='-', label='Mean')

# Plot scattered points
for kind in grouped.columns:
Expand Down Expand Up @@ -189,13 +208,13 @@ def plot_all(df):
ax = fig.add_subplot(gs[0, :])
plot_day_hour(df, ax=ax, legend=False)

ax2 = fig.add_subplot(gs[1, 1])
plot_density(df, ax=ax2, legend=False)
ax2 = fig.add_subplot(gs[1, 0])
plot_freq_bar(df, ax=ax2, legend=False)

ax3 = fig.add_subplot(gs[1, 0])
plot_freq_bar(df, ax=ax3, legend=False)
ax3 = fig.add_subplot(gs[1, 1])
plot_density(df, ax=ax3, legend=False)

handles, labels = ax.get_legend_handles_labels()
handles, labels = ax3.get_legend_handles_labels()
fig.legend(handles, labels, title='Kind', loc='upper right')


Expand Down

0 comments on commit 8009e78

Please sign in to comment.