Skip to content

Commit

Permalink
0.3.3:
Browse files Browse the repository at this point in the history
- 10x speedup of weighting!
- Fixed incompatibility with numpy 2.0
  • Loading branch information
Braffolk committed Jul 12, 2024
1 parent c04c975 commit fb7afc5
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 33 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def version_libs(libs, precisions, versions):
INSTALL_REQUIRES = version_libs(libs, precisions, versions)

setup(name='weightipy',
version='0.3.2',
version='0.3.3',
author='Remi Sebastian Kits',
author_email='[email protected]',
packages=find_packages(exclude=['tests']),
Expand Down
2 changes: 1 addition & 1 deletion tests/parameters_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from itertools import chain as ichain
from operator import add

NAN = np.NaN
NAN = np.nan
AST = '*'
EMPTY = ''

Expand Down
95 changes: 95 additions & 0 deletions tests/performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import time

import pandas as pd

from weightipy import weight_dataframe, weighting_efficiency
from weightipy.rim import Rim
from weightipy.weight_engine import WeightEngine

"""
Performance:
0.3.2 - 0.3980s
0.3.3 - 0.0336s
"""


class PerformanceTest:
    """Micro-benchmark that times ``weight_dataframe`` on the engine_A data.

    ``__init__`` loads the ``engine_A`` CSV fixture into a ``WeightEngine``
    and builds a four-group ``Rim`` scheme targeting ``column1`` and
    ``column2``. ``run_performance_test`` then applies that scheme
    repeatedly and reports the mean time per run.
    """

    def __init__(self):
        # Empty prefix: fixture files are looked up relative to the current
        # working directory.
        self.path = ''
        self.scheme_name_A2 = 'scheme_name_A2'

        name_data_A = 'engine_A'
        self.path_data_A = '{}{}_data.csv'.format(self.path, name_data_A)

        # The engine_B and "Example Data (A)" paths are only recorded here;
        # nothing in this class reads those files.
        name_data_B = 'engine_B'
        self.path_meta_B = '{}{}_meta.json'.format(self.path, name_data_B)
        self.path_data_B = '{}{}_data.csv'.format(self.path, name_data_B)

        name_data_exA = 'Example Data (A)'
        self.path_meta_exA = '{}{}.json'.format(self.path, name_data_exA)
        self.path_data_exA = '{}{}.csv'.format(self.path, name_data_exA)

        # Setup engine_A (done once; reading the CSV and building the engine
        # a second time only repeated the same work).
        data = pd.read_csv(self.path_data_A)
        self.engine_A = WeightEngine(data=data)

        self.scheme_A2 = Rim(self.scheme_name_A2)
        self.scheme_A2.target_cols = ['column1', 'column2']
        self.scheme_A2.add_group(
            name='Senior Type 1', filter_def='column3==1',
            targets=self._group_targets(
                [37.00, 32.00, 31.00],
                [13.3, 23.13, 14.32, 4.78, 4.70,
                 2.65, 2.61, 3.47, 31.04]))
        # NOTE(review): this group reuses the 'column3==1' filter of
        # 'Senior Type 1' -- possibly meant to be 'column3==2'. Left
        # unchanged so the benchmark stays comparable; confirm.
        self.scheme_A2.add_group(
            name='Senior Type 2', filter_def='column3==1',
            targets=self._group_targets(
                [33.2, 33.40, 33.40],
                [11.11, 11.11, 11.11, 11.11, 11.11,
                 11.11, 11.11, 11.11, 11.12]))
        self.scheme_A2.add_group(
            name='Senior Type 3', filter_def='column3==3',
            targets=self._group_targets(
                [37.1, 33.2, 29.7],
                [13.3, 23.13, 14.32, 4.78, 4.70,
                 2.65, 2.61, 3.47, 31.04]))
        self.scheme_A2.add_group(
            name='Senior Type 4', filter_def='column3==4',
            targets=self._group_targets(
                [37.1, 33.2, 29.7],
                [12.00, 23.13, 14.32, 4.78, 4.70,
                 2.65, 2.61, 3.47, 32.34]))

    @staticmethod
    def _group_targets(column1_props, column2_props):
        """Build the ``targets`` list for one group.

        Each list of proportions is turned into a ``{code: proportion}``
        mapping whose codes count up from 1, in list order.
        """
        return [
            {'column1': dict(enumerate(column1_props, start=1))},
            {'column2': dict(enumerate(column2_props, start=1))},
        ]

    def run_performance_test(self, samples=10):
        """Time ``weight_dataframe`` and print the mean seconds per run.

        Parameters
        ----------
        samples : int, default 10
            Number of consecutive weighting runs to average over.

        Returns
        -------
        float
            Mean elapsed seconds per run.

        Raises
        ------
        ValueError
            If ``samples`` is smaller than 1.
        """
        if samples < 1:
            raise ValueError('samples must be a positive integer')

        df = self.engine_A._df

        # perf_counter is monotonic and high-resolution, so the measurement
        # cannot be skewed by system clock adjustments.
        time_start = time.perf_counter()

        for _ in range(samples):
            weight_dataframe(df, self.scheme_A2, "weights_")

        time_sample = (time.perf_counter() - time_start) / samples

        # print seconds
        print("Time per sample: {}s".format(time_sample))
        return time_sample


if __name__ == '__main__':
    # Script entry point: build the benchmark fixture, then run the timing.
    performance_test = PerformanceTest()
    performance_test.run_performance_test()

71 changes: 45 additions & 26 deletions weightipy/rim.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ def set_targets(self, targets, group_name=None):
self.target_cols.append(list(target.keys())[0])
self.groups[gn][self._TARGETS] = targets


def add_group(self, name=None, filter_def=None, targets=None):
"""
Set weight groups using flexible filter and target definitions.
Expand Down Expand Up @@ -143,7 +142,7 @@ def add_group(self, name=None, filter_def=None, targets=None):
def _compute(self):
self._get_base_factors()
self._df[self._weight_name()] = self._df[self._weight_name()].replace(0.00, 1.00)
self._df[self._weight_name()] = self._df[self._weight_name()].replace(-1.00, np.NaN)
self._df[self._weight_name()] = self._df[self._weight_name()].replace(-1.00, np.nan)
if list(self._group_targets.keys()):
self._adjust_groups()
if self.total > 0 and not list(self._group_targets.keys()):
Expand Down Expand Up @@ -200,14 +199,13 @@ def _get_base_factors(self):

def _scale_total(self):
    """Rescale the weight column in ``self._df`` using ``self.total``.

    Weights equal to 1.00 (and missing weights) are left out of the case
    count and come out of this method as 1.00. Every other weight is
    divided by ``case_count / self.total``.
    """
    weight_var = self._weight_name()
    # Work on a local Series and assign the result back once. Calling
    # replace(..., inplace=True) on self._df[weight_var] is chained
    # assignment, which no longer updates the frame under pandas
    # Copy-on-Write.
    weights = self._df[weight_var].replace(1.00, np.nan)
    unw_total = len(weights.dropna().index)
    weights = weights.replace(np.nan, 0.00)
    scale_factor = float(unw_total) / float(self.total)
    weights = weights / scale_factor
    # Rows zeroed out above go back to the neutral weight of 1.00.
    self._df[weight_var] = weights.replace(0.00, 1.00)


def _adjust_groups(self):
adj_w_vec = []
for group in self.groups:
Expand All @@ -227,7 +225,6 @@ def _adjust_groups(self):
adj_w_vec = pd.concat(adj_w_vec).dropna()
self._df[self._weight_name()] = adj_w_vec


def _get_group_filter_cols(self, filter_def):
filter_cols = []
if filter_def is not None:
Expand Down Expand Up @@ -316,7 +313,7 @@ def dataframe(self, df, key_column=None):
columns = self._columns(add_columns=[key_column])
columns.extend(all_filter_cols)
df = df.copy()[columns]
df[self._weight_name()] = df[self._weight_name()].replace(0, np.NaN)
df[self._weight_name()] = df[self._weight_name()].replace(0, np.nan)
df.dropna(subset=[self._weight_name()], inplace=True)
return df

Expand Down Expand Up @@ -382,7 +379,7 @@ def _check_targets(self, verbose):
"""

some_nans = '*** Warning: Scheme "{0}", group "{1}" ***\n'\
'np.NaN found in weight variables:\n{2}\n'\
'np.nan found in weight variables:\n{2}\n'\
'Please check if weighted results are acceptable!\n'

len_err_less = '*** Warning: Scheme "{0}", group "{1}" ***\nTargets for variable '\
Expand Down Expand Up @@ -449,7 +446,6 @@ def _check_targets(self, verbose):
raise ValueError(sum_err.format(self.name, group,
target_col, np.sum(target_props)))


def validate(self):
"""
Summary on scheme target variables to detect and handle missing data.
Expand Down Expand Up @@ -528,19 +524,26 @@ def __init__(self, dataframe, targets,
if cap < 1.5 and _use_cap:
print("Cap is very low, the model may take a long time to run.")

def rakeonvar(self, target, weights: np.array):
    """Rake ``weights`` towards the proportions of one target variable.

    Parameters
    ----------
    target : dict
        Single-entry mapping ``{column: {code: proportion}}``.
    weights : np.array
        Weight vector, updated in place. Expected to be a float array;
        positions for each code come from
        ``self._cache_indices[(column, code)]``.

    Returns
    -------
    np.array
        The same ``weights`` array, after scaling each code's cell so its
        weights sum to that code's proportion.
    """
    target_col, target_values = list(target.items())[0]

    for target_code, target_prop in target_values.items():
        index_array = self._cache_indices[(target_col, target_code)]
        if index_array.shape[0] == 0:
            continue
        if target_prop == 0.00:
            # Use a tiny positive proportion instead of exactly zero.
            target_prop = 0.00000001

        cell_total = weights[index_array].sum()
        if cell_total == 0:
            # A zero total cannot be rescaled: dividing by it would give an
            # inf multiplier and turn the cell's weights into NaN. Skip the
            # cell explicitly rather than relying on a bare ``except``.
            continue

        weights[index_array] *= target_prop / cell_total

    return weights


def calc_weight_efficiency(self):
numerator = 100*sum(self.dataframe[self.weight_column_name] *
self.pre_weight) ** 2
Expand Down Expand Up @@ -585,6 +588,18 @@ def start(self):
diff_error = 999999
diff_error_old = 99999999999

self._cache_indices = {}
for target in self.targets:
target_values = list(target.values())[0]
target_col = list(target.keys())[0]

for target_code, target_prop in target_values.items():
key = (target_col, target_code)
index_array = (self.dataframe[target_col] == target_code)
# ignore pandas index, use numpy index
index_array = np.where(index_array)[0]
self._cache_indices[key] = index_array

# cap (this needs more rigorous testing)
if isinstance(self.cap, (list, tuple)):
min_cap = self.cap[0]
Expand All @@ -598,29 +613,33 @@ def start(self):
if min_cap is not None:
min_cap -= 0.0001

weights = self.dataframe[self.weight_column_name].values

for iteration in range(1, self.max_iterations+1):
old_weights = self.dataframe[self.weight_column_name].copy()
old_weights = weights.copy()

if not diff_error < pct_still * diff_error_old:
break

for target in self.targets:
self.rakeonvar(target)
weights = self.rakeonvar(target, weights)

if self._use_cap:

if min_cap is None:
while self.dataframe[self.weight_column_name].max() > max_cap:
self.dataframe.loc[self.dataframe[self.weight_column_name] > max_cap, self.weight_column_name] = max_cap
self.dataframe[self.weight_column_name] = self.dataframe[self.weight_column_name]/np.mean(self.dataframe[self.weight_column_name])
while weights.max() > max_cap:
weights[weights > max_cap] = max_cap
weights = weights / np.mean(weights)
else:
while (self.dataframe[self.weight_column_name].min() < min_cap) or (self.dataframe[self.weight_column_name].max() > max_cap):
self.dataframe.loc[self.dataframe[self.weight_column_name] < min_cap, self.weight_column_name] = min_cap
self.dataframe.loc[self.dataframe[self.weight_column_name] > max_cap, self.weight_column_name] = max_cap
self.dataframe[self.weight_column_name] = self.dataframe[self.weight_column_name]/np.mean(self.dataframe[self.weight_column_name])
while (weights.min() < min_cap) or (weights.max() > max_cap):
weights[weights < min_cap] = min_cap
weights[weights > max_cap] = max_cap
weights = weights / np.mean(weights)

diff_error_old = diff_error
diff_error = sum(abs(self.dataframe[self.weight_column_name]-old_weights))
diff_error = np.sum(np.abs(weights - old_weights))

del self._cache_indices
self.dataframe[self.weight_column_name] = weights

self.iteration_counter = iteration # for the report
self.dataframe[self.weight_column_name] = self.dataframe[self.weight_column_name].replace({0.0: 1.0})
Expand Down
2 changes: 1 addition & 1 deletion weightipy/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
version = '0.3.2'
version = '0.3.3'
4 changes: 0 additions & 4 deletions weightipy/weight_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@
import pandas as pd


#from weightipy.core.dataset import DataSet


class WeightEngine:

def __init__(self,
data=None,
dropna=True):
Expand Down

0 comments on commit fb7afc5

Please sign in to comment.