Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge pull request #2 from data-mining-in-action/master #5

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
520 changes: 520 additions & 0 deletions hw02/Collections_solution.ipynb

Large diffs are not rendered by default.

836 changes: 836 additions & 0 deletions hw02/NumPy_SciPy_solution.ipynb

Large diffs are not rendered by default.

912 changes: 912 additions & 0 deletions hw02/Pandas_solution.ipynb

Large diffs are not rendered by default.

1,095 changes: 1,095 additions & 0 deletions hw02/Visualize_solution.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion hw03/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

## Часть 1

Нужно подобрать оптимальные параметры xgboost для набора данных HR.csv в папке второго семинара. Код для проверки качества представлен в скрипте xgboost_params_checker.py, а пример набора параметров в xgboost_params_example.json. Чекер с вашими параметрами должен отработать за 2 минуты на машинке для проверки. Для сравнения на xgboost_params_example.json чекер работает 20 секунд.
Нужно подобрать оптимальные параметры xgboost для набора данных HR.csv. Код для проверки качества представлен в скрипте xgboost_params_checker.py, а пример набора параметров в xgboost_params_example.json. Чекер с вашими параметрами должен отработать за 2 минуты на машинке для проверки. Для сравнения на xgboost_params_example.json чекер работает 20 секунд.

## Часть 2

Expand Down
10 changes: 10 additions & 0 deletions hw05/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Домашнее задание №5

Дедлайн - 23:59 21.04.2019

## Описание

Нужно написать обсчёт стат. значимости теста. На вход будут подавать конверсии сессий в покупки, а также для каждой сессии индекс пользователя, которым она была совершена. Можете считать, что в тесте и контроле пользователи различны (даже если индексы совпадают). Код проверки приведён в файле checker.py, а пример функции в example.py.


Задание надо сдавать в [форму](https://forms.gle/1PNXSjmaDc4ugNab6).
32 changes: 32 additions & 0 deletions hw05/check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os
import json
import re
from checker import Checker


def score_to_grade(raw_score):
    """Convert a raw checker score into the final homework grade.

    The grade is ``2 ** (6 * (raw_score - 2.8))`` rounded to two decimals,
    and it is never lower than 0.05.
    """
    return max(round(2 ** (6 * (raw_score - 2.8)), 2), 0.05)


if __name__ == '__main__':
    checker = Checker()
    scores = {}   # author -> final grade
    results = {}  # author -> raw checker score (successful runs only)
    folder_path = 'hw_data/significance/'
    for filename in os.listdir(folder_path):
        if not filename.endswith('.py'):
            continue
        name, score = checker.check(os.path.join(folder_path, filename))
        print(name, score)
        if name is None:
            # The submission failed before its AUTHOR_EMAIL could be read, so
            # the result cannot be attributed to anyone. Storing it under the
            # key None would also break sorted(scores) below.
            print('skipping {}: AUTHOR_EMAIL is not available'.format(filename))
            continue
        if score is None:
            # The checker failed for this author: give the minimal grade.
            scores[name] = 0.05
        else:
            print('score is', score_to_grade(score))
            results[name] = score

    # A successful run always overrides the minimal grade of a failed one
    # recorded for the same author.
    for name, raw_score in results.items():
        scores[name] = score_to_grade(raw_score)

    with open('hw_data/significance.json', 'w') as f:
        json.dump(scores, f, indent=4)

    with open('hw_data/significance.csv', 'w') as f:
        f.write('email, score\n')
        for name in sorted(scores):
            f.write('{},{}\n'.format(name, scores[name]))
85 changes: 85 additions & 0 deletions hw05/checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import numpy as np
import os
import imp
import signal
import traceback
import sys


# Directory that contains this file (symlinks resolved).
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

# Bounds of the uniform distribution from which Checker.perform_test draws
# each simulated user's conversion probability.
LOW = 0.3
HIGH = 0.8
# Number of simulated users in each group of an experiment.
USERS_SIZE = 500
# Number of simulated sessions (conversion observations) in each group.
CONVERSIONS_SIZE = 10000


def signal_handler(signum, frame):
    """Signal callback that interrupts the running code with an exception.

    Both arguments are supplied by the ``signal`` machinery and are ignored.

    Raises:
        Exception: always, with the message "Timed out!".
    """
    timeout_error = Exception("Timed out!")
    raise timeout_error


class Checker(object):
    """Grade a submitted significance test on simulated A/B experiments.

    A submission is a Python file exposing ``AUTHOR_EMAIL`` and
    ``evaluate(train_conversions, train_indices, test_conversions,
    test_indices)``, which is expected to return a p-value.
    """

    def __init__(self):
        # Number of submissions processed so far; it gives every loaded
        # submission module a unique name.
        self.applications = 0

    @staticmethod
    def perform_test(random_gen, evaluate, effect):
        """Simulate one experiment and return what ``evaluate`` reports for it.

        Every user gets a conversion probability drawn uniformly from
        [LOW, HIGH); in the second group ``effect`` is added to it. Each of
        the CONVERSIONS_SIZE sessions is assigned to a random user and
        converts with that user's probability.

        Args:
            random_gen: ``numpy.random.RandomState`` used for all draws.
            evaluate: callable under test, see the class docstring.
            effect: additive shift of the conversion probability in the
                second group.

        Returns:
            The value returned by ``evaluate``.
        """
        # NOTE: the order of the random draws below defines the simulated
        # data for a given seed, so it must not be changed.
        train_user_probs = random_gen.uniform(low=LOW, high=HIGH, size=USERS_SIZE)
        train_indices = random_gen.choice(USERS_SIZE, size=CONVERSIONS_SIZE, replace=True)
        train_conversions = (
            random_gen.uniform(size=CONVERSIONS_SIZE) < train_user_probs[train_indices]
        ).astype(int)

        test_user_probs = effect + random_gen.uniform(low=LOW, high=HIGH, size=USERS_SIZE)
        test_indices = random_gen.choice(USERS_SIZE, size=CONVERSIONS_SIZE, replace=True)
        test_conversions = (
            random_gen.uniform(size=CONVERSIONS_SIZE) < test_user_probs[test_indices]
        ).astype(int)

        pvalue = evaluate(
            train_conversions,
            train_indices,
            test_conversions,
            test_indices
        )

        return pvalue

    @staticmethod
    def eval_hits_ratio(random_gen, evaluate, effect, significance, n_trials=1000):
        """Return the share of simulated experiments reported as significant.

        Args:
            random_gen: ``numpy.random.RandomState`` used for all draws.
            evaluate: callable under test, see the class docstring.
            effect: additive effect passed to ``perform_test``.
            significance: an experiment is a hit when the reported p-value
                is strictly below this level.
            n_trials: number of simulated experiments (default 1000).

        Returns:
            ``hits / n_trials`` as a float.

        Raises:
            ValueError: if ``n_trials`` is not positive.
        """
        if n_trials <= 0:
            raise ValueError('n_trials must be positive, got {}'.format(n_trials))
        hits = 0
        for _ in range(n_trials):
            hits += Checker.perform_test(random_gen, evaluate, effect) < significance
        return hits / float(n_trials)

    def check(self, script_path):
        """Load the submission at ``script_path`` and score it.

        Loading and all simulations together must finish within 120 seconds.
        The test is rejected if it fires in more than 7% of the experiments
        without an effect. Otherwise the score is
        ``power_050 + 4 * power_010 + 16 * power_005``, where ``power_XXX``
        is the hit ratio for the effects 0.05, 0.01 and 0.005 respectively.

        Returns:
            ``(author_email, score)``. ``score`` is ``None`` when the
            submission failed, timed out or was rejected; ``author_email``
            is ``None`` when it could not be read from the submission.
        """
        # ``imp`` is deprecated (and removed in Python 3.12); importlib offers
        # the same "load a module from a path" functionality.
        import importlib.machinery
        import importlib.util

        AUTHOR_EMAIL = None
        random_gen = np.random.RandomState(42)
        try:
            signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(120)

            module_name = 'significance_{}'.format(self.applications)
            loader = importlib.machinery.SourceFileLoader(module_name, script_path)
            spec = importlib.util.spec_from_loader(module_name, loader)
            module = importlib.util.module_from_spec(spec)
            # Register the module before executing it, as imp.load_source did.
            sys.modules[module_name] = module
            loader.exec_module(module)
            AUTHOR_EMAIL = module.AUTHOR_EMAIL

            correctness = self.eval_hits_ratio(random_gen, module.evaluate, effect=0., significance=0.05)
            if correctness > 0.07:
                raise ValueError('Incorrect test: hits raitio is {}'.format(correctness))

            power_005 = self.eval_hits_ratio(random_gen, module.evaluate, effect=0.005, significance=0.05)
            power_010 = self.eval_hits_ratio(random_gen, module.evaluate, effect=0.01, significance=0.05)
            power_050 = self.eval_hits_ratio(random_gen, module.evaluate, effect=0.05, significance=0.05)

            print(correctness, power_005, power_010, power_050)

            return AUTHOR_EMAIL, power_050 + power_010 * 4 + power_005 * 16
        except (Exception, SystemExit):
            # Any failure of the submission (including an explicit sys.exit())
            # is reported and graded as "no score". KeyboardInterrupt is
            # deliberately not caught so that the operator can stop grading.
            traceback.print_exception(*sys.exc_info())
            return AUTHOR_EMAIL, None
        finally:
            # Cancel the pending alarm; otherwise it could fire after this
            # method has returned and raise in unrelated code.
            if hasattr(signal, 'alarm'):
                signal.alarm(0)
            self.applications += 1



if __name__ == '__main__':
    # Smoke run: grade the example submission that lives next to this file
    # and print the resulting (author, score) pair.
    example_path = SCRIPT_DIR + '/example.py'
    outcome = Checker().check(example_path)
    print(outcome)
14 changes: 14 additions & 0 deletions hw05/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from scipy.stats import ttest_ind
import numpy as np


# Author identifier of this submission; the checker reads it as
# `module.AUTHOR_EMAIL` and returns it together with the score.
AUTHOR_EMAIL = '[email protected]'

def evaluate(
    train_conversions,
    train_indices,
    test_conversions,
    test_indices
):
    """Example submission entry point: return a p-value for one experiment.

    This placeholder ignores all four arguments and returns a random number
    drawn uniformly from [0, 1). A simple real baseline would be
    ``ttest_ind(train_conversions, test_conversions, equal_var=False).pvalue``.
    """
    placeholder_pvalue = np.random.uniform()
    return placeholder_pvalue
Binary file added lecture04/linears_neurals.pdf
Binary file not shown.
Binary file added lecture05/metrics.pdf
Binary file not shown.
Binary file added lecture06/AB_testing.pdf
Binary file not shown.
Binary file added lecture06/summary.pdf
Binary file not shown.
Binary file added lecture07/development.pdf
Binary file not shown.
33 changes: 30 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,36 @@
ipykernel==4.9.0
appnope==0.1.0
backcall==0.1.0
certifi==2019.3.9
cycler==0.10.0
decorator==4.3.2
ipykernel==4.9.0
ipython==7.3.0
ipython-genutils==0.2.0
jedi==0.13.3
jupyter-client==5.2.4
jupyter-core==4.4.0
kiwisolver==1.0.1
matplotlib==2.2.2
numpy==1.14.1
scipy==1.1.0
pandas==0.22.0
parso==0.3.4
patsy==0.5.1
pexpect==4.6.0
pickleshare==0.7.5
prompt-toolkit==2.0.9
ptyprocess==0.6.0
Pygments==2.3.1
pyparsing==2.3.1
python-dateutil==2.8.0
pytz==2018.9
pyzmq==18.0.1
scikit-learn==0.19.2
matplotlib==2.2.2
scipy==1.1.0
seaborn==0.8.1
six==1.12.0
statsmodels==0.9.0
tornado==6.0.1
traitlets==4.3.2
wcwidth==0.1.7
xgboost==0.82

4 changes: 4 additions & 0 deletions seminar04/multi_task/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Multi-task learning
В этом семинаре мы с вами обучим модели предсказывать пол и расу человека по фото. Сначала для этого будут обучены две разные модели, а затем одна — сразу для двух задач.

Веса моделей и эмбеддинги доступны по [`ссылке`](https://drive.google.com/open?id=1LcaIDe0AIWe_MzS2BBHxtGKUy0F8J73P)
30 changes: 30 additions & 0 deletions seminar04/multi_task/dataloaders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from torch.utils.data import Dataset
import cv2


def load_img(img_path):
    """Read an image from disk and return it with its channels in RGB order.

    Args:
        img_path: path to the image file; it is converted with ``str()``
            before being handed to OpenCV.

    Returns:
        The decoded image as returned by OpenCV, with the first and third
        channels swapped (BGR -> RGB).

    Raises:
        FileNotFoundError: if OpenCV could not read the file.
    """
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising.
        raise FileNotFoundError(img_path)
    # OpenCV decodes images in BGR order, so the conversion that is meant
    # here is BGR -> RGB. COLOR_RGB2BGR performs the very same channel swap,
    # but naming it that way states the opposite intent.
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


class ImagesDataset(Dataset):
    """Dataset of (image, label) pairs backed by a table of image paths.

    Args:
        df: table with an ``is_train`` column, a column of image paths and
            one or more label columns.
        image_paths_name: name of the column that holds the image paths.
        labels_names: name (or list of names) of the label column(s).
        is_train: selects the rows whose ``is_train`` equals ``int(is_train)``.
        transform: optional callable applied to every loaded image.
    """

    def __init__(self, df, image_paths_name, labels_names, is_train, transform):
        split_flag = int(is_train)
        subset = df[df["is_train"] == split_flag]
        self.transform = transform
        self.image_paths = subset[image_paths_name].values
        self.labels = subset[labels_names].values

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is passed through ``transform`` when one is set; the label
        is returned with its singleton dimensions removed.
        """
        sample = load_img(self.image_paths[idx])
        target = self.labels[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample, target.squeeze()
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added seminar04/multi_task/images/gender&race_race.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added seminar04/multi_task/images/gender_only_race.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading