-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsample.py
144 lines (123 loc) · 5.08 KB
/
sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from cmca import CMCA
# Congressional Voting Records Data Set
# https://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records
#
# The data file is read without a header row; the column names come from a
# separate file that lists one name per line.
df = pd.read_csv("./data/house-votes-84.data", header=None)
# Explicit encoding so the names are decoded the same way on every platform
# instead of depending on the locale default.
with open("./data/house-votes-84.col_names", "r", encoding="utf-8") as f:
    # chr(10) is newline (to avoid newline when generating doc with sphinx)
    # Whitespace-only lines (e.g. a trailing blank line) are skipped so they
    # cannot become empty column names and break the column assignment.
    df.columns = [line.replace(chr(10), "") for line in f if line.strip()]

# Column 0 is the label that is compared with "democrat"/"republican" below;
# every remaining column is passed to cMCA.
X = df.iloc[:, 1:]
y = np.array(df.iloc[:, 0])

# Foreground (fg) and background (bg) groups used by the cMCA examples.
fg = X.iloc[y == "democrat"]
bg = X.iloc[y == "republican"]
# alpha = 0 (normal MCA on fg)
# alpha = 10 (contrastive MCA fg vs bg)
# alpha = 'auto' (contrastive MCA with auto selection of alpha)
cmca = CMCA(n_components=2)
for alpha in [0, 10, "auto"]:
    ### cMCA
    # "auto" is passed to fit() as alpha=None plus the auto-selection flag.
    auto_alpha = alpha == "auto"
    if auto_alpha:
        alpha = None
    cmca.fit(fg, bg, alpha=alpha, auto_alpha_selection=auto_alpha)

    # row coordinates (cloud of individuals), fg first and then bg
    Y_fg_row, Y_bg_row = [
        np.array(cmca.transform(group, axis="row")) for group in (fg, bg)
    ]
    # col coordinates (cloud of categories), fg first and then bg
    Y_fg_col, Y_bg_col = [
        np.array(cmca.transform(group, axis="col")) for group in (fg, bg)
    ]

    # cPC loadings and category names exposed by the fitted model
    loadings = cmca.loadings
    categories = cmca.categories

    ### Plot the results
    # alpha actually used by the fit, truncated (not rounded) to two decimals
    alpha = int(cmca.alpha * 100) / 100

    plt.figure(figsize=[8, 8])

    # panel 1: row coordinates of both groups
    plt.subplot(2, 2, 1)
    for coords, color, name in ((Y_fg_row, "b", "demo"), (Y_bg_row, "r", "rep")):
        plt.scatter(coords[:, 0], coords[:, 1], c=color, s=5, label=name)
    plt.legend(loc="best", shadow=False, scatterpoints=1, fontsize=10)
    plt.title(f"cMCA row coords. alpha = {alpha}")

    # panels 2 and 3: col coordinates and cPC loadings; each point is tagged
    # with its index, which panel 4 maps to a category name
    annotated_panels = (
        (2, Y_fg_col, f"cMCA col coords. alpha = {alpha}"),
        (3, loadings, f"cPC loadings. alpha = {alpha}"),
    )
    for position, points, heading in annotated_panels:
        plt.subplot(2, 2, position)
        plt.scatter(points[:, 0], points[:, 1], c="g", s=5, label="cate")
        for idx, point in enumerate(points):
            plt.text(point[0], point[1], str(idx), fontsize=6)
        plt.legend(loc="best", shadow=False, scatterpoints=1, fontsize=10)
        plt.title(heading)

    # panel 4: index -> category name lookup, listed top to bottom
    plt.subplot(2, 2, 4)
    for i, cate in enumerate(categories):
        plt.text(0.1, i, str(i) + ": " + cate, fontsize=6, c="g")
    plt.xticks([])
    plt.yticks([])
    plt.xlim([0, 1])
    # reversed y-limits so that index 0 is drawn at the top
    plt.ylim([len(categories) + 1, -1])
    plt.title("Categories")

    plt.tight_layout()
    plt.show()
### Examples using analysis helper methods
# Print the supplementary info entry of the first listed question.
print("Supplementary information of each question")
supp_info = cmca.get_questions_info()
questions = [*supp_info.keys()]
print(supp_info[questions[0]])

# Plot column coordinates and then loadings, with colors.
# plot_type can be colcoord, loading, or component; only the colcoord call
# is given the data (X=fg).
# An explicit selection can also be passed, e.g.
# colored_questions=['handicapped-infants', 'water-project-cost-sharing']
# NOTE(review): the method name is spelled "plot_quesitions" here; it is kept
# as-is and should be checked against the cmca API.
for plot_type, extra_kwargs in (("colcoord", {"X": fg}), ("loading", {})):
    fig = cmca.plot_quesitions(
        plot_type=plot_type,
        k_loadings_to_color=3,
        rank_loadings_by={"criterion": "variance", "pc_idx": 0},
        display_mode="hide_non_top_k_text",
        **extra_kwargs,
    )
    plt.show()

# After the first fit, if you only update the result with a new alpha,
# you can use update_fit()
cmca.update_fit(alpha=10000)
# Usage example with data that is one-hot encoded up front.
# Encoding once can save computational cost when different fg/bg selections
# from the same dataset are used to produce multiple cMCA results.
X_oh = CMCA.OneHotEncoder().fit_transform(X)

# Scatter color and legend label for each party.
party_style = {"democrat": ("b", "demo"), "republican": ("r", "rep")}

# First pass: democrats as foreground; second pass: the groups are swapped.
for fg_party, bg_party in (("democrat", "republican"), ("republican", "democrat")):
    fg_oh = X_oh.iloc[y == fg_party]
    bg_oh = X_oh.iloc[y == bg_party]

    # No alpha is passed here, so fit() falls back to its own defaults.
    cmca.fit(fg_oh, bg_oh, onehot_encoded=True)
    Y_fg_row = np.array(cmca.transform(fg_oh, axis="row", onehot_encoded=True))
    Y_bg_row = np.array(cmca.transform(bg_oh, axis="row", onehot_encoded=True))

    # alpha used by the fit, truncated (not rounded) to two decimals
    alpha = int(cmca.alpha * 100) / 100

    plt.figure(figsize=[8, 8])
    # foreground is drawn first, then background
    for coords, party in ((Y_fg_row, fg_party), (Y_bg_row, bg_party)):
        color, label = party_style[party]
        plt.scatter(coords[:, 0], coords[:, 1], c=color, s=5, label=label)
    plt.legend(loc="best", shadow=False, scatterpoints=1, fontsize=10)
    plt.title(f"cMCA row coords. alpha = {alpha}")
    plt.show()