Commit 9aa8460
Add DINA examples and docs. Update docs of IRT and KaNCD. Modify pytest-flake8 version requirement in setup.py
LegionKing committed Sep 21, 2024
1 parent 550a3eb commit 9aa8460
Showing 7 changed files with 90 additions and 158 deletions.
142 changes: 0 additions & 142 deletions EduCDM/DINA/DINA.py
@@ -45,102 +45,6 @@ def forward(self, user, item, knowledge, *args):
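# n is the DINA latent response (eta_ij): 1 iff the learner has mastered every skill the item requires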
return (1 - slip) ** n * guess ** (1 - n)


# class STEFunction(autograd.Function):
# @staticmethod
# def forward(ctx, input):
# return (input > 0).float()

# @staticmethod
# def backward(ctx, grad_output):
# return F.hardtanh(grad_output)


# class StraightThroughEstimator(nn.Module):
# def __init__(self):
# super(StraightThroughEstimator, self).__init__()

# def forward(self, x):
# x = STEFunction.apply(x)
# return x


# class STEDINANet(DINANet):
# def __init__(self, user_num, item_num, hidden_dim, max_slip=0.4, max_guess=0.4, *args, **kwargs):
# super(STEDINANet, self).__init__(user_num, item_num, hidden_dim, max_slip, max_guess, *args, **kwargs)
# self.sign = StraightThroughEstimator()

# def forward(self, user, item, knowledge, *args):
# theta = self.sign(self.theta(user))
# slip = torch.squeeze(torch.sigmoid(self.slip(item)) * self.max_slip)
# guess = torch.squeeze(torch.sigmoid(self.guess(item)) * self.max_guess)
# mask_theta = (knowledge == 0) + (knowledge == 1) * theta
# n = torch.prod((mask_theta + 1) / 2, dim=-1)
# return torch.pow(1 - slip, n) * torch.pow(guess, 1 - n)


# class DINA(CDM):
# def __init__(self, user_num, item_num, hidden_dim, ste=False):
# super(DINA, self).__init__()
# if ste:
# self.dina_net = STEDINANet(user_num, item_num, hidden_dim)
# else:
# self.dina_net = DINANet(user_num, item_num, hidden_dim)

# def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
# self.dina_net = self.dina_net.to(device)
# loss_function = nn.BCELoss()

# trainer = torch.optim.Adam(self.dina_net.parameters(), lr)

# for e in range(epoch):
# losses = []
# for batch_data in tqdm(train_data, "Epoch %s" % e):
# user_id, item_id, knowledge, response = batch_data
# user_id: torch.Tensor = user_id.to(device)
# item_id: torch.Tensor = item_id.to(device)
# knowledge: torch.Tensor = knowledge.to(device)
# predicted_response: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
# response: torch.Tensor = response.to(device)
# loss = loss_function(predicted_response, response)

# # back propagation
# trainer.zero_grad()
# loss.backward()
# trainer.step()

# losses.append(loss.mean().item())
# print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses))))

# if test_data is not None:
# auc, accuracy = self.eval(test_data, device=device)
# print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy))

# def eval(self, test_data, device="cpu") -> tuple:
# self.dina_net = self.dina_net.to(device)
# self.dina_net.eval()
# y_pred = []
# y_true = []
# for batch_data in tqdm(test_data, "evaluating"):
# user_id, item_id, knowledge, response = batch_data
# user_id: torch.Tensor = user_id.to(device)
# item_id: torch.Tensor = item_id.to(device)
# knowledge: torch.Tensor = knowledge.to(device)
# pred: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
# y_pred.extend(pred.tolist())
# y_true.extend(response.tolist())

# self.dina_net.train()
# return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5)

# def save(self, filepath):
# torch.save(self.dina_net.state_dict(), filepath)
# logging.info("save parameters to %s" % filepath)

# def load(self, filepath):
# self.dina_net.load_state_dict(torch.load(filepath))
# logging.info("load parameters from %s" % filepath)


class DINA(CDM):

def __init__(self, meta_data: dict,
@@ -314,49 +218,3 @@ def load(self, filepath):
self.net.load_state_dict(torch.load(filepath, map_location=lambda s, loc: s))
logging.info("load parameters from %s" % filepath)


# def _test():
# train_data = pd.read_csv('../tests/data/train_0.8_0.2.csv').head(100)
# q_matrix = np.loadtxt('../tests/data/Q_matrix.txt')
# # train_data = pd.DataFrame({
# # 'userId': [
# # '001', '001', '001', '001', '002', '002',
# # '002', '002', '003', '003', '003', '003'],
# # 'itemId': [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3],
# # 'response': [0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1]
# # })
# # q_matrix = np.array([
# # [1, 1, 0, 0],
# # [0, 1, 1, 0],
# # [0, 0, 1, 1],
# # [1, 0, 0, 1]
# # ])

# train_data['skill'] = 0
# for id in range(train_data.shape[0]):
# item_id = train_data.loc[id, 'itemId']
# concepts = np.where(
# q_matrix[item_id] > 0)[0].tolist()
# train_data.loc[id, 'skill'] = str(concepts)
# meta_data = {'userId': [], 'itemId': [], 'skill': []}
# meta_data['userId'] = train_data['userId'].unique().tolist()
# meta_data['itemId'] = train_data['itemId'].unique().tolist()
# meta_data['skill'] = [i for i in range(q_matrix.shape[1])]

# dina = DINA(meta_data)
# dina.fit(
# train_data,
# val_data=train_data,
# batch_size=4, epoch=5, lr=0.01)
# dina.save('./dina.pt')
# new_dina = DINA(meta_data)
# new_dina.load('./dina.pt')
# new_dina.fit(
# train_data,
# val_data=train_data,
# batch_size=1, epoch=3, lr=0.01)
# new_dina.eval(train_data)


# if __name__ == '__main__':
# _test()
50 changes: 38 additions & 12 deletions docs/DINA.md
@@ -1,17 +1,43 @@
# Deterministic Inputs, Noisy “And” gate model
An implementation of the classical cognitive diagnosis model DINA (Deterministic Inputs, Noisy “And” gate). The training process is adapted to use gradient descent. For the details of DINA, please refer to the Appendix of the paper: *[DINA model and parameter estimation: A didactic](https://journals.sagepub.com/doi/10.3102/1076998607309474)*.

```bibtex
@article{de2009dina,
  title={DINA model and parameter estimation: A didactic},
  author={De La Torre, Jimmy},
  journal={Journal of Educational and Behavioral Statistics},
  volume={34},
  number={1},
  pages={115--130},
  year={2009},
  publisher={Sage Publications Sage CA: Los Angeles, CA}
}
```

If this repository is helpful for you, please cite our work:

```bibtex
@misc{bigdata2021educdm,
  title={EduCDM},
  author={bigdata-ustc},
  publisher={GitHub},
  journal={GitHub repository},
  year={2021},
  howpublished={\url{https://github.com/bigdata-ustc/EduCDM}},
}
```

## Model description
The DINA model is a classical cognitive diagnosis model in which each learner $i$ is represented by a binary vector ($[\alpha_{ik}], k=1,2,\ldots,K$ in the following figure) indicating the learner's knowledge mastery pattern. A Q-matrix $Q \in \{0, 1\}^{J\times K}$ indicates the relevant skills (or knowledge components) of the test items. For each test item $j$, there are probabilities of slipping on it and of guessing the correct answer, characterized by the parameters $s_j$ and $g_j$ respectively. Overall, the probability that learner $i$ provides a correct response to item $j$ is calculated as follows:
$$Pr(X_{ij}=1|\alpha_i,q_j, s_j, g_j) = (1-s_j)^{\eta_{ij}}g_j^{1-\eta_{ij}},$$

where

$$
\eta_{ij} = \prod_{k=1}^{K}\alpha_{ik}^{q_{jk}}.
$$

<img src="_static/DINA.png" width="90%">
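
As a quick numeric check, here is a minimal sketch of the two formulas above in plain NumPy (the mastery pattern, Q-matrix row, and parameter values are made-up toy numbers):

```python
import numpy as np

alpha_i = np.array([1, 0, 1])   # learner i masters skills 1 and 3
q_j = np.array([1, 0, 1])       # item j requires skills 1 and 3
s_j, g_j = 0.2, 0.1             # slip and guess probabilities

eta = np.prod(alpha_i ** q_j)               # 1 iff every required skill is mastered
prob = (1 - s_j) ** eta * g_j ** (1 - eta)  # (1-s)^eta * g^(1-eta)
print(eta, prob)                            # -> 1 0.8
```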

## Parameters description

| Parameters | Type | Description |
| ---------- | ---- | ---------------------------------------- |
| meta_data | dict | a dictionary containing all the userIds, itemIds, and skills. |
| max_slip | float | the maximum value of the slip probability $s_j$. Default: 0.4 |
| max_guess | float | the maximum value of the guess probability $g_j$. Default: 0.4 |


## Methods summary

| Methods | Description |
| ----------------- | ---------------------------------------- |
| fit | Fits the model to the training data. |
| fit_predict | Uses the model to predict the responses in the testing data and returns the results. The responses are either 1 (i.e., correct answer) or 0 (i.e., incorrect answer). |
| fit_predict_proba | Uses the model to predict the responses in the testing data and returns the probabilities (that the correct answers will be provided). |
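
A minimal usage sketch consistent with the tables above (the toy DataFrame and hyperparameters are illustrative; the stringified `skill` lists follow the format used elsewhere in this repository):

```python
import pandas as pd
from EduCDM import DINA

# Toy response log: three learners, two items, three knowledge concepts.
train_data = pd.DataFrame({
    'userId': ['001', '001', '002', '002', '003', '003'],
    'itemId': [0, 1, 0, 1, 0, 1],
    'skill': ['[0, 1]', '[1, 2]', '[0, 1]', '[1, 2]', '[0, 1]', '[1, 2]'],
    'response': [0, 1, 1, 0, 1, 1],
})
meta_data = {
    'userId': train_data['userId'].unique().tolist(),
    'itemId': train_data['itemId'].unique().tolist(),
    'skill': [0, 1, 2],
}

dina = DINA(meta_data)
dina.fit(train_data, val_data=train_data, batch_size=2, epoch=2, lr=0.01)
labels = dina.fit_predict(train_data)        # hard 0/1 predictions
probs = dina.fit_predict_proba(train_data)   # probabilities of a correct response
```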

2 changes: 1 addition & 1 deletion docs/IRT.md
@@ -33,7 +33,7 @@ In EMIRT, EM algorithm is adopted to estimate the parameters.
| dim | int | the dimension of student's ability. Default: 1 |
| skip_value | int | the skip_value of the item response matrix. Default: -1 |

##Examples
## Examples

```python
import pandas as pd
4 changes: 2 additions & 2 deletions docs/KaNCD.md
@@ -24,11 +24,11 @@ KaNCD is a **K**nowledge-**a**ssociation based extension of the **N**eural**CD*

The knowledge difficulty vector of an exercise is calculated from the latent trait of the exercise and the latent trait of each knowledge concept.

![KDM_MF](F:\git_project\EduCDM\EduCDM\docs\_static\KDM_MF.png)
![KDM_MF](_static/KDM_MF.png)

Similarly, the knowledge proficiency vector of a student is calculated from the latent trait of the student and the latent trait of each knowledge concept.

![KPM_MF](F:\git_project\EduCDM\EduCDM\docs\_static\KPM_MF.png)
![KPM_MF](_static/KPM_MF.png)
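
For intuition, here is a hedged sketch of an MF-style interaction (the tensor names, shapes, and the inner-product form are illustrative assumptions, not the repository's exact code):

```python
import torch

I, J, K, d = 10, 5, 4, 8            # students, exercises, knowledge concepts, latent dim
student_emb = torch.randn(I, d)     # latent trait of each student
exercise_emb = torch.randn(J, d)    # latent trait of each exercise
knowledge_emb = torch.randn(K, d)   # latent trait of each knowledge concept

# One value per (exercise, concept) and (student, concept) pair, squashed into (0, 1).
k_difficulty = torch.sigmoid(exercise_emb @ knowledge_emb.T)  # shape (J, K)
proficiency = torch.sigmoid(student_emb @ knowledge_emb.T)    # shape (I, K)
```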

Please refer to the paper for more details.

48 changes: 48 additions & 0 deletions examples/DINA/DINA.py
@@ -0,0 +1,48 @@
# coding: utf-8
# @ WangFei
import logging
import pandas as pd
from EduCDM import DINA
# from EduData import get_data  # only needed for the one-off dataset download below


# get_data("cdbd-a0910", "../../data") # Download dataset "cdbd-a0910"

# load data and transform it to the required format
train_data = pd.read_csv("../../data/a0910/train.csv")
valid_data = pd.read_csv("../../data/a0910/valid.csv")
test_data = pd.read_csv("../../data/a0910/test.csv")
df_item = pd.read_csv("../../data/a0910/item.csv")
knowledge_set, item_set = set(), set()
for i, s in df_item.iterrows():
item_id, knowledge_codes = s['item_id'], list(set(eval(s['knowledge_code'])))
knowledge_set.update(knowledge_codes)
item_set.add(item_id)
userIds = train_data['user_id'].unique()
meta_data = {'userId': list(userIds), 'itemId': list(item_set), 'skill': list(knowledge_set)}
train_data = (pd.merge(train_data, df_item, how='left', on='item_id')
.rename(columns={'user_id': 'userId', 'item_id': 'itemId', 'knowledge_code': 'skill', 'score': 'response'}))
valid_data = (pd.merge(valid_data, df_item, how='left', on='item_id')
              .rename(columns={'user_id': 'userId', 'item_id': 'itemId', 'knowledge_code': 'skill', 'score': 'response'}))
test_data = (pd.merge(test_data, df_item, how='left', on='item_id')
             .rename(columns={'user_id': 'userId', 'item_id': 'itemId', 'knowledge_code': 'skill', 'score': 'response'}))
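# Each row now has the columns DINA expects, e.g. (values illustrative):
#   userId  itemId  skill     response
#   1       101     '[2, 3]'  1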

# model training
batch_size = 32
logging.getLogger().setLevel(logging.INFO)
cdm = DINA(meta_data)
cdm.fit(train_data, epoch=1, val_data=valid_data, device="cuda")

# predict using the trained model
print(cdm.predict(test_data))

# save model
cdm.save("dina.snapshot")

# load model and evaluate it on the test set
cdm.load("dina.snapshot")
auc, accuracy = cdm.eval(test_data)
print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))


Binary file added examples/DINA/dina.snapshot
2 changes: 1 addition & 1 deletion setup.py
@@ -4,7 +4,7 @@
'pytest>=4',
'pytest-cov>=2.6.0',
# 'pytest-flake8==4.0.1',
'pytest-flake8<5.0.0',
'pytest-flake8<1.1.2',  # versions >=1.1.2 are incompatible with flake8>=5.0.0 (e.g., flake8.options.config.load_config)
'flake8<5.0.0'
]

