Commit 9aa8460
Add DINA examples and docs. Update docs of IRT and KaNCD. Modify pytest-flake8 version requirement in setup.py
LegionKing committed Sep 21, 2024
1 parent 550a3eb commit 9aa8460
Showing 7 changed files with 90 additions and 158 deletions.
142 changes: 0 additions & 142 deletions EduCDM/DINA/DINA.py
@@ -45,102 +45,6 @@ def forward(self, user, item, knowledge, *args):
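# n is the DINA latent response (eta_ij): 1 iff the learner has mastered every skill the item requires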
return (1 - slip) ** n * guess ** (1 - n)


# class STEFunction(autograd.Function):
# @staticmethod
# def forward(ctx, input):
# return (input > 0).float()

# @staticmethod
# def backward(ctx, grad_output):
# return F.hardtanh(grad_output)


# class StraightThroughEstimator(nn.Module):
# def __init__(self):
# super(StraightThroughEstimator, self).__init__()

# def forward(self, x):
# x = STEFunction.apply(x)
# return x


# class STEDINANet(DINANet):
# def __init__(self, user_num, item_num, hidden_dim, max_slip=0.4, max_guess=0.4, *args, **kwargs):
# super(STEDINANet, self).__init__(user_num, item_num, hidden_dim, max_slip, max_guess, *args, **kwargs)
# self.sign = StraightThroughEstimator()

# def forward(self, user, item, knowledge, *args):
# theta = self.sign(self.theta(user))
# slip = torch.squeeze(torch.sigmoid(self.slip(item)) * self.max_slip)
# guess = torch.squeeze(torch.sigmoid(self.guess(item)) * self.max_guess)
# mask_theta = (knowledge == 0) + (knowledge == 1) * theta
# n = torch.prod((mask_theta + 1) / 2, dim=-1)
# return torch.pow(1 - slip, n) * torch.pow(guess, 1 - n)


# class DINA(CDM):
# def __init__(self, user_num, item_num, hidden_dim, ste=False):
# super(DINA, self).__init__()
# if ste:
# self.dina_net = STEDINANet(user_num, item_num, hidden_dim)
# else:
# self.dina_net = DINANet(user_num, item_num, hidden_dim)

# def train(self, train_data, test_data=None, *, epoch: int, device="cpu", lr=0.001) -> ...:
# self.dina_net = self.dina_net.to(device)
# loss_function = nn.BCELoss()

# trainer = torch.optim.Adam(self.dina_net.parameters(), lr)

# for e in range(epoch):
# losses = []
# for batch_data in tqdm(train_data, "Epoch %s" % e):
# user_id, item_id, knowledge, response = batch_data
# user_id: torch.Tensor = user_id.to(device)
# item_id: torch.Tensor = item_id.to(device)
# knowledge: torch.Tensor = knowledge.to(device)
# predicted_response: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
# response: torch.Tensor = response.to(device)
# loss = loss_function(predicted_response, response)

# # back propagation
# trainer.zero_grad()
# loss.backward()
# trainer.step()

# losses.append(loss.mean().item())
# print("[Epoch %d] LogisticLoss: %.6f" % (e, float(np.mean(losses))))

# if test_data is not None:
# auc, accuracy = self.eval(test_data, device=device)
# print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (e, auc, accuracy))

# def eval(self, test_data, device="cpu") -> tuple:
# self.dina_net = self.dina_net.to(device)
# self.dina_net.eval()
# y_pred = []
# y_true = []
# for batch_data in tqdm(test_data, "evaluating"):
# user_id, item_id, knowledge, response = batch_data
# user_id: torch.Tensor = user_id.to(device)
# item_id: torch.Tensor = item_id.to(device)
# knowledge: torch.Tensor = knowledge.to(device)
# pred: torch.Tensor = self.dina_net(user_id, item_id, knowledge)
# y_pred.extend(pred.tolist())
# y_true.extend(response.tolist())

# self.dina_net.train()
# return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5)

# def save(self, filepath):
# torch.save(self.dina_net.state_dict(), filepath)
# logging.info("save parameters to %s" % filepath)

# def load(self, filepath):
# self.dina_net.load_state_dict(torch.load(filepath))
# logging.info("load parameters from %s" % filepath)


class DINA(CDM):

def __init__(self, meta_data: dict,
@@ -314,49 +218,3 @@ def load(self, filepath):
self.net.load_state_dict(torch.load(filepath, map_location=lambda s, loc: s))
logging.info("load parameters from %s" % filepath)


# def _test():
# train_data = pd.read_csv('../tests/data/train_0.8_0.2.csv').head(100)
# q_matrix = np.loadtxt('../tests/data/Q_matrix.txt')
# # train_data = pd.DataFrame({
# # 'userId': [
# # '001', '001', '001', '001', '002', '002',
# # '002', '002', '003', '003', '003', '003'],
# # 'itemId': [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3],
# # 'response': [0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1]
# # })
# # q_matrix = np.array([
# # [1, 1, 0, 0],
# # [0, 1, 1, 0],
# # [0, 0, 1, 1],
# # [1, 0, 0, 1]
# # ])

# train_data['skill'] = 0
# for id in range(train_data.shape[0]):
# item_id = train_data.loc[id, 'itemId']
# concepts = np.where(
# q_matrix[item_id] > 0)[0].tolist()
# train_data.loc[id, 'skill'] = str(concepts)
# meta_data = {'userId': [], 'itemId': [], 'skill': []}
# meta_data['userId'] = train_data['userId'].unique().tolist()
# meta_data['itemId'] = train_data['itemId'].unique().tolist()
# meta_data['skill'] = [i for i in range(q_matrix.shape[1])]

# dina = DINA(meta_data)
# dina.fit(
# train_data,
# val_data=train_data,
# batch_size=4, epoch=5, lr=0.01)
# dina.save('./dina.pt')
# new_dina = DINA(meta_data)
# new_dina.load('./dina.pt')
# new_dina.fit(
# train_data,
# val_data=train_data,
# batch_size=1, epoch=3, lr=0.01)
# new_dina.eval(train_data)


# if __name__ == '__main__':
# _test()
50 changes: 38 additions & 12 deletions docs/DINA.md
@@ -1,17 +1,43 @@
# Deterministic Inputs, Noisy “And” gate model
An implementation of the classical cognitive diagnosis model DINA (Deterministic Inputs, Noisy “And” gate). The training process is adapted to use gradient descent. For the details of DINA, please refer to the Appendix of the paper: *[DINA model and parameter estimation: A didactic](https://journals.sagepub.com/doi/10.3102/1076998607309474)*.

```bibtex
@article{de2009dina,
  title={DINA model and parameter estimation: A didactic},
  author={De La Torre, Jimmy},
  journal={Journal of Educational and Behavioral Statistics},
  volume={34},
  number={1},
  pages={115--130},
  year={2009},
  publisher={Sage Publications Sage CA: Los Angeles, CA}
}
```

If this repository is helpful for you, please cite our work:

```bibtex
@misc{bigdata2021educdm,
  title={EduCDM},
  author={bigdata-ustc},
  publisher={GitHub},
  journal={GitHub repository},
  year={2021},
  howpublished={\url{https://github.com/bigdata-ustc/EduCDM}},
}
```

## Model description
The DINA model is a classical cognitive diagnosis model in which each learner $i$ is represented by a binary vector ($[\alpha_{ik}], k=1,2,\ldots,K$ in the following figure) indicating the learner's knowledge mastery pattern. A Q-matrix $Q \in \{0, 1\}^{J\times K}$ indicates the relevant skills (or knowledge components) of the test items. For each test item $j$, there are probabilities of slipping on it and of guessing the correct answer, characterized by the parameters $s_j$ and $g_j$ respectively. Overall, the probability that learner $i$ provides a correct response to item $j$ is calculated as follows:
$$Pr(X_{ij}=1|\alpha_i,q_j, s_j, g_j) = (1-s_j)^{\eta_{ij}}g_j^{1-\eta_{ij}},$$

where

$$
\eta_{ij} = \prod_{k=1}^{K}\alpha_{ik}^{q_{jk}}.
$$

<img src="_static/DINA.png" width="90%">
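
As a quick numeric check, here is a minimal sketch of the two formulas above in plain NumPy (the mastery pattern, Q-matrix row, and parameter values are made-up toy numbers):

```python
import numpy as np

alpha_i = np.array([1, 0, 1])   # learner i masters skills 1 and 3
q_j = np.array([1, 0, 1])       # item j requires skills 1 and 3
s_j, g_j = 0.2, 0.1             # slip and guess probabilities

eta = np.prod(alpha_i ** q_j)               # 1 iff every required skill is mastered
prob = (1 - s_j) ** eta * g_j ** (1 - eta)  # (1-s)^eta * g^(1-eta)
print(eta, prob)                            # -> 1 0.8
```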

## Parameters description

| Parameters | Type | Description |
| ---------- | ---- | ---------------------------------------- |
| meta_data | dict | a dictionary containing all the userIds, itemIds, and skills. |
| max_slip | float | the maximum value of the slip probability $s_j$. Default: 0.4 |
| max_guess | float | the maximum value of the guess probability $g_j$. Default: 0.4 |


## Methods summary

| Methods | Description |
| ----------------- | ---------------------------------------- |
| fit | Fits the model to the training data. |
| fit_predict | Uses the model to predict the responses in the testing data and returns the results. The responses are either 1 (i.e., correct answer) or 0 (i.e., incorrect answer). |
| fit_predict_proba | Uses the model to predict the responses in the testing data and returns the probabilities (that the correct answers will be provided). |
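
A minimal usage sketch consistent with the tables above (the toy DataFrame and hyperparameters are illustrative; the stringified `skill` lists follow the format used elsewhere in this repository):

```python
import pandas as pd
from EduCDM import DINA

# Toy response log: three learners, two items, three knowledge concepts.
train_data = pd.DataFrame({
    'userId': ['001', '001', '002', '002', '003', '003'],
    'itemId': [0, 1, 0, 1, 0, 1],
    'skill': ['[0, 1]', '[1, 2]', '[0, 1]', '[1, 2]', '[0, 1]', '[1, 2]'],
    'response': [0, 1, 1, 0, 1, 1],
})
meta_data = {
    'userId': train_data['userId'].unique().tolist(),
    'itemId': train_data['itemId'].unique().tolist(),
    'skill': [0, 1, 2],
}

dina = DINA(meta_data)
dina.fit(train_data, val_data=train_data, batch_size=2, epoch=2, lr=0.01)
labels = dina.fit_predict(train_data)        # hard 0/1 predictions
probs = dina.fit_predict_proba(train_data)   # probabilities of a correct response
```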

2 changes: 1 addition & 1 deletion docs/IRT.md
@@ -33,7 +33,7 @@ In EMIRT, EM algorithm is adopted to estimate the parameters.
| dim | int | the dimension of student's ability. Default: 1 |
| skip_value | int | the skip_value of the item response matrix. Default: -1 |

##Examples
## Examples

```python
import pandas as pd
4 changes: 2 additions & 2 deletions docs/KaNCD.md
@@ -24,11 +24,11 @@ KaNCD is a **K**nowledge-**a**ssociation based extension of the **N**eural**CD*

The knowledge difficulty vector of an exercise is calculated from the latent trait of the exercise and the latent trait of each knowledge concept.

![KDM_MF](F:\git_project\EduCDM\EduCDM\docs\_static\KDM_MF.png)
![KDM_MF](_static/KDM_MF.png)

Similarly, the knowledge proficiency vector of a student is calculated from the latent trait of the student and the latent trait of each knowledge concept.

![KPM_MF](F:\git_project\EduCDM\EduCDM\docs\_static\KPM_MF.png)
![KPM_MF](_static/KPM_MF.png)
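
For intuition, here is a hedged sketch of an MF-style interaction (the tensor names, shapes, and the inner-product form are illustrative assumptions, not the repository's exact code):

```python
import torch

I, J, K, d = 10, 5, 4, 8            # students, exercises, knowledge concepts, latent dim
student_emb = torch.randn(I, d)     # latent trait of each student
exercise_emb = torch.randn(J, d)    # latent trait of each exercise
knowledge_emb = torch.randn(K, d)   # latent trait of each knowledge concept

# One value per (exercise, concept) and (student, concept) pair, squashed into (0, 1).
k_difficulty = torch.sigmoid(exercise_emb @ knowledge_emb.T)  # shape (J, K)
proficiency = torch.sigmoid(student_emb @ knowledge_emb.T)    # shape (I, K)
```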

Please refer to the paper for more details.

48 changes: 48 additions & 0 deletions examples/DINA/DINA.py
@@ -0,0 +1,48 @@
# coding: utf-8
# @ WangFei
import logging
import pandas as pd
from EduCDM import DINA
# from EduData import get_data  # only needed for the one-off dataset download below


# get_data("cdbd-a0910", "../../data") # Download dataset "cdbd-a0910"

# load data and transform it to the required format
train_data = pd.read_csv("../../data/a0910/train.csv")
valid_data = pd.read_csv("../../data/a0910/valid.csv")
test_data = pd.read_csv("../../data/a0910/test.csv")
df_item = pd.read_csv("../../data/a0910/item.csv")
knowledge_set, item_set = set(), set()
for i, s in df_item.iterrows():
item_id, knowledge_codes = s['item_id'], list(set(eval(s['knowledge_code'])))
knowledge_set.update(knowledge_codes)
item_set.add(item_id)
userIds = train_data['user_id'].unique()
meta_data = {'userId': list(userIds), 'itemId': list(item_set), 'skill': list(knowledge_set)}
train_data = (pd.merge(train_data, df_item, how='left', on='item_id')
.rename(columns={'user_id': 'userId', 'item_id': 'itemId', 'knowledge_code': 'skill', 'score': 'response'}))
valid_data = (pd.merge(valid_data, df_item, how='left', on='item_id')
              .rename(columns={'user_id': 'userId', 'item_id': 'itemId', 'knowledge_code': 'skill', 'score': 'response'}))
test_data = (pd.merge(test_data, df_item, how='left', on='item_id')
             .rename(columns={'user_id': 'userId', 'item_id': 'itemId', 'knowledge_code': 'skill', 'score': 'response'}))
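# Each row now has the columns DINA expects, e.g. (values illustrative):
#   userId  itemId  skill     response
#   1       101     '[2, 3]'  1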

# model training
batch_size = 32
logging.getLogger().setLevel(logging.INFO)
cdm = DINA(meta_data)
cdm.fit(train_data, epoch=1, val_data=valid_data, device="cuda")

# predict using the trained model
print(cdm.predict(test_data))

# save model
cdm.save("dina.snapshot")

# load model and evaluate it on the test set
cdm.load("dina.snapshot")
auc, accuracy = cdm.eval(test_data)
print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))


Binary file added examples/DINA/dina.snapshot
2 changes: 1 addition & 1 deletion setup.py
@@ -4,7 +4,7 @@
'pytest>=4',
'pytest-cov>=2.6.0',
# 'pytest-flake8==4.0.1',
'pytest-flake8<5.0.0',
'pytest-flake8<1.1.2',  # versions >=1.1.2 are incompatible with flake8>=5.0.0 (e.g., flake8.options.config.load_config)
'flake8<5.0.0'
]

