EMIRT push

bigdata-ustc · Dec 31, 2023 · be43483 · be43483
1 parent 5ce4ddb
commit be43483
Show file tree

Hide file tree

Showing 13 changed files with 474 additions and 1 deletion.
diff --git a/EduCDM/IRT/EM/IRT.py b/EduCDM/IRT/EM/IRT.py
@@ -0,0 +1,314 @@
+# coding: utf-8
+# 2023/12/26 @ ChenSiHang
+import pandas as pd
+import sys
+import logging
+from typing import Any
+import numpy as np
+import pickle
+from pandas.core.api import DataFrame as DataFrame
+from tqdm import tqdm
+from scipy import stats
+sys.path.append("..")
+from irt import irt3pl
+from EduCDM import CDM, re_index
+
+
+def init_parameters(prob_num, dim):
+    r"""
+    Purpose:
+        Initialize the parameters of IRT model
+    Parameter: 
+        prob_num: the number of problems
+        dim: the dimension of  student's ability
+    Return value:
+        alpha: Discrimination of the problems
+        beta: Difficulty of the problems
+        gamma: Guess parameters of the problems
+    """
+    alpha = stats.norm.rvs(loc=0.75, scale=0.01, size=(prob_num, dim))
+    beta = stats.norm.rvs(size=(prob_num, dim))
+    gamma = stats.uniform.rvs(size=prob_num)
+    return alpha, beta, gamma
+
+
+def init_prior_prof_distribution(dim):
+    r"""
+    Purpose:
+        Initialize the prior distribution of student abilities
+    Parameter:
+        dim: Dimension of student's ability
+    Return value:
+        prof: Uniform distribution from -4 to 4, 100*dim matrix
+        norm_dis: Normalized distribution of student abilities
+    """
+    prof = stats.uniform.rvs(loc=-4, scale=8, size=(100, dim))  # shape = (100,dim)
+    dis = stats.multivariate_normal.pdf(prof, mean=np.zeros(dim), cov=np.identity(dim))
+    norm_dis = dis / np.sum(dis)  # shape = (100,)
+    return prof, norm_dis
+
+
+def get_Likelihood(a, b, c, prof, R):
+    r"""
+    Purpose:
+        get the likelihood function
+    Parameter:
+        a: Discrimination of the problems
+        b: Difficulty of the problems
+        c: Guess parameters of the problems
+        prof: Normalized distribution of student's ability
+        R: matrix of item response
+    Return value:
+        prof_prob: Probability matrix for 100 ability levels of people answering each question correctly
+        prob_stu: Probability matrix of which ability level a student belongs to
+    """
+    stu_num, prob_num = R.shape[0], R.shape[1]
+    prof_prob = irt3pl(np.sum(a * (np.expand_dims(prof, axis=1) - b), axis=-1), 1, 0, c)  # shape = (100, prob_num)
+    tmp1, tmp2 = np.zeros(shape=(prob_num, stu_num)), np.zeros(shape=(prob_num, stu_num))
+    tmp1[np.where(R == 1)[1], np.where(R == 1)[0]] = 1 # shape = (prob_num, stu_num)
+    tmp2[np.where(R == 0)[1], np.where(R == 0)[0]] = 1 # shape = (prob_num, stu_num)
+    prob_stu = np.exp(np.dot(np.log(prof_prob + 1e-9), tmp1) + np.dot(np.log(1 - prof_prob + 1e-9), tmp2))
+    return prof_prob, prob_stu # shape = (100, prob_num), (100, stu_num)
+
+
+def update_prior(prior_dis, prof_stu_like):
+    r"""
+    Purpose:
+        update the prior distribution of student abilities
+    Parameter:
+        prior_dis: prior distribution of student abilities
+        prof_stu_like: Probability matrix of which ability level a student belongs to
+    Return value:
+        update_prior_dis: updated prior distribution of student abilities
+        norm_dis_like: Normalized distribution of student abilities
+    """
+    dis_like = prof_stu_like * np.expand_dims(prior_dis, axis=1)  # shape = (100, stu_num)
+    norm_dis_like = dis_like / np.sum(dis_like, axis=0) # shape = (100, stu_num)
+    update_prior_dis = np.sum(norm_dis_like, axis=1) / np.sum(norm_dis_like) # shape = (100,)
+    return update_prior_dis, norm_dis_like # shape = (100,), (100, stu_num)
+
+
+def update_irt(a, b, c, D, prof, R, r_ek, s_ek, lr, epoch=10, epsilon=1e-3):
+    r"""
+    Purpose:
+        update the parameters of IRT model
+    Parameter:
+        a: Discrimination of the problems
+        b: Difficulty of the problems
+        c: Guess parameters of the problems
+        D: the value of D
+        prof: Normalized distribution of student's ability
+        R: matrix of item response
+        r_ek: the number of students who answered correctly
+        s_ek: the number of students who answered
+        lr: learning rate
+        epoch: the number of iterations
+        epsilon: threshold of convergence
+    Return value:
+        a: Discrimination of the problems
+        b: Difficulty of the problems
+        c: Guess parameters of the problems
+    """
+    for iteration in range(epoch):
+        a_tmp, b_tmp, c_tmp = np.copy(a), np.copy(b), np.copy(c)
+        prof_prob, _ = get_Likelihood(a, b, c, prof, R)
+        common_term = (r_ek - s_ek * prof_prob) / prof_prob / (1 - c + 1e-9)  # shape = (100, prob_num)
+        a_1 = np.transpose(
+            D * common_term * (prof_prob - c) * np.transpose(np.expand_dims(prof, axis=1) - b, (2, 0, 1)), (1, 2, 0))
+        b_1 = D * common_term * (c - prof_prob)
+        a_grad = np.sum(a_1, axis=0)
+        b_grad = a * np.expand_dims(np.sum(b_1, axis=0), axis=1)
+        c_grad = np.sum(common_term, axis=0)
+        a = a + lr * a_grad
+        b = b + lr * b_grad
+        c = np.clip(c + lr * c_grad, 0, 1)
+        change = max(np.max(np.abs(a - a_tmp)), np.max(np.abs(b - b_tmp)), np.max(np.abs(c - c_tmp)))
+        if iteration > 5 and change < epsilon:
+            break
+    return a, b, c
+
+
+class IRT(CDM):
+    r"""
+    IRT model, training (EM) and testing methods
+    Args:
+        meta_data: a dictionary containing all the userIds, itemIds, and skills.
+        dim: int
+            dimension of student/problem embedding, MIRT for dim > 1
+        skip_value: int
+            Unavailable value in matrix need to be skipped
+    """
+    def __init__(self, meta_data: dict, dim=1, skip_value=-1):
+        super(IRT, self).__init__()
+        self.id_reindex, _ = re_index(meta_data)
+        self.stu_num = len(self.id_reindex['userId'])
+        self.prob_num = len(self.id_reindex['itemId'])
+        self.dim = dim
+        self.skip_value = skip_value
+        self.a, self.b, self.c = init_parameters(self.prob_num, dim)
+        self.D = 1.702
+        self.prof, self.prior_dis = init_prior_prof_distribution(dim)
+        self.stu_prof = np.zeros(shape=(self.stu_num, dim))
+
+
+    def transform__(self, train_data: pd.DataFrame):
+        r"""
+        transform the train data to matrix of item response
+        
+        Args:
+            train_data: training dataset
+        Return value:
+            R: matrix of item response
+        """
+        R = np.full((self.stu_num, self.prob_num), self.skip_value)
+        for index, i in tqdm(train_data.iterrows(), "transforming"):
+            stu, test_id, true_score = i['userId'], i['itemId'], i['response']
+            re_stu_id, re_item_id = self.id_reindex['userId'][stu], self.id_reindex['itemId'][test_id]
+            R[re_stu_id, re_item_id] = true_score
+        return R
+
+
+    def fit(self, train_data: pd.DataFrame, lr, epoch=10, epoch_m=10, epsilon=1e-3):
+        r"""
+        Train the IRT model
+        This function uses the EM algorithm to train the model, in which the M step uses gradient descent to maximize parameters.
+        
+        Args:
+            train_data: training dataset
+            test_data: test dataset
+            stu_num: the number of students
+            prob_num: the number of problems
+        Return value:
+            R: matrix of item response
+            test_set: test dataset
+        """
+        R = self.transform__(train_data)
+        a, b, c = np.copy(self.a), np.copy(self.b), np.copy(self.c)
+        prior_dis = np.copy(self.prior_dis)
+        for iteration in range(epoch):
+            a_tmp, b_tmp, c_tmp, prior_dis_tmp = np.copy(a), np.copy(b), np.copy(c), np.copy(prior_dis)
+            prof_prob_like, prof_stu_like = get_Likelihood(a, b, c, self.prof,  R)
+            prior_dis, norm_dis_like = update_prior(prior_dis, prof_stu_like)
+            r_1 = np.zeros(shape=(self.stu_num, self.prob_num))
+            r_1[np.where( R == 1)[0], np.where( R == 1)[1]] = 1
+            r_ek = np.dot(norm_dis_like, r_1)  # shape = (100, prob_num)
+            r_1[np.where( R != self.skip_value)[0], np.where( R != self.skip_value)[1]] = 1
+            s_ek = np.dot(norm_dis_like, r_1)  # shape = (100, prob_num)
+            a, b, c = update_irt(a, b, c, self.D, self.prof,  R, r_ek, s_ek, lr, epoch_m, epsilon)
+            change = max(np.max(np.abs(a - a_tmp)), np.max(np.abs(b - b_tmp)), np.max(np.abs(c - c_tmp)),
+                         np.max(np.abs(prior_dis_tmp - prior_dis_tmp)))
+            if iteration > 20 and change < epsilon:
+                break
+        self.a, self.b, self.c, self.prior_dis = a, b, c, prior_dis
+        self.stu_prof = self.Get_stu_ability(R)
+
+
+    def predict_proba(self) -> pd.DataFrame:
+        r"""
+        calculate the probability
+        
+        Args:
+            None
+        Return value:
+            pred_score: the probability of students response correctly
+        """
+        pred_score = irt3pl(np.sum(self.a * (np.expand_dims(self.stu_prof, axis=1) - self.b), axis=-1), 1, 0, self.c)
+        return pd.DataFrame(pred_score)
+
+
+    def pridict(self) -> pd.DataFrame:
+        r"""
+        predict the probability
+        
+        Args:
+            None
+        Return value:
+            df_pred: the probability
+        """
+        irt_proba = self.predict_proba()
+        for i in range(self.stu_num):
+            for j in range(self.prob_num):
+                irt_proba[i][j] = 1 if irt_proba[i][j] >= 0.5 else 0
+        df_pred = pd.DataFrame(irt_proba)
+        return df_pred
+
+
+    def eval(self, val_data: pd.DataFrame):
+        r"""
+        Evaluate the IRT model
+        
+        Args:
+            val_data: validation dataset
+        Return value:
+            test_rmse: RMSE of test dataset
+            test_mae: MAE of test dataset
+        """
+        pred_score = self.predict_proba()
+        print(pred_score)
+        test_rmse, test_mae = [], []
+        for index, i in tqdm(val_data.iterrows(), "evaluating"):
+            stu, test_id, true_score = i['userId'], i['itemId'], i['response']
+            re_stu_id, re_item_id = self.id_reindex['userId'][stu], self.id_reindex['itemId'][test_id]
+            test_rmse.append((pred_score.iloc[re_stu_id, re_item_id] - true_score) ** 2)
+            test_mae.append(abs(pred_score.iloc[re_stu_id, re_item_id] - true_score))
+        return np.sqrt(np.average(test_rmse)), np.average(test_mae)
+
+
+    def save(self, filepath: str):
+        r"""
+        save the parameters of IRT model
+        
+        Args:
+            filepath: the path of file
+        Return value:
+            None
+        """
+        with open(filepath, 'wb') as file:
+            pickle.dump({"a": self.a, "b": self.b, "c": self.c, "prof": self.stu_prof}, file)
+            logging.info("save parameters to %s" % filepath)
+
+
+    def load(self, filepath: str):
+        r"""
+        load the parameters of IRT model
+        
+        Args:
+            filepath: the path of file
+        Return value:
+            None
+        """
+        with open(filepath, 'rb') as file:
+            self.a, self.b, self.c, self.stu_prof = pickle.load(file).values()
+            logging.info("load parameters from %s" % filepath)
+
+
+    def Get_stu_ability(self, records, lr=1e-3, epoch=10, epsilon=1e-3):  
+        r"""
+        Transform the records to student's ability by MLE method 
+        can evaluate multiple students' states simultaneously, thus output shape = (stu_num, dim)
+        initialization stu_prof, shape = (stu_num, dim)
+        
+        Args:
+            records: records of students' answers
+            lr: learning rate
+            epoch: the number of iterations
+            epsilon: threshold of convergence
+        Return value:
+            stu_prof: student's ability distribution
+        """
+        if len(records.shape) == 1:  # one student
+            records = np.expand_dims(records, axis=0)
+        _, prof_stu_like = get_Likelihood(self.a, self.b, self.c, self.prof, records)
+        stu_prof = self.prof[np.argmax(prof_stu_like, axis=0)]
+
+        for iteration in range(epoch):
+            prof_tmp = np.copy(stu_prof)
+            ans_prob = irt3pl(np.sum(self.a * (np.expand_dims(stu_prof, axis=1) - self.b), axis=-1), 1, 0, self.c)
+            ans_1 = self.D * (records - ans_prob) / ans_prob * (ans_prob - self.c) / (1 - self.c + 1e-9)
+            ans_1[np.where(records == self.skip_value)[0], np.where(records == self.skip_value)[1]] = 0
+            prof_grad = np.dot(ans_1, self.a)
+            stu_prof = stu_prof - lr * prof_grad
+            change = np.max(np.abs(stu_prof - prof_tmp))
+            if iteration > 5 and change < epsilon:
+                break
+        return stu_prof  # shape = (stu_num, dim)
diff --git a/EduCDM/IRT/EM/__init__.py b/EduCDM/IRT/EM/__init__.py
@@ -0,0 +1,4 @@
+# coding: utf-8
+# 2021/5/2 @ liujiayu
+
+from .IRT import IRT
diff --git a/EduCDM/IRT/__init__.py b/EduCDM/IRT/__init__.py
@@ -0,0 +1,6 @@
+# coding: utf-8
+# 2021/4/23 @ tongshiwei
+
+
+from .GD import IRT as GDIRT
+from .EM import IRT as EMIRT
diff --git a/EduCDM/IRT/irt.py b/EduCDM/IRT/irt.py
@@ -0,0 +1,13 @@
+# coding: utf-8
+# 2021/4/23 @ tongshiwei
+
+import numpy as np
+
+__all__ = ["irf", "irt3pl"]
+
+
+def irf(theta, a, b, c, D=1.702, *, F=np):
+    return c + (1 - c) / (1 + F.exp(-D * a * (theta - b)))
+
+
+irt3pl = irf
diff --git a/docs/IRT.md b/docs/IRT.md
@@ -1,11 +1,53 @@
 # Item response theory
 
 If the reader wants to know the details of EMIRT, please refer to the paper: *[Estimation for Item Response Models using the EM Algorithm for Finite Mixtures](https://files.eric.ed.gov/fulltext/ED405356.pdf)*.
+
 ```bibtex
 @article{woodruff1996estimation,
   title={Estimation of Item Response Models Using the EM Algorithm for Finite Mixtures.},
   author={Woodruff, David J and Hanson, Bradley A},
   year={1996},
   publisher={ERIC}
 }
-```
+```
+
+## Introduction of model
+
+![这是图片](_static\IRT\EMIRT\emirt1.png "Magic Gardens")
+
+![这是图片](_static\IRT\EMIRT\emirt2.png "Magic Gardens")
+
+![这是图片](_static\IRT\EMIRT\emirt3.png "Magic Gardens")
+
+![这是图片](/assets/img/philly-magic-garden.jpg "Magic Gardens")
+
+## Parameters description
+
+| PARAMETERS | TYPE | DESCRIPTION                                                    |
+| ---------- | ---- | -------------------------------------------------------------- |
+| meta_data  | dict | a  dictionary containing all the userIds, itemIds, and skills. |
+| dim        | int  | the  dimension of student's ability. Default: 1                |
+| skip_value | int  | the skip_value of the item response matrix. Default: -1        |
+
+##Examples
+
+```python
+import pandas as pd
+from EduCDM import EMIRT
+meta_data = {'userId': ['001', '002', '003'], 'itemId': ['adf', 'w5'], 'skill': ['skill1', 'skill2', 'skill3', 'skill4']}
+model = EMIRT(meta_data)
+
+train_data = pd.DataFrame({'userId':[1,1,2,2,3,3], 'itemId': [1,2,1,3,2,3], 'skill': ["[1]", "[1,3]", "[1]", "[1,2,3]", "[1,3]", "[1,2,3]"], 'response': [1,1,0,1,1,0]})
+test_data = pd.DataFrame({'userId':[1,2,3], 'itemId': [3,2,1], 'skill': ["[1,2,3]", "[1,3]", "[1]"], 'response': [1,1,0]})
+model.fit(train_data, epoch=2)
+predict = model.predict()
+mrse, mse = model.eval(test_data)
+```
+
+## Methods summary
+
+| METHODS           | DESCRIPTION                                                                                                                                                             |
+| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| fit               | Fits  the model to the training data.                                                                                                                                   |
+| fit_predict       | Use  the model to predict the responses in the testing data and returns the  results. The responses are either 1 (i.e., correct answer) or 0 (i.e.,  incorrect answer). |
+| fit_predict_proba | Use  the model to predict the responses in the testing data and returns the  probabilities (that the correct answers will be provided).                                 |
diff --git a/docs/_static/IRT/EMIRT/emirt1.png b/docs/_static/IRT/EMIRT/emirt1.png
diff --git a/docs/_static/IRT/EMIRT/emirt2.png b/docs/_static/IRT/EMIRT/emirt2.png
diff --git a/docs/_static/IRT/EMIRT/emirt3.png b/docs/_static/IRT/EMIRT/emirt3.png
diff --git a/docs/_static/IRT/EMIRT/emirt4.png b/docs/_static/IRT/EMIRT/emirt4.png