diff --git a/AUTHORS.md b/AUTHORS.md
index 6e5a9bc..ecf20c5 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -32,4 +32,6 @@
[Shizhe Zhu](https://github.com/icarushhh)
+[Hongxu Chen](https://github.com/ustcchx)
+
The stared contributors are the main authors.
diff --git a/README.md b/README.md
index 4e564d4..62fde9b 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,8 @@ The [datasets](https://edudata.readthedocs.io/en/latest/tutorial/zh/DataSet.html
* [OpenLUNA](http://base.ustc.edu.cn/data/OpenLUNA/)
+* [MOOCCubeX](https://github.com/THU-KEG/MOOCCubeX) [[Analysis]](docs/analysis/MOOCCubeX.ipynb)
+
Your can also visit our datashop [BaseData](http://base.ustc.edu.cn/data/) to get those mentioned-above (most of them) datasets.
Except those mentioned-above dataset, we also provide some benchmark dataset for some specified task, which is listed as follows:
diff --git a/docs/analysis/MOOCCubeX.ipynb b/docs/analysis/MOOCCubeX.ipynb
new file mode 100644
index 0000000..7aaf2a1
--- /dev/null
+++ b/docs/analysis/MOOCCubeX.ipynb
@@ -0,0 +1,751 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "36f54933",
+ "metadata": {},
+ "source": [
+ "# MOOCCubeX Data Analysis"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a47e427d",
+ "metadata": {},
+ "source": [
+ "## Data loading"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b2423c4",
+ "metadata": {},
+ "source": [
+ "The name of the data file used is \"user-problem.json\". "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9fc1c223",
+ "metadata": {},
+ "source": [
+ "|Field |description | \n",
+ "|:----------|:---------- | \n",
+ "|log_id |Unique ID of an answer record, formed by joining the numeric parts of user_id and problem_id with an underscore |\n",
+ "|user_id |User ID, starting with U_|\n",
+ "|problem_id\t|Problem ID, starting with Pm_|\n",
+ "|is_correct\t|Whether the answer is correct (1) or not (0)|\n",
+ "|attempts\t|Number of attempts made on the problem|\n",
+ "|score\t|score|\n",
+ "|submit_time\t|Time at which the answer was submitted|"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "37a45801",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T10:57:56.771732Z",
+ "start_time": "2023-05-13T10:57:55.332487Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import random \n",
+ "import re\n",
+ "import plotly.express as px\n",
+ "from plotly.subplots import make_subplots\n",
+ "import plotly.graph_objs as go\n",
+ "import json"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9a56c232",
+ "metadata": {},
+ "source": [
+ "As the file size is approximately 21GB, we keep each record with a probability of 1%, so the analysis below is based on a random sample of about 1% of the file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "0f76bb66",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:18:03.504560Z",
+ "start_time": "2023-05-13T10:57:56.819529Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Stream the ~21GB JSON-lines file and keep about 1% of its records, so that\n",
+ "# only the sampled rows are ever held in memory.\n",
+ "random.seed(0)  # fixed seed: re-running the notebook draws the same sample\n",
+ "columns=['log_id','problem_id','user_id','is_correct','attempts','score','submit_time']\n",
+ "arr,chunks=[],[]\n",
+ "# The with-block closes the file even when a line fails to parse.\n",
+ "with open(\"user-problem.json\",\"r\",encoding=\"utf-8\") as r_file:\n",
+ "    for line in r_file:\n",
+ "        # randint(1,100) returns 1 for one draw in a hundred on average.\n",
+ "        if random.randint(1,100) != 1 :\n",
+ "            continue\n",
+ "        # Values are taken in the record's key order and named positionally via `columns`.\n",
+ "        arr.append(list(json.loads(line).values()))\n",
+ "        # Pack every 100000 sampled rows into a DataFrame so the Python list stays short.\n",
+ "        if len(arr) == 100000 :\n",
+ "            chunks.append(pd.DataFrame(arr,columns=columns))\n",
+ "            arr = []\n",
+ "# Flush the remaining rows; this also yields an empty, correctly named frame when nothing was sampled.\n",
+ "chunks.append(pd.DataFrame(arr,columns=columns))\n",
+ "df=pd.concat(chunks,ignore_index=True)\n",
+ "# Release the temporary buffers.\n",
+ "chunks.clear()\n",
+ "arr.clear()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "77cf9b36",
+ "metadata": {},
+ "source": [
+ "## General features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "8a352a50",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:20:46.320391Z",
+ "start_time": "2023-05-13T11:20:46.228527Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " log_id | \n",
+ " problem_id | \n",
+ " user_id | \n",
+ " is_correct | \n",
+ " attempts | \n",
+ " score | \n",
+ " submit_time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10000444_5567925 | \n",
+ " Pm_5567925 | \n",
+ " U_10000444 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " -1.0 | \n",
+ " 2020-07-21 00:01:57 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 10000676_3548507 | \n",
+ " Pm_3548507 | \n",
+ " U_10000676 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 50.0 | \n",
+ " 2020-05-06 16:14:12 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 10000676_4462607 | \n",
+ " Pm_4462607 | \n",
+ " U_10000676 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 50.0 | \n",
+ " 2020-07-01 16:24:34 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 10000677_4913875 | \n",
+ " Pm_4913875 | \n",
+ " U_10000677 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " -1.0 | \n",
+ " 2020-07-15 08:53:43 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 10000684_5186646 | \n",
+ " Pm_5186646 | \n",
+ " U_10000684 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " -1.0 | \n",
+ " 2020-07-20 23:26:24 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " log_id problem_id user_id is_correct attempts score \n",
+ "0 10000444_5567925 Pm_5567925 U_10000444 0 1 -1.0 \\\n",
+ "1 10000676_3548507 Pm_3548507 U_10000676 1 1 50.0 \n",
+ "2 10000676_4462607 Pm_4462607 U_10000676 1 1 50.0 \n",
+ "3 10000677_4913875 Pm_4913875 U_10000677 0 1 -1.0 \n",
+ "4 10000684_5186646 Pm_5186646 U_10000684 0 1 -1.0 \n",
+ "\n",
+ " submit_time \n",
+ "0 2020-07-21 00:01:57 \n",
+ "1 2020-05-06 16:14:12 \n",
+ "2 2020-07-01 16:24:34 \n",
+ "3 2020-07-15 08:53:43 \n",
+ "4 2020-07-20 23:26:24 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "312bd565",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:21:41.232376Z",
+ "start_time": "2023-05-13T11:20:46.322391Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The number of records:\n",
+ "1334037\n"
+ ]
+ }
+ ],
+ "source": [
+ "# count() on log_id gives the number of rows whose log_id is not null.\n",
+ "print(\"The number of records:\", df['log_id'].count(), sep=\"\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "36576d5b",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:24:44.783992Z",
+ "start_time": "2023-05-13T11:21:41.246055Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Part of missing values for every column\n",
+ "log_id 0.000000\n",
+ "problem_id 0.000000\n",
+ "user_id 0.000000\n",
+ "is_correct 0.000000\n",
+ "attempts 0.000000\n",
+ "score 0.530487\n",
+ "submit_time 0.000000\n",
+ "dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Null count of each column divided by the number of rows.\n",
+ "print('Part of missing values for every column', df.isnull().sum().div(len(df)), sep='\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "6f057650",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:25:50.891802Z",
+ "start_time": "2023-05-13T11:24:44.799161Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The number of users\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "583149"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(\"The number of users\")\n",
+ "# Number of distinct user_id values (a missing id, if any, counts as one value).\n",
+ "df['user_id'].nunique(dropna=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "f3dcbf7d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:26:16.036938Z",
+ "start_time": "2023-05-13T11:25:50.905805Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The number of problems\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "414848"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(\"The number of problems\")\n",
+ "# Number of distinct problem_id values (a missing id, if any, counts as one value).\n",
+ "df['problem_id'].nunique(dropna=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "07dab257",
+ "metadata": {},
+ "source": [
+ "## Sorted by user_id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "e1602c0d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:26:26.220286Z",
+ "start_time": "2023-05-13T11:26:16.052509Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Records per user, restricted to the 50 users with the most records.\n",
+ "ds = (\n",
+ "    df['user_id']\n",
+ "    .value_counts()\n",
+ "    .reset_index()\n",
+ "    .set_axis(['user_id', 'count'], axis=1)\n",
+ "    # The '-' suffix keeps every label a non-numeric string for the y-axis.\n",
+ "    .assign(user_id=lambda counts: counts['user_id'].astype(str) + '-')\n",
+ "    .sort_values(['count'])\n",
+ "    .tail(50)\n",
+ ")\n",
+ "fig = px.bar(ds, x='count', y='user_id', orientation='h',\n",
+ "             title='Top 50 users by the number of solving problem')\n",
+ "fig.show(\"svg\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "4696ec65",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:26:37.616672Z",
+ "start_time": "2023-05-13T11:26:29.295488Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Number of sampled records per user, laid out by the numeric part of the user id.\n",
+ "ds = df['user_id'].value_counts().reset_index()\n",
+ "ds.columns = [\n",
+ "    'user_id',\n",
+ "    'count'\n",
+ "]\n",
+ "# Strip the 'U_' prefix from the id column only; a vectorized .str.replace avoids the\n",
+ "# deprecated DataFrame.applymap and leaves the integer 'count' column untouched.\n",
+ "ds['user_id'] = ds['user_id'].astype(str).str.replace('U_', '', regex=False).astype(int)\n",
+ "ds = ds.sort_values('user_id')\n",
+ "fig = px.histogram(\n",
+ "    ds,\n",
+ "    x = 'user_id',\n",
+ "    y = 'count',\n",
+ "    title = 'User solving problem distribution'\n",
+ ")\n",
+ "fig.show(\"svg\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b3138ea9",
+ "metadata": {},
+ "source": [
+ "## Sort by correct answer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "58ef6f6d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:26:45.076759Z",
+ "start_time": "2023-05-13T11:26:43.765499Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Share of sampled records for each value of is_correct.\n",
+ "ds = df['is_correct'].value_counts().reset_index()\n",
+ "ds.columns = [\n",
+ "    'is_correct',\n",
+ "    'percent'\n",
+ "]\n",
+ "ds['percent'] /= len(df)\n",
+ "ds = ds.sort_values(['percent'])\n",
+ "# Label every slice with its own is_correct value. A fixed ['0', '1'] list would be\n",
+ "# matched to the rows by position, i.e. by share after the sort, not by value.\n",
+ "ds['is_correct'] = ds['is_correct'].astype(str)\n",
+ "fig = px.pie(\n",
+ "    ds,\n",
+ "    names = 'is_correct',\n",
+ "    values = 'percent',\n",
+ "    title = 'Percent of the record that user solve the problem correctly ' \n",
+ ")\n",
+ "fig.show(\"svg\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "f8334ebb",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Mean of is_correct for every problem, i.e. the share of correct answers.\n",
+ "temp = df.groupby('problem_id')['is_correct'].mean()\n",
+ "# Number of problems sharing each rate. Both columns are named explicitly because the\n",
+ "# default names of value_counts().reset_index() differ between pandas 1.x and 2.x.\n",
+ "temp = temp.value_counts().rename_axis('correct_percent').reset_index(name='count')\n",
+ "temp = temp.sort_values(['correct_percent'])\n",
+ "fig = px.histogram(\n",
+ "    temp,\n",
+ "    x = 'correct_percent',\n",
+ "    y = 'count',\n",
+ "    title = 'Correct answer percent distribution'\n",
+ ")\n",
+ "fig.show(\"svg\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "120d707a",
+ "metadata": {},
+ "source": [
+ "## Sorted by submit_time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "a53fe1e3",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:36:50.654262Z",
+ "start_time": "2023-05-13T11:36:50.641259Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Number of sampled records per calendar month.\n",
+ "# The first 7 characters of a timestamp such as '2020-07-21 00:01:57' are its 'YYYY-MM' part.\n",
+ "ds = (\n",
+ "    df['submit_time']\n",
+ "    .str[0:7]\n",
+ "    .value_counts()\n",
+ "    # value_counts() orders by frequency; sort by month so the bars run chronologically.\n",
+ "    .sort_index()\n",
+ "    .rename_axis('submit_time')\n",
+ "    .reset_index(name='count')\n",
+ ")\n",
+ "fig = px.bar(\n",
+ "    ds,\n",
+ "    x = 'submit_time',\n",
+ "    y = 'count',\n",
+ "    title = 'User solving problem date distribution'\n",
+ ")\n",
+ "fig.show(\"svg\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e717fbfd",
+ "metadata": {},
+ "source": [
+ "## Sorted by attempts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "4b0a27e8",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:34:35.594448Z",
+ "start_time": "2023-05-13T11:34:35.579790Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Mean number of attempts per problem; the 50 largest means are plotted.\n",
+ "grouped = df.groupby('problem_id', as_index=False)\n",
+ "series = grouped['attempts'].mean().sort_values('attempts').tail(50)\n",
+ "fig = px.bar(series, x='attempts', y='problem_id', orientation='h',\n",
+ "             title='Top 50 problems with the highest average number of attempts')\n",
+ "fig.show(\"svg\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "81428311",
+ "metadata": {},
+ "source": [
+ "## Sorted by problem_id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "df74434d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:35:42.029106Z",
+ "start_time": "2023-05-13T11:35:26.091157Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Number of sampled records per problem, laid out by the numeric part of the problem id.\n",
+ "ds = df['problem_id'].value_counts().reset_index()\n",
+ "ds.columns = [\n",
+ "    'problem_id',\n",
+ "    'count'\n",
+ "]\n",
+ "# Strip the 'Pm_' prefix from the id column only (vectorized, no deprecated applymap),\n",
+ "# and sort after the conversion so the order is numeric rather than lexicographic.\n",
+ "ds['problem_id'] = ds['problem_id'].astype(str).str.replace('Pm_', '', regex=False).astype(int)\n",
+ "ds = ds.sort_values('problem_id')\n",
+ "fig = px.histogram(\n",
+ "    ds,\n",
+ "    x='problem_id',\n",
+ "    y='count',\n",
+ "    title=\"Records distribution on problem_id\"\n",
+ ")\n",
+ "fig.show(\"svg\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0bff5ed9",
+ "metadata": {},
+ "source": [
+ "## Sorted by the score of the problems"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "2d72da44",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-05-13T11:35:50.187615Z",
+ "start_time": "2023-05-13T11:35:50.092595Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Occurrences of each distinct score; value_counts() leaves out missing scores.\n",
+ "# Counting the column directly replaces grouping by 'score' and then counting 'score'\n",
+ "# inside each group, whose result layout depends on the pandas version.\n",
+ "ds = df['score'].value_counts().rename_axis('problem_score').reset_index(name='count')\n",
+ "ds = ds.sort_values(['problem_score'])\n",
+ "fig = px.histogram(\n",
+ "    ds,\n",
+ "    x='problem_score',\n",
+ "    y='count',\n",
+ "    title='Problem score distribution'\n",
+ ")\n",
+ "fig.show(\"svg\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}