diff --git a/AUTHORS.md b/AUTHORS.md index 6e5a9bc..ecf20c5 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -32,4 +32,6 @@ [Shizhe Zhu](https://github.com/icarushhh) +[Hongxu Chen](https://github.com/ustcchx) + The stared contributors are the main authors. diff --git a/README.md b/README.md index 4e564d4..62fde9b 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,8 @@ The [datasets](https://edudata.readthedocs.io/en/latest/tutorial/zh/DataSet.html * [OpenLUNA](http://base.ustc.edu.cn/data/OpenLUNA/) +* [MOOCCubeX](https://github.com/THU-KEG/MOOCCubeX) [[Analysis]](docs/analysis/MOOCCubeX.ipynb) + Your can also visit our datashop [BaseData](http://base.ustc.edu.cn/data/) to get those mentioned-above (most of them) datasets. Except those mentioned-above dataset, we also provide some benchmark dataset for some specified task, which is listed as follows: diff --git a/docs/analysis/MOOCCubeX.ipynb b/docs/analysis/MOOCCubeX.ipynb new file mode 100644 index 0000000..7aaf2a1 --- /dev/null +++ b/docs/analysis/MOOCCubeX.ipynb @@ -0,0 +1,751 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36f54933", + "metadata": {}, + "source": [ + "# MOOCCubeX Data Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "a47e427d", + "metadata": {}, + "source": [ + "## Data loading" + ] + }, + { + "cell_type": "markdown", + "id": "3b2423c4", + "metadata": {}, + "source": [ + "The analysis uses the data file \"user-problem.json\"; its fields are described in the table below. 
" + ] + }, + { + "cell_type": "markdown", + "id": "9fc1c223", + "metadata": {}, + "source": [ + "|Field |Description | \n", + "|:----------|:---------- | \n", + "|log_id |ID of the answer record; a unique key formed by combining user_id and problem_id |\n", + "|user_id |User ID, starting with U_|\n", + "|problem_id\t|Problem ID, starting with Pm_|\n", + "|is_correct\t|Whether the user answered the problem correctly|\n", + "|attempts\t|Number of attempts the user made on the problem|\n", + "|score\t|Score obtained for the answer|\n", + "|submit_time\t|Time at which the answer was submitted|" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "37a45801", + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-13T10:57:56.771732Z", + "start_time": "2023-05-13T10:57:55.332487Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import random \n", + "import re\n", + "import plotly.express as px\n", + "from plotly.subplots import make_subplots\n", + "import plotly.graph_objs as go\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "id": "9a56c232", + "metadata": {}, + "source": [ + "As the file is approximately 21 GB, we do not load it in full; instead, each record is kept at random with a probability of 1%." 
# %%
# The raw file holds one JSON object per line and is far too large (~21 GB) to
# load at once, so it is streamed and only a random ~1% of the records is kept.
DATA_PATH = "user-problem.json"
SAMPLE_RATE = 0.01     # probability of keeping any single record
SAMPLE_SEED = 2023     # fixed so that "Restart & Run All" reproduces the same sample
CHUNK_ROWS = 100000    # sampled rows buffered before they are packed into a DataFrame
COLUMNS = ('log_id', 'problem_id', 'user_id', 'is_correct', 'attempts', 'score', 'submit_time')


def load_sampled_records(path: str = DATA_PATH, sample_rate: float = SAMPLE_RATE,
                         seed: int = SAMPLE_SEED, chunk_rows: int = CHUNK_ROWS,
                         columns=COLUMNS) -> pd.DataFrame:
    """Stream a JSON-lines file and return a random sample of its records.

    Parameters
    ----------
    path : file with one JSON object per line.
    sample_rate : probability with which each line is kept.
    seed : seed of the private random generator used for sampling.
    chunk_rows : number of sampled rows collected before they are converted
        into a DataFrame chunk (keeps the temporary Python list small).
    columns : labels applied to the values of each record *by position*.
        NOTE(review): this presumes every JSON object lists its fields in
        exactly this order, as the original cell did -- confirm against the
        dataset.

    Returns
    -------
    A DataFrame with a fresh 0..n-1 index and the given column labels; it is
    empty (but still has the columns) when no record was sampled.
    """
    rng = random.Random(seed)   # private generator: does not disturb global random state
    labels = list(columns)
    chunks, buffer = [], []
    # The context manager guarantees the file handle is released, even on error.
    with open(path, "r", encoding="utf-8") as source:
        for line in source:
            if not line.strip():
                continue        # tolerate blank lines instead of failing in json.loads
            if rng.random() >= sample_rate:
                continue        # record not selected for the sample
            buffer.append(list(json.loads(line).values()))
            if len(buffer) == chunk_rows:
                chunks.append(pd.DataFrame(buffer, columns=labels))
                buffer = []
    # Flush the remainder; also guarantees at least one (possibly empty) chunk.
    if buffer or not chunks:
        chunks.append(pd.DataFrame(buffer, columns=labels))
    # ignore_index=True rebuilds a continuous index across the chunks.
    return pd.concat(chunks, ignore_index=True)


df = load_sampled_records()

# %% [markdown]
# ## General feature
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
log_idproblem_iduser_idis_correctattemptsscoresubmit_time
010000444_5567925Pm_5567925U_1000044401-1.02020-07-21 00:01:57
110000676_3548507Pm_3548507U_100006761150.02020-05-06 16:14:12
210000676_4462607Pm_4462607U_100006761150.02020-07-01 16:24:34
310000677_4913875Pm_4913875U_1000067701-1.02020-07-15 08:53:43
410000684_5186646Pm_5186646U_1000068401-1.02020-07-20 23:26:24
\n", + "
# %%
# Preview the first five sampled records.
df.head(5)

# %%
# Size of the sample, measured as the number of non-null log_id entries.
record_count = df['log_id'].count()
print("The number of records:")
print(record_count)

# %%
# Share of missing values per column (0.0 = complete, 1.0 = entirely missing).
missing_per_column = df.isna().sum()
print('Part of missing values for every column')
print(missing_per_column / len(df))
# %%
# Number of distinct users in the sample (missing ids, if any, are not counted).
print("The number of users")
df['user_id'].nunique()

# %%
# Number of distinct problems in the sample (missing ids, if any, are not counted).
print("The number of problems")
df['problem_id'].nunique()

# %% [markdown]
# ## Sort by user_id

# %%
# The 50 users with the most answer records.
# rename_axis/reset_index(name=...) fixes the column names explicitly, so the
# result does not depend on how the installed pandas names value_counts output.
user_counts = (
    df['user_id']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='count')
)
# Ascending sort + tail keeps the 50 largest, with the largest bar drawn on top.
top_users = user_counts.sort_values(['count']).tail(50)
fig = px.bar(
    top_users,
    x='count',
    y='user_id',
    orientation='h',
    title='Top 50 users by the number of solving problem'
)
# Declare the axis categorical so that every id gets its own bar; this replaces
# the previous trick of appending '-' to each id to stop it being read as a number.
fig.update_yaxes(type='category')
fig.show("svg")

# %%
# Distribution of the answer records over the numeric part of the user ids.
user_id_counts = (
    df['user_id']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='count')
)
# Vectorised removal of the "U_" marker (replaces the deprecated DataFrame.applymap).
user_id_counts['user_id'] = (
    user_id_counts['user_id']
    .astype(str)
    .str.replace('U_', '', regex=False)
    .astype(int)
)
user_id_counts = user_id_counts.sort_values('user_id')
fig = px.histogram(
    user_id_counts,
    x='user_id',
    y='count',
    title='User solving problem distribution'
)
fig.show("svg")

# %% [markdown]
# ## Sort by correct answer

# %%
# Share of records per is_correct value.
answer_share = (
    df['is_correct']
    .value_counts()
    .rename_axis('is_correct')
    .reset_index(name='percent')
)
answer_share['percent'] = answer_share['percent'] / len(df)
answer_share = answer_share.sort_values(['percent'])
# Label every slice with the is_correct value it actually belongs to.  The
# labels used to be the fixed list ['0', '1'] applied in sorted order, which
# mislabels the chart whenever 0 is not the rarer value.
answer_share['is_correct'] = answer_share['is_correct'].astype(str)
fig = px.pie(
    answer_share,
    names='is_correct',
    values='percent',
    title='Percent of the record that user solve the problem correctly '
)
fig.show("svg")
# %%
# Mean of is_correct per problem, then the number of problems sharing each mean.
correct_rate = df.groupby('problem_id')['is_correct'].mean()
# Name both columns explicitly: before pandas 2.0, value_counts().reset_index()
# did not produce a 'count' column, so the histogram below failed.
rate_counts = (
    correct_rate
    .value_counts()
    .rename_axis('correct_percent')
    .reset_index(name='count')
    .sort_values(['correct_percent'])
)
fig = px.histogram(
    rate_counts,
    x='correct_percent',
    y='count',
    title='Correct answer percent distribution'
)
fig.show("svg")

# %% [markdown]
# ## Sorted by submit_time

# %%
# Number of records per month.  The first seven characters of submit_time are
# the "YYYY-MM" prefix; .str[:7] is vectorised and leaves missing values as NaN
# (the per-element slice used before raised on them).
month_counts = (
    df['submit_time']
    .str[:7]
    .value_counts()
    .sort_index()               # chronological order; value_counts sorts by frequency
    .rename_axis('submit_time')
    .reset_index(name='count')
)
fig = px.bar(
    month_counts,
    x='submit_time',
    y='count',
    title='User solving problem date distribution'
)
fig.show("svg")

# %% [markdown]
# ## Sorted by attempts
# %%
# The 50 problems with the highest mean number of attempts.
mean_attempts = df.groupby(by=['problem_id'], as_index=False)['attempts'].mean()
# Ascending sort + tail keeps the 50 largest, with the largest bar drawn on top.
top_problems = mean_attempts.sort_values(['attempts']).tail(50)
fig = px.bar(
    top_problems,
    x='attempts',
    y='problem_id',
    orientation='h',
    title='Top 50 problems with the highest average number of attempts'
)
fig.show("svg")

# %% [markdown]
# ## Sorted by problem_id

# %%
# Distribution of the records over the numeric part of the problem ids.
problem_counts = (
    df['problem_id']
    .value_counts()
    .rename_axis('problem_id')
    .reset_index(name='count')
)
# Vectorised removal of the "Pm_" marker (replaces the deprecated DataFrame.applymap).
problem_counts['problem_id'] = (
    problem_counts['problem_id']
    .astype(str)
    .str.replace('Pm_', '', regex=False)
    .astype(int)
)
# Sort after the conversion so the order is numeric; sorting the raw strings
# first, as before, only gave a lexicographic order.
problem_counts = problem_counts.sort_values('problem_id')
fig = px.histogram(
    problem_counts,
    x='problem_id',
    y='count',
    title="Records distribution on problem_id"
)
fig.show("svg")
# %% [markdown]
# ## Sorted by the score of the problems

# %%
# Number of records per score value; records without a score are left out.
# A plain value_counts replaces grouping by 'score' and then counting 'score'
# again, whose output columns depended on how the installed pandas handles
# as_index=False for value_counts.
score_counts = (
    df['score']
    .value_counts()
    .rename_axis('problem_score')
    .reset_index(name='count')
    .sort_values(['problem_score'])
)
fig = px.histogram(
    score_counts,
    x='problem_score',
    y='count',
    title='Problem score distribution'
)
fig.show("svg")