diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b6e4761 --- /dev/null +++ b/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/LICENSE b/LICENSE new file mode 100755 index 0000000..5dc609b --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Lewis Tian + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100755 index 0000000..33ba210 --- /dev/null +++ b/README.md @@ -0,0 +1,231 @@ +# 我到底在华科吃了些啥 (华科校园卡年度报告) + +HUST + +## Table of Contents + +- [起因](#起因) +- [数据获取](#数据获取) +- [数据分析](#数据分析) + - [2019 数据详情](#2019-数据详情) + - [2020 数据详情](#2020-数据详情) +- [如何使用](#如何使用) + +## 起因 + +今天(2019-12-23)在东一一楼烧腊窗口恰饭,我点的是鸡排饭,然后我没说要啥汁,那个姐姐(感觉叫阿姨不太对)直接说:番茄是吧。我:?,我挺疑惑的,然后在她把饭递给我的时候问了句:你咋知道的?她:因为你经常吃啊(笑)。 + +想了下好像也没有经常吃吧,虽然之前高中也有过类似的经历,那是因为吃拉面不要香菜、热干面不要葱、炒面炒粉不要葱,然后吃了一段时间就被老板记住了。 + +很有趣的一次是吃热干面,老板没抬头看,就调好葱姜蒜芝麻酱准备递给我,然后看到我,说:哎呀,你不要葱,这碗给后面的同学,重新给你下一碗 2333。扯远了,因为挺好奇的,于是就打算把今年这一年的吃饭记录都爬下来康康,我这一年都吃的是啥? + +## 数据获取 + +从 [校园卡服务平台](http://ecard.hust.edu.cn/Default.aspx) 可以查到流水,开始我以为所谓的“导出所有”是把所有记录导出,结果跟“导出当前页流水”是一样的。 + +所以直接用 Python 写了个爬虫直接把今年(2019)所有月份数据都爬下来,去掉多余的信息,仅保存 *时间*、*价格*、*食堂* 这三项数据,然后存为 CSV 文件。 + +- 过程: + +![](images/record.gif) + +- 结果: + +![](images/TIM20191223152311.png) + +## 数据分析 + +所以这一年到底吃了啥?下面就见分晓! + +首先将这些 csv 文件合并,使用 `pandas` 库很简单,下面就是: + +```python +def merge_all_files(): + files = glob.glob("csv/*.csv") + df = pd.concat([pd.read_csv(file) for file in files]) + df.to_csv("csv/2019.csv", index=False, encoding='utf-8-sig') +``` + +
+ 2019 + +### 2019 数据详情 + +截止到今天,每个月使用一卡通的消费次数(包括超市和自动售卖机): + +``` +{ + 1: 95, + 2: 50, + 3: 142, + 4: 121, + 5: 131, + 6: 111, + 7: 104, + 8: 138, + 9: 150, + 10: 102, + 11: 125, + 12: 104 +} +``` + +
+ +![](images/consume_times.png) +
+ +各个食堂窗口的食用情况(指的是刷卡次数,吃个晚饭可能会刷几次卡)TOP10 + +```bash +集贤楼食堂红案: 249 +集贤楼蒸菜净荤组: 123 +东一二楼特色菜品: 108 +集贤楼食堂商店: 99 +东一二楼华科速7: 84 +东一一楼蒸点稀食: 81 +东一二楼湘味小钵: 70 +东一一楼烧腊饭: 68 +东一二楼大众菜(一): 66 +集贤楼食堂煎烙: 65 +``` + +
+ +![](images/windows_times.png) +
+ +各个食堂的食用情况 + +``` +{ + '东一': 581, + '集贤楼': 575, + '自助售货机': 9, + '西一': 139, + '百品屋': 8, + '集锦园': 4, + '校园网': 6, + '图书馆': 1, + '东学超市': 3, + '东三': 27, + '紫荆园': 4, + '百景': 12, + '喻园': 4 +} +``` + +
+ +![](images/hall_times.png) +
+ +
+ +
+ 2020 + +### 2020 数据详情 + +上面过程和分析都是 19 年写的,偶然翻代码翻到这个东西,于是今年(2021)更新了下。 + +1、各个食堂的食用情况 + +| 食堂 | 消费次数 | +|:----------:|:-------:| +| 东一 | 587 | +| 自助售货机 | 58 | +| 东学超市 | 9 | +| 集贤楼 | 6 | +| 校医院 | 2 | +| 集锦园 | 4 | +| 东三 | 2 | +| 校园网 | 2 | +| 后勤开水机 | 2 | +| 百景 | 10 | + +![](images/hall_times_2020.png) + +2、各个窗口的食用情况 top 10 + +| 窗口 | 消费次数 | +|:--------------------:|:-------:| +| 东一二楼华科速7 | 58 | +| 东一二楼大众菜(二) | 60 | +| 自助售货机 | 58 | +| 东一二楼香霸王卤肉饭 | 30 | +| 东一一楼蒸点稀食 | 101 | +| 东一二楼特色菜品 | 55 | +| 东学超市柜三 | 2 | +| 东学超市柜一 | 6 | +| 东一二楼湘味小钵 | 81 | +| 东一一楼烧腊饭 | 7 | + +![](images/windows_times_2020.png) + +3、各时间段的食用情况 + +![](images/hours_2020.png) + +4、各个月的食用情况 + +![](images/consume_times_2020.png) + +5、总体概况 + +在 2020,你连续在 东一二楼大众菜(一) 窗口消费了 4 次,看来你很喜欢这个窗口! + +在 2020,你一共消费了 682 次,共花费 3331.35 元! + +
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2019-12-23 14:48:11
# @Author  : Lewis Tian (taseikyo@gmail.com)
# @Link    : github.com/taseikyo
# @Version : python3.8

"""
retrieve and save my ecard consume detail
"""

import csv
import os
import random
import time

import requests

# Rows ([e_time, e_money, e_hall]) collected for the month currently being
# fetched. dump_as_csv() writes them out and then resets the list.
DETAIL = []

# Number of records requested per page ("rp"). The same value advances the
# fetched-record counter, so the two can never drift apart.
PAGE_SIZE = 10

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30


def one_moonth(year=2019, month=12, page=1, total=0):
    """
    Fetch every transaction page of one month and save the month as CSV.

    Pages are requested one after another, starting at ``page``, until the
    number of records requested so far reaches the ``total`` reported by the
    server; the collected rows are then written by ``dump_as_csv``.

    Args:
        year: Year to query.
        month: Month to query (1-12).
        page: First page number to request.
        total: Number of records already accounted for before ``page``.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-2xx HTTP status.
    """
    url = "http://218.199.85.15/pcard/gettrjndataList.action"
    headers = {
        # Replace {xxx} with the JSESSIONID of a logged-in session.
        "Cookie": "JSESSIONID={xxx}",
        "Referer": "http://218.199.85.15/pcard/pcard/acchistrjn.action",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }

    # Iterate instead of recursing once per page: the call stack stays flat
    # no matter how many pages a month has.
    while True:
        post_data = {
            "page": f"{page}",
            "rp": f"{PAGE_SIZE}",
            "sortname": "jndatetime",
            "sortorder": "desc",
            "query": "",
            "qtype": "",
            "accquary": "215799",
            "trjnquary": f"{year}-{month:02}",
        }

        print(f"retrieve {year}-{month} page {page} data...")
        r = requests.post(
            url, headers=headers, data=post_data, timeout=REQUEST_TIMEOUT
        )
        # Fail loudly on an HTTP error instead of trying to parse its body.
        r.raise_for_status()
        data = r.json()

        for row in data["rows"]:
            cell = row["cell"]
            # bank card transfer
            if cell[5] == "0":
                continue
            e_time = cell[0]
            # Drop the first character of the amount field
            # (presumably a sign or currency symbol -- TODO confirm).
            e_money = cell[3][1:]
            e_hall = cell[8].strip()
            DETAIL.append([e_time, e_money, e_hall])

        total += PAGE_SIZE
        if total >= int(data["total"]):
            break

        # Pause 1-2 seconds between pages to avoid hammering the server.
        time.sleep(random.uniform(1, 2))
        page += 1

    dump_as_csv(year, month)


def dump_as_csv(year, month):
    """
    Write the rows collected in ``DETAIL`` to ``csv/{year}-{month}.csv``.

    The ``csv`` directory is created when missing, and ``DETAIL`` is reset
    afterwards so the next month starts from an empty list.
    """
    global DETAIL
    print(f"save {year}-{month} data...")
    # A fresh checkout has no csv/ directory; open() would fail without it.
    os.makedirs("csv", exist_ok=True)
    with open(f"csv/{year}-{month}.csv", "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["e_time", "e_money", "e_hall"])
        writer.writerows(DETAIL)
    DETAIL = []


if __name__ == "__main__":
    for month in range(1, 13):
        one_moonth(2020, month=month)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2019-12-23 15:41:06
# @Author  : Lewis Tian (taseikyo@gmail.com)
# @Link    : github.com/taseikyo
# @Version : python3.8

import calendar
import csv
import glob
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.font_manager import FontProperties

# Font file used for the Chinese labels; it must be extracted next to this
# script before plotting.
myfont = FontProperties(fname="SourceHanSansCN-Light.otf")

# Global plot theme: white text and axes on a dark (#443941) background.
sns.set(
    style="ticks",
    font=myfont.get_name(),
    rc={
        "figure.figsize": [16, 9],
        "text.color": "white",
        "axes.labelcolor": "white",
        "axes.edgecolor": "white",
        "xtick.color": "white",
        "ytick.color": "white",
        "axes.facecolor": "#443941",
        "figure.facecolor": "#443941",
    },
)

# Keywords used to map a window name to its dining hall.
# Incomplete; add missing halls by hand.
HALLS = {
    "东一",
    "集贤楼",
    "自助售货机",
    "西一",
    "百品屋",
    "集锦园",
    "校园网",
    "图书馆",
    "东学超市",
    "东三",
    "紫荆园",
    "百景",
    "喻园",
}


def merge_all_files(year=2019):
    """
    Merge the monthly CSV files of ``year`` into ``csv/{year}.csv``.

    Only ``csv/{year}-{month}.csv`` files (month 1..12) are read, in month
    order. A blanket ``csv/*.csv`` glob would also pick up this function's
    own output from an earlier run, the ``halls.csv`` / ``windows.csv``
    summaries and other years' data.

    Raises:
        ValueError: If no monthly file exists for ``year``.
    """
    candidates = [f"csv/{year}-{month}.csv" for month in range(1, 13)]
    files = [path for path in candidates if os.path.exists(path)]
    if not files:
        raise ValueError(f"no monthly csv files found for {year} under csv/")
    df = pd.concat([pd.read_csv(file) for file in files])
    df.to_csv(f"csv/{year}.csv", index=False, encoding="utf-8-sig")


def draw_consume_times(year=2019):
    """
    Plot the number of transactions per month of ``year``.

    Reads ``csv/{year}-{month}.csv`` for every month 1..12; each non-blank
    CSV record after the header counts as one transaction.
    """
    times = {}
    for month in range(1, 13):
        with open(f"csv/{year}-{month}.csv", encoding="utf-8", newline="") as f:
            records = sum(1 for row in csv.reader(f) if row)
        # Subtract the header row; an empty file counts as 0, not -1.
        times[month] = max(records - 1, 0)
    print(times)

    months = list(times)
    counts = list(times.values())
    plt.figure(figsize=(16, 6))
    plt.plot(months, counts, label="消费次数", color="white")
    plt.legend()
    # Annotate every data point with its value, slightly above the line.
    for month, count in zip(months, counts):
        plt.text(month, count + 2, count, ha="center", fontsize=12)

    plt.grid(False)
    plt.xlabel("月份", fontsize=16)
    plt.ylabel("次数", fontsize=16)
    # Ticks 0 and 13 get empty labels so the 12 months sit inside the axis.
    plt.xticks(range(14), [""] + calendar.month_name[1:13] + [""])
    plt.title("每月的消费次数", fontsize=20)
    plt.show()


def get_all_windows_halls(year=2019):
    """
    Count transactions per window and per dining hall for ``year``.

    The last column of every monthly CSV row is the window name. A window is
    attributed to the first ``HALLS`` keyword contained in its name; windows
    matching no keyword are counted under their own name. Results are written
    to ``csv/halls.csv`` and ``csv/windows.csv``.
    """
    halls = defaultdict(int)
    windows = defaultdict(int)
    # A set has no stable iteration order, so fix one (longest keyword first,
    # then alphabetical) to keep the attribution reproducible between runs.
    keywords = sorted(HALLS, key=lambda name: (-len(name), name))
    for month in range(1, 13):
        with open(f"csv/{year}-{month}.csv", encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:
                    continue  # tolerate blank lines
                window = row[-1]
                windows[window] += 1
                hall = next((key for key in keywords if key in window), window)
                halls[hall] += 1
    print(halls)
    with open("csv/halls.csv", "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["e_hall", "e_count"])
        writer.writerows(halls.items())
    with open("csv/windows.csv", "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["e_windows", "e_count"])
        writer.writerows(windows.items())
def draw_bars(path, title, colname, rotate=0):
    """
    Draw a bar chart of the ``e_count`` column of a summary CSV.

    Args:
        path: CSV file with an ``e_count`` column and a ``colname`` column.
        title: Used as the x-axis label and as the prefix of the chart title.
        colname: Column whose values label the bars.
        rotate: Rotation of the x tick labels, in degrees.
    """
    df = pd.read_csv(path)
    df = df.sort_values("e_count", ascending=False)
    print(df.head(10))
    plt.figure(figsize=(16, 6))
    plt.bar(df[colname], df.e_count, label="消费次数", color="white")
    plt.legend()
    # Annotate every bar with its value; bars sit at positions 0, 1, 2, ...
    for position, count in enumerate(df.e_count):
        plt.text(position, count + 2, count, ha="center", fontsize=12)
    plt.xlabel(title, fontsize=16)
    plt.ylabel("次数", fontsize=16)
    plt.xticks(rotation=rotate)
    plt.title(f"{title}的消费次数", fontsize=20)
    plt.tight_layout()
    plt.show()


def draw_hour_times(year=2019):
    """
    Plot the number of transactions per hour of day for ``year``.

    Reads ``csv/{year}.csv``; the hour is taken from the first column, which
    is expected to look like ``<date> HH:MM:SS``. Nothing is drawn when the
    file holds no records.
    """
    hours = defaultdict(int)
    # utf-8-sig reads the file whether or not it starts with a BOM.
    with open(f"csv/{year}.csv", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # tolerate blank lines
            hour = int(row[0].split(" ")[1].split(":")[0])
            hours[hour] += 1
    print(hours)
    if not hours:
        # No records: there is no hour range to build the axis from.
        return

    x = sorted(hours)
    y = [hours[hour] for hour in x]
    print(x, y)
    plt.figure(figsize=(16, 6))
    plt.plot(x, y, label="消费次数", color="white")
    plt.xlabel("小时", fontsize=16)
    plt.ylabel("次数", fontsize=16)
    plt.title("每时间段的消费次数", fontsize=20)
    plt.xticks(range(x[0], x[-1] + 1))
    # Annotate every data point with its value.
    for hour, count in zip(x, y):
        plt.text(hour + 0.1, count + 2, count, ha="center", fontsize=12)
    plt.show()


def max_continue_times(year=2019):
    """
    Print the longest run of consecutive transactions at the same window.

    Rows of ``csv/{year}.csv`` are scanned in file order and the third column
    is the window name. When several runs share the maximum length, the first
    one wins. A file without any repeated window reports a run of 1 for its
    first window; an empty file reports ``None`` and 0.
    """
    max_time = 0
    max_hall = None
    cur_time = 0
    pre_element = None
    with open(f"csv/{year}.csv", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # tolerate blank lines
            if row[2] == pre_element:
                cur_time += 1
            else:
                pre_element = row[2]
                cur_time = 1
            # Compare on every row, not only on a repeat, so that a run of
            # length 1 is recorded as well.
            if cur_time > max_time:
                max_time = cur_time
                max_hall = pre_element
    print(f"在 {year},你连续在 {max_hall} 窗口消费了 {max_time} 次,看来你很喜欢这个窗口!")


def get_total_money_time(year=2019):
    """
    Print the number of transactions and the total amount spent in ``year``.

    Reads ``csv/{year}.csv``; the amount is the sum of the ``e_money`` column
    rounded to two decimals to hide binary floating-point noise.
    """
    df = pd.read_csv(f"csv/{year}.csv")
    total = round(df.e_money.sum(), 2)
    print(f"在 {year},你一共消费了 {len(df)} 次,共花费 {total} 元!")


if __name__ == "__main__":
    merge_all_files(2020)
    draw_consume_times(2020)
    get_all_windows_halls(2020)
    draw_bars("csv/halls.csv", "食堂", "e_hall")
    draw_bars("csv/windows.csv", "食堂窗口", "e_windows", 35)
    draw_hour_times(2020)
    max_continue_times(2020)
    get_total_money_time(2020)