From 2eba982f2d5d8162be34f9093072bf42525fed11 Mon Sep 17 00:00:00 2001 From: Re-st Date: Mon, 27 Nov 2023 11:42:51 +0900 Subject: [PATCH] =?UTF-8?q?[analysis]=20metro=5Fcouncil=EB=8F=84=20?= =?UTF-8?q?=EA=B0=80=EB=8A=A5=ED=95=98=EB=8F=84=EB=A1=9D.=20[API]=20sgType?= =?UTF-8?q?code=20=EB=B3=B5=EC=88=98=EA=B0=9C=20=EC=A3=BC=EB=8F=84?= =?UTF-8?q?=EB=A1=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- API/candidate.py | 20 +-- API/elected.py | 26 +-- analysis/age/__init__.py | 3 + analysis/age/hist_groups.py | 311 ++++++++++++++++-------------------- analysis/age/main.py | 64 +++++--- 5 files changed, 201 insertions(+), 223 deletions(-) diff --git a/API/candidate.py b/API/candidate.py index bdce5e3..8bbb027 100644 --- a/API/candidate.py +++ b/API/candidate.py @@ -62,7 +62,7 @@ def fetch_all_data( if __name__ == "__main__": parser = argparse.ArgumentParser(description="공공데이터포털 API로부터 후보자 정보를 가져옵니다.") - parser.add_argument("sgTypecode", type=str, help="원하는 sgTypecode 하나를 입력하세요") + parser.add_argument("sgTypecodes", type=str, help="원하는 sgTypecode를 ','로 구분하여 입력하세요") parser.add_argument("sgIds", type=str, help="원하는 sgId를 ','로 구분하여 입력하세요") parser.add_argument( "--drop-columns", @@ -84,11 +84,13 @@ def fetch_all_data( sgIds = args.sgIds.split(",") drop_columns = args.drop_columns.split(",") if args.drop_columns else [] - data_list = fetch_all_data(sgIds, args.sgTypecode, drop_columns=drop_columns) - - if args.save_method == "excel": - save_to_excel(data_list, args.sgTypecode, is_elected=False) - elif args.save_method == "mongo": - save_to_mongo( - data_list, args.sgTypecode, CANDIDATE_TYPECODE_TYPE[args.sgTypecode] - ) + data_list = fetch_all_data(sgIds, args.sgTypecodes, drop_columns=drop_columns) + for sgTypecode in args.sgTypecodes.split(","): + if sgTypecode not in SG_TYPECODE: + raise ValueError(f"Invalid sgTypecode: {sgTypecode}") + if args.save_method == "excel": + save_to_excel(data_list, sgTypecode, is_elected=False) + elif args.save_method == "mongo": + save_to_mongo( + data_list, sgTypecode, CANDIDATE_TYPECODE_TYPE[sgTypecode] + ) diff --git a/API/elected.py b/API/elected.py index 61d9820..1b4a534 100644 --- a/API/elected.py +++ b/API/elected.py @@ -51,18 +51,19 @@ def fetch_data( def fetch_all_data( - sgIds: List[str], sgTypecode: str, drop_columns: List[str] + sgIds: List[str], sgTypecodes: str, drop_columns: List[str] ) -> List[dict]: data_list = [] - for sgId in sgIds: - data_list.extend(fetch_data(sgId, sgTypecode, drop_columns=drop_columns)) + for sgTypecode in sgTypecodes.split(","): + for sgId in sgIds: + data_list.extend(fetch_data(sgId, sgTypecode, drop_columns=drop_columns)) return data_list if __name__ == "__main__": parser = argparse.ArgumentParser(description="공공데이터포털 API로부터 당선자 정보를 가져옵니다.") - parser.add_argument("sgTypecode", type=str, help="원하는 sgTypecode 하나를 입력하세요") + parser.add_argument("sgTypecodes", type=str, help="원하는 sgTypecode를 ','로 구분하여 입력하세요") parser.add_argument("sgIds", type=str, help="원하는 sgId를 ','로 구분하여 입력하세요") parser.add_argument( "--drop-columns", @@ -82,10 +83,13 @@ def fetch_all_data( sgIds = args.sgIds.split(",") drop_columns = args.drop_columns.split(",") if args.drop_columns else [] - data_list = fetch_all_data(sgIds, args.sgTypecode, drop_columns=drop_columns) - if args.save_method == "excel": - save_to_excel(data_list, args.sgTypecode, is_elected=True) - elif args.save_method == "mongo": - save_to_mongo( - data_list, args.sgTypecode, ELECTED_TYPECODE_TYPE[args.sgTypecode] - ) + data_list = fetch_all_data(sgIds, args.sgTypecodes, drop_columns=drop_columns) + for sgTypecode in args.sgTypecodes.split(","): + if sgTypecode not in SG_TYPECODE: + raise ValueError(f"Invalid sgTypecode: {sgTypecode}") + if args.save_method == "excel": + save_to_excel(data_list, sgTypecode, is_elected=True) + elif args.save_method == "mongo": + save_to_mongo( + data_list, sgTypecode, ELECTED_TYPECODE_TYPE[sgTypecode] + ) diff --git a/analysis/age/__init__.py b/analysis/age/__init__.py index ccad035..d84ff4a 100644 --- a/analysis/age/__init__.py +++ b/analysis/age/__init__.py @@ -1,3 +1,6 @@ """ 공공데이터포털 API로 수집한 데이터를 분석하기 위한 패키지입니다. """ +class BasicArgument: + def __init__(self, **kwargs): + self.__dict__.update(kwargs) diff --git a/analysis/age/hist_groups.py b/analysis/age/hist_groups.py index 4c3e666..2ab0471 100644 --- a/analysis/age/hist_groups.py +++ b/analysis/age/hist_groups.py @@ -7,7 +7,7 @@ from matplotlib import cm from analysis.age.draw import make_scatterplot, make_hist from db.client import client - +from analysis.age import BasicArgument def plot_young_and_old(youngest_cluster, oldest_cluster): try: @@ -122,6 +122,7 @@ def insert_data_to_mongo( histcoll.find_one_and_update( { "councilorType": dic["councilorType"], + "is_elected": dic["is_elected"], "year": dic["year"], "level": dic["level"], "method": dic["method"], @@ -136,6 +137,7 @@ def insert_data_to_mongo( statcoll.find_one_and_update( { "councilorType": dic["councilorType"], + "is_elected": dic["is_elected"], "year": dic["year"], "level": dic["level"], "method": dic["method"], @@ -150,6 +152,7 @@ def insert_data_to_mongo( histcoll.find_one_and_update( { "councilorType": dic["councilorType"], + "is_elected": dic["is_elected"], "year": dic["year"], "level": dic["level"], "method": dic["method"], @@ -165,6 +168,7 @@ def insert_data_to_mongo( statcoll.find_one_and_update( { "councilorType": dic["councilorType"], + "is_elected": dic["is_elected"], "year": dic["year"], "level": dic["level"], "method": dic["method"], @@ -176,202 +180,155 @@ def insert_data_to_mongo( ) -def cluster(df, year, n_clst, method, cluster_by, outdir, font_name, folder_name): +def cluster(df_original, n_clst, basedic): """구역별 그룹을 만듭니다. - df: 데이터프레임 - year: 선거 연도 + df_original: 데이터프레임 n_clst: 그룹 수 - method: "kmeans" 또는 "equal" - cluster_by: "sdName" (1단계) 또는 "wiwName" (2단계) - outdir: 출력 디렉토리 - font_name: 폰트 이름 - folder_name: 출력 디렉토리의 하위 디렉토리 이름. 현재 '지선-당선' 또는 '지선-후보'. - 결과가 mongodb등으로 옮겨가야 하므로, 사용하지 않도록 바꿔야 함. + basedic: 기본 정보가 담긴 딕셔너리 """ - os.makedirs(os.path.join(outdir, method), exist_ok=True) distdb = client["district"] statdb = client["stats"] metroIds = distdb["metro_district"] localIds = distdb["local_district"] histcoll = statdb["age_hist"] statcoll = statdb["age_stat"] # method = "equal"에서 써 줄 통계. - councilorType = ( - "elected" - if folder_name[-2:] == "당선" - else "candidate" - if folder_name[-2:] == "후보" - else ValueError("folder_name should end with '당선' or '후보'") - ) - level = 1 if cluster_by == "sdName" else 2 - basedic = { - "councilorType": councilorType, - "year": year, - "level": level, - "method": method, - } # 기존 histogram 정보는 삭제 (나이별로 넣는 것이기 때문에 찌꺼기값 존재가능) - # histcoll.delete_many(basedic) - # if method == "equal": - # statcoll.delete_many(basedic) - youngest_age = ("", 100) - oldest_age = ("", 0) - print(f"({year}), {n_clst} clusters") - print(f"{'-' * 20}") - # # Get a colormap for generating unique colors for clusters - # colors = cm.rainbow(np.linspace(0, 1, n_clst)) + histcoll.delete_many(basedic.__dict__) + if basedic.method == "equal": + statcoll.delete_many(basedic.__dict__) + # 연도별로 데이터 찾아서 넣기! + years = [int(sgId//10000) for sgId in df_original["sgId"].unique()] + for year in years: + basedic.year = year + df = df_original[df_original["sgId"] // 10000 == year] + youngest_age = ("", 100) + oldest_age = ("", 0) + print(f"year {year}, {n_clst} clusters") + print(f"{'-' * 20}") + # # Get a colormap for generating unique colors for clusters + # colors = cm.rainbow(np.linspace(0, 1, n_clst)) - # wiwName을 처리합니다 - if level == 2: - df["sdName"] = df[["sdName", "wiwName"]].apply( - lambda x: local_to_metro_list(*x), axis=1 - ) - df["wiwName"] = df[["sdName", "wiwName"]].apply( - lambda x: change_local_name(*x), axis=1 - ) - # # 데이터프레임에서 시도별로 묶은 후 나이 열만 가져옵니다. - # df_age = pd.DataFrame(columns=["area", "age"]) - for area, df_clst in df.groupby(cluster_by): - df_clst = cluster_data(method, n_clst, df_clst) - first_q = df_clst[df_clst["cluster_label"] == 0]["age"].max() - last_q = df_clst[df_clst["cluster_label"] == n_clst - 1]["age"].min() - # 클러스터 중심 나이를 계산합니다. - clst_age_mean = [] - for i in range(n_clst): - clst_data = df_clst[df_clst["cluster_label"] == i] - # print(f"Cluster {i} in {area}: {clst_data['age'].min()} - {clst_data['age'].max()}") - cluster_center_age = round(clst_data["age"].mean(), 2) # 나이를 소수점 2자리까지 반올림 - clst_age_mean.append(cluster_center_age) - clst_of_young = 0 - clst_of_old = n_clst - 1 - if method == "kmeans": - clst_of_young = clst_age_mean.index(min(clst_age_mean)) - clst_of_old = clst_age_mean.index(max(clst_age_mean)) - clst_age_mean.sort() - # new_data = pd.DataFrame({"area": area, "age": clst_age_mean}) - # df_age = pd.concat([df_age, new_data], ignore_index=True) - # 지역의 가장 젊은, 나이든 그룹을 찾습니다 - yb_clst = df_clst[df_clst["cluster_label"] == clst_of_young] - ob_clst = df_clst[df_clst["cluster_label"] == clst_of_old] - print(f"Youngest in {area}: {yb_clst['age'].min()} - {yb_clst['age'].max()}") - print(f"Oldest in {area}: {ob_clst['age'].min()} - {ob_clst['age'].max()}") - # 그룹의 성비를 계산합니다. - young_group_sexratio = ( - yb_clst[yb_clst["gender"] == "여"].shape[0] / yb_clst.shape[0] - ) - old_group_sexratio = ( - ob_clst[ob_clst["gender"] == "여"].shape[0] / ob_clst.shape[0] - ) - print( - f"젊은 층의 성비는 여자가 {young_group_sexratio}, 노인층의 성비는 여자가 {old_group_sexratio}" - ) - # 년도의 가장 젊은, 나이든 그룹이 있는 지역을 찾습니다 - if clst_age_mean[0] < youngest_age[1]: - youngest_age = (area, clst_age_mean[0]) - if clst_age_mean[-1] > oldest_age[1]: - oldest_age = (area, clst_age_mean[-1]) - # 히스토그램을 그립니다. - histdata = [ - { - "minAge": int(age), - "maxAge": int(age) + 1, - "count": df_clst[df_clst["age"] == age].shape[0], - "ageGroup": int( - df_clst.loc[df_clst["age"] == age].iloc[0]["cluster_label"] - ), - } - for age in df_clst["age"].unique() - ] - statdata = None - if method == "equal": - statdata = [ + # wiwName을 처리합니다 + if basedic.level == 2: + df["sdName"] = df[["sdName", "wiwName"]].apply( + lambda x: local_to_metro_list(*x), axis=1 + ) + df["wiwName"] = df[["sdName", "wiwName"]].apply( + lambda x: change_local_name(*x), axis=1 + ) + # # 데이터프레임에서 시도별로 묶은 후 나이 열만 가져옵니다. + # df_age = pd.DataFrame(columns=["area", "age"]) + cluster_by = "sdName" if basedic.level == 1 else "wiwName" + for area, df_clst in df.groupby(cluster_by): + df_clst = cluster_data(basedic.method, n_clst, df_clst) + first_q = df_clst[df_clst["cluster_label"] == 0]["age"].max() + last_q = df_clst[df_clst["cluster_label"] == n_clst - 1]["age"].min() + # 클러스터 중심 나이를 계산합니다. + clst_age_mean = [] + for i in range(n_clst): + clst_data = df_clst[df_clst["cluster_label"] == i] + # print(f"Cluster {i} in {area}: {clst_data['age'].min()} - {clst_data['age'].max()}") + cluster_center_age = round(clst_data["age"].mean(), 2) # 나이를 소수점 2자리까지 반올림 + clst_age_mean.append(cluster_center_age) + clst_of_young = 0 + clst_of_old = n_clst - 1 + if basedic.method == "kmeans": + clst_of_young = clst_age_mean.index(min(clst_age_mean)) + clst_of_old = clst_age_mean.index(max(clst_age_mean)) + clst_age_mean.sort() + # new_data = pd.DataFrame({"area": area, "age": clst_age_mean}) + # df_age = pd.concat([df_age, new_data], ignore_index=True) + # 지역의 가장 젊은, 나이든 그룹을 찾습니다 + yb_clst = df_clst[df_clst["cluster_label"] == clst_of_young] + ob_clst = df_clst[df_clst["cluster_label"] == clst_of_old] + print(f"Youngest in {area}: {yb_clst['age'].min()} - {yb_clst['age'].max()}") + print(f"Oldest in {area}: {ob_clst['age'].min()} - {ob_clst['age'].max()}") + # 그룹의 성비를 계산합니다. + young_group_sexratio = ( + yb_clst[yb_clst["gender"] == "여"].shape[0] / yb_clst.shape[0] + ) + old_group_sexratio = ( + ob_clst[ob_clst["gender"] == "여"].shape[0] / ob_clst.shape[0] + ) + print( + f"젊은 층의 성비는 여자가 {young_group_sexratio}, 노인층의 성비는 여자가 {old_group_sexratio}" + ) + # 년도의 가장 젊은, 나이든 그룹이 있는 지역을 찾습니다 + if clst_age_mean[0] < youngest_age[1]: + youngest_age = (area, clst_age_mean[0]) + if clst_age_mean[-1] > oldest_age[1]: + oldest_age = (area, clst_age_mean[-1]) + # 히스토그램을 그립니다. + histdata = [ { - "firstquintile": int(first_q), - "lastquintile": int(last_q), - "population": int(df_clst.shape[0]), + "minAge": int(age), + "maxAge": int(age) + 1, + "count": df_clst[df_clst["age"] == age].shape[0], + "ageGroup": int(df_clst.loc[df_clst["age"] == age].iloc[0]["cluster_label"]) } + for age in df_clst["age"].unique() ] - # 지역 id를 잘 설정해줍니다. - metroname = df_clst["sdName"].iloc[0] - metroId = metroIds.find_one({"sdName": metroname})["metroId"] - if level == 1: - print("sdName is ", metroname) - dic = basedic.copy() - dic["metroId"] = metroId - insert_data_to_mongo( - dic, histdata, histcoll, statdata=statdata, statcoll=statcoll - ) - elif metroname in change_lvl2to1.values(): - print("sdName is ", metroname) - dic = basedic.copy() - dic["level"] = 1 - dic["metroId"] = metroId - # histcoll.delete_many(dic) # 기존 정보를 삭제 - if method == "kmeans": - insert_data_to_mongo(dic, histdata, histcoll) + statdata = None + if basedic.method == "equal": + statdata = [ + { + "firstquintile": int(first_q), + "lastquintile": int(last_q), + "population": int(df_clst.shape[0]), + } + ] + # 지역 id를 잘 설정해줍니다. + metroname = df_clst["sdName"].iloc[0] + metroId = metroIds.find_one({"sdName": metroname})["metroId"] + if basedic.level == 1: + print("sdName is ", metroname) + dic = basedic.__dict__.copy() + dic["metroId"] = metroId + insert_data_to_mongo( + dic, histdata, histcoll, statdata=statdata, statcoll=statcoll + ) + elif metroname in change_lvl2to1.values(): + print("sdName is ", metroname) + dic = basedic.__dict__.copy() + dic["level"] = 1 + dic["metroId"] = metroId + # histcoll.delete_many(dic) # 기존 정보를 삭제 + if basedic.method == "kmeans": + insert_data_to_mongo(dic, histdata, histcoll) + else: + # l1statcoll = statcollection[ + # folder_name + "_" + year + "_1level_" + method + # ] + # statcoll.delete_many(dic) + insert_data_to_mongo( + dic, + histdata, + histcoll, + statdata=statdata, + statcoll=statcoll, + ) else: - # l1statcoll = statcollection[ - # folder_name + "_" + year + "_1level_" + method - # ] - # statcoll.delete_many(dic) + localname = df_clst["wiwName"].iloc[0] + print("sdName is ", metroname, "wiwName is", localname) + localId = localIds.find_one({"sdName": metroname, "wiwName": localname})[ + "localId" + ] + dic = basedic.__dict__.copy() + dic["metroId"] = metroId insert_data_to_mongo( dic, histdata, histcoll, statdata=statdata, statcoll=statcoll, + localId=localId, ) - else: - localname = df_clst["wiwName"].iloc[0] - print("sdName is ", metroname, "wiwName is", localname) - localId = localIds.find_one({"sdName": metroname, "wiwName": localname})[ - "localId" - ] - dic = basedic.copy() - dic["metroId"] = metroId - insert_data_to_mongo( - dic, - histdata, - histcoll, - statdata=statdata, - statcoll=statcoll, - localId=localId, - ) - # # 그리기 - # package = ( - # outdir, - # df_clst, - # year, - # area, - # n_clst, - # method, - # cluster_by, - # folder_name, - # colors, - # font_name, - # ) - # make_hist(package) - - print(f"Number of data points per cluster for {area}, method {method}") - for cluster_label in range(n_clst): - closest_data_count = sum(df_clst["cluster_label"] == cluster_label) - print( - f"Cluster {cluster_label}: Age {clst_age_mean[cluster_label]}, {closest_data_count} closest data points" - ) - print(f"Youngest in {youngest_age[0]}: {youngest_age[1]}") - print(f"Oldest in {oldest_age[0]}: {oldest_age[1]}") - - # # 그리기 - # package = ( - # outdir, - # df.shape[0], - # year, - # df_age, - # n_clst, - # method, - # cluster_by, - # folder_name, - # colors, - # font_name, - # ) - # make_scatterplot(package) + print(f"Number of data points per cluster for {area}, method {basedic.method}") + for cluster_label in range(n_clst): + closest_data_count = sum(df_clst["cluster_label"] == cluster_label) + print( + f"Cluster {cluster_label}: Age {clst_age_mean[cluster_label]}, {closest_data_count} closest data points" + ) + print(f"Youngest in {youngest_age[0]}: {youngest_age[1]}") + print(f"Oldest in {oldest_age[0]}: {oldest_age[1]}") \ No newline at end of file diff --git a/analysis/age/main.py b/analysis/age/main.py index 721e2bc..f2029f1 100644 --- a/analysis/age/main.py +++ b/analysis/age/main.py @@ -5,6 +5,7 @@ from matplotlib import font_manager from analysis.age.most_common_age_group import most_common_age_group from analysis.age.hist_groups import cluster +from analysis.age import BasicArgument # 경고 무시 warnings.filterwarnings("ignore", category=FutureWarning) @@ -15,36 +16,47 @@ fname=os.path.join(BASE_DIR, "_data", "NanumSquareL.ttf") ).get_name() +councilordict = { + "시도의원": "metro_councilor", + "광역의원비례대표": "metro_councilor", + "구시군의회의원": "local_councilor", + "기초의원비례대표": "local_councilor", +} -def main(N=5): +def main(N=5, folder_name="To_be_filled"): ## TO-DO: excel말고 mongodb에서 받아오도록 합니다. ## 이 링크에 구현될 save_to_mongo함수 참고 : https://github.com/NewWays-TechForImpactKAIST/API-scrap-and-analysis//blob/bd817e9a15086d313d9615b2515a81e0dbd73850/API/utils.py#L34 - for folder_name in ["지선-당선", "지선-후보"]: - for cluster_by in ["sdName", "wiwName"]: - # folder_name = input("_data 내의 폴더 이름은 무엇인가요?") - # cluster_by = input("구역을 나눌 기준을 입력해주세요 (sdName 즉 시/도 또는 wiwName 즉 기초단체단위): ") - datadir = os.path.join(BASE_DIR, "_data", folder_name) - outdir = os.path.join( - BASE_DIR, "output", f"age_all_{cluster_by}", folder_name + ## 1. 지역의회 + # cluster_by = input("구역을 나눌 기준을 입력해주세요 (sdName 즉 시/도 또는 wiwName 즉 기초단체단위): ") + cluster_by = "sdName" + assert cluster_by in ["sdName", "wiwName"] + level = 1 if cluster_by == "sdName" else 2 + datadir = os.path.join(BASE_DIR, "_data", folder_name) + for d in os.listdir(datadir): + # xlsx 파일을 읽어옵니다. + if not d.endswith(".xlsx"): + continue + df = pd.read_excel(os.path.join(datadir, d)) + # 필요한 열만 추출합니다. + if level == 1: + df = df[["sgId", "sdName", "name", "age", "gender"]] + else: + df = df[["sgId", "sdName", "wiwName", "name", "age", "gender"]] + df = df.sort_values(by="age") + is_elected = ( + True + if "당선" in d + else False + if "후보" in d + else ValueError("엑셀파일 이름에 '당선'이든지 '후보'가 있어야 합니다.") + ) + councilorType = councilordict[d.split('[')[-1].split(']')[0]] + for method in ["kmeans", "equal"]: + basedic = BasicArgument(councilorType=councilorType, is_elected=is_elected, level=level, method=method) + cluster( + df, N, basedic ) - - for d in os.listdir(datadir): - # xlsx 파일을 읽어옵니다. - if not d.endswith(".xlsx"): - continue - df = pd.read_excel(os.path.join(datadir, d)) - - # 필요한 열만 추출합니다. - df = df[["sdName", "wiwName", "name", "age", "gender"]] - df = df.sort_values(by="age") - year = int(d[7:11]) - # most_common_age_group(df, year) - cluster( - df, year, N, "kmeans", cluster_by, outdir, font_name, folder_name - ) - cluster( - df, year, N, "equal", cluster_by, outdir, font_name, folder_name - ) + ## 2. 광역의회 main()