diff --git a/analysis/age/hist_groups.py b/analysis/age/hist_groups.py index 2ab0471..2a01d59 100644 --- a/analysis/age/hist_groups.py +++ b/analysis/age/hist_groups.py @@ -197,10 +197,12 @@ def cluster(df_original, n_clst, basedic): if basedic.method == "equal": statcoll.delete_many(basedic.__dict__) # 연도별로 데이터 찾아서 넣기! - years = [int(sgId//10000) for sgId in df_original["sgId"].unique()] + df_original["year"] = df_original["sgId"] // 10000 + df_original = df_original[df_original["year"].isin([2010, 2014, 2018, 2022])] + years = df_original["year"].unique() for year in years: - basedic.year = year - df = df_original[df_original["sgId"] // 10000 == year] + basedic.year = int(year) + df = df_original[df_original["year"] == year] youngest_age = ("", 100) oldest_age = ("", 0) print(f"year {year}, {n_clst} clusters") diff --git a/analysis/age/main.py b/analysis/age/main.py index f2029f1..61a43ff 100644 --- a/analysis/age/main.py +++ b/analysis/age/main.py @@ -28,34 +28,41 @@ def main(N=5, folder_name="To_be_filled"): ## 이 링크에 구현될 save_to_mongo함수 참고 : https://github.com/NewWays-TechForImpactKAIST/API-scrap-and-analysis//blob/bd817e9a15086d313d9615b2515a81e0dbd73850/API/utils.py#L34 ## 1. 지역의회 # cluster_by = input("구역을 나눌 기준을 입력해주세요 (sdName 즉 시/도 또는 wiwName 즉 기초단체단위): ") - cluster_by = "sdName" + cluster_by = "wiwName" assert cluster_by in ["sdName", "wiwName"] level = 1 if cluster_by == "sdName" else 2 datadir = os.path.join(BASE_DIR, "_data", folder_name) - for d in os.listdir(datadir): + # for d in os.listdir(datadir): # xlsx 파일을 읽어옵니다. - if not d.endswith(".xlsx"): - continue - df = pd.read_excel(os.path.join(datadir, d)) - # 필요한 열만 추출합니다. - if level == 1: - df = df[["sgId", "sdName", "name", "age", "gender"]] - else: - df = df[["sgId", "sdName", "wiwName", "name", "age", "gender"]] - df = df.sort_values(by="age") - is_elected = ( - True - if "당선" in d - else False - if "후보" in d - else ValueError("엑셀파일 이름에 '당선'이든지 '후보'가 있어야 합니다.") + # if not d.endswith(".xlsx"): + # continue + # df = pd.read_excel(os.path.join(datadir, d)) + # d = "[당선][시도의원].xlsx" + d = "[당선][구시군의회의원].xlsx" + df_1 = pd.read_excel(os.path.join(datadir, d)) + # d = "[당선][광역의원비례대표].xlsx" + d = "[당선][기초의원비례대표].xlsx" + df_2 = pd.read_excel(os.path.join(datadir, d)) + df = pd.concat([df_1, df_2]) + # 필요한 열만 추출합니다. + if level == 1: + df = df[["sgId", "sdName", "name", "age", "gender"]] + else: + df = df[["sgId", "sdName", "wiwName", "name", "age", "gender"]] + df = df.sort_values(by="age") + is_elected = ( + True + if "당선" in d + else False + if "후보" in d + else ValueError("엑셀파일 이름에 '당선'이든지 '후보'가 있어야 합니다.") + ) + councilorType = councilordict[d.split('[')[-1].split(']')[0]] + for method in ["kmeans", "equal"]: + basedic = BasicArgument(councilorType=councilorType, is_elected=is_elected, level=level, method=method) + cluster( + df, N, basedic ) - councilorType = councilordict[d.split('[')[-1].split(']')[0]] - for method in ["kmeans", "equal"]: - basedic = BasicArgument(councilorType=councilorType, is_elected=is_elected, level=level, method=method) - cluster( - df, N, basedic - ) ## 2. 광역의회