From 151d773584d6a4bc247a47b167f13fd70396e8e4 Mon Sep 17 00:00:00 2001 From: Re-st Date: Wed, 22 Nov 2023 17:08:40 +0900 Subject: [PATCH] =?UTF-8?q?[analysis]=20=ED=96=89=EC=A0=95=EA=B5=AC?= =?UTF-8?q?=EC=97=AD=20=EA=B0=9C=ED=8E=B8=EC=9C=BC=EB=A1=9C=20=EC=9D=B8?= =?UTF-8?q?=ED=95=9C=20=EC=9D=B4=EB=A6=84=EB=B3=80=EA=B2=BD=20=EA=B3=A0?= =?UTF-8?q?=EC=B9=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- analysis/age/hist_groups.py | 51 ++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/analysis/age/hist_groups.py b/analysis/age/hist_groups.py index ebdadf9..0a3d476 100644 --- a/analysis/age/hist_groups.py +++ b/analysis/age/hist_groups.py @@ -65,19 +65,45 @@ def cluster_data(method, n_clst, df): df.loc[df["age"] == min_age, "cluster_label"] = i return df -def extract_city(area): +# 이름이 바뀐 경우 +change_city_name = { + ("충청남도", "당진군"): "당진시", + ("경상남도", "마산시"): "창원시", + ("경상남도", "진해시"): "창원시", + ("경기도", "여주군"): "여주시", + ("충청북도", "청원군"): "청주시", + ("인천광역시", "남구"): "미추홀구", +} + +# +change_lvl2to1 = {"연기군": "세종특별자치시"} + +def change_local_name(sdName, wiwName): """ - 만약 '시' 와 '구'가 모두 wiwName에 있다면, '시' 까지만 쓰기 + 1. 만약 '시' 와 '구'가 모두 wiwName에 있다면, '시' 까지만 쓰기 ex) '용인시수지구' (선거 단위) -> '용인시' (의회 단위) + 2. 지역이 승급되면 이름 바꾸기 + ex) '당진군' (~2011) -> '당진시' (2012~) Keyword arguments: argument -- string Return: processed string """ - if '구' in area and '시' in area: - return area.split('시')[0] + '시' + if (sdName, wiwName) in change_city_name: + return change_city_name[(sdName, wiwName)] + if '구' in wiwName and '시' in wiwName: + return wiwName.split('시')[0] + '시' else: - return area + return wiwName +def local_to_metro_list(sdName, wiwName): + """ + 구시군에서 광역시/도로 승격한 경우 + """ + if wiwName in change_lvl2to1: + print('change', wiwName, 'to', change_lvl2to1[wiwName]) + return change_lvl2to1[wiwName] + else: + return sdName def cluster(df, year, n_clst, method, cluster_by, outdir, font_name, folder_name): """구역별 그룹을 만듭니다. df: 데이터프레임 @@ -94,7 +120,7 @@ def cluster(df, year, n_clst, method, cluster_by, outdir, font_name, folder_name ids = client["district"] metroIds = ids["metro_district"] localIds = ids["local_district"] - db = client["agehistogram"] + db = client["age_hist"] level = "1level" if cluster_by == "sdName" else "2level" main_collection = db[folder_name + "_" + year + "_" + level + "_" + method] # 기존 histogram 정보는 삭제 (나이별로 넣는 것이기 때문에 찌꺼기값 존재가능) @@ -107,12 +133,12 @@ def cluster(df, year, n_clst, method, cluster_by, outdir, font_name, folder_name # colors = cm.rainbow(np.linspace(0, 1, n_clst)) # wiwName을 처리합니다 - df['wiwName'] = df['wiwName'].apply(extract_city) + if level == "2level": + df['sdName'] = df[['sdName', 'wiwName']].apply(lambda x: local_to_metro_list(*x), axis=1) + df['wiwName'] = df[['sdName', 'wiwName']].apply(lambda x: change_local_name(*x), axis=1) # 데이터프레임에서 시도별로 묶은 후 나이 열만 가져옵니다. df_age = pd.DataFrame(columns=["area", "age"]) for area, df_clst in df.groupby(cluster_by): - metroname = df_clst["sdName"].iloc[0] - localname = df_clst["wiwName"].iloc[0] df_clst = cluster_data(method, n_clst, df_clst) # 클러스터 중심 나이를 계산합니다. clst_age_mean = [] @@ -160,10 +186,17 @@ def cluster(df, year, n_clst, method, cluster_by, outdir, font_name, folder_name df_clst.groupby('age')['cluster_label'].first() ) ] + metroname = df_clst["sdName"].iloc[0] metroId = metroIds.find_one({"sdName": metroname})["metroId"] if level == "1level": + print ("sdName is ", metroname) main_collection.insert_one({"metroId": metroId, "data": data}) + elif metroname in change_lvl2to1.values(): + print ("sdName is ", metroname) + lvl1_collection = db[folder_name + "_" + year + "_1level_" + method] + lvl1_collection.insert_one({"metroId": metroId, "data": data}) else: + localname = df_clst["wiwName"].iloc[0] print ("sdName is ", metroname, "wiwName is", localname) localId = localIds.find_one({"sdName": metroname, "wiwName": localname})["localId"] main_collection.insert_one({"metroId": metroId, "localId": localId, "data": data})