diff --git a/.gitignore b/.gitignore index be72122..b21ce63 100644 --- a/.gitignore +++ b/.gitignore @@ -222,4 +222,5 @@ pyrightconfig.json /_data /output -__pycache__ \ No newline at end of file +__pycache__ +/logs \ No newline at end of file diff --git a/db/types.py b/db/types.py index 4b04658..7e22fdd 100644 --- a/db/types.py +++ b/db/types.py @@ -9,9 +9,9 @@ class CouncilType(str, Enum): 의회의 종류를 나타내는 열거형입니다. """ - LOCAL_COUNCIL = "local_council" - NATIONAL_COUNCIL = "national_council" - METROPOLITAN_COUNCIL = "metropolitan_council" + LOCAL_COUNCIL = "local_councilor" + NATIONAL_COUNCIL = "national_councilor" + METROPOLITAN_COUNCIL = "metropolitan_councilor" LOCAL_LEADER = "local_leader" METRO_LEADER = "metro_leader" """ diff --git a/requirements.txt b/requirements.txt index 8437846..726b96d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ gspread==5.11.2 pymongo==4.5.0 python-dotenv==1.0.0 openpyxl -selenium \ No newline at end of file +selenium +tqdm \ No newline at end of file diff --git a/scrap/local_councils/basic.py b/scrap/local_councils/basic.py index 4bbcfa8..d4ac5e6 100644 --- a/scrap/local_councils/basic.py +++ b/scrap/local_councils/basic.py @@ -138,7 +138,7 @@ def sel_getname(profile, element, class_, wrapper_element, wrapper_class_): if keyword in name: # 인천 서구 등 name = name.replace(keyword, "").strip() break - print(name, "is name\n") + # print(name, "is name\n") maybe_name = name.split()[0] # 이름 뒤에 직책이 따라오는 경우 if len(maybe_name) == 1: # 외자 이름이 띄어쓰기 때문에 분리된 경우 name = "".join(name.split()[0:2]) @@ -266,7 +266,7 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapRe profiles = getprofiles( soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls ) - print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. + # print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. for profile in profiles: name = party = "" @@ -314,7 +314,7 @@ def sel_scrap_basic(url, cid, args: ScrapBasicArgument) -> ScrapResult: profiles = sel_getprofiles( browser, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls ) - print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. + # print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. for profile in profiles: name = party = "" diff --git a/scrap/local_councils/gyeonggi.py b/scrap/local_councils/gyeonggi.py index 55ffbd0..2d97d79 100644 --- a/scrap/local_councils/gyeonggi.py +++ b/scrap/local_councils/gyeonggi.py @@ -94,7 +94,7 @@ def scrap_88(url, cid, args: ScrapBasicArgument) -> ScrapResult: profiles = get_profiles_88_103( soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls ) - print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. + # print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. for profile in profiles: name = getname( @@ -153,7 +153,7 @@ def scrap_103(url, cid, args: ScrapBasicArgument) -> ScrapResult: profiles = get_profiles_88_103( soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls ) - print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. + # print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. for profile in profiles: name = getname( diff --git a/scrap/local_councils/incheon.py b/scrap/local_councils/incheon.py index 5a45f57..72c60b6 100644 --- a/scrap/local_councils/incheon.py +++ b/scrap/local_councils/incheon.py @@ -170,7 +170,7 @@ def scrap_57(url, args) -> ScrapResult: profiles = getprofiles( soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls ) - print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. + # print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. for profile in profiles: name = getname( diff --git a/scrap/utils/database.py b/scrap/utils/database.py index 2c351ad..74c3526 100644 --- a/scrap/utils/database.py +++ b/scrap/utils/database.py @@ -1,3 +1,5 @@ +from typing import List + from db.client import client from configurations.secrets import MongoDBSecrets @@ -7,74 +9,90 @@ import json -# Note: MongoDB는 데이터베이스가 존재하지 않으면 자동으로 생성합니다. -# MongoDB 데이터베이스는 하나 이상의 컬렉션으로 구성됩니다. -# 컬렉션은 하나 이상의 문서로 구성됩니다. -db = client[str(MongoDBSecrets.database_name)] - - -def save_to_database(record: ScrapResult): +def save_to_database(record: ScrapResult) -> bool: """ 지방의회 크롤링 결과를 데이터베이스에 저장합니다. - 각 의회의 기존 데이터는 덮어 씌워집니다. - 예시는 scrap/utils/database.py를 참조해주세요. :param record: 지방의회 크롤링 결과 :return: 저장 성공 여부를 불리언 값으로 반환합니다. """ - try: - # MongoDB는 JSON을 저장할 수 있습니다. - # JSON 형태로 변환한 후, MongoDB에 저장합니다. - # serialized_record = json.dumps(dataclasses.asdict(record), ensure_ascii=False) - collection = db[str(record.council_type)] - result = collection.find_one( - {"council_id": record.council_id}, + db = client[str(MongoDBSecrets.database_name)] + collection = db[str(record.council_type)] + + cid = record.council_id + + new_councilors = check_new_councilors(collection, record) + resigned_councilors = check_resigned_councilors(collection, record) + other_councilors = [ + councilor + for councilor in record.councilors + if councilor not in new_councilors + and councilor.name + not in [councilor["name"] for councilor in resigned_councilors] + ] + + # TODO: DB에 없던 새로운 의원 핸들링 + update_councilors(collection, cid, other_councilors) + remove_councilors(collection, resigned_councilors) + + return True + + +def check_new_councilors(collection, record: ScrapResult) -> list[Councilor]: + """ + DB에 없던 새로운 의원을 찾아 반환합니다. + :param collection: MongoDB 컬렉션 + :param record: 지방의회 크롤링 결과 + :return: 새로운 의원 목록 + """ + new_councilors = [] + + for councilor in record.councilors: + if ( + collection.find_one({"localId": record.council_id, "name": councilor.name}) + is None + ): + new_councilors.append(councilor) + + return new_councilors + + +def check_resigned_councilors(collection, record: ScrapResult) -> list[dict]: + """ + DB에 있었으나 사퇴한 의원을 찾아 반환합니다. + :param collection: MongoDB 컬렉션 + :param record: 지방의회 크롤링 결과 + :return: 사퇴한 의원 목록 + """ + resigned_councilors = [] + + for councilor in collection.find({"localId": record.council_id}): + if councilor["name"] not in [councilor.name for councilor in record.councilors]: + resigned_councilors.append(councilor) + + return resigned_councilors + + +def update_councilors(collection, cid: int, councilors: List[Councilor]) -> None: + """ + 신규/사퇴 의원이 아닌 의원 정보를 업데이트합니다. + :param collection: MongoDB 컬렉션 + :param councilors: 업데이트할 의원 리스트 + """ + for councilor in councilors: + collection.update_one( + {"localId": cid, "name": councilor.name}, + {"$set": asdict(councilor)}, + upsert=False, + ) + + +def remove_councilors(collection, councilors: List[Councilor]) -> None: + """ + 사퇴한 의원을 데이터베이스에서 제거합니다. + :param collection: MongoDB 컬렉션 + :param councilors: 제거할 의원 리스트 + """ + for councilor in councilors: + collection.delete_one( + {"localId": councilor["localId"], "name": councilor["name"]} ) - if result is not None: - before_councilors = result["councilors"] # List[dict] - updated_councilors = [] - updated_names = set() - - name_data_map_for_update = {d.name: asdict(d) for d in record.councilors} - - for d in before_councilors: - if d["name"] in name_data_map_for_update: - d.update( - { - k: v - for k, v in name_data_map_for_update[d["name"]].items() - if k in d - } - ) - updated_names.add(d["name"]) - updated_councilors.append(d) - - for d in record.councilors: - if d.name not in updated_names: - updated_councilors.append(asdict(d)) - - collection.find_one_and_update( - {"council_id": record.council_id}, - {"$set": {"councilors": updated_councilors}}, - upsert=True, - ) - else: - return False - - return True - except Exception as e: - t - print(e) - return False - - -if __name__ == "__main__": - test_record = ScrapResult( - council_id="test-test", - council_type=CouncilType.LOCAL_COUNCIL, - councilors=[ - Councilor(name="김철수", jdName="국민의힘"), - Councilor(name="김영희", jdName="Birthday Party"), - Councilor(name="테스트", jdName="테스트당"), - ], - ) - print(save_to_database(test_record)) diff --git a/scrap/utils/export.py b/scrap/utils/export.py new file mode 100644 index 0000000..014da59 --- /dev/null +++ b/scrap/utils/export.py @@ -0,0 +1,39 @@ +import os +import json +from dataclasses import asdict + +from scrap.utils.types import ScrapResult, ScrapBasicArgument + + +def export_results_to_json( + results: dict[int, ScrapResult], output_path: str, current_time: str +): + os.makedirs(output_path, exist_ok=True) + results = { + k: [asdict(councilor) for councilor in v.councilors] for k, v in results.items() + } + + with open( + os.path.join(output_path, f"scraping_result_{current_time}.json"), + "w", + encoding="utf-8", + ) as f: + json.dump(results, f, ensure_ascii=False, indent=4) + + +def export_results_to_txt( + results: dict[int, ScrapResult], output_path: str, current_time: str +): + os.makedirs(output_path, exist_ok=True) + results = { + k: [asdict(councilor) for councilor in v.councilors] for k, v in results.items() + } + + with open( + os.path.join(output_path, f"scraping_result_{current_time}.txt"), + "w", + encoding="utf-8", + ) as f: + for cid, councilors in results.items(): + councilors = "\n".join([c.to_txt() for c in councilors]) + f.write(f"| {cid} | {councilors}\n") diff --git a/scrap/utils/runner.py b/scrap/utils/runner.py new file mode 100644 index 0000000..fc0955d --- /dev/null +++ b/scrap/utils/runner.py @@ -0,0 +1,203 @@ +import os +import sys +import json +import argparse +import datetime +import logging +import warnings +from typing import List, Dict, Optional +from collections.abc import Iterable +from tqdm import tqdm + +from scrap.utils.export import export_results_to_json, export_results_to_txt +from scrap.utils.database import save_to_database +from scrap.utils.types import ScrapResult, ScrapBasicArgument +from scrap.utils.spreadsheet import read_record_from_spreadsheet +from scrap.local_councils.seoul import * +from scrap.local_councils.busan import * +from scrap.local_councils.daegu import * +from scrap.local_councils.incheon import * +from scrap.local_councils.gwangju import * + +# from scrap.local_councils.daejeon import * +from scrap.local_councils.ulsan import * +from scrap.local_councils.gyeonggi import * +from scrap.local_councils.gangwon import * +from scrap.local_councils.chungcheong import * +from scrap.local_councils.jeolla import * +from scrap.local_councils.gyeongsang import * +from scrap.local_councils import * +from requests.exceptions import Timeout + + +BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir) + + +class ScraperRunner: + def __init__( + self, + runner_args_path: str, + council_args_path: str, + data_source: str, + kwargs: Dict[str, str] = {}, + ): + with open(runner_args_path, "r") as f: + self.runner_args = json.load(f) + with open(council_args_path, "r") as f: + self.council_args = json.load(f) + + self.get_records_from_data_source(data_source) + + self.setup_logging(kwargs.get("log_path"), kwargs.get("current_time")) + self.error_log = dict() + self.timeout_count = 0 + self.parseerror_count = 0 + + def setup_logging(self, log_path: str, current_time: str): + if not os.path.exists(log_path): + os.makedirs(log_path) + + log_path = os.path.join(BASE_DIR, log_path, f"scraping_log_{current_time}.log") + + logging.basicConfig( + filename=log_path, + level=logging.INFO, + format="[%(asctime)s] %(levelname)s - %(message)s", + ) + + def get_records_from_data_source(self, data_source: str): + if data_source == "google_sheets": + self.url_records = read_record_from_spreadsheet() + elif data_source == "mongodb": + # TODO: Implement MongoDB -> MongoDB에 지방의회별 URL을 저장할 필요성 논의 + raise NotImplementedError("MongoDB에 아직 데이터가 없습니다.") + + # Helper Functions + def is_euc_kr(self, n: int) -> bool: + return n in self.runner_args["euc_kr"] + + def is_special_function(self, n: int) -> bool: + return n in self.runner_args["special_functions"] + + def is_selenium_basic(self, n: int) -> bool: + return n in self.runner_args["selenium_basic"] + + def handle_errors(self, cid: int, error): + self.error_log[cid] = str(error) + + if isinstance(error, Timeout): + self.timeout_count += 1 + elif isinstance(error, ValueError) and "정보 없음" in str(error): + self.parseerror_count += 1 + logging.error(f"| {cid} | 오류: {error}") + + def run_single_council(self, n: int) -> ScrapResult: + encoding = "euc-kr" if self.is_euc_kr(n) else "utf-8" + council_url = self.url_records[n - 1]["URL"] + council_args = self.council_args.get(str(n), None) + if council_args is not None: + council_args = ScrapBasicArgument(**council_args) + + if self.is_special_function(n): + function_name = f"scrap_{n}" + if hasattr(sys.modules[__name__], function_name): + function_to_call = getattr(sys.modules[__name__], function_name) # type: ignore + result = function_to_call(council_url, n, args=council_args) + else: + raise NotImplementedError(f"함수를 찾을 수 없습니다: {function_name}") + else: + if council_args is None: + raise ValueError(f"{n}번 의회에 대한 ScrapBasicArgument가 없습니다.") + + if self.is_selenium_basic(n): + result = sel_scrap_basic(council_url, n, council_args) + else: + result = scrap_basic(council_url, n, council_args, encoding) + + return result + + def run_all_councils(self, cids: Iterable[int]) -> Dict[int, ScrapResult]: + scrape_results = dict() + + for cid in tqdm(cids): + try: + result = self.run_single_council(cid) + if "정보 없음" in str(result.councilors): + raise ValueError("정보 없음이 포함되어 있습니다.") + scrape_results[cid] = result + except Exception as e: + self.handle_errors(cid, e) + + logging.info( + f"| 총 실행 횟수: {len(cids)} | 에러: {list(self.error_log.keys())}, 총 {len(self.error_log)}회 | 그 중 정보 없음 횟수: {self.parseerror_count} | 타임아웃 횟수: {self.timeout_count} |" + ) + + return scrape_results + + +def main(args: Dict[str, str]) -> None: + warnings.filterwarnings("ignore") + + current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M") + runner_kwargs = { + "log_path": args.get("log_path"), + "current_time": current_time, + } + + runner = ScraperRunner( + args["runner_args_path"], + args["council_args_path"], + args["data_source"], + runner_kwargs, + ) + + cids_to_run = parse_cids(args.get("cids")) + results = runner.run_all_councils(cids_to_run) + + if args.get("update_mongo"): + for result in results.values(): + # TODO: 잘 작동하는지 확인 필요 + save_to_database(result) + + if args.get("output_store"): + if args.get("output_format") == "json": + export_results_to_json(results, args.get("output_path"), current_time) + elif args.get("output_format") == "txt": + export_results_to_txt(results, args.get("output_path"), current_time) + + +def parse_cids(cids_str: Optional[str]) -> list[int]: + if cids_str: + return [int(cid.strip()) for cid in cids_str.split(",")] + else: + return range(1, 227) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="지방의회 스크랩 스크립트 실행") + parser.add_argument("runner_args_path", help="runner_args JSON 파일 경로") + parser.add_argument("council_args_path", help="council_args JSON 파일 경로") + parser.add_argument( + "data_source", + help="사용할 데이터 소스 ('google_sheets', 'mongodb')", + choices=["google_sheets", "mongodb"], + default="google_sheets", + ) + parser.add_argument("-c", "--cids", help="스크랩할 지방의회 ID 목록 (쉼표로 구분)", default=None) + parser.add_argument("-l", "--log_path", help="로그 파일 경로", default="logs") + parser.add_argument( + "-m", "--update_mongo", help="스크랩 결과를 MongoDB에 업데이트", action="store_true" + ) + parser.add_argument( + "-o", "--output_store", help="스크랩 결과를 로컬에 저장", action="store_true" + ) + parser.add_argument( + "--output_format", + help="스크랩 결과 저장 형식 ('json', 'txt')", + choices=["json", "txt"], + default="json", + ) + parser.add_argument("--output_path", help="스크랩 결과 저장 경로", default="output") + args = vars(parser.parse_args()) + + main(args) diff --git a/scrap/utils/runner_args.json b/scrap/utils/runner_args.json new file mode 100644 index 0000000..ddad42b --- /dev/null +++ b/scrap/utils/runner_args.json @@ -0,0 +1,19 @@ +{ + "euc_kr": [ + 6, 13, 16, 31, 72, 88, 112, 134, 154, 157, 163, 165, 167, 176, 181, 197, + 202, 222 + ], + "special_functions": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 62, 63, 64, + 88, 97, 103, 107, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, + 124, 125, 126, 132, 134, 140, 142, 154, 155, 156, 157, 160, 161, 162, 163, + 164, 165, 167, 177, 178, 179, 182, 183, 184, 186, 188, 189, 190, 191, 194, + 195, 196, 198, 199, 201, 203, 206, 208, 209, 210, 212, 213, 214, 215, 216, + 217, 218, 219, 220, 222, 223, 224, 226 + ], + "selenium_basic": [76, 78, 101, 169, 173, 177], + "no_information": [18, 29, 106, 111, 172, 181, 185, 187, 197, 200, 204, 207], + "error_unresolved": [170, 171] +} diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py index f3df8bf..fef0e23 100644 --- a/scrap/utils/spreadsheet.py +++ b/scrap/utils/spreadsheet.py @@ -14,7 +14,8 @@ from scrap.local_councils.daegu import * from scrap.local_councils.incheon import * from scrap.local_councils.gwangju import * -from scrap.local_councils.daejeon import * + +# from scrap.local_councils.daejeon import * from scrap.local_councils.ulsan import * from scrap.local_councils.gyeonggi import * from scrap.local_councils.gangwon import * @@ -60,16 +61,16 @@ def google_authorization(): return gspread.authorize(creds) -def main() -> None: - # Google Sheets API 설정 - client: gspread.client.Client = google_authorization() - - # 스프레드시트 열기 +def read_record_from_spreadsheet() -> list[dict]: + client = google_authorization() link = "https://docs.google.com/spreadsheets/d/1fBDJjkw8FSN5wXrvos9Q2wDsyItkUtNFGOxUZYE-h0M/edit#gid=1127955905" # T4I-의회목록 - spreadsheet: gspread.Spreadsheet = client.open_by_url(link) - worksheet: gspread.Worksheet = spreadsheet.get_worksheet( - 0 - ) # 원하는 워크시트 선택 (0은 첫 번째 워크시트입니다.) + spreadsheet = client.open_by_url(link) + worksheet = spreadsheet.get_worksheet(0) + + return worksheet.get_all_records() + + +def main() -> None: # TODO - 홈페이지 위 charset=euc-kr 등을 인식해 바로 가져오기. euc_kr = [ 6, @@ -129,22 +130,14 @@ def main() -> None: f.close() # 데이터 가져오기 - data: list[dict] = worksheet.get_all_records() + # data: list[dict] = worksheet.get_all_records() + data = read_record_from_spreadsheet() result: str = "" parse_error_times = 0 timeouts = 0 N = 226 - emessages: str = "" - enumbers = [] - - def add_error(n, msg): - nonlocal emessages - emsg: str = f"| {n:3} | 오류: {msg}" - emessages += emsg - enumbers.append(n) - - for n in range(1, N + 1): + for n in range(1, N + 1): # range(1, N + 1): if n in no_information + error_unsolved: emsg: str = ( ( @@ -190,7 +183,8 @@ def add_error(n, msg): emsg = "스크랩 결과에 '정보 없음'이 포함되어 있습니다. 일부 인명에\ 대해 스크랩이 실패했다는 뜻이에요. 함수나 인자를 점검해 주세요." parse_error_times += 1 - add_error(n, emsg) + errors.append(n) + # print(f"| {n} | {result}") except Timeout: emsg = f"{council_url}에 시도한 연결이 타임아웃됐어요." timeouts += 1