diff --git a/scrap/local_councils/basic.py b/scrap/local_councils/basic.py
index d4ac5e6..f36a252 100644
--- a/scrap/local_councils/basic.py
+++ b/scrap/local_councils/basic.py
@@ -124,9 +124,6 @@ def sel_getname(profile, element, class_, wrapper_element, wrapper_class_):
             name = strong_element.text.strip()
     except NoSuchElementException:
         pass
-    # print(name+"\n")
-    if name == "":
-        return "231114"
     name = name.split("(")[0].split(":")[-1].strip()  # 이름 뒷 한자이름, 앞 '이 름:' 제거
     # TODO : 만약 이름이 우연히 아래 단어를 포함하는 경우를 생각해볼만 함.
     if len(name) > 3:
@@ -154,7 +151,7 @@ def extract_party(string):
     return None
 
 
-def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
+def goto_profilesite(
+    profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False
+):
     # 의원 프로필에서 프로필보기 링크를 가져옴
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
@@ -169,7 +166,10 @@ def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url)
         #     base_url = base_url + '/'
         profile_url = base_url + profile_link["href"]
         try:
-            profile = get_soup(profile_url, verify=False)
+            if inner_euckr:
+                profile = get_soup(profile_url, verify=False, encoding="euc-kr")
+            else:
+                profile = get_soup(profile_url, verify=False)
         except Exception:
             raise RuntimeError("[basic.py] '//'가 있진 않나요?", " url: ", profile_url)
     return profile
@@ -212,7 +212,7 @@ def getpty(profile, element, class_, wrapper_element, wrapper_class_, wrapper_tx
         )
     )
     if party_pulp_list == []:
-        raise RuntimeError("[basic.py] 정당정보 regex 실패")
+        raise Exception("[basic.py] 정당정보 regex 실패")
     party_pulp = party_pulp_list[0]
     party_string = party_pulp.get_text(strip=True).split(" ")[-1]
     while True:
@@ -221,14 +221,14 @@ def getpty(profile, element, class_, wrapper_element, wrapper_class_, wrapper_tx
         if (party_pulp := party_pulp.find_next("span")) is not None:
             party_string = party_pulp.text.strip().split(" ")[-1]
         else:
-            return "[basic.py] 정당 정보 파싱 불가"
+            raise Exception("[basic.py] 정당 정보 파싱 불가")
 
 
-def getpty_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
+def getpty_easy(
+    profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False
+):
     # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴
     if wrapper_element is not None:
         profile = goto_profilesite(
-            profile, wrapper_element, wrapper_class_, wrapper_txt, url
+            profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr
         )
     party = extract_party(profile.text)
     assert party is not None
@@ -253,7 +253,7 @@ def sel_getpty_easy(
     return party
 
 
-def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapResult:
+def scrap_basic(
+    url, cid, args: ScrapBasicArgument, encoding="utf-8", inner_euckr=False
+) -> ScrapResult:
     """의원 상세약력 스크랩
     :param url: 의원 목록 사이트 url
     :param cid: 의회 id
@@ -293,7 +293,7 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapRe
         except Exception as e:
             try:
                 party = getpty_easy(
-                    profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url
+                    profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt,
+                    url, inner_euckr,
                 )
             except Exception:
                 raise RuntimeError("[basic.py] 의원 정당을 가져오는데 실패했습니다. 이유: " + str(e))
@@ -331,8 +331,6 @@ def sel_scrap_basic(url, cid, args: ScrapBasicArgument) -> ScrapResult:
             raise RuntimeError(
                 "[basic.py/selenium] 의원 이름을 가져오는데 실패했습니다. 이유 : " + str(e)
이유 : " + str(e) ) - if name == "231114": - continue try: party = sel_getpty_easy( profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url diff --git a/scrap/local_councils/gyeongsang.py b/scrap/local_councils/gyeongsang.py index 230735b..e1290bc 100644 --- a/scrap/local_councils/gyeongsang.py +++ b/scrap/local_councils/gyeongsang.py @@ -10,6 +10,8 @@ regex_pattern, ) +party_keywords = getPartyList() +party_keywords.append("무소속") def scrap_186( url, @@ -122,6 +124,33 @@ def scrap_191( return ret_local_councilors(cid, councilors) +def scrap_192( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 구미시""" + soup = get_soup(url, verify=False, encoding="euc-kr") + councilors: List[Councilor] = [] + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("li", class_="name") + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" + + party = "정당 정보 없음" + profile_link = profile.find_all("a")[1] + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + profile_url = base_url + profile_link["href"] + profile = get_soup(profile_url, verify=False, encoding="euc-kr") + party="" + for keyword in party_keywords: + if keyword in profile.text: + party=keyword + break + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) + def scrap_194( url, @@ -188,6 +217,24 @@ def scrap_196( return ret_local_councilors(cid, councilors) +def scrap_197( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 경산시""" + soup = get_soup(url, verify=False, encoding="euc-kr") + councilors: List[Councilor] = [] + for profile in soup.find_all('div', class_='memberL') + soup.find_all('div', class_='memberR'): + party = profile.find_previous('h4', class_='title').text.strip() + assert(party in party_keywords) + name = profile.find('dt').text.strip() + + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) + + def scrap_198( url, cid, @@ -216,18 +263,18 @@ def scrap_199( args: ArgsType = None, ) -> ScrapResult: """경상북도 고령군""" - soup = get_soup(url, verify=False) - councilors: List[Councilor] = [] - for profile in soup.find_all("div", class_="profile"): - name_tag = profile.find("em", class_="name") - name = name_tag.get_text(strip=True).split("\r")[0] if name_tag else "이름 정보 없음" - + browser = get_selenium(url) + councilors: list[Councilor] = [] + for profile in browser.find_elements(By.CSS_SELECTOR, "div[class='profile']"): + name_tag = profile.find_element(By.CSS_SELECTOR, "em[class='name']") + name = name_tag.text.strip().split("\r")[0] if name_tag else "이름 정보 없음" + party = "" + for keyword in party_keywords: + if keyword in profile.text: + party = keyword + break party = "정당 정보 없음" - party_info = profile.find("em", string="정 당 : ") - if party_info: - party = party_info.find_next("span").get_text(strip=True) - - councilors.append(Councilor(name=name, jdName=party)) + councilors.append(Councilor(name, party)) return ret_local_councilors(cid, councilors) @@ -260,6 +307,31 @@ def scrap_201( return ret_local_councilors(cid, councilors) +def scrap_202( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 군위군""" + soup = get_soup(url, verify=False, encoding="euc-kr") + councilors: List[Councilor] = [] + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("li", class_="name") + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" + link = 
profile.find("p", class_="btn").find("a")["href"] + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + profile_url = base_url + link + profile = get_soup(profile_url, verify=False, encoding="euc-kr") + party="" + for keyword in party_keywords: + if keyword in profile.text: + party=keyword + break + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) + def scrap_203( url, cid, @@ -281,6 +353,34 @@ def scrap_203( return ret_local_councilors(cid, councilors) +def scrap_204( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """경상북도 청송군""" + soup = get_soup(url, verify=False) + councilors: List[Councilor] = [] + for profile in soup.find_all("div", class_="box3vm1"): + name_tag = profile.find("span", class_="t3") + name = name_tag.get_text(strip=True).split()[-1] if name_tag else "이름 정보 없음" + link = profile.find("a", class_="button")["href"] + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + profile_url = base_url + link + profile = get_soup(profile_url, verify=False) + link = profile.find('a', text='의원소개', href=True) + profile_url = base_url + link['href'] + profile = get_soup(profile_url, verify=False) + + party="" + for keyword in party_keywords: + if keyword in profile.text: + party=keyword + break + councilors.append(Councilor(name=name, jdName=party)) + + return ret_local_councilors(cid, councilors) def scrap_206( url, diff --git a/scrap/local_councils/jeolla.py b/scrap/local_councils/jeolla.py index 7301ed5..64e02d4 100644 --- a/scrap/local_councils/jeolla.py +++ b/scrap/local_councils/jeolla.py @@ -7,9 +7,12 @@ extract_party, find, findall, + sel_find, regex_pattern, ) +party_keywords = getPartyList() +party_keywords.append("무소속") def scrap_154( url, @@ -334,6 +337,30 @@ def scrap_167( # return ret_local_councilors(cid, councilors) +def scrap_175( + url, + cid, + args: ArgsType = None, +) -> ScrapResult: + """전라남도 화순군""" + browser = get_selenium(url) + councilors: list[Councilor] = [] + for profileList in browser.find_elements(By.CSS_SELECTOR, "ul[id='councilList']"): + for profile in profileList.find_elements(By.CSS_SELECTOR, "ul[class='name_51']"): + name_tag = profile.find_element(By.TAG_NAME, "li") + name = name_tag.text.strip() if name_tag else "이름 정보 없음" + + profile_link = sel_find(profile, "a") + page_content = get_selenium(profile_link.get_attribute("href")).page_source + party = "" + for keyword in party_keywords: + if keyword in page_content: + party = keyword + break + + councilors.append(Councilor(name, party)) + + return ret_local_councilors(cid, councilors) def scrap_177( url, @@ -341,24 +368,19 @@ def scrap_177( args: ArgsType = None, ) -> ScrapResult: """전라남도 강진군""" - soup = get_soup(url, verify=False) - councilors: List[Councilor] = [] - mlist = soup.find_all("ul", class_="memlist")[0] - - for profile in mlist.find_all("li", recursive=False): - info = profile.find("ul", class_="info") - name = ( - info.find("h5").get_text(strip=True) - if info.find("h5").get_text(strip=True) - else "이름 정보 없음" - ) - - li = info.find_all("li", recursive=False)[6] - party = "정당 정보 없음" - party_dd = li.find("dd") - if party_dd: - party = party_dd.get_text(strip=True) - councilors.append(Councilor(name=name, jdName=party)) + browser = get_selenium(url) + councilors: list[Councilor] = [] + for profileList in browser.find_elements(By.CSS_SELECTOR, "ul[id='memlist']"): + for profile in profileList.find_elements(By.CSS_SELECTOR, "ul[class='info']"): + name_tag = 
+            name_tag = profile.find_element(By.TAG_NAME, "h5")
+            name = name_tag.text.strip() if name_tag else "이름 정보 없음"
+            party = "정당 정보 없음"
+            for keyword in party_keywords:
+                if keyword in profile.text:
+                    party = keyword
+                    break
+            councilors.append(Councilor(name, party))
 
     return ret_local_councilors(cid, councilors)
@@ -369,14 +391,20 @@ def scrap_178(
     args: ArgsType = None,
 ) -> ScrapResult:
     """전라남도 완도군"""
-    councilors: List[Councilor] = []
-
-    result = requests.get(url)
-    result_json = result.json()
-    for profile in result_json["list"]:
-        name = profile["cmNm"]
-        party = profile["mpParty"]
-        councilors.append(Councilor(name=name, jdName=party))
+    browser = get_selenium(url)
+    councilors: list[Councilor] = []
+    for profileList in browser.find_elements(By.CSS_SELECTOR, "div[class='congressperson_list']"):
+        for profile in profileList.find_elements(By.CSS_SELECTOR, "div[class='col-lg-6']"):
+            name_tag = profile.find_element(By.TAG_NAME, "strong")
+            name = name_tag.text.strip() if name_tag else "이름 정보 없음"
+            profile_link = sel_find(profile, "a", class_="icon_btn")
+            page_content = get_selenium(profile_link.get_attribute("href")).page_source
+            party = "정당 정보 없음"
+            for keyword in party_keywords:
+                if keyword in page_content:
+                    party = keyword
+                    break
+            councilors.append(Councilor(name, party))
 
     return ret_local_councilors(cid, councilors)
@@ -395,9 +423,10 @@ def scrap_179(
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
         party = "정당 정보 없음"
-        party_info = profile.find("span", string="소속정당 :")
-        if party_info:
-            party = party_info.find_next("span").get_text(strip=True)
+        for keyword in party_keywords:
+            if keyword in profile.text:
+                party = keyword
+                break
 
         councilors.append(Councilor(name=name, jdName=party))
 
     return ret_local_councilors(cid, councilors)
diff --git a/scrap/utils/runner.py b/scrap/utils/runner.py
index 7113323..a21761a 100644
--- a/scrap/utils/runner.py
+++ b/scrap/utils/runner.py
@@ -106,7 +106,8 @@ def get_records_from_data_source(self, data_source: str):
     # Helper Functions
     def is_euc_kr(self, n: int) -> bool:
         return n in self.runner_args["euc_kr"]
 
+    def inner_euckr(self, n: int) -> bool:
+        return n in self.runner_args["inner_euckr"]
+
     def is_special_function(self, n: int) -> bool:
         return n in self.runner_args["special_functions"]
@@ -115,6 +116,7 @@ def is_selenium_basic(self, n: int) -> bool:
 
     def run_single(self, cid: int) -> ScrapResult:
         encoding = "euc-kr" if self.is_euc_kr(cid) else "utf-8"
+        inner_euckr = self.inner_euckr(cid)
         council_url = self.url_records[cid - 1]["URL"]
         council_args = self.council_args.get(str(cid), None)
         if council_args is not None:
@@ -134,7 +136,7 @@
             if self.is_selenium_basic(cid):
                 result = sel_scrap_basic(council_url, cid, council_args)
             else:
-                result = scrap_basic(council_url, cid, council_args, encoding)
+                result = scrap_basic(council_url, cid, council_args, encoding, inner_euckr)
 
         return result
diff --git a/scrap/utils/runner_args.json b/scrap/utils/runner_args.json
index ddad42b..d03af28 100644
--- a/scrap/utils/runner_args.json
+++ b/scrap/utils/runner_args.json
@@ -1,6 +1,6 @@
 {
   "euc_kr": [
-    6, 13, 16, 31, 72, 88, 112, 134, 154, 157, 163, 165, 167, 176, 181, 197,
+    6, 13, 16, 31, 72, 88, 112, 134, 154, 157, 163, 165, 167, 176, 181, 200,
     202, 222
   ],
   "special_functions": [
@@ -9,11 +9,12 @@
     41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 62, 63,
     64, 88, 97, 103, 107, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
     123, 124, 125, 126, 132, 134, 140, 142, 154, 155, 156, 157, 160, 161, 162, 163,
-    164, 165, 167, 177, 178, 179, 182, 183, 184, 186, 188, 189, 190, 191, 194,
-    195, 196, 198, 199, 201, 203, 206, 208, 209, 210, 212, 213, 214, 215, 216,
+    164, 165, 167, 175, 177, 178, 179, 182, 183, 184, 186, 188, 189, 190, 191, 192, 194,
+    195, 196, 197, 198, 199, 201, 202, 203, 204, 206, 208, 209, 210, 212, 213, 214, 215, 216,
     217, 218, 219, 220, 222, 223, 224, 226
   ],
-  "selenium_basic": [76, 78, 101, 169, 173, 177],
-  "no_information": [18, 29, 106, 111, 172, 181, 185, 187, 197, 200, 204, 207],
-  "error_unresolved": [170, 171]
+  "selenium_basic": [76, 78, 101, 169, 173],
+  "no_information": [18, 29, 106, 111, 172, 174, 181, 185, 187, 207],
+  "error_unresolved": [170, 171],
+  "inner_euckr": [200]
 }
diff --git a/scrap/utils/scrap_args.json b/scrap/utils/scrap_args.json
index e854932..1bb1405 100644
--- a/scrap/utils/scrap_args.json
+++ b/scrap/utils/scrap_args.json
@@ -644,13 +644,21 @@
     "pty_elt": "ul",
     "pty_cls": "dot"
   },
-  "177": {
-    "pf_elt": "li",
-    "pf_cls": "item_box",
+  "180": {
+    "pf_elt": "dl",
+    "pf_cls": "ml_desc",
     "pf_memlistelt": "ul",
-    "pf_memlistcls": "memlist",
-    "name_elt": "h5",
-    "name_cls": "dd",
-    "pty_elt": "dl"
+    "pf_memlistcls": "member_list",
+    "name_elt": "dt",
+    "pty_elt": "dd",
+    "pty_cls": "ml_data"
+  },
+  "200": {
+    "pf_elt": "dl",
+    "pf_memlistelt": "div",
+    "pf_memlistcls": "list",
+    "name_elt": "dd",
+    "name_cls": "name",
+    "pty_wrapelt": "a"
   }
 }
\ No newline at end of file
diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py
index da8abcb..d93ba62 100644
--- a/scrap/utils/spreadsheet.py
+++ b/scrap/utils/spreadsheet.py
@@ -88,30 +88,33 @@ def scrap_all_metro_councils() -> None:
             else:
                 emsg: str = f"[scrap/metropolitan_council.py]에 {n}번 지역을 위한\
 함수가 없네요."
-                add_error(n, emsg)
+                # add_error(n, emsg)
+                print(emsg)
             if "정보 없음" in result:
                 emsg = "스크랩 결과에 '정보 없음'이 포함되어 있습니다. 일부 인명에\
 대해 스크랩이 실패했다는 뜻이에요. 함수나 인자를 점검해 주세요."
                 parse_error_times += 1
-                errors.append(n)
+                # errors.append(n)
+                print(emsg)
             # print(f"| {n} | {result}")
         except Timeout:
-            emsg = f"{council_url}에 시도한 연결이 타임아웃됐어요."
+            emsg = "시도한 연결이 타임아웃됐어요."
             timeouts += 1
-            add_error(n, emsg)
+            print(emsg)
+            # add_error(n, emsg)
         except Exception as e:
-            add_error(n, "기타 오류 - " + str(e))
-    emessages = (
-        f"""
-        총 실행 횟수: {N}
-        에러: {enumbers}, 총 {len(enumbers)}회
-        그 중 '정보 없음' 횟수: {parse_error_times}
-        타임아웃 횟수: {timeouts}
-    """
-        + emessages
-    )
-    email_result(emessages)
+            print(e)
+            # add_error(n, "기타 오류 - " + str(e))
+    # emessages = (
+    #     f"""
+    #     총 실행 횟수: {N}
+    #     에러: {enumbers}, 총 {len(enumbers)}회
+    #     그 중 '정보 없음' 횟수: {parse_error_times}
+    #     타임아웃 횟수: {timeouts}
+    # """
+    #     + emessages
+    # )
+    # email_result(emessages)
 
 
 def scrap_all_local_councils() -> None:
     # TODO - 홈페이지 위 charset=euc-kr 등을 인식해 바로 가져오기.
@@ -131,10 +134,11 @@ def scrap_all_local_councils() -> None:
         167,
         176,
         181,
-        197,
+        200,
         202,
         222,
     ]
+    inner_euckr = [200]
     special_functions = (
         list(range(1, 57))
         + [62, 63, 64, 88, 97, 103, 107]
@@ -149,14 +153,14 @@
         188,
         189,
         190,
-        191,
+        191, 192,
         194,
         195,
-        196,
+        196, 197,
         198,
         199,
-        201,
-        203,
+        201, 202,
+        203, 204,
         206,
         208,
         209,
@@ -166,7 +170,7 @@
         + [222, 223, 224, 226]
     )
     selenium_basic = [76, 78, 101, 169, 173, 177]
-    no_information = [18, 29, 106, 111, 172, 181, 185, 187, 197, 200, 204, 207]
+    no_information = [18, 29, 106, 111, 172, 181, 185, 187, 207]
     error_unsolved = [170, 171]
     f = open(JSON_PATH, "r")
     args = json.load(f)
@@ -180,7 +184,7 @@
     parse_error_times = 0
     timeouts = 0
     N = 226
-    for n in range(1, N + 1):  # range(1, N + 1):
+    for n in [204]:
         if n in no_information + error_unsolved:
             emsg: str = (
                 (
@@ -192,9 +196,10 @@
                 + " 링크: "
                 + data[n - 1]["URL"]
             )
-            add_error(n, emsg)
+            # add_error(n, emsg)
             continue
         encoding = "euc-kr" if n in euc_kr else "utf-8"
+        is_inner_euckr = n in inner_euckr
         council_url: str = ""
         try:
             council_url = data[n - 1]["URL"]
@@ -215,39 +220,42 @@
 명시되어 있는데 함수가 정의되어 있지 않네요. [scrap/utils/\
spreadsheet.py의 special_functions에 함수 번호를 빼고 \
다시 시도해 보시겠어요?]"
-                add_error(n, emsg)
+                # add_error(n, emsg)
             elif n in selenium_basic:
                 result = str(sel_scrap_basic(council_url, n, council_args).councilors)
             else:
                 result = str(
-                    scrap_basic(council_url, n, council_args, encoding).councilors
+                    scrap_basic(
+                        council_url, n, council_args, encoding, is_inner_euckr
+                    ).councilors
                 )
             if "정보 없음" in result:
                 emsg = "스크랩 결과에 '정보 없음'이 포함되어 있습니다. 일부 인명에\
 대해 스크랩이 실패했다는 뜻이에요. 함수나 인자를 점검해 주세요."
                 parse_error_times += 1
-                errors.append(n)
+                print(emsg)
+                # errors.append(n)
             # print(f"| {n} | {result}")
         except Timeout:
             emsg = f"{council_url}에 시도한 연결이 타임아웃됐어요."
             timeouts += 1
-            add_error(n, emsg)
+            # add_error(n, emsg)
         except Exception as e:
-            add_error(n, "기타 오류 - " + str(e))
-    emessages = (
-        f"""
-        총 실행 횟수: {N}
-        에러: {enumbers}, 총 {len(enumbers)}회
-        그 중 '정보 없음' 횟수: {parse_error_times}
-        타임아웃 횟수: {timeouts}
-    """
-        + emessages
-    )
-    email_result(emessages)
+            print(e)
+            print(result)
+            # add_error(n, "기타 오류 - " + str(e))
+    # emessages = (
+    #     f"""
+    #     총 실행 횟수: {N}
+    #     에러: {enumbers}, 총 {len(enumbers)}회
+    #     그 중 '정보 없음' 횟수: {parse_error_times}
+    #     타임아웃 횟수: {timeouts}
+    # """
+    #     + emessages
+    # )
+    # email_result(emessages)
 
 
 def main() -> None:
-    scrap_all_metro_councils()
+    # scrap_all_metro_councils()
     scrap_all_local_councils()