From 006d956b37efd4764525cac0bdf770331231a435 Mon Sep 17 00:00:00 2001 From: Re-st Date: Sun, 8 Oct 2023 16:09:42 +0900 Subject: [PATCH] =?UTF-8?q?=EC=A0=95=EB=8B=B9=EC=9D=B4=EB=A6=84=20?= =?UTF-8?q?=EA=B4=80=EB=A6=AC=20utils.py=20=EC=88=98=EC=A0=95,=20=EC=9A=A9?= =?UTF-8?q?=EC=82=B0=EA=B5=AC=20=ED=85=8C=EC=BC=80=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrap/local_councils/basic.py | 31 ++++++++++++++++--------------- scrap/utils/utils.py | 11 +++++++++-- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/scrap/local_councils/basic.py b/scrap/local_councils/basic.py index a0d3acb..aecf9ab 100644 --- a/scrap/local_councils/basic.py +++ b/scrap/local_councils/basic.py @@ -2,26 +2,27 @@ from scrap.utils.types import CouncilType, Councilor, ScrapResult from scrap.utils.requests import get_soup +from scrap.utils.utils import getPartyList import re import requests regex_pattern = re.compile(r'정\s*\S*\s*당', re.IGNORECASE) # Case-insensitive -party_keywords = ['국민의힘', '더불어민주당', '정의당', '진보당', '기본소득당', '시대전환', '한국의희망', '무소속'] # 이상 원내정당. -# 원외정당의 경우, 나무위키 피셜이지만 현재는 지방의회 진출당이 없다. 사실 당 이름이 매번 바뀌므로 다른 어프로치를 찾아야 할 듯.. => getPartyList() 참고. +party_keywords = getPartyList() +party_keywords.append('무소속') -pf_elt = [None, 'div'] -pf_cls = [None, 'profile'] -pf_memlistelt = [None, None] +pf_elt = [None, 'div', 'div'] +pf_cls = [None, 'profile', 'profile'] +pf_memlistelt = [None, None, None] -name_elt = [None, 'em'] -name_cls = [None, 'name'] -name_wrapelt= [None, None] -name_wrapcls = [None, None] +name_elt = [None, 'em', 'em'] +name_cls = [None, 'name', 'name'] +name_wrapelt= [None, None, None] +name_wrapcls = [None, None, None] -pty_elt = [None, 'em'] -pty_cls = [None, None] -pty_wrapelt = [None, None] -pty_wrapcls = [None, None] +pty_elt = [None, 'em', 'em'] +pty_cls = [None, None, None] +pty_wrapelt = [None, None, None] +pty_wrapcls = [None, None, None] def get_profiles(soup, element, class_, memberlistelement): # 의원 목록 사이트에서 의원 프로필을 가져옴 @@ -107,10 +108,10 @@ def scrap_basic(url, cid, encoding = 'utf-8') -> ScrapResult: councilors.append(Councilor(name=name, party=party)) return ScrapResult( - council_id=cid, + council_id=str(cid), council_type=CouncilType.LOCAL_COUNCIL, councilors=councilors ) if __name__ == '__main__': - print(scrap_basic('https://02jgnew.council.or.kr/kr/member/active', '2')) # 서울 중구 \ No newline at end of file + print(scrap_basic('https://www.yscl.go.kr/kr/member/name.do', 3)) # 서울 용산구 \ No newline at end of file diff --git a/scrap/utils/utils.py b/scrap/utils/utils.py index 089f552..025ac0e 100644 --- a/scrap/utils/utils.py +++ b/scrap/utils/utils.py @@ -1,6 +1,9 @@ from scrap.utils.requests import get_soup def getPartyList(): + """ + 중앙선거관리위원회에서 제공하는 정당 목록을 가져옵니다. + """ url = 'https://www.nec.go.kr/site/nec/ex/bbs/List.do?cbIdx=1239' soup = get_soup(url) table = soup.find('table', class_='list type2') @@ -9,5 +12,9 @@ def getPartyList(): td = tr.find_all('td') if td[0].get_text(strip=True).split("
")[0] == '시도': continue - partyList.append(td[0].get_text(strip=True).split("
")[0]) - return partyList \ No newline at end of file + # 더불어민주당(민주당, 더민주) 등은 약자가 괄호 안에 있다. + partyList.append(td[0].get_text(strip=True).split("
")[0].split("(")[0]) + return partyList + +if __name__ == '__main__': + print(getPartyList()) \ No newline at end of file