From 5b4f2dc10939b84f6da4076cf9e52956175051ad Mon Sep 17 00:00:00 2001
From: Keon Lee
Date: Tue, 10 Oct 2023 15:21:12 +0900
Subject: [PATCH 1/3] feat: add black formatter to CI

Add a GitHub Actions workflow that runs the black formatter on every
push and pull request, keeping the codebase consistently formatted and
easier to read and maintain. The workflow uses the
rickstaa/action-black@v1 action to format the code and the
stefanzweifel/git-auto-commit-action@v5 action to commit the resulting
changes automatically.

---
 .github/workflows/format-black.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 .github/workflows/format-black.yml

diff --git a/.github/workflows/format-black.yml b/.github/workflows/format-black.yml
new file mode 100644
index 0000000..94e591a
--- /dev/null
+++ b/.github/workflows/format-black.yml
@@ -0,0 +1,16 @@
+name: Format with black
+on: [push, pull_request]
+
+jobs:
+  format:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Format files using the black formatter
+        uses: rickstaa/action-black@v1
+        id: action_black
+        with:
+          black_args: "."
+      - uses: stefanzweifel/git-auto-commit-action@v5
+        with:
+          commit_message: Formatted with black
\ No newline at end of file

From 5428e2e452bd3f513f837e46cc34bc613eac4bcb Mon Sep 17 00:00:00 2001
From: keonly
Date: Fri, 13 Oct 2023 07:46:49 +0000
Subject: [PATCH 2/3] Formatted with black

---
 API/__init__.py                  |   2 +-
 API/candidate.py                 | 108 ++++----
 API/elected.py                   | 118 ++++----
 API/votecode.py                  |  31 ++-
 __init__.py                      |   2 +-
 configurations/__init__.py       |   2 +-
 configurations/secrets.py        |  15 +-
 db/__init__.py                   |   2 +-
 db/client.py                     |   2 +-
 scrap/__init__.py                |   2 +-
 scrap/examples/__init__.py       |   2 +-
 scrap/examples/database.py       |  12 +-
 scrap/examples/junggu_scrap.py   |  36 +--
 scrap/local_councils/__init__.py |   2 +-
 scrap/local_councils/basic.py    |  99 ++++---
 scrap/local_councils/busan.py    | 275 +++++++++++--------
 scrap/local_councils/daegu.py    | 164 +++++++-----
 scrap/local_councils/daejeon.py  |  89 ++++---
 scrap/local_councils/incheon.py  | 146 +++++-----
 scrap/local_councils/seoul.py    | 445 ++++++++++++++++++-------------
 scrap/local_councils/ulsan.py    | 122 +++++----
 scrap/national_council.py        |  79 +++---
 scrap/utils/database.py          |  14 +-
 scrap/utils/requests.py          |  21 +-
 scrap/utils/spreadsheet.py       |  41 +--
 scrap/utils/types.py             |  13 +-
 scrap/utils/utils.py             |  16 +-
 27 files changed, 1077 insertions(+), 783 deletions(-)

diff --git a/API/__init__.py b/API/__init__.py
index 6a8f860..106136f 100644
--- a/API/__init__.py
+++ b/API/__init__.py
@@ -1,3 +1,3 @@
 """
 공공데이터포털 API를 이용한 데이터 수집을 위한 패키지입니다.
-""" \ No newline at end of file +""" diff --git a/API/candidate.py b/API/candidate.py index 1e2c311..464da71 100644 --- a/API/candidate.py +++ b/API/candidate.py @@ -6,24 +6,24 @@ from configurations.secrets import OpenDataPortalSecrets BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir) -base_url = 'http://apis.data.go.kr/9760000/PofelcddInfoInqireService/getPofelcddRegistSttusInfoInqire' +base_url = "http://apis.data.go.kr/9760000/PofelcddInfoInqireService/getPofelcddRegistSttusInfoInqire" page_no = 1 num_of_rows = 10000 parliamentVote = [20220601, 20230405] -sgCodes = input("Input the number of sgTypecode: ").split(',') +sgCodes = input("Input the number of sgTypecode: ").split(",") data_list = [] for sgId in parliamentVote: for code in sgCodes: params = { - 'serviceKey': OpenDataPortalSecrets.service_key, - 'pageNo': str(page_no), - 'numOfRows': str(num_of_rows), - 'sgId': str(sgId), - 'sgTypecode': str(code), - 'sggName': '', - 'sdName': '', - 'jdName': '' + "serviceKey": OpenDataPortalSecrets.service_key, + "pageNo": str(page_no), + "numOfRows": str(num_of_rows), + "sgId": str(sgId), + "sgTypecode": str(code), + "sggName": "", + "sdName": "", + "jdName": "", } response = requests.get(base_url, params=params) @@ -34,56 +34,58 @@ root = ET.fromstring(response.content) for item in root.findall(".//item"): - sgId = item.find('sgId').text - sggName = item.find('sggName').text - sdName = item.find('sdName').text - wiwName = item.find('wiwName').text - giho = item.find('giho').text - jdName = item.find('jdName').text - name = item.find('name').text - hanjaName = item.find('hanjaName').text - gender = item.find('gender').text - birthday = item.find('birthday').text - age = item.find('age').text - addr = item.find('addr').text - jobId = item.find('jobId').text - job = item.find('job').text - eduId = item.find('eduId').text - edu = item.find('edu').text - career1 = item.find('career1').text - career2 = item.find('career2').text - status = item.find('status').text + sgId = item.find("sgId").text + sggName = item.find("sggName").text + sdName = item.find("sdName").text + wiwName = item.find("wiwName").text + giho = item.find("giho").text + jdName = item.find("jdName").text + name = item.find("name").text + hanjaName = item.find("hanjaName").text + gender = item.find("gender").text + birthday = item.find("birthday").text + age = item.find("age").text + addr = item.find("addr").text + jobId = item.find("jobId").text + job = item.find("job").text + eduId = item.find("eduId").text + edu = item.find("edu").text + career1 = item.find("career1").text + career2 = item.find("career2").text + status = item.find("status").text - data_list.append({ - 'sgId': sgId, - 'sggName': sggName, - 'sdName': sdName, - 'wiwName': wiwName, - 'giho': giho, - 'jdName': jdName, - 'name': name, - 'hanjaName': hanjaName, - 'gender': gender, - 'birthday': birthday, - 'age': age, - 'addr': addr, - 'jobId': jobId, - 'job': job, - 'eduId': eduId, - 'edu': edu, - 'career1': career1, - 'career2': career2, - 'status': status - }) + data_list.append( + { + "sgId": sgId, + "sggName": sggName, + "sdName": sdName, + "wiwName": wiwName, + "giho": giho, + "jdName": jdName, + "name": name, + "hanjaName": hanjaName, + "gender": gender, + "birthday": birthday, + "age": age, + "addr": addr, + "jobId": jobId, + "job": job, + "eduId": eduId, + "edu": edu, + "career1": career1, + "career2": career2, + "status": status, + } + ) # Create a DataFrame from the collected data df = pd.DataFrame(data_list) # Save the DataFrame to an Excel 
file -directory_path = os.path.join(BASE_DIR, 'output') +directory_path = os.path.join(BASE_DIR, "output") if not os.path.exists(directory_path): os.makedirs(directory_path) -excel_file = '[후보][구시군의회의원].xlsx' +excel_file = "[후보][구시군의회의원].xlsx" df.to_excel(os.path.join(directory_path, excel_file), index=False) -print(f'Data has been saved to {excel_file}') +print(f"Data has been saved to {excel_file}") diff --git a/API/elected.py b/API/elected.py index 98311e6..38a6030 100644 --- a/API/elected.py +++ b/API/elected.py @@ -6,26 +6,34 @@ from configurations.secrets import OpenDataPortalSecrets BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir) -base_url = 'http://apis.data.go.kr/9760000/WinnerInfoInqireService2/getWinnerInfoInqire' -params ={'serviceKey' : OpenDataPortalSecrets.service_key,\ - 'pageNo' : '1', 'numOfRows' : '10', 'sgId' : '20230405', 'sgTypecode' : '2', 'sdName' : '전라북도', 'sggName' : '전주시을', 'jdName' : ''} +base_url = "http://apis.data.go.kr/9760000/WinnerInfoInqireService2/getWinnerInfoInqire" +params = { + "serviceKey": OpenDataPortalSecrets.service_key, + "pageNo": "1", + "numOfRows": "10", + "sgId": "20230405", + "sgTypecode": "2", + "sdName": "전라북도", + "sggName": "전주시을", + "jdName": "", +} page_no = 1 num_of_rows = 10000 parliamentVote = [20200415, 20210407, 20220601, 20230405] -sgCodes = input("Input the number of sgTypecode: ").split(',') +sgCodes = input("Input the number of sgTypecode: ").split(",") data_list = [] for sgId in parliamentVote: for code in sgCodes: params = { - 'serviceKey': OpenDataPortalSecrets.service_key, - 'pageNo': str(page_no), - 'numOfRows': str(num_of_rows), - 'sgId': str(sgId), - 'sgTypecode': str(code), - 'sggName': '', - 'sdName': '', - 'jdName': '' + "serviceKey": OpenDataPortalSecrets.service_key, + "pageNo": str(page_no), + "numOfRows": str(num_of_rows), + "sgId": str(sgId), + "sgTypecode": str(code), + "sggName": "", + "sdName": "", + "jdName": "", } response = requests.get(base_url, params=params) @@ -36,56 +44,58 @@ root = ET.fromstring(response.content) for item in root.findall(".//item"): - sgId = item.find('sgId').text - sggName = item.find('sggName').text - sdName = item.find('sdName').text - wiwName = item.find('wiwName').text - giho = item.find('giho').text - jdName = item.find('jdName').text - name = item.find('name').text - hanjaName = item.find('hanjaName').text - gender = item.find('gender').text - birthday = item.find('birthday').text - age = item.find('age').text - addr = item.find('addr').text - jobId = item.find('jobId').text - job = item.find('job').text - eduId = item.find('eduId').text - edu = item.find('edu').text - career1 = item.find('career1').text - career2 = item.find('career2').text + sgId = item.find("sgId").text + sggName = item.find("sggName").text + sdName = item.find("sdName").text + wiwName = item.find("wiwName").text + giho = item.find("giho").text + jdName = item.find("jdName").text + name = item.find("name").text + hanjaName = item.find("hanjaName").text + gender = item.find("gender").text + birthday = item.find("birthday").text + age = item.find("age").text + addr = item.find("addr").text + jobId = item.find("jobId").text + job = item.find("job").text + eduId = item.find("eduId").text + edu = item.find("edu").text + career1 = item.find("career1").text + career2 = item.find("career2").text # status = item.find('status').text - data_list.append({ - 'sgId': sgId, - 'sggName': sggName, - 'sdName': sdName, - 'wiwName': wiwName, - 'giho': giho, - 'jdName': jdName, - 'name': name, - 'hanjaName': 
hanjaName, - 'gender': gender, - 'birthday': birthday, - 'age': age, - 'addr': addr, - 'jobId': jobId, - 'job': job, - 'eduId': eduId, - 'edu': edu, - 'career1': career1, - 'career2': career2, - # 'status': status - }) + data_list.append( + { + "sgId": sgId, + "sggName": sggName, + "sdName": sdName, + "wiwName": wiwName, + "giho": giho, + "jdName": jdName, + "name": name, + "hanjaName": hanjaName, + "gender": gender, + "birthday": birthday, + "age": age, + "addr": addr, + "jobId": jobId, + "job": job, + "eduId": eduId, + "edu": edu, + "career1": career1, + "career2": career2, + # 'status': status + } + ) # Create a DataFrame from the collected data df = pd.DataFrame(data_list) # Save the DataFrame to an Excel file -directory_path = os.path.join(BASE_DIR, 'output') +directory_path = os.path.join(BASE_DIR, "output") if not os.path.exists(directory_path): os.makedirs(directory_path) -excel_file = '[당선][구시군의회의원].xlsx' +excel_file = "[당선][구시군의회의원].xlsx" df.to_excel(os.path.join(directory_path, excel_file), index=False) -print(f'Data has been saved to {excel_file}') \ No newline at end of file +print(f"Data has been saved to {excel_file}") diff --git a/API/votecode.py b/API/votecode.py index b9f7e05..d040987 100644 --- a/API/votecode.py +++ b/API/votecode.py @@ -6,32 +6,39 @@ import argparse parser = argparse.ArgumentParser() -parser.add_argument('-c', '--code', action='store_true', help='코드를 출력합니다.') +parser.add_argument("-c", "--code", action="store_true", help="코드를 출력합니다.") args = parser.parse_args() if args.code: - print("(0) 대표선거명 (1)대통령,(2)국회의원,(3)시도지사,(4)구시군장,(5)시도의원,\ - (6)구시군의회의원, (7)국회의원비례대표,(8)광역의원비례대표,(9)기초의원비례대표,(10)교육의원,(11)교육감") + print( + "(0) 대표선거명 (1)대통령,(2)국회의원,(3)시도지사,(4)구시군장,(5)시도의원,\ + (6)구시군의회의원, (7)국회의원비례대표,(8)광역의원비례대표,(9)기초의원비례대표,(10)교육의원,(11)교육감" + ) else: - print("sgTypecode를 입력하면 해당 sgTypecode와 일치하는 sgId 값을 출력합니다. 여러 개 입력하고 싶으면 ,로 구분해 주세요.") + print( + "sgTypecode를 입력하면 해당 sgTypecode와 일치하는 sgId 값을 출력합니다. 여러 개 입력하고 싶으면 ,로 구분해 주세요." + ) -url = 'http://apis.data.go.kr/9760000/CommonCodeService/getCommonSgCodeList' -params ={'serviceKey' : OpenDataPortalSecrets.service_key,\ - 'pageNo' : '1', 'numOfRows' : '1000'} +url = "http://apis.data.go.kr/9760000/CommonCodeService/getCommonSgCodeList" +params = { + "serviceKey": OpenDataPortalSecrets.service_key, + "pageNo": "1", + "numOfRows": "1000", +} response = requests.get(url, params=params) -xml_data = response.content.decode('utf-8') +xml_data = response.content.decode("utf-8") # Parse the XML data root = ET.fromstring(xml_data) # Find all elements where sgTypecode is equal to INPUT and extract their sgId values sgIds = set() -for code in input("Input the number of sgTypecode: ").split(','): - for item in root.findall(f'.//item[sgTypecode=\"{code}\"]'): - sgId_element = item.find('sgId') +for code in input("Input the number of sgTypecode: ").split(","): + for item in root.findall(f'.//item[sgTypecode="{code}"]'): + sgId_element = item.find("sgId") if sgId_element is not None: sgId = sgId_element.text sgIds.add(sgId) # Print the sgId values for sgId in sorted(sgIds): - print(sgId) \ No newline at end of file + print(sgId) diff --git a/__init__.py b/__init__.py index d2639ae..20221e2 100644 --- a/__init__.py +++ b/__init__.py @@ -1,3 +1,3 @@ """ 이 파일은 프로젝트 루트 폴더를 패키지로 인식하게 해주는 역할을 합니다. 
-""" \ No newline at end of file +""" diff --git a/configurations/__init__.py b/configurations/__init__.py index e88c5ca..21135bf 100644 --- a/configurations/__init__.py +++ b/configurations/__init__.py @@ -1,4 +1,4 @@ """ 스크립트 실행에 필요한 환경변수를 정의합니다. 환경변수는 프로젝트 루트 폴더에 .env 파일을 생성하여 불러올 수 있습니다. -""" \ No newline at end of file +""" diff --git a/configurations/secrets.py b/configurations/secrets.py index f8adc5b..2bcdfc7 100644 --- a/configurations/secrets.py +++ b/configurations/secrets.py @@ -5,22 +5,25 @@ from dotenv import load_dotenv # .env 파일로부터 환경변수를 불러옵니다. -load_dotenv( - verbose=False, - override=False -) +load_dotenv(verbose=False, override=False) + class MongoDBSecrets: """ MongoDB 연결을 위한 연결 정보를 정의합니다. """ - connection_uri = str(os.getenv("MONGO_CONNECTION_URI") or "mongodb://localhost:27017") + + connection_uri = str( + os.getenv("MONGO_CONNECTION_URI") or "mongodb://localhost:27017" + ) """PyMongo 클라이언트에서 데이터베이스 연결에 사용할 연결 uri입니다.""" database_name = str(os.getenv("MONGO_DATABASE") or "local") """PyMongo 클라이언트에서 사용할 데이터베이스 이름입니다.""" + class OpenDataPortalSecrets: """ 공공데이터포털(data.go.kr) API 호출에 필요한 서비스 키를 정의합니다. """ - service_key = str(os.getenv("OPEN_DATA_SERICE_KEY") or "") \ No newline at end of file + + service_key = str(os.getenv("OPEN_DATA_SERICE_KEY") or "") diff --git a/db/__init__.py b/db/__init__.py index b528b32..c2cab83 100644 --- a/db/__init__.py +++ b/db/__init__.py @@ -1,3 +1,3 @@ """ MongoDB 클라이언트 객체 및 데이터베이스에 값을 넣고 빼는 함수를 정의합니다. -""" \ No newline at end of file +""" diff --git a/db/client.py b/db/client.py index 3d83d42..194119e 100644 --- a/db/client.py +++ b/db/client.py @@ -4,4 +4,4 @@ client = pymongo.MongoClient(MongoDBSecrets.connection_uri) """ MongoDB 클라이언트 객체입니다. -""" \ No newline at end of file +""" diff --git a/scrap/__init__.py b/scrap/__init__.py index bb9b620..ae277ea 100644 --- a/scrap/__init__.py +++ b/scrap/__init__.py @@ -1,3 +1,3 @@ """ 지방의회 크롤링을 위한 파일들을 모아놓은 패키지입니다. -""" \ No newline at end of file +""" diff --git a/scrap/examples/__init__.py b/scrap/examples/__init__.py index fd09f19..ab05e7a 100644 --- a/scrap/examples/__init__.py +++ b/scrap/examples/__init__.py @@ -1,3 +1,3 @@ """ 예시 파일들을 모아놓은 폴더입니다. -""" \ No newline at end of file +""" diff --git a/scrap/examples/database.py b/scrap/examples/database.py index 371b168..d8c3dfe 100644 --- a/scrap/examples/database.py +++ b/scrap/examples/database.py @@ -3,7 +3,12 @@ """ from scrap.utils.database import save_to_database -from scrap.local_councils.seoul import scrap_dongdaemungu, scrap_gwangjingu, scrap_junggu +from scrap.local_councils.seoul import ( + scrap_dongdaemungu, + scrap_gwangjingu, + scrap_junggu, +) + def main() -> None: # 서울시 동대문구의회 크롤링 결과를 데이터베이스에 저장합니다. @@ -13,5 +18,6 @@ def main() -> None: # 서울시 중구의회 크롤링 결과를 데이터베이스에 저장합니다. 
     save_to_database(scrap_junggu())
 
-if __name__ == '__main__':
-    main()
\ No newline at end of file
+
+if __name__ == "__main__":
+    main()
diff --git a/scrap/examples/junggu_scrap.py b/scrap/examples/junggu_scrap.py
index 7416668..17b74cf 100644
--- a/scrap/examples/junggu_scrap.py
+++ b/scrap/examples/junggu_scrap.py
@@ -10,23 +10,29 @@
 full_url = base_url + link
 
 response = requests.get(full_url, verify=False)
-soup = BeautifulSoup(response.text, 'html.parser')
+soup = BeautifulSoup(response.text, "html.parser")
 
-profiles = soup.find_all('div', class_='profile')
+profiles = soup.find_all("div", class_="profile")
 
 for profile in profiles:
-    name = profile.find('em', class_='name').text
-    party = profile.find('ul', class_='dot').find('li').find_next_sibling('li').find('span').text
-
+    name = profile.find("em", class_="name").text
+    party = (
+        profile.find("ul", class_="dot")
+        .find("li")
+        .find_next_sibling("li")
+        .find("span")
+        .text
+    )
+
     # 프로필보기 링크 가져오기
-    profile_link = profile.find('a', class_='start')
+    profile_link = profile.find("a", class_="start")
     if profile_link:
-        profile_url = base_url + profile_link['href']
-
+        profile_url = base_url + profile_link["href"]
+
         # 프로필 페이지로 이동
         profile_response = requests.get(profile_url, verify=False)
-        profile_soup = BeautifulSoup(profile_response.text, 'html.parser')
-
+        profile_soup = BeautifulSoup(profile_response.text, "html.parser")
+
         # 프로필 페이지에서 원하는 정보를 추출하고 출력
         # 여기에서 필요한 정보를 추출하는 방법에 따라 코드를 작성해주세요.
@@ -34,10 +40,8 @@
         # print('프로필 페이지 URL:', profile_url)
         # print('---')
         # "소속정당" 정보 추출
-        party_info = profile_soup.find('em', text='소속정당 : ')
-        party = party_info.find_next('span').string if party_info else '정당 정보 없음'
-
-        print('이름:', name)
-        print('정당:', party)
-
+        party_info = profile_soup.find("em", text="소속정당 : ")
+        party = party_info.find_next("span").string if party_info else "정당 정보 없음"
+        print("이름:", name)
+        print("정당:", party)
diff --git a/scrap/local_councils/__init__.py b/scrap/local_councils/__init__.py
index d4f68ed..a4e1fc8 100644
--- a/scrap/local_councils/__init__.py
+++ b/scrap/local_councils/__init__.py
@@ -4,4 +4,4 @@
 """
 from .daejeon import *
 from .ulsan import *
-from .basic import *
\ No newline at end of file
+from .basic import *
diff --git a/scrap/local_councils/basic.py b/scrap/local_councils/basic.py
index aecf9ab..3df915b 100644
--- a/scrap/local_councils/basic.py
+++ b/scrap/local_councils/basic.py
@@ -6,35 +6,41 @@
 import re
 import requests
 
-regex_pattern = re.compile(r'정\s*\S*\s*당', re.IGNORECASE) # Case-insensitive
+regex_pattern = re.compile(r"정\s*\S*\s*당", re.IGNORECASE)  # Case-insensitive
 party_keywords = getPartyList()
-party_keywords.append('무소속')
+party_keywords.append("무소속")
 
-pf_elt = [None, 'div', 'div']
-pf_cls = [None, 'profile', 'profile']
+pf_elt = [None, "div", "div"]
+pf_cls = [None, "profile", "profile"]
 pf_memlistelt = [None, None, None]
 
-name_elt = [None, 'em', 'em']
-name_cls = [None, 'name', 'name']
-name_wrapelt= [None, None, None]
+name_elt = [None, "em", "em"]
+name_cls = [None, "name", "name"]
+name_wrapelt = [None, None, None]
 name_wrapcls = [None, None, None]
 
-pty_elt = [None, 'em', 'em']
+pty_elt = [None, "em", "em"]
 pty_cls = [None, None, None]
 pty_wrapelt = [None, None, None]
 pty_wrapcls = [None, None, None]
 
+
 def get_profiles(soup, element, class_, memberlistelement):
     # 의원 목록 사이트에서 의원 프로필을 가져옴
     if memberlistelement is not None:
-        soup = soup.find_all(memberlistelement, class_='memberList')[0]
+        soup = soup.find_all(memberlistelement, class_="memberList")[0]
     return soup.find_all(element, class_)
 
+
 def getDataFromAPI(url_format, data_uid, name_id, party_id) -> Councilor:
     # API로부터 의원 정보를 가져옴
     url = url_format.format(data_uid)
     result = requests.get(url).json()
-    return Councilor(name=result[name_id] if result[name_id] else '이름 정보 없음', party=result[party_id] if result[party_id] else '정당 정보 없음')
+    return Councilor(
+        name=result[name_id] if result[name_id] else "이름 정보 없음",
+        party=result[party_id] if result[party_id] else "정당 정보 없음",
+    )
+
 
 def get_name(profile, element, class_, wrapper_element, wrapper_class_):
     # 의원 프로필에서 의원 이름을 가져옴
     if wrapper_element is not None:
@@ -42,76 +48,101 @@ def get_name(profile, element, class_, wrapper_element, wrapper_class_):
         profile = profile.find_all(wrapper_element, class_=wrapper_class_)[0]
     name_tag = profile.find(element, class_)
     name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
-    if len(name) > 10: # strong태그 등 많은 걸 name 태그 안에 포함하는 경우. 은평구 등.
+    if len(name) > 10:  # strong태그 등 많은 걸 name 태그 안에 포함하는 경우. 은평구 등.
         name = name_tag.strong.get_text(strip=True) if name_tag.strong else "이름 정보 없음"
-    name = name.split('(')[0].split(':')[-1] # 이름 뒷 한자이름, 앞 '이 름:' 제거
+    name = name.split("(")[0].split(":")[-1]  # 이름 뒷 한자이름, 앞 '이 름:' 제거
     # 수식어가 이름 뒤에 붙어있는 경우
     while len(name) > 5:
-        if name[-3:] in ['부의장']: # 119 등.
+        if name[-3:] in ["부의장"]:  # 119 등.
             name = name[:-3].strip()
         else:
             break
     while len(name) > 4:
-        if name[-2:] in ['의원', '의장']: # 강서구 등.
+        if name[-2:] in ["의원", "의장"]:  # 강서구 등.
             name = name[:-2].strip()
         else:
-            break # 4자 이름 고려.
+            break  # 4자 이름 고려.
     return name
 
+
 def extract_party(string):
     for keyword in party_keywords:
         if keyword in string:
             return keyword
     return None
 
-def get_party(profile, element, class_, wrapper_element, wrapper_class_, party_in_main_page, url):
+
+def get_party(
+    profile, element, class_, wrapper_element, wrapper_class_, party_in_main_page, url
+):
     # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴
     if not party_in_main_page:
         parsed_url = urlparse(url)
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
         # 프로필보기 링크 가져오기
-        profile_link = profile.find('a', class_='start')
-        profile_url = base_url + profile_link['href']
+        profile_link = profile.find("a", class_="start")
+        profile_url = base_url + profile_link["href"]
         profile = get_soup(profile_url, verify=False)
-    party_pulp_list = list(filter(lambda x: regex_pattern.search(str(x)), profile.find_all(element, class_)))
+    party_pulp_list = list(
+        filter(
+            lambda x: regex_pattern.search(str(x)), profile.find_all(element, class_)
+        )
+    )
     party_pulp = party_pulp_list[0]
     party_string = party_pulp.get_text(strip=True)
-    party_string = party_string.split(' ')[-1].strip()
+    party_string = party_string.split(" ")[-1].strip()
     while True:
         if (party := extract_party(party_string)) is not None:
             return party
-        if (party_span := party_pulp.find_next('span')) is not None:
-            party_string = party_span.text.split(' ')[-1]
+        if (party_span := party_pulp.find_next("span")) is not None:
+            party_string = party_span.text.split(" ")[-1]
         else:
             return "정당 정보 파싱 불가"
 
-def scrap_basic(url, cid, encoding = 'utf-8') -> ScrapResult:
-    '''의원 상세약력 스크랩
+
+def scrap_basic(url, cid, encoding="utf-8") -> ScrapResult:
+    """의원 상세약력 스크랩
     :param url: 의원 목록 사이트 url
     :param n: 의회 id
     :param encoding: 받아온 soup 인코딩
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
-    '''
+    """
     soup = get_soup(url, verify=False, encoding=encoding)
     councilors: list[Councilor] = []
     party_in_main_page = any(keyword in soup.text for keyword in party_keywords)
-
-    profiles = get_profiles(soup, pf_elt[cid - 1], pf_cls[cid - 1], pf_memlistelt[cid - 1])
-    print(cid, '번째 의회에는,', len(profiles), '명의 의원이 있습니다.') # 디버깅용.
+
+    profiles = get_profiles(
+        soup, pf_elt[cid - 1], pf_cls[cid - 1], pf_memlistelt[cid - 1]
+    )
+    print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.")  # 디버깅용.
 
     for profile in profiles:
-        name = get_name(profile, name_elt[cid - 1], name_cls[cid - 1], name_wrapelt[cid - 1], name_wrapcls[cid - 1])
-        party = get_party(profile, pty_elt[cid - 1], pty_cls[cid - 1], pty_wrapelt[cid - 1], pty_wrapcls[cid - 1], party_in_main_page, url)
-
+        name = get_name(
+            profile,
+            name_elt[cid - 1],
+            name_cls[cid - 1],
+            name_wrapelt[cid - 1],
+            name_wrapcls[cid - 1],
+        )
+        party = get_party(
+            profile,
+            pty_elt[cid - 1],
+            pty_cls[cid - 1],
+            pty_wrapelt[cid - 1],
+            pty_wrapcls[cid - 1],
+            party_in_main_page,
+            url,
+        )
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id=str(cid),
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
-if __name__ == '__main__':
-    print(scrap_basic('https://www.yscl.go.kr/kr/member/name.do', 3)) # 서울 용산구
\ No newline at end of file
+
+if __name__ == "__main__":
+    print(scrap_basic("https://www.yscl.go.kr/kr/member/name.do", 3))  # 서울 용산구
diff --git a/scrap/local_councils/busan.py b/scrap/local_councils/busan.py
index e859a82..be10dc3 100644
--- a/scrap/local_councils/busan.py
+++ b/scrap/local_councils/busan.py
@@ -4,7 +4,9 @@
 from scrap.utils.requests import get_soup
 
 
-def scrap_26(url='https://www.bsjunggu.go.kr/council/board/list.junggu?boardId=BBS_0000118&menuCd=DOM_000000503003000000&contentsSid=755&cpath=%2Fcouncil') -> ScrapResult:
+def scrap_26(
+    url="https://www.bsjunggu.go.kr/council/board/list.junggu?boardId=BBS_0000118&menuCd=DOM_000000503003000000&contentsSid=755&cpath=%2Fcouncil",
+) -> ScrapResult:
     """부산시 중구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -13,12 +15,16 @@ def scrap_26(url='https://www.bsjunggu.go.kr/council/board/list.junggu?boardId=B
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find('div', class_='bbs_blog council').find_all('dl'):
-        name_tag = profile.find_next('dt')
-        name = name_tag.get_text(strip=True).split()[-1].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find("div", class_="bbs_blog council").find_all("dl"):
+        name_tag = profile.find_next("dt")
+        name = (
+            name_tag.get_text(strip=True).split()[-1].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = profile.find_next('li')
+        party = "정당 정보 없음"
+        party_info = profile.find_next("li")
         if party_info:
             party = party_info.get_text(strip=True)[3:]
 
@@ -27,11 +33,13 @@ def scrap_26(url='https://www.bsjunggu.go.kr/council/board/list.junggu?boardId=B
     return ScrapResult(
         council_id="busan-junggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_27(url='https://www.bsseogu.go.kr/council/board/list.bsseogu?boardId=BBS_0000097&categoryCode1=8&menuCd=DOM_000000603001000000&contentsSid=785&cpath=%2Fcouncil') -> ScrapResult:
+def scrap_27(
+    url="https://www.bsseogu.go.kr/council/board/list.bsseogu?boardId=BBS_0000097&categoryCode1=8&menuCd=DOM_000000603001000000&contentsSid=785&cpath=%2Fcouncil",
+) -> ScrapResult:
     """부산시 서구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -40,22 +48,22 @@ def scrap_27(url='https://www.bsseogu.go.kr/council/board/list.bsseogu?boardId=B
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    # 프로필 링크 스크랩을 위해 base_url 추출
+    # 프로필 링크 스크랩을 위해 base_url 추출
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for profile in soup.find_all('div', class_='intro'):
-        name_tag = profile.find_next('span').find_next('span')
+    for profile in soup.find_all("div", class_="intro"):
+        name_tag = profile.find_next("span").find_next("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
+        party = "정당 정보 없음"
         # 프로필보기 링크 가져오기
-        profile_link = profile.find('a')
+        profile_link = profile.find("a")
         if profile_link:
-            profile_url = base_url + '/council' + profile_link['href']
+            profile_url = base_url + "/council" + profile_link["href"]
             profile_soup = get_soup(profile_url, verify=False)
 
-            party_info = profile_soup.find('span', string='소속정당')
+            party_info = profile_soup.find("span", string="소속정당")
             if party_info and (party_span := party_info.parent) is not None:
                 party = party_span.text[4:].strip()
 
@@ -64,11 +72,13 @@ def scrap_27(url='https://www.bsseogu.go.kr/council/board/list.bsseogu?boardId=B
     return ScrapResult(
         council_id="busan-seogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_28(url='https://www.bsdonggu.go.kr/council/index.donggu?menuCd=DOM_000000502004000000') -> ScrapResult:
+def scrap_28(
+    url="https://www.bsdonggu.go.kr/council/index.donggu?menuCd=DOM_000000502004000000",
+) -> ScrapResult:
     """부산시 동구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -77,25 +87,25 @@ def scrap_28(url='https://www.bsdonggu.go.kr/council/index.donggu?menuCd=DOM_000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='council_box'):
-        name_tag = profile.find_next('span', class_='n2')
+    for profile in soup.find_all("div", class_="council_box"):
+        name_tag = profile.find_next("span", class_="n2")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find_next('span', class_='n1')
+        party = "정당 정보 없음"
+        party_info = profile.find_next("span", class_="n1")
         if party_info:
-            party = party_info.get_text(strip=True).split('(')[1][:-1].strip()
+            party = party_info.get_text(strip=True).split("(")[1][:-1].strip()
 
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="busan-donggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_29(url='https://www.yeongdo.go.kr/council/01211/01212.web') -> ScrapResult:
+def scrap_29(url="https://www.yeongdo.go.kr/council/01211/01212.web") -> ScrapResult:
     """부산시 영도구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -104,11 +114,15 @@ def scrap_29(url='https://www.yeongdo.go.kr/council/01211/01212.web') -> ScrapRe
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='even-grid gap3pct panel1 p01205bg'):
-        name_tag = profile.find_next('strong', class_='h1 title')
-        name = name_tag.get_text(strip=True).split(' ')[0].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find_all("div", class_="even-grid gap3pct panel1 p01205bg"):
+        name_tag = profile.find_next("strong", class_="h1 title")
+        name = (
+            name_tag.get_text(strip=True).split(" ")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
+        party = "정당 정보 없음"
         # TODO
 
         councilors.append(Councilor(name=name, party=party))
@@ -116,51 +130,57 @@ def scrap_29(url='https://www.yeongdo.go.kr/council/01211/01212.web') -> ScrapRe
     return ScrapResult(
         council_id="busan-yeongdogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_30(url='https://council.busanjin.go.kr/content/member/member.html') -> ScrapResult:
+def scrap_30(
+    url="https://council.busanjin.go.kr/content/member/member.html",
+) -> ScrapResult:
     """부산시 부산진구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False).find('ul', class_='mlist')
+    soup = get_soup(url, verify=False).find("ul", class_="mlist")
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl'):
-        name_tag = profile.find('dd', class_='name')
+    for profile in soup.find_all("dl"):
+        name_tag = profile.find("dd", class_="name")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find_all('b')[2]
+        party = "정당 정보 없음"
+        party_info = profile.find_all("b")[2]
         if party_info:
-            party = party_info.find_next('span', class_='itemContent').get_text(strip=True)
+            party = party_info.find_next("span", class_="itemContent").get_text(
+                strip=True
+            )
 
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="busan-busanjingu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
    )
 
 
-def scrap_31(url='http://council.dongnae.go.kr/source/kr/member/active.html') -> ScrapResult:
+def scrap_31(
+    url="http://council.dongnae.go.kr/source/kr/member/active.html",
+) -> ScrapResult:
     """부산시 동래구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False, encoding='euc-kr')
+    soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
 
-    for name_tag in soup.find_all('li', class_='name'):
+    for name_tag in soup.find_all("li", class_="name"):
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = name_tag.find_next('li').find_next('li')
+        party = "정당 정보 없음"
+        party_info = name_tag.find_next("li").find_next("li")
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -169,11 +189,11 @@ def scrap_31(url='http://council.dongnae.go.kr/source/kr/member/active.html') ->
     return ScrapResult(
         council_id="busan-dongnaegu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_32(url='https://council.bsnamgu.go.kr/kr/member/active') -> ScrapResult:
+def scrap_32(url="https://council.bsnamgu.go.kr/kr/member/active") -> ScrapResult:
     """부산시 남구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -182,25 +202,32 @@ def scrap_32(url='https://council.bsnamgu.go.kr/kr/member/active') -> ScrapResul
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl', class_='profile'):
-        name_tag = profile.find('strong')
+    for profile in soup.find_all("dl", class_="profile"):
+        name_tag = profile.find("strong")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', class_='sbj', string='정 당')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", class_="sbj", string="정 당")
         if party_info:
-            party = party_info.find_next('span', class_='detail').get_text(strip=True).split()[-1].strip()
+            party = (
+                party_info.find_next("span", class_="detail")
+                .get_text(strip=True)
+                .split()[-1]
+                .strip()
+            )
 
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="busan-namgu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_33(url='https://www.bsbukgu.go.kr/council/index.bsbukgu?menuCd=DOM_000000808001001000') -> ScrapResult:
+def scrap_33(
+    url="https://www.bsbukgu.go.kr/council/index.bsbukgu?menuCd=DOM_000000808001001000",
+) -> ScrapResult:
     """부산시 북구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -209,12 +236,12 @@ def scrap_33(url='https://www.bsbukgu.go.kr/council/index.bsbukgu?menuCd=DOM_000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl', class_='info'):
-        name_tag = profile.find('span')
+    for profile in soup.find_all("dl", class_="info"):
+        name_tag = profile.find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', string='소속정당')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", string="소속정당")
         if party_info:
             party = party_info.parent.get_text(strip=True).split()[-1].strip()
 
@@ -223,33 +250,35 @@ def scrap_33(url='https://www.bsbukgu.go.kr/council/index.bsbukgu?menuCd=DOM_000
     return ScrapResult(
         council_id="busan-bukgu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_34(url='https://council.haeundae.go.kr/board/list.do?boardId=BBS_0000096&categoryCode1=08&menuCd=DOM_000000702001001000&contentsSid=330') -> ScrapResult:
+def scrap_34(
+    url="https://council.haeundae.go.kr/board/list.do?boardId=BBS_0000096&categoryCode1=08&menuCd=DOM_000000702001001000&contentsSid=330",
+) -> ScrapResult:
     """부산시 해운대구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False).find('div', class_='initial_list')
+    soup = get_soup(url, verify=False).find("div", class_="initial_list")
     councilors: list[Councilor] = []
 
     # 프로필 링크 스크랩을 위해 base_url 추출
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for name_tag in soup.find_all('dd'):
+    for name_tag in soup.find_all("dd"):
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
         # 프로필보기 링크 가져오기
-        profile_link = name_tag.find('a')
+        profile_link = name_tag.find("a")
         if profile_link:
-            profile_url = base_url + profile_link['href']
+            profile_url = base_url + profile_link["href"]
             profile_soup = get_soup(profile_url, verify=False)
 
-            party_info = profile_soup.find('span', string='소속정당')
+            party_info = profile_soup.find("span", string="소속정당")
             if party_info and (party_span := party_info.parent) is not None:
                 party = party_span.text[4:].strip()
 
@@ -258,26 +287,28 @@ def scrap_34(url='https://council.haeundae.go.kr/board/list.do?boardId=BBS_00000
     return ScrapResult(
         council_id="busan-haeundaegu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_35(url='https://council.gijang.go.kr/source/korean/member/active.html') -> ScrapResult:
+def scrap_35(
+    url="https://council.gijang.go.kr/source/korean/member/active.html",
+) -> ScrapResult:
     """부산시 기장군 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False, encoding='euc-kr')
+    soup = get_soup(url, verify=False, encoding="euc-kr")
    councilors: list[Councilor] = []
 
-    for profile in soup.find_all('ul', class_='wulli bul02'):
-        li_tags = profile.find_all('li')
+    for profile in soup.find_all("ul", class_="wulli bul02"):
+        li_tags = profile.find_all("li")
         name_tag = li_tags[0]
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
+        party = "정당 정보 없음"
         party_info = li_tags[2]
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -287,11 +318,13 @@ def scrap_35(url='https://council.gijang.go.kr/source/korean/member/active.html'
     return ScrapResult(
         council_id="busan-gijanggun",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_36(url='https://www.saha.go.kr/council/congressMember/list03.do?mId=0403000000') -> ScrapResult:
+def scrap_36(
+    url="https://www.saha.go.kr/council/congressMember/list03.do?mId=0403000000",
+) -> ScrapResult:
     """부산시 사하구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -300,12 +333,12 @@ def scrap_36(url='https://www.saha.go.kr/council/congressMember/list03.do?mId=04
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for district_tag in soup.find_all('div', class_='list_member'):
-        for name_tag in district_tag.find_all('h4', class_='name'):
+    for district_tag in soup.find_all("div", class_="list_member"):
+        for name_tag in district_tag.find_all("h4", class_="name"):
             name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-            party = '정당 정보 없음'
-            party_info = name_tag.find_next('span', string='소속당 : ')
+            party = "정당 정보 없음"
+            party_info = name_tag.find_next("span", string="소속당 : ")
             if party_info:
                 party = party_info.parent.get_text(strip=True)[7:].strip()
 
@@ -314,27 +347,29 @@ def scrap_36(url='https://www.saha.go.kr/council/congressMember/list03.do?mId=04
     return ScrapResult(
         council_id="busan-sahagu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_37(url='https://council.geumjeong.go.kr/index.geumj?menuCd=DOM_000000716001000000') -> ScrapResult:
+def scrap_37(
+    url="https://council.geumjeong.go.kr/index.geumj?menuCd=DOM_000000716001000000",
+) -> ScrapResult:
     """부산시 금정구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False).find('div', class_='council_list')
+    soup = get_soup(url, verify=False).find("div", class_="council_list")
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('a'):
-        name_tag = profile.find('span', class_='tit').find('span')
+    for profile in soup.find_all("a"):
+        name_tag = profile.find("span", class_="tit").find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        profile_url = profile['href'][:65] + '1' + profile['href'][66:]
+        profile_url = profile["href"][:65] + "1" + profile["href"][66:]
         profile_soup = get_soup(profile_url, verify=False)
 
-        party_info = profile_soup.find('span', class_='name', string='정당')
+        party_info = profile_soup.find("span", class_="name", string="정당")
         if party_info and (party_span := party_info.parent) is not None:
             party = party_span.text[2:].strip()
 
@@ -343,11 +378,13 @@ def scrap_37(url='https://council.geumjeong.go.kr/index.geumj?menuCd=DOM_0000007
     return ScrapResult(
         council_id="busan-geumjeonggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_38(url='https://www.bsgangseo.go.kr/council/contents.do?mId=0203000000') -> ScrapResult:
+def scrap_38(
+    url="https://www.bsgangseo.go.kr/council/contents.do?mId=0203000000",
+) -> ScrapResult:
     """부산시 강서구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -356,14 +393,16 @@ def scrap_38(url='https://www.bsgangseo.go.kr/council/contents.do?mId=0203000000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile_img in soup.find_all('button', class_='btn_close'):
-        profile = profile_img.find_next('dl')
+    for profile_img in soup.find_all("button", class_="btn_close"):
+        profile = profile_img.find_next("dl")
 
-        name_tag = profile.find('dd', class_='name')
-        name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
+        name_tag = profile.find("dd", class_="name")
+        name = (
+            name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', class_='bold', string='정당 : ')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", class_="bold", string="정당 : ")
         if party_info:
             party = party_info.parent.get_text(strip=True)[5:].strip()
 
@@ -372,11 +411,13 @@ def scrap_38(url='https://www.bsgangseo.go.kr/council/contents.do?mId=0203000000
     return ScrapResult(
         council_id="busan-gangseogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_39(url='https://www.yeonje.go.kr/council/assemblyIntro/list.do?mId=0201000000') -> ScrapResult:
+def scrap_39(
+    url="https://www.yeonje.go.kr/council/assemblyIntro/list.do?mId=0201000000",
+) -> ScrapResult:
     """부산시 연제구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -385,11 +426,11 @@ def scrap_39(url='https://www.yeonje.go.kr/council/assemblyIntro/list.do?mId=020
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl', class_='info'):
-        name_tag = profile.find('span')
+    for profile in soup.find_all("dl", class_="info"):
+        name_tag = profile.find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당정보없음'
+        party = "정당정보없음"
         # TODO
 
         councilors.append(Councilor(name=name, party=party))
@@ -398,11 +439,13 @@ def scrap_39(url='https://www.yeonje.go.kr/council/assemblyIntro/list.do?mId=020
     return ScrapResult(
         council_id="busan-yeonjegu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_40(url='https://www.suyeong.go.kr/council/index.suyeong?menuCd=DOM_000001402001001000&link=success&cpath=%2Fcouncil') -> ScrapResult:
+def scrap_40(
+    url="https://www.suyeong.go.kr/council/index.suyeong?menuCd=DOM_000001402001001000&link=success&cpath=%2Fcouncil",
+) -> ScrapResult:
     """부산시 수영구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -411,12 +454,12 @@ def scrap_40(url='https://www.suyeong.go.kr/council/index.suyeong?menuCd=DOM_000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='mem_info'):
-        name_tag = profile.find('span', class_='name').find('span')
+    for profile in soup.find_all("div", class_="mem_info"):
+        name_tag = profile.find("span", class_="name").find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', string='소속정당 :')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", string="소속정당 :")
         if party_info:
             party = party_info.parent.get_text(strip=True)[6:].strip()
 
@@ -425,11 +468,13 @@ def scrap_40(url='https://www.suyeong.go.kr/council/index.suyeong?menuCd=DOM_000
     return ScrapResult(
         council_id="busan-suyeonggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_41(url='https://www.sasang.go.kr/council/index.sasang?menuCd=DOM_000000202005000000') -> ScrapResult:
+def scrap_41(
+    url="https://www.sasang.go.kr/council/index.sasang?menuCd=DOM_000000202005000000",
+) -> ScrapResult:
     """부산시 사상구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -438,24 +483,28 @@ def scrap_41(url='https://www.sasang.go.kr/council/index.sasang?menuCd=DOM_00000
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for district in soup.find_all('ul', class_='council_list'):
-        for profile in district.find_all('li'):
-            name_tag = profile.find('span', class_='tit')
-            name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
-
-            party = '정당 정보 없음'
-            party_info = profile.find('span', class_='con')
+    for district in soup.find_all("ul", class_="council_list"):
+        for profile in district.find_all("li"):
+            name_tag = profile.find("span", class_="tit")
+            name = (
+                name_tag.get_text(strip=True).split()[0].strip()
+                if name_tag
+                else "이름 정보 없음"
+            )
+
+            party = "정당 정보 없음"
+            party_info = profile.find("span", class_="con")
             if party_info:
-                party = party_info.get_text(strip=True).split(']')[0].strip()[1:]
+                party = party_info.get_text(strip=True).split("]")[0].strip()[1:]
 
             councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="busan-sasanggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
-if __name__ == '__main__':
-    print(scrap_41())
\ No newline at end of file
+
+if __name__ == "__main__":
+    print(scrap_41())
diff --git a/scrap/local_councils/daegu.py b/scrap/local_councils/daegu.py
index a11baac..f565e11 100644
--- a/scrap/local_councils/daegu.py
+++ b/scrap/local_councils/daegu.py
@@ -4,21 +4,25 @@
 from scrap.utils.requests import get_soup
 
 
-def scrap_42(url='https://junggucouncil.daegu.kr/source/main03/main01.html?d_th=8') -> ScrapResult:
+def scrap_42(
+    url="https://junggucouncil.daegu.kr/source/main03/main01.html?d_th=8",
+) -> ScrapResult:
     """대전시 중구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    soup = get_soup(url, verify=False, encoding='euc-kr')
+    soup = get_soup(url, verify=False, encoding="euc-kr")
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='profile'):
-        name_tag = profile.find('li', class_='name')
-        name = name_tag.get_text(strip=True).split()[1].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("li", class_="name")
+        name = (
+            name_tag.get_text(strip=True).split()[1].strip() if name_tag else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = name_tag.find_next('li').find_next('li')
+        party = "정당 정보 없음"
+        party_info = name_tag.find_next("li").find_next("li")
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -27,11 +31,13 @@ def scrap_42(url='https://junggucouncil.daegu.kr/source/main03/main01.html?d_th=
     return ScrapResult(
         council_id="daejeon-junggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_43(url='https://www.donggucl.daegu.kr/content/member/member.html') -> ScrapResult:
+def scrap_43(
+    url="https://www.donggucl.daegu.kr/content/member/member.html",
+) -> ScrapResult:
     """대전시 동구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -39,22 +45,26 @@ def scrap_43(url='https://www.donggucl.daegu.kr/content/member/member.html') ->
     """
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
-
-    # 프로필 링크 스크랩을 위해 base_url 추출
+
+    # 프로필 링크 스크랩을 위해 base_url 추출
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for name_tag in soup.find_all('dd', class_='name'):
-        name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음"
-        party = '정당 정보 없음'
+    for name_tag in soup.find_all("dd", class_="name"):
+        name = (
+            name_tag.get_text(strip=True).split("(")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
+        party = "정당 정보 없음"
 
-        profile_link = name_tag.find_next('a', class_='abtn_profile')
+        profile_link = name_tag.find_next("a", class_="abtn_profile")
         if profile_link:
-            profile_url = base_url + profile_link['href']
+            profile_url = base_url + profile_link["href"]
             profile_soup = get_soup(profile_url, verify=False)
 
-            party_info = profile_soup.find('th', scope='row', string='소속정당')
-            if party_info and (party_span := party_info.find_next('td')) is not None:
+            party_info = profile_soup.find("th", scope="row", string="소속정당")
+            if party_info and (party_span := party_info.find_next("td")) is not None:
                 party = party_span.get_text(strip=True)
 
         councilors.append(Councilor(name=name, party=party))
@@ -62,11 +72,11 @@ def scrap_43(url='https://www.donggucl.daegu.kr/content/member/member.html') ->
     return ScrapResult(
         council_id="daejeon-donggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_44(url='https://www.dgscouncil.go.kr/kr/member/active') -> ScrapResult:
+def scrap_44(url="https://www.dgscouncil.go.kr/kr/member/active") -> ScrapResult:
     """대전시 서구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -75,12 +85,16 @@ def scrap_44(url='https://www.dgscouncil.go.kr/kr/member/active') -> ScrapResult
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('dl', class_='profile'):
-        name_tag = profile.find('strong', class_='name')
-        name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find_all("dl", class_="profile"):
+        name_tag = profile.find("strong", class_="name")
+        name = (
+            name_tag.get_text(strip=True).split("(")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = profile.find('li').find_next('li').find_next('li')
+        party = "정당 정보 없음"
+        party_info = profile.find("li").find_next("li").find_next("li")
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -89,11 +103,13 @@ def scrap_44(url='https://www.dgscouncil.go.kr/kr/member/active') -> ScrapResult
     return ScrapResult(
         council_id="daejeon-seogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_45(url='https://nam.daegu.kr/council/index.do?menu_id=00000548') -> ScrapResult:
+def scrap_45(
+    url="https://nam.daegu.kr/council/index.do?menu_id=00000548",
+) -> ScrapResult:
     """대전시 남구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -102,12 +118,14 @@ def scrap_45(url='https://nam.daegu.kr/council/index.do?menu_id=00000548') -> Sc
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='profile'):
-        name_tag = profile.find('span', class_='name2')
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("span", class_="name2")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find('span', class_='name', string='소속정당').find_next('span', class_='name3')
+        party = "정당 정보 없음"
+        party_info = profile.find("span", class_="name", string="소속정당").find_next(
+            "span", class_="name3"
+        )
         if party_info:
             party = party_info.get_text(strip=True)
 
@@ -116,11 +134,11 @@ def scrap_45(url='https://nam.daegu.kr/council/index.do?menu_id=00000548') -> Sc
     return ScrapResult(
         council_id="daejeon-namgu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_46(url='https://bukgucouncil.daegu.kr/kr/member/name.do') -> ScrapResult:
+def scrap_46(url="https://bukgucouncil.daegu.kr/kr/member/name.do") -> ScrapResult:
     """대전시 북구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -129,12 +147,14 @@ def scrap_46(url='https://bukgucouncil.daegu.kr/kr/member/name.do') -> ScrapResu
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='profile'):
-        name_tag = profile.find('em', class_='name')
-        name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("em", class_="name")
+        name = (
+            name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = profile.find('em', string='소속정당 : ').find_next('span')
+        party = "정당 정보 없음"
+        party_info = profile.find("em", string="소속정당 : ").find_next("span")
         if party_info:
             party = party_info.get_text(strip=True)
 
@@ -143,11 +163,13 @@ def scrap_46(url='https://bukgucouncil.daegu.kr/kr/member/name.do') -> ScrapResu
     return ScrapResult(
         council_id="daejeon-bukgu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_47(url='https://suseongcouncil.suseong.kr/ss_council/content/?pos=active&me_code=2010') -> ScrapResult:
+def scrap_47(
+    url="https://suseongcouncil.suseong.kr/ss_council/content/?pos=active&me_code=2010",
+) -> ScrapResult:
     """대전시 수성구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -156,12 +178,12 @@ def scrap_47(url='https://suseongcouncil.suseong.kr/ss_council/content/?pos=acti
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for profile in soup.find_all('div', class_='item'):
-        name_tag = profile.find('p', class_='name').find('span')
+    for profile in soup.find_all("div", class_="item"):
+        name_tag = profile.find("p", class_="name").find("span")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
-        party_info = profile.find_all('li')[2].find('span')
+        party = "정당 정보 없음"
+        party_info = profile.find_all("li")[2].find("span")
         if party_info:
             party = party_info.get_text(strip=True)
 
@@ -170,11 +192,13 @@ def scrap_47(url='https://suseongcouncil.suseong.kr/ss_council/content/?pos=acti
     return ScrapResult(
         council_id="daejeon-suseonggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_48(url='https://www.dalseocouncil.daegu.kr/content/member/member.html') -> ScrapResult:
+def scrap_48(
+    url="https://www.dalseocouncil.daegu.kr/content/member/member.html",
+) -> ScrapResult:
     """대전시 달서구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -183,11 +207,15 @@ def scrap_48(url='https://www.dalseocouncil.daegu.kr/content/member/member.html'
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for name_tag in soup.find_all('dd', class_='name'):
-        name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음"
+    for name_tag in soup.find_all("dd", class_="name"):
+        name = (
+            name_tag.get_text(strip=True).split("(")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
 
-        party = '정당 정보 없음'
-        party_info = name_tag.find_next('span', string='소속정당').parent
+        party = "정당 정보 없음"
+        party_info = name_tag.find_next("span", string="소속정당").parent
         if party_info:
             party = party_info.get_text(strip=True).split()[-1].strip()
 
@@ -196,11 +224,13 @@ def scrap_48(url='https://www.dalseocouncil.daegu.kr/content/member/member.html'
     return ScrapResult(
         council_id="daejeon-dalseogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_49(url='https://council.dalseong.go.kr/content/member/member.html') -> ScrapResult:
+def scrap_49(
+    url="https://council.dalseong.go.kr/content/member/member.html",
+) -> ScrapResult:
     """대전시 달성군 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -213,27 +243,35 @@ def scrap_49(url='https://council.dalseong.go.kr/content/member/member.html') ->
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for name_tag in soup.find_all('dd', class_='name'):
-        name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음"
-        party = '정당 정보 없음'
+    for name_tag in soup.find_all("dd", class_="name"):
+        name = (
+            name_tag.get_text(strip=True).split("(")[0].strip()
+            if name_tag
+            else "이름 정보 없음"
+        )
+        party = "정당 정보 없음"
 
-        profile_link = name_tag.find_next('a', class_='abtn1')
+        profile_link = name_tag.find_next("a", class_="abtn1")
         if profile_link:
-            profile_url = base_url + profile_link['href']
+            profile_url = base_url + profile_link["href"]
             profile_soup = get_soup(profile_url, verify=False)
 
-            party_info = profile_soup.find('span', class_='item', string='소속정당')
-            if party_info and (party_span := party_info.find_next('span', class_='item_content')) is not None:
+            party_info = profile_soup.find("span", class_="item", string="소속정당")
+            if (
+                party_info
+                and (party_span := party_info.find_next("span", class_="item_content"))
+                is not None
+            ):
                 party = party_span.get_text(strip=True)
-
+
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="daejeon-dalseonggun",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
-if __name__ == '__main__':
-    print(scrap_49())
\ No newline at end of file
+
+if __name__ == "__main__":
+    print(scrap_49())
diff --git a/scrap/local_councils/daejeon.py b/scrap/local_councils/daejeon.py
index 14a484f..091a130 100644
--- a/scrap/local_councils/daejeon.py
+++ b/scrap/local_councils/daejeon.py
@@ -5,12 +5,13 @@
 from scrap.utils.requests import get_soup
 import re
 
-def scrap_65(url = 'https://council.donggu.go.kr/kr/member/active') -> ScrapResult:
-    '''대전시 동구 페이지에서 의원 상세약력 스크랩
+
+def scrap_65(url="https://council.donggu.go.kr/kr/member/active") -> ScrapResult:
+    """대전시 동구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
-    '''
+    """
     soup = get_soup(url, verify=False)
     councilors: List[Councilor] = []
 
@@ -18,20 +19,23 @@ def scrap_65(url = 'https://council.donggu.go.kr/kr/member/active') -> ScrapResu
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-    for profile in soup.find_all('dl', class_='profile'):
+    for profile in soup.find_all("dl", class_="profile"):
         name_tag = profile.find("strong", class_="name")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
-        party = '정당 정보 없음'
+        party = "정당 정보 없음"
         # 프로필보기 링크 가져오기
-        profile_link = profile.find('a', class_='start')
+        profile_link = profile.find("a", class_="start")
         if profile_link:
-            data_uid = profile_link.get('data-uid')
+            data_uid = profile_link.get("data-uid")
             if data_uid:
-                profile_url = base_url + f'/kr/member/profile_popup?uid={data_uid}'
+                profile_url = base_url + f"/kr/member/profile_popup?uid={data_uid}"
                 profile_soup = get_soup(profile_url, verify=False)
-                party_info = profile_soup.find('strong', string='정 당')
-                if party_info and (party_span := party_info.find_next('span')) is not None:
+                party_info = profile_soup.find("strong", string="정 당")
+                if (
+                    party_info
+                    and (party_span := party_info.find_next("span")) is not None
+                ):
                     party = party_span.text
 
         councilors.append(Councilor(name=name, party=party))
@@ -39,19 +43,20 @@ def scrap_65(url = 'https://council.donggu.go.kr/kr/member/active') -> ScrapResu
     return ScrapResult(
         council_id="daejeon-donggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
-def scrap_66(url = 'https://council.djjunggu.go.kr/kr/member/name.do') -> ScrapResult:
-    '''대전시 중구 페이지에서 의원 상세약력 스크랩
+
+def scrap_66(url="https://council.djjunggu.go.kr/kr/member/name.do") -> ScrapResult:
+    """대전시 중구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
-    '''
+    """
     soup = get_soup(url, verify=False)
     councilors: List[Councilor] = []
 
-    for profile in soup.find_all('div', class_='profile'):
+    for profile in soup.find_all("div", class_="profile"):
         name_tag = profile.find("div", class_="name")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
@@ -64,24 +69,29 @@ def scrap_66(url = 'https://council.djjunggu.go.kr/kr/member/name.do') -> ScrapR
     return ScrapResult(
         council_id="daejeon-junggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
-def scrap_67(url = 'https://www.seogucouncil.daejeon.kr/svc/mbr/MbrPresent.do') -> ScrapResult:
-    '''대전시 서구 페이지에서 의원 상세약력 스크랩
+
+def scrap_67(
+    url="https://www.seogucouncil.daejeon.kr/svc/mbr/MbrPresent.do",
+) -> ScrapResult:
+    """대전시 서구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
-    '''
+    """
     soup = get_soup(url, verify=False)
     councilors: List[Councilor] = []
 
-    for profile in soup.find_all('dl'):
+    for profile in soup.find_all("dl"):
         name_tag = profile.find("dd", class_="name")
-        name = name_tag.get_text(strip=True).replace(" 의원", "") if name_tag else "이름 정보 없음"
+        name = (
+            name_tag.get_text(strip=True).replace(" 의원", "") if name_tag else "이름 정보 없음"
+        )
 
         party = "정당 정보 없음"
-        party_info = list(filter(lambda x: '정당' in str(x), profile.find_all("dd")))
+        party_info = list(filter(lambda x: "정당" in str(x), profile.find_all("dd")))
         if party_info:
             party = party_info[0].get_text(strip=True).replace("정당: ", "")
 
@@ -90,25 +100,26 @@ def scrap_67(url = 'https://www.seogucouncil.daejeon.kr/svc/mbr/MbrPresent.do')
     return ScrapResult(
         council_id="daejeon-seogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
-def scrap_68(url = 'https://yuseonggucouncil.go.kr/page/page02_01_01.php') -> ScrapResult:
-    '''대전시 유성구 페이지에서 의원 상세약력 스크랩
+
+def scrap_68(url="https://yuseonggucouncil.go.kr/page/page02_01_01.php") -> ScrapResult:
+    """대전시 유성구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
-    '''
+    """
     soup = get_soup(url, verify=False)
     councilors: List[Councilor] = []
 
-    for profile in soup.find_all('div', class_='profile'):
+    for profile in soup.find_all("div", class_="profile"):
         name_tag = profile.find("em", class_="name")
         # () 안에 있는 한자를 제거 (ex. 김영희(金英姬) -> 김영희)
-        name = name_tag.get_text(strip=True).split('(')[0] if name_tag else "이름 정보 없음"
+        name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음"
 
         party = "정당 정보 없음"
-        regex_pattern = re.compile(r'정\s*당\s*:', re.IGNORECASE) # Case-insensitive
+        regex_pattern = re.compile(r"정\s*당\s*:", re.IGNORECASE)  # Case-insensitive
         party_info = profile.find("em", string=regex_pattern)
         if party_info:
             party = party_info.find_next("span").get_text(strip=True)
@@ -117,24 +128,25 @@ def scrap_68(url = 'https://yuseonggucouncil.go.kr/page/page02_01_01.php') -> Sc
     return ScrapResult(
         council_id="daejeon-yuseonggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
-def scrap_69(url = 'https://council.daedeok.go.kr/kr/member/name.do') -> ScrapResult:
-    '''대전시 대덕구 페이지에서 의원 상세약력 스크랩
+
+def scrap_69(url="https://council.daedeok.go.kr/kr/member/name.do") -> ScrapResult:
+    """대전시 대덕구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
-    '''
+    """
     soup = get_soup(url, verify=False)
     councilors: List[Councilor] = []
 
-    for profile in soup.find_all('div', class_='profile'):
+    for profile in soup.find_all("div", class_="profile"):
         name_tag = profile.find("em", class_="name")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"
 
         party = "정당 정보 없음"
-        regex_pattern = re.compile(r'정\s*당\s*:', re.IGNORECASE) # Case-insensitive
+        regex_pattern = re.compile(r"정\s*당\s*:", re.IGNORECASE)  # Case-insensitive
         party_info = profile.find("em", string=regex_pattern)
         if party_info:
             party = party_info.find_next("span").get_text(strip=True)
@@ -143,8 +155,9 @@ def scrap_69(url = 'https://council.daedeok.go.kr/kr/member/name.do') -> ScrapRe
     return ScrapResult(
         council_id="daejeon-daedeokgu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
-if __name__ == '__main__':
-    print(scrap_69())
\ No newline at end of file
+
+if __name__ == "__main__":
+    print(scrap_69())
diff --git a/scrap/local_councils/incheon.py b/scrap/local_councils/incheon.py
index 0e40bb7..fb70b4a 100644
--- a/scrap/local_councils/incheon.py
+++ b/scrap/local_councils/incheon.py
@@ -4,7 +4,7 @@
 from scrap.utils.requests import get_soup
 
 
-def scrap_50(url='https://www.icjg.go.kr/council/cnmi0101c') -> ScrapResult:
+def scrap_50(url="https://www.icjg.go.kr/council/cnmi0101c") -> ScrapResult:
     """인천시 중구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
@@ -13,58 +13,61 @@ def scrap_50(url='https://www.icjg.go.kr/council/cnmi0101c') -> ScrapResult:
     soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []
 
-    for name_tag in soup.find_all('p', class_='name'):
-        name_tag_str = name_tag.get_text(strip=True).split('[')
+    for name_tag in soup.find_all("p", class_="name"):
+        name_tag_str = name_tag.get_text(strip=True).split("[")
         name = name_tag_str[0].strip()
         party = name_tag_str[-1][:-1].strip()
-
+
         councilors.append(Councilor(name=name, party=party))
 
     return ScrapResult(
         council_id="incheon-junggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )
 
 
-def scrap_51(url='https://council.icdonggu.go.kr/korean/member/active') -> ScrapResult:
+def scrap_51(url="https://council.icdonggu.go.kr/korean/member/active") -> ScrapResult:
     """인천시 동구 페이지에서 의원 상세약력 스크랩
 
     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
     """
-    raise Exception('현재 인천시 동구의회 사이트는 SSLV3_ALERT_HANDSHAKE_FAILURE 에러가 발생합니다')
+    raise Exception("현재 인천시 동구의회 사이트는 SSLV3_ALERT_HANDSHAKE_FAILURE 에러가 발생합니다")
     # soup
= get_soup(url, verify=False) # councilors: list[Councilor] = [] - # # 프로필 링크 스크랩을 위해 base_url 추출 - # parsed_url = urlparse(url) - # base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - # for name_tag in soup.find_all('strong', class_='name'): - # name = name_tag.get_text(strip=True) - # party = '정당 정보 없음' - - # profile_link = name_tag.find_next('a', class_='abtn1') - # if profile_link: - # profile_url = base_url + profile_link['onclick'][13:104] - # profile_soup = get_soup(profile_url, verify=False) - - # party_info = profile_soup.find('span', class_='subject', string='소속정당') - # if party_info and (party_span := party_info.find_next('span', class_='detail')) is not None: - # party = party_span.get_text(strip=True) - - # councilors.append(Councilor(name=name, party=party)) +# # 프로필 링크 스크랩을 위해 base_url 추출 +# parsed_url = urlparse(url) +# base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - # return ScrapResult( - # council_id="incheon-donggu", - # council_type=CouncilType.LOCAL_COUNCIL, - # councilors=councilors - # ) +# for name_tag in soup.find_all('strong', class_='name'): +# name = name_tag.get_text(strip=True) +# party = '정당 정보 없음' + +# profile_link = name_tag.find_next('a', class_='abtn1') +# if profile_link: +# profile_url = base_url + profile_link['onclick'][13:104] +# profile_soup = get_soup(profile_url, verify=False) + +# party_info = profile_soup.find('span', class_='subject', string='소속정당') +# if party_info and (party_span := party_info.find_next('span', class_='detail')) is not None: +# party = party_span.get_text(strip=True) + +# councilors.append(Councilor(name=name, party=party)) + +# return ScrapResult( +# council_id="incheon-donggu", +# council_type=CouncilType.LOCAL_COUNCIL, +# councilors=councilors +# ) -def scrap_52(url='https://www.michuhol.go.kr/council/introduction/career.asp') -> ScrapResult: +def scrap_52( + url="https://www.michuhol.go.kr/council/introduction/career.asp", +) -> ScrapResult: """인천시 미추홀구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -72,19 +75,23 @@ def scrap_52(url='https://www.michuhol.go.kr/council/introduction/career.asp') - """ soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - - script = soup.find('div', class_='contents_header').find_next('script').get_text(strip=True) - # TODO + script = ( + soup.find("div", class_="contents_header") + .find_next("script") + .get_text(strip=True) + ) + + # TODO return ScrapResult( council_id="incheon-michuholgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_53(url='https://council.yeonsu.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_53(url="https://council.yeonsu.go.kr/kr/member/name.do") -> ScrapResult: """인천시 연수구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -93,25 +100,27 @@ def scrap_53(url='https://council.yeonsu.go.kr/kr/member/name.do') -> ScrapResul soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('strong') - name = name_tag.get_text(strip=True) if name_tag else '이름 정보 없음' - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당').find_next('span').find_next('span') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("strong") + name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" + + party = "정당 정보 없음" + party_info = ( + profile.find("em", string="소속정당").find_next("span").find_next("span") + ) if party_info: party = party_info.get_text(strip=True) - + 
councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="incheon-yeonsugu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_54(url='https://council.namdong.go.kr/kr/member/active.do') -> ScrapResult: +def scrap_54(url="https://council.namdong.go.kr/kr/member/active.do") -> ScrapResult: """인천시 남동구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -120,31 +129,31 @@ def scrap_54(url='https://council.namdong.go.kr/kr/member/active.do') -> ScrapRe soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('em', class_='name') - name = name_tag.get_text(strip=True) if name_tag else '이름 정보 없음' - - party = '정당 정보 없음' - party_info = profile.find('em', string='정 당 : ').find_next('span') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") + name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" + + party = "정당 정보 없음" + party_info = profile.find("em", string="정 당 : ").find_next("span") if party_info: party = party_info.get_text(strip=True) - + councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="incheon-namdonggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_55(url='https://council.icbp.go.kr/kr/member/active') -> ScrapResult: +def scrap_55(url="https://council.icbp.go.kr/kr/member/active") -> ScrapResult: """인천시 부평구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - raise Exception('현재 인천시 부평구의회 사이트는 SSLV3_ALERT_HANDSHAKE_FAILURE 에러가 발생합니다') + raise Exception("현재 인천시 부평구의회 사이트는 SSLV3_ALERT_HANDSHAKE_FAILURE 에러가 발생합니다") # soup = get_soup(url, verify=False) # councilors: list[Councilor] = [] @@ -152,12 +161,12 @@ def scrap_55(url='https://council.icbp.go.kr/kr/member/active') -> ScrapResult: # for profile in soup.find_all('div', class_='profile'): # name_tag = profile.find('strong', class_='name') # name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else '이름 정보 없음' - + # party = '정당 정보 없음' # party_info = profile.find('strong', string='소속정당').find_next('span') # if party_info: # party = party_info.get_text(strip=True).split()[-1].strip() - + # councilors.append(Councilor(name=name, party=party)) # return ScrapResult( @@ -167,7 +176,9 @@ def scrap_55(url='https://council.icbp.go.kr/kr/member/active') -> ScrapResult: # ) -def scrap_56(url='https://www.gyeyang.go.kr/open_content/council/member/present/present.jsp') -> ScrapResult: +def scrap_56( + url="https://www.gyeyang.go.kr/open_content/council/member/present/present.jsp", +) -> ScrapResult: """인천시 계양구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -176,23 +187,24 @@ def scrap_56(url='https://www.gyeyang.go.kr/open_content/council/member/present/ soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for name_tag in soup.find_all('li', class_='name'): - name = name_tag.get_text(strip=True) if name_tag else '이름 정보 없음' - - party = '정당 정보 없음' - party_info = name_tag.find_next('li').find_next('li').find('span', class_='span_sfont') + for name_tag in soup.find_all("li", class_="name"): + name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" + + party = "정당 정보 없음" + party_info = ( + name_tag.find_next("li").find_next("li").find("span", class_="span_sfont") + ) if party_info: party = party_info.get_text(strip=True) - + councilors.append(Councilor(name=name, 
party=party))

     return ScrapResult(
         council_id="seoul-jongnogu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )


-def scrap_2(url = 'https://02jgnew.council.or.kr/kr/member/active') -> ScrapResult:
-    '''서울시 중구 페이지에서 의원 상세약력 스크랩
+def scrap_2(url="https://02jgnew.council.or.kr/kr/member/active") -> ScrapResult:
+    """서울시 중구 페이지에서 의원 상세약력 스크랩

     :param url: 의원 목록 사이트 url
     :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체
-    '''
+    """
     parliment_soup = get_soup(url, verify=False)
     councilors: list[Councilor] = []

@@ -42,19 +44,19 @@ def scrap_2(url = 'https://02jgnew.council.or.kr/kr/member/active') -> ScrapResu
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

-    for profile in parliment_soup.find_all('div', class_='profile'):
+    for profile in parliment_soup.find_all("div", class_="profile"):
         name_tag = profile.find("em", class_="name")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"

-        party = '정당 정보 없음'
+        party = "정당 정보 없음"
         # 프로필보기 링크 가져오기
-        profile_link = profile.find('a', class_='start')
+        profile_link = profile.find("a", class_="start")
         if profile_link:
-            profile_url = base_url + profile_link['href']
+            profile_url = base_url + profile_link["href"]
             profile_soup = get_soup(profile_url, verify=False)

-            party_info = profile_soup.find('em', string='소속정당 : ')
-            if party_info and (party_span := party_info.find_next('span')) is not None:
+            party_info = profile_soup.find("em", string="소속정당 : ")
+            if party_info and (party_span := party_info.find_next("span")) is not None:
                 party = party_span.text

         councilors.append(Councilor(name=name, party=party))
@@ -62,7 +64,7 @@ def scrap_2(url = 'https://02jgnew.council.or.kr/kr/member/active') -> ScrapResu
     return ScrapResult(
         council_id="seoul-junggu",
         council_type=CouncilType.LOCAL_COUNCIL,
-        councilors=councilors
+        councilors=councilors,
     )


@@ -80,7 +82,7 @@ def scrap_3(url="https://www.yscl.go.kr/kr/member/name.do") -> ScrapResult:
         name_tag = profile.find("em", class_="name")
         name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"

-        party = '정당 정보 없음'
+        party = "정당 정보 없음"
         party_info = profile.find("em",
string="소속정당") if party_info: party = party_info.find_next("span").get_text(strip=True) @@ -90,11 +92,11 @@ def scrap_3(url="https://www.yscl.go.kr/kr/member/name.do") -> ScrapResult: return ScrapResult( council_id="seoul-yongsangu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_4(url='https://sdcouncil.sd.go.kr/kr/member/active2') -> ScrapResult: +def scrap_4(url="https://sdcouncil.sd.go.kr/kr/member/active2") -> ScrapResult: """서울시 성동구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -104,11 +106,11 @@ def scrap_4(url='https://sdcouncil.sd.go.kr/kr/member/active2') -> ScrapResult: councilors: list[Councilor] = [] for profile in soup.find_all("dl", class_="profile"): - name_tag = profile.find('strong', class_='name') + name_tag = profile.find("strong", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find("strong", string='정 당 : ') + party = "정당 정보 없음" + party_info = profile.find("strong", string="정 당 : ") if party_info: party = party_info.find_next("span").get_text(strip=True) @@ -117,11 +119,11 @@ def scrap_4(url='https://sdcouncil.sd.go.kr/kr/member/active2') -> ScrapResult: return ScrapResult( council_id="seoul-seongdonggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_5(url='https://council.gwangjin.go.kr/kr/member/active') -> ScrapResult: +def scrap_5(url="https://council.gwangjin.go.kr/kr/member/active") -> ScrapResult: """서울시 광진구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -130,51 +132,57 @@ def scrap_5(url='https://council.gwangjin.go.kr/kr/member/active') -> ScrapResul soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_=lambda x: x in ('profile', 'profile_none')): - name_tag = profile.find('strong') + for profile in soup.find_all( + "div", class_=lambda x: x in ("profile", "profile_none") + ): + name_tag = profile.find("strong") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find("em", string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gwangjingu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_6(url='http://council.ddm.go.kr/citizen/menu1.asp') -> ScrapResult: +def scrap_6(url="http://council.ddm.go.kr/citizen/menu1.asp") -> ScrapResult: """서울시 동대문구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - parliment_soup = get_soup(url, verify=False, encoding='euc-kr') + parliment_soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] # 프로필 링크 스크랩을 위해 base_url 추출 parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in parliment_soup.find_all('div', class_='intro_text tm_lg_6'): - name = profile.find('p', class_='intro_text_title').string.strip().split(' ')[0] - party = '정당 정보 없음' + for profile in parliment_soup.find_all("div", class_="intro_text tm_lg_6"): + name = profile.find("p", class_="intro_text_title").string.strip().split(" ")[0] + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_link = profile.find('a') + 
profile_link = profile.find("a") if profile_link: - profile_url = base_url + '/assemblyman/greeting/menu02.asp?assembly_id=' + profile_link['href'][1:] - profile_soup = get_soup(profile_url, verify=False, encoding='euc-kr') - - profile_info = profile_soup.find('div', class_='profileTxt') + profile_url = ( + base_url + + "/assemblyman/greeting/menu02.asp?assembly_id=" + + profile_link["href"][1:] + ) + profile_soup = get_soup(profile_url, verify=False, encoding="euc-kr") + + profile_info = profile_soup.find("div", class_="profileTxt") if profile_info: - profile_string = profile_info.get_text().strip().split('\xa0') - idx = profile_string.index('소속정당') + profile_string = profile_info.get_text().strip().split("\xa0") + idx = profile_string.index("소속정당") party = profile_string[idx + 2] councilors.append(Councilor(name=name, party=party)) @@ -182,11 +190,11 @@ def scrap_6(url='http://council.ddm.go.kr/citizen/menu1.asp') -> ScrapResult: return ScrapResult( council_id="seoul-dongdaemungu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_7(url='https://council.jungnang.go.kr/kr/member/name2.do') -> ScrapResult: +def scrap_7(url="https://council.jungnang.go.kr/kr/member/name2.do") -> ScrapResult: """서울시 중랑구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -195,25 +203,25 @@ def scrap_7(url='https://council.jungnang.go.kr/kr/member/name2.do') -> ScrapRes soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find("em", string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-jungnanggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_8(url='https://www.sbc.go.kr/kr/member/active.do') -> ScrapResult: +def scrap_8(url="https://www.sbc.go.kr/kr/member/active.do") -> ScrapResult: """서울시 성북구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -222,25 +230,27 @@ def scrap_8(url='https://www.sbc.go.kr/kr/member/active.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find("em", string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").get_text(strip=True).split(' ')[-1].strip() + party = ( + party_info.find_next("span").get_text(strip=True).split(" ")[-1].strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-seongbukgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_9(url='https://council.gangbuk.go.kr/kr/member/name.do') -> ScrapResult: +def 
scrap_9(url="https://council.gangbuk.go.kr/kr/member/name.do") -> ScrapResult: """서울시 강북구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -249,25 +259,31 @@ def scrap_9(url='https://council.gangbuk.go.kr/kr/member/name.do') -> ScrapResul soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gangbukgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_10(url='https://www.council-dobong.seoul.kr/kr/member/active.do') -> ScrapResult: +def scrap_10( + url="https://www.council-dobong.seoul.kr/kr/member/active.do", +) -> ScrapResult: """서울시 도봉구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -276,25 +292,27 @@ def scrap_10(url='https://www.council-dobong.seoul.kr/kr/member/active.do') -> S soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").get_text(strip=True).split(' ')[-1].strip() + party = ( + party_info.find_next("span").get_text(strip=True).split(" ")[-1].strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-dobonggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_11(url='https://council.nowon.kr/kr/member/active.do') -> ScrapResult: +def scrap_11(url="https://council.nowon.kr/kr/member/active.do") -> ScrapResult: """서울시 노원구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -303,25 +321,27 @@ def scrap_11(url='https://council.nowon.kr/kr/member/active.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next("span").get_text(strip=True).split(' ')[-1].strip() + party = ( + party_info.find_next("span").get_text(strip=True).split(" ")[-1].strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-nowongu", 
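        # council_id: unique slug for this council; the field is documented on ScrapResult in scrap/utils/types.py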
council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_12(url='https://council.ep.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_12(url="https://council.ep.go.kr/kr/member/name.do") -> ScrapResult: """서울시 은평구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -330,52 +350,65 @@ def scrap_12(url='https://council.ep.go.kr/kr/member/name.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all("div", class_='profile'): - name_tag = profile.find('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-eunpyeonggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_13(url='https://www.sdmcouncil.go.kr/source/korean/square/ascending.html') -> ScrapResult: +def scrap_13( + url="https://www.sdmcouncil.go.kr/source/korean/square/ascending.html", +) -> ScrapResult: """서울시 서대문구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False, encoding='euc-kr') + soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] - for profile in soup.find_all('dl', class_='card_desc'): - name_tag = profile.find_next('dt') + for profile in soup.find_all("dl", class_="card_desc"): + name_tag = profile.find_next("dt") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('ul') + party = "정당 정보 없음" + party_info = profile.find("ul") if party_info: - party = party_info.find_next('li').find_next('li').find_next('li').get_text(strip=True).split(' ')[-1].strip() + party = ( + party_info.find_next("li") + .find_next("li") + .find_next("li") + .get_text(strip=True) + .split(" ")[-1] + .strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-seodaemungu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_14(url='https://council.mapo.seoul.kr/kr/member/active.do') -> ScrapResult: +def scrap_14(url="https://council.mapo.seoul.kr/kr/member/active.do") -> ScrapResult: """서울시 마포구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -384,25 +417,25 @@ def scrap_14(url='https://council.mapo.seoul.kr/kr/member/active.do') -> ScrapRe soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='wrap'): - name_tag = profile.find_next('div', class_='right') - name = name_tag.find_next('h4').get_text(strip=True) if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="wrap"): + name_tag = profile.find_next("div", class_="right") + name = name_tag.find_next("h4").get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('span', class_='tit', string='소속정당 : ') + party = "정당 정보 
없음" + party_info = profile.find("span", class_="tit", string="소속정당 : ") if party_info: - party = party_info.find_next('span', class_='con').get_text(strip=True) + party = party_info.find_next("span", class_="con").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-mapogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_15(url='https://www.ycc.go.kr/kr/member/active') -> ScrapResult: +def scrap_15(url="https://www.ycc.go.kr/kr/member/active") -> ScrapResult: """서울시 양천구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -415,19 +448,23 @@ def scrap_15(url='https://www.ycc.go.kr/kr/member/active') -> ScrapResult: parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_uid = profile.find('a', class_='start')['data-uid'] + profile_uid = profile.find("a", class_="start")["data-uid"] if profile_uid: - profile_url = base_url + '/kr/member/profile_popup?uid=' + profile_uid + profile_url = base_url + "/kr/member/profile_popup?uid=" + profile_uid profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('em', string='소속정당') - if party_info and (party_span := party_info.find_next('span')): + party_info = profile_soup.find("em", string="소속정당") + if party_info and (party_span := party_info.find_next("span")): party = party_span.get_text(strip=True) councilors.append(Councilor(name=name, party=party)) @@ -435,38 +472,44 @@ def scrap_15(url='https://www.ycc.go.kr/kr/member/active') -> ScrapResult: return ScrapResult( council_id="seoul-yangcheongu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_16(url='https://gsc.gangseo.seoul.kr/member/org.asp') -> ScrapResult: +def scrap_16(url="https://gsc.gangseo.seoul.kr/member/org.asp") -> ScrapResult: """서울시 강서구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False, encoding='euc-kr') + soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] - for profile in soup.find_all('ul', class_='mb-15'): - name_tag = profile.find_next('span', class_='fs-18 fw-700') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find_next('span', class_='title').find_next('span', class_='title').find_next('span', class_='title') + for profile in soup.find_all("ul", class_="mb-15"): + name_tag = profile.find_next("span", class_="fs-18 fw-700") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = ( + profile.find_next("span", class_="title") + .find_next("span", class_="title") + .find_next("span", class_="title") + ) if party_info: - party = party_info.find_next('span').get_text(strip=True) + party = party_info.find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( 
council_id="seoul-gangseogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_17(url='https://www.guroc.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_17(url="https://www.guroc.go.kr/kr/member/name.do") -> ScrapResult: """서울시 구로구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -475,38 +518,46 @@ def scrap_17(url='https://www.guroc.go.kr/kr/member/name.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gurogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_18(url='https://council.geumcheon.go.kr/member/member.asp') -> ScrapResult: +def scrap_18(url="https://council.geumcheon.go.kr/member/member.asp") -> ScrapResult: """서울시 금천구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False, encoding='euc-kr') + soup = get_soup(url, verify=False, encoding="euc-kr") councilors: list[Councilor] = [] - for profile in soup.find_all('li', class_='name'): - name_tag = profile.find_next('strong') - name = name_tag.get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("li", class_="name"): + name_tag = profile.find_next("strong") + name = ( + name_tag.get_text(strip=True).split("(")[0].strip() + if name_tag + else "이름 정보 없음" + ) - party = '정당 정보 없음' + party = "정당 정보 없음" # TODO councilors.append(Councilor(name=name, party=party)) @@ -514,11 +565,11 @@ def scrap_18(url='https://council.geumcheon.go.kr/member/member.asp') -> ScrapRe return ScrapResult( council_id="seoul-geumcheongu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_19(url='https://www.ydpc.go.kr/kr/member/active.do') -> ScrapResult: +def scrap_19(url="https://www.ydpc.go.kr/kr/member/active.do") -> ScrapResult: """서울시 영등포구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -527,25 +578,25 @@ def scrap_19(url='https://www.ydpc.go.kr/kr/member/active.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당 : ') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당 : ") if party_info: - party = party_info.find_next('span').get_text(strip=True) + party = party_info.find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, 
party=party)) return ScrapResult( council_id="seoul-yeongdeungpogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_20(url='http://assembly.dongjak.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_20(url="http://assembly.dongjak.go.kr/kr/member/name.do") -> ScrapResult: """서울시 동작구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -554,25 +605,25 @@ def scrap_20(url='http://assembly.dongjak.go.kr/kr/member/name.do') -> ScrapResu soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-dongjakgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_21(url='https://www.ga21c.seoul.kr/kr/member/name.do') -> ScrapResult: +def scrap_21(url="https://www.ga21c.seoul.kr/kr/member/name.do") -> ScrapResult: """서울시 관악구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -581,25 +632,27 @@ def scrap_21(url='https://www.ga21c.seoul.kr/kr/member/name.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').get_text(strip=True).split(' ')[-1].strip() + party = ( + party_info.find_next("span").get_text(strip=True).split(" ")[-1].strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gwanakgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_22(url='https://www.sdc.seoul.kr/kr/member/active.do') -> ScrapResult: +def scrap_22(url="https://www.sdc.seoul.kr/kr/member/active.do") -> ScrapResult: """서울시 서초구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -608,25 +661,27 @@ def scrap_22(url='https://www.sdc.seoul.kr/kr/member/active.do') -> ScrapResult: soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당 : ') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당 : ") if party_info: - party = party_info.find_next('span').get_text(strip=True) + 
party = party_info.find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-seochogu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_23(url='https://www.gncouncil.go.kr/kr/member/name.do') -> ScrapResult: +def scrap_23(url="https://www.gncouncil.go.kr/kr/member/name.do") -> ScrapResult: """서울시 강남구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -635,35 +690,39 @@ def scrap_23(url='https://www.gncouncil.go.kr/kr/member/name.do') -> ScrapResult soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('div', class_='name') - name = name_tag.find_next('strong').get_text(strip=True) if name_tag else "이름 정보 없음" - - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("div", class_="name") + name = ( + name_tag.find_next("strong").get_text(strip=True) + if name_tag + else "이름 정보 없음" + ) + + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당") if party_info: - party = party_info.find_next('span').find_next('span').get_text(strip=True) + party = party_info.find_next("span").find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gangnamgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_24(url='https://council.songpa.go.kr/kr/member/active.do') -> ScrapResult: +def scrap_24(url="https://council.songpa.go.kr/kr/member/active.do") -> ScrapResult: """서울시 송파구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ # TODO - raise Exception('송파구 의회 사이트는 현재 먹통입니다') + raise Exception("송파구 의회 사이트는 현재 먹통입니다") -def scrap_25(url='https://council.gangdong.go.kr/kr/member/active.do') -> ScrapResult: +def scrap_25(url="https://council.gangdong.go.kr/kr/member/active.do") -> ScrapResult: """서울시 강동구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -672,23 +731,25 @@ def scrap_25(url='https://council.gangdong.go.kr/kr/member/active.do') -> ScrapR soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find_next('em', class_='name') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find_next("em", class_="name") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) - party = '정당 정보 없음' - party_info = profile.find('em', string='소속정당 : ') + party = "정당 정보 없음" + party_info = profile.find("em", string="소속정당 : ") if party_info: - party = party_info.find_next('span').get_text(strip=True) + party = party_info.find_next("span").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul-gangdonggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -if __name__ == '__main__': - print(scrap_2()) \ No newline at end of file +if __name__ == "__main__": + print(scrap_2()) diff --git a/scrap/local_councils/ulsan.py b/scrap/local_councils/ulsan.py index f2a2219..52bcf31 100644 --- a/scrap/local_councils/ulsan.py +++ b/scrap/local_councils/ulsan.py @@ -5,24 +5,33 @@ from scrap.utils.requests import 
get_soup import re -regex_pattern = re.compile(r'정\s*\S*\s*당', re.IGNORECASE) # Case-insensitive +regex_pattern = re.compile(r"정\s*\S*\s*당", re.IGNORECASE) # Case-insensitive -def scrap_70(url = 'https://council.junggu.ulsan.kr/content/member/memberName.html') -> ScrapResult: - '''울산시 중구 페이지에서 의원 상세약력 스크랩 + +def scrap_70( + url="https://council.junggu.ulsan.kr/content/member/memberName.html", +) -> ScrapResult: + """울산시 중구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('dl'): + for profile in soup.find_all("dl"): name_tag = profile.find("dd", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - + party = "정당 정보 없음" - party_info = list(filter(lambda x: regex_pattern.search(str(x)), profile.find_all("dd"))) - if party_info and (party_span := party_info[0].find_next('span').find_next('span')) is not None: + party_info = list( + filter(lambda x: regex_pattern.search(str(x)), profile.find_all("dd")) + ) + if ( + party_info + and (party_span := party_info[0].find_next("span").find_next("span")) + is not None + ): party = party_span.text councilors.append(Councilor(name=name, party=party)) @@ -30,25 +39,36 @@ def scrap_70(url = 'https://council.junggu.ulsan.kr/content/member/memberName.ht return ScrapResult( council_id="ulsan-junggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_71(url = 'https://www.namgucouncil.ulsan.kr/content/member/memberName.html') -> ScrapResult: - '''울산시 남구 페이지에서 의원 상세약력 스크랩 + +def scrap_71( + url="https://www.namgucouncil.ulsan.kr/content/member/memberName.html", +) -> ScrapResult: + """울산시 남구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('dl'): + for profile in soup.find_all("dl"): name_tag = profile.find("dd", class_="name") - name = name_tag.get_text(strip=True).replace(" 의원", "") if name_tag else "이름 정보 없음" + name = ( + name_tag.get_text(strip=True).replace(" 의원", "") if name_tag else "이름 정보 없음" + ) party = "정당 정보 없음" - party_info = list(filter(lambda x: regex_pattern.search(str(x)), profile.find_all("dd"))) - if party_info and (party_span := party_info[0].find_next('span').find_next('span')) is not None: + party_info = list( + filter(lambda x: regex_pattern.search(str(x)), profile.find_all("dd")) + ) + if ( + party_info + and (party_span := party_info[0].find_next("span").find_next("span")) + is not None + ): party = party_span.text councilors.append(Councilor(name=name, party=party)) @@ -56,65 +76,74 @@ def scrap_71(url = 'https://www.namgucouncil.ulsan.kr/content/member/memberName. 
return ScrapResult( council_id="ulsan-namgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_72(url = 'https://www.donggu-council.ulsan.kr/source/korean/member/active.html') -> ScrapResult: - '''울산시 동구 페이지에서 의원 상세약력 스크랩 + +def scrap_72( + url="https://www.donggu-council.ulsan.kr/source/korean/member/active.html", +) -> ScrapResult: + """울산시 동구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' - soup = get_soup(url, verify=False, encoding='euc-kr') + """ + soup = get_soup(url, verify=False, encoding="euc-kr") councilors: List[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): + for profile in soup.find_all("div", class_="profile"): name_tag = profile.find("li", class_="name") # () 안에 있는 한자를 제거 (ex. 김영희(金英姬) -> 김영희) - name = name_tag.get_text(strip=True).split('(')[0] if name_tag else "이름 정보 없음" + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" party = "정당 정보 없음" - party_info = list(filter(lambda x: regex_pattern.search(str(x)), profile.find_all("li"))) + party_info = list( + filter(lambda x: regex_pattern.search(str(x)), profile.find_all("li")) + ) if party_info: - party = party_info[0].get_text(strip=True).split(': ')[1] + party = party_info[0].get_text(strip=True).split(": ")[1] councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="ulsan-donggu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_73(url = 'https://council.bukgu.ulsan.kr/kr/member/active.do') -> ScrapResult: - '''울산시 북구 페이지에서 의원 상세약력 스크랩 + +def scrap_73(url="https://council.bukgu.ulsan.kr/kr/member/active.do") -> ScrapResult: + """울산시 북구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] - for profile in soup.find_all('dl', class_='profile'): + for profile in soup.find_all("dl", class_="profile"): name_tag = profile.find("strong", class_="name") # () 안에 있는 한자를 제거 (ex. 
김영희(金英姬) -> 김영희) - name = name_tag.get_text(strip=True).split('(')[0] if name_tag else "이름 정보 없음" + name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음" party = "정당 정보 없음" - party_info = list(filter(lambda x: regex_pattern.search(str(x)), profile.find_all("li"))) + party_info = list( + filter(lambda x: regex_pattern.search(str(x)), profile.find_all("li")) + ) if party_info: - party = party_info[0].get_text(strip=True).split(': ')[1] + party = party_info[0].get_text(strip=True).split(": ")[1] councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="ulsan-bukgu", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_74(url = 'https://assembly.ulju.ulsan.kr/kr/member/active') -> ScrapResult: - '''울산시 울주군 페이지에서 의원 상세약력 스크랩 + +def scrap_74(url="https://assembly.ulju.ulsan.kr/kr/member/active") -> ScrapResult: + """울산시 울주군 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: List[Councilor] = [] @@ -122,18 +151,18 @@ def scrap_74(url = 'https://assembly.ulju.ulsan.kr/kr/member/active') -> ScrapRe parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in soup.find_all('div', class_='profile'): + for profile in soup.find_all("div", class_="profile"): name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_link = profile.find('a', class_='start') + profile_link = profile.find("a", class_="start") if profile_link: - profile_url = base_url + profile_link['href'] + profile_url = base_url + profile_link["href"] profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('em', string=regex_pattern) - if party_info and (party_span := party_info.find_next('span')) is not None: + party_info = profile_soup.find("em", string=regex_pattern) + if party_info and (party_span := party_info.find_next("span")) is not None: party = party_span.text councilors.append(Councilor(name=name, party=party)) @@ -141,8 +170,9 @@ def scrap_74(url = 'https://assembly.ulju.ulsan.kr/kr/member/active') -> ScrapRe return ScrapResult( council_id="ulsan_uljugun", council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -if __name__ == '__main__': - print(scrap_70()) \ No newline at end of file + +if __name__ == "__main__": + print(scrap_70()) diff --git a/scrap/national_council.py b/scrap/national_council.py index 9f9fa2d..fbc79f7 100644 --- a/scrap/national_council.py +++ b/scrap/national_council.py @@ -6,42 +6,45 @@ BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir) + def scrap_national_council(cd: int) -> ScrapResult: - '''열린국회정보 Open API를 이용해 역대 국회의원 인적사항 스크랩 - _data 폴더에 assembly_api_key.json 파일을 만들어야 하며, - 해당 JSON은 {"key":"(Open API에서 발급받은 인증키)"} 꼴을 가져야 한다. - https://open.assembly.go.kr/portal/data/service/selectAPIServicePage.do/OBL7NF0011935G18076#none - - :param cd: 국회의원 대수. 
제20대 국회의원을 스크랩하고자 하면 20 - :return: 국회의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' - - key_json_path = os.path.join(BASE_DIR, '_data', 'assembly_api_key.json') - if not os.path.exists(key_json_path): - raise Exception('열린국회정보 Open API에 회원가입 후 인증키를 발급받아주세요.\nhttps://open.assembly.go.kr/portal/openapi/openApiDevPage.do') - with open(key_json_path, 'r') as key_json: - assembly_key = json.load(key_json)['key'] - - request_url = f"https://open.assembly.go.kr/portal/openapi/npffdutiapkzbfyvr?KEY={assembly_key}&UNIT_CD={cd + 100000}" - response = requests.get(request_url) - - if response.status_code != 200: - raise Exception(f'Open API 요청에 실패했습니다 (상태 코드 {response.status_code})') - - root = ET.fromstring(response.text) - councilors: list[Councilor] = [] - - for row in root.iter('row'): - councilors.append(Councilor( - name=row.find('HG_NM').text, - party=row.find('POLY_NM').text - )) - - return ScrapResult( - council_id='national', - council_type=CouncilType.NATIONAL_COUNCIL, - councilors=councilors - ) - -if __name__ == '__main__': - print(scrap_national_council(20)) \ No newline at end of file + """열린국회정보 Open API를 이용해 역대 국회의원 인적사항 스크랩 + _data 폴더에 assembly_api_key.json 파일을 만들어야 하며, + 해당 JSON은 {"key":"(Open API에서 발급받은 인증키)"} 꼴을 가져야 한다. + https://open.assembly.go.kr/portal/data/service/selectAPIServicePage.do/OBL7NF0011935G18076#none + + :param cd: 국회의원 대수. 제20대 국회의원을 스크랩하고자 하면 20 + :return: 국회의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 + """ + + key_json_path = os.path.join(BASE_DIR, "_data", "assembly_api_key.json") + if not os.path.exists(key_json_path): + raise Exception( + "열린국회정보 Open API에 회원가입 후 인증키를 발급받아주세요.\nhttps://open.assembly.go.kr/portal/openapi/openApiDevPage.do" + ) + with open(key_json_path, "r") as key_json: + assembly_key = json.load(key_json)["key"] + + request_url = f"https://open.assembly.go.kr/portal/openapi/npffdutiapkzbfyvr?KEY={assembly_key}&UNIT_CD={cd + 100000}" + response = requests.get(request_url) + + if response.status_code != 200: + raise Exception(f"Open API 요청에 실패했습니다 (상태 코드 {response.status_code})") + + root = ET.fromstring(response.text) + councilors: list[Councilor] = [] + + for row in root.iter("row"): + councilors.append( + Councilor(name=row.find("HG_NM").text, party=row.find("POLY_NM").text) + ) + + return ScrapResult( + council_id="national", + council_type=CouncilType.NATIONAL_COUNCIL, + councilors=councilors, + ) + + +if __name__ == "__main__": + print(scrap_national_council(20)) diff --git a/scrap/utils/database.py b/scrap/utils/database.py index c197c6e..9014802 100644 --- a/scrap/utils/database.py +++ b/scrap/utils/database.py @@ -9,6 +9,7 @@ # 컬렉션은 하나 이상의 문서로 구성됩니다. db = client[str(MongoDBSecrets.database_name)] + def save_to_database(record: ScrapResult): """ 지방의회 크롤링 결과를 데이터베이스에 저장합니다. 
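The national-council scraper above and save_to_database here compose directly. A minimal usage sketch, assuming the _data/assembly_api_key.json key file and the MongoDB settings in configurations.secrets are already in place (every name below already exists in this repository; nothing new is introduced):

    # Sketch only: scrape the 21st National Assembly and upsert it into MongoDB.
    # Assumes assembly_api_key.json and MongoDB credentials are configured.
    from scrap.national_council import scrap_national_council
    from scrap.utils.database import save_to_database

    result = scrap_national_council(21)  # cd=21 -> UNIT_CD=100021
    print(f"scraped {len(result.councilors)} councilors")
    save_to_database(result)  # find_one_and_update(..., upsert=True) keyed on councilId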
@@ -25,20 +26,21 @@ def save_to_database(record: ScrapResult): collection.find_one_and_update( {"councilId": record.council_id}, {"$set": dataclasses.asdict(record)}, - upsert=True + upsert=True, ) return True except Exception as e: print(e) return False -if __name__ == "__main__": - test_record = (ScrapResult( + +if __name__ == "__main__": + test_record = ScrapResult( council_id="test-test", council_type=CouncilType.LOCAL_COUNCIL, councilors=[ Councilor(name="김철수", party="국민의힘"), Councilor(name="김영희", party="더불어민주당"), - ] - )) - print(save_to_database(test_record)) \ No newline at end of file + ], + ) + print(save_to_database(test_record)) diff --git a/scrap/utils/requests.py b/scrap/utils/requests.py index 2bf72b6..16a2135 100644 --- a/scrap/utils/requests.py +++ b/scrap/utils/requests.py @@ -8,15 +8,18 @@ from unicodedata import normalize # SSL 인증서 검증 경고 무시 -requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) # type: ignore +requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) # type: ignore # 충청북도 보은군, 강진시에서 타임아웃이 timeout_time = 60 -def get_soup(url: str, additional_headers={}, verify=True, encoding="utf-8") -> BeautifulSoup: + +def get_soup( + url: str, additional_headers={}, verify=True, encoding="utf-8" +) -> BeautifulSoup: """ url을 입력받아 BeautifulSoup 객체를 반환합니다. requests 라이브러리를 사용합니다. 크롤링 결과가 정상적으로 나오지 않을 경우, Selenium 라이브러리를 사용할 수 있습니다. - + :param url: 크롤링할 페이지의 url입니다. :param additional_headers: 추가적으로 포함할 헤더입니다. 딕셔너리 형태로 입력받습니다. :param verify: SSL 인증서 검증 여부입니다. 인증서가 만료된 페이지를 크롤링할 경우 False로 설정합니다. @@ -25,11 +28,13 @@ def get_soup(url: str, additional_headers={}, verify=True, encoding="utf-8") -> # HTTP 요청에 포함해줄 헤더 http_headers = { - "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36" } http_headers.update(additional_headers) - - response = requests.get(url, verify=verify, headers=http_headers, timeout=timeout_time) + + response = requests.get( + url, verify=verify, headers=http_headers, timeout=timeout_time + ) response.encoding = encoding - sanitized_response = normalize('NFKC', unescape(response.text)) - return BeautifulSoup(sanitized_response, 'html.parser') \ No newline at end of file + sanitized_response = normalize("NFKC", unescape(response.text)) + return BeautifulSoup(sanitized_response, "html.parser") diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py index ca78fa8..7ce9082 100644 --- a/scrap/utils/spreadsheet.py +++ b/scrap/utils/spreadsheet.py @@ -12,53 +12,62 @@ # 변경 시 token.json 삭제 후 재인증 필요 SCOPES = ["https://www.googleapis.com/auth/spreadsheets"] BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir) + + def google_authorization(): - '''Google Sheets API 활용을 위한 인증 정보 요청 + """Google Sheets API 활용을 위한 인증 정보 요청 credentials.json 파일을 토대로 인증을 요청하되, token.json 파일이 존재할 경우 거기에 저장된 정보 활용 :todo: credentials.json 파일, token.json 파일 값을 환경변수로 설정 - :return: gspread.client.Client 인스턴스''' + :return: gspread.client.Client 인스턴스""" creds = None - token_json_path = os.path.join(BASE_DIR, '_data', 'token.json') + token_json_path = os.path.join(BASE_DIR, "_data", "token.json") # 이미 저장된 인증 정보가 있는지 확인 if os.path.exists(token_json_path): creds = Credentials.from_authorized_user_file(token_json_path, SCOPES) - + # 인증 정보가 없거나 비정상적인 경우 인증 재요청 if not creds or not creds.valid: if creds and creds.expired and 
creds.refresh_token: creds.refresh(Request()) else: - flow= InstalledAppFlow.from_client_secrets_file(os.path.join(BASE_DIR, '_data', 'credentials.json'), SCOPES) + flow = InstalledAppFlow.from_client_secrets_file( + os.path.join(BASE_DIR, "_data", "credentials.json"), SCOPES + ) creds = flow.run_local_server(port=0) - with open(token_json_path, 'w') as token: + with open(token_json_path, "w") as token: token.write(creds.to_json()) return gspread.authorize(creds) + def main() -> None: # Google Sheets API 설정 client: gspread.client.Client = google_authorization() # 스프레드시트 열기 - spreadsheet: gspread.Spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/1Eq2x7xZCw_5ng2GdHDnpUIhhwbmOAKEl4abX09JLyuA/edit#gid=1044938838') - worksheet: gspread.Worksheet = spreadsheet.get_worksheet(1) # 원하는 워크시트 선택 (0은 첫 번째 워크시트입니다.) + spreadsheet: gspread.Spreadsheet = client.open_by_url( + "https://docs.google.com/spreadsheets/d/1Eq2x7xZCw_5ng2GdHDnpUIhhwbmOAKEl4abX09JLyuA/edit#gid=1044938838" + ) + worksheet: gspread.Worksheet = spreadsheet.get_worksheet( + 1 + ) # 원하는 워크시트 선택 (0은 첫 번째 워크시트입니다.) # 데이터 가져오기 data: list[dict] = worksheet.get_all_records() - print(scrap_junggu(data[1]['상세약력 링크'])) - print(scrap_gwangjingu(data[4]['상세약력 링크'])) - print(scrap_dongdaemungu(data[5]['상세약력 링크'])) - for n in range (65, 75): + print(scrap_junggu(data[1]["상세약력 링크"])) + print(scrap_gwangjingu(data[4]["상세약력 링크"])) + print(scrap_dongdaemungu(data[5]["상세약력 링크"])) + for n in range(65, 75): function_name = f"scrap_{n}" if hasattr(sys.modules[__name__], function_name): function_to_call = getattr(sys.modules[__name__], function_name) print(function_to_call) if n in [66, 70, 74]: - result = function_to_call() # 스프레드시트 링크 터짐 (울산 울주군처럼 애먼데 링크인 경우도 있다) + result = function_to_call() # 스프레드시트 링크 터짐 (울산 울주군처럼 애먼데 링크인 경우도 있다) else: - result = function_to_call(data[n - 1]['상세약력 링크']) + result = function_to_call(data[n - 1]["상세약력 링크"]) print(result) else: print(f"함수 {function_name}를 찾을 수 없습니다.") @@ -84,5 +93,7 @@ def main() -> None: # error_times += 1 # continue # 에러가 발생하면 다음 반복으로 넘어감 # print(f"| 총 실행 횟수: {N} | 에러 횟수: {error_times} | 정보 없음 횟수: {parse_error_times} | 타임아웃 횟수: {timeouts} |") -if __name__ == '__main__': + + +if __name__ == "__main__": main() diff --git a/scrap/utils/types.py b/scrap/utils/types.py index f56dba8..481bfa0 100644 --- a/scrap/utils/types.py +++ b/scrap/utils/types.py @@ -1,4 +1,4 @@ -#coding: utf-8 +# coding: utf-8 """ 의회 크롤링 결과를 나타내는 타입을 정의합니다. """ @@ -6,34 +6,41 @@ from typing import Optional, List from dataclasses import dataclass + class CouncilType(str, Enum): """ 의회의 종류를 나타내는 열거형입니다. """ - LOCAL_COUNCIL = "local_council" + + LOCAL_COUNCIL = "local_council" NATIONAL_COUNCIL = "national_council" """ 기초의회 """ + def __str__(self): """ JSON으로 직렬화하기 위해 문자열로 변환하는 함수를 오버라이드합니다. """ return str(self.value) + @dataclass class Councilor: """ 의원(이름 및 정당)을 나타내는 타입입니다. """ + name: str party: str + @dataclass class ScrapResult: """ 의회 크롤링 결과를 나타내는 타입입니다. """ + council_id: str """ 의회를 구분하기 위한 문자열입니다. @@ -46,5 +53,3 @@ class ScrapResult: """ 의회 의원 목록입니다. """ - - diff --git a/scrap/utils/utils.py b/scrap/utils/utils.py index 025ac0e..ef37957 100644 --- a/scrap/utils/utils.py +++ b/scrap/utils/utils.py @@ -1,20 +1,22 @@ from scrap.utils.requests import get_soup + def getPartyList(): """ 중앙선거관리위원회에서 제공하는 정당 목록을 가져옵니다. 
""" - url = 'https://www.nec.go.kr/site/nec/ex/bbs/List.do?cbIdx=1239' + url = "https://www.nec.go.kr/site/nec/ex/bbs/List.do?cbIdx=1239" soup = get_soup(url) - table = soup.find('table', class_='list type2') + table = soup.find("table", class_="list type2") partyList = [] - for tr in table.find('tbody').find_all('tr'): - td = tr.find_all('td') - if td[0].get_text(strip=True).split("
")[0] == '시도': + for tr in table.find("tbody").find_all("tr"): + td = tr.find_all("td") + if td[0].get_text(strip=True).split("
")[0] == "시도": continue # 더불어민주당(민주당, 더민주) 등은 약자가 괄호 안에 있다. partyList.append(td[0].get_text(strip=True).split("
")[0].split("(")[0]) return partyList -if __name__ == '__main__': - print(getPartyList()) \ No newline at end of file + +if __name__ == "__main__": + print(getPartyList()) From 59f1559558345ab80f85f3da0f87f5610e1be2b6 Mon Sep 17 00:00:00 2001 From: Re-st Date: Fri, 13 Oct 2023 08:03:47 +0000 Subject: [PATCH 3/3] Formatted with black --- scrap/local_councils/basic.py | 116 +++++---- scrap/local_councils/gwangju.py | 2 +- scrap/local_councils/gyeonggi.py | 100 +++++--- scrap/local_councils/incheon.py | 30 ++- scrap/local_councils/seoul.py | 8 +- scrap/metropolitan_council.py | 297 +++++++++++++-------- scrap/national_council.py | 79 +++--- scrap/utils/spreadsheet.py | 428 ++++++++++++++++++++++++++----- scrap/utils/types.py | 37 +-- 9 files changed, 763 insertions(+), 334 deletions(-) diff --git a/scrap/local_councils/basic.py b/scrap/local_councils/basic.py index b9d2cd7..ec74750 100644 --- a/scrap/local_councils/basic.py +++ b/scrap/local_councils/basic.py @@ -11,6 +11,7 @@ party_keywords = getPartyList() party_keywords.append("무소속") + def find(soup, element, class_): if class_ is None: return soup.find(element) @@ -29,12 +30,12 @@ def get_profiles(soup, element, class_, memberlistelement, memberlistclass_): # 의원 목록 사이트에서 의원 프로필을 가져옴 if memberlistelement is not None: try: - soup = find_all(soup, memberlistelement, - class_=memberlistclass_)[0] + soup = find_all(soup, memberlistelement, class_=memberlistclass_)[0] except Exception: - raise RuntimeError('[basic.py] 의원 목록 사이트에서 의원 프로필을 가져오는데 실패했습니다.') + raise RuntimeError("[basic.py] 의원 목록 사이트에서 의원 프로필을 가져오는데 실패했습니다.") return find_all(soup, element, class_) + def getDataFromAPI(url_format, data_uid, name_id, party_id) -> Councilor: # API로부터 의원 정보를 가져옴 url = url_format.format(data_uid) @@ -45,36 +46,33 @@ def getDataFromAPI(url_format, data_uid, name_id, party_id) -> Councilor: ) - def get_name(profile, element, class_, wrapper_element, wrapper_class_): # 의원 프로필에서 의원 이름을 가져옴 if wrapper_element is not None: profile = find_all(profile, wrapper_element, class_=wrapper_class_)[0] name_tag = find(profile, element, class_) - if name_tag.find('span'): + if name_tag.find("span"): name_tag = copy.copy(name_tag) # span 태그 안의 것들을 다 지움 - for span in name_tag.find_all('span'): + for span in name_tag.find_all("span"): span.decompose() name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" # name은 길고 그 중 strong태그 안에 이름이 있는 경우. 은평구, 수원시 등. if name_tag.strong is not None: - name = name_tag.strong.get_text( - strip=True) if name_tag.strong else "이름 정보 없음" - name = name.split('(')[0].split( - ':')[-1].strip() # 이름 뒷 한자이름, 앞 '이 름:' 제거 + name = name_tag.strong.get_text(strip=True) if name_tag.strong else "이름 정보 없음" + name = name.split("(")[0].split(":")[-1].strip() # 이름 뒷 한자이름, 앞 '이 름:' 제거 # TODO : 만약 이름이 우연히 아래 단어를 포함하는 경우를 생각해볼만 함. 
if len(name) > 3: # 수식어가 이름 앞이나 뒤에 붙어있는 경우 - for keyword in ['부의장', '의원', '의장']: # 119, 강서구 등 + for keyword in ["부의장", "의원", "의장"]: # 119, 강서구 등 if keyword in name: - name = name.replace(keyword, '').strip() + name = name.replace(keyword, "").strip() for keyword in party_keywords: if keyword in name: # 인천 서구 등 - name = name.replace(keyword, '').strip() + name = name.replace(keyword, "").strip() break - name = name.split(' ')[0] # 이름 뒤에 직책이 따라오는 경우 + name = name.split(" ")[0] # 이름 뒤에 직책이 따라오는 경우 return name @@ -92,37 +90,42 @@ def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url) # 프로필보기 링크 가져오기 profile_link = find(profile, wrapper_element, class_=wrapper_class_) if wrapper_txt is not None: - profile_links = find_all(profile, 'a', class_=wrapper_class_) - profile_link = [ - link for link in profile_links if link.text == wrapper_txt][0] + profile_links = find_all(profile, "a", class_=wrapper_class_) + profile_link = [link for link in profile_links if link.text == wrapper_txt][0] if profile_link is None: - raise RuntimeError('[basic.py] 의원 프로필에서 프로필보기 링크를 가져오는데 실패했습니다.') + raise RuntimeError("[basic.py] 의원 프로필에서 프로필보기 링크를 가져오는데 실패했습니다.") # if base_url[-1] != '/': # base_url = base_url + '/' - profile_url = base_url + profile_link['href'] + profile_url = base_url + profile_link["href"] try: profile = get_soup(profile_url, verify=False) except Exception: - raise RuntimeError('[basic.py] \'//\'가 있진 않나요?', ' url: ', profile_url) + raise RuntimeError("[basic.py] '//'가 있진 않나요?", " url: ", profile_url) return profile -def get_party(profile, element, class_, wrapper_element, wrapper_class_, wrapper_txt, url): +def get_party( + profile, element, class_, wrapper_element, wrapper_class_, wrapper_txt, url +): # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴 if wrapper_element is not None: profile = goto_profilesite( - profile, wrapper_element, wrapper_class_, wrapper_txt, url) - party_pulp_list = list(filter(lambda x: regex_pattern.search( - str(x)), find_all(profile, element, class_))) + profile, wrapper_element, wrapper_class_, wrapper_txt, url + ) + party_pulp_list = list( + filter( + lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_) + ) + ) if party_pulp_list == []: - raise RuntimeError('[basic.py] 정당정보 regex 실패') + raise RuntimeError("[basic.py] 정당정보 regex 실패") party_pulp = party_pulp_list[0] - party_string = party_pulp.get_text(strip=True).split(' ')[-1] + party_string = party_pulp.get_text(strip=True).split(" ")[-1] while True: if (party := extract_party(party_string)) is not None: return party - if (party_pulp := party_pulp.find_next('span')) is not None: - party_string = party_pulp.text.strip().split(' ')[-1] + if (party_pulp := party_pulp.find_next("span")) is not None: + party_string = party_pulp.text.strip().split(" ")[-1] else: return "[basic.py] 정당 정보 파싱 불가" @@ -131,44 +134,57 @@ def get_party_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url): # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴 if wrapper_element is not None: profile = goto_profilesite( - profile, wrapper_element, wrapper_class_, wrapper_txt, url) + profile, wrapper_element, wrapper_class_, wrapper_txt, url + ) party = extract_party(profile.text) - assert (party is not None) + assert party is not None return party -def scrap_basic(url, cid, args: ScrapBasicArgument, encoding='utf-8') -> ScrapResult: - '''의원 상세약력 스크랩 +def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapResult: + """의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :param n: 의회 id :param encoding: 받아온 soup 인코딩 :return: 
의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False, encoding=encoding) councilors: list[Councilor] = [] - profiles = get_profiles(soup, args.pf_elt, args.pf_cls, - args.pf_memlistelt, args.pf_memlistcls) - print(cid, '번째 의회에는,', len(profiles), '명의 의원이 있습니다.') # 디버깅용. + profiles = get_profiles( + soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls + ) + print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. for profile in profiles: - name = party = '' + name = party = "" try: - name = get_name(profile, args.name_elt, args.name_cls, - args.name_wrapelt, args.name_wrapcls) + name = get_name( + profile, + args.name_elt, + args.name_cls, + args.name_wrapelt, + args.name_wrapcls, + ) except Exception as e: - raise RuntimeError( - '[basic.py] 의원 이름을 가져오는데 실패했습니다. 이유 : ' + str(e)) + raise RuntimeError("[basic.py] 의원 이름을 가져오는데 실패했습니다. 이유 : " + str(e)) try: - party = get_party(profile, args.pty_elt, args.pty_cls, - args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url) + party = get_party( + profile, + args.pty_elt, + args.pty_cls, + args.pty_wrapelt, + args.pty_wrapcls, + args.pty_wraptxt, + url, + ) except Exception as e: try: party = get_party_easy( - profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url) + profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url + ) except Exception: - raise RuntimeError( - '[basic.py] 의원 정당을 가져오는데 실패했습니다. 이유: ' + str(e)) + raise RuntimeError("[basic.py] 의원 정당을 가져오는데 실패했습니다. 이유: " + str(e)) councilors.append(Councilor(name=name, party=party)) return ScrapResult( @@ -177,7 +193,9 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding='utf-8') -> ScrapRe councilors=councilors, ) -if __name__ == '__main__': + +if __name__ == "__main__": args3 = ScrapBasicArgument( - pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em') - print(scrap_basic('https://www.yscl.go.kr/kr/member/name.do', 3, args3)) # 서울 용산구 \ No newline at end of file + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ) + print(scrap_basic("https://www.yscl.go.kr/kr/member/name.do", 3, args3)) # 서울 용산구 diff --git a/scrap/local_councils/gwangju.py b/scrap/local_councils/gwangju.py index b34f872..a162b4a 100644 --- a/scrap/local_councils/gwangju.py +++ b/scrap/local_councils/gwangju.py @@ -2,4 +2,4 @@ """ from scrap.utils.types import CouncilType, Councilor, ScrapResult from scrap.utils.requests import get_soup -from scrap.local_councils.basic import * \ No newline at end of file +from scrap.local_councils.basic import * diff --git a/scrap/local_councils/gyeonggi.py b/scrap/local_councils/gyeonggi.py index 7fc2627..8d22ab0 100644 --- a/scrap/local_councils/gyeonggi.py +++ b/scrap/local_councils/gyeonggi.py @@ -4,66 +4,88 @@ from scrap.utils.requests import get_soup from scrap.local_councils.basic import * + def get_profiles_88(soup, element, class_, memberlistelement, memberlistclass_): # 의원 목록 사이트에서 의원 프로필을 가져옴 if memberlistelement is not None: try: soup = soup.find_all(memberlistelement, id=memberlistclass_)[0] except Exception: - raise RuntimeError('[basic.py] 의원 목록 사이트에서 의원 프로필을 가져오는데 실패했습니다.') + raise RuntimeError("[basic.py] 의원 목록 사이트에서 의원 프로필을 가져오는데 실패했습니다.") return soup.find_all(element, class_) + def get_party_88(profile, element, class_, wrapper_element, wrapper_class_, url): # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴 if wrapper_element is not None: parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" # 프로필보기 링크 가져오기 - profile_link = 
find(profile, wrapper_element, class_=wrapper_class_).find('a') - profile_url = base_url + profile_link['href'] - profile = get_soup(profile_url, verify=False, encoding='euc-kr') - party_pulp_list = list(filter(lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_))) - if party_pulp_list == []: raise RuntimeError('[basic.py] 정당정보 regex 실패') + profile_link = find(profile, wrapper_element, class_=wrapper_class_).find("a") + profile_url = base_url + profile_link["href"] + profile = get_soup(profile_url, verify=False, encoding="euc-kr") + party_pulp_list = list( + filter( + lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_) + ) + ) + if party_pulp_list == []: + raise RuntimeError("[basic.py] 정당정보 regex 실패") party_pulp = party_pulp_list[0] - party_string = party_pulp.get_text(strip=True).split(' ')[-1] + party_string = party_pulp.get_text(strip=True).split(" ")[-1] while True: if (party := extract_party(party_string)) is not None: return party - if (party_pulp := party_pulp.find_next('span')) is not None: - party_string = party_pulp.text.strip().split(' ')[-1] + if (party_pulp := party_pulp.find_next("span")) is not None: + party_string = party_pulp.text.strip().split(" ")[-1] else: return "[basic.py] 정당 정보 파싱 불가" + def scrap_88(url, args: ScrapBasicArgument) -> ScrapResult: - '''의원 상세약력 스크랩 + """의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :param args: ScrapBasicArgument 객체 :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ cid = 88 - encoding = 'euc-kr' + encoding = "euc-kr" soup = get_soup(url, verify=False, encoding=encoding) councilors: list[Councilor] = [] party_in_main_page = any(keyword in soup.text for keyword in party_keywords) - profiles = get_profiles_88(soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls) - print(cid, '번째 의회에는,', len(profiles), '명의 의원이 있습니다.') # 디버깅용. + profiles = get_profiles_88( + soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls + ) + print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. 
for profile in profiles: - name = get_name(profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls) - party = '' + name = get_name( + profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls + ) + party = "" try: - party = get_party_88(profile, args.pty_elt, args.pty_cls, args.pty_wrapelt, args.pty_wrapcls, url) + party = get_party_88( + profile, + args.pty_elt, + args.pty_cls, + args.pty_wrapelt, + args.pty_wrapcls, + url, + ) except Exception: - party = get_party_easy(profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url) + party = get_party_easy( + profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id=str(cid), council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) + def get_party_103(profile, element, class_, wrapper_element, wrapper_class_, url): # 의원 프로필에서 의원이 몸담는 정당 이름을 가져옴 if wrapper_element is not None: @@ -71,41 +93,53 @@ def get_party_103(profile, element, class_, wrapper_element, wrapper_class_, url base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" # 프로필보기 링크 가져오기 profile_link = profile.find(wrapper_element, class_=wrapper_class_) - profile_url = base_url + '/member/' + profile_link['href'] + profile_url = base_url + "/member/" + profile_link["href"] profile = get_soup(profile_url, verify=False) - party_pulp_list = list(filter(lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_))) - if party_pulp_list == []: raise RuntimeError('[basic.py] 정당정보 regex 실패') + party_pulp_list = list( + filter( + lambda x: regex_pattern.search(str(x)), find_all(profile, element, class_) + ) + ) + if party_pulp_list == []: + raise RuntimeError("[basic.py] 정당정보 regex 실패") party_pulp = party_pulp_list[0] - party_string = party_pulp.get_text(strip=True).split(' ')[-1] + party_string = party_pulp.get_text(strip=True).split(" ")[-1] while True: if (party := extract_party(party_string)) is not None: return party - if (party_pulp := party_pulp.find_next('span')) is not None: - party_string = party_pulp.text.strip().split(' ')[-1] + if (party_pulp := party_pulp.find_next("span")) is not None: + party_string = party_pulp.text.strip().split(" ")[-1] else: return "[basic.py] 정당 정보 파싱 불가" + def scrap_103(url, args: ScrapBasicArgument) -> ScrapResult: - '''의원 상세약력 스크랩 + """의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :param args: ScrapBasicArgument 객체 :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ cid = 103 soup = get_soup(url, verify=False) councilors: list[Councilor] = [] party_in_main_page = any(keyword in soup.text for keyword in party_keywords) - profiles = get_profiles_88(soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls) - print(cid, '번째 의회에는,', len(profiles), '명의 의원이 있습니다.') # 디버깅용. + profiles = get_profiles_88( + soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls + ) + print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. 
for profile in profiles: - name = get_name(profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls) - party = get_party_103(profile, args.pty_elt, args.pty_cls, args.pty_wrapelt, args.pty_wrapcls, url) + name = get_name( + profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls + ) + party = get_party_103( + profile, args.pty_elt, args.pty_cls, args.pty_wrapelt, args.pty_wrapcls, url + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id=str(cid), council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors - ) \ No newline at end of file + councilors=councilors, + ) diff --git a/scrap/local_councils/incheon.py b/scrap/local_councils/incheon.py index 740e1ad..58384c3 100644 --- a/scrap/local_councils/incheon.py +++ b/scrap/local_councils/incheon.py @@ -4,6 +4,7 @@ from scrap.utils.requests import get_soup from scrap.local_councils.basic import * + def scrap_50(url="https://www.icjg.go.kr/council/cnmi0101c") -> ScrapResult: """인천시 중구 페이지에서 의원 상세약력 스크랩 @@ -205,6 +206,7 @@ def scrap_56( councilors=councilors, ) + def scrap_57(url, args) -> ScrapResult: """인천시 서구 페이지에서 의원 상세약력 스크랩 @@ -215,23 +217,28 @@ def scrap_57(url, args) -> ScrapResult: councilors: list[Councilor] = [] cid = 57 - profiles = get_profiles(soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls) - print(cid, '번째 의회에는,', len(profiles), '명의 의원이 있습니다.') # 디버깅용. + profiles = get_profiles( + soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls + ) + print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용. for profile in profiles: - name = get_name(profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls) + name = get_name( + profile, args.name_elt, args.name_cls, args.name_wrapelt, args.name_wrapcls + ) - party = '정당 정보 없음' + party = "정당 정보 없음" party_pulp = find(profile, args.pty_elt, class_=args.pty_cls) - if party_pulp is None: raise AssertionError('[incheon.py] 정당정보 실패') + if party_pulp is None: + raise AssertionError("[incheon.py] 정당정보 실패") party_string = party_pulp.get_text(strip=True) - party_string = party_string.split(' ')[-1].strip() + party_string = party_string.split(" ")[-1].strip() while True: party = extract_party(party_string) if party is not None: break - if (party_pulp := party_pulp.find_next('span')) is not None: - party_string = party_pulp.text.split(' ')[-1] + if (party_pulp := party_pulp.find_next("span")) is not None: + party_string = party_pulp.text.split(" ")[-1] else: raise RuntimeError("[incheon.py] 정당 정보 파싱 불가") @@ -240,8 +247,9 @@ def scrap_57(url, args) -> ScrapResult: return ScrapResult( council_id=str(cid), council_type=CouncilType.LOCAL_COUNCIL, - councilors=councilors + councilors=councilors, ) -if __name__ == '__main__': - print(scrap_56()) \ No newline at end of file + +if __name__ == "__main__": + print(scrap_56()) diff --git a/scrap/local_councils/seoul.py b/scrap/local_councils/seoul.py index c59b815..af74bd7 100644 --- a/scrap/local_councils/seoul.py +++ b/scrap/local_councils/seoul.py @@ -4,7 +4,9 @@ from scrap.utils.requests import get_soup -def scrap_1(url = 'https://bookcouncil.jongno.go.kr/record/recordView.do?key=99784f935fce5c1d7c8c08c2f9e35dda1c0a6128428ecb1a87f87ee2b4e82890ffcf12563e01473f') -> ScrapResult: +def scrap_1( + url="https://bookcouncil.jongno.go.kr/record/recordView.do?key=99784f935fce5c1d7c8c08c2f9e35dda1c0a6128428ecb1a87f87ee2b4e82890ffcf12563e01473f", +) -> ScrapResult: """서울시 종로구 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -13,14 
+15,14 @@ def scrap_1(url = 'https://bookcouncil.jongno.go.kr/record/recordView.do?key=997 soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='pop_profile'): + for profile in soup.find_all("div", class_="pop_profile"): info = profile.find("div", class_="info") data_ul = info.find("ul", class_="detail") data_lis = data_ul.find_all("li") name = data_lis[0].find("span").get_text(strip=True) party = data_lis[2].find("span").get_text(strip=True) name = name if name else "이름 정보 없음" - party = party if party else '정당 정보 없음' + party = party if party else "정당 정보 없음" councilors.append(Councilor(name=name, party=party)) diff --git a/scrap/metropolitan_council.py b/scrap/metropolitan_council.py index 1c19078..1c31cdf 100644 --- a/scrap/metropolitan_council.py +++ b/scrap/metropolitan_council.py @@ -4,12 +4,14 @@ from scrap.utils.requests import get_soup -def scrap_metro_1(url = 'https://www.smc.seoul.kr/main/memIntro01.do?menuId=001002001001') -> ScrapResult: - '''서울시 페이지에서 의원 상세약력 스크랩 +def scrap_metro_1( + url="https://www.smc.seoul.kr/main/memIntro01.do?menuId=001002001001", +) -> ScrapResult: + """서울시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' + """ soup = get_soup(url, verify=False) councilors: list[Councilor] = [] @@ -17,54 +19,60 @@ def scrap_metro_1(url = 'https://www.smc.seoul.kr/main/memIntro01.do?menuId=0010 parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - for profile in soup.find_all('input', class_='memLinkk'): - name = profile['value'].strip() if profile else '이름 정보 없음' - party = '정당 정보 없음' + for profile in soup.find_all("input", class_="memLinkk"): + name = profile["value"].strip() if profile else "이름 정보 없음" + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_url = base_url + '/home/' + profile['data-url'] + profile_url = base_url + "/home/" + profile["data-url"] profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('div', class_='profile') - if party_info and (party_span := party_info.find('li')) is not None: - party = party_span.find_next('li').get_text(strip=True) + party_info = profile_soup.find("div", class_="profile") + if party_info and (party_span := party_info.find("li")) is not None: + party = party_span.find_next("li").get_text(strip=True) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="seoul", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_2(url = 'https://council.busan.go.kr/council/past02') -> ScrapResult: - '''부산시 페이지에서 의원 상세약력 스크랩 +def scrap_metro_2(url="https://council.busan.go.kr/council/past02") -> ScrapResult: + """부산시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' - soup = get_soup(url, verify=False).find('ul', class_='inmemList') + """ + soup = get_soup(url, verify=False).find("ul", class_="inmemList") councilors: list[Councilor] = [] - for profile in soup.find_all('a', class_='detail'): - name = profile.get_text(strip=True) if profile else '이름 정보 없음' - party = '정당 정보 없음' + for profile in soup.find_all("a", class_="detail"): + name = profile.get_text(strip=True) if profile else "이름 정보 없음" + party = "정당 정보 없음" # 프로필보기 링크 가져오기 - profile_url = profile['href'] + profile_url = profile["href"] profile_soup = get_soup(profile_url, verify=False) - party_info = profile_soup.find('ul', class_='vs-list-st-type01') - if party_info and (party_span := 
party_info.find('li')) is not None: - party = party_span.find_next('li').find_next('li').get_text(strip=True).split()[-1].strip() + party_info = profile_soup.find("ul", class_="vs-list-st-type01") + if party_info and (party_span := party_info.find("li")) is not None: + party = ( + party_span.find_next("li") + .find_next("li") + .get_text(strip=True) + .split()[-1] + .strip() + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="busan", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -82,7 +90,7 @@ def scrap_metro_3(url="https://council.daegu.go.kr/kr/member/active") -> ScrapRe name_tag = profile.find("p", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party = '정당 정보 없음' + party = "정당 정보 없음" party_info = profile.find("em", string="소속정당") if party_info: party = party_info.find_next("span").get_text(strip=True) @@ -92,7 +100,7 @@ def scrap_metro_3(url="https://council.daegu.go.kr/kr/member/active") -> ScrapRe return ScrapResult( council_id="daegu", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -103,11 +111,11 @@ def scrap_metro_4(url="https://www.icouncil.go.kr/main/member/name.jsp") -> Scra :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('table', class_='data').find('tbody') + soup = get_soup(url, verify=False).find("table", class_="data").find("tbody") councilors: list[Councilor] = [] for profile in soup.find_all("tr"): - columns = profile.find_all('td') + columns = profile.find_all("td") name_tag = columns[0] name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" @@ -120,7 +128,7 @@ def scrap_metro_4(url="https://www.icouncil.go.kr/main/member/name.jsp") -> Scra return ScrapResult( council_id="incheon", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -131,7 +139,7 @@ def scrap_metro_5(url="https://council.gwangju.go.kr/index.do?PID=029") -> Scrap :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('table', class_='data').find('tbody') + soup = get_soup(url, verify=False).find("table", class_="data").find("tbody") councilors: list[Councilor] = [] # TODO @@ -139,37 +147,41 @@ def scrap_metro_5(url="https://council.gwangju.go.kr/index.do?PID=029") -> Scrap return ScrapResult( council_id="gwangju", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_6(url="https://council.daejeon.go.kr/svc/cmp/MbrListByPhoto.do") -> ScrapResult: +def scrap_metro_6( + url="https://council.daejeon.go.kr/svc/cmp/MbrListByPhoto.do", +) -> ScrapResult: """대전시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('ul', class_='mlist') + soup = get_soup(url, verify=False).find("ul", class_="mlist") councilors: list[Councilor] = [] for profile in soup.find_all("dl"): - name_tag = profile.find('dd', class_='name') - name = name_tag.find('strong').get_text(strip=True) if name_tag else "이름 정보 없음" + name_tag = profile.find("dd", class_="name") + name = name_tag.find("strong").get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = name_tag.find_next('dd').find_next('dd') - party = party_tag.find('i').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = name_tag.find_next("dd").find_next("dd") + party = party_tag.find("i").get_text(strip=True) if 
party_tag else "정당 정보 없음" councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="daejeon", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_7(url="https://www.council.ulsan.kr/kor/councillor/viewByPerson.do") -> ScrapResult: +def scrap_metro_7( + url="https://www.council.ulsan.kr/kor/councillor/viewByPerson.do", +) -> ScrapResult: """울산시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -179,10 +191,10 @@ def scrap_metro_7(url="https://www.council.ulsan.kr/kor/councillor/viewByPerson. soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for name_tag in soup.find_all("div", class_='name'): + for name_tag in soup.find_all("div", class_="name"): name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = name_tag.find_next('li').find_next('li') + party_tag = name_tag.find_next("li").find_next("li") party = party_tag.get_text(strip=True) if party_tag else "정당 정보 없음" councilors.append(Councilor(name=name, party=party)) @@ -190,57 +202,71 @@ def scrap_metro_7(url="https://www.council.ulsan.kr/kor/councillor/viewByPerson. return ScrapResult( council_id="ulsan", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_8(url="https://council.sejong.go.kr/mnu/pom/introductionMemberByName.do") -> ScrapResult: +def scrap_metro_8( + url="https://council.sejong.go.kr/mnu/pom/introductionMemberByName.do", +) -> ScrapResult: """세종시 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('ul', class_='ml') + soup = get_soup(url, verify=False).find("ul", class_="ml") councilors: list[Councilor] = [] - for profile in soup.find_all('dl'): - name_tag = profile.find('dd', class_='name') - name = name_tag.find(string=True, recursive=False).strip() if name_tag else "이름 정보 없음" - - party_tag = name_tag.find_next('dd').find_next('dd') - party = party_tag.get_text(strip=True).split()[-1].strip() if party_tag else "정당 정보 없음" + for profile in soup.find_all("dl"): + name_tag = profile.find("dd", class_="name") + name = ( + name_tag.find(string=True, recursive=False).strip() + if name_tag + else "이름 정보 없음" + ) + + party_tag = name_tag.find_next("dd").find_next("dd") + party = ( + party_tag.get_text(strip=True).split()[-1].strip() + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="sejong", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_9(url="https://www.ggc.go.kr/site/main/memberInfo/actvMmbr/list?cp=1&menu=consonant&sortOrder=MI_NAME&sortDirection=ASC") -> ScrapResult: +def scrap_metro_9( + url="https://www.ggc.go.kr/site/main/memberInfo/actvMmbr/list?cp=1&menu=consonant&sortOrder=MI_NAME&sortDirection=ASC", +) -> ScrapResult: """경기도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url :return: 의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 """ - soup = get_soup(url, verify=False).find('div', class_='paging2 clearfix') + soup = get_soup(url, verify=False).find("div", class_="paging2 clearfix") councilors: list[Councilor] = [] - + parsed_url = urlparse(url) base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - - for page in soup.find_all('a'): - page_url = base_url + page['href'] - page_soup = get_soup(page_url, verify=False).find('ul', class_='memberList3 clear') - for profile in page_soup.find_all('li', recursive=False): - name_tag = 
profile.find('p', class_='f22 blue3') + + for page in soup.find_all("a"): + page_url = base_url + page["href"] + page_soup = get_soup(page_url, verify=False).find( + "ul", class_="memberList3 clear" + ) + for profile in page_soup.find_all("li", recursive=False): + name_tag = profile.find("p", class_="f22 blue3") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('li', class_='f15 m0') + party_tag = profile.find("li", class_="f15 m0") party = party_tag.get_text(strip=True) if party_tag else "정당 정보 없음" councilors.append(Councilor(name=name, party=party)) @@ -248,11 +274,13 @@ def scrap_metro_9(url="https://www.ggc.go.kr/site/main/memberInfo/actvMmbr/list? return ScrapResult( council_id="gyeonggi", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_10(url="https://council.chungbuk.kr/kr/member/active.do") -> ScrapResult: +def scrap_metro_10( + url="https://council.chungbuk.kr/kr/member/active.do", +) -> ScrapResult: """충청북도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -262,23 +290,31 @@ def scrap_metro_10(url="https://council.chungbuk.kr/kr/member/active.do") -> Scr soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('em', class_='name') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) - party_tag = profile.find('em', string='소속정당') - party = party_tag.find_next('span').find_next('span').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("em", string="소속정당") + party = ( + party_tag.find_next("span").find_next("span").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="chungbuk", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_11(url="https://council.chungnam.go.kr/kr/member/name.do") -> ScrapResult: +def scrap_metro_11( + url="https://council.chungnam.go.kr/kr/member/name.do", +) -> ScrapResult: """충청남도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -288,23 +324,31 @@ def scrap_metro_11(url="https://council.chungnam.go.kr/kr/member/name.do") -> Sc soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('em', class_='name') - name = name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") + name = ( + name_tag.get_text(strip=True).split()[0].strip() if name_tag else "이름 정보 없음" + ) - party_tag = profile.find('em', string='소속정당 : ') - party = party_tag.find_next('span').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("em", string="소속정당 : ") + party = ( + party_tag.find_next("span").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="chungnam", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def 
scrap_metro_12(url="https://www.assem.jeonbuk.kr/board/list.do?boardId=2018_assemblyman&searchType=assem_check&keyword=1&menuCd=DOM_000000103001000000&contentsSid=453") -> ScrapResult: +def scrap_metro_12( + url="https://www.assem.jeonbuk.kr/board/list.do?boardId=2018_assemblyman&searchType=assem_check&keyword=1&menuCd=DOM_000000103001000000&contentsSid=453", +) -> ScrapResult: """전라북도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -314,23 +358,29 @@ def scrap_metro_12(url="https://www.assem.jeonbuk.kr/board/list.do?boardId=2018_ soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('li', class_='career'): - name_tag = profile.find('tr', class_='name') + for profile in soup.find_all("li", class_="career"): + name_tag = profile.find("tr", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('tr', class_='list1') - party = party_tag.find('td', class_='co2').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("tr", class_="list1") + party = ( + party_tag.find("td", class_="co2").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="jeonbuk", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_13(url="https://www.jnassembly.go.kr/profileHistory.es?mid=a10202010000&cs_daesoo=12") -> ScrapResult: +def scrap_metro_13( + url="https://www.jnassembly.go.kr/profileHistory.es?mid=a10202010000&cs_daesoo=12", +) -> ScrapResult: """전라남도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -340,19 +390,23 @@ def scrap_metro_13(url="https://www.jnassembly.go.kr/profileHistory.es?mid=a1020 soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('tbody'): - name_tag = profile.find('p') + for profile in soup.find_all("tbody"): + name_tag = profile.find("p") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('th', string='소속정당') - party = party_tag.find_next('td', class_='txt_left').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("th", string="소속정당") + party = ( + party_tag.find_next("td", class_="txt_left").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="jeonnam", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -366,23 +420,29 @@ def scrap_metro_14(url="https://council.gb.go.kr/kr/member/name") -> ScrapResult soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('div', class_='name') - name = name_tag.find('strong').get_text(strip=True) if name_tag else "이름 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("div", class_="name") + name = name_tag.find("strong").get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('em', string='소속정당') - party = party_tag.find_next('span').find_next('span').get_text(strip=True) if party_tag else "정당 정보 없음" + party_tag = profile.find("em", string="소속정당") + party = ( + party_tag.find_next("span").find_next("span").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="gyeongbuk", 
council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_15(url="https://council.gyeongnam.go.kr/kr/member/active.do") -> ScrapResult: +def scrap_metro_15( + url="https://council.gyeongnam.go.kr/kr/member/active.do", +) -> ScrapResult: """경상남도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -392,19 +452,27 @@ def scrap_metro_15(url="https://council.gyeongnam.go.kr/kr/member/active.do") -> soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('div', class_='name') - name = name_tag.find('strong').get_text(strip=True).split('(')[0].strip() if name_tag else "이름 정보 없음" - - party_tag = profile.find('em', class_='ls2', string='정당') - party = party_tag.find_next('span').get_text(strip=True) if party_tag else "정당 정보 없음" + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("div", class_="name") + name = ( + name_tag.find("strong").get_text(strip=True).split("(")[0].strip() + if name_tag + else "이름 정보 없음" + ) + + party_tag = profile.find("em", class_="ls2", string="정당") + party = ( + party_tag.find_next("span").get_text(strip=True) + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="gyeongnam", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) @@ -418,23 +486,29 @@ def scrap_metro_16(url="https://council.gangwon.kr/kr/member/name.do") -> ScrapR soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for profile in soup.find_all('div', class_='profile'): - name_tag = profile.find('em', class_='name') + for profile in soup.find_all("div", class_="profile"): + name_tag = profile.find("em", class_="name") name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음" - party_tag = profile.find('em', string='소속정당') - party = party_tag.find_next('span').get_text(strip=True).split()[-1].strip() if party_tag else "정당 정보 없음" + party_tag = profile.find("em", string="소속정당") + party = ( + party_tag.find_next("span").get_text(strip=True).split()[-1].strip() + if party_tag + else "정당 정보 없음" + ) councilors.append(Councilor(name=name, party=party)) return ScrapResult( council_id="gangwon", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) -def scrap_metro_17(url="https://www.council.jeju.kr/cmember/active/name.do") -> ScrapResult: +def scrap_metro_17( + url="https://www.council.jeju.kr/cmember/active/name.do", +) -> ScrapResult: """제주도 페이지에서 의원 상세약력 스크랩 :param url: 의원 목록 사이트 url @@ -444,7 +518,7 @@ def scrap_metro_17(url="https://www.council.jeju.kr/cmember/active/name.do") -> soup = get_soup(url, verify=False) councilors: list[Councilor] = [] - for tag in soup.find_all('p', class_='name'): + for tag in soup.find_all("p", class_="name"): text = tag.get_text(strip=True).split("(") # print(text) name = text[0].strip() @@ -455,10 +529,9 @@ def scrap_metro_17(url="https://www.council.jeju.kr/cmember/active/name.do") -> return ScrapResult( council_id="jeju", council_type=CouncilType.METROPOLITAN_COUNCIL, - councilors=councilors + councilors=councilors, ) - -if __name__ == '__main__': - print(scrap_metro_17()) \ No newline at end of file +if __name__ == "__main__": + print(scrap_metro_17()) diff --git a/scrap/national_council.py b/scrap/national_council.py index b058abf..6c4656e 100644 --- a/scrap/national_council.py +++ b/scrap/national_council.py @@ -8,42 
+8,43 @@ def scrap_national_council(cd: int) -> ScrapResult: - '''열린국회정보 Open API를 이용해 역대 국회의원 인적사항 스크랩 - _data 폴더에 assembly_api_key.json 파일을 만들어야 하며, - 해당 JSON은 {"key":"(Open API에서 발급받은 인증키)"} 꼴을 가져야 한다. - https://open.assembly.go.kr/portal/data/service/selectAPIServicePage.do/OBL7NF0011935G18076#none - - :param cd: 국회의원 대수. 제21대 국회의원을 스크랩하고자 하면 21 - :return: 국회의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 - ''' - - key_json_path = os.path.join(BASE_DIR, '_data', 'assembly_api_key.json') - if not os.path.exists(key_json_path): - raise Exception('열린국회정보 Open API에 회원가입 후 인증키를 발급받아주세요.\nhttps://open.assembly.go.kr/portal/openapi/openApiDevPage.do') - with open(key_json_path, 'r') as key_json: - assembly_key = json.load(key_json)['key'] - - request_url = f"https://open.assembly.go.kr/portal/openapi/nwvrqwxyaytdsfvhu?KEY={assembly_key}&pSize=500&UNIT_CD={cd + 100000}" - response = requests.get(request_url) - - if response.status_code != 200: - raise Exception(f'Open API 요청에 실패했습니다 (상태 코드 {response.status_code})') - - root = ET.fromstring(response.text) - councilors: list[Councilor] = [] - - for row in root.iter('row'): - councilors.append(Councilor( - name=row.find('HG_NM').text, - party=row.find('POLY_NM').text - )) - - return ScrapResult( - council_id='national', - council_type=CouncilType.NATIONAL_COUNCIL, - councilors=councilors - ) - - -if __name__ == '__main__': - print(scrap_national_council(21)) \ No newline at end of file + """열린국회정보 Open API를 이용해 역대 국회의원 인적사항 스크랩 + _data 폴더에 assembly_api_key.json 파일을 만들어야 하며, + 해당 JSON은 {"key":"(Open API에서 발급받은 인증키)"} 꼴을 가져야 한다. + https://open.assembly.go.kr/portal/data/service/selectAPIServicePage.do/OBL7NF0011935G18076#none + + :param cd: 국회의원 대수. 제21대 국회의원을 스크랩하고자 하면 21 + :return: 국회의원들의 이름과 정당 데이터를 담은 ScrapResult 객체 + """ + + key_json_path = os.path.join(BASE_DIR, "_data", "assembly_api_key.json") + if not os.path.exists(key_json_path): + raise Exception( + "열린국회정보 Open API에 회원가입 후 인증키를 발급받아주세요.\nhttps://open.assembly.go.kr/portal/openapi/openApiDevPage.do" + ) + with open(key_json_path, "r") as key_json: + assembly_key = json.load(key_json)["key"] + + request_url = f"https://open.assembly.go.kr/portal/openapi/nwvrqwxyaytdsfvhu?KEY={assembly_key}&pSize=500&UNIT_CD={cd + 100000}" + response = requests.get(request_url) + + if response.status_code != 200: + raise Exception(f"Open API 요청에 실패했습니다 (상태 코드 {response.status_code})") + + root = ET.fromstring(response.text) + councilors: list[Councilor] = [] + + for row in root.iter("row"): + councilors.append( + Councilor(name=row.find("HG_NM").text, party=row.find("POLY_NM").text) + ) + + return ScrapResult( + council_id="national", + council_type=CouncilType.NATIONAL_COUNCIL, + councilors=councilors, + ) + + +if __name__ == "__main__": + print(scrap_national_council(21)) diff --git a/scrap/utils/spreadsheet.py b/scrap/utils/spreadsheet.py index a4bcf25..47da40a 100644 --- a/scrap/utils/spreadsheet.py +++ b/scrap/utils/spreadsheet.py @@ -49,90 +49,368 @@ def main() -> None: client: gspread.client.Client = google_authorization() # 스프레드시트 열기 - link = 'https://docs.google.com/spreadsheets/d/1fBDJjkw8FSN5wXrvos9Q2wDsyItkUtNFGOxUZYE-h0M/edit#gid=1127955905' # T4I-의회목록 + link = "https://docs.google.com/spreadsheets/d/1fBDJjkw8FSN5wXrvos9Q2wDsyItkUtNFGOxUZYE-h0M/edit#gid=1127955905" # T4I-의회목록 spreadsheet: gspread.Spreadsheet = client.open_by_url(link) - worksheet: gspread.Worksheet = spreadsheet.get_worksheet(0) # 원하는 워크시트 선택 (0은 첫 번째 워크시트입니다.) - # TODO - 홈페이지 위 charset=euc-kr 등을 인식해 바로 가져오기. 
+ worksheet: gspread.Worksheet = spreadsheet.get_worksheet( + 0 + ) # 원하는 워크시트 선택 (0은 첫 번째 워크시트입니다.) + # TODO - 홈페이지 위 charset=euc-kr 등을 인식해 바로 가져오기. euc_kr = [6, 13, 16, 31, 72, 88, 112, 154, 157, 163, 167, 181, 197, 202] special_functions = list(range(1, 57)) + [57, 88, 103] args = { - 2 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 3 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), + 2: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 3: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), # 인천 - 57 : ScrapBasicArgument(pf_elt='div', pf_cls='box', name_elt='p', name_cls='mem_tit2', pty_elt='p', pty_cls='mem_tit2'), - 58 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 59 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), + 57: ScrapBasicArgument( + pf_elt="div", + pf_cls="box", + name_elt="p", + name_cls="mem_tit2", + pty_elt="p", + pty_cls="mem_tit2", + ), + 58: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 59: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="em", + ), # 광주 - 60 : ScrapBasicArgument(pf_elt='div', pf_cls='content', name_elt='h5', pty_wrapelt='a', pty_elt='li'), - 61 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), + 60: ScrapBasicArgument( + pf_elt="div", pf_cls="content", name_elt="h5", pty_wrapelt="a", pty_elt="li" + ), + 61: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), # 62 : TODO! /common/selectCouncilMemberProfile.json 을 어떻게 얻을지.. # 63 : TODO! 홈페이지 터짐 # 64 : TODO! /common/selectCouncilMemberProfile.json 을 어떻게 얻을지.. 
# 대전 - 65 : ScrapBasicArgument(pf_elt='dl', pf_cls='profile', name_elt='strong', name_cls='name', pty_elt='strong'), - 66 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), - 67 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='member', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 68 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 69 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), + 65: ScrapBasicArgument( + pf_elt="dl", + pf_cls="profile", + name_elt="strong", + name_cls="name", + pty_elt="strong", + ), + 66: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="em", + ), + 67: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="member", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 68: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 69: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), # 울산 - 70 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='memberName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 71 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='memberName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 72 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='li', name_cls='name', pty_elt='li'), - 73 : ScrapBasicArgument(pf_elt='dl', pf_cls='profile', name_elt='strong', name_cls='name', pty_elt='li'), - 74 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_wrapelt='a', pty_wrapcls='start', pty_elt='li'), + 70: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="memberName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 71: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="memberName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 72: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="li", name_cls="name", pty_elt="li" + ), + 73: ScrapBasicArgument( + pf_elt="dl", + pf_cls="profile", + name_elt="strong", + name_cls="name", + pty_elt="li", + ), + 74: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="em", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="start", + pty_elt="li", + ), # 경기 - 75 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), - 76 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 77 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='mbrListByName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 78 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_wrapelt='a', pty_wrapcls='end', pty_elt='li'), - 79 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 80 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 81 : ScrapBasicArgument(pf_memlistelt='div', pf_memlistcls='member_list', pf_elt='dd', name_elt='p', pty_elt='tr'), - 82 : ScrapBasicArgument(pf_memlistelt='div', pf_memlistcls='cts1426_box', pf_elt='div', pf_cls='conbox', name_elt='p', pty_elt='li'), + 75: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + 
name_elt="div", + name_cls="name", + pty_elt="em", + ), + 76: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 77: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="mbrListByName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 78: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="end", + pty_elt="li", + ), + 79: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 80: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 81: ScrapBasicArgument( + pf_memlistelt="div", + pf_memlistcls="member_list", + pf_elt="dd", + name_elt="p", + pty_elt="tr", + ), + 82: ScrapBasicArgument( + pf_memlistelt="div", + pf_memlistcls="cts1426_box", + pf_elt="div", + pf_cls="conbox", + name_elt="p", + pty_elt="li", + ), # 경기 - 동두천 - 83 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_wrapelt='a', pty_wrapcls='start', pty_elt='li'), - 84 : ScrapBasicArgument(pf_elt='div', pf_cls='law_box', name_elt='span', name_cls='name', pty_elt='p'), - 85 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), - 86 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 87 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 88 : ScrapBasicArgument(pf_memlistelt='div', pf_memlistcls='member_list', pf_elt='dl', pf_cls='box', name_elt='span', name_cls='name', pty_wrapelt='p', pty_wrapcls='btn', pty_elt='li'), - 89 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='memberName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='span'), - 90 : ScrapBasicArgument(pf_elt='dl', pf_cls='profile', name_elt='strong', name_cls='name', pty_elt='li'), - # 경기 - 화성 - 91 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='mbr0101', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 92 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='member', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 93 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_wrapelt='a', pty_wrapcls='end', pty_elt='li'), - 94 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='mbrListByName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='dd'), - 95 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='member', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='tr'), - 96 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='em'), - 97 : ScrapBasicArgument(pf_memlistelt='ul', pf_memlistcls='memberList', pf_elt='li', name_elt='strong', pty_wrapelt='a', pty_elt='tr'), - 98 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 99 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 100 : ScrapBasicArgument(pf_elt='div', pf_cls='list', name_elt='h4', name_cls='h0', pty_elt='li'), + 83: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="em", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="start", + pty_elt="li", + ), + 84: ScrapBasicArgument( + pf_elt="div", + pf_cls="law_box", + name_elt="span", + name_cls="name", + pty_elt="p", + ), + 85: 
ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="em", + ), + 86: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 87: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 88: ScrapBasicArgument( + pf_memlistelt="div", + pf_memlistcls="member_list", + pf_elt="dl", + pf_cls="box", + name_elt="span", + name_cls="name", + pty_wrapelt="p", + pty_wrapcls="btn", + pty_elt="li", + ), + 89: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="memberName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="span", + ), + 90: ScrapBasicArgument( + pf_elt="dl", + pf_cls="profile", + name_elt="strong", + name_cls="name", + pty_elt="li", + ), + # 경기 - 화성 + 91: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="mbr0101", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 92: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="member", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 93: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="end", + pty_elt="li", + ), + 94: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="mbrListByName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="dd", + ), + 95: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="member", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="tr", + ), + 96: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="em", + ), + 97: ScrapBasicArgument( + pf_memlistelt="ul", + pf_memlistcls="memberList", + pf_elt="li", + name_elt="strong", + pty_wrapelt="a", + pty_elt="tr", + ), + 98: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 99: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 100: ScrapBasicArgument( + pf_elt="div", pf_cls="list", name_elt="h4", name_cls="h0", pty_elt="li" + ), # 경기 - 광주 - 101 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 102 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_wrapelt='a', pty_wrapcls='start', pty_elt='li'), - 103 : ScrapBasicArgument(pf_elt='div', pf_cls='col-sm-6', name_elt='h5', name_cls='h5', pty_wrapelt='a', pty_wrapcls='d-inline-block', pty_elt='li'), - 104 : ScrapBasicArgument(pf_elt='div', pf_cls='text_box', name_elt='h3', name_cls='h0', pty_wrapelt='a', pty_wraptxt='누리집', pty_elt='li'), - 105 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), + 101: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 102: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="em", + name_cls="name", + pty_wrapelt="a", + pty_wrapcls="start", + pty_elt="li", + ), + 103: ScrapBasicArgument( + pf_elt="div", + pf_cls="col-sm-6", + name_elt="h5", + name_cls="h5", + pty_wrapelt="a", + pty_wrapcls="d-inline-block", + pty_elt="li", + ), + 104: ScrapBasicArgument( + pf_elt="div", + pf_cls="text_box", + name_elt="h3", + name_cls="h0", + pty_wrapelt="a", + pty_wraptxt="누리집", + pty_elt="li", + ), + 105: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", 
name_cls="name", pty_elt="em" + ), # 강원 # 106 : TODO! 정당정보 없음 # TODO! 107이 get_soup에서 실패 중 - HTTPSConnectionPool(host='council.wonju.go.kr', port=443): Max retries exceeded with url: /content/member/memberName.html (Caused by SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1007)'))) - 107 : ScrapBasicArgument(pf_memlistelt='div', pf_memlistcls='content', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='span'), - 108 : ScrapBasicArgument(pf_elt='dl', pf_cls='profile', name_elt='strong', pty_elt='li'), - 109 : ScrapBasicArgument(pf_memlistelt='section', pf_memlistcls='memberName', pf_elt='dl', name_elt='dd', name_cls='name', pty_elt='span'), - 110 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - # 111 : TODO! 정당 없고 홈페이지는 깨짐 - 112 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='em', name_cls='name', pty_elt='em'), - 113 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_cls='name', pty_elt='li'), - 115 : ScrapBasicArgument(pf_elt='div', pf_cls='profile', name_elt='div', name_cls='name', pty_elt='li'), + 107: ScrapBasicArgument( + pf_memlistelt="div", + pf_memlistcls="content", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="span", + ), + 108: ScrapBasicArgument( + pf_elt="dl", pf_cls="profile", name_elt="strong", pty_elt="li" + ), + 109: ScrapBasicArgument( + pf_memlistelt="section", + pf_memlistcls="memberName", + pf_elt="dl", + name_elt="dd", + name_cls="name", + pty_elt="span", + ), + 110: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + # 111 : TODO! 정당 없고 홈페이지는 깨짐 + 112: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em" + ), + 113: ScrapBasicArgument( + pf_elt="div", pf_cls="profile", name_cls="name", pty_elt="li" + ), + 115: ScrapBasicArgument( + pf_elt="div", + pf_cls="profile", + name_elt="div", + name_cls="name", + pty_elt="li", + ), # TODO : 정당이 주석처리되어 있어서 soup가 인식을 못함. 
- 116 : ScrapBasicArgument(pf_elt='div', pf_cls='memberName', name_cls='name',pty_elt='dd'), + 116: ScrapBasicArgument( + pf_elt="div", pf_cls="memberName", name_cls="name", pty_elt="dd" + ), } # 데이터 가져오기 data: list[dict] = worksheet.get_all_records() - result: str = '' + result: str = "" error_times = 0 parse_error_times = 0 @@ -140,19 +418,27 @@ def main() -> None: N = 226 # for n in range (113, 169): for n in range(107, 108): - encoding = 'euc-kr' if n in euc_kr else 'utf-8' + encoding = "euc-kr" if n in euc_kr else "utf-8" try: if n in special_functions: function_name = f"scrap_{n}" if hasattr(sys.modules[__name__], function_name): function_to_call = getattr(sys.modules[__name__], function_name) if n < 57: - result = str(function_to_call(data[n - 1]['상세약력 링크']).councilors) + result = str( + function_to_call(data[n - 1]["상세약력 링크"]).councilors + ) else: - result = str(function_to_call(data[n - 1]['상세약력 링크'], args=args[n]).councilors) + result = str( + function_to_call( + data[n - 1]["상세약력 링크"], args=args[n] + ).councilors + ) else: - result = str(scrap_basic(data[n - 1]['상세약력 링크'], n, args[n], encoding).councilors) - if '정보 없음' in result: + result = str( + scrap_basic(data[n - 1]["상세약력 링크"], n, args[n], encoding).councilors + ) + if "정보 없음" in result: print("정보 없음이 포함되어 있습니다.") parse_error_times += 1 print(result) @@ -163,6 +449,10 @@ def main() -> None: print(f"오류 : [district-{n}] {str(e)}") error_times += 1 continue # 에러가 발생하면 다음 반복으로 넘어감 - print(f"| 총 실행 횟수: {N} | 에러 횟수: {error_times} | 정보 없음 횟수: {parse_error_times} | 타임아웃 횟수: {timeouts} |") -if __name__ == '__main__': + print( + f"| 총 실행 횟수: {N} | 에러 횟수: {error_times} | 정보 없음 횟수: {parse_error_times} | 타임아웃 횟수: {timeouts} |" + ) + + +if __name__ == "__main__": main() diff --git a/scrap/utils/types.py b/scrap/utils/types.py index 67acc1f..a6ed4f1 100644 --- a/scrap/utils/types.py +++ b/scrap/utils/types.py @@ -57,23 +57,26 @@ class ScrapResult: class ScrapBasicArgument: - ''' + """ scrap_basic에 쓸 argument입니다 - ''' - def __init__(self, - pf_elt: str | None = None, - pf_cls: str | None = None, - pf_memlistelt: str | None = None, - pf_memlistcls: str | None = None, - name_elt: str | None = None, - name_cls: str | None = None, - name_wrapelt: str | None = None, - name_wrapcls: str | None = None, - pty_elt: str | None = None, - pty_cls: str | None = None, - pty_wrapelt: str | None = None, - pty_wrapcls: str | None = None, - pty_wraptxt: str | None = None): + """ + + def __init__( + self, + pf_elt: str | None = None, + pf_cls: str | None = None, + pf_memlistelt: str | None = None, + pf_memlistcls: str | None = None, + name_elt: str | None = None, + name_cls: str | None = None, + name_wrapelt: str | None = None, + name_wrapcls: str | None = None, + pty_elt: str | None = None, + pty_cls: str | None = None, + pty_wrapelt: str | None = None, + pty_wrapcls: str | None = None, + pty_wraptxt: str | None = None, + ): """ ScrapBasicArgument 클래스의 생성자입니다. @@ -104,4 +107,4 @@ def __init__(self, self.pty_cls = pty_cls self.pty_wrapelt = pty_wrapelt self.pty_wrapcls = pty_wrapcls - self.pty_wraptxt = pty_wraptxt \ No newline at end of file + self.pty_wraptxt = pty_wraptxt
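
Note on the reformatted args table above: the pf_*/name_*/pty_* fields of
ScrapBasicArgument are per-council element/class selectors that scrap_basic
uses to locate each councilor's profile box, name, and party. The snippet
below is a minimal, hypothetical sketch of that lookup pattern; the sample
HTML and the party-extraction heuristic are invented for illustration, and
the actual logic in scrap/local_councils/basic.py may differ:

    from bs4 import BeautifulSoup

    from scrap.utils.types import ScrapBasicArgument

    # Invented sample markup following the common pf_cls="profile" pattern.
    HTML = """
    <div class="profile">
      <em class="name">홍길동</em>
      <em>무소속</em>
    </div>
    """

    args = ScrapBasicArgument(
        pf_elt="div", pf_cls="profile", name_elt="em", name_cls="name", pty_elt="em"
    )

    soup = BeautifulSoup(HTML, "html.parser")
    for profile in soup.find_all(args.pf_elt, class_=args.pf_cls):
        name = profile.find(args.name_elt, class_=args.name_cls).get_text(strip=True)
        # Illustrative heuristic: treat the last pty_elt as the party label.
        party = profile.find_all(args.pty_elt)[-1].get_text(strip=True)
        print(name, party)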
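
Separately, the TODO kept above for district 107 records an SSLError from
council.wonju.go.kr ([SSL: DH_KEY_TOO_SMALL]): OpenSSL 3.x rejects the
server's small Diffie-Hellman key at its default security level. One possible
workaround — a sketch, not part of this patch — is to mount a requests
adapter that lowers the security level for that host only:

    import ssl

    import requests
    from requests.adapters import HTTPAdapter


    class LegacyDHAdapter(HTTPAdapter):
        """Tolerate servers whose DH keys fail OpenSSL's default SECLEVEL."""

        def init_poolmanager(self, *args, **kwargs):
            ctx = ssl.create_default_context()
            ctx.set_ciphers("DEFAULT:@SECLEVEL=1")  # accept small DH keys
            kwargs["ssl_context"] = ctx
            return super().init_poolmanager(*args, **kwargs)


    session = requests.Session()
    session.mount("https://council.wonju.go.kr", LegacyDHAdapter())
    response = session.get(
        "https://council.wonju.go.kr/content/member/memberName.html"
    )
    print(response.status_code)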