[scrap] remaining: 205, 207, 209, 211, 221, 223, 225
Re-st committed Nov 27, 2023
1 parent 8473968 commit 693e5ac
Showing 7 changed files with 254 additions and 108 deletions.
24 changes: 11 additions & 13 deletions scrap/local_councils/basic.py
@@ -124,9 +124,6 @@ def sel_getname(profile, element, class_, wrapper_element, wrapper_class_):
            name = strong_element.text.strip()
        except NoSuchElementException:
            pass
-    # print(name+"\n")
-    if name == "":
-        return "231114"
    name = name.split("(")[0].split(":")[-1].strip()  # strip the trailing hanja name and the leading '이 름:' label
    # TODO: consider the case where a name happens to contain one of the words below.
    if len(name) > 3:
@@ -154,7 +151,7 @@ def extract_party(string):
    return None


-def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
+def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False):
    # grab the "view profile" link from the councilor's list entry
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
@@ -169,7 +166,10 @@ def goto_profilesite(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
    # base_url = base_url + '/'
    profile_url = base_url + profile_link["href"]
    try:
-        profile = get_soup(profile_url, verify=False)
+        if inner_euckr:
+            profile = get_soup(profile_url, verify=False, encoding="euc-kr")
+        else:
+            profile = get_soup(profile_url, verify=False)
    except Exception:
        raise RuntimeError("[basic.py] '//'가 있진 않나요?", " url: ", profile_url)
    return profile
@@ -212,7 +212,7 @@ def getpty(profile, element, class_, wrapper_element, wrapper_class_, wrapper_txt
        )
    )
    if party_pulp_list == []:
-        raise RuntimeError("[basic.py] 정당정보 regex 실패")
+        raise Exception("[basic.py] 정당정보 regex 실패")
    party_pulp = party_pulp_list[0]
    party_string = party_pulp.get_text(strip=True).split(" ")[-1]
    while True:
@@ -221,14 +221,14 @@
        if (party_pulp := party_pulp.find_next("span")) is not None:
            party_string = party_pulp.text.strip().split(" ")[-1]
        else:
-            return "[basic.py] 정당 정보 파싱 불가"
+            raise Exception("[basic.py] 정당 정보 파싱 불가")


-def getpty_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url):
+def getpty_easy(profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr=False):
    # get the councilor's party name from the profile page
    if wrapper_element is not None:
        profile = goto_profilesite(
-            profile, wrapper_element, wrapper_class_, wrapper_txt, url
+            profile, wrapper_element, wrapper_class_, wrapper_txt, url, inner_euckr
        )
    party = extract_party(profile.text)
    assert party is not None
@@ -253,7 +253,7 @@ def sel_getpty_easy(
    return party


-def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapResult:
+def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8", inner_euckr=False) -> ScrapResult:
    """Scrape councilors' detailed profiles.
    :param url: URL of the councilor list page
    :param cid: council id
@@ -293,7 +293,7 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapResult:
        except Exception as e:
            try:
                party = getpty_easy(
-                    profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url
+                    profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url, inner_euckr
                )
            except Exception:
                raise RuntimeError("[basic.py] 의원 정당을 가져오는데 실패했습니다. 이유: " + str(e))
@@ -331,8 +331,6 @@ def sel_scrap_basic(url, cid, args: ScrapBasicArgument) -> ScrapResult:
            raise RuntimeError(
                "[basic.py/selenium] 의원 이름을 가져오는데 실패했습니다. 이유 : " + str(e)
            )
-        if name == "231114":
-            continue
        try:
            party = sel_getpty_easy(
                profile, args.pty_wrapelt, args.pty_wrapcls, args.pty_wraptxt, url
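
The thread through this file: scrap_basic gains an inner_euckr flag and passes it to getpty_easy, which forwards it to goto_profilesite, where only the inner get_soup call switches to EUC-KR decoding; the list page itself is still fetched with the caller's encoding. A minimal sketch of a call using the new flag (the URL and council id are hypothetical placeholders, and args stands for a ScrapBasicArgument prepared as elsewhere in the repo):

    # The list page decodes as UTF-8, but the linked profile pages are
    # EUC-KR; inner_euckr=True makes goto_profilesite fetch those pages
    # with get_soup(..., encoding="euc-kr").
    result = scrap_basic(
        "https://council.example.go.kr/member/list.do",  # hypothetical URL
        205,                                             # hypothetical council id
        args,
        inner_euckr=True,
    )
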
122 changes: 111 additions & 11 deletions scrap/local_councils/gyeongsang.py
@@ -10,6 +10,8 @@
    regex_pattern,
)

+party_keywords = getPartyList()
+party_keywords.append("무소속")  # also match "무소속" (independent, no party)

def scrap_186(
    url,
@@ -122,6 +124,33 @@ def scrap_191(

    return ret_local_councilors(cid, councilors)

+def scrap_192(
+    url,
+    cid,
+    args: ArgsType = None,
+) -> ScrapResult:
+    """Gumi, Gyeongsangbuk-do"""
+    soup = get_soup(url, verify=False, encoding="euc-kr")
+    councilors: List[Councilor] = []
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("li", class_="name")
+        name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음"
+
+        # follow the second link in the card to the member's detail page
+        profile_link = profile.find_all("a")[1]
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        profile_url = base_url + profile_link["href"]
+        profile = get_soup(profile_url, verify=False, encoding="euc-kr")
+        party = ""
+        for keyword in party_keywords:
+            if keyword in profile.text:
+                party = keyword
+                break
+        councilors.append(Councilor(name=name, jdName=party))
+
+    return ret_local_councilors(cid, councilors)
+
+
def scrap_194(
    url,
@@ -188,6 +217,24 @@ def scrap_196(

    return ret_local_councilors(cid, councilors)


+def scrap_197(
+    url,
+    cid,
+    args: ArgsType = None,
+) -> ScrapResult:
+    """Gyeongsan, Gyeongsangbuk-do"""
+    soup = get_soup(url, verify=False, encoding="euc-kr")
+    councilors: List[Councilor] = []
+    for profile in soup.find_all("div", class_="memberL") + soup.find_all("div", class_="memberR"):
+        # the party name is the nearest preceding section heading
+        party = profile.find_previous("h4", class_="title").text.strip()
+        assert party in party_keywords
+        name = profile.find("dt").text.strip()
+
+        councilors.append(Councilor(name=name, jdName=party))
+
+    return ret_local_councilors(cid, councilors)
+
+
def scrap_198(
    url,
    cid,
@@ -216,18 +263,18 @@ def scrap_199(
    args: ArgsType = None,
) -> ScrapResult:
    """Goryeong, Gyeongsangbuk-do"""
-    soup = get_soup(url, verify=False)
-    councilors: List[Councilor] = []
-    for profile in soup.find_all("div", class_="profile"):
-        name_tag = profile.find("em", class_="name")
-        name = name_tag.get_text(strip=True).split("\r")[0] if name_tag else "이름 정보 없음"
-
+    browser = get_selenium(url)
+    councilors: list[Councilor] = []
+    for profile in browser.find_elements(By.CSS_SELECTOR, "div[class='profile']"):
+        name_tag = profile.find_element(By.CSS_SELECTOR, "em[class='name']")
+        name = name_tag.text.strip().split("\r")[0] if name_tag else "이름 정보 없음"
+        party = ""
+        for keyword in party_keywords:
+            if keyword in profile.text:
+                party = keyword
+                break
-        party = "정당 정보 없음"
-        party_info = profile.find("em", string="정 당 : ")
-        if party_info:
-            party = party_info.find_next("span").get_text(strip=True)
-
-        councilors.append(Councilor(name=name, jdName=party))
+        councilors.append(Councilor(name, party))

    return ret_local_councilors(cid, councilors)

@@ -260,6 +307,31 @@ def scrap_201(

    return ret_local_councilors(cid, councilors)


+def scrap_202(
+    url,
+    cid,
+    args: ArgsType = None,
+) -> ScrapResult:
+    """Gunwi, Gyeongsangbuk-do"""
+    soup = get_soup(url, verify=False, encoding="euc-kr")
+    councilors: List[Councilor] = []
+    for profile in soup.find_all("div", class_="profile"):
+        name_tag = profile.find("li", class_="name")
+        name = name_tag.get_text(strip=True).split("(")[0] if name_tag else "이름 정보 없음"
+        link = profile.find("p", class_="btn").find("a")["href"]
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        profile_url = base_url + link
+        profile = get_soup(profile_url, verify=False, encoding="euc-kr")
+        party = ""
+        for keyword in party_keywords:
+            if keyword in profile.text:
+                party = keyword
+                break
+        councilors.append(Councilor(name=name, jdName=party))
+
+    return ret_local_councilors(cid, councilors)
+
def scrap_203(
    url,
    cid,
@@ -281,6 +353,34 @@ def scrap_203(

    return ret_local_councilors(cid, councilors)

+def scrap_204(
+    url,
+    cid,
+    args: ArgsType = None,
+) -> ScrapResult:
+    """Cheongsong, Gyeongsangbuk-do"""
+    soup = get_soup(url, verify=False)
+    councilors: List[Councilor] = []
+    for profile in soup.find_all("div", class_="box3vm1"):
+        name_tag = profile.find("span", class_="t3")
+        name = name_tag.get_text(strip=True).split()[-1] if name_tag else "이름 정보 없음"
+        link = profile.find("a", class_="button")["href"]
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+        profile_url = base_url + link
+        profile = get_soup(profile_url, verify=False)
+        # the detail page links on to a "의원소개" (member introduction) page
+        link = profile.find("a", text="의원소개", href=True)
+        profile_url = base_url + link["href"]
+        profile = get_soup(profile_url, verify=False)
+
+        party = ""
+        for keyword in party_keywords:
+            if keyword in profile.text:
+                party = keyword
+                break
+        councilors.append(Councilor(name=name, jdName=party))
+
+    return ret_local_councilors(cid, councilors)
+
def scrap_206(
    url,
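
scrap_192, scrap_197, scrap_199, scrap_202, and scrap_204 all settle the party field the same way: scan some page text for the first known party name. A small helper along these lines could collapse that repetition; this is a sketch only, not part of the commit, and the name pick_party is hypothetical:

    # Hypothetical helper: return the first party keyword that occurs in
    # page_text, or the given default when none matches. Relies on the
    # module-level party_keywords list added in this commit.
    def pick_party(page_text: str, default: str = "정당 정보 없음") -> str:
        for keyword in party_keywords:
            if keyword in page_text:
                return keyword
        return default

With it, each per-scraper loop would reduce to party = pick_party(profile.text).
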
87 changes: 58 additions & 29 deletions scrap/local_councils/jeolla.py
@@ -7,9 +7,12 @@
    extract_party,
    find,
    findall,
+    sel_find,
    regex_pattern,
)

+party_keywords = getPartyList()
+party_keywords.append("무소속")  # also match "무소속" (independent, no party)

def scrap_154(
    url,
@@ -334,31 +337,50 @@ def scrap_167(

    # return ret_local_councilors(cid, councilors)

+def scrap_175(
+    url,
+    cid,
+    args: ArgsType = None,
+) -> ScrapResult:
+    """Hwasun, Jeollanam-do"""
+    browser = get_selenium(url)
+    councilors: list[Councilor] = []
+    for profileList in browser.find_elements(By.CSS_SELECTOR, "ul[id='councilList']"):
+        for profile in profileList.find_elements(By.CSS_SELECTOR, "ul[class='name_51']"):
+            name_tag = profile.find_element(By.TAG_NAME, "li")
+            name = name_tag.text.strip() if name_tag else "이름 정보 없음"
+
+            # follow the profile link and scan the fetched page for a party keyword
+            profile_link = sel_find(profile, "a")
+            page_content = get_selenium(profile_link.get_attribute("href")).page_source
+            party = ""
+            for keyword in party_keywords:
+                if keyword in page_content:
+                    party = keyword
+                    break
+
+            councilors.append(Councilor(name, party))
+
+    return ret_local_councilors(cid, councilors)
+
def scrap_177(
    url,
    cid,
    args: ArgsType = None,
) -> ScrapResult:
    """Gangjin, Jeollanam-do"""
-    soup = get_soup(url, verify=False)
-    councilors: List[Councilor] = []
-    mlist = soup.find_all("ul", class_="memlist")[0]
-
-    for profile in mlist.find_all("li", recursive=False):
-        info = profile.find("ul", class_="info")
-        name = (
-            info.find("h5").get_text(strip=True)
-            if info.find("h5").get_text(strip=True)
-            else "이름 정보 없음"
-        )
-
-        li = info.find_all("li", recursive=False)[6]
-        party = "정당 정보 없음"
-        party_dd = li.find("dd")
-        if party_dd:
-            party = party_dd.get_text(strip=True)
-        councilors.append(Councilor(name=name, jdName=party))
+    browser = get_selenium(url)
+    councilors: list[Councilor] = []
+    for profileList in browser.find_elements(By.CSS_SELECTOR, "ul[id='memlist']"):
+        for profile in profileList.find_elements(By.CSS_SELECTOR, "ul[class='info']"):
+            name_tag = profile.find_element(By.TAG_NAME, "h5")
+            name = name_tag.text.strip() if name_tag else "이름 정보 없음"
+            party = "정당 정보 없음"  # default kept when no keyword matches
+            for keyword in party_keywords:
+                if keyword in profile.text:
+                    party = keyword
+                    break
+            councilors.append(Councilor(name, party))

    return ret_local_councilors(cid, councilors)

@@ -369,14 +391,20 @@ def scrap_178(
    args: ArgsType = None,
) -> ScrapResult:
    """Wando, Jeollanam-do"""
-    councilors: List[Councilor] = []
-
-    result = requests.get(url)
-    result_json = result.json()
-    for profile in result_json["list"]:
-        name = profile["cmNm"]
-        party = profile["mpParty"]
-        councilors.append(Councilor(name=name, jdName=party))
+    browser = get_selenium(url)
+    councilors: list[Councilor] = []
+    for profileList in browser.find_elements(By.CSS_SELECTOR, "div[class='congressperson_list']"):
+        for profile in profileList.find_elements(By.CSS_SELECTOR, "div[class='col-lg-6']"):
+            name_tag = profile.find_element(By.TAG_NAME, "strong")
+            name = name_tag.text.strip() if name_tag else "이름 정보 없음"
+            profile_link = sel_find(profile, "a", class_="icon_btn")
+            page_content = get_selenium(profile_link.get_attribute("href")).page_source
+            party = ""
+            for keyword in party_keywords:
+                if keyword in page_content:
+                    party = keyword
+                    break
+            councilors.append(Councilor(name, party))

    return ret_local_councilors(cid, councilors)

@@ -395,9 +423,10 @@ def scrap_179(
        name = name_tag.get_text(strip=True) if name_tag else "이름 정보 없음"

        party = "정당 정보 없음"
-        party_info = profile.find("span", string="소속정당 :")
-        if party_info:
-            party = party_info.find_next("span").get_text(strip=True)
+        for keyword in party_keywords:
+            if keyword in profile.text:
+                party = keyword
+                break
        councilors.append(Councilor(name=name, jdName=party))

    return ret_local_councilors(cid, councilors)
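
The rewritten Jeolla scrapers (scrap_175, scrap_178) share one Selenium shape: find each member card, open its profile link in a second driver, and scan that page source for a party keyword. A condensed sketch of the pattern, under the same assumptions as the code above (get_selenium, sel_find, Councilor, and party_keywords come from the repo; the CSS selector is a placeholder, not a real council site's markup):

    browser = get_selenium(url)
    councilors: list[Councilor] = []
    for profile in browser.find_elements(By.CSS_SELECTOR, "div[class='member']"):  # placeholder selector
        name = sel_find(profile, "strong").text.strip()
        # open the linked profile page in a fresh driver and search its HTML
        page = get_selenium(sel_find(profile, "a").get_attribute("href")).page_source
        party = next((kw for kw in party_keywords if kw in page), "정당 정보 없음")
        councilors.append(Councilor(name, party))

Opening a new driver per profile is slow but mirrors how the committed scrapers behave; reusing one browser with back-navigation would be the obvious optimization.
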
