Skip to content

Commit

Permalink
Merge pull request #62 from NewWays-TechForImpactKAIST/feat-scraping-…
Browse files Browse the repository at this point in the history
…advance

scraping 스크립트 고도화
  • Loading branch information
keonly authored Nov 22, 2023
2 parents 35e6139 + eb70de2 commit 8ec671d
Show file tree
Hide file tree
Showing 11 changed files with 373 additions and 98 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -222,4 +222,5 @@ pyrightconfig.json

/_data
/output
__pycache__
__pycache__
/logs
6 changes: 3 additions & 3 deletions db/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ class CouncilType(str, Enum):
의회의 종류를 나타내는 열거형입니다.
"""

LOCAL_COUNCIL = "local_council"
NATIONAL_COUNCIL = "national_council"
METROPOLITAN_COUNCIL = "metropolitan_council"
LOCAL_COUNCIL = "local_councilor"
NATIONAL_COUNCIL = "national_councilor"
METROPOLITAN_COUNCIL = "metropolitan_councilor"
LOCAL_LEADER = "local_leader"
METRO_LEADER = "metro_leader"
"""
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ gspread==5.11.2
pymongo==4.5.0
python-dotenv==1.0.0
openpyxl
selenium
selenium
tqdm
6 changes: 3 additions & 3 deletions scrap/local_councils/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def sel_getname(profile, element, class_, wrapper_element, wrapper_class_):
if keyword in name: # 인천 서구 등
name = name.replace(keyword, "").strip()
break
print(name, "is name\n")
# print(name, "is name\n")
maybe_name = name.split()[0] # 이름 뒤에 직책이 따라오는 경우
if len(maybe_name) == 1: # 외자 이름이 띄어쓰기 때문에 분리된 경우
name = "".join(name.split()[0:2])
Expand Down Expand Up @@ -266,7 +266,7 @@ def scrap_basic(url, cid, args: ScrapBasicArgument, encoding="utf-8") -> ScrapRe
profiles = getprofiles(
soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls
)
print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.
# print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.

for profile in profiles:
name = party = ""
Expand Down Expand Up @@ -314,7 +314,7 @@ def sel_scrap_basic(url, cid, args: ScrapBasicArgument) -> ScrapResult:
profiles = sel_getprofiles(
browser, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls
)
print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.
# print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.

for profile in profiles:
name = party = ""
Expand Down
4 changes: 2 additions & 2 deletions scrap/local_councils/gyeonggi.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def scrap_88(url, cid, args: ScrapBasicArgument) -> ScrapResult:
profiles = get_profiles_88_103(
soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls
)
print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.
# print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.

for profile in profiles:
name = getname(
Expand Down Expand Up @@ -153,7 +153,7 @@ def scrap_103(url, cid, args: ScrapBasicArgument) -> ScrapResult:
profiles = get_profiles_88_103(
soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls
)
print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.
# print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.

for profile in profiles:
name = getname(
Expand Down
2 changes: 1 addition & 1 deletion scrap/local_councils/incheon.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def scrap_57(url, args) -> ScrapResult:
profiles = getprofiles(
soup, args.pf_elt, args.pf_cls, args.pf_memlistelt, args.pf_memlistcls
)
print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.
# print(cid, "번째 의회에는,", len(profiles), "명의 의원이 있습니다.") # 디버깅용.

for profile in profiles:
name = getname(
Expand Down
148 changes: 83 additions & 65 deletions scrap/utils/database.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

from db.client import client
from configurations.secrets import MongoDBSecrets

Expand All @@ -7,74 +9,90 @@
import json


# Note: MongoDB automatically creates a database if it does not exist yet.
# A MongoDB database consists of one or more collections.
# A collection consists of one or more documents.
db = client[str(MongoDBSecrets.database_name)]


def save_to_database(record: ScrapResult) -> bool:
    """
    Synchronise the scraping result of one council with the database.

    The collection is selected by ``str(record.council_type)``. Councilors
    that are already stored are updated in place, and stored councilors
    that no longer appear in ``record`` are deleted. Councilors that are
    not stored yet are detected but not written (see the TODO below).

    :param record: scraping result for one council
    :return: ``True`` once all updates and removals have been issued;
        database errors are not caught here and propagate to the caller
    """
    db = client[str(MongoDBSecrets.database_name)]
    collection = db[str(record.council_type)]

    cid = record.council_id

    new_councilors = check_new_councilors(collection, record)
    resigned_councilors = check_resigned_councilors(collection, record)

    # Build the name lookup once instead of once per scraped councilor.
    resigned_names = {resigned["name"] for resigned in resigned_councilors}
    other_councilors = [
        councilor
        for councilor in record.councilors
        if councilor not in new_councilors
        and councilor.name not in resigned_names
    ]

    # TODO: handle councilors that were not in the DB before
    update_councilors(collection, cid, other_councilors)
    remove_councilors(collection, resigned_councilors)

    return True


def check_new_councilors(collection, record: ScrapResult) -> list[Councilor]:
    """
    Return the scraped councilors that have no matching document in the DB.

    A councilor is treated as new when ``collection.find_one`` yields
    ``None`` for the pair ``localId`` = ``record.council_id`` and
    ``name`` = the councilor's name.

    :param collection: MongoDB collection to look the councilors up in
    :param record: scraping result for one council
    :return: councilors of ``record`` that are not stored yet, in scrape order
    """

    def is_missing(member) -> bool:
        # One lookup per scraped councilor, keyed by council id and name.
        query = {"localId": record.council_id, "name": member.name}
        return collection.find_one(query) is None

    return [member for member in record.councilors if is_missing(member)]


def check_resigned_councilors(collection, record: ScrapResult) -> list[dict]:
    """
    Return the stored councilors that no longer appear in the scrape result.

    Every document returned by ``collection.find`` for
    ``localId`` = ``record.council_id`` whose ``name`` is absent from
    ``record.councilors`` is reported as resigned.

    :param collection: MongoDB collection holding the stored councilors
    :param record: scraping result for one council
    :return: stored documents of councilors missing from ``record``
    """
    # Build the lookup once; rebuilding a list for every stored document
    # made the check quadratic for no benefit.
    scraped_names = {councilor.name for councilor in record.councilors}

    return [
        stored
        for stored in collection.find({"localId": record.council_id})
        if stored["name"] not in scraped_names
    ]


def update_councilors(collection, cid: int, councilors: List[Councilor]) -> None:
    """
    Refresh the stored data of councilors that are neither new nor resigned.

    For each councilor, the document matching ``localId`` = ``cid`` and the
    councilor's ``name`` has its fields set from ``asdict(councilor)``.
    ``upsert=False`` is passed, so no insert is requested for missing documents.

    :param collection: MongoDB collection to update
    :param cid: council id used as the ``localId`` filter value
    :param councilors: councilors whose stored data should be refreshed
    """
    for member in councilors:
        selector = {"localId": cid, "name": member.name}
        changes = {"$set": asdict(member)}
        collection.update_one(selector, changes, upsert=False)


def remove_councilors(collection, councilors: List[Councilor]) -> None:
    """
    Delete resigned councilors from the database.

    One ``delete_one`` call is issued per entry, filtered by the entry's
    ``localId`` and ``name``.

    NOTE(review): the parameter is annotated ``List[Councilor]``, but each
    entry is read with ``["localId"]`` / ``["name"]``, so mapping-like
    items (e.g. stored documents) are expected. Confirm against callers.

    :param collection: MongoDB collection to delete from
    :param councilors: entries describing the councilors to remove
    """
    for resigned in councilors:
        target = {key: resigned[key] for key in ("localId", "name")}
        collection.delete_one(target)
if result is not None:
before_councilors = result["councilors"] # List[dict]
updated_councilors = []
updated_names = set()

name_data_map_for_update = {d.name: asdict(d) for d in record.councilors}

for d in before_councilors:
if d["name"] in name_data_map_for_update:
d.update(
{
k: v
for k, v in name_data_map_for_update[d["name"]].items()
if k in d
}
)
updated_names.add(d["name"])
updated_councilors.append(d)

for d in record.councilors:
if d.name not in updated_names:
updated_councilors.append(asdict(d))

collection.find_one_and_update(
{"council_id": record.council_id},
{"$set": {"councilors": updated_councilors}},
upsert=True,
)
else:
return False

return True
except Exception as e:
t
print(e)
return False


# Manual smoke test: builds a sample ScrapResult with three councilors and
# prints the value returned by save_to_database for it.
if __name__ == "__main__":
    test_record = ScrapResult(
        council_id="test-test",
        council_type=CouncilType.LOCAL_COUNCIL,
        councilors=[
            Councilor(name="김철수", jdName="국민의힘"),
            Councilor(name="김영희", jdName="Birthday Party"),
            Councilor(name="테스트", jdName="테스트당"),
        ],
    )
    print(save_to_database(test_record))
39 changes: 39 additions & 0 deletions scrap/utils/export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os
import json
from dataclasses import asdict

from scrap.utils.types import ScrapResult, ScrapBasicArgument


def export_results_to_json(
    results: dict[int, ScrapResult], output_path: str, current_time: str
):
    """
    Write all scraping results to ``scraping_result_<current_time>.json``.

    The output directory is created if needed. The file holds one entry per
    council id, each a list of the council's councilors converted with
    ``asdict``. Non-ASCII text is written as-is, indented by four spaces.

    :param results: scraping results keyed by council id
    :param output_path: directory the JSON file is written into
    :param current_time: timestamp string embedded in the file name
    """
    os.makedirs(output_path, exist_ok=True)

    serializable = {}
    for cid, scrap_result in results.items():
        serializable[cid] = [asdict(member) for member in scrap_result.councilors]

    target = os.path.join(output_path, f"scraping_result_{current_time}.json")
    with open(target, "w", encoding="utf-8") as out_file:
        json.dump(serializable, out_file, ensure_ascii=False, indent=4)


def export_results_to_txt(
    results: dict[int, ScrapResult], output_path: str, current_time: str
):
    """
    Write all scraping results to ``scraping_result_<current_time>.txt``.

    The output directory is created if needed. Each council produces one
    record of the form ``| <cid> | <councilors>``, where the councilors are
    the ``to_txt()`` renderings joined by newlines.

    :param results: scraping results keyed by council id
    :param output_path: directory the text file is written into
    :param current_time: timestamp string embedded in the file name
    """
    os.makedirs(output_path, exist_ok=True)

    target = os.path.join(output_path, f"scraping_result_{current_time}.txt")
    with open(target, "w", encoding="utf-8") as f:
        for cid, result in results.items():
            # Keep the councilor objects: converting them to plain dicts
            # first removed the to_txt() method this line depends on.
            councilors = "\n".join(c.to_txt() for c in result.councilors)
            f.write(f"| {cid} | {councilors}\n")
Loading

0 comments on commit 8ec671d

Please sign in to comment.