diff --git a/Gyeongsangbuk-do/Mungyeong-si.py b/Gyeongsangbuk-do/Mungyeong-si.py
new file mode 100644
index 0000000..fab3062
--- /dev/null
+++ b/Gyeongsangbuk-do/Mungyeong-si.py
@@ -0,0 +1,147 @@
+import re
+import os
+import sys
+import time
+import requests
+import urllib.parse
+from bs4 import BeautifulSoup, Comment
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.options import Options
+
+# Make the repository root importable so config.py can be found.
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from config import babya_server
+
+chrome_options = Options()
+chrome_options.add_experimental_option("detach", True)
+driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
+
+try:
+    # Region code for Mungyeong-si.
+    region = "111070"
+    link_list = []
+    current_list = []
+    result_data = []
+
+    # Look up the policy site URL registered for this region.
+    site_response = requests.get(f"{babya_server}/policy/site", params={"region": region})
+    response_data = site_response.json()
+    base_url = response_data["data"]["policySiteUrl"]
+    format_url = base_url.split("/main")[0]
+
+    # Page IDs that have already been collected for this site.
+    collected_site_data = requests.get(f"{babya_server}/policy/catalog", params={"site": base_url})
+    collected_list = [item["pageId"] for item in collected_site_data.json()["data"]]
+
+    # Menu labels to crawl: newlyweds, pregnancy/childbirth, pregnancy support,
+    # infant support, multi-child family support, Nutrition Plus program.
+    item_list = ["신혼부부", "임신.출산", "임신지원", "영유아지원", "다자녀지원사업", "영양플러스사업"]
+
+    url = f"{format_url}/portal/contents.do?mId=0604072200"
+    driver.get(url)
+    time.sleep(2)
+    soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+    # Collect the menu IDs of the target categories from the fourth-depth tab bar.
+    for i in soup.select("#content > nav.tab_depth04 > ul.list04 > li > a"):
+        if i.text in item_list:
+            id_item = i.get("href").split("?mId=")[1]
+            link_list.append(id_item)
+
+    # For each category, collect sub-tab page IDs if present;
+    # otherwise keep the category page itself.
+    for link in link_list:
+        driver.get(f"{format_url}/portal/contents.do?mId={link}")
+        time.sleep(2)
+        soup = BeautifulSoup(driver.page_source, 'html.parser')
+        elements = soup.select("div.tab_depth05")
+
+        if elements:
+            for element in elements:
+                for i in element.select("ul > li > a"):
+                    id_item = i.get("href").split("?mId=")[1]
+                    current_list.append(id_item)
+        else:
+            current_list.append(link)
+
+    # Only crawl pages that have not been collected yet.
+    page_list = set(current_list) - set(collected_list)
+
+    for page_id in page_list:
+        page_url = f"{format_url}/portal/contents.do?mId={page_id}"
+        driver.get(page_url)
+        time.sleep(2)
+        soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+        data_dict = {
+            "title": None,
+            "content": None,
+            "editDate": None,
+            "pageId": None,
+            "site": None,
+            "page": None
+        }
+
+        styles = []
+
+        # Strip HTML comments from the page.
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        # Keep the charset declaration so the saved HTML renders correctly.
+        for meta in soup.select("html > head > meta"):
+            if meta.get("charset"):
+                styles.append(str(meta))
+
+        # Keep stylesheets, rewriting relative hrefs to absolute URLs.
+        # rel is a multi-valued attribute (a list) and may be absent.
+        for link in soup.select("html > head > link"):
+            if "stylesheet" in (link.get("rel") or []):
+                link_url = link.get("href")
+                link["href"] = urllib.parse.urljoin(base_url, link_url)
+                styles.append(str(link))
+
+        for title in soup.select("#tit_wrap > h3"):
+            data_dict["title"] = title.get_text()
+
+        for content in soup.select("#conts"):
+            # Rewrite relative image and attachment links to absolute URLs.
+            for img in content.find_all('img'):
+                img_url = img.get("src")
+                if img_url:
+                    img["src"] = urllib.parse.urljoin(base_url, img_url)
+
+            for a in content.find_all("a", href=True):
+                file_url = a['href']
+                a['href'] = urllib.parse.urljoin(base_url, file_url)
+
+            styles_str = "".join(styles)
+            # Collapse runs of whitespace; the character class also folds NBSP
+            # and other U+00A0-U+00FF characters into single spaces.
+            content_str = re.sub(r'[\s\u00A0-\u00FF]+', " ", str(content))
+
+            html_content = f"{styles_str}{content_str}"
data_dict["content"] = html_content + + for edit_date in soup.select("#researchDiv > div.dataOffer > dl > dd"): + data_dict["editDate"] = edit_date.get_text().strip() + + data_dict["pageId"] = page_id + data_dict["site"] = base_url + data_dict["page"] = page_url + + if all(data_dict[key] is not None for key in ["title", "content"]): + result_data.append(data_dict) + + + if (len(result_data) > 0): + print(f"크롤링한 페이지 개수: [{len(result_data)}]") + policy = requests.post(f"{babya_server}/policy", json=result_data) + print(policy.status_code) + print(policy.text) + + else: + print("아직 새로운 정책이 업데이트 되지 않았습니다.") + +except Exception as e: + print(f"Error: {e}") + driver.close() + sys.exit() + +finally: + driver.close() + sys.exit() + +while True: + pass \ No newline at end of file