Commit
feat :: Completed the crawler for the Gyeongsangbuk-do Mungyeong-si public health center
s1hyun2 committed Nov 25, 2024
1 parent b9b66f4 commit 082d9c4
Showing 1 changed file with 147 additions and 0 deletions.
147 changes: 147 additions & 0 deletions Gyeongsangbuk-do/Mungyeong-si.py
@@ -0,0 +1,147 @@
import re
import os
import sys
import time
import requests
import urllib.parse
from bs4 import BeautifulSoup, Comment
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from config import babya_server

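# Keep the Chrome window open after the WebDriver session is created ("detach"
# mode) and let webdriver_manager download a chromedriver matching the browser.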
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

try:
region = "111070"
link_list = list()
current_list = list()
result_data = []

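    # Ask the babya server which policy site is registered for this region,
    # then derive the site root by trimming the "/main..." landing path.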
    site_url = requests.get(f"{babya_server}/policy/site", params={"region": region})
    response_data = site_url.json()
    base_url = response_data["data"]["policySiteUrl"]
    format_url = base_url.split("/main")[0]

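    # Page IDs already collected for this site, used below to skip
    # previously crawled pages.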
    collected_site_data = requests.get(f"{babya_server}/policy/catalog", params={"site": base_url})
    collected_list = [item["pageId"] for item in collected_site_data.json()["data"]]

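    # Menu labels to crawl: newlyweds, pregnancy/childbirth, pregnancy support,
    # infant support, multi-child family support, and the Nutrition Plus program.
    # The Korean strings below must match the page text exactly.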
    item_list = ["신혼부부", "임신.출산", "임신지원", "영유아지원", "다자녀지원사업", "영양플러스사업"]

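    # Load the menu page and collect the fourth-depth tab links whose labels
    # match item_list.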
    url = f"{format_url}/portal/contents.do?mId=0604072200"
    driver.get(url)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    for i in soup.select("#content > nav.tab_depth04 > ul.list04 > li > a"):
        if i.text in item_list:
            id_item = i.get("href").split("?mId=")[1]
            link_list.append(id_item)

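    # Visit each matched tab; if it has a fifth-depth sub-tab menu, collect
    # every sub-page ID, otherwise keep the tab's own page ID.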
    for link in link_list:
        driver.get(f"{format_url}/portal/contents.do?mId={link}")
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        elements = soup.select("div.tab_depth05")

        if elements:
            for element in elements:
                for i in element.select("ul > li > a"):
                    id_item = i.get("href").split("?mId=")[1]
                    current_list.append(id_item)
        else:
            current_list.append(link)


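    # Crawl only pages that have not been collected before.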
    page_list = set(current_list) - set(collected_list)

    for page_id in page_list:
        page_url = f"{format_url}/portal/contents.do?mId={page_id}"
        driver.get(page_url)
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

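        # Record that will be sent to the policy server for this page.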
        data_dict = {
            "title": None,
            "content": None,
            "editDate": None,
            "pageId": None,
            "site": None,
            "page": None
        }

        styles = []

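        # Drop HTML comments so they are not carried into the stored content.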
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

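        # Preserve the charset <meta> tag and stylesheet <link> tags from the
        # page head, rewriting relative stylesheet URLs against base_url.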
        for meta in soup.select("html > head > meta"):
            if meta.get("charset"):
                styles.append(str(meta))

        for link in soup.select("html > head > link"):
            rel = link.get("rel")
            if rel and rel[0] == "stylesheet":
                link_url = link.get("href")
                link["href"] = urllib.parse.urljoin(base_url, link_url)
                styles.append(str(link))

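        # Page title comes from the heading inside #tit_wrap.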
        for title in soup.select("#tit_wrap > h3"):
            data_dict["title"] = title.get_text()

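        # #conts holds the article body: make image and file-link URLs absolute,
        # then wrap the cleaned markup into a standalone HTML document.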
        for content in soup.select("#conts"):
            for img in content.find_all('img'):
                img_url = img.get("src")
                if img_url:
                    img["src"] = urllib.parse.urljoin(base_url, img_url)

            for a in content.find_all("a", href=True):
                file_url = a['href']
                a['href'] = urllib.parse.urljoin(base_url, file_url)

            styles_str = "".join(styles)
            # Collapse runs of whitespace and Latin-1 characters U+00A0-U+00FF
            # (including non-breaking spaces) into single spaces.
            content_str = re.sub(r'[\s\u00A0-\u00FF]+', " ", str(content))

            head_content = f"<head>{styles_str}</head>"
            body_content = f"<body>{content_str}</body>"

            html_content = f"<!DOCTYPE html><html>{head_content}{body_content}</html>"
            data_dict["content"] = html_content

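        # Last-edited date shown in the page's data-offer block.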
        for edit_date in soup.select("#researchDiv > div.dataOffer > dl > dd"):
            data_dict["editDate"] = edit_date.get_text().strip()

        data_dict["pageId"] = page_id
        data_dict["site"] = base_url
        data_dict["page"] = page_url

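        # Keep the record only when both a title and content were extracted.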
        if all(data_dict[key] is not None for key in ["title", "content"]):
            result_data.append(data_dict)


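    # Send everything collected in this run to the policy server in one request.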
    if result_data:
        print(f"Number of crawled pages: [{len(result_data)}]")
        policy = requests.post(f"{babya_server}/policy", json=result_data)
        print(policy.status_code)
        print(policy.text)
    else:
        print("No new policies have been published yet.")

except Exception as e:
    print(f"Error: {e}")

finally:
    # Shut down the browser and the chromedriver process exactly once;
    # quit() is safe here whether or not the crawl succeeded.
    driver.quit()
