-
Notifications
You must be signed in to change notification settings - Fork 1
/
hwp_crawling.py
executable file
·47 lines (42 loc) · 1.63 KB
/
hwp_crawling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#-*- coding: utf-8 -*-
import requests
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
def getNAMFileInfo(html, kind):
    """Build the ``fn_fileDown(...)`` JavaScript calls for the files listed in *html*.

    Every anchor under ``#ajaxResult`` whose ``href`` mentions ``fn_pdfPopup``
    is inspected: the single-quoted arguments are pulled out of the href and
    the first two are turned into a ``fn_fileDown('<arg0>', '<arg1>')`` call
    string. Anchors that carry exactly the same argument list are emitted
    only once. Anchors without an ``href`` or with fewer than two quoted
    arguments are skipped.

    Args:
        html: HTML markup to parse.
        kind: ``"HWP"`` increments the second argument by one while keeping
            its zero-padded width; any other value uses it unchanged.

    Returns:
        List of JavaScript call strings, in document order.

    Raises:
        ValueError: if *kind* is ``"HWP"`` and the second argument is not an
            integer literal.
    """
    soup = BeautifulSoup(html, 'html.parser')
    fileInfoList = []
    # Track the full argument tuples rather than hash(frozenset(...)): a
    # frozenset ignores argument order and repeats, and comparing bare hash
    # values can merge distinct entries.
    seen = set()
    for link in soup.select('#ajaxResult a'):
        # Anchors without an href are skipped rather than raising KeyError.
        href = link.get('href') or ''
        if 'fn_pdfPopup' not in href:
            continue
        fileInfo = re.findall(r"'(.*?)'", href)
        if len(fileInfo) < 2:
            # Not enough arguments to build a download call.
            continue
        key = tuple(fileInfo)
        if key in seen:
            continue
        seen.add(key)
        print(key)
        if kind == "HWP":
            # NOTE(review): presumably the HWP file id is the popup id + 1;
            # confirm against the site's own scripts.
            fileId = str(int(fileInfo[1]) + 1).zfill(len(fileInfo[1]))
        else:
            fileId = fileInfo[1]
        fileInfoList.append("fn_fileDown('" + fileInfo[0] + "', '" + fileId + "')")
    print(fileInfoList)
    return fileInfoList
def getNationalAssemblyMinutes(url, kind,
                               driverPath='/Users/apple/Library/WebDriver/chromedriver'):
    """Open *url* in Chrome and run the file-download script for each listed file.

    The page is loaded, its ``fn_movePage('1000', '1')`` script is executed,
    the resulting ``<body>`` markup is handed to ``getNAMFileInfo`` and every
    JavaScript call string it returns is executed in the browser.

    Args:
        url: Address of the listing page to open.
        kind: ``"PDF"`` or ``"HWP"``; forwarded to ``getNAMFileInfo``.
        driverPath: Filesystem path of the chromedriver executable. Defaults
            to the location that was previously hard-coded.
    """
    chromeOptions = webdriver.ChromeOptions()
    # ChromeDriver's "detach" option leaves the browser running after the
    # driver process ends, which is why the driver is not quit here.
    chromeOptions.add_experimental_option("detach", True)
    # NOTE(review): the positional driver path and the chrome_options keyword
    # are kept as in the original; confirm they match the installed Selenium.
    driver = webdriver.Chrome(driverPath, chrome_options=chromeOptions)
    driver.get(url)
    # NOTE(review): fn_movePage is defined by the page; the arguments look
    # like (rows per page, page number) - confirm.
    driver.execute_script("fn_movePage('1000', '1')")
    # find_element(by, value) is available in both Selenium 3 and 4, unlike
    # the find_element_by_* helpers.
    body = driver.find_element('css selector', 'body')
    html = body.get_attribute('innerHTML')
    for script in getNAMFileInfo(html, kind):
        driver.execute_script(script)
# Run the crawl against the minutes listing page, requesting the HWP variant.
# NOTE(review): this executes on import as well as when run as a script;
# consider an ``if __name__ == "__main__":`` guard.
getNationalAssemblyMinutes(
    url="http://likms.assembly.go.kr/record/mhs-30-011.do",
    kind="HWP",
)