hs-news-archive-search.py
# This script automates a query into historic news articles and fetches headline data
# from Helsingin Sanomat news archive at https://www.hs.fi/arkisto/
# Only the headlines and their dates of publishing are returned.
# You can search for names, words, etc. appearing in the articles and choose how many headlines you want returned
# IMPORTANT: Before running, read the README file for the disclaimer and for the general terms and
# conditions of Sanoma Media Finland. Use of this script is prohibited without first obtaining
# formal consent from Sanoma Media Finland.
# NOTE: There's one issue with the website. When you load the last page of results,
# all results disappear. The only workaround I've found is to first calculate the
# maximum number of 'acceptable' 'Load More' clicks, then reload the pages and fetch
# the headline data in a second pass.
# Imports
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd
# INSERT the path to the WebDriver executable (where it is saved and will be run from)
path = "/Users/..."
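# NOTE (hedged sketch): this script uses the Selenium 3 style API (webdriver.Chrome(path),
# driver.find_element_by_xpath(...)). If you are on Selenium 4+, those calls were removed;
# the equivalents would look roughly like the commented lines below, assuming 'path' points
# to the chromedriver executable:
#     from selenium.webdriver.chrome.service import Service
#     from selenium.webdriver.common.by import By
#     driver = webdriver.Chrome(service=Service(path))
#     element = driver.find_element(By.XPATH, "...")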
# Ask for the search term to be used for the query (e.g. "koronavirus", "elon musk", "tekoäly")
search_term = str(input("Search term:\n"))
# ... and for the number of articles you want in return
no_of_articles_wanted = int(input("How many headlines?\n"))
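# Hedged sketch: int() raises a ValueError on non-numeric input, so you may want to wrap
# the prompt in a small validation loop, for example:
#     while True:
#         try:
#             no_of_articles_wanted = int(input("How many headlines?\n"))
#             break
#         except ValueError:
#             print("Please enter a whole number.")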
# Ask for the time period
# If 'Any' then all articles will be retrieved
# If 'Custom' then the start and end dates will be asked
while True:
    time_select = str(input("Select a time period ('Any' or 'Custom'): "))
    if time_select.casefold() == "any":
        period = "whenever"
        query_url = "https://www.hs.fi/haku/?query=" + search_term \
                    + "&category=kaikki&period=whenever"
        break
    elif time_select.casefold() == "custom":
        start_date = str(input("Select start date (format YYYY-MM-DD): "))
        end_date = str(input("Select end date (format YYYY-MM-DD): "))
        query_url = "https://www.hs.fi/haku/?query=" + search_term \
                    + "&category=kaikki&period=custom&order=new&startDate=" + start_date \
                    + "&endDate=" + end_date
        break
    else:
        print("Error: make sure you type either 'Any' or 'Custom'")
        continue
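# Hedged sketch: if the search term contains spaces or Finnish characters (e.g. "elon musk",
# "tekoäly"), it may need to be percent-encoded before being placed in the query string.
# The standard library can do this (whether hs.fi requires it is an assumption, not verified):
#     from urllib.parse import quote_plus
#     query_url = "https://www.hs.fi/haku/?query=" + quote_plus(search_term) \
#                 + "&category=kaikki&period=whenever"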
# Create a local WebDriver in the path defined in the beginning
driver = webdriver.Chrome(path)
# Send the query
driver.get(query_url)
# Wait for the page to load
time.sleep(5)
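# Hedged sketch: instead of a fixed sleep, an explicit wait would pause only until the
# results have appeared. The locator below is an assumption about the page structure:
#     from selenium.webdriver.support.ui import WebDriverWait
#     from selenium.webdriver.support import expected_conditions as EC
#     from selenium.webdriver.common.by import By
#     WebDriverWait(driver, 10).until(
#         EC.presence_of_element_located((By.XPATH, "//section//article")))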
# Next we need to establish the maximum number of 'acceptable' 'Load More' clicks.
# This is done by counting each press of the 'Load More' button until the button no
# longer appears (i.e. until the last page makes the results disappear).
# The following loop presses the 'Load More' button as long as it keeps appearing on the page
load_more_times = 0
while True:
    try:
        load_more_button = driver.find_element_by_xpath(
            "/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/main/section[4]/section/div[2]/section/div/button")
        load_more_button.click()
        load_more_times += 1
        time.sleep(1)
    except NoSuchElementException:
        break
time.sleep(1)
# This is how many 'Load More' clicks we can safely make in the actual run that follows
acceptable_load_more_times = load_more_times - 1
# Quit the driver
driver.quit()
# Optional
print("Acceptable Load More times: " + str(acceptable_load_more_times))
# Now we repeat the query but use the 'Acceptable Load More Times'
# and also run the actual scraper.
# Create a local WebDriver in the path defined in the beginning
driver = webdriver.Chrome(path)
# Send the same query
driver.get(query_url)
# Wait for the page to load
time.sleep(5)
# Load all pages (except for the last one which makes all results disappear)
for i in range(acceptable_load_more_times):
    load_more_button = driver.find_element_by_xpath(
        "/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/main/section[4]/section/div[2]/section/div/button")
    load_more_button.click()
    time.sleep(1)
# Now that all pages are loaded, we fetch all the headlines and their dates of publishing
# and append a dictionary for each entry to our list.
# We have to use try/except because the articles follow several different XPath patterns...
# These should capture around 99% of all headlines
articles = []
article_num = 1
for i in range(no_of_articles_wanted):
    try:
        try:
            headline = driver.find_element_by_xpath(
                "/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/main/section[4]/section/div[2]/section/article["
                + str(article_num) + "]/a/section/div[1]/div[2]/h2/span[2]").text
        except NoSuchElementException:
            try:
                headline = driver.find_element_by_xpath(
                    "/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/main/section[4]/section/div[2]/section/article["
                    + str(article_num) + "]/a/section/div[1]/div[2]/h2/span").text
            except NoSuchElementException:
                try:
                    headline = driver.find_element_by_xpath(
                        "/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/main/section[4]/section/div[2]/section/article["
                        + str(article_num) + "]/a/section/div[1]/div/h2/span[2]").text
                except NoSuchElementException:
                    headline = driver.find_element_by_xpath(
                        "/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/main/section[4]/section/div[2]/section/article["
                        + str(article_num) + "]/a/section/div[1]/div/h2/span").text
        try:
            published_time = driver.find_element_by_xpath(
                "/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/main/section[4]/section/div[2]/section/article["
                + str(article_num) + "]/a/section/div[2]/div[1]/time").get_attribute("datetime")
        except NoSuchElementException:
            published_time = driver.find_element_by_xpath(
                "/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/main/section[4]/section/div[2]/section/article["
                + str(article_num) + "]/a/section/div[2]/div[2]/time").get_attribute("datetime")
        articles.append({"Article No": article_num, "Published": published_time, "Headline": headline})
        article_num += 1
    except NoSuchElementException:
        article_num += 1
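# Hedged alternative sketch: instead of one absolute XPath per article, you could grab all
# article elements in a single call and search inside each one with relative selectors.
# The selectors below are assumptions about the result markup, not verified against hs.fi:
#     for art in driver.find_elements_by_css_selector("article"):
#         headline = art.find_element_by_css_selector("h2").text
#         published_time = art.find_element_by_css_selector("time").get_attribute("datetime")
#         articles.append({"Published": published_time, "Headline": headline})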
# Read the data into a data frame
articles_df = pd.DataFrame(articles)
# Optional
print("Number of articles found: " + str(articles_df.shape[0]))
# Optional
print(articles_df.head())
# Create a CSV file, and name it according to the search term
file_name = "Helsingin Sanomat Articles for " + search_term + ".csv"
articles_df.to_csv(file_name)
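# Hedged note: to_csv also accepts index=False (drops the pandas index column) and an
# encoding argument; "utf-8-sig" helps Excel display ä/ö correctly, e.g.:
#     articles_df.to_csv(file_name, index=False, encoding="utf-8-sig")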
# Wait 10 seconds and quit the driver
time.sleep(10)
driver.quit()