-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharlington.py
151 lines (134 loc) · 5.68 KB
/
arlington.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import re
import json
from datetime import datetime, timedelta, time
import uuid
import traceback
import os
import sys
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from icalendar import Calendar, Event, vBinary
# Check existence of an executable ChromeDriver before doing anything else.
# Each failure prints a specific reason and exits with status 1.
DRIVER = '.\\chromedriver.exe'
if not os.path.exists(DRIVER):
    print(f'ERROR: ChromeDriver [{DRIVER}] not found')
    sys.exit(1)
if not os.path.isfile(DRIVER):
    # f-prefix is required so the actual path is shown, not the text "{DRIVER}"
    print(f'ERROR: ChromeDriver [{DRIVER}] is not a file')
    sys.exit(1)
if not os.access(DRIVER, os.X_OK):
    print(f'ERROR: ChromeDriver [{DRIVER}] is not executable')
    sys.exit(1)
# Base URL of the ticketing site; event links found on the listings page
# are appended to this to form absolute URLs.
HOST = 'https://arlingtonarts.ticketsolve.com'

# Start the ChromeDriver service explicitly and attach a remote WebDriver
# session to the URL it is listening on.
service = Service(DRIVER)
service.start()
driver = webdriver.Remote(service.service_url)

# Open the listings page, wait a fixed 5 seconds (presumably so the page has
# finished loading), then parse whatever HTML the browser currently holds.
driver.get('https://arlingtonarts.ticketsolve.com/ticketbooth/shows?i=64')
sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Calendar-level properties for the generated iCal (.ics) file, listed in the
# order they are added to the calendar.
CALENDAR_PROPERTIES = (
    ('VERSION', '2.0'),
    ('PRODID', '-//Arlington Arts Centre//'),
    ('X-WR-CALNAME', 'Arlington Arts'),
    ('X-WR-TIMEZONE', 'Europe/London'),
    ('X-WR-CALDESC', 'Events at Arlington Arts'),
    ('X-WR-RELCALTYPE', 'PUBLIC'),
    ('X-WR-RELCALNAME', 'Arlington Arts'),
    ('X-WR-RELCALURL', 'https://arlingtonarts.ticketsolve.com/'),
    ('X-WR-RELCALPRIV', 'PUBLIC'),
)

# The icalendar library builds the calendar that events are later added to
cal = Calendar()
for prop_name, prop_value in CALENDAR_PROPERTIES:
    cal.add(prop_name, prop_value)
# Matches a clock time such as "19:00" inside the details-page text.
# Compiled once here because it is reused for every listing.
TIME_PATTERN = re.compile(r'\d\d:\d\d')

# Build one calendar event per show card found on the listings page.
# For each card: read title, date and venue from the card itself, then open
# the event's own page to collect the cover image, description and start time.
listings = soup.find_all('article', attrs={'class': 'show-card'})
for listing in listings:
    event = Event()
    # Generate a unique ID for each event
    event.add('UID', uuid.uuid4())  # Unique ID for the Event
    event.add('CREATED', datetime.now())  # When the Event was created
    event.add('DTSTAMP', datetime.now())  # When the Event was last modified

    subject_node = listing.find('h2')
    subject = subject_node.text if subject_node else 'Not Stated'
    event.add('SUMMARY', subject)

    date_node = listing.find('span', attrs={'class': 'truncate'})
    if not date_node:
        # Skip just this listing (as the message says) instead of raising,
        # which would abort the run and lose every other event.
        print(f'No event date specified for [{subject}] -- skipping')
        continue
    # Parsed up front so both the timed and the date-only paths below share it.
    # An unexpected date format still raises ValueError.
    date_obj = datetime.strptime(date_node.text.strip(), '%d %b %Y')

    venue_node = listing.find('div', attrs={'class': 'flex-grow truncate'})
    if venue_node and venue_node.text:
        venue = venue_node.text.strip()
    else:
        venue = "Venue Not Known"
    event.add('LOCATION', venue)

    # Get the detailed event description from the specific event page
    event_url = listing.a.get("href")
    details_url = f'{HOST}{event_url}'
    event.add('URL', details_url)
    description = f'Details Page: {details_url}\n\n'
    driver.get(details_url)
    # This page takes a moment to load fully - would be nice to have some
    # notification of load-complete
    sleep(1)
    event_soup = BeautifulSoup(driver.page_source, 'html.parser')

    image_node = event_soup.find('img', attrs={'alt': 'cover'})
    if image_node:
        image_url = image_node.get('src')
        event.add('IMAGE', image_url)  # Does not seem to do anything
        # NOTE: a disabled experiment used to sit here that downloaded the
        # image and embedded it as a base64 ATTACH property (FMTTYPE
        # image/jpeg). It was dead code and needed urllib, which this file
        # does not import, so it has been removed.

    # Collect the description text from the direct children of the inner div
    description_node = event_soup.find(
        'div',
        attrs={'class': 'overflow-hidden max-h-60 cursor-pointer default-style'},
    )
    if description_node and description_node.div:
        parts = []
        for element in description_node.div.find_all(recursive=False):
            clean_text = element.get_text()
            if clean_text:
                parts.append(clean_text)
            elif element.name == 'br':
                # An empty <br> stands for a line break in the source HTML
                parts.append('\n')
        # Append to (rather than replace) the "Details Page" header built above
        description += ''.join(parts)
        print(f'** Final description: [{description}]]')
    else:
        description += f'No detailed description found [{description_node}] for [{subject}]'
    event.add('DESCRIPTION', f'{description}')

    # Get the start time and add to the previous date
    event_time_node = event_soup.find('span', attrs={'class': 'pl-1'})
    if event_time_node and event_time_node.text:
        # Likely to be in the form: "(Doors open 19:00)"
        time_text = TIME_PATTERN.findall(event_time_node.text)
    else:
        time_text = []

    if time_text:
        hours, minutes = time_text[0].split(':')
        full_datetime = datetime.combine(
            date_obj,
            time(hour=int(hours), minute=int(minutes)),
        )
        event.add('DTSTART', full_datetime)
        # Assume the start time is 30mins before the event, and the event is
        # 2 hours long
        event.add('DTEND', full_datetime + timedelta(hours=2, minutes=30))
    else:
        # Covers both "no time element on the page" and "element present but
        # no HH:MM in it", so every event always gets DTSTART and DTEND.
        print(f'No event time found for [{subject}] on [{date_obj}]')
        # Just add the date to the event
        event.add('DTSTART', date_obj)
        event.add('DTEND', date_obj)

    # Add the event to the calendar
    cal.add_component(event)
# Scraping is finished, so end the browser session
driver.quit()

# Serialise the calendar to arlington.ics. Opened in binary mode because
# to_ical() output is written directly; the context manager closes the file
# even if serialisation or the write raises.
with open('arlington.ics', 'wb') as ics_file:
    ics_file.write(cal.to_ical())