script.py
#!/usr/bin/env python3
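"""Scrape CurseForge for modpacks that depend on Logistics Pipes.

Walks the paginated "dependents" listing, then visits each modpack's files page
and records its name, project ID, creation/update dates, total downloads, and
supported game versions in data.json.
"""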
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import json
def get_modpack_paths_list(url):
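    """Walk the paginated dependents listing at `url` and return each modpack's project path."""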
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    pagination_items = soup.find_all('a', class_='pagination-item')
    num_pages = pagination_items[-1].text
    return_value = []
    # Pages are numbered 1..num_pages, so include the final page in the range.
    for page_num in range(1, int(num_pages) + 1):
        new_page_url = url + '&page=' + str(page_num)
        driver.get(new_page_url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        modpack_listing_rows = soup.find_all('li', class_='project-listing-row')
        for listing_row in modpack_listing_rows:
            # Expected list of 'a' tags for modpack_urls:
            #   0  - modpack icon
            #   1  - modpack name
            #   2  - modpack author name
            #   3+ - modpack category icons
            modpack_urls = listing_row.find_all('a')
            path = modpack_urls[1]['href']  # TODO: do the .parent.findNext() thing on this later for robustness
            return_value.append(path)
    return return_value
def write_json(data, filename='data.json'):
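    """Overwrite `filename` with `data` serialized as pretty-printed JSON."""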
    with open(filename, 'w+') as f:
        json.dump(data, f, indent=4)
def initialize_json_file():
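    """Create (or reset) data.json with an empty 'modpack_metadata' list."""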
    with open('data.json', 'w') as fp:
        data = {}
        data['modpack_metadata'] = []
        json.dump(data, fp)
def get_license_path(project_id):
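    """Return the relative path of a project's license page (defined but currently unused)."""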
    return '/project/' + project_id + '/license'
def scrape_modpack_pages(paths):
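    """Visit each modpack's '/files/all' page and append its metadata to data.json."""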
    ALL_FILES_REL_PATH = '/files/all'
    initialize_json_file()
    for path in paths:
        webdriver_load_from_path(path + ALL_FILES_REL_PATH)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        aside = soup.find('aside')
        header = soup.find('header')
        # Header
        project_name = header.find('h2').contents[0]
        # Aside
        project_id = aside.find(text=re.compile('Project ID')).parent.find_next('span').contents[0]
        date_created = aside.find(text=re.compile('Created')).parent.find_next('span').find('abbr', class_='standard-datetime')['data-epoch']
        date_updated_tag = aside.find(text=re.compile('Updated')).parent.find_next('span').find('abbr', class_='standard-datetime')
        try:
            date_updated = date_updated_tag['data-epoch']
        except (TypeError, KeyError):
            # Skip projects whose "Updated" timestamp cannot be read.
            continue
        total_downloads = aside.find(text=re.compile('Total Downloads')).parent.find_next('span').contents[0]
        total_downloads = int(total_downloads.replace(',', ''))
        # TODO: collect "members" data
        # Files (for a list of game versions)
        try:
            table = soup.find_all('table')[0]
        except IndexError:
            # No files table on this page; skip the project.
            continue
        header_row = table.find_all('th')
        rows = table.find_all('td')
        # Find which column of the files table holds the game version.
        version_col = -1
        for col_counter in range(0, len(header_row)):
            header_text = header_row[col_counter].get_text().strip()
            if 'version' in header_text.lower():
                version_col = col_counter
        game_versions = set()
        # find_all('td') flattens every table row into one list, so step through it
        # one row at a time (assuming each row has len(header_row) cells) and pick
        # the version cell of each row.
        if version_col != -1:
            for cell_index in range(version_col, len(rows), len(header_row)):
                game_versions.add(rows[cell_index].get_text().strip().split('\n')[0].strip())
        metadata = {
            'project_name'   : project_name,
            'project_id'     : project_id,
            'date_created'   : date_created,
            'date_updated'   : date_updated,
            'total_downloads': total_downloads,
            'game_versions'  : list(game_versions)
        }
        with open('data.json', 'r+') as json_file:
            data = json.load(json_file)
            data['modpack_metadata'].append(metadata)
            write_json(data)
def webdriver_load_from_path(path):
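    """Load a CurseForge page given its path relative to CURSEFORGE_DOMAIN."""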
    driver.get(CURSEFORGE_DOMAIN + path)
if __name__ == "__main__":
    CURSEFORGE_DOMAIN = 'https://www.curseforge.com'
    DEPENDENTS_URL = 'https://www.curseforge.com/minecraft/mc-mods/logistics-pipes/relations/dependents'
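    # Note: get_modpack_paths_list() appends '&page=N' to this URL, which assumes the URL
    # already carries a query string; if it does not, '?page=' may be needed instead.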
    option = webdriver.ChromeOptions()
    chrome_prefs = {}
    option.experimental_options["prefs"] = chrome_prefs
    # Disable images
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    chrome_prefs["profile.managed_default_content_settings"] = {"images": 2}
    # Disable Javascript
    chrome_prefs['profile.managed_default_content_settings.javascript'] = 2
    driver = webdriver.Chrome(options=option)
    driver.implicitly_wait(10)
    # TODO: put get_modpack_paths_list() and scrape_modpack_pages() into 2 separate scripts maybe?
    modpack_paths = get_modpack_paths_list(DEPENDENTS_URL)
    scrape_modpack_pages(modpack_paths)
    driver.quit()