scrape_rcsr.py
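"""Scrape the RCSR database (http://rcsr.anu.edu.au) with Selenium.

For every topology listed under 'nets' (3D) and 'layers' (2D), collect
the space group, the cell parameters, and the vertex and edge tables,
and write the results to rcsr_3D.csv and rcsr_2D.csv respectively.
"""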
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
import pandas as pd

# SETTINGS
webdriver_path = r"/path/to/chromedriver.exe"  # change me!
# ^Download from: https://chromedriver.chromium.org/

driver = Chrome(service=Service(executable_path=webdriver_path))
url_tags = ['nets', 'layers']  # 'nets' are 3D topologies, 'layers' are 2D
print('Scraping topology names...')
for url_tag in url_tags:
    url = 'http://rcsr.anu.edu.au/' + url_tag + '#details'
    driver.get(url)
    time.sleep(10)  # give the React app time to render the list
    # Open the alphabetical list of topology symbols
    driver.find_elements(By.XPATH, '//*[@id="react-main"]/div/div[2]/ul/li[2]/div/ul/li[1]/a')[0].click()
    # Collect every symbol shown, rather than assuming a fixed count
    name_links = driver.find_elements(By.XPATH, '//*[@id="react-main"]/div/div[2]/ul/li[2]/div/div/ul/li/a')
    topologies = [link.text for link in name_links]
    print('Scraping topology data...')
    rows = []
    for i, top in enumerate(topologies):
        # Detail pages live under the same section as the listing
        url = 'http://rcsr.anu.edu.au/' + url_tag + '/' + top
        driver.get(url)
        # The first detail page needs extra time for the app to load
        if i == 0:
            time.sleep(10)
        else:
            time.sleep(1)
        spacegroup = driver.find_elements(By.XPATH, '//*[@id="react-main"]/div/div[2]/table[1]/tbody/tr/td[2]')[0].text
        # Six cell parameters: a, b, c, alpha, beta, gamma
        cellparams = []
        for j in range(1, 7):
            cellparams.append(float(driver.find_elements(By.XPATH, '//*[@id="react-main"]/div/div[2]/table[2]/tbody/tr/td[' + str(j) + ']')[0].text))
        # Vertex table: one integer column (column 2) plus three numeric columns per vertex
        n_vertices = int(driver.find_elements(By.XPATH, '//*[@id="react-main"]/div/div[2]/div[2]/p/span/span[2]')[0].text)
        all_vert_info = []
        for j in range(1, n_vertices + 1):
            vert_info = [int(driver.find_elements(By.XPATH, '//*[@id="react-main"]/div/div[2]/div[2]/table[1]/tbody/tr[' + str(j) + ']/td[2]')[0].text)]
            for k in range(3, 6):
                vert_info.append(float(driver.find_elements(By.XPATH, '//*[@id="react-main"]/div/div[2]/div[2]/table[1]/tbody/tr[' + str(j) + ']/td[' + str(k) + ']')[0].text))
            all_vert_info.append(vert_info)
        # Edge table: three numeric columns per edge
        n_edges = int(driver.find_elements(By.XPATH, '//*[@id="react-main"]/div/div[2]/div[3]/p/span/span[2]')[0].text)
        all_edge_info = []
        for j in range(1, n_edges + 1):
            edge_info = []
            for k in range(2, 5):
                edge_info.append(float(driver.find_elements(By.XPATH, '//*[@id="react-main"]/div/div[2]/div[3]/table/tbody/tr[' + str(j) + ']/td[' + str(k) + ']')[0].text))
            all_edge_info.append(edge_info)
        rows.append({'topology': top, 'spacegroup': spacegroup, 'cellparams': str(cellparams), 'vertices': str(all_vert_info), 'edges': str(all_edge_info)})
    # Write one CSV per section so 3D and 2D topologies stay separate
    df = pd.DataFrame(rows)
    if url_tag == 'nets':
        df.to_csv('rcsr_3D.csv')
    elif url_tag == 'layers':
        df.to_csv('rcsr_2D.csv')

driver.quit()  # quit() also shuts down the chromedriver process
print('Done!')
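
# The list-valued columns are stored as their string representations.
# To recover them when reading the CSVs back, they can be parsed with
# ast.literal_eval -- a minimal sketch, assuming the files written above:
#
#   import ast
#   import pandas as pd
#
#   df = pd.read_csv('rcsr_3D.csv', index_col=0)
#   for col in ('cellparams', 'vertices', 'edges'):
#       df[col] = df[col].apply(ast.literal_eval)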