# clone.py (forked from ShoufaChen/clone-anonymous4open)
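"""Clone a repository from https://anonymous.4open.science.

The site renders a directory as a page of <div class="tree"> entries and a
file as a <div class="blob"> entry; each file page embeds the source in a
<code> block, which this script scrapes and writes to a local mirror.
"""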
import argparse
import os
import re
import urllib.request as urllib2

from bs4 import BeautifulSoup

# Pattern that appears in the page when a file is too big to be anonymized.
too_big_pattern = re.compile('too big to be anonymized')


def parse_args():
    parser = argparse.ArgumentParser(description='Clone from https://anonymous.4open.science')
    parser.add_argument('--clone-dir', type=str, default='master',
                        help='local directory to clone into')
    parser.add_argument('--target', type=str,
                        help='anonymous link you want to clone')
    return parser.parse_args()


def create_dir(name):
    if not os.path.exists(name):
        os.mkdir(name)


def pull_html(url):
    # Spoof a browser User-Agent; some servers reject urllib's default one.
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib2.urlopen(req).read()
    except urllib2.URLError as e:
        print(e)
        print(url)
        raise  # without this, `response` below would be unbound
    content = response.decode('utf-8')
    soup = BeautifulSoup(content, "lxml")
    return soup
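
# Note: the "lxml" parser requires the third-party lxml package; BeautifulSoup's
# built-in "html.parser" works as a drop-in fallback if lxml is unavailable.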


def pull_trees(url):
    folder_soup = pull_html(url)
    trees = folder_soup.find_all('div', attrs={'class': 'tree'})
    return trees


def pull_blobs(url):
    blobs_soup = pull_html(url)
    blobs = blobs_soup.find_all('div', attrs={'class': 'blob'})
    return blobs
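
# Sketch of the markup these selectors assume (illustrative, not copied from
# the site): a directory page lists folders as
#   <div class="tree"><a href="/r/<id>/subdir">subdir</a></div>
# and files as
#   <div class="blob"><a href="/r/<id>/subdir/file.py">file.py</a></div>
# Splitting such an href on '/' and dropping the first three components
# ('', 'r', '<id>') recovers the path relative to the repository root, which
# is how clone_file and clone_dirs below build local paths.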


def clone_file(url, download, root_url='https://anonymous.4open.science'):
    blobs = pull_blobs(root_url + url)
    for blob in blobs:
        href = blob.a.get('href')
        split_href = href.split('/')
        file_name = '/'.join([download] + split_href[3:])
        print('Clone... ', file_name)
        # Markdown files and LICENSE are not supported yet.
        if split_href[-1].split('.')[-1] == 'md' or split_href[-1] == 'LICENSE':
            continue
        blob_soup = pull_html(root_url + href)
        # Skip files that are too big to be anonymized.
        if len(blob_soup.body.find_all(string=too_big_pattern)) != 0:
            print(f'WARNING: Skipping {file_name} as it is too big to be anonymized')
            continue
        source_code = blob_soup.find('code')
        if source_code is None:
            print(f'WARNING: Skipping {file_name}: no <code> block found')
            continue
        with open(file_name, 'w') as f:
            f.write(source_code.get_text())


def clone_dirs(url, folders_url_list, download, root_url='https://anonymous.4open.science'):
    trees = pull_trees(root_url + url)
    for t in trees:
        href = t.a.get('href')
        split_href = href.split('/')
        folder_name = '/'.join([download] + split_href[3:-1])
        print('Clone... ', folder_name)
        create_dir(folder_name)
        folders_url_list.append(href)
    folders_url_list.remove(url)
    return folders_url_list
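
# Traversal note: clone_dirs appends every child folder's href to the work
# list and removes the url it just expanded, so repeatedly taking the head of
# the list (as the loop below does) performs a breadth-first walk that
# terminates once the list is empty.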


if __name__ == '__main__':
    args = parse_args()
    assert args.target, '\nPlease specify your target URL, \n e.g.: ' \
        + 'python clone.py --target https://anonymous.4open.science/r/840c8c57-3c32-451e-bf12-0e20be300389/'
    root_url = 'https://anonymous.4open.science'
    target_url = args.target.replace(root_url, '')
    create_dir(args.clone_dir)
    folders_url_list = [target_url]
    clone_file(target_url, args.clone_dir)
    folders_url_list = clone_dirs(target_url, folders_url_list, args.clone_dir)
    while len(folders_url_list):
        url = folders_url_list[0]
        clone_file(url, args.clone_dir)
        folders_url_list = clone_dirs(url, folders_url_list, args.clone_dir)
    print('=' * 60)
    print('Successfully cloned to: {}'.format(args.clone_dir))
    print('=' * 60)
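
# Example invocation (a sketch; the repository id is the sample from the
# assert message above, and 'my_repo' is an arbitrary destination directory):
#   python clone.py \
#       --target https://anonymous.4open.science/r/840c8c57-3c32-451e-bf12-0e20be300389/ \
#       --clone-dir my_repo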