-
Notifications
You must be signed in to change notification settings - Fork 1
/
getalb.py
209 lines (171 loc) · 6.78 KB
/
getalb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env python
"""getalb
Script for downloading entire albums at once from http://musicmp3spb.org,
as opposed to downloading songs manually one by one.
Downloaded files are stored in 'getalb/music/' directory.
Script works with Python 2.7.
Script uses BeautifulSoup library for parsing html pages.
"""
__author__ = 'Oleg Esenkov ([email protected])'
__copyright__ = 'Copyright (c) 2015 Oleg Esenkov'
__license__ = 'The MIT License (MIT)'
__version__ = '0.2.5'
import argparse
import os
import re
import sys
import time
import urllib
import urllib2
from bs4 import BeautifulSoup
TARGET_SITE = 'http://musicmp3spb.org'
def get_url():
    """Obtain the url of the album to download.

    The url is taken from the optional ``album_url`` positional command
    line argument. When that argument is missing or empty, the user is
    prompted to type the url on standard input instead.

    Returns:
        The album url as a string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('album_url', nargs='?', default=None,
                        help='full album url from ' + TARGET_SITE)
    cli_url = parser.parse_args().album_url
    # Fall back to an interactive prompt when nothing usable was passed.
    return cli_url or raw_input('Full album url from ' + TARGET_SITE + ':\n')
def web_request(url, form_data=None):
"""Return response object to GET or POST request."""
if form_data: # Should be encoded only if not None
form_data = urllib.urlencode(form_data)
request = urllib2.Request(url, form_data)
connect_count = 0
connect_attempts = 5
while connect_count < connect_attempts:
try:
response = urllib2.urlopen(request)
return response
except urllib2.URLError:
print 'Connection failed. Trying again...'
connect_count += 1
time.sleep(1)
except ValueError:
raw_input('Wrong url. Could not connect. Press ENTER to exit.')
sys.exit()
raw_input('Check your internet connection. Press ENTER to exit.')
sys.exit()
def parse_page(response):
    """Read the whole body of *response* and parse it with BeautifulSoup.

    Args:
        response: Object with a ``read()`` method returning the html text.

    Returns:
        The ``BeautifulSoup`` object built from the page.
    """
    return BeautifulSoup(response.read())
def get_artist_name(parsed_album_page):
    """Return the artist name found on a parsed album page.

    The name is the text of the link inside the first ``<h1>`` of the
    ``<div id="cntCenter">`` element, with every ' mp3' occurrence removed.

    Args:
        parsed_album_page: Parsed album page (BeautifulSoup object).

    Returns:
        The artist name string.

    If the expected elements are not on the page, the user is told that
    the url is wrong, asked to press ENTER, and the script exits.
    """
    try:
        content_div = parsed_album_page.find('div', {'id': 'cntCenter'})
        first_heading = content_div.find_all('h1')[0]
        return first_heading.a.string.replace(' mp3', '')
    except (AttributeError, IndexError):
        # AttributeError: one of the lookups produced None.
        # IndexError: the content div holds no <h1> at all.
        raw_input('Wrong url. No album found. Press ENTER to exit.')
        sys.exit()
def get_album_name(parsed_album_page):
    """Return the album name found on a parsed album page.

    The name is the first child of the first ``<div>`` with class ``Name``,
    with newline characters removed.

    Args:
        parsed_album_page: Parsed album page (BeautifulSoup object).

    Returns:
        The album name string.

    If the div is missing or empty, the user is told that the url is
    wrong, asked to press ENTER, and the script exits.
    """
    try:
        album_name_div = parsed_album_page.find('div', 'Name')
        return album_name_div.contents[0].replace('\n', '')
    except (AttributeError, IndexError):
        # AttributeError: no such div was found (lookup produced None).
        # IndexError: the div exists but has no children.
        raw_input('Wrong url. No album found. Press ENTER to exit.')
        sys.exit()
def unify_name(name):
    """Replace characters reserved in file names with underscores.

    Each of the characters less-than, greater-than, colon, double quote,
    slash, backslash, pipe, question mark and asterisk is replaced by '_',
    so the result can be used as a directory or file name.

    Args:
        name: The raw name string.

    Returns:
        The name with every reserved character replaced by '_'.
    """
    # Raw string: the regex must see an escaped backslash ('\\') inside the
    # character class. Without it the regex only sees an escaped pipe and
    # backslashes are left in the name.
    return re.sub(r'[<>:"/\\|?*]', '_', name)
def get_temporary_links(parsed_album_page):
    """Collect the absolute urls of the per-song temporary pages.

    Every ``<a>`` with class ``Name`` inside the first ``<div>`` with class
    ``albSong`` contributes its ``href``, prefixed with TARGET_SITE.

    Args:
        parsed_album_page: Parsed album page (BeautifulSoup object).

    Returns:
        A list of url strings, in page order.

    If the songs div cannot be found, the user is told that the url is
    wrong, asked to press ENTER, and the script exits.
    """
    try:
        song_anchors = parsed_album_page.find(
            'div', 'albSong').find_all('a', 'Name')
        return [TARGET_SITE + anchor.get('href') for anchor in song_anchors]
    except AttributeError:
        raw_input('Wrong url. No songs found. Press ENTER to exit.')
        sys.exit()
def get_form_data(parsed_temp_page):
    """Build the form data to submit from a parsed temporary page.

    Looks up the ``<input>`` element named ``robot_code`` and pairs that
    name with the element's ``value`` attribute.

    Args:
        parsed_temp_page: Parsed temporary page (BeautifulSoup object).

    Returns:
        A one-item dictionary ``{'robot_code': <value>}``.
    """
    field_name = 'robot_code'
    field = parsed_temp_page.find('input', {'name': field_name})
    return {field_name: field.get('value')}
def get_song_link(parsed_song_page):
    """Extract the direct download url of a song.

    The url is the ``href`` of the first element on the page whose
    ``href`` matches the regular expression ``tempfile.ru``.

    Args:
        parsed_song_page: Parsed song page (BeautifulSoup object).

    Returns:
        The song url string.
    """
    link_pattern = re.compile('tempfile.ru')
    return parsed_song_page.find(href=link_pattern).get('href')
def download_song(song_link):
"""Download file with progress percentage."""
downloaded_size = 0
block_size = 8192
backspace_code = '\x08'
song_file = web_request(song_link)
metadata = song_file.info()
file_size = int(metadata.getheaders('Content-Length')[0])
file_name = metadata.getheaders('Content-Disposition')[0].split('\"')[1]
file_not_exists = not os.path.isfile(file_name)
if file_not_exists or (os.stat(file_name).st_size != file_size):
downloading_not_finished = True
output_file = open(file_name, 'wb')
print 'File: {0}\nSize: {1} bytes\nDownloading...'.format(file_name,
file_size),
while downloading_not_finished:
next_file_block = song_file.read(block_size)
if next_file_block: # Not empty
output_file.write(next_file_block)
downloaded_size += len(next_file_block)
percentage = '{0}%'.format(downloaded_size * 100 / file_size)
print percentage + backspace_code * (len(percentage) + 1),
else:
print # Newline
downloading_not_finished = False
output_file.close()
def main():
album_url = get_url()
print 'Connecting...'
album_page = web_request(album_url)
parsed_album_page = parse_page(album_page)
artist_name = unify_name(get_artist_name(parsed_album_page))
print 'Artist: ' + artist_name
raw_album_name = get_album_name(parsed_album_page)
album_name = unify_name(raw_album_name)
print 'Album: ' + album_name
temporary_links = get_temporary_links(parsed_album_page)
print 'Found {0} file(s)'.format(len(temporary_links))
music_dir = os.path.dirname(os.path.realpath(__file__)) + '/music'
music_dir_not_exists = not os.path.exists(music_dir)
if music_dir_not_exists:
os.mkdir(music_dir)
os.chdir(music_dir)
artist_dir_not_exists = not os.path.exists(artist_name)
if artist_dir_not_exists:
os.mkdir(artist_name)
os.chdir(artist_name)
album_dir_not_exists = not os.path.exists(album_name)
if album_dir_not_exists:
os.mkdir(album_name)
os.chdir(album_name)
for link in temporary_links:
temp_page = web_request(link)
parsed_temp_page = parse_page(temp_page)
redirected_url = temp_page.geturl()
form_data = get_form_data(parsed_temp_page)
song_page = web_request(redirected_url, form_data)
parsed_song_page = parse_page(song_page)
song_link = get_song_link(parsed_song_page)
download_song(song_link)
raw_input('All files downloaded successfully. Press ENTER to exit.')
# Script entry point: run the download only when executed directly.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: acknowledge the abort and wait for ENTER before exiting
        # instead of showing a traceback.
        raw_input('\nAborting... Press ENTER to exit.')
        sys.exit()