Skip to content

Commit

Permalink
Fix issue #1
Browse files Browse the repository at this point in the history
Fixed issue #1, and some other improvements
  • Loading branch information
duchenpaul committed Nov 17, 2018
1 parent 9cb3a74 commit 2b85231
Show file tree
Hide file tree
Showing 4 changed files with 207 additions and 88 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@
**/test.py

*.list
**/*.html

download/
118 changes: 68 additions & 50 deletions generate_download_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,62 +10,80 @@


def read_video_list():
    '''
    Read the file named by VIDEO_LIST and return its entries, one per line.

    Blank lines (including the empty string produced by a trailing newline)
    are dropped and surrounding whitespace is stripped, so callers never
    receive an empty entry.
    '''
    with open(VIDEO_LIST, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]


def get_best_download_link(htmlPage):
    '''
    Get the best quality video from download table

    Parses htmlPage, looks at the first <table> and considers every row that
    has exactly three cells (resolution, size, download link). A row is used
    only when its link cell holds an anchor whose href starts with 'http',
    which carries a 'download' attribute, and whose resolution text starts
    with an integer (e.g. '720p').

    Returns a dict with keys 'resolution' (int), 'link' and 'name' for the
    row with the highest resolution.

    Raises ValueError when the page has no table or no usable row.
    '''
    soup = bs.BeautifulSoup(htmlPage, 'lxml')
    tbl = soup.table
    if tbl is None:
        raise ValueError('No download table found in the page')

    records = []
    for tr in tbl.findAll("tr"):
        cells = tr.findAll("td")
        if len(cells) != 3:
            continue
        resolution_tag, _size, dl_link = cells

        anchor = dl_link.a
        if anchor is None:
            continue
        href = anchor.attrs.get('href')
        name = anchor.attrs.get('download')
        if not href or name is None or not href.startswith('http'):
            continue

        try:
            # '720p (.mp4)' -> 720; rows without a numeric resolution are skipped
            resolution = int(resolution_tag.text.strip().split('p')[0])
        except ValueError:
            continue

        records.append({'resolution': resolution, 'link': href, 'name': name})

    if not records:
        raise ValueError('No downloadable video link found in the page')

    return max(records, key=lambda x: x['resolution'])


def query_link_generate(youtube_link):
    '''
    Ask the service at QUERY_URL for the download table of a video and
    return the best available download.

    youtube_link = 'https://www.youtube.com/watch?v=iAzShkKzpJo'

    Returns a (name, link) tuple taken from get_best_download_link().

    Raises requests.HTTPError when the service answers with an error status;
    errors from JSON decoding or from get_best_download_link() propagate.
    '''
    data = 'url={}&ajax=1'.format(urllib.parse.quote_plus(youtube_link))
    headers = {
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "origin": "https://y2mate.com",
        "referer": "https://y2mate.com/youtube/Xi52tx6phRU",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
    }

    # The context manager closes the session's connection pool when done.
    with requests.session() as Sess:
        req = Request('POST', QUERY_URL, data=data, headers=headers)
        prepped = Sess.prepare_request(req)
        resp = Sess.send(prepped)
        # Fail here rather than with a confusing JSON error on an error page.
        resp.raise_for_status()
        resp_text = resp.content.decode('utf-8')

    result = json.loads(resp_text)

    video_dict = get_best_download_link(result["result"])
    print('Got {}: {}p'.format(video_dict['name'], video_dict['resolution']))
    return video_dict['name'], video_dict['link']


if __name__ == '__main__':
    # Resolve every entry of the video list and write one line per video:
    #   '<count> <name> |#| <download link>'
    # Blank entries are skipped so they are never sent to the query service.
    links = [i for i in read_video_list() if i.strip()]

    # Mode 'w' truncates any list left over from a previous run.
    with open('youtube_video_download.list', 'w') as f:
        for count, i in enumerate(links, start=1):
            videoName, videoDownloadLink = query_link_generate(i)
            f.write('{} {} |#| {}\n'.format(count, videoName, videoDownloadLink))
            # Flush per entry so finished entries survive a later failure.
            f.flush()
            # Pause between requests to the query service.
            time.sleep(1)



98 changes: 98 additions & 0 deletions toolkit_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import os, re
import os.path
import glob
from pathlib import Path
import codecs
import shutil

def check_file_exists(FILE):
    '''Return True when FILE names an existing regular file, else False.'''
    candidate = Path(FILE)
    return candidate.is_file()


def check_dir_exists(DIR):
    '''Return True when DIR names an existing directory, else False.'''
    candidate = Path(DIR)
    return candidate.is_dir()


def get_basename(FILE):
    '''
    Strip the directory part and the last extension from FILE.
    e.g. example.txt -> example
    '''
    leaf = os.path.basename(FILE)
    stem, _extension = os.path.splitext(leaf)
    return stem


def file_path(FILE):
    '''
    Return the directory that contains FILE, with symlinks resolved and
    exactly one trailing path separator.

    Joining with an empty component adds the separator only when it is not
    already there, so a file in the filesystem root yields '/' and not '//'.
    '''
    return os.path.join(os.path.dirname(os.path.realpath(FILE)), '')


def script_path():
    '''Return the directory holding this module, with symlinks resolved.'''
    resolved_module = os.path.realpath(__file__)
    return os.path.dirname(resolved_module)


def line_prepender(filename, line):
    '''
    Insert line as the new first line of filename.

    Trailing CR/LF characters of line are removed and a single newline is
    written after it, followed by the previous file content.
    '''
    with open(filename, 'r+') as handle:
        previous = handle.read()
        header = line.rstrip('\r\n') + '\n'
        # Rewind and overwrite from the start; the new text is never shorter.
        handle.seek(0, 0)
        handle.write(header + previous)


def get_file_list(folder):
    '''Return the paths of all files found under folder, walking recursively.'''
    return [
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(folder)
        for file_name in file_names
    ]


def purge_folder(folder, filePattern='*'):
    '''
    Delete the entries directly inside folder whose names match filePattern
    (a glob pattern, '*' by default).

    Real sub-directories are left untouched: os.remove cannot delete them and
    would otherwise abort the purge on the first one matched. Symbolic links
    are removed as links.
    '''
    for f in glob.glob(folder + os.sep + filePattern):
        if os.path.isdir(f) and not os.path.islink(f):
            continue
        os.remove(f)


def create_folder(folderName):
    '''
    Create folder if not exists

    Missing parent directories are created as well. If the folder appears
    between the check and the creation, that is not treated as an error.
    Raises FileExistsError when folderName exists but is not a directory.
    '''
    if not os.path.isdir(folderName):
        print('Folder {} not found, creating a new one'.format(folderName))
        os.makedirs(folderName, exist_ok=True)


def text_replace_in_file(pattern, string, file):
    '''
    Rewrite file with every case-insensitive match of the regular
    expression pattern replaced by string.
    '''
    with open(file) as reader:
        original_text = reader.read()

    updated_text = re.sub(pattern, string, original_text, flags=re.IGNORECASE)

    with open(file, 'w') as writer:
        writer.write(updated_text)


def convert_encode2utf8(sourceFileName, targetFileName, srcEncoding='utf-16'):
    '''
    Copy sourceFileName to targetFileName, re-encoding the text as UTF-8.

    srcEncoding is the encoding of the source file (default 'utf-16').
    The text is copied in blocks so large files are not loaded at once, and
    newline='' keeps line endings exactly as they are in the source.
    '''
    BLOCKSIZE = 1048576  # characters read per block
    with open(sourceFileName, 'r', encoding=srcEncoding, newline='') as sourceFile:
        with open(targetFileName, 'w', encoding='utf-8', newline='') as targetFile:
            shutil.copyfileobj(sourceFile, targetFile, BLOCKSIZE)


def remove_junk_line(FILE, junkwords):
    '''
    Drop every line of FILE that contains the substring junkwords.

    The surviving lines are written to FILE + 'tmp', which then replaces FILE.
    '''
    scratch = FILE + 'tmp'
    with open(FILE) as source, open(scratch, 'w') as sink:
        for text_line in source:
            if junkwords in text_line:
                continue
            sink.write(text_line)
    shutil.move(scratch, FILE)

if __name__ == '__main__':
    # Manual check: list every file found under the E:\ drive.
    listing = get_file_list('E:\\')
    print(listing)
78 changes: 40 additions & 38 deletions youtube_download.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,64 @@
import requests
import os

import toolkit_file

# List file read by read_list(); each non-empty line is split on ' |#| '
# into a file name part and a download URL.
VIDEO_LIST = 'youtube_video_download.list'
# host:port that download_file() uses to build both its http and https proxy URLs.
proxy = '127.0.0.1:1080'
# Prefix concatenated with each file name in the main block, so the trailing
# slash is required.
downloadPath = 'download/'

# Runs at import time: creates the download folder when it is not already a directory.
toolkit_file.create_folder(downloadPath)

def remove_illegal_char(fileName):
    '''
    Return fileName with every reserved file-name character replaced by '_'
    '''
    RESERVED_CHAR = ['<', '>', ':', '"', '/', '\\', '|', '?', '*',]

    return ''.join('_' if symbol in RESERVED_CHAR else symbol for symbol in fileName)

def read_list():
    '''
    Parse the file named by VIDEO_LIST into download items.

    Each non-empty line has the form '<name> |#| <url>'. The line is split on
    the LAST ' |#| ' so a name that itself contains the separator cannot
    push the URL out of position. The name part is stripped, passed through
    remove_illegal_char() and given an '.mp4' suffix.

    Returns a list of [fileName, downloadUrl] lists. A line without the
    separator yields a one-element list.
    '''
    downloadList = []
    with open(VIDEO_LIST, 'r') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue

            # get fileName, downloadUrl
            downloadItem = line.rsplit(' |#| ', 1)
            downloadItem[0] = remove_illegal_char(downloadItem[0].strip()) + '.mp4'
            downloadList.append(downloadItem)

    return downloadList

def download_file(fileName, url):
    '''
    Download file with proxy

    Streams url to fileName in chunks so the whole video is never held in
    memory. When the server answers with an error status, nothing is written
    and a message is printed instead of saving the error page as the video.
    '''
    print('Downloading {}'.format(fileName))
    # NOTE(review): the https entry uses an 'https://' proxy URL, i.e. a TLS
    # connection to the proxy itself - confirm the proxy supports that.
    proxies = {'http': 'http://{}'.format(proxy),
               'https': 'https://{}'.format(proxy)}
    with requests.get(url, proxies=proxies, stream=True) as res:
        if not res.ok:
            print('Downloading failed with HTTP status {}'.format(res.status_code))
            return
        with open(fileName, 'wb') as f:
            for chunk in res.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print('Downloading finished')

def create_download_dir():
    '''
    Make sure the downloadPath directory exists.

    The path is resolved against the current working directory, the same
    location the main block writes downloads to. An already existing
    directory is not an error.
    '''
    os.makedirs(downloadPath, exist_ok=True)



if __name__ == '__main__':
    # Ensure the target directory exists, then fetch every listed item into it.
    create_download_dir()
    for entry in read_list():
        target_name, source_url = entry[0], entry[1]
        download_file(downloadPath + target_name, source_url)

0 comments on commit 2b85231

Please sign in to comment.