From 909e7244ca929ce3e709b6ea856f4667275a251b Mon Sep 17 00:00:00 2001
From: Chenny Du
Date: Wed, 20 Jun 2018 10:00:48 +0800
Subject: [PATCH] First release

---
 .gitignore                | 19 +++++++++++
 README.md                 |  8 +++++
 generate_download_list.py | 71 +++++++++++++++++++++++++++++++++++++++
 youtube_download.py       | 62 ++++++++++++++++++++++++++++++++++
 4 files changed, 160 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 generate_download_list.py
 create mode 100644 youtube_download.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e6852ba
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+# ---> no_extension_pyc
+# Ignore all
+*
+
+# Unignore all with extensions
+!*.*
+
+# Unignore all dirs
+!*/
+
+### The combination above ignores all files without an extension ###
+
+**/__pycache__
+
+**/test.py
+
+*.list
+
+download/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..56c220a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,8 @@
+# Usage
+1. Make a list of YouTube links and save it as `youtube_video.list`, e.g.
+```
+https://www.youtube.com/watch?v=bY6m6_IIN94
+https://www.youtube.com/watch?v=f4KOjWS_KZs
+```
+2. Run `generate_download_list.py` to resolve each link into a direct download URL, written to `youtube_video_download.list`
+3. Run `youtube_download.py` to download the videos through the local proxy
\ No newline at end of file
diff --git a/generate_download_list.py b/generate_download_list.py
new file mode 100644
index 0000000..2028ed9
--- /dev/null
+++ b/generate_download_list.py
@@ -0,0 +1,71 @@
+import requests
+from requests import Request, Session
+import urllib.parse, json
+import bs4 as bs
+import re
+import time
+
+QUERY_URL = 'https://y2mate.com/analyze/ajax'
+VIDEO_LIST = 'youtube_video.list'
+
+
+def read_video_list():
+    with open(VIDEO_LIST, 'r') as f:
+        videoList = f.read().split('\n')
+    return videoList
+
+
+def query_link_generate(youtube_link):
+    '''
+    youtube_link = 'https://www.youtube.com/watch?v=iAzShkKzpJo'
+    '''
+    data = 'url={}&ajax=1'.format(urllib.parse.quote_plus(youtube_link))
+    headers = {
+        "accept": "*/*",
+        "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
+        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
+        "origin": "https://y2mate.com",
+        "referer": "https://y2mate.com/youtube/Xi52tx6phRU",
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
+        "x-requested-with": "XMLHttpRequest",
+    }
+
+    Sess = requests.Session()
+    req = Request('POST', QUERY_URL, data=data, headers=headers)
+    prepped = Sess.prepare_request(req)
+    resp = Sess.send(prepped)
+    resp_text = resp.content.decode('utf-8')
+
+    # print(data)
+    # print(resp_text)
+    result = json.loads(resp_text)
+
+    # #mp4 > table > tbody > tr:nth-child(1) > td.txt-center > a
+    soup = bs.BeautifulSoup(result["result"], 'lxml')
+    videoDownloadLink = soup.select('#mp4 > table > tbody > tr > td.txt-center > a')[0]['href']
+    # print(videoDownloadLink)
+    videoName = urllib.parse.unquote_plus(re.findall(r'(?<=&title=).*(?=&)', videoDownloadLink)[0]).split(' || ')[0]
+    print(videoName)
+    return videoName, videoDownloadLink
+
+
+if __name__ == '__main__':
+    # test_link = 'https://www.youtube.com/watch?v=f4KOjWS_KZs'
+    # query_link_generate(test_link)
+
+    # videoDownloadLinkList = list(map(query_link_generate, read_video_list()))
+
+    count = 1
+
+    with open('youtube_video_download.list', 'w') as f:
+        f.write('')
+
+    for i in read_video_list():
+        videoName, videoDownloadLink = query_link_generate(i)
+        with open('youtube_video_download.list', 'a') as f:
+            f.write('{} {} |#| {}\n'.format(count, videoName, videoDownloadLink))
+        time.sleep(1)
+        count += 1
+
+
+
diff --git a/youtube_download.py b/youtube_download.py
new file mode 100644
index 0000000..45f0bbe
--- /dev/null
+++ b/youtube_download.py
@@ -0,0 +1,62 @@
+import requests
+import os
+
+
+VIDEO_LIST = 'youtube_video_download.list'
+proxy = '127.0.0.1:1080'
+downloadPath = 'download/'
+
+
+def remove_illegal_char(fileName):
+    '''
+    Remove reserved characters from file name
+    '''
+    RESERVED_CHAR = ['<', '>', ':', '"', '/', '\\', '|', '?', '*',]
+
+    for char in RESERVED_CHAR:
+        fileName = fileName.replace(char, '_')
+    return fileName
+
+def read_list():
+    with open(VIDEO_LIST, 'r') as f:
+        downloadList_tmp = f.read().split('\n')
+
+    downloadList = []
+    for i in downloadList_tmp:
+        # get fileName, downloadUrl
+        if not i:
+            continue
+
+        downloadItem = i.split(' |#| ')
+        downloadItem[0] = remove_illegal_char(downloadItem[0].strip()) + '.mp4'
+        downloadList.append(downloadItem)
+
+    return downloadList
+
+def download_file(fileName, url):
+    '''
+    Download file with proxy
+    '''
+    print('Downloading {}'.format(fileName))
+    proxies = {'http': 'http://{}'.format(proxy),
+               'https': 'https://{}'.format(proxy)}
+    res = requests.get(url, proxies=proxies)
+    con = res.content
+    with open(fileName, 'wb') as f:
+        f.write(con)
+    print('Downloading finished')
+
+def create_download_dir():
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    try:
+        if downloadPath not in os.listdir(script_dir):
+            os.mkdir(downloadPath)
+    except FileExistsError:
+        pass
+
+
+
+if __name__ == '__main__':
+    create_download_dir()
+    for i in read_list():
+        download_file(downloadPath + i[0], i[1])
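
A note on `download_file` above: it reads the entire response into memory via `res.content` before writing it to disk, which can be expensive for long videos. The sketch below streams the file to disk in chunks instead, offered as a minimal suggestion rather than as part of the patch: the name `download_file_streamed`, the 1 MiB chunk size, and the 60-second timeout are illustrative, and the `http://` scheme on both proxy entries assumes the proxy at `127.0.0.1:1080` is a plain HTTP proxy (a SOCKS5 proxy would instead need `socks5://` URLs and the `requests[socks]` extra).
```
import requests

proxy = '127.0.0.1:1080'  # same local proxy as in youtube_download.py; assumed here to be a plain HTTP proxy


def download_file_streamed(fileName, url):
    '''
    Download a file through the proxy, streaming it to disk in chunks
    instead of buffering the whole video in memory.
    '''
    proxies = {'http': 'http://{}'.format(proxy),
               'https': 'http://{}'.format(proxy)}
    with requests.get(url, proxies=proxies, stream=True, timeout=60) as res:
        res.raise_for_status()
        with open(fileName, 'wb') as f:
            # write the video in 1 MiB chunks as they arrive
            for chunk in res.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
```
If adopted, it could replace `download_file` in the `__main__` loop of `youtube_download.py` with no other changes.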