Fix issue #1

Fixed issue #1 and made some other improvements.
duchenpaul · Nov 17, 2018 · 2b85231
1 parent 9cb3a74
commit 2b85231
Show file tree

Hide file tree

Showing 4 changed files with 207 additions and 88 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,5 +15,6 @@
 **/test.py
 
 *.list
+**/*.html
 
 download/
diff --git a/generate_download_list.py b/generate_download_list.py
@@ -10,62 +10,80 @@
 
 
 def read_video_list():
-	with open(VIDEO_LIST, 'r') as f:
-		videoList = f.read().split('\n')
-	return videoList
+    with open(VIDEO_LIST, 'r') as f:
+        videoList = f.read().split('\n')
+    return videoList
+
+
+def get_best_download_link(htmlPage):
+    '''Get the best quality video from download table'''
+    soup = bs.BeautifulSoup(htmlPage, 'lxml')
+    tbl = soup.table
+
+    # print(tbl.findAll("tr"))
+    records = []
+    for tr in [i for i in tbl.findAll("tr") if len(i.findAll("td")) == 3]:
+        # print(tr)
+        # print('+'*80)
+        record = dict()
+        resolution_tag, size, dl_link = tr.findAll("td")
+        if dl_link.a.attrs['href'].startswith('http'):
+            record['resolution'] = int(resolution_tag.text.strip().split('p')[0])
+            record['link'] = dl_link.a.attrs['href']
+            record['name'] = dl_link.a.attrs['download']
+            records.append(record)
+
+    selected_link = max(records, key=lambda x:x['resolution'])
+    return selected_link
 
 
 def query_link_generate(youtube_link):
-	'''
-	youtube_link = 'https://www.youtube.com/watch?v=iAzShkKzpJo'
-	'''
-	data = 'url={}&ajax=1'.format(urllib.parse.quote_plus(youtube_link))
-	headers = {
-		"accept": "*/*", 
-		"accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", 
-		"content-type": "application/x-www-form-urlencoded; charset=UTF-8", 
-		"origin": "https://y2mate.com", 
-		"referer": "https://y2mate.com/youtube/Xi52tx6phRU", 
-		"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36", 
-		"x-requested-with": "XMLHttpRequest", 
-	}
-
-	Sess = requests.session()
-	req = Request('POST', QUERY_URL, data=data, headers=headers)
-	prepped = Sess.prepare_request(req)
-	resp = Sess.send(prepped)
-	resp_text = resp.content.decode('utf-8')
-
-	# print(data)
-	# print(resp_text)
-	result = json.loads(resp_text)
-
-	# #mp4 > table > tbody > tr:nth-child(1) > td.txt-center > a
-	soup = bs.BeautifulSoup(result["result"], 'lxml')
-	videoDownloadLink = soup.select('#mp4 > table > tbody > tr > td.txt-center > a')[0]['href']
-	# print(videoDownloadLink)
-	videoName = urllib.parse.unquote_plus(re.findall(r'(?<=&title=).*(?=&)', videoDownloadLink)[0]).split(' || ')[0]
-	print(videoName)
-	return videoName, videoDownloadLink
+    '''
+    youtube_link = 'https://www.youtube.com/watch?v=iAzShkKzpJo'
+    '''
+    data = 'url={}&ajax=1'.format(urllib.parse.quote_plus(youtube_link))
+    headers = {
+        "accept": "*/*", 
+        "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", 
+        "content-type": "application/x-www-form-urlencoded; charset=UTF-8", 
+        "origin": "https://y2mate.com", 
+        "referer": "https://y2mate.com/youtube/Xi52tx6phRU", 
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36", 
+        "x-requested-with": "XMLHttpRequest", 
+    }
+
+    Sess = requests.session()
+    req = Request('POST', QUERY_URL, data=data, headers=headers)
+    prepped = Sess.prepare_request(req)
+    resp = Sess.send(prepped)
+    resp_text = resp.content.decode('utf-8')
+
+    # print(data)
+    # print(resp_text)
+    result = json.loads(resp_text)
+
+    video_dict = get_best_download_link(result["result"])
+    print('Got {}: {}p'.format(video_dict['name'], video_dict['resolution']))
+    return video_dict['name'], video_dict['link']
 
 
 if __name__ == '__main__':
-	# test_link = 'https://www.youtube.com/watch?v=f4KOjWS_KZs'
-	# query_link_generate(test_link)
-	
-	# videoDownloadLinkList = list(map(query_link_generate, read_video_list()))
-
-	count = 1
-
-	with open('youtube_video_download.list', 'w') as f:
-		f.write('')
-
-	for i in read_video_list():
-		videoName, videoDownloadLink = query_link_generate(i)
-		with open('youtube_video_download.list', 'a') as f:
-			f.write('{} {} |#| {}\n'.format(count, videoName, videoDownloadLink))
-		time.sleep(1)
-		count += 1
+    # test_link = 'https://www.youtube.com/watch?v=f4KOjWS_KZs'
+    # query_link_generate(test_link)
+    
+    # videoDownloadLinkList = list(map(query_link_generate, read_video_list()))
+
+    count = 1
+
+    with open('youtube_video_download.list', 'w') as f:
+        f.write('')
+
+    for i in read_video_list():
+        videoName, videoDownloadLink = query_link_generate(i)
+        with open('youtube_video_download.list', 'a') as f:
+            f.write('{} {} |#| {}\n'.format(count, videoName, videoDownloadLink))
+        time.sleep(1)
+        count += 1
 
 
 
diff --git a/toolkit_file.py b/toolkit_file.py
@@ -0,0 +1,98 @@
+import os, re
+import os.path
+import glob
+from pathlib import Path
+import codecs
+import shutil
+
+def check_file_exists(FILE):
+    '''Check if the FILE exists'''
+    return Path(FILE).is_file()
+
+
+def check_dir_exists(DIR):
+    '''Check if the DIR exists'''
+    return Path(DIR).is_dir()
+
+
+def get_basename(FILE):
+    '''
+    Return the basename of a file. e.g. example.txt -> example
+    '''
+    return os.path.splitext(os.path.basename(FILE))[0]
+
+
+def file_path(FILE):
+    return os.path.dirname(os.path.realpath(FILE)) + os.sep
+
+
+def script_path():
+    return os.path.dirname(os.path.realpath(__file__))
+
+
+def line_prepender(filename, line):
+    '''
+    Add line to the head of a file
+    '''
+    with open(filename, 'r+') as f:
+        content = f.read()
+        f.seek(0, 0)
+        f.write(line.rstrip('\r\n') + '\n' + content)
+
+
+def get_file_list(folder):
+    file_list = []
+    for path, subdirs, files in os.walk(folder):
+        for name in files:
+            file_list.append(os.path.join(path, name))
+    return file_list
+
+
+def purge_folder(folder, filePattern='*'):
+    # filelist = [ f for f in os.listdir(folder) ] #if f.endswith(".bak") ]
+    filelist = glob.glob(folder + os.sep + filePattern)
+    for f in filelist:
+        # print(f)
+        os.remove(os.path.join(f)) # using glob
+        # os.remove(os.path.join(folder, f)) # using listdir
+
+
+def create_folder(folderName):
+    '''Create folder if not exists'''
+    my_file = Path(folderName)
+    if not my_file.is_dir():
+        print('Folder {} not found, creating a new one'.format(folderName))
+        os.mkdir(folderName)
+
+
+def text_replace_in_file(pattern, string, file):
+    '''Replace pattern with string in file'''
+    with open(file) as f:
+        replaced_script = re.sub(pattern, string, f.read(), flags=re.IGNORECASE)
+    with open(file, 'w') as f:
+        f.write(replaced_script)
+
+
+def convert_encode2utf8(sourceFileName, targetFileName, srcEncoding = 'utf-16'):
+    BLOCKSIZE = 1048576 # or some other, desired size in bytes
+    with codecs.open(sourceFileName, 'r', 'utf-16') as sourceFile:
+        with codecs.open(targetFileName, 'w', 'utf-8') as targetFile:
+            while True:
+                contents = sourceFile.read(BLOCKSIZE)
+                if not contents:
+                    break
+                targetFile.write(contents)
+
+
+def remove_junk_line(FILE, junkwords):
+    '''
+    Remove the line that contains junkwords
+    '''
+    with open(FILE) as oldfile, open(FILE + 'tmp', 'w') as newfile:
+        for line in oldfile:
+            if not junkwords in line:
+                newfile.write(line)
+    shutil.move(FILE + 'tmp', FILE)
+
+if __name__ == '__main__':
+    print(get_file_list('E:\\'))
diff --git a/youtube_download.py b/youtube_download.py
@@ -1,62 +1,64 @@
 import requests
 import os
 
+import toolkit_file
 
 VIDEO_LIST = 'youtube_video_download.list'
 proxy = '127.0.0.1:1080'
 downloadPath = 'download/'
 
+toolkit_file.create_folder(downloadPath)
 
 def remove_illegal_char(fileName):
-	'''
-	Remove reserved characters from file name
-	'''
-	RESERVED_CHAR = ['<', '>', ':', '"', '/', '\\', '|', '?', '*',]
+    '''
+    Remove reserved characters from file name
+    '''
+    RESERVED_CHAR = ['<', '>', ':', '"', '/', '\\', '|', '?', '*',]
 
-	for char in RESERVED_CHAR:
-		fileName = fileName.replace(char, '_')
-	return fileName
+    for char in RESERVED_CHAR:
+        fileName = fileName.replace(char, '_')
+    return fileName
 
 def read_list():
-	with open(VIDEO_LIST, 'r') as f:
-		downloadList_tmp = f.read().split('\n')
+    with open(VIDEO_LIST, 'r') as f:
+        downloadList_tmp = f.read().split('\n')
 
-	downloadList = []
-	for i in downloadList_tmp:
-		# get fileName, downloadUrl
-		if not i:
-			continue
+    downloadList = []
+    for i in downloadList_tmp:
+        # get fileName, downloadUrl
+        if not i:
+            continue
 
-		downloadItem = i.split(' |#| ')
-		downloadItem[0] = remove_illegal_char(downloadItem[0].strip()) + '.mp4'
-		downloadList.append(downloadItem)
+        downloadItem = i.split(' |#| ')
+        downloadItem[0] = remove_illegal_char(downloadItem[0].strip()) + '.mp4'
+        downloadList.append(downloadItem)
 
-	return(downloadList)
+    return(downloadList)
 
 def download_file(fileName, url):
-	'''
-	Download file with proxy
-	'''
-	print('Downloading {}'.format(fileName))
-	proxies = {'http': 'http://{}'.format(proxy),
-	           'https': 'https://{}'.format(proxy)}
-	res = requests.get(url,proxies=proxies)
-	con = res.content
-	with open(fileName, 'wb') as f:
-		f.write(con)
-	print('Downloading finished')
+    '''
+    Download file with proxy
+    '''
+    print('Downloading {}'.format(fileName))
+    proxies = {'http': 'http://{}'.format(proxy),
+               'https': 'https://{}'.format(proxy)}
+    res = requests.get(url,proxies=proxies)
+    con = res.content
+    with open(fileName, 'wb') as f:
+        f.write(con)
+    print('Downloading finished')
 
 def create_download_dir():
-	script_dir = os.path.dirname(os.path.realpath(__file__))
-	try:
-		if downloadPath not in os.listdir(script_dir):
-			os.mkdir(downloadPath)
-	except FileExistsError as e:
-		pass
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    try:
+        if downloadPath not in os.listdir(script_dir):
+            os.mkdir(downloadPath)
+    except FileExistsError as e:
+        pass
 
 
 
 if __name__ == '__main__':
-	create_download_dir()
-	for i in read_list():
-		download_file(downloadPath + i[0], i[1])
+    create_download_dir()
+    for i in read_list():
+        download_file(downloadPath + i[0], i[1])