diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..413026f --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,82 @@ +image: python:3.8-slim + +stages: + - pre-test + - basic-download-test + - feature-test + +python-compile-test: + stage: pre-test + script: + # Check whether script is syntax error free + - python3 -m py_compile vo-scraper.py + +ensure-same-version: + stage: pre-test + script: + # Ensure verion numbers in `VERSION` and `vo-scraper.py` match + - grep -q $(sed -n "s/^.*program_version = '\(.*\)'$/\1/p" vo-scraper.py) VERSION + +# Download unprotected video +unprotected-recording: + stage: basic-download-test + needs: [python-compile-test] + script: + # Install dependency + - pip3 install requests + # Download video + - python3 vo-scraper.py --disable-hints --hide-progress-bar --quality low --latest https://video.ethz.ch/lectures/d-infk/2020/spring/252-0028-00L.html + # Compare checksums + - echo $(sha1sum Lecture\ Recordings/Digital\ Design\ and\ Computer\ Architecture/2020-03-12_low-3ebf562d.mp4) | grep -q f80bcc1c215cebf64a4da7f9623406fb1309e512 + +# Download 'PWD' protected video +pwd-protected-recording: + stage: basic-download-test + needs: [python-compile-test] + script: + # Install dependency + - pip3 install requests + # Download video + - python3 vo-scraper.py --disable-hints --hide-progress-bar --quality low --latest --file $PWD_LINK_FILE + # Compare checksums + - echo $(sha1sum Lecture\ Recordings/Introduction\ to\ Machine\ Learning/2020-05-27\ -\ Tutorial_low-1898f0cc.mp4) | grep -q dce9f9aeb00693b6dbce49b113c10d2f84a29b70 + +# Download 'ETH' protected video +eth-protected-recording: + stage: basic-download-test + needs: [python-compile-test] + script: + # Install dependency + - pip3 install requests + # Download video + - python3 vo-scraper.py --disable-hints --hide-progress-bar --quality low --latest --file $ETH_LINK_FILE + # Compare checksums + - echo $(sha1sum Lecture\ Recordings/Advanced\ Systems\ 
Lab/2020-03-19_low-fd29952f.mp4) | grep -q efd4a1779a29da08c0186ed6121fc10bfa7e8e83 + +# Test default named parameter file +default-parameter-file: + stage: feature-test + needs: [unprotected-recording] + script: + # Install dependency + - pip3 install requests + # Add parameter file + - printf -- "--quality low\n--latest\n--hide-progress-bar\n--disable-hints\n" > parameters.txt + # Download video + - python3 vo-scraper.py https://video.ethz.ch/lectures/d-infk/2020/spring/252-0028-00L.html + # Compare checksums + - echo $(sha1sum Lecture\ Recordings/Digital\ Design\ and\ Computer\ Architecture/2020-03-12_low-3ebf562d.mp4) | grep -q f80bcc1c215cebf64a4da7f9623406fb1309e512 + +# Test custom named parameter file +custom-parameter-file: + stage: feature-test + needs: [unprotected-recording] + script: + # Install dependency + - pip3 install requests + # Add parameter file + - printf -- "--quality low\n--latest\n--hide-progress-bar\n--disable-hints\n" > parameters2.txt + # Download video + - python3 vo-scraper.py --parameter-file parameters2.txt https://video.ethz.ch/lectures/d-infk/2020/spring/252-0028-00L.html + # Compare checksums + - echo $(sha1sum Lecture\ Recordings/Digital\ Design\ and\ Computer\ Architecture/2020-03-12_low-3ebf562d.mp4) | grep -q f80bcc1c215cebf64a4da7f9623406fb1309e512 diff --git a/README.md b/README.md index 3ea0d84..9da1a2d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# vo-scraper +# vo-scraper 🎓🎥 A python script for ETH students to download lecture videos from [video.ethz.ch](https://video.ethz.ch/). @@ -48,6 +48,18 @@ You may find this example of ranges useful: | `1..3..` | `1 3 5 [...]` | Every other episodes starting from the second (i.e.. all the second episodes of the week) | | `..3..` | `0 3 6 [...]` | Every third episodes, starting from the beginning | +### Q: Can I use it to download live streams? + +#### A: No + +Downloading live streams is not supported. 
+
+### Q: Can I use it to download lecture recordings from other platforms (e.g. Zoom)?
+
+#### A: No
+
+Downloading is only supported for recorded lectures on [video.ethz.ch](https://video.ethz.ch/). Other platforms such as Zoom, Moodle, and Polybox are not supported.
+
 ### Q: How do I pass a file with links to multiple lectures?
 
 #### A: Use `--file <filename>`
@@ -67,6 +79,25 @@ Additionally you can also add a username and password at the end of the link sep
 
 **Note:** This is **NOT** recommended for your NETHZ account password for security reasons!
 
+### Q: I don't like having to pass all those parameters each time I download recordings. Is there a better way?
+
+#### A: Yes
+
+You can create a file called `parameters.txt` in which you put all your parameters. As long as you keep it in the same directory in which you call the scraper, it will automatically detect the file and read the parameters from there.
+
+**Example:**
+
+If you create a file called `parameters.txt` with the following content
+
+```
+--all
+--quality low
+```
+
+and then run `python3 vo-scraper.py <lecture link>` in that directory it will download all recordings (`--all`) from that lecture in low quality (`--quality low`) without you having to pass any parameters.
+
+If you want to use a different name for the parameter file, you can pass the parameter `--parameter-file <filename>`. Ironically, you cannot do this via `parameters.txt` :P
+
 ### Q: How does it acquire the videos? 
#### A: Like so: diff --git a/VERSION b/VERSION index 589268e..359a5b9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.3.0 \ No newline at end of file +2.0.0 \ No newline at end of file diff --git a/vo-scraper.py b/vo-scraper.py index 79da956..f491903 100755 --- a/vo-scraper.py +++ b/vo-scraper.py @@ -16,12 +16,16 @@ # ======================================================================== # Import urllib.request, urllib.parse, os, sys, http.client -import urllib.request, os, sys, http.client +import urllib.request +import os +import sys from urllib.request import Request, urlopen from sys import platform -import json # For handling json files -import argparse # For parsing commandline arguments -import getpass # For getting the user password +import json # For handling json files +import argparse # For parsing commandline arguments +import getpass # For getting the user password +import random # For selecting a random hint +import shutil # For getting terminal size # Check whether `requests` is installed @@ -33,7 +37,7 @@ # Check whether `webbrowser` is installed try: - import webbrowser # only used to open the user's browser when reporting a bug + import webbrowser # only used to open the user's browser when reporting a bug except: print_information("Failed to import `webbrowser`. 
It is however not required for downloading videos", type='warning') @@ -48,10 +52,10 @@ # Links to repo gitlab_repo_page = "https://gitlab.ethz.ch/tgeorg/vo-scraper/" -gitlab_issue_page = gitlab_repo_page+"issues" -gitlab_changelog_page = gitlab_repo_page+"-/tags/v" -remote_version_link = gitlab_repo_page+"raw/master/VERSION" -program_version = '1.3.0' +gitlab_issue_page = gitlab_repo_page + "issues" +gitlab_changelog_page = gitlab_repo_page + "-/tags/v" +remote_version_link = gitlab_repo_page + "raw/master/VERSION" +program_version = '2.0.0' # For web requests user_agent = 'Mozilla/5.0' @@ -75,31 +79,106 @@ # Boolean flags download_all = False +download_latest = False verbose = False print_src = False +HIDE_PROGRESS_BAR = False # Location of text files file_to_print_src_to = "" history_file = "" +PARAMETER_FILE = "parameters.txt" quality_dict = { - 'high' : 0, + 'high': 0, 'medium': 1, - 'low' : 2 + 'low': 2 } + class bcolors: INFO = '\033[94m' ERROR = '\033[91m' WARNING = '\033[93m' ENDC = '\033[0m' + print_type_dict = { - 'info' : f"({bcolors.INFO}INF{bcolors.ENDC})", - 'warning' : f"({bcolors.WARNING}WRN{bcolors.ENDC})", - 'error' : f"({bcolors.ERROR}ERR{bcolors.ENDC})" + 'info': f"({bcolors.INFO}INF{bcolors.ENDC})", + 'warning': f"({bcolors.WARNING}WRN{bcolors.ENDC})", + 'error': f"({bcolors.ERROR}ERR{bcolors.ENDC})" } +HINT_LIST = [ + # --help + """Want to know more about the scrapers functionality? +Run `python3 vo-scraper.py --help` to see all commands that can be used with the scraper. +For a detailed explanation of some of the commands, checkout the README here: https://gitlab.ethz.ch/tgeorg/vo-scraper""", + # --all + """Want to download all recordings of a lecture at once? +If you call the vo-scraper with `--all` it will skip the selection screen and will download all recordings instead. +Usage example: + + python3 vo-scraper.py --all https://video.ethz.ch/lectures/d-infk/2019/spring/252-0028-00L.html""", + # --bug + """Found a bug? 
+Run `python3 vo-scraper.py --bug` or report it directly at https://gitlab.ethz.ch/tgeorg/vo-scraper/issues""",
+    # --destination DESTINATION
+    """Did you know? By default the vo-scraper saves the downloaded recordings in \"""" + directory_prefix + """\"
+If you want the recordings saved in a different place you can use the parameter `--destination <directory>`
+For example:
+
+    python3 vo-scraper.py --destination my_folder https://video.ethz.ch/lectures/d-infk/2019/spring/252-0028-00L.html
+
+saves the recordings inside the folder named \"my_folder\"""",
+    # --disable-hints
+    """Getting annoyed by this hint message?
+You can pass the parameter `--disable-hints` to not show hints after running.""",
+    # --file FILE
+    """Downloading multiple lectures and tired of having to enter all those links every time you want to download a recording?
+You can paste all your links in a text file and then tell the scraper to read from that file using the parameter `--file <filename>`
+Example:
+
+    python3 vo-scraper.py --file my_lectures.txt
+
+The scraper will read the links from that file and download them as usual.""",
+    # --hide-progress-bar
+    """Progress bar breaking your terminal?
+Hide it by passing the parameter `--hide-progress-bar`""",
+    # --history FILE
+    """Did you know, that the scraper does not re-download a lecture recording if it detects the recording in its download folder?
+This way bandwidth is saved by preventing unnecessary re-downloads, especially when using the `--all` parameter to download all existing recordings of a lecture.
+However this also means that if you delete the recording and run the scraper with `--all` again it will re-download the recording.
+
+To fix this you can use the parameter `--history <filename>` which creates a text file with that name and stores a history of all downloaded lectures there.
+For example:
+
+    python3 vo-scraper.py --history history.txt
+
+will create a file called 'history.txt' and save a history of all downloaded recordings there. 
If you delete a downloaded video the scraper will not redownload it as long as you pass `--history <filename>` every time you run it.""",
+    # --parameter-file FILE
+    """Annoyed by having to type all those parameters like `--all`, `--history`, etc. by hand?
+You can create a text file called \"""" + PARAMETER_FILE + """\" and paste all your parameters there. If it's in the same location as the scraper it will automatically read the file and apply the parameters.
+
+If you want to use a different name for it, you can pass `--parameter-file <filename>` to read parameters from `<filename>` instead.
+Ironically this parameter cannot be put into the parameter file.""",
+    # --print-source [FILE]
+    """Have your own method of downloading videos?
+You can use the parameter `--print-source` to print the direct links to the recordings instead of downloading them.
+By default the links are printed in your terminal. If you follow up the parameter with a file e.g. `--print-source video_links.txt` a file with that name is created and all the links are saved there.""",
+    # --quality {high,medium,low}
+    """Downloading recordings takes too long as the files are too big?
+You can switch between different video qualities using the `--quality` parameter together with the keyword 'high', 'low', or 'medium'
+Example:
+
+    python3 vo-scraper.py --quality high https://video.ethz.ch/lectures/d-infk/2019/spring/252-0028-00L.html
+
+Note that the default quality is 'high', so if you just want the highest possible quality, there's no need to pass this parameter.""",
+    # --skip-connection-check
+    # --skip-update-check
+    """In order to ensure functionality, the scraper will check whether your version is up to date and that you have a connection to video.ethz.ch (as well as the internet if video.ethz.ch fails). 
+If you don't like this, you can pass the parameter `--skip-update-check` to prevent looking for updates and `--skip-connection-check` to prevent checking for an internet connection.""", +] # =============================================================== # _____ _ _ # | ___| _ _ _ __ ___ | |_ (_) ___ _ __ ___ @@ -109,6 +188,7 @@ class bcolors: # # =============================================================== + def print_information(str, type='info', verbose_only=False): """Print provided string. @@ -128,7 +208,8 @@ def print_information(str, type='info', verbose_only=False): print(print_type_dict[type], str) elif verbose: # Always print with tag - print(print_type_dict[type],str) + print(print_type_dict[type], str) + def get_credentials(user, passw): """Gets user credentials and returns them @@ -138,12 +219,13 @@ def get_credentials(user, passw): passw -- The password passed from a text file """ if not user: - user = input("Enter your username: ") + user = input("Enter your username: ") if not passw: passw = getpass.getpass() return(user, passw) + def acquire_login_cookie(protection, vo_link, user, passw): """Gets login-cookie by sending user credentials to login server @@ -152,6 +234,9 @@ def acquire_login_cookie(protection, vo_link, user, passw): vo_link -- The link to the lecture user -- The username passed from a text file passw -- The password passed from a text file + + Returns: + Cookie jar containing the users valid authentication cookie """ global user_agent @@ -164,8 +249,8 @@ def acquire_login_cookie(protection, vo_link, user, passw): (user, passw) = get_credentials(user, passw) # Setup headers and content to send - headers = {"User-Agent": user_agent, "Referer": vo_link+".html"} - data = { "__charset__": "utf-8", "j_validate": True, "j_username": user, "j_password": passw} + headers = {"User-Agent": user_agent, "Referer": vo_link + ".html"} + data = {"__charset__": "utf-8", "j_validate": True, "j_username": user, "j_password": passw} # Request 
login-cookie r = requests.post("https://video.ethz.ch/j_security_check", headers=headers, data=data) @@ -177,7 +262,7 @@ def acquire_login_cookie(protection, vo_link, user, passw): break else: print_information("Wrong username or password, please try again", type='warning') - (user, passw) = ('', '') # Reset passed credentials to not end up in loop if wrong credentials were passed + (user, passw) = ('', '') # Reset passed credentials to not end up in loop if wrong credentials were passed elif protection == "PWD": print_information("This lecture requires a CUSTOM login. Check the lecture's website or your emails for the credentials.") @@ -186,11 +271,11 @@ def acquire_login_cookie(protection, vo_link, user, passw): (user, passw) = get_credentials(user, passw) # Setup headers and content to send - headers = {"Referer": vo_link+".html", "User-Agent":user_agent} - data = { "__charset__": "utf-8", "username": user, "password": passw } + headers = {"Referer": vo_link + ".html", "User-Agent": user_agent} + data = {"__charset__": "utf-8", "username": user, "password": passw} # Get login cookie - r = requests.post(vo_link+".series-login.json", headers=headers, data=data) + r = requests.post(vo_link + ".series-login.json", headers=headers, data=data) # Put login cookie in cookie_jar cookie_jar = r.cookies @@ -198,7 +283,7 @@ def acquire_login_cookie(protection, vo_link, user, passw): break else: print_information("Wrong username or password, please try again", type='warning') - (user, passw) = ('', '') # Reset passed credentials to not end up in loop if wrong credentials were passed + (user, passw) = ('', '') # Reset passed credentials to not end up in loop if wrong credentials were passed else: print_information("Unknown protection type: " + protection, type='error') @@ -210,6 +295,7 @@ def acquire_login_cookie(protection, vo_link, user, passw): return cookie_jar + def pretty_print_episodes(vo_json_data, selected): """Prints the episode numbers that match `selected`""" # 
Get length of longest strings for nice formatting when printing @@ -242,20 +328,21 @@ def pretty_print_episodes(vo_json_data, selected): str(episode['createdBy']).ljust(max_lecturer_length) ) + def make_range(item, max_episode_number): """ Keyword arguments: item -- a string in the form of 'x..z' or 'x..y..z' max_episode_number -- The highest episode number to have an upperbound for the range of episodes - + Returns: A range from x to z, with step size y, 1 if y wasn't provided """ if len(item.split('..')) == 2: # user passed something like 'x..z', so step size is 1 lower_bound, upper_bound = item.split('..') - step = 1 + step = 1 else: # user passed something like 'x..y..z', so step size is y lower_bound, step, upper_bound = item.split('..') @@ -265,7 +352,8 @@ def make_range(item, max_episode_number): upper_bound = int(upper_bound) if upper_bound else max_episode_number step = int(step) - return range(lower_bound, upper_bound+1, step) + return range(lower_bound, upper_bound + 1, step) + def get_user_choice(max_episode_number): """ @@ -273,7 +361,7 @@ def get_user_choice(max_episode_number): Keyword arguments: max_episode_number -- The highest episode number to have an upperbound for the range of episodes - + Returns: A list containg the user picked choices """ @@ -287,14 +375,15 @@ def get_user_choice(max_episode_number): choice.append(int(elem)) else: choice += make_range(elem, max_episode_number) - + # make elements of `choice` unique - choice = set(choice) + choice = set(choice) # sort them, to download in order and not randomly choice = sorted(choice) return choice + def vo_scrapper(vo_link, user, passw): """ Gets the list of all available videos for a lecture. 
@@ -311,6 +400,7 @@ def vo_scrapper(vo_link, user, passw): """ global user_agent global download_all + global download_latest global video_quality global quality_dict @@ -341,6 +431,9 @@ def vo_scrapper(vo_link, user, passw): if download_all: # Add all available videos to the selected choice = list(range(len(vo_json_data['episodes']))) + elif download_latest: + # Only add newest video to the selected + choice = [0] else: # Let user pick videos try: @@ -353,7 +446,7 @@ def vo_scrapper(vo_link, user, passw): # Print the user's choice if not choice: print_information("No videos selected") - return list() # Nothing to do anymore + return list() # Nothing to do anymore else: print_information("You selected:") pretty_print_episodes(vo_json_data, choice) @@ -375,7 +468,7 @@ def vo_scrapper(vo_link, user, passw): for item_nr in choice: # Get link to video metadata json file item = vo_json_data['episodes'][item_nr] - video_info_link = video_info_prefix+item['id'] + video_info_link = video_info_prefix + item['id'] # Download the video metadata file # Use login-cookie if provided otherwise make request without cookie @@ -393,27 +486,26 @@ def vo_scrapper(vo_link, user, passw): continue video_json_data = json.loads(r.text) - # Put available versions in list for sorting by video quality counter = 0 versions = list() print_information("Available versions:", verbose_only=True) for vid_version in video_json_data['streams'][0]['sources']['mp4']: - versions.append((counter, vid_version['res']['w']*vid_version['res']['h'])) - print_information(str(counter) + ": " + "%4d" %vid_version['res']['w'] + "x" + "%4d" %vid_version['res']['h'], verbose_only=True) + versions.append((counter, vid_version['res']['w'] * vid_version['res']['h'])) + print_information(str(counter) + ": " + "%4d" % vid_version['res']['w'] + "x" + "%4d" % vid_version['res']['h'], verbose_only=True) counter += 1 versions.sort(key=lambda tup: tup[1], reverse=True) # Now it's sorted: high -> medium -> low # Get video src 
url from json - try: # try/except block to handle cases were not all three types of quality exist + try: # try/except block to handle cases were not all three types of quality exist video_src_link = video_json_data['streams'][0]['sources']['mp4'][versions[quality_dict[video_quality]][0]]['src'] except IndexError: print_information("Requested quality \"" + video_quality + "\" not available. Skipping episode!", type='error') continue lecture_title = vo_json_data['title'] - episode_title = vo_json_data["episodes"][item_nr]["title"] + episode_title = vo_json_data["episodes"][item_nr]["title"] # If video and lecture title overlap, remove lecture title from video title if episode_title.startswith(lecture_title): @@ -423,21 +515,22 @@ def vo_scrapper(vo_link, user, passw): episode_name = item['createdAt'][:-6] + " " + lecture_title + episode_title # Append date - episode_title = item['createdAt'][:-6]+episode_title + episode_title = item['createdAt'][:-6] + episode_title # Generate a pseudo hash by using part of the filename of the online version (which appears to be a UUID) - pseudo_hash = video_src_link.replace('https://oc-vp-dist-downloads.ethz.ch/mh_default_org/oaipmh-mmp/','')[:8] + pseudo_hash = video_src_link.replace('https://oc-vp-dist-downloads.ethz.ch/mh_default_org/oaipmh-mmp/', '')[:8] print_information(pseudo_hash, verbose_only=True) # Filename is `directory/