From 1e7457b619524e303376603f06baf36171e86877 Mon Sep 17 00:00:00 2001 From: Niklas Wagner Date: Sat, 28 Sep 2024 16:54:46 +0200 Subject: [PATCH 1/8] Implement videolength filter with ffprobe --- docs/configuration.rst | 34 +++++++++++++ gallery_dl/downloader/http.py | 96 +++++++++++++++++++++++++++++++++++ gallery_dl/text.py | 22 ++++++++ 3 files changed, 152 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index dbc1d1ef45..7098249450 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -5018,6 +5018,40 @@ Description These suffixes are case-insensitive. +downloader.*.videolength-min & .videolength-max +----------------------------------------------- +Type + ``string`` +Default + ``null`` +Example + ``"1min"``, ``"1m30s"``, ``"1h21min31s"`` +Description + Minimum/Maximum allowed video length. + Any video shorter/longer than this limit will not be downloaded. + + A file qualifies as a video if it contains at least 10 frames. If a file contains multiple video streams, the shortest one will be used for comparison. + + This option requires ``ffprobe`` to be available. Additionally, ``downloader.*.ffprobe-location`` can be configured. + + Possible values are valid integer numbers followed by one of the following suffixes: + * Hours: ``hours``, ``hour``, ``h`` + * Minutes: ``minutes``, ``minute``, ``min``, ``m`` + * Seconds: ``seconds``, ``second``, ``sec``, ``s`` + + Multiple values can be combined, e.g. ``2hours30min2s`` + + +downloader.*.ffprobe-location +----------------------------- +Type + ``string`` +Default + ``ffprobe`` +Description + Path/Location of ``ffprobe``. Used for the ``downloader.*.videolength-min & .videolength-max`` options.
+ + downloader.*.mtime ------------------ Type diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 54750ac733..62e38cd51c 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -10,6 +10,9 @@ import time import mimetypes +import subprocess +import json +from datetime import timedelta from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase from .. import text, util @@ -32,6 +35,10 @@ def __init__(self, job): self.headers = self.config("headers") self.minsize = self.config("filesize-min") self.maxsize = self.config("filesize-max") + self.minlength = self.config("videolength-min") + self.maxlength = self.config("videolength-max") + ffprobe = self.config("ffprobe-location") + self.ffprobe = util.expand_path(ffprobe) if ffprobe else "ffprobe" self.retries = self.config("retries", extractor._retries) self.retry_codes = self.config("retry-codes", extractor._retry_codes) self.timeout = self.config("timeout", extractor._timeout) @@ -59,6 +66,18 @@ def __init__(self, job): self.log.warning( "Invalid maximum file size (%r)", self.maxsize) self.maxsize = maxsize + if self.minlength: + minlength = text.parse_duration(self.minlength) + if not minlength: + self.log.warning( + "Invalid maximum videolength duration (%r)", self.minlength) + self.minlength = minlength + if self.maxlength: + maxlength = text.parse_duration(self.maxlength) + if not maxlength: + self.log.warning( + "Invalid maximum videolength duration (%r)", self.maxlength) + self.maxlength = maxlength if isinstance(self.chunk_size, str): chunk_size = text.parse_bytes(self.chunk_size) if not chunk_size: @@ -219,6 +238,26 @@ def _download_impl(self, url, pathfmt): kwdict[metadata] = util.extract_headers(response) build_path = True + # check video length using ffprobe request + if (self.minlength or self.maxlength): + length = self._fetch_videolength(url) + + if length and self.minlength and length < 
self.minlength: + self.release_conn(response) + self.log.warning( + "Video length is shorter than allowed minimum (%s < %s)", + length, self.minlength) + pathfmt.temppath = "" + return True + + if length and self.maxlength and length > self.maxlength: + self.release_conn(response) + self.log.warning( + "Video length is longer than allowed maximum (%s > %s)", + length, self.maxlength) + pathfmt.temppath = "" + return True + # build and check file path if build_path: pathfmt.build_path() @@ -376,6 +415,63 @@ def _adjust_extension(pathfmt, file_header): return True return False + def _fetch_videolength(self, url): + minimum_frames = 10 + args = [ + self.ffprobe, + "-v", + "quiet", + "-print_format", + "json", + "-show_format", + "-show_streams", + url, + ] + + try: + result = subprocess.run( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + ) + data = json.loads(result.stdout) + + video_streams = [ + float(stream["duration"]) + for stream in data["streams"] + if stream["codec_type"] == "video" + and "duration" in stream + and "avg_frame_rate" in stream + and self._frame_count(stream) >= minimum_frames + ] + + if not video_streams: + self.log.info( + "No video streams found or none with a valid duration and minimum frames." 
+ ) + return None + + duration = timedelta(seconds=min(video_streams)) + return duration + + except subprocess.CalledProcessError as e: + self.log.error("ffprobe failed: %s", e.stderr) + return None + except json.JSONDecodeError: + self.log.error("Failed to decode ffprobe output as JSON") + return None + + def _frame_count(self, stream): + """Calculates the number of frames in the video stream.""" + try: + duration = float(stream["duration"]) + avg_frame_rate = eval(stream["avg_frame_rate"]) + return int(duration * avg_frame_rate) + except (ValueError, ZeroDivisionError): + return 0 + MIME_TYPES = { "image/jpeg" : "jpg", diff --git a/gallery_dl/text.py b/gallery_dl/text.py index 8517cdf5dd..de8f872cae 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -268,6 +268,28 @@ def parse_timestamp(ts, default=None): return default +def parse_duration(duration_string, default=None): + try: + patterns = { + 'hours': r'(\d+)\s*h(our(s)?)?', + 'minutes': r'(\d+)\s*m(in(ute)?(s)?)?', + 'seconds': r'(\d+)\s*s(ec(ond)?(s)?)?' 
+ } + parsed_values = {unit: 0 for unit in patterns.keys()} + + for unit, pattern in patterns.items(): + match = re.search(pattern, duration_string, re.IGNORECASE) + if match: + parsed_values[unit] = int(match.group(1)) + + return datetime.timedelta( + hours=parsed_values['hours'], + minutes=parsed_values['minutes'], + seconds=parsed_values['seconds']) + except Exception: + return default + + def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0): """Create a datetime object by parsing 'date_string'""" try: From 4121bfa9e615f79c2d0bce4cc32d2225eb6980d9 Mon Sep 17 00:00:00 2001 From: Niklas Wagner Date: Sat, 28 Sep 2024 17:09:38 +0200 Subject: [PATCH 2/8] Fix line length error --- gallery_dl/downloader/http.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 62e38cd51c..039a0d5d5f 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -215,14 +215,18 @@ def _download_impl(self, url, pathfmt): self.release_conn(response) self.log.warning( "File size smaller than allowed minimum (%s < %s)", - size, self.minsize) + size, + self.minsize, + ) pathfmt.temppath = "" return True if self.maxsize and size > self.maxsize: self.release_conn(response) self.log.warning( "File size larger than allowed maximum (%s > %s)", - size, self.maxsize) + size, + self.maxsize, + ) pathfmt.temppath = "" return True @@ -449,7 +453,8 @@ def _fetch_videolength(self, url): if not video_streams: self.log.info( - "No video streams found or none with a valid duration and minimum frames." + "No video streams found or none with a valid duration " + "and minimum frames." 
) return None From 8856d66999e1976e142309244db1b8fd55fb9e22 Mon Sep 17 00:00:00 2001 From: Niklas Wagner Date: Sat, 28 Sep 2024 17:11:28 +0200 Subject: [PATCH 3/8] Fix more lint errors --- gallery_dl/downloader/http.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 039a0d5d5f..403d3c9229 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -70,13 +70,15 @@ def __init__(self, job): minlength = text.parse_duration(self.minlength) if not minlength: self.log.warning( - "Invalid maximum videolength duration (%r)", self.minlength) + "Invalid maximum videolength duration (%r)", + self.minlength) self.minlength = minlength if self.maxlength: maxlength = text.parse_duration(self.maxlength) if not maxlength: self.log.warning( - "Invalid maximum videolength duration (%r)", self.maxlength) + "Invalid maximum videolength duration (%r)", + self.maxlength) self.maxlength = maxlength if isinstance(self.chunk_size, str): chunk_size = text.parse_bytes(self.chunk_size) @@ -249,7 +251,8 @@ def _download_impl(self, url, pathfmt): if length and self.minlength and length < self.minlength: self.release_conn(response) self.log.warning( - "Video length is shorter than allowed minimum (%s < %s)", + "Video length is shorter than allowed minimum " + "(%s < %s)", length, self.minlength) pathfmt.temppath = "" return True @@ -257,7 +260,8 @@ def _download_impl(self, url, pathfmt): if length and self.maxlength and length > self.maxlength: self.release_conn(response) self.log.warning( - "Video length is longer than allowed maximum (%s > %s)", + "Video length is longer than allowed maximum " + "(%s > %s)", length, self.maxlength) pathfmt.temppath = "" return True @@ -445,10 +449,10 @@ def _fetch_videolength(self, url): video_streams = [ float(stream["duration"]) for stream in data["streams"] - if stream["codec_type"] == "video" - and "duration" in stream - and 
"avg_frame_rate" in stream - and self._frame_count(stream) >= minimum_frames + if stream["codec_type"] == "video" and + "duration" in stream and + "avg_frame_rate" in stream and + self._frame_count(stream) >= minimum_frames ] if not video_streams: From 3c7d897ee7bf08e8968aaba85eec90bf9154a813 Mon Sep 17 00:00:00 2001 From: Niklas Wagner Date: Sat, 28 Sep 2024 17:12:57 +0200 Subject: [PATCH 4/8] Reduce unnessecary diff --- gallery_dl/downloader/http.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 403d3c9229..a697d97c7a 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -217,8 +217,7 @@ def _download_impl(self, url, pathfmt): self.release_conn(response) self.log.warning( "File size smaller than allowed minimum (%s < %s)", - size, - self.minsize, + size, self.minsize, ) pathfmt.temppath = "" return True @@ -226,8 +225,7 @@ def _download_impl(self, url, pathfmt): self.release_conn(response) self.log.warning( "File size larger than allowed maximum (%s > %s)", - size, - self.maxsize, + size, self.maxsize, ) pathfmt.temppath = "" return True From 1a8937cca78e9c142a60e53d15683eac1334fba7 Mon Sep 17 00:00:00 2001 From: Niklas Wagner Date: Sat, 28 Sep 2024 17:25:43 +0200 Subject: [PATCH 5/8] Update Readme --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index d0e4a721ec..c3e01ed236 100644 --- a/README.rst +++ b/README.rst @@ -26,7 +26,7 @@ Optional -------- - yt-dlp_ or youtube-dl_: HLS/DASH video downloads, ``ytdl`` integration -- FFmpeg_: Pixiv Ugoira conversion +- FFmpeg_: Video length filters & Pixiv Ugoira conversion - mkvmerge_: Accurate Ugoira frame timecodes - PySocks_: SOCKS proxy support - brotli_ or brotlicffi_: Brotli compression support From 3aabb09526020c8d15b4ac90df4286b6b1339e0c Mon Sep 17 00:00:00 2001 From: Niklas Wagner Date: Sat, 28 Sep 2024 17:31:36 +0200 Subject: [PATCH 6/8] 
Redurce diff --- gallery_dl/downloader/http.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index a697d97c7a..d7780093f8 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -217,16 +217,14 @@ def _download_impl(self, url, pathfmt): self.release_conn(response) self.log.warning( "File size smaller than allowed minimum (%s < %s)", - size, self.minsize, - ) + size, self.minsize) pathfmt.temppath = "" return True if self.maxsize and size > self.maxsize: self.release_conn(response) self.log.warning( "File size larger than allowed maximum (%s > %s)", - size, self.maxsize, - ) + size, self.maxsize) pathfmt.temppath = "" return True From 56054bd30e802cdc98191dcf80079bc66c6b8139 Mon Sep 17 00:00:00 2001 From: Niklas Wagner Date: Fri, 25 Oct 2024 15:38:47 +0200 Subject: [PATCH 7/8] Add Headers and Retry Mechanism to ffprobe Utility --- gallery_dl/downloader/http.py | 67 +----------------------- gallery_dl/ffprobe.py | 96 +++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 65 deletions(-) create mode 100644 gallery_dl/ffprobe.py diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index d7780093f8..2ab721d0e8 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -10,12 +10,9 @@ import time import mimetypes -import subprocess -import json -from datetime import timedelta from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase -from .. import text, util +from .. 
import text, util, ffprobe from ssl import SSLError @@ -37,8 +34,6 @@ def __init__(self, job): self.maxsize = self.config("filesize-max") self.minlength = self.config("videolength-min") self.maxlength = self.config("videolength-max") - ffprobe = self.config("ffprobe-location") - self.ffprobe = util.expand_path(ffprobe) if ffprobe else "ffprobe" self.retries = self.config("retries", extractor._retries) self.retry_codes = self.config("retry-codes", extractor._retry_codes) self.timeout = self.config("timeout", extractor._timeout) @@ -242,7 +237,7 @@ def _download_impl(self, url, pathfmt): # check video length using ffprobe request if (self.minlength or self.maxlength): - length = self._fetch_videolength(url) + length = ffprobe.get_video_length(self, url) if length and self.minlength and length < self.minlength: self.release_conn(response) @@ -419,64 +414,6 @@ def _adjust_extension(pathfmt, file_header): return True return False - def _fetch_videolength(self, url): - minimum_frames = 10 - args = [ - self.ffprobe, - "-v", - "quiet", - "-print_format", - "json", - "-show_format", - "-show_streams", - url, - ] - - try: - result = subprocess.run( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - check=True, - ) - data = json.loads(result.stdout) - - video_streams = [ - float(stream["duration"]) - for stream in data["streams"] - if stream["codec_type"] == "video" and - "duration" in stream and - "avg_frame_rate" in stream and - self._frame_count(stream) >= minimum_frames - ] - - if not video_streams: - self.log.info( - "No video streams found or none with a valid duration " - "and minimum frames." 
- ) - return None - - duration = timedelta(seconds=min(video_streams)) - return duration - - except subprocess.CalledProcessError as e: - self.log.error("ffprobe failed: %s", e.stderr) - return None - except json.JSONDecodeError: - self.log.error("Failed to decode ffprobe output as JSON") - return None - - def _frame_count(self, stream): - """Calculates the number of frames in the video stream.""" - try: - duration = float(stream["duration"]) - avg_frame_rate = eval(stream["avg_frame_rate"]) - return int(duration * avg_frame_rate) - except (ValueError, ZeroDivisionError): - return 0 - MIME_TYPES = { "image/jpeg" : "jpg", diff --git a/gallery_dl/ffprobe.py b/gallery_dl/ffprobe.py new file mode 100644 index 0000000000..7ae2839d33 --- /dev/null +++ b/gallery_dl/ffprobe.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Fetch Video Length before actually downloading a whole file""" + +import subprocess +import json +import time +from datetime import timedelta +from . 
import util + + +def get_video_length(obj, url): + minimum_frames = 10 + data = None + tries = 0 + msg = "" + + ffprobe = util.expand_path(obj.config("ffprobe-location", "ffprobe")) + + command = [ + ffprobe, + "-v", + "quiet", + "-print_format", + "json", + "-show_format", + "-show_streams", + ] + + if obj.headers: + for key, value in obj.headers.items(): + command.extend(["-headers", f"{key}: {value}"]) + + command.append(url) + + while True: + if tries: + obj.log.warning("%s (%s/%s)", msg, tries, obj.retries+1) + if tries > obj.retries: + return False + time.sleep(tries) + tries += 1 + + try: + result = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True, + ) + data = json.loads(result.stdout) + except subprocess.CalledProcessError as e: + msg = f"ffprobe failed: {e}" + print(e) + continue + except json.JSONDecodeError: + msg = "Failed to decode ffprobe output as JSON" + continue + + # A file typically contains multiple streams (video, audio, subtitle). + # Here we filter out everything that is not considered a video + video_streams = [ + float(stream["duration"]) + for stream in data["streams"] + if stream["codec_type"] == "video" and + "duration" in stream and + "avg_frame_rate" in stream and + frame_count(stream) >= minimum_frames + ] + + if not video_streams: + obj.log.info( + "No video streams found or none with a valid duration " + "and minimum frames." 
+ ) + return None + + duration = timedelta(seconds=min(video_streams)) + return duration + + +def frame_count(stream): + """Calculates the number of frames in the video stream.""" + try: + duration = float(stream["duration"]) + avg_frame_rate = eval(stream["avg_frame_rate"]) + return int(duration * avg_frame_rate) + except (ValueError, ZeroDivisionError): + return 0 From 7e7c92eeef7a91e30ba28d2ff91e533258bc704c Mon Sep 17 00:00:00 2001 From: Niklas Wagner Date: Fri, 25 Oct 2024 15:44:07 +0200 Subject: [PATCH 8/8] Remove f string since they don't exist in python 3.5 --- gallery_dl/ffprobe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/ffprobe.py b/gallery_dl/ffprobe.py index 7ae2839d33..9674cb8f85 100644 --- a/gallery_dl/ffprobe.py +++ b/gallery_dl/ffprobe.py @@ -35,7 +35,7 @@ def get_video_length(obj, url): if obj.headers: for key, value in obj.headers.items(): - command.extend(["-headers", f"{key}: {value}"]) + command.extend(["-headers", "{}: {}".format(key, value)]) command.append(url) @@ -57,7 +57,7 @@ def get_video_length(obj, url): ) data = json.loads(result.stdout) except subprocess.CalledProcessError as e: - msg = f"ffprobe failed: {e}" + msg = "ffprobe failed: {}".format(e) print(e) continue except json.JSONDecodeError: