Skip to content

Commit

Permalink
[motherless] add support
Browse files Browse the repository at this point in the history
- Add support for single images and videos
- Tidy metadata extractors
- Add gallery downloaders
- Add support details
- Update media count extractors
  • Loading branch information
smackingpotato authored and mikf committed Oct 29, 2024
1 parent ecc08e3 commit 9a792ab
Show file tree
Hide file tree
Showing 4 changed files with 316 additions and 0 deletions.
6 changes: 6 additions & 0 deletions docs/supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,12 @@ Consider all listed sites to potentially be NSFW.
<td>Albums, Channels</td>
<td>Supported</td>
</tr>
<tr>
<td>Motherless</td>
<td>https://www.motherless.com/</td>
<td>Individual Images, Individual Videos, Galleries</td>
<td></td>
</tr>
<tr>
<td>My Hentai Gallery</td>
<td>https://myhentaigallery.com/</td>
Expand Down
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@
"mangasee",
"mangoxo",
"misskey",
"motherless",
"myhentaigallery",
"myportfolio",
"naver",
Expand Down
308 changes: 308 additions & 0 deletions gallery_dl/extractor/motherless.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://motherless.com/"""

from .common import Extractor, Message
from .. import text
import re
from datetime import datetime, timedelta, timezone
from html import unescape

# Shared URL prefix for every extractor pattern in this module: an optional
# http/https scheme followed by the bare domain (a "www." prefix is not
# matched by this expression).
ROOT_URL_PATTERN = r"(?:https?://)?motherless\.com"


class MotherlessExtractor(Extractor):
    """Base class for motherless extractors"""

    # Site origin; the gallery extractors in this module build their
    # page URLs from it.
    root = "https://motherless.com"
    category = "motherless"
    # Default file name: media ID followed by the media title.
    filename_fmt = "{id} {title}.{extension}"


class MotherlessMediaExtractor(MotherlessExtractor):
    """Extractor for a single image/video from motherless.com"""

    # One path segment that does not start with a gallery prefix
    # (G, GI or GV), anchored at the end of the URL.
    pattern = ROOT_URL_PATTERN + "/((?!GV|GI|G)[A-Z0-9]+)$"
    example = "https://motherless.com/ABC123"
    directory_fmt = ("{category}",)

    def items(self):
        """Yield one Directory message followed by one Url message."""
        media_url, data = self.get_image()
        yield Message.Directory, data
        yield Message.Url, media_url, data

    def get_image(self):
        """Fetch the media page and return ``(media_url, metadata)``.

        The page text is stored in ``self.page_data`` so that subclasses
        can read further metadata from it.  A ``<link rel="image_src">``
        element marks the page as an image; otherwise the ``__fileurl``
        script variable is used and the media is treated as an mp4 video.
        ``self.subcategory`` is set to ``"image"`` or ``"video"``
        accordingly.

        Raises:
            AttributeError: if the page contains neither an image link
                nor a ``__fileurl`` assignment.
        """
        self.page_data = page = self.request(self.url).text

        # Negated character classes keep each capture inside its own
        # quotes even when the surrounding HTML is on a single line.
        image = re.search(
            r'<link rel="image_src" type="image/([a-z]+)" href="([^"]+)">',
            page)

        if image:
            self.subcategory = "image"
            extension, media_url = image.groups()
            media_id = get_image_id(media_url)
        else:
            self.subcategory = "video"
            video = re.search(r"__fileurl = '([^']+)'", page)
            extension = "mp4"
            # Raises AttributeError when no video URL is present either.
            media_url = video.group(1)
            media_id = get_video_id(media_url)

        data = {
            "url": self.url,
            "title": get_media_title(page),
            "id": media_id,
            "filename": media_id,
            "extension": extension,
            "date": get_media_date(page),
            "uploader": get_media_uploader(page),
            "tags": get_media_tags(page),
        }

        return media_url, data


class MotherlessMediaInGalleryExtractor(MotherlessMediaExtractor):
    """Extractor for a single image/video from a gallery from motherless.com"""

    directory_fmt = ("{category}", "{gallery_id} {gallery_title}")
    # Accept all three gallery prefixes used by the gallery extractors in
    # this module (G, GI, GV) so the prefix letter never ends up inside
    # the captured gallery ID.
    pattern = ROOT_URL_PATTERN + "/G[IV]?([A-Z0-9]+)/([A-Z0-9]+)"
    example = "https://motherless.com/GABC123/DEF456"

    def get_image(self):
        """Return ``(media_url, metadata)`` extended with gallery fields.

        Adds ``gallery_id`` (first URL group) and ``gallery_title`` to the
        metadata produced by the parent class.
        """
        media_url, data = super().get_image()
        gallery_id = re.match(self.pattern, self.url).group(1)
        data["gallery_id"] = gallery_id
        data["gallery_title"] = self.get_gallery_name(gallery_id)
        return media_url, data

    def get_gallery_name(self, gallery_id):
        """Return the gallery title for ``gallery_id``.

        Uses the 'From the gallery: ...' link of the already downloaded
        media page when it is present; otherwise requests the gallery
        home page and reads the title from there.
        """
        # 'From the gallery: ...' does not always appear in the page.
        found = re.search('From the gallery: (.+?)</a>', self.page_data)
        if found:
            return unescape(found.group(1).strip())

        # Get gallery name from gallery home page.
        gallery_page = self.request(f"{self.root}/G{gallery_id}").text
        return get_gallery_name_from_homepage(gallery_page)


class MotherlessGalleryImagesExtractor(MotherlessExtractor):
    """Extractor for all images in a gallery from motherless.com"""

    pattern = ROOT_URL_PATTERN + "/GI([A-Z0-9]+)$"
    example = "https://motherless.com/GIABC123"
    subcategory = "image gallery"
    directory_fmt = ("{category}", "{gallery_id} {gallery_title}")

    def items(self):
        """Yield a Directory message, then one Url message per image."""
        url_match = re.match(self.pattern, self.url)
        self.gallery_id = url_match.group(1)

        # Gallery-level metadata is read from the gallery home page.
        homepage = self.request(f"{self.root}/G{self.gallery_id}").text
        data = {
            "gallery_id": self.gallery_id,
            "gallery_title": get_gallery_name_from_homepage(homepage),
            "uploader": get_gallery_uploader(homepage),
            "count": get_gallery_image_count(homepage),
        }
        yield Message.Directory, data

        # The same dict is updated in place and re-yielded for each image.
        for media_id, media_url, extension, title, num in get_images(self):
            data["id"] = media_id
            data["filename"] = media_id
            data["extension"] = extension
            data["title"] = title
            data["num"] = num
            yield Message.Url, media_url, data


class MotherlessGalleryVideosExtractor(MotherlessExtractor):
    """Extractor for all videos in a gallery from motherless.com"""

    pattern = ROOT_URL_PATTERN + "/GV([A-Z0-9]+)$"
    example = "https://motherless.com/GVABC123"
    subcategory = "video gallery"
    directory_fmt = ("{category}", "{gallery_id} {gallery_title}")

    def items(self):
        """Yield a Directory message, then one Url message per video."""
        url_match = re.match(self.pattern, self.url)
        self.gallery_id = url_match.group(1)

        # Gallery-level metadata is read from the gallery home page.
        homepage = self.request(f"{self.root}/G{self.gallery_id}").text
        data = {
            "gallery_id": self.gallery_id,
            "gallery_title": get_gallery_name_from_homepage(homepage),
            "uploader": get_gallery_uploader(homepage),
            "count": get_gallery_video_count(homepage),
        }
        yield Message.Directory, data

        # The same dict is updated in place and re-yielded for each video.
        for media_id, media_url, title, num in get_videos(self):
            data["id"] = media_id
            data["filename"] = media_id
            data["extension"] = "mp4"
            data["title"] = title
            data["num"] = num
            yield Message.Url, media_url, data


class MotherlessGalleryExtractor(MotherlessExtractor):
    """Extractor for all images and videos in a gallery from motherless.com"""

    subcategory = "gallery"
    directory_fmt = ("{category}", "{gallery_id} {gallery_title}")
    pattern = ROOT_URL_PATTERN + "/G([A-Z0-9]+)$"
    example = "https://motherless.com/GABC123"

    def items(self):
        """Yield a Directory message, then every image and every video.

        ``count`` is the combined number of images and videos, and ``num``
        runs continuously over both (videos continue where the images
        stopped) so that no two files of one gallery share an index.
        """
        self.gallery_id = re.match(self.pattern, self.url).group(1)
        page = self.request(f"{self.root}/G{self.gallery_id}").text

        image_count = get_gallery_image_count(page)
        video_count = get_gallery_video_count(page)
        data = {
            "gallery_id": self.gallery_id,
            "gallery_title": get_gallery_name_from_homepage(page),
            "uploader": get_gallery_uploader(page),
            "count": image_count + video_count,
        }

        yield Message.Directory, data

        # Index of the last image actually yielded; videos are numbered
        # after it instead of starting over at 1.
        last_image_num = 0

        for media_id, url, extension, title, num in get_images(self):
            data.update({
                "id": media_id,
                "filename": media_id,
                "extension": extension,
                "title": title,
                "num": num,
            })
            last_image_num = num

            yield Message.Url, url, data

        for media_id, url, title, num in get_videos(self):
            data.update({
                "id": media_id,
                "filename": media_id,
                "extension": "mp4",
                "title": title,
                "num": last_image_num + num,
            })

            yield Message.Url, url, data


# URL extractors.

def get_images(extractor):
    """Yield ``(id, url, extension, title, num)`` for each gallery image.

    Walks the paginated ``/GI<gallery_id>?page=N`` listing of
    ``extractor`` (which must provide ``root``, ``gallery_id`` and
    ``request()``) starting at page 1 and stops at the first page that
    contains no thumbnail.  ``num`` counts images from 1 across pages.
    Titles are taken from the thumbnail's ``alt`` attribute and
    HTML-unescaped.
    """
    # Raw string: avoids invalid escape sequences in the literal.
    # '[^"]*' keeps the title inside its own quotes even when further
    # attributes follow on the same line.
    thumb_pattern = re.compile(
        r' src="https://cdn5-thumbs\.motherlessmedia\.com/thumbs/'
        r'([A-Z0-9]+?)\.(jpg|gif)"[\s\S]+?alt="([^"]*)"')

    page_num = 1
    num = 0

    while True:
        page = extractor.request(
            f"{extractor.root}/GI{extractor.gallery_id}?page={page_num}"
        ).text
        found_any = False

        for thumb in thumb_pattern.finditer(page):
            found_any = True
            num += 1
            media_id, extension, title = thumb.groups()
            # NOTE(review): the full-size URL always ends in ".jpg", even
            # when the thumbnail (and reported extension) is "gif" -
            # confirm that the CDN serves GIFs under this name.
            url = ("https://cdn5-images.motherlessmedia.com/images/"
                   f"{media_id}.jpg")

            yield media_id, url, extension, unescape(title), num

        # NOTE(review): termination relies on the site returning a page
        # without thumbnails once the last page has been passed.
        if not found_any:
            return

        page_num += 1

def get_videos(extractor):
    """Yield ``(id, url, title, num)`` for each gallery video.

    Walks the paginated ``/GV<gallery_id>?page=N`` listing of
    ``extractor`` (which must provide ``root``, ``gallery_id`` and
    ``request()``) starting at page 1 and stops at the first page that
    contains no video thumbnail.  ``num`` counts videos from 1 across
    pages.  Titles are taken from the thumbnail's ``alt`` attribute and
    HTML-unescaped.
    """
    # Raw string: avoids invalid escape sequences in the literal.
    # '[^"]*' keeps the title inside its own quotes even when further
    # attributes follow on the same line.
    thumb_pattern = re.compile(
        r'thumbs/([A-Z0-9]+?)-strip\.jpg" alt="([^"]*)"')

    page_num = 1
    num = 0

    while True:
        page = extractor.request(
            f"{extractor.root}/GV{extractor.gallery_id}?page={page_num}"
        ).text
        found_any = False

        for thumb in thumb_pattern.finditer(page):
            found_any = True
            num += 1
            media_id, title = thumb.groups()
            url = ("https://cdn5-videos.motherlessmedia.com/videos/"
                   f"{media_id}.mp4")

            yield media_id, url, unescape(title), num

        # NOTE(review): termination relies on the site returning a page
        # without thumbnails once the last page has been passed.
        if not found_any:
            return

        page_num += 1

# Metadata extractors.

def get_media_tags(page_data):
    """Return the list of tags found on a media page.

    The content of the ``media-meta-tags`` div is split with
    ``text.split_html`` and every '#' character is removed from each
    entry.  Returns an empty list when the page has no such div.
    """
    # Raw string: '\S' and '\s' are regex classes, not string escapes.
    tags_block = re.search(
        r'<div class="media-meta-tags">([\S\s]+?)</div>', page_data)
    if tags_block is None:
        # No tags found.
        return []

    return [
        tag.replace('#', '')
        for tag in text.split_html(tags_block.group(1))
    ]

def get_media_title(page_data):
    """Return the media title of a media page as plain text.

    The content of the ``media-meta-title`` div is stripped of markup
    with ``text.remove_html`` and HTML-unescaped.  Returns an empty
    string when the page has no such div, in line with
    ``get_media_tags`` returning an empty list for missing tags.
    """
    # Raw string: '\S' and '\s' are regex classes, not string escapes.
    title_block = re.search(
        r'<div class="media-meta-title">([\S\s]+?)</div>', page_data)
    if title_block is None:
        # No title found.
        return ""

    return unescape(text.remove_html(title_block.group(1)))

def get_media_date(page_data):
    """Return the upload date of a media page as an ISO 8601 string.

    Two notations inside ``<span class="count">`` are recognised:

    * an absolute 'DD Mon YYYY' date, parsed with ``text.parse_datetime``;
    * a relative '<n>d ago' / '<n>h ago' age, which is subtracted from the
      current UTC time and truncated to midnight.

    Both branches return a string, so callers always receive the same
    type.  Returns ``None`` when neither notation is present.
    """
    # Find 'DD Mon YYYY' format.
    absolute = re.search(
        r'<span class="count">(\d{1,2}\s+\w+\s+\d{4})</span>', page_data)
    if absolute:
        return text.parse_datetime(
            absolute.group(1), "%d %b %Y").isoformat()

    # Find 'nd ago' / 'nh ago' format.
    # NOTE(review): the 'h ago' form is handled on the assumption that the
    # site uses it for uploads less than a day old - confirm.
    relative = re.search(
        r'<span class="count">(\d+)\s*([dh])\s*ago</span>', page_data)
    if relative is None:
        # No date found.
        return None

    amount = int(relative.group(1))
    if relative.group(2) == "h":
        age = timedelta(hours=amount)
    else:
        age = timedelta(days=amount)

    # Only the day is meaningful for a relative age, so drop the time.
    # NOTE(review): this value is timezone-aware (UTC offset appears in the
    # string); whether the absolute branch is naive depends on
    # text.parse_datetime - confirm.
    uploaded = (datetime.now(timezone.utc) - age).replace(
        hour=0, minute=0, second=0, microsecond=0)
    return uploaded.isoformat()

def get_media_uploader(page_data):
    """Return the uploader name shown on a media page.

    Takes the whitespace-surrounded content of the first
    ``class="username"`` element that is closed by ``</span>`` and strips
    any markup from it with ``text.remove_html``.  Returns an empty
    string when no such element is found, in line with
    ``get_media_tags`` returning an empty list for missing tags.
    """
    # Raw string: '\s' is a regex class, not a string escape.
    username_block = re.search(
        r'class="username">\s+(.+[^\s])\s+</span>', page_data)
    if username_block is None:
        # No uploader found.
        return ""

    return text.remove_html(username_block.group(1))

def get_image_id(image_url):
    """Return the image ID embedded in ``image_url``.

    The ID is the first element of what ``text.extract`` returns for the
    markers 'images/' and '.' (presumably the text between them).
    """
    extracted = text.extract(image_url, 'images/', '.')
    return extracted[0]

def get_video_id(video_url):
    """Return the video ID embedded in ``video_url``.

    The ID is the first element of what ``text.extract`` returns for the
    markers 'videos/' and '.' (presumably the text between them).  If it
    contains a '-', it is passed through ``text.extract`` once more with
    an empty start marker and '-' as end marker.
    """
    video_id = text.extract(video_url, 'videos/', '.')[0]

    if '-' not in video_id:
        return video_id
    return text.extract(video_id, '', '-')[0]

def get_gallery_name_from_homepage(page_data):
    """Return the gallery name taken from the page's ``<title>`` element.

    Everything between ``<title>`` and the last ' |' on that line is
    returned, HTML-unescaped; the greedy match keeps any ' | ' that is
    part of the name itself.

    Raises:
        AttributeError: if the page has no matching ``<title>``.
    """
    # Raw string: '\|' is a regex escape, not a string escape.
    title = re.search(r'<title>(.+) \|', page_data)
    return unescape(title.group(1))

def get_gallery_uploader(page_data):
    """Return the uploader name of a gallery home page.

    The name is the path component after ``/m/`` in the first link that
    follows the ``gallery-member-username`` element.

    Raises:
        AttributeError: if the page has no such element or link.
    """
    # Raw string: '\s' and '\S' are regex classes, not string escapes.
    member_link = re.search(
        r'gallery-member-username">[\s\S]+?<a href="/m/(.+?)"', page_data)
    return member_link.group(1)

def get_gallery_image_count(page_data):
    """Return the number of images announced on a gallery page.

    Reads the number from the first 'Images (<n>)' label, ignoring
    thousands separators.  Returns 0 when the label is absent.
    """
    # Raw string: '\(' and '\)' are regex escapes, not string escapes.
    label = re.search(r'Images \(([0-9,]+)\)', page_data)
    if label is None:
        # No images found.
        return 0

    return int(label.group(1).replace(',', ''))

def get_gallery_video_count(page_data):
    """Return the number of videos announced on a gallery page.

    Reads the number from the first 'Videos (<n>)' label, ignoring
    thousands separators.  Returns 0 when the label is absent.
    """
    # Raw string: '\(' and '\)' are regex escapes, not string escapes.
    label = re.search(r'Videos \(([0-9,]+)\)', page_data)
    if label is None:
        # No videos found.
        return 0

    return int(label.group(1).replace(',', ''))
1 change: 1 addition & 0 deletions scripts/supportedsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
"mastodon.social": "mastodon.social",
"mediawiki" : "MediaWiki",
"micmicidol" : "MIC MIC IDOL",
"motherless" : "Motherless",
"myhentaigallery": "My Hentai Gallery",
"myportfolio" : "Adobe Portfolio",
"naverwebtoon" : "NaverWebtoon",
Expand Down

0 comments on commit 9a792ab

Please sign in to comment.