Skip to content

Commit

Permalink
Merge pull request #164 from Benau/dev
Browse files Browse the repository at this point in the history
Rumble extractor improvements
  • Loading branch information
evermind-zz authored Sep 19, 2024
2 parents 64d2764 + 4794f57 commit 8eadaa1
Show file tree
Hide file tree
Showing 7 changed files with 433 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -218,4 +218,17 @@ public static synchronized Map<String, List<String>> getMinimalHeaders() {
}
return HEADERS;
}

/**
 * Extracts the embed video id from a Rumble watch page's HTML.
 *
 * <p>The id is taken from the embed URL found either in a {@code <script>} /
 * {@code <iframe>} {@code src} attribute or in an {@code "embedUrl"} JSON-LD
 * property, with the leading {@code 'v'} prefix stripped.</p>
 *
 * @param rb the raw HTML of a Rumble watch page
 * @return the embed video id without its {@code 'v'} prefix, or {@code null}
 *         when no embed URL is present
 */
public static String getEmbedVideoId(final String rb) {
    final String validUrl =
            "https?://(?:www\\.)?rumble\\.com/embed/(?:[0-9a-z]+\\.)?(?<id>[0-9a-z]+)";
    final String embedRegex =
            "(?:<(?:script|iframe)[^>]+\\bsrc=|[\"']embedUrl[\"']\\s*:\\s*)[\"'](?<url>"
                    + validUrl + ")";
    final Matcher matcher = Pattern.compile(embedRegex).matcher(rb);
    if (matcher.find()) {
        // Use the declared named group instead of the brittle numeric index;
        // the raw id carries a 'v' prefix (e.g. "v4abcd") which is removed.
        return matcher.group("id").substring(1);
    }
    return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
import org.schabi.newpipe.extractor.search.SearchExtractor;
import org.schabi.newpipe.extractor.services.rumble.extractors.RumbleChannelExtractor;
import org.schabi.newpipe.extractor.services.rumble.extractors.RumbleChannelTabExtractor;
import org.schabi.newpipe.extractor.services.rumble.extractors.RumbleCommentsExtractor;
import org.schabi.newpipe.extractor.services.rumble.extractors.RumbleSearchExtractor;
import org.schabi.newpipe.extractor.services.rumble.extractors.RumbleStreamExtractor;
import org.schabi.newpipe.extractor.services.rumble.extractors.RumbleTrendingExtractor;
import org.schabi.newpipe.extractor.services.rumble.linkHandler.RumbleChannelLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.rumble.linkHandler.RumbleChannelTabLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.rumble.linkHandler.RumbleCommentsLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.rumble.linkHandler.RumbleSearchQueryHandlerFactory;
import org.schabi.newpipe.extractor.services.rumble.linkHandler.RumbleStreamLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.rumble.linkHandler.RumbleTrendingLinkHandlerFactory;
Expand All @@ -36,7 +38,8 @@ public class RumbleService extends StreamingService {

/**
 * Creates the Rumble service with its supported media capabilities
 * (video, audio, comments and live streams).
 *
 * @param id the unique service id assigned by the service list
 */
public RumbleService(final int id) {
    // Note: the pasted diff kept the superseded capability line; only the
    // post-change list (including COMMENTS) is valid here.
    super(id, "Rumble", asList(ServiceInfo.MediaCapability.VIDEO,
            ServiceInfo.MediaCapability.AUDIO, ServiceInfo.MediaCapability.COMMENTS,
            ServiceInfo.MediaCapability.LIVE));
}

@Override
Expand Down Expand Up @@ -138,12 +141,12 @@ public SubscriptionExtractor getSubscriptionExtractor() {

@Override
public ListLinkHandlerFactory getCommentsLHFactory() {
    // Comments are supported; the stale "return null;" from the merged diff
    // would have made this line unreachable.
    return RumbleCommentsLinkHandlerFactory.getInstance();
}

@Override
public CommentsExtractor getCommentsExtractor(final ListLinkHandler urlIdHandler)
        throws ExtractionException {
    // Comments are supported; the stale "return null;" from the merged diff
    // would have made this line unreachable.
    return new RumbleCommentsExtractor(this, urlIdHandler);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
public class RumbleChannelExtractor extends ChannelExtractor {

private Document doc;
private Document about;

public RumbleChannelExtractor(final StreamingService service,
final ListLinkHandler linkHandler) {
Expand All @@ -39,6 +40,13 @@ public RumbleChannelExtractor(final StreamingService service,
/**
 * Fetches the channel page and, when available, its "about" page.
 *
 * <p>The channel description only lives on the "about" page, so it is fetched
 * eagerly here when the channel sub-header menu links to one. {@code about}
 * stays {@code null} otherwise (extractSafely with {@code false} presumably
 * returns {@code null} instead of throwing — TODO confirm).</p>
 */
public void onFetchPage(@Nonnull final Downloader downloader)
        throws IOException, ExtractionException {
    doc = Jsoup.parse(getDownloader().get(getUrl()).responseBody());
    final String aboutLink = RumbleParsingHelper.extractSafely(false,
            "",
            () -> doc.select("div.channel-subheader--menu a[href*='about']").first().attr("href")
    );
    if (aboutLink != null) {
        about = Jsoup.parse(
                getDownloader().get(getService().getBaseUrl() + aboutLink).responseBody());
    }
}

@Nonnull
Expand Down Expand Up @@ -119,7 +127,7 @@ public long getSubscriberCount() throws ParsingException {

final String viewCount = RumbleParsingHelper.extractSafely(true,
errorMsg,
() -> doc.select("span.channel-header--followers").first().text()
() -> doc.select("div.channel-header--title > span").first().text()
);

if (null != viewCount) {
Expand All @@ -135,7 +143,22 @@ public long getSubscriberCount() throws ParsingException {

/**
 * Returns the channel description, joined from the paragraphs of the
 * channel's "about" page, or an empty string when there is no "about" page.
 *
 * @throws ParsingException when the "about" page cannot be parsed
 */
@Override
public String getDescription() throws ParsingException {
    if (about == null) {
        return ""; // no "about" page, hence no description
    }
    try {
        final org.jsoup.select.Elements paragraphs =
                about.select("div.channel-about--description > p");
        final StringBuilder description = new StringBuilder();
        for (int i = 0; i < paragraphs.size(); i++) {
            description.append(paragraphs.get(i).text());
            if (i < paragraphs.size() - 1) {
                description.append("\n\n"); // blank line between paragraphs
            }
        }
        return description.toString();
    } catch (final Exception e) {
        // Attach the cause instead of flattening it into the message.
        throw new ParsingException("Could not get description", e);
    }
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
package org.schabi.newpipe.extractor.services.rumble.extractors;

import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;

import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.Page;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.comments.CommentsExtractor;
import org.schabi.newpipe.extractor.comments.CommentsInfoItem;
import org.schabi.newpipe.extractor.comments.CommentsInfoItemsCollector;
import org.schabi.newpipe.extractor.downloader.Downloader;
import org.schabi.newpipe.extractor.downloader.Response;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.services.rumble.extractor.RumbleCommentsInfoItemExtractor;
import org.schabi.newpipe.extractor.services.rumble.RumbleParsingHelper;
import org.schabi.newpipe.extractor.utils.JsonUtils;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.schabi.newpipe.extractor.NewPipe.getDownloader;

/**
 * Extracts comments for a Rumble video via the {@code service.php} comment.list
 * endpoint, which returns the rendered comment HTML plus the CSS that carries
 * the commenter avatars.
 */
public class RumbleCommentsExtractor extends CommentsExtractor {

    /** Number of top-level comments collected per page. */
    private static final int MAX_COMMENTS_PER_PAGE = 15;

    /**
     * Rumble ships commenter avatars as CSS rules of the form
     * {@code i.user-image--img--id-<hash> { background-image: url(<url>) }}.
     * Compiled once instead of on every response.
     */
    private static final Pattern AVATAR_CSS_PATTERN = Pattern.compile(
            "i\\.user-image--img--id-(\\w+)\\s*\\{\\s*background-image:\\s*url\\(([^)]+)\\)");

    /** Maps avatar CSS class names to image URLs; null until a page was loaded. */
    private Map<String, String> imageMap;

    /** Parsed comment list HTML; null when comments are unavailable/disabled. */
    private Document doc;

    public RumbleCommentsExtractor(
            final StreamingService service,
            final ListLinkHandler uiHandler) {
        super(service, uiHandler);
    }

    /**
     * Comments count as disabled when no comment list could be parsed from the
     * most recent service response.
     */
    public boolean isCommentsDisabled() throws ExtractionException {
        return doc == null;
    }

    @Nonnull
    @Override
    public InfoItemsPage<CommentsInfoItem> getInitialPage()
            throws IOException, ExtractionException {
        final Downloader downloader = NewPipe.getDownloader();
        // The comment endpoint is keyed by the embed video id, which is only
        // present in the watch page's HTML.
        final String id = RumbleParsingHelper
                .getEmbedVideoId(downloader.get(getUrl()).responseBody());
        final String url = "https://rumble.com/service.php?video=" + id + "&name=comment.list";
        // Explicit charset: the no-arg getBytes() depends on the platform default.
        final byte[] responseBody =
                downloader.get(url).responseBody().getBytes(StandardCharsets.UTF_8);
        // Page ids are space separated 1-based comment indices; "1" starts at
        // the first top-level comment.
        return getPage(new Page("1", responseBody));
    }

    @Override
    public InfoItemsPage<CommentsInfoItem> getPage(final Page page)
            throws IOException, ExtractionException {
        final byte[] responseBody = page.getBody();
        loadFromResponseBody(responseBody);
        if (isCommentsDisabled()) {
            return new InfoItemsPage<>(Collections.emptyList(), null, Collections.emptyList());
        }
        // The page "url" encodes the comment position as 1-based indices; the
        // last element is the index this page starts at.
        final int[] ids = stringToIntArray(page.getUrl());
        int startIndex = ids[ids.length - 1] - 1;
        final int count = startIndex + MAX_COMMENTS_PER_PAGE + 1;
        Element next = null;
        final CommentsInfoItemsCollector collector =
                new CommentsInfoItemsCollector(getServiceId());
        for (; startIndex < count; startIndex++) {
            ids[ids.length - 1] = startIndex + 1;
            next = getComments(ids).first();
            // Stop at the end of the list, or once a full page was collected;
            // in the latter case "ids" already points at the next page's start.
            if (next == null || startIndex == count - 1) {
                break;
            }
            collector.commit(new RumbleCommentsInfoItemExtractor(this, ids, responseBody));
        }
        return new InfoItemsPage<>(collector, next != null
                ? new Page(intArrayToString(ids), responseBody) : null);
    }

    @Override
    public void onFetchPage(@Nonnull final Downloader downloader)
            throws IOException, ExtractionException {
        // Nothing to do: the comment service response is fetched lazily in
        // getInitialPage()/getPage().
    }

    /**
     * Selects the comment element(s) at the given position.
     *
     * @param id path of 1-based child indices, one per nesting level; a 0 entry
     *           selects all comments on that level
     * @return the matching elements, or null when no page has been loaded
     */
    public Elements getComments(final int[] id) {
        if (doc == null) {
            return null;
        }
        final StringBuilder selection = new StringBuilder();
        int level = 1;
        for (final int index : id) {
            if (level != 1) {
                selection.append(" > div.comment-replies > ");
            }
            selection.append("ul.comments-").append(level++).append(" > li.comment-item");
            if (index != 0) {
                selection.append(":nth-child(").append(index).append(")");
            }
        }
        return doc.select(selection.toString());
    }

    /**
     * Resolves the avatar image URL of a comment element.
     *
     * @param e the comment element
     * @return the avatar URL, or null when the element carries no avatar class
     *         or no image map has been parsed yet
     */
    public String getImage(final Element e) {
        final Element element = e.selectFirst("i.user-image");
        if (element == null || imageMap == null) {
            return null;
        }
        for (final String name : element.className().split(" ")) {
            if (name.startsWith("user-image--img--id-") && imageMap.containsKey(name)) {
                return imageMap.get(name);
            }
        }
        return null;
    }

    /** Joins an index path back into the space separated page id format. */
    public static String intArrayToString(final int[] intArray) {
        final StringBuilder sb = new StringBuilder();
        for (int i = 0; i < intArray.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(intArray[i]);
        }
        return sb.toString();
    }

    /** Parses a space separated page id into its index path. */
    private static int[] stringToIntArray(final String str) {
        final String[] parts = str.split(" ");
        final int[] result = new int[parts.length];
        for (int i = 0; i < parts.length; i++) {
            result[i] = Integer.parseInt(parts[i]);
        }
        return result;
    }

    /** Builds {@code imageMap} from the avatar CSS shipped with the response. */
    private void initImageMap(final String css) {
        imageMap = new HashMap<>();
        final Matcher matcher = AVATAR_CSS_PATTERN.matcher(css);
        while (matcher.find()) {
            imageMap.put("user-image--img--id-" + matcher.group(1), matcher.group(2));
        }
    }

    /**
     * Parses the comment.list service response and populates {@code doc} and
     * {@code imageMap}. Leaves {@code doc} null when the response contains no
     * comment list, which marks comments as disabled.
     *
     * @throws ExtractionException when the response body is not valid JSON
     */
    private void loadFromResponseBody(final byte[] responseBody) throws ExtractionException {
        if (responseBody == null) {
            return;
        }
        try {
            // Explicit charset: new String(byte[]) uses the platform default.
            final JsonObject info = JsonParser.object()
                    .from(new String(responseBody, StandardCharsets.UTF_8));
            if (info.has("html") && info.has("css_libs")) {
                doc = Jsoup.parse(info.get("html").toString());
                if (doc.selectFirst("ul.comments-1") == null) {
                    // No top-level comment list -> treat comments as disabled.
                    doc = null;
                    return;
                }
                // Drop the "create a comment" form so it is not mistaken for a
                // comment item; Jsoup's select() never returns null.
                doc.select("li.comment-item.comment-item.comments-create").remove();
                initImageMap(info.get("css_libs").toString());
            }
        } catch (final JsonParserException e) {
            // Keep the cause attached instead of printStackTrace() plus a
            // cause-less rethrow.
            throw new ExtractionException("Could not read json from: " + getUrl(), e);
        }
    }
}
Loading

0 comments on commit 8eadaa1

Please sign in to comment.