From 566218b385afd060e5c2d97d66840086b9a409a6 Mon Sep 17 00:00:00 2001
From: "guorong.zheng" <360996299@qq.com>
Date: Tue, 27 Feb 2024 16:53:49 +0800
Subject: [PATCH] feat: get more pages

---
 README.md |   8 ++-
 main.py   | 151 +++++++++++++++++++++++++++++------------------------
 2 files changed, 84 insertions(+), 75 deletions(-)

diff --git a/README.md b/README.md
index 03bf0664933..ac85c76ba68 100644
--- a/README.md
+++ b/README.md
@@ -6,14 +6,12 @@

 - Source validation: filters out invalid sources; sort order: date, speed, resolution
 - Scheduled runs: an update is performed every 12 hours
-- Configurable list of featured channels
-- Can fetch more live-stream links, with a configurable count
-- Featured channels keep the template sources by default; sources can be updated into the template manually
-- Paginated source updates (todo)
+- Configurable list of featured channels, with a separate setting for how many result pages to fetch
+- Paginated result fetching (page count configurable)

 ## Usage

 1. Fork this project and grant the Actions workflow read/write permissions
 2. Edit demo.txt; subsequent updates are based on the contents of this file
-3. Edit main.py (optional): importantList (featured channels), importantUrlsNum (number of sources to update for featured channels, default 20), filter_invalid_url (whether to filter out invalid sources, enabled by default)
+3. Edit main.py (optional): importantList (list of featured channel names), importantPageNum (number of result pages fetched for featured channels, default 10), defaultPageNum (number of result pages fetched for regular channels, default 5), filter_invalid_url (whether to filter out invalid sources, enabled by default)
 4. result.txt is the updated live-source file; source.json is the data-source file

diff --git a/main.py b/main.py
index 48d9dfe1923..7f37ecec64a 100644
--- a/main.py
+++ b/main.py
@@ -1,16 +1,13 @@
-import selenium
 from selenium import webdriver
-import time
 from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import time
 import os
 import re
-import requests
 from selenium_stealth import stealth
-import concurrent.futures
-from concurrent.futures import ThreadPoolExecutor
 import aiohttp
 import asyncio
-from selenium.common.exceptions import NoSuchElementException
 from bs4 import BeautifulSoup
 from datetime import datetime
 import re
@@ -34,7 +31,8 @@ class GetSource:
         "湖南卫视",
         "翡翠台",
     ]
-    importantUrlsNum = 20
+    importantPageNum = 10
+    defaultPageNum = 5
     filter_invalid_url = True

     def __init__(self):
@@ -134,75 +132,88 @@ def outputTxt(self, cate, channelUrls):
                 f.write("\n")

     async def visitPage(self, channelItems):
-        self.driver.get("https://www.foodieguide.com/iptvsearch/")
         self.removeFile()
         for cate, channelObj in channelItems.items():
             channelUrls = {}
             for name in channelObj.keys():
-                try:
-                    element = self.driver.find_element(By.ID, "search")
-                    element.clear()
-                    element.send_keys(name)
-                    self.driver.find_element(By.ID, "form1").find_element(
-                        By.NAME, "Submit"
-                    ).click()
-                except NoSuchElementException:
-                    print(f"Element not found when searching for {name}")
-                    continue
-                infoList = []
                 isImportant = name in self.importantList
-                useNum = self.importantUrlsNum if isImportant else 10
-                soup = BeautifulSoup(self.driver.page_source, "html.parser")
-                tables_div = soup.find("div", class_="tables")
-                results = (
-                    tables_div.find_all("div", class_="result") if tables_div else []
-                )
-                for result in results[:useNum]:
-                    m3u8_div = result.find("div", class_="m3u8")
-                    url = m3u8_div.text.strip() if m3u8_div else None
-                    info_div = m3u8_div.find_next_sibling("div") if m3u8_div else None
-                    date = resolution = None
-                    if info_div:
-                        info_text = info_div.text.strip()
-                        date, resolution = (
-                            (
-                                info_text.partition(" ")[0]
-                                if info_text.partition(" ")[0]
-                                else None
-                            ),
-                            (
-                                info_text.partition(" ")[2].partition("•")[2]
-                                if info_text.partition(" ")[2].partition("•")[2]
-                                else None
-                            ),
+                pageNum = self.importantPageNum if isImportant else self.defaultPageNum
+                infoList = []
+                for page in range(1, pageNum + 1):
+                    try:
+                        page_url = f"https://www.foodieguide.com/iptvsearch/?page={page}&s={name}"
+                        self.driver.get(page_url)
+                        WebDriverWait(self.driver, 10).until(
+                            EC.presence_of_element_located(
+                                (By.CSS_SELECTOR, "div.tables")
+                            )
                         )
-                    infoList.append((url, date, resolution))
-                infoList.sort(
-                    key=lambda x: (
-                        x[1] is not None,
-                        datetime.strptime(x[1], "%m-%d-%Y") if x[1] else None,
-                    ),
-                    reverse=True,
-                )  # Sort by date
-                infoList = await self.compareSpeed(infoList)  # Sort by speed
-
-                def extract_resolution(resolution_str):
-                    numbers = re.findall(r"\d+x\d+", resolution_str)
-                    if numbers:
-                        width, height = map(int, numbers[0].split("x"))
-                        return width * height
-                    else:
-                        return 0
-
-                infoList.sort(
-                    key=lambda x: (
-                        x[2] is not None,
-                        extract_resolution(x[2]) if x[2] else 0,
-                    ),
-                    reverse=True,
-                )  # Sort by resolution
-                urls = list(dict.fromkeys(url for url, _, _ in infoList))
-                channelUrls[name] = urls
+                        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+                        tables_div = soup.find("div", class_="tables")
+                        results = (
+                            tables_div.find_all("div", class_="result")
+                            if tables_div
+                            else []
+                        )
+                        if not any(
+                            result.find("div", class_="m3u8") for result in results
+                        ):
+                            break
+                        for result in results:
+                            m3u8_div = result.find("div", class_="m3u8")
+                            url = m3u8_div.text.strip() if m3u8_div else None
+                            info_div = (
+                                m3u8_div.find_next_sibling("div") if m3u8_div else None
+                            )
+                            date = resolution = None
+                            if info_div:
+                                info_text = info_div.text.strip()
+                                date, resolution = (
+                                    (
+                                        info_text.partition(" ")[0]
+                                        if info_text.partition(" ")[0]
+                                        else None
+                                    ),
+                                    (
+                                        info_text.partition(" ")[2].partition("•")[2]
+                                        if info_text.partition(" ")[2].partition("•")[2]
+                                        else None
+                                    ),
+                                )
+                            infoList.append((url, date, resolution))
+                    except Exception as e:
+                        print(f"Error on page {page}: {e}")
+                        continue
+                try:
+                    infoList.sort(
+                        key=lambda x: (
+                            x[1] is not None,
+                            datetime.strptime(x[1], "%m-%d-%Y") if x[1] else None,
+                        ),
+                        reverse=True,
+                    )  # Sort by date
+                    infoList = await self.compareSpeed(infoList)  # Sort by speed
+
+                    def extract_resolution(resolution_str):
+                        numbers = re.findall(r"\d+x\d+", resolution_str)
+                        if numbers:
+                            width, height = map(int, numbers[0].split("x"))
+                            return width * height
+                        else:
+                            return 0
+
+                    infoList.sort(
+                        key=lambda x: (
+                            x[2] is not None,
+                            extract_resolution(x[2]) if x[2] else 0,
+                        ),
+                        reverse=True,
+                    )  # Sort by resolution
+                    urls = list(dict.fromkeys(url for url, _, _ in infoList))
+                    channelUrls[name] = urls
+                except Exception as e:
+                    print(f"Error on sorting: {e}")
+                    continue
             self.outputTxt(cate, channelUrls)
             await asyncio.sleep(1)

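Below is a minimal standalone sketch of the paging logic this patch introduces, for trying it outside the GetSource class. It is not part of the patch: fetch_pages and the "CCTV-1" example are illustrative names only, and it assumes the same markup the patch targets (div.tables / div.result / div.m3u8), the same ?page=N&s=NAME query format, and a plain local chromedriver without the selenium-stealth setup main.py uses.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup


def fetch_pages(name, page_num=5):
    """Collect (url, info_text) tuples for a channel name across up to page_num result pages."""
    driver = webdriver.Chrome()  # assumes chromedriver on PATH; the patch configures stealth options separately
    collected = []
    try:
        for page in range(1, page_num + 1):
            # Same query-string format the patch uses for direct page loads.
            driver.get(f"https://www.foodieguide.com/iptvsearch/?page={page}&s={name}")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.tables"))
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")
            tables_div = soup.find("div", class_="tables")
            results = tables_div.find_all("div", class_="result") if tables_div else []
            m3u8_divs = [result.find("div", class_="m3u8") for result in results]
            if not any(m3u8_divs):
                break  # a page with no m3u8 entries means we walked past the last result page
            for m3u8_div in m3u8_divs:
                if m3u8_div:
                    info_div = m3u8_div.find_next_sibling("div")
                    collected.append(
                        (m3u8_div.text.strip(), info_div.text.strip() if info_div else None)
                    )
    finally:
        driver.quit()
    return collected


if __name__ == "__main__":
    for url, info in fetch_pages("CCTV-1", page_num=2):
        print(url, info)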