From b7db41113576c4d36eb1ac36524b208a93faaf9b Mon Sep 17 00:00:00 2001
From: "guorong.zheng" <360996299@qq.com>
Date: Thu, 25 Apr 2024 18:28:39 +0800
Subject: [PATCH 1/2] feat: extend_base_urls

---
 config.py |  1 +
 main.py   | 13 ++++++++++---
 utils.py  | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/config.py b/config.py
index 4095839447c..d019abdf94c 100644
--- a/config.py
+++ b/config.py
@@ -24,3 +24,4 @@
 ipv_type = "ipv4"
 domain_blacklist = ["epg.pw"]
 url_keywords_blacklist = []
+extend_base_urls = []
diff --git a/main.py b/main.py
index 0be266c1a11..057a91437e5 100644
--- a/main.py
+++ b/main.py
@@ -14,13 +14,14 @@
     updateChannelUrlsTxt,
     updateFile,
     getUrlInfo,
-    compareSpeedAndResolution,
+    sortUrlsBySpeedAndResolution,
     getTotalUrls,
     checkUrlIPVType,
     checkByDomainBlacklist,
     checkByURLKeywordsBlacklist,
     filterUrlsByPatterns,
     useAccessibleUrl,
+    getChannelsByExtendBaseUrls,
 )
 import logging
 from logging.handlers import RotatingFileHandler
@@ -61,7 +62,11 @@ def __init__(self):
         self.driver = self.setup_driver()
 
     async def visitPage(self, channelItems):
-        total_channels = sum(len(channelObj) for _, channelObj in channelItems.items())
+        channelNames = [
+            name for _, channelObj in channelItems.items() for name in channelObj.keys()
+        ]
+        extendResults = await getChannelsByExtendBaseUrls(channelNames)
+        total_channels = len(channelNames)
         pbar = tqdm(total=total_channels)
         pageUrl = await useAccessibleUrl()
         for cate, channelObj in channelItems.items():
@@ -84,6 +89,8 @@ async def visitPage(self, channelItems):
                     config.favorite_page_num if isFavorite else config.default_page_num
                 )
                 infoList = []
+                for url in extendResults.get(name, []):
+                    infoList.append((url, None, None))
                 if pageUrl:
                     for page in range(1, pageNum + 1):
                         try:
@@ -118,7 +125,7 @@ async def visitPage(self, channelItems):
                     if not github_actions or (
                         pbar.n <= 200 and github_actions == "true"
                     ):
-                        sorted_data = await compareSpeedAndResolution(infoList)
+                        sorted_data = await sortUrlsBySpeedAndResolution(infoList)
                         if sorted_data:
                             channelUrls[name] = getTotalUrls(sorted_data)
                             for (url, date, resolution), response_time in sorted_data:
diff --git a/utils.py b/utils.py
index 93dca74d779..f99b007de8d 100644
--- a/utils.py
+++ b/utils.py
@@ -11,6 +11,7 @@
 import urllib.parse
 import ipaddress
 from urllib.parse import urlparse
+import requests
 
 
 def getChannelItems():
@@ -44,7 +45,11 @@ def getChannelItems():
             if match:
                 if match.group(1) not in channels[current_category]:
                     channels[current_category][match.group(1)] = [match.group(2)]
-                else:
+                elif (
+                    match.group(2)
+                    and match.group(2)
+                    not in channels[current_category][match.group(1)]
+                ):
                     channels[current_category][match.group(1)].append(
                         match.group(2)
                     )
@@ -53,6 +58,51 @@ def getChannelItems():
     f.close()
 
 
+async def getChannelsByExtendBaseUrls(channel_names):
+    """
+    Get the channels by extending the base urls
+    """
+    channels = {}
+    pattern = r"^(.*?),(?!#genre#)(.*?)$"
+    for base_url in config.extend_base_urls:
+        try:
+            print(f"Processing extend base url: {base_url}")
+            try:
+                response = requests.get(base_url, timeout=10)
+            except requests.exceptions.Timeout:
+                print(f"Timeout on {base_url}")
+                continue
+            content = response.text
+            if content:
+                for channel_name in channel_names:
+                    urls = []
+                    lines = content.split("\n")
+                    for line in lines:
+                        line = line.strip()
+                        match = re.search(pattern, line)
+                        url = match.group(2) if match else None
+                        if (
+                            match
+                            and match.group(1) == channel_name
+                            and url
+                            and url not in urls
+                            and checkUrlIPVType(url)
+                            and checkByDomainBlacklist(url)
+                            and checkByURLKeywordsBlacklist(url)
+                        ):
+                            urls.append(url)
+                    if urls:
+                        if channel_name in channels:
+                            channels[channel_name] += urls
+                        else:
+                            channels[channel_name] = urls
+        except Exception as e:
+            print(f"Error on {base_url}: {e}")
+            continue
+    print("Finished processing extend base urls")
+    return channels
+
+
 def updateChannelUrlsTxt(cate, channelUrls):
     """
     Update the category and channel urls to the final file
@@ -122,7 +172,7 @@ async def getSpeed(url, urlTimeout=5):
     return float("inf")
 
 
-async def compareSpeedAndResolution(infoList):
+async def sortUrlsBySpeedAndResolution(infoList):
     """
     Sort by speed and resolution
     """
From ae41c543f6f0e32f62bd645ed8d5fcdc366f12f1 Mon Sep 17 00:00:00 2001
From: "guorong.zheng" <360996299@qq.com>
Date: Fri, 26 Apr 2024 17:19:24 +0800
Subject: [PATCH 2/2] feat: extend_base_urls

---
 CHANGELOG.md        |  6 +++++
 README-EN.md        | 30 +++++++++++----------
 README.md           | 30 +++++++++++----------
 config.py           |  6 ++++-
 docs/tutorial-EN.md | 29 ++++++++++----------
 docs/tutorial.md    |  3 ++-
 main.py             | 59 ++++++++++++++++++++++------------------
 utils.py            | 65 ++++++++++++++++++++++++++++++---------------
 version.json        |  2 +-
 9 files changed, 137 insertions(+), 93 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4128f2761a4..d5ded66b450 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # 更新日志(Changelog)
 
+## v1.1.0
+
+### 2024/4/26
+
+- 新增自定义接口获取源,配置项为 extend_base_urls(#56)(Added custom sources for interface acquisition via the extend_base_urls configuration item (#56))
+
 ## v1.0.9
 
 ### 2024/4/25
diff --git a/README-EN.md b/README-EN.md
index 1cba0404c74..bc1260b755b 100644
--- a/README-EN.md
+++ b/README-EN.md
@@ -16,23 +16,25 @@ Customize channel menus and automatically obtain and update the latest live sour
 - Ensure update timeliness, configure to retrieve interfaces updated within a recent time range
 - Can filter ipv4, ipv6 interfaces
 - Blacklist feature: Interface domain and keywords
+- Customizable sources for interface acquisition
 
 ## Config
 
| Configuration Item | Default Value | Description |
| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| source_file | "demo.txt" | Template file name |
| final_file | "result.txt" | Generated file name |
| favorite_list | ["CCTV1","CCTV13"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) |
| favorite_page_num | 5 | Page retrieval quantity for favorite channels |
| default_page_num | 3 | Page retrieval quantity for regular channels |
| urls_limit | 10 | Number of interfaces per channel |
| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) |
| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) |
| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues |
| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" |
| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains |
| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters |
| extend_base_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",<br/>"https://m3u.ibert.me/txt/o_cn.txt",<br/>"https://m3u.ibert.me/txt/j_iptv.txt"] | Sources for interface acquisition; currently only a specific content format and fuzzy matching of some channel names are supported (see the sample below) |
"https://m3u.ibert.me/txt/j_iptv.txt"] | The source of interface acquisition, currently only compatible with specific content formats and fuzzy matching of some channel names | ## Quick Start diff --git a/README.md b/README.md index d48bbc67030..875cfeac64c 100644 --- a/README.md +++ b/README.md @@ -16,23 +16,25 @@ - 保证更新时效性,配置获取最近时间范围内更新的接口 - 可过滤 ipv4、ipv6 接口 - 黑名单功能:接口域名与关键字 +- 自定义接口获取源 ## 配置 -| 配置项 | 默认值 | 描述 | -| ---------------------- | ------------------ | ------------------------------------------------------------------ | -| source_file | "demo.txt" | 模板文件名称 | -| final_file | "result.txt" | 生成文件名称 | -| favorite_list | ["CCTV1","CCTV13"] | 关注频道名称列表(仅用于与常规频道区分,自定义获取分页数量) | -| favorite_page_num | 5 | 关注频道获取分页数量 | -| default_page_num | 3 | 常规频道获取分页数量 | -| urls_limit | 10 | 单个频道接口数量 | -| response_time_weight | 0.5 | 响应时间权重值(所有权重值总和应为 1) | -| resolution_weight | 0.5 | 分辨率权重值 (所有权重值总和应为 1) | -| recent_days | 30 | 获取最近时间范围内更新的接口(单位天),适当减小可避免出现匹配问题 | -| ipv_type | "ipv4" | 生成结果中接口的类型,可选值:"ipv4"、"ipv6"、"all" | -| domain_blacklist | ["epg.pw"] | 接口域名黑名单,用于过滤低质量含广告类域名的接口 | -| url_keywords_blacklist | [] | 接口关键字黑名单,用于过滤含特定字符的接口 | +| 配置项 | 默认值 | 描述 | +| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------ | +| source_file | "demo.txt" | 模板文件名称 | +| final_file | "result.txt" | 生成文件名称 | +| favorite_list | ["CCTV1","CCTV13"] | 关注频道名称列表(仅用于与常规频道区分,自定义获取分页数量) | +| favorite_page_num | 5 | 关注频道获取分页数量 | +| default_page_num | 3 | 常规频道获取分页数量 | +| urls_limit | 10 | 单个频道接口数量 | +| response_time_weight | 0.5 | 响应时间权重值(所有权重值总和应为 1) | +| resolution_weight | 0.5 | 分辨率权重值 (所有权重值总和应为 1) | +| recent_days | 30 | 获取最近时间范围内更新的接口(单位天),适当减小可避免出现匹配问题 | +| ipv_type | "ipv4" | 生成结果中接口的类型,可选值:"ipv4"、"ipv6"、"all" | +| domain_blacklist | ["epg.pw"] | 接口域名黑名单,用于过滤低质量含广告类域名的接口 | +| url_keywords_blacklist | [] | 接口关键字黑名单,用于过滤含特定字符的接口 | +| extend_base_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | 接口获取源,目前仅兼容特定内容格式与部分频道名称的模糊匹配 | ## 快速上手 diff --git a/config.py b/config.py index d019abdf94c..4b6f264e9fa 100644 --- a/config.py +++ b/config.py @@ -24,4 +24,8 @@ ipv_type = "ipv4" domain_blacklist = ["epg.pw"] url_keywords_blacklist = [] -extend_base_urls = [] +extend_base_urls = [ + "https://m3u.ibert.me/txt/fmml_dv6.txt", + "https://m3u.ibert.me/txt/o_cn.txt", + "https://m3u.ibert.me/txt/j_iptv.txt", +] diff --git a/docs/tutorial-EN.md b/docs/tutorial-EN.md index 80e5921c41e..b314d4c2b08 100644 --- a/docs/tutorial-EN.md +++ b/docs/tutorial-EN.md @@ -57,20 +57,21 @@ Similar to editing the template, modify the running configuration Adjust the configuration as needed. Below is the default configuration explanation: -| Configuration Item | Default Value | Description | -| ---------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------ | -| source_file | "demo.txt" | Template file name | -| final_file | "result.txt" | Generated file name | -| favorite_list | ["CCTV1","CCTV13"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) | -| favorite_page_num | 5 | Page retrieval quantity for favorite channels | -| default_page_num | 3 | Page retrieval quantity for regular channels | -| urls_limit | 10 | Number of interfaces per channel | -| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) | -| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) | -| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues | -| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" | -| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains | -| url_keywords_blacklist | [] | Interface keyword blacklist, used to filter out interfaces containing specific characters | +| Configuration Item | Default Value | Description | +| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| source_file | "demo.txt" | Template file name | +| final_file | "result.txt" | Generated file name | +| favorite_list | ["CCTV1","CCTV13"] | List of favorite channel names (used only to distinguish from regular channels, custom page retrieval quantity) | +| favorite_page_num | 5 | Page retrieval quantity for favorite channels | +| default_page_num | 3 | Page retrieval quantity for regular channels | +| urls_limit | 10 | Number of interfaces per channel | +| response_time_weight | 0.5 | Response time weight value (the sum of all weight values should be 1) | +| resolution_weight | 0.5 | Resolution weight value (the sum of all weight values should be 1) | +| recent_days | 30 | Retrieve interfaces updated within a recent time range (in days), reducing appropriately can avoid matching issues | +| ipv_type | "ipv4" | The type of interface in the generated result, optional values: "ipv4", "ipv6", "all" | +| domain_blacklist | ["epg.pw"] | Interface domain blacklist, used to filter out interfaces with low-quality, ad-inclusive domains | +| 
| extend_base_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",<br/>"https://m3u.ibert.me/txt/o_cn.txt",<br/>"https://m3u.ibert.me/txt/j_iptv.txt"] | Sources for interface acquisition; currently only a specific content format and fuzzy matching of some channel names are supported (see the sketch below) |
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | The source of interface acquisition, currently only compatible with specific content formats and fuzzy matching of some channel names | ## Step 4: Run Updates Locally (Recommended, Stable, Supports a large number of channel updates) diff --git a/docs/tutorial.md b/docs/tutorial.md index 2331aa20d70..86c4084fd8e 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -57,7 +57,7 @@ 按照您的需要适当调整配置,以下是默认配置说明 | 配置项 | 默认值 | 描述 | -| -------------------- | ------------------ | ------------------------------------------------------------------ | +| ---------------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------ | | source_file | "demo.txt" | 模板文件名称 | | final_file | "result.txt" | 生成文件名称 | | favorite_list | ["CCTV1","CCTV13"] | 关注频道名称列表(仅用于与常规频道区分,自定义获取分页数量) | @@ -70,6 +70,7 @@ | ipv_type | "ipv4" | 生成结果中接口的类型,可选值:"ipv4"、"ipv6"、"all" | | domain_blacklist | ["epg.pw"] | 接口域名黑名单,用于过滤低质量含广告类域名的接口 | | url_keywords_blacklist | [] | 接口关键字黑名单,用于过滤含特定字符的接口 | +| extend_base_urls | ["https://m3u.ibert.me/txt/fmml_dv6.txt",
"https://m3u.ibert.me/txt/o_cn.txt",
"https://m3u.ibert.me/txt/j_iptv.txt"] | 接口获取源,目前仅兼容特定内容格式与部分频道名称的模糊匹配 | ## 步骤四:本地运行更新(推荐,稳定,支持大量频道更新) diff --git a/main.py b/main.py index 057a91437e5..071b8d3da98 100644 --- a/main.py +++ b/main.py @@ -16,12 +16,10 @@ getUrlInfo, sortUrlsBySpeedAndResolution, getTotalUrls, - checkUrlIPVType, - checkByDomainBlacklist, - checkByURLKeywordsBlacklist, filterUrlsByPatterns, useAccessibleUrl, getChannelsByExtendBaseUrls, + checkUrlByPatterns, ) import logging from logging.handlers import RotatingFileHandler @@ -69,6 +67,7 @@ async def visitPage(self, channelItems): total_channels = len(channelNames) pbar = tqdm(total=total_channels) pageUrl = await useAccessibleUrl() + wait = WebDriverWait(self.driver, 10) for cate, channelObj in channelItems.items(): channelUrls = {} channelObjKeys = channelObj.keys() @@ -76,28 +75,41 @@ async def visitPage(self, channelItems): pbar.set_description( f"Processing {name}, {total_channels - pbar.n} channels remaining" ) - self.driver.get(pageUrl) - search_box = self.driver.find_element(By.XPATH, '//input[@type="text"]') - search_box.clear() - search_box.send_keys(name) - submit_button = self.driver.find_element( - By.XPATH, '//input[@type="submit"]' - ) - submit_button.click() - isFavorite = name in config.favorite_list - pageNum = ( - config.favorite_page_num if isFavorite else config.default_page_num - ) infoList = [] - for url in extendResults.get(name, []): - infoList.append((url, None, None)) + for url, date, resolution in extendResults.get(name, []): + if url and checkUrlByPatterns(url): + infoList.append((url, None, resolution)) if pageUrl: + self.driver.get(pageUrl) + search_box = wait.until( + EC.presence_of_element_located( + (By.XPATH, '//input[@type="text"]') + ) + ) + search_box.clear() + search_box.send_keys(name) + submit_button = wait.until( + EC.element_to_be_clickable( + (By.XPATH, '//input[@type="submit"]') + ) + ) + submit_button.click() + isFavorite = name in config.favorite_list + pageNum = ( + config.favorite_page_num + if isFavorite + else config.default_page_num + ) for page in range(1, pageNum + 1): try: if page > 1: - page_link = self.driver.find_element( - By.XPATH, - f'//a[contains(@href, "={page}") and contains(@href, "{name}")]', + page_link = wait.until( + EC.element_to_be_clickable( + ( + By.XPATH, + f'//a[contains(@href, "={page}") and contains(@href, "{name}")]', + ) + ) ) page_link.click() soup = BeautifulSoup(self.driver.page_source, "html.parser") @@ -107,12 +119,7 @@ async def visitPage(self, channelItems): for result in results: try: url, date, resolution = getUrlInfo(result) - if ( - url - and checkUrlIPVType(url) - and checkByDomainBlacklist(url) - and checkByURLKeywordsBlacklist(url) - ): + if url and checkUrlByPatterns(url): infoList.append((url, date, resolution)) except Exception as e: print(f"Error on result {result}: {e}") diff --git a/utils.py b/utils.py index f99b007de8d..22eb9dec24d 100644 --- a/utils.py +++ b/utils.py @@ -12,6 +12,7 @@ import ipaddress from urllib.parse import urlparse import requests +import re def getChannelItems(): @@ -42,7 +43,7 @@ def getChannelItems(): else: # This is a url, add it to the list of urls for the current channel. 
             match = re.search(pattern, line)
-            if match:
+            if match is not None:
                 if match.group(1) not in channels[current_category]:
                     channels[current_category][match.group(1)] = [match.group(2)]
                 elif (
@@ -64,38 +65,47 @@ async def getChannelsByExtendBaseUrls(channel_names):
     """
     channels = {}
     pattern = r"^(.*?),(?!#genre#)(.*?)$"
+    sub_pattern = r"_\((.*?)\)|_\[(.*?)\]|频道"
     for base_url in config.extend_base_urls:
         try:
             print(f"Processing extend base url: {base_url}")
             try:
-                response = requests.get(base_url, timeout=10)
+                response = requests.get(base_url, timeout=30)
             except requests.exceptions.Timeout:
                 print(f"Timeout on {base_url}")
                 continue
             content = response.text
             if content:
+                lines = content.split("\n")
+                link_dict = {}
+                for line in lines:
+                    if re.match(pattern, line) is not None:
+                        key = re.match(pattern, line).group(1)
+                        resolution_match = re.search(r"_(\((.*?)\))", key)
+                        resolution = (
+                            resolution_match.group(2)
+                            if resolution_match is not None
+                            else None
+                        )
+                        key = re.sub(sub_pattern, "", key).lower()
+                        url = re.match(pattern, line).group(2)
+                        value = (url, None, resolution)
+                        if key in link_dict:
+                            link_dict[key].append(value)
+                        else:
+                            link_dict[key] = [value]
+                found_channels = []
                 for channel_name in channel_names:
-                    urls = []
-                    lines = content.split("\n")
-                    for line in lines:
-                        line = line.strip()
-                        match = re.search(pattern, line)
-                        url = match.group(2) if match else None
-                        if (
-                            match
-                            and match.group(1) == channel_name
-                            and url
-                            and url not in urls
-                            and checkUrlIPVType(url)
-                            and checkByDomainBlacklist(url)
-                            and checkByURLKeywordsBlacklist(url)
-                        ):
-                            urls.append(url)
-                    if urls:
+                    sub_channel_name = re.sub(sub_pattern, "", channel_name).lower()
+                    values = link_dict.get(sub_channel_name)
+                    if values:
                         if channel_name in channels:
-                            channels[channel_name] += urls
+                            channels[channel_name] += values
                         else:
-                            channels[channel_name] = urls
+                            channels[channel_name] = values
+                        found_channels.append(channel_name)
+                if found_channels:
+                    print(f"{base_url} found channels: {','.join(found_channels)}")
         except Exception as e:
             print(f"Error on {base_url}: {e}")
             continue
@@ -139,7 +149,7 @@ def getUrlInfo(result):
         r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
         channel_text,
     )
-    if url_match:
+    if url_match is not None:
         url = url_match.group()
         info_text = result_div[-1].get_text(strip=True)
         if info_text:
@@ -299,6 +309,17 @@ def checkByURLKeywordsBlacklist(url):
     return not any(keyword in url for keyword in url_keywords_blacklist)
 
 
+def checkUrlByPatterns(url):
+    """
+    Check the url by patterns
+    """
+    return (
+        checkUrlIPVType(url)
+        and checkByDomainBlacklist(url)
+        and checkByURLKeywordsBlacklist(url)
+    )
+
+
 def filterUrlsByPatterns(urls):
     """
     Filter urls by patterns
diff --git a/version.json b/version.json
index f8696f844c4..07cd7643a22 100644
--- a/version.json
+++ b/version.json
@@ -1,3 +1,3 @@
 {
-    "version": "1.0.9"
+    "version": "1.1.0"
 }
\ No newline at end of file
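
A closing usage note, not part of the patches: after patch 2, `getChannelsByExtendBaseUrls` returns a dict of `{channel_name: [(url, date, resolution), ...]}`, with `date` always `None` for extend sources, and `main.py` filters each tuple through `checkUrlByPatterns` before adding it to `infoList`. Assuming the repository's `utils.py` and `config.py` (with `extend_base_urls` set as above) are importable, the helper can be driven standalone like this — the channel names here are made up:

```python
import asyncio

from utils import getChannelsByExtendBaseUrls


async def main():
    # Hypothetical template names; main.py builds this list from getChannelItems().
    channel_names = ["CCTV1", "CCTV13"]
    results = await getChannelsByExtendBaseUrls(channel_names)
    for name, values in results.items():
        for url, _date, resolution in values:
            print(name, url, resolution)


asyncio.run(main())
```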