From 5c8646117efb8644de0e530f87e45699fd354025 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 11:17:25 +0800 Subject: [PATCH 01/17] fix tab.findall --- ichrome/async_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ichrome/async_utils.py b/ichrome/async_utils.py index a735ae6..780b24b 100644 --- a/ichrome/async_utils.py +++ b/ichrome/async_utils.py @@ -1085,7 +1085,7 @@ async def findall(self, Demo:: - # no group + # no group / (?:) / (?<=) / (?!) print(await tab.findall('.*?')) # ['123456789'] @@ -1108,12 +1108,14 @@ async def findall(self, :param timeout: defaults to NotSet :type timeout: [type], optional """ - group_count = len(re.findall(r'(? { if (group_count <= 1) { result.push(item[group_count]) @@ -1126,7 +1128,7 @@ async def findall(self, } }) JSON.stringify(result) -''' % (group_count, regex, flags, cssselector, attribute) +''' % (group_count, cssselector, attribute, act, regex, flags) result = await self.js(code, value_path='result.result.value', timeout=timeout) From fd31c57a5735b66a1102c21e43129b78912ed06d Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 11:20:27 +0800 Subject: [PATCH 02/17] handle jsonerror for querySelectorAll --- ichrome/async_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ichrome/async_utils.py b/ichrome/async_utils.py index 780b24b..dbb50b9 100644 --- a/ichrome/async_utils.py +++ b/ichrome/async_utils.py @@ -1263,10 +1263,15 @@ async def querySelectorAll(self, ) response = None try: - response_items_str = (await self.js( - javascript, timeout=timeout, - value_path='result.result.value')) or '' - items = json.loads(response_items_str) + response_items_str = (await + self.js(javascript, + timeout=timeout, + value_path='result.result.value')) + try: + items = json.loads( + response_items_str) if response_items_str else [] + except (json.JSONDecodeError, ValueError): + items = [] result = [Tag(**kws) for kws in items] if 
isinstance(index, int): if result: From fa7204d2d21bcd5b17743000b4ad742efa98e5d6 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 11:33:02 +0800 Subject: [PATCH 03/17] warning root user without --no-sandbox --- ichrome/daemon.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ichrome/daemon.py b/ichrome/daemon.py index b39bd59..4a81e63 100644 --- a/ichrome/daemon.py +++ b/ichrome/daemon.py @@ -119,6 +119,15 @@ def __init__( if extra_config and isinstance(extra_config, str): extra_config = [extra_config] self.extra_config = extra_config or ["--disable-gpu", "--no-first-run"] + if '--no-sandbox' not in str(self.extra_config): + import getpass + if getpass.getuser() == 'root': + if extra_config: + self.extra_config.append('--no-sandbox') + else: + logger.warning( + 'root user without "--no-sandbox" may launch fail for: "Running as root without --no-sandbox is not supported."' + ) if not isinstance(self.extra_config, list): raise TypeError("extra_config type should be list.") self.chrome_proc_start_time = time.time() From 82f973df30ae21587eaa6cabf449fe41756f67d1 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 12:59:21 +0800 Subject: [PATCH 04/17] refactor command line APIs --- README.md | 27 ++++++++++++++++----------- ichrome/__main__.py | 43 +++++++++++++++++++++++++++++++++++++------ ichrome/daemon.py | 13 +++++++------ 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 1a7a1b3..ba39c92 100644 --- a/README.md +++ b/README.md @@ -293,8 +293,10 @@ Other operations: optional arguments: -h, --help show this help message and exit - -V, --version ichrome version info - -c CHROME_PATH, --chrome_path CHROME_PATH + -v, -V, --version ichrome version info + -c CONFIG, --config CONFIG + load config dict from JSON file of given path + -cp CHROME_PATH, --chrome-path CHROME_PATH, --chrome_path CHROME_PATH chrome executable file path, default to null for automatic searching --host HOST 
--remote-debugging-address, default to 127.0.0.1 @@ -302,31 +304,34 @@ optional arguments: --headless --headless and --hide-scrollbars, default to False -s SHUTDOWN, --shutdown SHUTDOWN shutdown the given port, only for local running chrome - --user_agent USER_AGENT + -A USER_AGENT, --user-agent USER_AGENT, --user_agent USER_AGENT --user-agen, default to 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36' - --proxy PROXY --proxy-server, default to None - --user_data_dir USER_DATA_DIR + -x PROXY, --proxy PROXY + --proxy-server, default to None + -U USER_DATA_DIR, --user-data-dir USER_DATA_DIR, --user_data_dir USER_DATA_DIR user_data_dir to save the user data, default to ~/ichrome_user_data - --disable_image disable image for loading performance, default to + --disable-image, --disable_image + disable image for loading performance, default to False - --start_url START_URL + -url START_URL, --start-url START_URL, --start_url START_URL start url while launching chrome, default to about:blank - --max_deaths MAX_DEATHS + --max-deaths MAX_DEATHS, --max_deaths MAX_DEATHS max deaths in 5 secs, auto restart `max_deaths` times if crash fast in 5 secs. 
default to 1 for without auto-restart --timeout TIMEOUT timeout to connect the remote server, default to 1 for localhost - --workers WORKERS the number of worker processes with auto-increment + -w WORKERS, --workers WORKERS + the number of worker processes with auto-increment port, default to 1 - --proc_check_interval PROC_CHECK_INTERVAL + --proc-check-interval PROC_CHECK_INTERVAL, --proc_check_interval PROC_CHECK_INTERVAL check chrome process alive every interval seconds --crawl crawl the given URL, output the HTML DOM - --clean, --clear clean user_data_dir + -C, --clear, --clear clean user_data_dir --doc show ChromeDaemon.__doc__ --debug set logger level to DEBUG ``` diff --git a/ichrome/__main__.py b/ichrome/__main__.py index b045590..e4f7e25 100644 --- a/ichrome/__main__.py +++ b/ichrome/__main__.py @@ -28,12 +28,18 @@ def main(): python -m ichrome --crawl --headless --timeout=2 http://myip.ipip.net/ ''' parser = argparse.ArgumentParser(usage=usage) - parser.add_argument("-V", + parser.add_argument("-v", + "-V", "--version", help="ichrome version info", action="store_true") + parser.add_argument("-c", + "--config", + help="load config dict from JSON file of given path", + default="") parser.add_argument( - "-c", + "-cp", + "--chrome-path", "--chrome_path", help= "chrome executable file path, default to null for automatic searching", @@ -57,27 +63,36 @@ def main(): help="shutdown the given port, only for local running chrome", type=int) parser.add_argument( + "-A", + "--user-agent", "--user_agent", help= "--user-agen, default to 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'", default="") - parser.add_argument("--proxy", + parser.add_argument("-x", + "--proxy", help="--proxy-server, default to None", default="") parser.add_argument( + "-U", + "--user-data-dir", "--user_data_dir", help= "user_data_dir to save the user data, default to ~/ichrome_user_data", default=Path.home() / 'ichrome_user_data') 
parser.add_argument( + "--disable-image", "--disable_image", help="disable image for loading performance, default to False", action="store_true") parser.add_argument( + "-url", + "--start-url", "--start_url", help="start url while launching chrome, default to about:blank", default="about:blank") parser.add_argument( + "--max-deaths", "--max_deaths", help= "max deaths in 5 secs, auto restart `max_deaths` times if crash fast in 5 secs. default to 1 for without auto-restart", @@ -89,12 +104,14 @@ def main(): default=1, type=int) parser.add_argument( + "-w", "--workers", help= "the number of worker processes with auto-increment port, default to 1", default=1, type=int) parser.add_argument( + "--proc-check-interval", "--proc_check_interval", dest='proc_check_interval', help="check chrome process alive every interval seconds", @@ -104,7 +121,8 @@ def main(): help="crawl the given URL, output the HTML DOM", default=False, action="store_true") - parser.add_argument("--clean", + parser.add_argument("-C", + "--clear", "--clear", dest='clean', help="clean user_data_dir", @@ -124,6 +142,18 @@ def main(): if args.version: print(__version__) return + if args.config: + path = Path(args.config) + if not (path.is_file() and path.exists()): + logger.error(f'config file not found: {path}') + return + import json + kwargs = json.loads(path.read_text()) + start_port = kwargs.pop('port', 9222) + workers = kwargs.pop('workers', 1) + asyncio.run( + ChromeWorkers.run_chrome_workers(start_port, workers, kwargs)) + return if args.shutdown: logger.setLevel(1) ChromeDaemon.clear_chrome_process(args.shutdown, @@ -161,13 +191,14 @@ def main(): kwargs['start_url'] = config kwargs['extra_config'].remove(config) break - args.port = getattr(args, 'port', 9222) if '--dump-dom' in extra_config or args.crawl: logger.setLevel(60) from .debugger import crawl_once asyncio.run(crawl_once(**kwargs)) else: - asyncio.run(ChromeWorkers.run_chrome_workers(args, kwargs)) + start_port = getattr(args, 'port', 
9222) + asyncio.run( + ChromeWorkers.run_chrome_workers(start_port, args.workers, kwargs)) if __name__ == "__main__": diff --git a/ichrome/daemon.py b/ichrome/daemon.py index 4a81e63..513ab30 100644 --- a/ichrome/daemon.py +++ b/ichrome/daemon.py @@ -660,16 +660,17 @@ async def __aexit__(self, *args, **kwargs): class ChromeWorkers: - def __init__(self, args, kwargs): - self.args = args - self.kwargs = kwargs + def __init__(self, start_port=9222, workers=1, kwargs=None): + self.start_port = start_port or 9222 + self.workers = workers or 1 + self.kwargs = kwargs or {} self.daemons = [] async def __aenter__(self): return await self.create_chrome_workers() async def create_chrome_workers(self): - for port in range(self.args.port, self.args.port + self.args.workers): + for port in range(self.start_port, self.start_port + self.workers): logger.info("ChromeDaemon cmd args: port=%s, %s" % (port, self.kwargs)) self.daemons.append(await @@ -684,6 +685,6 @@ async def __aexit__(self, *args): await daemon.__aexit__() @classmethod - async def run_chrome_workers(cls, args, kwargs): - async with cls(args, kwargs): + async def run_chrome_workers(cls, start_port, workers, kwargs): + async with cls(start_port, workers, kwargs): pass From 6198f48dec7d481356727c49ba653b9c13e17ba9 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 14:52:01 +0800 Subject: [PATCH 05/17] use shutil.rmtree to clear dir --- ichrome/base.py | 12 +++++-- ichrome/daemon.py | 86 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 72 insertions(+), 26 deletions(-) diff --git a/ichrome/base.py b/ichrome/base.py index e2c14ef..49e2535 100644 --- a/ichrome/base.py +++ b/ichrome/base.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- import re import time +from pathlib import Path from typing import List import psutil +from torequests.utils import get_readable_size from .logs import logger - - """ For base usage with sync utils. 
""" @@ -131,3 +131,11 @@ def clear_chrome_process(port=None, timeout=None, max_deaths=1, interval=0.5): time.sleep(interval) continue return + + +def get_dir_size(path): + return sum(f.stat().st_size for f in Path(path).glob("**/*") if f.is_file()) + + +def get_readable_dir_size(path): + return get_readable_size(get_dir_size(path), rounded=1) diff --git a/ichrome/daemon.py b/ichrome/daemon.py index 513ab30..07299df 100644 --- a/ichrome/daemon.py +++ b/ichrome/daemon.py @@ -10,9 +10,9 @@ from torequests import tPool from torequests.aiohttp_dummy import Requests -from torequests.utils import get_readable_size, timepass, ttime +from torequests.utils import timepass, ttime -from .base import clear_chrome_process, get_memory_by_port, get_proc +from .base import clear_chrome_process, get_memory_by_port, get_proc, get_dir_size, get_readable_dir_size from .logs import logger """ Sync / block operations for launching chrome processes. @@ -114,7 +114,11 @@ def __init__( self.headless = headless self.proxy = proxy self.disable_image = disable_image - self._wrap_user_data_dir(user_data_dir) + if '--user-data-dir=' in str(extra_config): + # ignore custom user_data_dir by ichrome + self.user_data_dir = None + else: + self._wrap_user_data_dir(user_data_dir) self.start_url = start_url if extra_config and isinstance(extra_config, str): extra_config = [extra_config] @@ -167,48 +171,83 @@ def ensure_dir(path: Path): @staticmethod def get_dir_size(path): - return sum(f.stat().st_size for f in path.glob("**/*") if f.is_file()) + return get_dir_size(path) - def _wrap_user_data_dir(self, user_data_dir): - """refactor this function to set accurate dir.""" + @staticmethod + def get_readable_dir_size(path): + return get_readable_dir_size(path) + + @classmethod + def _ensure_user_dir(cls, user_data_dir): if user_data_dir is None: - user_data_dir = Path.home() / 'ichrome_user_data' - elif user_data_dir in self.IGNORE_USER_DIR_FLAGS: - self.user_data_dir = None + # use default path + return 
Path.home() / 'ichrome_user_data' + elif user_data_dir in cls.IGNORE_USER_DIR_FLAGS: + # ignore custom path settings logger.debug( 'Ignore custom user_data_dir, using default user set by system.' ) - return + return None else: - user_data_dir = Path(user_data_dir) - self.user_data_dir = user_data_dir / f"chrome_{self.port}" + # valid path string + return Path(user_data_dir) + + def _wrap_user_data_dir(self, user_data_dir): + main_user_dir = self._ensure_user_dir(user_data_dir) + if main_user_dir is None: + return + port_user_dir = main_user_dir / f"chrome_{self.port}" + self.user_data_dir = port_user_dir if not self.user_data_dir.is_dir(): logger.warning( f"creating user data dir at [{os.path.realpath(self.user_data_dir)}]." ) self.ensure_dir(self.user_data_dir) - port_dir_size = get_readable_size(self.get_dir_size(self.user_data_dir), - rounded=1) - total_dir_size = get_readable_size(self.get_dir_size(user_data_dir), - rounded=1) + port_dir_size = get_readable_dir_size(port_user_dir) + total_dir_size = get_readable_dir_size(main_user_dir) logger.info( f'user_data_dir({self.user_data_dir}) size: {port_dir_size} / {total_dir_size}' ) @classmethod def clear_user_dir(cls, user_data_dir, port=None): - return cls.clear_dir(user_data_dir, port=port) + main_user_dir = cls._ensure_user_dir(user_data_dir) + if port: + port_user_dir = main_user_dir / f"chrome_{port}" + logger.info( + f'Clearing only port dir: {port_user_dir} => {get_readable_dir_size(port_user_dir)} / {get_readable_dir_size(main_user_dir)}' + ) + cls.clear_dir_with_shutil(port_user_dir) + logger.info( + f'Cleared only port dir: {port_user_dir} => {get_readable_dir_size(port_user_dir)} / {get_readable_dir_size(main_user_dir)}' + ) + else: + logger.info( + f'Clearing total user dir: {main_user_dir} => {get_readable_dir_size(main_user_dir)} / {get_readable_dir_size(main_user_dir)}' + ) + cls.clear_dir_with_shutil(main_user_dir) + logger.info( + f'Cleared total user dir: {main_user_dir} => 
{get_readable_dir_size(main_user_dir)} / {get_readable_dir_size(main_user_dir)}' + ) + + @staticmethod + def clear_dir_with_shutil(dir_path): + dir_path = Path(dir_path) + if not dir_path.is_dir(): + logger.warning(f'{dir_path} is not exists, ignore.') + return + import shutil + shutil.rmtree(dir_path) @classmethod - def clear_dir(cls, dir_path, port=None): + def clear_dir(cls, dir_path): dir_path = Path(dir_path) - if port: - dir_path = dir_path / f'chrome_{port}' - logger.info(f'Clear dir: {dir_path}.') if not dir_path.is_dir(): - logger.info(f'Dir is not exist: {dir_path}.') + logger.warning(f'{dir_path} not exists, ignore.') + return + if not dir_path.is_dir(): + logger.info(f'{dir_path} is not exist:.') return True - logger.info(f'Cleaning {dir_path}...') for f in dir_path.iterdir(): if f.is_dir(): cls.clear_dir(f) @@ -216,7 +255,6 @@ def clear_dir(cls, dir_path, port=None): f.unlink() logger.info(f'File removed: {f}') dir_path.rmdir() - logger.info(f'Folder removed: {dir_path}') @property def ok(self): From 1f21baf86f38ab1be64b5879db63cb7d5b3b8cd1 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 15:08:35 +0800 Subject: [PATCH 06/17] killall cmd --- ichrome/__main__.py | 23 ++++++++++++++++++++--- ichrome/base.py | 6 +++--- ichrome/daemon.py | 19 ++++++++----------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/ichrome/__main__.py b/ichrome/__main__.py index e4f7e25..1efa8a7 100644 --- a/ichrome/__main__.py +++ b/ichrome/__main__.py @@ -13,15 +13,20 @@ def main(): All the unknown args will be appended to extra_config as chrome original args. 
Demo: - > python -m ichrome --host=127.0.0.1 --window-size=1212,1212 --incognito - > ChromeDaemon cmd args: {'daemon': True, 'block': True, 'chrome_path': '', 'host': '127.0.0.1', 'port': 9222, 'headless': False, 'user_agent': '', 'proxy': '', 'user_data_dir': None, 'disable_image': False, 'start_url': 'about:blank', 'extra_config': ['--window-size=1212,1212', '--incognito'], 'max_deaths': 1, 'timeout': 2} + > python -m ichrome -H 127.0.0.1 -p 9222 --window-size=1212,1212 --incognito + > ChromeDaemon cmd args: port=9222, {'chrome_path': '', 'host': '127.0.0.1', 'headless': False, 'user_agent': '', 'proxy': '', 'user_data_dir': WindowsPath('C:/Users/root/ichrome_user_data'), 'disable_image': False, 'start_url': 'about:blank', 'extra_config': ['--window-size=1212,1212', '--incognito'], 'max_deaths': 1, 'timeout':1, 'proc_check_interval': 5, 'debug': False} + + > python -m ichrome + > ChromeDaemon cmd args: port=9222, {'chrome_path': '', 'host': '127.0.0.1', 'headless': False, 'user_agent': '', 'proxy': '', 'user_data_dir': WindowsPath('C:/Users/root/ichrome_user_data'), 'disable_image': False, 'start_url': 'about:blank', 'extra_config': [], 'max_deaths': 1, 'timeout': 1, 'proc_check_interval': 5, 'debug': False} Other operations: 1. kill local chrome process with given port: python -m ichrome -s 9222 + python -m ichrome -k 9222 2. clear user_data_dir path (remove the folder and files): python -m ichrome --clear python -m ichrome --clean + python -m ichrome -C -p 9222 3. show ChromeDaemon.__doc__: python -m ichrome --doc 4. 
crawl the URL, output the HTML DOM: @@ -44,7 +49,8 @@ def main(): help= "chrome executable file path, default to null for automatic searching", default="") - parser.add_argument("--host", + parser.add_argument("-H", + "--host", help="--remote-debugging-address, default to 127.0.0.1", default="127.0.0.1") parser.add_argument("-p", @@ -59,6 +65,7 @@ def main(): action="store_true") parser.add_argument( "-s", + "-k", "--shutdown", help="shutdown the given port, only for local running chrome", type=int) @@ -138,6 +145,12 @@ def main(): help="set logger level to DEBUG", default=False, action="store_true") + parser.add_argument( + "-K", + "--killall", + help="killall chrome launched local with --remote-debugging-port", + default=False, + action="store_true") args, extra_config = parser.parse_known_args() if args.version: print(__version__) @@ -159,6 +172,10 @@ def main(): ChromeDaemon.clear_chrome_process(args.shutdown, max_deaths=args.max_deaths) return + if args.killall: + logger.setLevel(1) + ChromeDaemon.clear_chrome_process(None, max_deaths=args.max_deaths) + return if args.clean: logger.setLevel(1) ChromeDaemon.clear_user_dir(args.user_data_dir, diff --git a/ichrome/base.py b/ichrome/base.py index 49e2535..e474981 100644 --- a/ichrome/base.py +++ b/ichrome/base.py @@ -85,7 +85,7 @@ def get_proc_by_regex(regex, proc_names=None): def get_proc(port=9222) -> List[psutil.Process]: - regex = f"--remote-debugging-port={port}" + regex = f"--remote-debugging-port={port or ''}" proc_names = {"chrome.exe", "chrome"} return get_proc_by_regex(regex, proc_names=proc_names) @@ -107,7 +107,6 @@ def clear_chrome_process(port=None, timeout=None, max_deaths=1, interval=0.5): set timeout to avoid running forever. set max_deaths and port, will return before timeout. 
""" - port = port or "" killed_count = 0 start_time = time.time() if timeout is None: @@ -115,7 +114,8 @@ def clear_chrome_process(port=None, timeout=None, max_deaths=1, interval=0.5): while 1: procs = get_proc(port) for proc in procs: - logger.debug(f"killing {proc}, port: {port}") + logger.debug( + f"[Killing] {proc}, port: {port}. {' '.join(proc.cmdline())}") try: proc.kill() except psutil._exceptions.NoSuchProcess: diff --git a/ichrome/daemon.py b/ichrome/daemon.py index 07299df..e1c01e1 100644 --- a/ichrome/daemon.py +++ b/ichrome/daemon.py @@ -6,14 +6,18 @@ import subprocess import threading import time +from getpass import getuser from pathlib import Path from torequests import tPool from torequests.aiohttp_dummy import Requests from torequests.utils import timepass, ttime -from .base import clear_chrome_process, get_memory_by_port, get_proc, get_dir_size, get_readable_dir_size +from .base import (clear_chrome_process, get_dir_size, get_memory_by_port, + get_proc, get_readable_dir_size) from .logs import logger + + """ Sync / block operations for launching chrome processes. 
""" @@ -40,7 +44,7 @@ class ChromeDaemon(object): on_startup & on_shutdown: function which handled a ChromeDaemon object while startup or shutdown - default extra_config: ["--disable-gpu", "--no-first-run"] + default extra_config: ["--disable-gpu", "--no-first-run"], root user should append "--no-sandbox" common args: @@ -123,15 +127,8 @@ def __init__( if extra_config and isinstance(extra_config, str): extra_config = [extra_config] self.extra_config = extra_config or ["--disable-gpu", "--no-first-run"] - if '--no-sandbox' not in str(self.extra_config): - import getpass - if getpass.getuser() == 'root': - if extra_config: - self.extra_config.append('--no-sandbox') - else: - logger.warning( - 'root user without "--no-sandbox" may launch fail for: "Running as root without --no-sandbox is not supported."' - ) + if '--no-sandbox' not in str(self.extra_config) and getuser() == 'root': + self.extra_config.append('--no-sandbox') if not isinstance(self.extra_config, list): raise TypeError("extra_config type should be list.") self.chrome_proc_start_time = time.time() From 291f4dddf10133478cd3aaeb7def2974c302f163 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 15:12:54 +0800 Subject: [PATCH 07/17] update default PC UA --- ichrome/__main__.py | 21 ++++++++------------- ichrome/daemon.py | 4 +--- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/ichrome/__main__.py b/ichrome/__main__.py index 1efa8a7..f5092ba 100644 --- a/ichrome/__main__.py +++ b/ichrome/__main__.py @@ -73,8 +73,7 @@ def main(): "-A", "--user-agent", "--user_agent", - help= - "--user-agen, default to 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'", + help=f"--user-agent, default to Chrome PC: {ChromeDaemon.PC_UA}", default="") parser.add_argument("-x", "--proxy", @@ -84,8 +83,7 @@ def main(): "-U", "--user-data-dir", "--user_data_dir", - help= - "user_data_dir to save the user data, default to ~/ichrome_user_data", + 
help="user_data_dir to save user data, default to ~/ichrome_user_data", default=Path.home() / 'ichrome_user_data') parser.add_argument( "--disable-image", @@ -101,8 +99,7 @@ def main(): parser.add_argument( "--max-deaths", "--max_deaths", - help= - "max deaths in 5 secs, auto restart `max_deaths` times if crash fast in 5 secs. default to 1 for without auto-restart", + help="restart times. default to 1 for without auto-restart", default=1, type=int) parser.add_argument( @@ -110,13 +107,11 @@ def main(): help="timeout to connect the remote server, default to 1 for localhost", default=1, type=int) - parser.add_argument( - "-w", - "--workers", - help= - "the number of worker processes with auto-increment port, default to 1", - default=1, - type=int) + parser.add_argument("-w", + "--workers", + help="the number of worker processes, default to 1", + default=1, + type=int) parser.add_argument( "--proc-check-interval", "--proc_check_interval", diff --git a/ichrome/daemon.py b/ichrome/daemon.py index e1c01e1..103bda0 100644 --- a/ichrome/daemon.py +++ b/ichrome/daemon.py @@ -16,8 +16,6 @@ from .base import (clear_chrome_process, get_dir_size, get_memory_by_port, get_proc, get_readable_dir_size) from .logs import logger - - """ Sync / block operations for launching chrome processes. 
""" @@ -69,7 +67,7 @@ class ChromeDaemon(object): """ port_in_using: set = set() - PC_UA = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36" + PC_UA = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Mobile Safari/537.36" MAC_OS_UA = ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) Version/8.0.1a Safari/728.28.19" ) From 6668654a4c0f58d06862475483d5903fa31ff096 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 15:14:39 +0800 Subject: [PATCH 08/17] contains alias for includes --- ichrome/async_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ichrome/async_utils.py b/ichrome/async_utils.py index dbb50b9..13de181 100644 --- a/ichrome/async_utils.py +++ b/ichrome/async_utils.py @@ -1137,6 +1137,17 @@ async def findall(self, else: return [] + async def contains(self, + text, + cssselector: str = 'html', + attribute: str = 'outerHTML', + timeout=NotSet) -> bool: + """alias for Tab.includes""" + return await self.includes(text=text, + cssselector=cssselector, + attribute=attribute, + timeout=timeout) + async def includes(self, text, cssselector: str = 'html', From 26c7ff578dccccd8d9177247253e77bcae12f464 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 15:25:39 +0800 Subject: [PATCH 09/17] add async get_free_port --- ichrome/async_utils.py | 2 +- ichrome/daemon.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/ichrome/async_utils.py b/ichrome/async_utils.py index 13de181..da211d3 100644 --- a/ichrome/async_utils.py +++ b/ichrome/async_utils.py @@ -1159,7 +1159,7 @@ async def includes(self, :type text: str :param cssselector: css selector for outerHTML, defaults to 'html' :type cssselector: str, optional - :param attribute: attribute of the selected element, defaults to 'outerHTML' + :param attribute: attribute of the selected element, defaults to 'outerHTML'. 
Sometimes for case-insensitive usage by setting `attribute='textContent.toLowerCase()'` :type attribute: str, optional :return: whether the outerHTML contains substring. :rtype: bool diff --git a/ichrome/daemon.py b/ichrome/daemon.py index 103bda0..49e6221 100644 --- a/ichrome/daemon.py +++ b/ichrome/daemon.py @@ -622,6 +622,20 @@ def ok(self): # awaitable property return self.check_chrome_ready() + @classmethod + async def get_free_port(cls, + host="127.0.0.1", + start=9222, + max_tries=100, + timeout=1): + return await asyncio.get_running_loop().run_in_executor( + None, + super().get_free_port, + host=host, + start=start, + max_tries=max_tries, + timeout=timeout) + async def check_chrome_ready(self): if self.proc_ok and await self.check_connection(): logger.info( From ba0bf43335e18a42ccd5919f0ac0b2746d19e752 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 15:54:13 +0800 Subject: [PATCH 10/17] add tab._recv_daemon_break_callback --- ichrome/async_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ichrome/async_utils.py b/ichrome/async_utils.py index da211d3..7a0985b 100644 --- a/ichrome/async_utils.py +++ b/ichrome/async_utils.py @@ -165,6 +165,7 @@ class Tab(GetValueMixin): _DEFAULT_RECV_TIMEOUT = 5.0 # aiohttp ws timeout default to 10.0, here is 5 _DEFAULT_CONNECT_TIMEOUT = 5.0 + _RECV_DAEMON_BREAK_CALLBACK = None def __init__(self, tab_id: str = None, @@ -179,6 +180,7 @@ def __init__(self, timeout=NotSet, ws_kwargs: dict = None, default_recv_callback: Callable = None, + _recv_daemon_break_callback: Callable = None, **kwargs): """ original Tab JSON:: @@ -214,8 +216,10 @@ def __init__(self, :type timeout: [type], optional :param ws_kwargs: kwargs for ws connection, defaults to None :type ws_kwargs: dict, optional - :param default_recv_callback: sync/async function only accept 1 arg of data comes from ws recv, defaults to None + :param default_recv_callback: called for each data received, sync/async function only 
accept 1 arg of data comes from ws recv, defaults to None :type default_recv_callback: Callable, optional + :param _recv_daemon_break_callback: like the tab_close_callback. sync/async function only accept 1 arg of self while _recv_daemon break, defaults to None + :type _recv_daemon_break_callback: Callable, optional :raises ValueError: [description] """ tab_id = tab_id or kwargs.pop('id') @@ -238,6 +242,7 @@ def __init__(self, self._message_id = 0 self.ws = None self.default_recv_callback = default_recv_callback + self._recv_daemon_break_callback = _recv_daemon_break_callback or self._RECV_DAEMON_BREAK_CALLBACK if self.chrome: self.req = self.chrome.req else: @@ -378,6 +383,9 @@ async def _recv_daemon(self): else: del f logger.debug(f'[break] {self!r} _recv_daemon loop break.') + if self._recv_daemon_break_callback: + return await _ensure_awaitable_callback_result( + self._recv_daemon_break_callback, self) async def send(self, method: str, From ab05f0830b43c1f15eb44d4eea5d395f589d46c3 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 16:46:47 +0800 Subject: [PATCH 11/17] easy way to init a connected Tab --- README.md | 44 ++++++++++++++++++--------------- examples_async.py | 5 ++++ ichrome/async_utils.py | 55 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 80 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index ba39c92..74e9cbd 100644 --- a/README.md +++ b/README.md @@ -220,26 +220,25 @@ import asyncio async def main(): # If there is an existing daemon, such as `python -m ichrome`, the `async with AsyncChromeDaemon` context can be omitted. 
async with AsyncChromeDaemon(): - # connect to an opened chrome + # connect to an opened chrome, default host=127.0.0.1, port=9222, headless=False async with AsyncChrome() as chrome: - tab = await chrome.new_tab(url="https://github.com/ClericPy") - # async with tab() as tab: - # and `as tab` can be omitted - async with tab(): + # If you need the current tab, set index with int like 0 for activated tab. + async with chrome.connect_tab(index='https://github.com/ClericPy', + auto_close=True) as tab: await tab.wait_loading(2) await tab.js("document.write('

Document updated.

')") await asyncio.sleep(1) # await tab.js('alert("test ok")') print('output:', await tab.html) # output:

Document updated.

- await tab.close() + # will auto_close tab while exiting context + # await tab.close() # close_browser gracefully, I have no more need of chrome instance await chrome.close_browser() if __name__ == "__main__": asyncio.run(main()) - ``` [More Examples](https://github.com/ClericPy/ichrome/blob/master/examples_async.py) @@ -277,15 +276,20 @@ usage: All the unknown args will be appended to extra_config as chrome original args. Demo: - > python -m ichrome --host=127.0.0.1 --window-size=1212,1212 --incognito - > ChromeDaemon cmd args: {'daemon': True, 'block': True, 'chrome_path': '', 'host': '127.0.0.1', 'port': 9222, 'headless': False, 'user_agent': '', 'proxy': '', 'user_data_dir': None, 'disable_image': False, 'start_url': 'about:blank', 'extra_config': ['--window-size=1212,1212', '--incognito'], 'max_deaths': 1, 'timeout': 2} + > python -m ichrome -H 127.0.0.1 -p 9222 --window-size=1212,1212 --incognito + > ChromeDaemon cmd args: port=9222, {'chrome_path': '', 'host': '127.0.0.1', 'headless': False, 'user_agent': '', 'proxy': '', 'user_data_dir': WindowsPath('C:/Users/root/ichrome_user_data'), 'disable_image': False, 'start_url': 'about:blank', 'extra_config': ['--window-size=1212,1212', '--incognito'], 'max_deaths': 1, 'timeout':1, 'proc_check_interval': 5, 'debug': False} + + > python -m ichrome + > ChromeDaemon cmd args: port=9222, {'chrome_path': '', 'host': '127.0.0.1', 'headless': False, 'user_agent': '', 'proxy': '', 'user_data_dir': WindowsPath('C:/Users/root/ichrome_user_data'), 'disable_image': False, 'start_url': 'about:blank', 'extra_config': [], 'max_deaths': 1, 'timeout': 1, 'proc_check_interval': 5, 'debug': False} Other operations: 1. kill local chrome process with given port: python -m ichrome -s 9222 + python -m ichrome -k 9222 2. clear user_data_dir path (remove the folder and files): python -m ichrome --clear python -m ichrome --clean + python -m ichrome -C -p 9222 3. show ChromeDaemon.__doc__: python -m ichrome --doc 4. 
crawl the URL, output the HTML DOM: @@ -299,19 +303,20 @@ optional arguments: -cp CHROME_PATH, --chrome-path CHROME_PATH, --chrome_path CHROME_PATH chrome executable file path, default to null for automatic searching - --host HOST --remote-debugging-address, default to 127.0.0.1 + -H HOST, --host HOST --remote-debugging-address, default to 127.0.0.1 -p PORT, --port PORT --remote-debugging-port, default to 9222 --headless --headless and --hide-scrollbars, default to False - -s SHUTDOWN, --shutdown SHUTDOWN + -s SHUTDOWN, -k SHUTDOWN, --shutdown SHUTDOWN shutdown the given port, only for local running chrome -A USER_AGENT, --user-agent USER_AGENT, --user_agent USER_AGENT - --user-agen, default to 'Mozilla/5.0 (Windows NT 10.0; - WOW64) AppleWebKit/537.36 (KHTML, like Gecko) - Chrome/70.0.3538.102 Safari/537.36' + --user-agent, default to Chrome PC: Mozilla/5.0 + (Linux; Android 6.0; Nexus 5 Build/MRA58N) + AppleWebKit/537.36 (KHTML, like Gecko) + Chrome/83.0.4103.106 Mobile Safari/537.36 -x PROXY, --proxy PROXY --proxy-server, default to None -U USER_DATA_DIR, --user-data-dir USER_DATA_DIR, --user_data_dir USER_DATA_DIR - user_data_dir to save the user data, default to + user_data_dir to save user data, default to ~/ichrome_user_data --disable-image, --disable_image disable image for loading performance, default to @@ -320,20 +325,19 @@ optional arguments: start url while launching chrome, default to about:blank --max-deaths MAX_DEATHS, --max_deaths MAX_DEATHS - max deaths in 5 secs, auto restart `max_deaths` times - if crash fast in 5 secs. default to 1 for without - auto-restart + restart times. 
default to 1 for without auto-restart --timeout TIMEOUT timeout to connect the remote server, default to 1 for localhost -w WORKERS, --workers WORKERS - the number of worker processes with auto-increment - port, default to 1 + the number of worker processes, default to 1 --proc-check-interval PROC_CHECK_INTERVAL, --proc_check_interval PROC_CHECK_INTERVAL check chrome process alive every interval seconds --crawl crawl the given URL, output the HTML DOM -C, --clear, --clear clean user_data_dir --doc show ChromeDaemon.__doc__ --debug set logger level to DEBUG + -K, --killall killall chrome launched local with --remote-debugging- + port ``` ## Interactive Debugging diff --git a/examples_async.py b/examples_async.py index cc56e08..25cb911 100644 --- a/examples_async.py +++ b/examples_async.py @@ -291,6 +291,11 @@ def on_shutdown(chromed): assert await tab.clear_browser_cache() # close tab await tab.close() + # test chrome.connect_tab + async with chrome.connect_tab(chrome.get_server('/json'), + True) as tab: + await tab.wait_loading(2) + assert 'webSocketDebuggerUrl' in (await tab.current_html) # close_browser gracefully, I have no more need of chrome instance await chrome.close_browser() # await chrome.kill() diff --git a/ichrome/async_utils.py b/ichrome/async_utils.py index 7a0985b..b85be6e 100644 --- a/ichrome/async_utils.py +++ b/ichrome/async_utils.py @@ -47,7 +47,7 @@ async def _ensure_awaitable_callback_result(callback_function, result): return callback_result -class _TabConnectionManager(object): +class _TabConnectionManager: def __init__(self, tabs): self.tabs = tabs @@ -65,7 +65,37 @@ async def __aexit__(self, exc_type, exc_val, exc_tb): await ws_connection.shutdown() -class _WSConnection(object): +class _SingleTabConnectionManager: + + def __init__(self, + chrome: 'Chrome', + index: Union[None, int, str] = 0, + auto_close: bool = False): + self.chrome = chrome + self.index = index + self.tab: 'Tab' = None + self._ws_connection: '_WSConnection' = None + 
self._auto_close = auto_close + + async def __aenter__(self) -> 'Tab': + if isinstance(self.index, int): + self.tab = await self.chrome.get_tab(self.index) + else: + self.tab = await self.chrome.new_tab(self.index or "") + if not self.tab: + raise ValueError(f'Tab not found.') + self._ws_connection = _WSConnection(self.tab) + await self._ws_connection.__aenter__() + return self.tab + + async def __aexit__(self, exc_type, exc_val, exc_tb): + if self._ws_connection: + if self._auto_close: + await self.tab.close(timeout=0) + await self._ws_connection.__aexit__() + + +class _WSConnection: def __init__(self, tab): self.tab = tab @@ -1707,7 +1737,7 @@ def mouse_drag_rel_chain(self, timeout=timeout) -class OffsetMoveWalker(object): +class OffsetMoveWalker: __slots__ = ('path', 'start_x', 'start_y', 'tab', 'timeout') def __init__(self, start_x, start_y, tab: Tab, timeout=NotSet): @@ -1767,7 +1797,7 @@ async def start(self): return self -class Listener(object): +class Listener: def __init__(self): self._registered_futures = WeakValueDictionary() @@ -2001,6 +2031,23 @@ async def close_tabs(self, tab_ids = await self.tabs return [await self.close_tab(tab_id) for tab_id in tab_ids] + def connect_tab(self, + index: Union[None, int, str] = 0, + auto_close: bool = False): + '''More easier way to init a connected Tab with `async with`. + + Got a connected Tab object by using `async with chrome.connect_tab(0):` + + index = 0 means the current tab. + index = None means create a new tab. + index = 'http://python.org' means create a new tab with url. + + If auto_close is True: close this tab while exiting context. +''' + return _SingleTabConnectionManager(chrome=self, + index=index, + auto_close=auto_close) + def connect_tabs(self, *tabs) -> '_TabConnectionManager': '''async with chrome.connect_tabs([tab1, tab2]):. 
or From 78f1086b114dfe1438bbd6c575c3db0b22ddd137 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 16:50:48 +0800 Subject: [PATCH 12/17] update readmes --- examples_async.py | 1 + ichrome/debugger.py | 3 +-- use_cases.py | 7 ++----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/examples_async.py b/examples_async.py index 25cb911..8753bf5 100644 --- a/examples_async.py +++ b/examples_async.py @@ -261,6 +261,7 @@ def on_shutdown(chromed): assert chrome.get_memory() > 0 await test_chrome(chrome) # ===================== Tab Test Cases ===================== + # Duplicate, use async with chrome.connect_tab(None) instead tab: Tab = await chrome.new_tab() await test_tab_ws(tab) # same as: async with tab.connect(): diff --git a/ichrome/debugger.py b/ichrome/debugger.py index c8203cb..19d20c5 100644 --- a/ichrome/debugger.py +++ b/ichrome/debugger.py @@ -276,8 +276,7 @@ async def crawl_once(**kwargs): port=kwargs.get('port', 9222), timeout=cd._timeout or 2, ) as chrome: - tab: AsyncTab = await chrome[0] - async with tab(): + async with chrome.connect_tab(0) as tab: await tab.set_url(url, timeout=cd._timeout) html = await tab.get_html(timeout=cd._timeout) print(html) diff --git a/use_cases.py b/use_cases.py index 514ed4a..2c37f23 100644 --- a/use_cases.py +++ b/use_cases.py @@ -33,8 +33,7 @@ async def main(): # listen network flow in 60 s timeout = 60 async with AsyncChrome() as chrome: - tab: AsyncTab = await chrome[0] - async with tab(): + async with chrome.connect_tab(0) as tab: await tab.wait_request(filter_function=filter_function, timeout=timeout) @@ -57,14 +56,12 @@ async def main(): timeout = 3 async def crawl(url): - tab: AsyncTab = await chrome.new_tab(url) - async with tab(): + async with chrome.connect_tab(url, True) as tab: await tab.wait_loading(timeout=timeout) html = await tab.html result = re.search('

<h1>(.*?)</h1>

', html).group(1) print(result) assert result == 'Herman Melville - Moby-Dick' - await tab.close() async with AsyncChromeDaemon(headless=True): async with AsyncChrome() as chrome: From 52bdf7bdebaadf716de436dcfc8b1a000dec146d Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 16:57:04 +0800 Subject: [PATCH 13/17] update test cases --- use_cases.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/use_cases.py b/use_cases.py index 2c37f23..2a8528f 100644 --- a/use_cases.py +++ b/use_cases.py @@ -4,17 +4,15 @@ """ -def network_sniffer(): +def network_sniffer(timeout=60): """network flow sniffer - 0. launch a chrome daemon before running this use case - > python3 -m ichrome 1. run the function. 2. change url of chrome's tab. 3. watch the console logs. """ import asyncio - from ichrome import AsyncChrome, AsyncTab + from ichrome import AsyncChrome, AsyncTab, AsyncChromeDaemon import json get_data_value = AsyncTab.get_data_value @@ -30,26 +28,24 @@ def filter_function(r): # print(r) async def main(): - # listen network flow in 60 s - timeout = 60 - async with AsyncChrome() as chrome: - async with chrome.connect_tab(0) as tab: - await tab.wait_request(filter_function=filter_function, - timeout=timeout) + async with AsyncChromeDaemon(): + async with AsyncChrome() as chrome: + async with chrome.connect_tab(0) as tab: + await tab.wait_request(filter_function=filter_function, + timeout=timeout) asyncio.run(main()) -def html_headless_crawler(): +def html_headless_crawler(url='http://httpbin.org/html'): """crawl a page with headless chrome""" import asyncio import re - from ichrome import AsyncChrome, AsyncTab, AsyncChromeDaemon + from ichrome import AsyncChrome, AsyncChromeDaemon # WARNING: Chrome has a limit of 6 connections per host name, and a max of 10 connections. 
# Read more: https://blog.bluetriangle.com/blocking-web-performance-villain - test_urls = ['http://httpbin.org/html'] * 3 async def main(): # crawl 3 urls in 3 tabs @@ -62,12 +58,16 @@ async def crawl(url): result = re.search('

<h1>(.*?)</h1>

', html).group(1) print(result) assert result == 'Herman Melville - Moby-Dick' - + # multi-urls concurrently crawl + # test_urls = ['http://httpbin.org/html'] * 3 + # async with AsyncChromeDaemon(headless=True): + # async with AsyncChrome() as chrome: + # tasks = [asyncio.ensure_future(crawl(url)) for url in test_urls] + # await asyncio.wait(tasks) + # # await asyncio.sleep(2) async with AsyncChromeDaemon(headless=True): async with AsyncChrome() as chrome: - tasks = [asyncio.ensure_future(crawl(url)) for url in test_urls] - await asyncio.wait(tasks) - # await asyncio.sleep(2) + await crawl(url) asyncio.run(main()) @@ -102,6 +102,6 @@ async def main(): if __name__ == "__main__": pass - # network_sniffer() - # html_headless_crawler() + # network_sniffer(120) + # html_headless_crawler('http://httpbin.org/html') # custom_ua_headless_crawler() From 162bea2664478e4c1b320001125ba80088edda0f Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 17:26:44 +0800 Subject: [PATCH 14/17] add --clear-cache --- ichrome/__main__.py | 20 ++++++++++++++++---- ichrome/debugger.py | 14 ++++++++++++-- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/ichrome/__main__.py b/ichrome/__main__.py index f5092ba..32a6487 100644 --- a/ichrome/__main__.py +++ b/ichrome/__main__.py @@ -30,7 +30,7 @@ def main(): 3. show ChromeDaemon.__doc__: python -m ichrome --doc 4. 
crawl the URL, output the HTML DOM: - python -m ichrome --crawl --headless --timeout=2 http://myip.ipip.net/ + python -m ichrome --crawl --timeout=2 http://myip.ipip.net/ ''' parser = argparse.ArgumentParser(usage=usage) parser.add_argument("-v", @@ -61,7 +61,7 @@ def main(): parser.add_argument( "--headless", help="--headless and --hide-scrollbars, default to False", - default=False, + default=argparse.SUPPRESS, action="store_true") parser.add_argument( "-s", @@ -119,7 +119,8 @@ def main(): help="check chrome process alive every interval seconds", default=5, type=int) - parser.add_argument("--crawl", + parser.add_argument("-crawl", + "--crawl", help="crawl the given URL, output the HTML DOM", default=False, action="store_true") @@ -140,6 +141,12 @@ def main(): help="set logger level to DEBUG", default=False, action="store_true") + parser.add_argument("-cc", + "--clear-cache", + "--clear_cache", + help="clear cache for given port, port default to 9222", + default=False, + action="store_true") parser.add_argument( "-K", "--killall", @@ -185,7 +192,7 @@ def main(): kwargs.update( chrome_path=args.chrome_path, host=args.host, - headless=args.headless, + headless=getattr(args, 'headless', False), user_agent=args.user_agent, proxy=args.proxy, user_data_dir=args.user_data_dir, @@ -206,7 +213,12 @@ def main(): if '--dump-dom' in extra_config or args.crawl: logger.setLevel(60) from .debugger import crawl_once + kwargs['headless'] = getattr(args, 'headless', True) asyncio.run(crawl_once(**kwargs)) + elif args.clear_cache: + from .debugger import clear_cache_handler + kwargs['headless'] = getattr(args, 'headless', True) + asyncio.run(clear_cache_handler(**kwargs)) else: start_port = getattr(args, 'port', 9222) asyncio.run( diff --git a/ichrome/debugger.py b/ichrome/debugger.py index 19d20c5..5c2c501 100644 --- a/ichrome/debugger.py +++ b/ichrome/debugger.py @@ -276,8 +276,18 @@ async def crawl_once(**kwargs): port=kwargs.get('port', 9222), timeout=cd._timeout or 2, ) as 
chrome: - async with chrome.connect_tab(0) as tab: + async with chrome.connect_tab(0, auto_close=True) as tab: await tab.set_url(url, timeout=cd._timeout) html = await tab.get_html(timeout=cd._timeout) print(html) - # await tab.close_browser() + + +async def clear_cache_handler(**kwargs): + async with AsyncChromeDaemon(**kwargs) as cd: + async with AsyncChrome( + host=kwargs.get('host', '127.0.0.1'), + port=kwargs.get('port', 9222), + timeout=cd._timeout or 2, + ) as chrome: + async with chrome.connect_tab(0, auto_close=True) as tab: + await tab.clear_browser_cache() From 59a3ef64a56194f3d9a674c0fe74c47df64fda2d Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 17:42:10 +0800 Subject: [PATCH 15/17] fix tab.wait_page_loading --- ichrome/async_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ichrome/async_utils.py b/ichrome/async_utils.py index b85be6e..4da9ead 100644 --- a/ichrome/async_utils.py +++ b/ichrome/async_utils.py @@ -683,9 +683,10 @@ async def wait_page_loading(self, timeout=None, callback_function: Optional[Callable] = None, timeout_stop_loading=False): - return self.wait_loading(timeout=timeout, - callback_function=callback_function, - timeout_stop_loading=timeout_stop_loading) + return await self.wait_loading( + timeout=timeout, + callback_function=callback_function, + timeout_stop_loading=timeout_stop_loading) async def wait_event( self, From 12b0ce4fdb7ef1958c0d8cbdf6ebf47b2a63e1a8 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 17:48:25 +0800 Subject: [PATCH 16/17] update readme, 2.3.0 --- README.md | 80 +++++++++++++++++++++++++++------------------ ichrome/__init__.py | 2 +- 2 files changed, 49 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 74e9cbd..36e86ea 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,23 @@ assert (await tab0.current_url) == 'about:blank' assert (await tab1.current_url) == 'about:blank' ``` - +1. 
`connect_tab` + > The easiest way to get a connected tab. + > get an existing tab + ```python + async with chrome.connect_tab(0) as tab: + print(await tab.current_title) + ``` + > get a new tab and auto close it + ```python + async with chrome.connect_tab(None, True) as tab: + print(await tab.current_title) + ``` + > get a new tab with given url and auto close it + ```python + async with chrome.connect_tab('http://python.org', True) as tab: + print(await tab.current_title) + ``` @@ -90,91 +106,91 @@ 1. `set_url` / `reload` > navigate to a new url. `reload` equals to `set_url(None)` -1. `wait_event` +2. `wait_event` > listening the events with given name, and separate from other same-name events with filter_function, finally run the callback_function with result. -1. `wait_page_loading` / `wait_loading` +3. `wait_page_loading` / `wait_loading` > wait for `Page.loadEventFired` event, or stop loading while timeout. Different from `wait_loading_finished`. -1. `wait_response` / `wait_request` +4. `wait_response` / `wait_request` > filt the `Network.responseReceived` / `Network.requestWillBeSent` event by `filter_function`, return the `request_dict` which can be used by `get_response` / `get_response_body` / `get_request_post_data`. WARNING: requestWillBeSent event fired do not mean the response is ready, should await tab.wait_request_loading(request_dict) or await tab.get_response(request_dict, wait_loading=True) -1. `wait_request_loading` / `wait_loading_finished` +5. `wait_request_loading` / `wait_loading_finished` > sometimes event got `request_dict` with `wait_response`, but the ajax request is still fetching, which need to wait the `Network.loadingFinished` event. -1. `activate` / `activate_tab` +6. `activate` / `activate_tab` > activate tab with websocket / http message. -1. `close` / `close_tab` +7. `close` / `close_tab` > close tab with websocket / http message. -1. `add_js_onload` +8. 
`add_js_onload` > `Page.addScriptToEvaluateOnNewDocument`, which means this javascript code will be run before page loaded. -1. `clear_browser_cache` / `clear_browser_cookies` +9. `clear_browser_cache` / `clear_browser_cookies` > `Network.clearBrowserCache` and `Network.clearBrowserCookies` -1. `querySelectorAll` +10. `querySelectorAll` > get the tag instance, which contains the `tagName, innerHTML, outerHTML, textContent, attributes` attrs. -1. `click` +11. `click` > click the element queried by given *css selector*. -1. `refresh_tab_info` +12. `refresh_tab_info` > to refresh the init attrs: `url`, `title`. -1. `current_html` / `current_title` / `current_url` +13. `current_html` / `current_title` / `current_url` > get the current html / title / url with `tab.js`. or using the `refresh_tab_info` method and init attrs. -1. `crash` +14. `crash` > `Page.crash` -1. `get_cookies` / `get_all_cookies` / `delete_cookies` / `set_cookie` +15. `get_cookies` / `get_all_cookies` / `delete_cookies` / `set_cookie` > some page cookies operations. -1. `set_headers` / `set_ua` +16. `set_headers` / `set_ua` > `Network.setExtraHTTPHeaders` and `Network.setUserAgentOverride`, used to update headers dynamically. -1. `close_browser` +17. `close_browser` > send `Browser.close` message to close the chrome browser gracefully. -1. `get_bounding_client_rect` / `get_element_clip` +18. `get_bounding_client_rect` / `get_element_clip` > `get_element_clip` is alias name for the other, these two method is to get the rect of element which queried by css element. -1. `screenshot` / `screenshot_element` +19. `screenshot` / `screenshot_element` > get the screenshot base64 encoded image data. `screenshot_element` should be given a css selector to locate the element. -1. `get_page_size` / `get_screen_size` +20. `get_page_size` / `get_screen_size` > size of current window or the whole screen. -1. `get_response` +21. `get_response` > get the response body with the given request dict. -1. `js` +22. 
`js` > run the given js code, return the raw response from sending `Runtime.evaluate` message. -1. `inject_js_url` +23. `inject_js_url` > inject some js url, like `` do. -1. `get_value` & `get_variable` +24. `get_value` & `get_variable` > run the given js variable or expression, and return the result. ```python await tab.get_value('document.title') await tab.get_value("document.querySelector('title').innerText") ``` -8. `keyboard_send` +25. `keyboard_send` > dispath key event with `Input.dispatchKeyEvent` -9. `mouse_click` +26. `mouse_click` > dispath click event on given position -1. `mouse_drag` +27. `mouse_drag` > dispath drag event on given position, and return the target x, y. `duration` arg is to slow down the move speed. -1. `mouse_drag_rel` +28. `mouse_drag_rel` > dispath drag event on given offset, and return the target x, y. -1. `mouse_drag_rel` +29. `mouse_drag_rel` > drag with offsets continuously. ```python await tab.set_url('https://draw.yunser.com/') @@ -182,10 +198,10 @@ 0, 50, 0.2).move(-50, 0, 0.2).move(0, -50, 0.2) await walker.move(50 * 1.414, 50 * 1.414, 0.2) ``` -1. `mouse_press` / `mouse_release` / `mouse_move` / `mouse_move_rel` / `mouse_move_rel_chain` +30. `mouse_press` / `mouse_release` / `mouse_move` / `mouse_move_rel` / `mouse_move_rel_chain` > similar to the drag features. These mouse features is only dispatched events, not the real mouse action. -1. `history_back` / `history_forward` / `goto_history_relative` / `reset_history` +31. `history_back` / `history_forward` / `goto_history_relative` / `reset_history` > back / forward history @@ -222,7 +238,7 @@ async def main(): async with AsyncChromeDaemon(): # connect to an opened chrome, default host=127.0.0.1, port=9222, headless=False async with AsyncChrome() as chrome: - # If you need the current tab, set index with int like 0 for activated tab. 
+ # If you need reuse an existing tab, set index with int like 0 for activated tab, such as `async with chrome.connect_tab(0) as tab:` async with chrome.connect_tab(index='https://github.com/ClericPy', auto_close=True) as tab: await tab.wait_loading(2) diff --git a/ichrome/__init__.py b/ichrome/__init__.py index 2eb7256..0f40b61 100644 --- a/ichrome/__init__.py +++ b/ichrome/__init__.py @@ -5,7 +5,7 @@ from .logs import logger from .sync_utils import Chrome, Tab -__version__ = "2.2.4" +__version__ = "2.3.0" __tips__ = "[github]: https://github.com/ClericPy/ichrome\n[cdp]: https://chromedevtools.github.io/devtools-protocol/\n[cmd args]: https://peter.sh/experiments/chromium-command-line-switches/" __all__ = [ 'Chrome', 'ChromeDaemon', 'Tab', 'Tag', 'AsyncChrome', 'AsyncTab', 'logger', From c440d6801feb464a096f06ebfd343bb7606b3249 Mon Sep 17 00:00:00 2001 From: clericpy Date: Sun, 21 Jun 2020 17:50:55 +0800 Subject: [PATCH 17/17] fix test cases --- examples_async.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples_async.py b/examples_async.py index 8753bf5..b1bd8cb 100644 --- a/examples_async.py +++ b/examples_async.py @@ -293,8 +293,7 @@ def on_shutdown(chromed): # close tab await tab.close() # test chrome.connect_tab - async with chrome.connect_tab(chrome.get_server('/json'), - True) as tab: + async with chrome.connect_tab(chrome.server + '/json', True) as tab: await tab.wait_loading(2) assert 'webSocketDebuggerUrl' in (await tab.current_html) # close_browser gracefully, I have no more need of chrome instance