fix: run log writer in a separate thread
monosans committed Nov 6, 2024
1 parent 460090d commit 48ffa9a
Showing 8 changed files with 73 additions and 56 deletions.
47 changes: 16 additions & 31 deletions proxy_scraper_checker/__main__.py
@@ -1,15 +1,17 @@
# ruff: noqa: E402
from __future__ import annotations

from . import logs

_console, _logs_listener = logs.configure()

import asyncio
import logging
import sys
from typing import TYPE_CHECKING

import aiofiles
import rich.traceback
from aiohttp import ClientSession, TCPConnector
from rich.console import Console
from rich.logging import RichHandler
from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn
from rich.table import Table

@@ -36,7 +38,7 @@

T = TypeVar("T")

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)


def get_async_run() -> Callable[[Coroutine[Any, Any, T]], T]:
@@ -73,27 +75,6 @@ async def read_config(file: str, /) -> dict[str, Any]:
return tomllib.loads(utils.bytes_decode(content))


def configure_logging(*, console: Console, debug: bool) -> None:
rich.traceback.install(
console=console, width=None, extra_lines=0, word_wrap=True
)
logging.basicConfig(
format="%(message)s",
datefmt=logging.Formatter.default_time_format,
level=logging.DEBUG if debug else logging.INFO,
handlers=(
RichHandler(
console=console,
omit_repeated_times=False,
show_path=False,
rich_tracebacks=True,
tracebacks_extra_lines=0,
),
),
force=True,
)


def get_summary_table(
*, before: Mapping[ProxyType, int], after: Mapping[ProxyType, int]
) -> Table:
@@ -113,8 +94,8 @@ def get_summary_table(

async def main() -> None:
cfg = await read_config("config.toml")
console = Console()
configure_logging(console=console, debug=cfg["debug"])
if cfg["debug"]:
logging.root.setLevel(logging.DEBUG)
should_save = False
try:
async with ClientSession(
@@ -132,7 +113,7 @@ async def main() -> None:
TextColumn("[green]{task.fields[col2]}"),
BarColumn(),
MofNCompleteColumn(),
console=console,
console=_console,
transient=True,
) as progress:
scrape = scraper.scrape_all(
@@ -166,7 +147,7 @@ async def main() -> None:
if settings.check_website:
storage.remove_unchecked()
count_after_checking = storage.get_count()
console.print(
_console.print(
get_summary_table(
before=count_before_checking, after=count_after_checking
)
@@ -176,10 +157,14 @@ async def main() -> None:
output.save_proxies, storage=storage, settings=settings
)

logger.info(
_logger.info(
"Thank you for using https://github.com/monosans/proxy-scraper-checker"
)


if __name__ == "__main__":
get_async_run()(main())
_logs_listener.start()
try:
get_async_run()(main())
finally:
_logs_listener.stop()
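
The __main__.py changes above move console and logging setup into the new logs module, import it before anything else that might log, and wrap the async entry point so the queue listener thread is started first and always stopped. A minimal standalone sketch of that lifecycle using only the standard library (configure and run here are illustrative stand-ins, not the project's exact API):

import asyncio
import logging
import logging.handlers
import queue


def configure() -> logging.handlers.QueueListener:
    # Stand-in for logs.configure(): route every record through a queue so
    # formatting and emission happen on the listener thread, not the caller.
    log_queue: queue.Queue = queue.Queue()
    logging.root.setLevel(logging.INFO)
    logging.root.addHandler(logging.handlers.QueueHandler(log_queue))
    return logging.handlers.QueueListener(log_queue, logging.StreamHandler())


async def run() -> None:
    logging.getLogger(__name__).info("doing work")


if __name__ == "__main__":
    listener = configure()
    listener.start()  # spawns the background writer thread
    try:
        asyncio.run(run())
    finally:
        listener.stop()  # flushes queued records and joins the thread
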
6 changes: 3 additions & 3 deletions proxy_scraper_checker/checker.py
@@ -16,7 +16,7 @@
from .settings import Settings
from .storage import ProxyStorage

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)


async def check_one(
@@ -32,9 +32,9 @@ async def check_one(
except Exception as e:
# Too many open files
if isinstance(e, OSError) and e.errno == 24: # noqa: PLR2004
logger.error("Please, set max_connections to lower value")
_logger.error("Please, set max_connections to lower value")

logger.debug(
_logger.debug(
"%s.%s: %s", e.__class__.__module__, e.__class__.__qualname__, e
)
storage.remove(proxy)
4 changes: 2 additions & 2 deletions proxy_scraper_checker/fs.py
@@ -8,7 +8,7 @@
if TYPE_CHECKING:
from pathlib import Path

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)
CACHE_PATH = platformdirs.user_cache_path("proxy_scraper_checker")


@@ -20,7 +20,7 @@ def add_permission(
new_permissions = current_permissions | permission
if current_permissions != new_permissions:
path.chmod(new_permissions)
logger.info(
_logger.info(
"Changed permissions of %s from %o to %o",
path,
current_permissions,
8 changes: 4 additions & 4 deletions proxy_scraper_checker/geodb.py
@@ -15,7 +15,7 @@
from aiohttp import ClientResponse, ClientSession
from rich.progress import Progress, TaskID

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)

GEODB_URL = "https://raw.githubusercontent.com/P3TERX/GeoLite.mmdb/download/GeoLite2-City.mmdb"
GEODB_PATH = fs.CACHE_PATH / "geolocation_database.mmdb"
@@ -70,7 +70,7 @@ async def download_geodb(*, progress: Progress, session: ClientSession) -> None:

async with session.get(GEODB_URL, headers=headers) as response:
if response.status == 304: # noqa: PLR2004
logger.info(
_logger.info(
"Latest geolocation database is already cached at %s",
GEODB_PATH,
)
@@ -87,13 +87,13 @@
)

if IS_DOCKER:
logger.info(
_logger.info(
"Downloaded geolocation database to proxy_scraper_checker_cache "
"Docker volume (%s in container)",
GEODB_PATH,
)
else:
logger.info("Downloaded geolocation database to %s", GEODB_PATH)
_logger.info("Downloaded geolocation database to %s", GEODB_PATH)

if etag := response.headers.get(hdrs.ETAG):
await _save_etag(etag)
32 changes: 32 additions & 0 deletions proxy_scraper_checker/logs.py
@@ -0,0 +1,32 @@
from __future__ import annotations

import logging
import logging.handlers
import queue
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any

from rich.console import Console


def configure() -> tuple[Console, logging.handlers.QueueListener]:
log_queue: queue.Queue[Any] = queue.Queue()

logging.root.setLevel(logging.INFO)
logging.root.addHandler(logging.handlers.QueueHandler(log_queue))

# Start logging before importing rich for the first time
from rich.console import Console # noqa: PLC0415
from rich.logging import RichHandler # noqa: PLC0415

console = Console()
stream_handler = RichHandler(
console=console,
omit_repeated_times=False,
show_path=False,
log_time_format=logging.Formatter.default_time_format,
)

return console, logging.handlers.QueueListener(log_queue, stream_handler)
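
For context on the standard-library pieces used in logs.py: QueueHandler only enqueues each LogRecord, while QueueListener runs a worker thread that drains the queue and hands records to the real handlers (here, RichHandler), so slow console writes no longer block the calling thread or the event loop. A small self-contained demonstration, independent of this repository, that prints which thread actually emits each record:

import logging
import logging.handlers
import queue
import threading


class ThreadNameHandler(logging.Handler):
    # Prints the name of the thread that actually emits each record.
    def emit(self, record: logging.LogRecord) -> None:
        print(f"[{threading.current_thread().name}] {record.getMessage()}")


log_queue: queue.Queue = queue.Queue()
logging.root.setLevel(logging.INFO)
logging.root.addHandler(logging.handlers.QueueHandler(log_queue))

listener = logging.handlers.QueueListener(log_queue, ThreadNameHandler())
listener.start()

# Logged from MainThread, but emitted by the listener's worker thread.
logging.getLogger(__name__).info("hello from the queue")

listener.stop()
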
6 changes: 3 additions & 3 deletions proxy_scraper_checker/output.py
@@ -20,7 +20,7 @@
from .settings import Settings
from .storage import ProxyStorage

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)


def _create_proxy_list_str(
@@ -105,11 +105,11 @@ def save_proxies(*, settings: Settings, storage: ProxyStorage) -> None:
text, encoding="utf-8"
)
if IS_DOCKER:
logger.info(
_logger.info(
"Proxies have been saved to ./out (%s in container)",
settings.output_path.absolute(),
)
else:
logger.info(
_logger.info(
"Proxies have been saved to %s", settings.output_path.absolute()
)
8 changes: 4 additions & 4 deletions proxy_scraper_checker/scraper.py
@@ -21,7 +21,7 @@
from .settings import Settings
from .storage import ProxyStorage

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)


async def scrape_one(
@@ -44,11 +44,11 @@ async def scrape_one(
content = await f.read()
text = bytes_decode(content)
except ClientResponseError as e:
logger.warning(
_logger.warning(
"%s | HTTP status code %d: %s", source, e.status, e.message
)
except Exception as e:
logger.warning(
_logger.warning(
"%s | %s.%s: %s",
source,
e.__class__.__module__,
@@ -60,7 +60,7 @@
try:
proxy = next(proxies)
except StopIteration:
logger.warning("%s | No proxies found", source)
_logger.warning("%s | No proxies found", source)
else:
for proxy in itertools.chain((proxy,), proxies): # noqa: B020
try:
18 changes: 9 additions & 9 deletions proxy_scraper_checker/settings.py
@@ -32,7 +32,7 @@

from .proxy import Proxy

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)


def _get_supported_max_connections() -> int | None:
@@ -46,7 +46,7 @@ def _get_supported_max_connections() -> int | None:
import resource # type: ignore[unreachable, unused-ignore] # noqa: PLC0415

soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
logger.debug(
_logger.debug(
"max_connections: soft limit = %d, hard limit = %d, infinity = %d",
soft_limit,
hard_limit,
@@ -56,7 +56,7 @@
try:
resource.setrlimit(resource.RLIMIT_NOFILE, (hard_limit, hard_limit))
except ValueError as e:
logger.warning("Failed setting max_connections: %s", e)
_logger.warning("Failed setting max_connections: %s", e)
else:
soft_limit = hard_limit
if soft_limit == resource.RLIM_INFINITY:
@@ -70,11 +70,11 @@ def _get_max_connections(value: int, /) -> int | None:
raise ValueError(msg)
max_supported = _get_supported_max_connections()
if not value:
logger.info("Using %d as max_connections value", max_supported or 0)
_logger.info("Using %d as max_connections value", max_supported or 0)
return max_supported
if not max_supported or value <= max_supported:
return value
logger.warning(
_logger.warning(
"max_connections value is too high for your OS. "
"The config value will be ignored and %d will be used.%s",
max_supported,
@@ -151,7 +151,7 @@ async def _get_check_website_type_and_real_ip(
content = await response.read()
text = get_response_text(response=response, content=content)
except Exception:
logger.exception(
_logger.exception(
"Error when opening check_website without proxy, it will be "
"impossible to determine anonymity and geolocation of proxies"
)
@@ -168,7 +168,7 @@ async def _get_check_website_type_and_real_ip(
return CheckWebsiteType.HTTPBIN_IP, parse_ipv4(js["origin"])
except (KeyError, TypeError, ValueError):
pass
logger.warning(
_logger.warning(
"Check_website is not httpbin and does not return plain ip, so it will"
" be impossible to determine the anonymity and geolocation of proxies"
)
@@ -249,7 +249,7 @@ def __attrs_post_init__(self) -> None:
raise ValueError(msg)

if not self.check_website and self.sort_by_speed:
logger.warning(
_logger.warning(
"Proxy checking is disabled, so sorting by speed is not"
" possible. Alphabetical sorting will be used instead."
)
@@ -272,7 +272,7 @@ def _validate_check_website(
raise ValueError(msg)

if parsed_url.scheme == "http":
logger.warning(
_logger.warning(
"check_website uses the http protocol. "
"It is recommended to use https for correct checking."
)
