From 7d39909124b88003873e54e63d06b852fb244376 Mon Sep 17 00:00:00 2001 From: thatDudo Date: Thu, 16 Nov 2023 21:11:18 +0100 Subject: [PATCH 1/4] Fix waifu2x --- manga_translator/translators/chatgpt.py | 7 +++++-- manga_translator/upscaling/waifu2x.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/manga_translator/translators/chatgpt.py b/manga_translator/translators/chatgpt.py index aa0fed260..ffca13491 100644 --- a/manga_translator/translators/chatgpt.py +++ b/manga_translator/translators/chatgpt.py @@ -1,6 +1,9 @@ import re -import openai -import openai.error +try: + import openai + import openai.error +except ImportError: + openai = None import asyncio import time from typing import List, Dict diff --git a/manga_translator/upscaling/waifu2x.py b/manga_translator/upscaling/waifu2x.py index 8ea7849e2..5c654b3cb 100644 --- a/manga_translator/upscaling/waifu2x.py +++ b/manga_translator/upscaling/waifu2x.py @@ -51,7 +51,7 @@ # https://github.com/nihui/waifu2x-ncnn-vulkan class Waifu2xUpscaler(OfflineUpscaler): # ~2GB of vram _MODEL_MAPPING = model_mapping - _VALID_UPSCALE_RATIOS = [1, 2, 4, 8, 16, 32] + _VALID_UPSCALE_RATIOS = [2, 4, 8, 16, 32] def __init__(self, *args, **kwargs): os.makedirs(self.model_dir, exist_ok=True) From 5cec578cebd7b68294aab95faa27b5bbd80ef7d0 Mon Sep 17 00:00:00 2001 From: thatDudo Date: Fri, 17 Nov 2023 22:56:09 +0100 Subject: [PATCH 2/4] Set correct openai version (0.28) in requirements.txt We need to update the google translator to use the newer httpx version that openai relies on. 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 618189773..8b213d0a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,7 +29,7 @@ websockets protobuf ctranslate2 colorama -openai +openai==0.28 open_clip_torch safetensors pandas From 45001cc6eb7a02479b80deb7a0df37d8a2a27f42 Mon Sep 17 00:00:00 2001 From: thatDudo Date: Sun, 19 Nov 2023 22:30:27 +0100 Subject: [PATCH 3/4] Filter after merge and skip text that's already in the target language --- README.md | 7 ++-- README_CN.md | 7 ++-- manga_translator/args.py | 1 + manga_translator/manga_translator.py | 48 ++++++++++++++------------ manga_translator/rendering/__init__.py | 2 +- manga_translator/utils/generic.py | 2 +- manga_translator/utils/textblock.py | 21 ++++++----- 7 files changed, 49 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 9bca76465..539af5e7e 100644 --- a/README.md +++ b/README.md @@ -398,7 +398,7 @@ THA: Thai --detector {default,ctd,craft,none} Text detector used for creating a text mask from an image, DO NOT use craft for manga, it's not designed for it ---ocr {48px,32px,48px_ctc} Optical character recognition (OCR) model to use +--ocr {32px,48px,48px_ctc} Optical character recognition (OCR) model to use --inpainter {default,lama_large,lama_mpe,sd,none,original} Inpainting model to use --upscaler {waifu2x,esrgan,4xultrasharp} Upscaler to use. --upscale-ratio has to be set for it @@ -431,10 +431,11 @@ THA: Thai --box-threshold BOX_THRESHOLD Threshold for bbox generation --text-threshold TEXT_THRESHOLD Threshold for text detection --min-text-length MIN_TEXT_LENGTH Minimum text length of a text region +--no-text-lang-skip Dont skip text that is seemingly already in the target + language. --inpainting-size INPAINTING_SIZE Size of image used for inpainting (too large will result in OOM) ---inpainting-precision INPAINTING_PRECISION Inpainting precision for lama, - use bf16 while you can. 
+--inpainting-precision {fp32,fp16,bf16} Inpainting precision for lama, use bf16 while you can. --colorization-size COLORIZATION_SIZE Size of image used for colorization. Set to -1 to use full image size --denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range diff --git a/README_CN.md b/README_CN.md index 204582a52..e5a485519 100644 --- a/README_CN.md +++ b/README_CN.md @@ -130,7 +130,7 @@ THA: Thai --detector {default,ctd,craft,none} Text detector used for creating a text mask from an image, DO NOT use craft for manga, it's not designed for it ---ocr {48px,32px,48px_ctc} Optical character recognition (OCR) model to use +--ocr {32px,48px,48px_ctc} Optical character recognition (OCR) model to use --inpainter {default,lama_large,lama_mpe,sd,none,original} Inpainting model to use --upscaler {waifu2x,esrgan,4xultrasharp} Upscaler to use. --upscale-ratio has to be set for it @@ -163,10 +163,11 @@ THA: Thai --box-threshold BOX_THRESHOLD Threshold for bbox generation --text-threshold TEXT_THRESHOLD Threshold for text detection --min-text-length MIN_TEXT_LENGTH Minimum text length of a text region +--no-text-lang-skip Dont skip text that is seemingly already in the target + language. --inpainting-size INPAINTING_SIZE Size of image used for inpainting (too large will result in OOM) ---inpainting-precision INPAINTING_PRECISION Inpainting precision for lama, - use bf16 while you can. +--inpainting-precision {fp32,fp16,bf16} Inpainting precision for lama, use bf16 while you can. --colorization-size COLORIZATION_SIZE Size of image used for colorization. 
Set to -1 to use full image size --denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range diff --git a/manga_translator/args.py b/manga_translator/args.py index fe757ad6e..e7841218e 100644 --- a/manga_translator/args.py +++ b/manga_translator/args.py @@ -125,6 +125,7 @@ def _format_action_invocation(self, action: argparse.Action) -> str: parser.add_argument('--box-threshold', default=0.7, type=float, help='Threshold for bbox generation') parser.add_argument('--text-threshold', default=0.5, type=float, help='Threshold for text detection') parser.add_argument('--min-text-length', default=0, type=int, help='Minimum text length of a text region') +parser.add_argument('--no-text-lang-skip', action='store_true', help='Dont skip text that is seemingly already in the target language.') parser.add_argument('--inpainting-size', default=2048, type=int, help='Size of image used for inpainting (too large will result in OOM)') parser.add_argument('--inpainting-precision', default='fp32', type=str, help='Inpainting precision for lama, use bf16 while you can.', choices=['fp32', 'fp16', 'bf16']) parser.add_argument('--colorization-size', default=576, type=int, help='Size of image used for colorization. 
Set to -1 to use full image size') diff --git a/manga_translator/manga_translator.py b/manga_translator/manga_translator.py index 420a0f006..b81a9d1da 100644 --- a/manga_translator/manga_translator.py +++ b/manga_translator/manga_translator.py @@ -5,7 +5,7 @@ import cv2 from aiohttp.web_middlewares import middleware from omegaconf import OmegaConf -import py3langid as langid +import langcodes import requests import os import re @@ -469,15 +469,9 @@ async def _run_detection(self, ctx: Context): async def _run_ocr(self, ctx: Context): textlines = await dispatch_ocr(ctx.ocr, ctx.img_rgb, ctx.textlines, ctx, self.device, self.verbose) - # Filter out regions by original text new_textlines = [] for textline in textlines: - text = textline.text - if (ctx.filter_text and re.search(ctx.filter_text, text)) \ - or not is_valuable_text(text): - if text.strip(): - logger.info(f'Filtered out: {text}') - else: + if textline.text.strip(): if ctx.font_color_fg: textline.fg_r, textline.fg_g, textline.fg_b = ctx.font_color_fg if ctx.font_color_bg: @@ -488,12 +482,19 @@ async def _run_ocr(self, ctx: Context): async def _run_textline_merge(self, ctx: Context): text_regions = await dispatch_textline_merge(ctx.textlines, ctx.img_rgb.shape[1], ctx.img_rgb.shape[0], verbose=self.verbose) - text_regions = [region for region in text_regions if len(''.join(region.text)) >= ctx.min_text_length] - + new_text_regions = [] for region in text_regions: - if ctx.font_color_fg or ctx.font_color_bg: - if ctx.font_color_bg: - region.adjust_bg_color = False + if len(region.text) >= ctx.min_text_length \ + and not is_valuable_text(region.text) \ + or (not ctx.no_text_lang_skip and langcodes.tag_distance(region.source_lang, ctx.target_lang) == 0): + if region.text.strip(): + logger.info(f'Filtered out: {region.text}') + else: + if ctx.font_color_fg or ctx.font_color_bg: + if ctx.font_color_bg: + region.adjust_bg_color = False + new_text_regions.append(region) + text_regions = new_text_regions # Sort ctd 
(comic text detector) regions left to right. Otherwise right to left. # Sorting will improve text translation quality. @@ -501,10 +502,11 @@ async def _run_textline_merge(self, ctx: Context): return text_regions async def _run_text_translation(self, ctx: Context): - translated_sentences = await dispatch_translation(ctx.translator, - [region.get_text() for region in ctx.text_regions], - ctx.use_mtpe, - ctx, 'cpu' if self._cuda_limited_memory else self.device) + translated_sentences = \ + await dispatch_translation(ctx.translator, + [region.text for region in ctx.text_regions], + ctx.use_mtpe, + ctx, 'cpu' if self._cuda_limited_memory else self.device) for region, translation in zip(ctx.text_regions, translated_sentences): if ctx.uppercase: @@ -521,8 +523,8 @@ async def _run_text_translation(self, ctx: Context): for region in ctx.text_regions: # TODO: Maybe print reasons for filtering if not ctx.translator == 'none' and (region.translation.isnumeric() \ - or ctx.filter_text and re.search(ctx.filter_text, region.translation) - or not ctx.translator == 'original' and region.get_text().lower().strip() == region.translation.lower().strip()): + or ctx.filter_text and re.search(ctx.filter_text, region.translation) + or not ctx.translator == 'original' and region.text.lower().strip() == region.translation.lower().strip()): if region.translation.strip(): logger.info(f'Filtered out: {region.translation}') else: @@ -618,7 +620,7 @@ def identify_colors(fg_rgb: List[int]): s += f'\n-- {i + 1} --\n' s += f'color: #{color_id}: {color_name} (fg, bg: {rgb2hex(*fore)} {rgb2hex(*back)})\n' - s += f'text: {region.get_text()}\n' + s += f'text: {region.text}\n' s += f'trans: {region.translation}\n' for line in region.lines: s += f'coords: {list(line.ravel())}\n' @@ -743,7 +745,7 @@ async def _run_text_translation(self, ctx: Context): requests.post(f'http://{self.host}:{self.port}/request-manual-internal', json={ 'task_id': self._task_id, 'nonce': self.nonce, - 'texts': [r.get_text() for 
r in text_regions], + 'texts': [r.text for r in text_regions], 'translations': [r.translation for r in text_regions], }, timeout=20) @@ -1225,7 +1227,7 @@ def format_translate(self, ctx: Context, return_image: bool): trans = {key: value[i] for key, value in ctx['translations'].items()} else: trans = {} - trans["originalText"] = text_regions[i].get_text() + trans["originalText"] = text_regions[i].text if inpaint is not None: overlay = inpaint[minY:maxY, minX:maxX] @@ -1248,7 +1250,7 @@ def format_translate(self, ctx: Context, return_image: bool): 'fg': color1.tolist(), 'bg': color2.tolist() }, - 'language': langid.classify(text_regions[i].get_text())[0], + 'language': text_regions[i].source_lang, 'background': background }) if return_image and ctx.img_colorized is not None: diff --git a/manga_translator/rendering/__init__.py b/manga_translator/rendering/__init__.py index d088db334..111ef0dd4 100644 --- a/manga_translator/rendering/__init__.py +++ b/manga_translator/rendering/__init__.py @@ -41,7 +41,7 @@ def resize_regions_to_font_size(img: np.ndarray, text_regions: List[TextBlock], dst_points_list = [] for region in text_regions: - char_count_orig = len(region.get_text()) + char_count_orig = len(region.text) char_count_trans = len(region.translation.strip()) if char_count_trans > char_count_orig: # More characters were added, have to reduce fontsize to fit allotted area diff --git a/manga_translator/utils/generic.py b/manga_translator/utils/generic.py index be0253b20..a86c668bb 100644 --- a/manga_translator/utils/generic.py +++ b/manga_translator/utils/generic.py @@ -118,7 +118,7 @@ def is_punctuation(ch): def is_valuable_char(ch): # return re.search(r'[^\d\W]', ch) - return not is_punctuation(ch) and not is_control(ch) and not is_whitespace(ch) and not ch.isnumeric() + return not is_punctuation(ch) and not is_control(ch) and not is_whitespace(ch) and not ch.isdigit() def is_valuable_text(text): for ch in text: diff --git a/manga_translator/utils/textblock.py 
b/manga_translator/utils/textblock.py index 06227815d..3f5b10184 100644 --- a/manga_translator/utils/textblock.py +++ b/manga_translator/utils/textblock.py @@ -5,6 +5,7 @@ from functools import cached_property import copy import re +import py3langid as langid from .generic import color_difference, is_right_to_left_char, is_valuable_char # from ..detection.ctd_utils.utils.imgproc_utils import union_area, xywh2xyxypoly @@ -41,7 +42,7 @@ class TextBlock(object): Object that stores a block of text made up of textlines. """ def __init__(self, lines: List, - text: List[str] = None, + texts: List[str] = None, language: str = 'unknown', font_size: float = -1, angle: int = 0, @@ -60,6 +61,7 @@ def __init__(self, lines: List, _bounding_rect: List = None, default_stroke_width = 0.2, font_weight = 50, + source_lang: str = "", target_lang: str = "", opacity: float = 1., shadow_radius: float = 0., @@ -75,7 +77,8 @@ def __init__(self, lines: List, self.angle = angle self._direction = direction - self.text = text if text is not None else [] + self.texts = texts if texts is not None else [] + self.text = ' '.join(texts) self.prob = prob self.translation = translation @@ -92,6 +95,7 @@ def __init__(self, lines: List, self.line_spacing = line_spacing self.letter_spacing = letter_spacing self._alignment = alignment + self._source_lang = source_lang self.target_lang = target_lang self._bounding_rect = _bounding_rect @@ -235,10 +239,11 @@ def get_transformed_region(self, img: np.ndarray, line_idx: int, textheight: int region = cv2.resize(region, (maxwidth, h)) return region - def get_text(self): - if isinstance(self.text, str): - return self.text - return ' '.join(self.text).strip() + @property + def source_lang(self): + if not self._source_lang: + self._source_lang = langid.classify(self.text)[0] + return self._source_lang def get_translation_for_rendering(self): text = self.translation @@ -275,7 +280,7 @@ def is_bulleted_list(self): A determining factor of whether we should be 
sticking to the strict per textline text distribution when rendering. """ - if len(self.text) <= 1: + if len(self.texts) <= 1: return False bullet_regexes = [ @@ -284,7 +289,7 @@ def is_bulleted_list(self): r'[QA]:', # Q: ... A: ... ] bullet_type_idx = -1 - for line_text in self.text: + for line_text in self.texts: for i, breg in enumerate(bullet_regexes): if re.search(r'(?:[\n]|^)((?:' + breg + r')[\s]*)', line_text): if bullet_type_idx >= 0 and bullet_type_idx != i: From 05680656292636acb6a04b9b7822dba5e8c3048b Mon Sep 17 00:00:00 2001 From: thatDudo Date: Mon, 20 Nov 2023 12:21:00 +0100 Subject: [PATCH 4/4] Split chatgpt translations with \n if incorrect or repeat request --- manga_translator/translators/chatgpt.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/manga_translator/translators/chatgpt.py b/manga_translator/translators/chatgpt.py index ffca13491..417b16511 100644 --- a/manga_translator/translators/chatgpt.py +++ b/manga_translator/translators/chatgpt.py @@ -90,7 +90,7 @@ def temperature(self) -> float: def top_p(self) -> float: return self._config_get('top_p', default=1) - def _assemble_prompts(self, from_lang: str, to_lang: str, queries: List[str]) -> List[str]: + def _assemble_prompts(self, from_lang: str, to_lang: str, queries: List[str]): prompt = '' if self._INCLUDE_TEMPLATE: @@ -110,7 +110,7 @@ def _assemble_prompts(self, from_lang: str, to_lang: str, queries: List[str]) -> if self._MAX_TOKENS * 2 and len(''.join(queries[i+1:])) > self._MAX_TOKENS: if self._RETURN_PROMPT: prompt += '\n<|1|>' - yield prompt.lstrip() + yield prompt.lstrip(), i+1-i_offset prompt = self.prompt_template.format(to_lang=to_lang) # Restart counting at 1 i_offset = i + 1 @@ -118,7 +118,7 @@ def _assemble_prompts(self, from_lang: str, to_lang: str, queries: List[str]) -> if self._RETURN_PROMPT: prompt += '\n<|1|>' - yield prompt.lstrip() + yield prompt.lstrip(), len(queries)-i_offset def _format_prompt_log(self, to_lang: str, 
prompt: str) -> str: return prompt @@ -127,7 +127,7 @@ async def _translate(self, from_lang: str, to_lang: str, queries: List[str]) -> translations = [] self.logger.debug(f'Temperature: {self.temperature}, TopP: {self.top_p}') - for prompt in self._assemble_prompts(from_lang, to_lang, queries): + for prompt, query_size in self._assemble_prompts(from_lang, to_lang, queries): self.logger.debug('-- GPT Prompt --\n' + self._format_prompt_log(to_lang, prompt)) ratelimit_attempt = 0 @@ -165,10 +165,21 @@ async def _translate(self, from_lang: str, to_lang: str, queries: List[str]) -> await asyncio.sleep(1) self.logger.debug('-- GPT Response --\n' + response) + new_translations = re.split(r'<\|\d+\|>', response) # When there is only one query chatgpt likes to exclude the <|1|> if not new_translations[0].strip(): new_translations = new_translations[1:] + + if len(new_translations) <= 1 and query_size > 1: + # Try splitting by newlines instead + new_translations = re.split(r'\n', response) + + if len(new_translations) != query_size: + # super method will repeat translation as per self._INVALID_REPEAT_COUNT + translations = [] + break + translations.extend([t.strip() for t in new_translations]) self.logger.debug(translations)