Skip to content

Commit

Permalink
Merge branch 'zyddnys:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
My12123 authored Nov 20, 2023
2 parents d46c17b + 0568065 commit fedbccf
Show file tree
Hide file tree
Showing 10 changed files with 71 additions and 47 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ THA: Thai
--detector {default,ctd,craft,none} Text detector used for creating a text mask from an
image, DO NOT use craft for manga, it's not designed
for it
--ocr {48px,32px,48px_ctc} Optical character recognition (OCR) model to use
--ocr {32px,48px,48px_ctc} Optical character recognition (OCR) model to use
--inpainter {default,lama_large,lama_mpe,sd,none,original}
Inpainting model to use
--upscaler {waifu2x,esrgan,4xultrasharp} Upscaler to use. --upscale-ratio has to be set for it
Expand Down Expand Up @@ -431,10 +431,11 @@ THA: Thai
--box-threshold BOX_THRESHOLD Threshold for bbox generation
--text-threshold TEXT_THRESHOLD Threshold for text detection
--min-text-length MIN_TEXT_LENGTH Minimum text length of a text region
--no-text-lang-skip Don't skip text that is seemingly already in the target
language.
--inpainting-size INPAINTING_SIZE Size of image used for inpainting (too large will
result in OOM)
--inpainting-precision INPAINTING_PRECISION Inpainting precision for lama,
use bf16 while you can.
--inpainting-precision {fp32,fp16,bf16} Inpainting precision for lama, use bf16 while you can.
--colorization-size COLORIZATION_SIZE Size of image used for colorization. Set to -1 to use
full image size
--denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
Expand Down
7 changes: 4 additions & 3 deletions README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ THA: Thai
--detector {default,ctd,craft,none} Text detector used for creating a text mask from an
image, DO NOT use craft for manga, it's not designed
for it
--ocr {48px,32px,48px_ctc} Optical character recognition (OCR) model to use
--ocr {32px,48px,48px_ctc} Optical character recognition (OCR) model to use
--inpainter {default,lama_large,lama_mpe,sd,none,original}
Inpainting model to use
--upscaler {waifu2x,esrgan,4xultrasharp} Upscaler to use. --upscale-ratio has to be set for it
Expand Down Expand Up @@ -163,10 +163,11 @@ THA: Thai
--box-threshold BOX_THRESHOLD Threshold for bbox generation
--text-threshold TEXT_THRESHOLD Threshold for text detection
--min-text-length MIN_TEXT_LENGTH Minimum text length of a text region
--no-text-lang-skip Don't skip text that is seemingly already in the target
language.
--inpainting-size INPAINTING_SIZE Size of image used for inpainting (too large will
result in OOM)
--inpainting-precision INPAINTING_PRECISION Inpainting precision for lama,
use bf16 while you can.
--inpainting-precision {fp32,fp16,bf16} Inpainting precision for lama, use bf16 while you can.
--colorization-size COLORIZATION_SIZE Size of image used for colorization. Set to -1 to use
full image size
--denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
Expand Down
1 change: 1 addition & 0 deletions manga_translator/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
parser.add_argument('--box-threshold', default=0.7, type=float, help='Threshold for bbox generation')
parser.add_argument('--text-threshold', default=0.5, type=float, help='Threshold for text detection')
parser.add_argument('--min-text-length', default=0, type=int, help='Minimum text length of a text region')
parser.add_argument('--no-text-lang-skip', action='store_true', help='Dont skip text that is seemingly already in the target language.')
parser.add_argument('--inpainting-size', default=2048, type=int, help='Size of image used for inpainting (too large will result in OOM)')
parser.add_argument('--inpainting-precision', default='fp32', type=str, help='Inpainting precision for lama, use bf16 while you can.', choices=['fp32', 'fp16', 'bf16'])
parser.add_argument('--colorization-size', default=576, type=int, help='Size of image used for colorization. Set to -1 to use full image size')
Expand Down
48 changes: 25 additions & 23 deletions manga_translator/manga_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import cv2
from aiohttp.web_middlewares import middleware
from omegaconf import OmegaConf
import py3langid as langid
import langcodes
import requests
import os
import re
Expand Down Expand Up @@ -469,15 +469,9 @@ async def _run_detection(self, ctx: Context):
async def _run_ocr(self, ctx: Context):
textlines = await dispatch_ocr(ctx.ocr, ctx.img_rgb, ctx.textlines, ctx, self.device, self.verbose)

# Filter out regions by original text
new_textlines = []
for textline in textlines:
text = textline.text
if (ctx.filter_text and re.search(ctx.filter_text, text)) \
or not is_valuable_text(text):
if text.strip():
logger.info(f'Filtered out: {text}')
else:
if textline.text.strip():
if ctx.font_color_fg:
textline.fg_r, textline.fg_g, textline.fg_b = ctx.font_color_fg
if ctx.font_color_bg:
Expand All @@ -488,23 +482,31 @@ async def _run_ocr(self, ctx: Context):
async def _run_textline_merge(self, ctx: Context):
    """Merge detected textlines into text regions, drop unwanted ones, and sort the rest.

    A merged region is filtered out (and logged when its text is non-blank)
    if any of these holds:

    * its text is shorter than ``ctx.min_text_length``;
    * ``is_valuable_text`` rejects its text;
    * ``ctx.no_text_lang_skip`` is not set and ``langcodes.tag_distance``
      between ``region.source_lang`` and ``ctx.target_lang`` is 0.

    For every kept region, background-color adjustment is switched off when
    ``ctx.font_color_bg`` is set.

    Returns:
        The kept regions in the order produced by ``sort_regions``.
    """
    text_regions = await dispatch_textline_merge(ctx.textlines, ctx.img_rgb.shape[1], ctx.img_rgb.shape[0],
                                                 verbose=self.verbose)

    new_text_regions = []
    for region in text_regions:
        # Each test below is a reason to drop the region. The length test has
        # to be "shorter than the minimum": the earlier
        # `len >= min and not valuable` form let too-short regions through
        # whenever a minimum text length was configured.
        # The language test stays last so `region.source_lang` is only read
        # for regions that passed the cheaper checks.
        if len(region.text) < ctx.min_text_length \
                or not is_valuable_text(region.text) \
                or (not ctx.no_text_lang_skip
                    and langcodes.tag_distance(region.source_lang, ctx.target_lang) == 0):
            if region.text.strip():
                logger.info(f'Filtered out: {region.text}')
            continue

        # A user-supplied background color must not be re-adjusted later.
        if ctx.font_color_bg:
            region.adjust_bg_color = False
        new_text_regions.append(region)

    # Sort ctd (comic text detector) regions left to right. Otherwise right to left.
    # Sorting will improve text translation quality.
    return sort_regions(new_text_regions, right_to_left=ctx.detector != 'ctd')

async def _run_text_translation(self, ctx: Context):
translated_sentences = await dispatch_translation(ctx.translator,
[region.get_text() for region in ctx.text_regions],
ctx.use_mtpe,
ctx, 'cpu' if self._cuda_limited_memory else self.device)
translated_sentences = \
await dispatch_translation(ctx.translator,
[region.text for region in ctx.text_regions],
ctx.use_mtpe,
ctx, 'cpu' if self._cuda_limited_memory else self.device)

for region, translation in zip(ctx.text_regions, translated_sentences):
if ctx.uppercase:
Expand All @@ -521,8 +523,8 @@ async def _run_text_translation(self, ctx: Context):
for region in ctx.text_regions:
# TODO: Maybe print reasons for filtering
if not ctx.translator == 'none' and (region.translation.isnumeric() \
or ctx.filter_text and re.search(ctx.filter_text, region.translation)
or not ctx.translator == 'original' and region.get_text().lower().strip() == region.translation.lower().strip()):
or ctx.filter_text and re.search(ctx.filter_text, region.translation)
or not ctx.translator == 'original' and region.text.lower().strip() == region.translation.lower().strip()):
if region.translation.strip():
logger.info(f'Filtered out: {region.translation}')
else:
Expand Down Expand Up @@ -618,7 +620,7 @@ def identify_colors(fg_rgb: List[int]):

s += f'\n-- {i + 1} --\n'
s += f'color: #{color_id}: {color_name} (fg, bg: {rgb2hex(*fore)} {rgb2hex(*back)})\n'
s += f'text: {region.get_text()}\n'
s += f'text: {region.text}\n'
s += f'trans: {region.translation}\n'
for line in region.lines:
s += f'coords: {list(line.ravel())}\n'
Expand Down Expand Up @@ -743,7 +745,7 @@ async def _run_text_translation(self, ctx: Context):
requests.post(f'http://{self.host}:{self.port}/request-manual-internal', json={
'task_id': self._task_id,
'nonce': self.nonce,
'texts': [r.get_text() for r in text_regions],
'texts': [r.text for r in text_regions],
'translations': [r.translation for r in text_regions],
}, timeout=20)

Expand Down Expand Up @@ -1225,7 +1227,7 @@ def format_translate(self, ctx: Context, return_image: bool):
trans = {key: value[i] for key, value in ctx['translations'].items()}
else:
trans = {}
trans["originalText"] = text_regions[i].get_text()
trans["originalText"] = text_regions[i].text
if inpaint is not None:
overlay = inpaint[minY:maxY, minX:maxX]

Expand All @@ -1248,7 +1250,7 @@ def format_translate(self, ctx: Context, return_image: bool):
'fg': color1.tolist(),
'bg': color2.tolist()
},
'language': langid.classify(text_regions[i].get_text())[0],
'language': text_regions[i].source_lang,
'background': background
})
if return_image and ctx.img_colorized is not None:
Expand Down
2 changes: 1 addition & 1 deletion manga_translator/rendering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def resize_regions_to_font_size(img: np.ndarray, text_regions: List[TextBlock],

dst_points_list = []
for region in text_regions:
char_count_orig = len(region.get_text())
char_count_orig = len(region.text)
char_count_trans = len(region.translation.strip())
if char_count_trans > char_count_orig:
# More characters were added, have to reduce fontsize to fit allotted area
Expand Down
26 changes: 20 additions & 6 deletions manga_translator/translators/chatgpt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import re
import openai
import openai.error
try:
import openai
import openai.error
except ImportError:
openai = None
import asyncio
import time
from typing import List, Dict
Expand Down Expand Up @@ -87,7 +90,7 @@ def temperature(self) -> float:
def top_p(self) -> float:
return self._config_get('top_p', default=1)

def _assemble_prompts(self, from_lang: str, to_lang: str, queries: List[str]) -> List[str]:
def _assemble_prompts(self, from_lang: str, to_lang: str, queries: List[str]):
prompt = ''

if self._INCLUDE_TEMPLATE:
Expand All @@ -107,15 +110,15 @@ def _assemble_prompts(self, from_lang: str, to_lang: str, queries: List[str]) ->
if self._MAX_TOKENS * 2 and len(''.join(queries[i+1:])) > self._MAX_TOKENS:
if self._RETURN_PROMPT:
prompt += '\n<|1|>'
yield prompt.lstrip()
yield prompt.lstrip(), i+1-i_offset
prompt = self.prompt_template.format(to_lang=to_lang)
# Restart counting at 1
i_offset = i + 1

if self._RETURN_PROMPT:
prompt += '\n<|1|>'

yield prompt.lstrip()
yield prompt.lstrip(), len(queries)-i_offset

def _format_prompt_log(self, to_lang: str, prompt: str) -> str:
return prompt
Expand All @@ -124,7 +127,7 @@ async def _translate(self, from_lang: str, to_lang: str, queries: List[str]) ->
translations = []
self.logger.debug(f'Temperature: {self.temperature}, TopP: {self.top_p}')

for prompt in self._assemble_prompts(from_lang, to_lang, queries):
for prompt, query_size in self._assemble_prompts(from_lang, to_lang, queries):
self.logger.debug('-- GPT Prompt --\n' + self._format_prompt_log(to_lang, prompt))

ratelimit_attempt = 0
Expand Down Expand Up @@ -162,10 +165,21 @@ async def _translate(self, from_lang: str, to_lang: str, queries: List[str]) ->
await asyncio.sleep(1)

self.logger.debug('-- GPT Response --\n' + response)

new_translations = re.split(r'<\|\d+\|>', response)
# When there is only one query chatgpt likes to exclude the <|1|>
if not new_translations[0].strip():
new_translations = new_translations[1:]

if len(new_translations) <= 1 and query_size > 1:
# Try splitting by newlines instead
new_translations = re.split(r'\n', response)

if len(new_translations) != query_size:
# super method will repeat translation as per self._INVALID_REPEAT_COUNT
translations = []
break

translations.extend([t.strip() for t in new_translations])

self.logger.debug(translations)
Expand Down
2 changes: 1 addition & 1 deletion manga_translator/upscaling/waifu2x.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
# https://github.com/nihui/waifu2x-ncnn-vulkan
class Waifu2xUpscaler(OfflineUpscaler): # ~2GB of vram
_MODEL_MAPPING = model_mapping
_VALID_UPSCALE_RATIOS = [1, 2, 4, 8, 16, 32]
_VALID_UPSCALE_RATIOS = [2, 4, 8, 16, 32]

def __init__(self, *args, **kwargs):
os.makedirs(self.model_dir, exist_ok=True)
Expand Down
2 changes: 1 addition & 1 deletion manga_translator/utils/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def is_punctuation(ch):

def is_valuable_char(ch):
# return re.search(r'[^\d\W]', ch)
return not is_punctuation(ch) and not is_control(ch) and not is_whitespace(ch) and not ch.isnumeric()
return not is_punctuation(ch) and not is_control(ch) and not is_whitespace(ch) and not ch.isdigit()

def is_valuable_text(text):
for ch in text:
Expand Down
21 changes: 13 additions & 8 deletions manga_translator/utils/textblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from functools import cached_property
import copy
import re
import py3langid as langid

from .generic import color_difference, is_right_to_left_char, is_valuable_char
# from ..detection.ctd_utils.utils.imgproc_utils import union_area, xywh2xyxypoly
Expand Down Expand Up @@ -41,7 +42,7 @@ class TextBlock(object):
Object that stores a block of text made up of textlines.
"""
def __init__(self, lines: List,
text: List[str] = None,
texts: List[str] = None,
language: str = 'unknown',
font_size: float = -1,
angle: int = 0,
Expand All @@ -60,6 +61,7 @@ def __init__(self, lines: List,
_bounding_rect: List = None,
default_stroke_width = 0.2,
font_weight = 50,
source_lang: str = "",
target_lang: str = "",
opacity: float = 1.,
shadow_radius: float = 0.,
Expand All @@ -75,7 +77,8 @@ def __init__(self, lines: List,
self.angle = angle
self._direction = direction

self.text = text if text is not None else []
self.texts = texts if texts is not None else []
self.text = ' '.join(texts)
self.prob = prob

self.translation = translation
Expand All @@ -92,6 +95,7 @@ def __init__(self, lines: List,
self.line_spacing = line_spacing
self.letter_spacing = letter_spacing
self._alignment = alignment
self._source_lang = source_lang
self.target_lang = target_lang

self._bounding_rect = _bounding_rect
Expand Down Expand Up @@ -235,10 +239,11 @@ def get_transformed_region(self, img: np.ndarray, line_idx: int, textheight: int
region = cv2.resize(region, (maxwidth, h))
return region

def get_text(self):
if isinstance(self.text, str):
return self.text
return ' '.join(self.text).strip()
@property
def source_lang(self):
    """Return the source language of this block, detecting it on first use.

    When ``_source_lang`` already holds a truthy value it is returned as is.
    Otherwise ``self.text`` is passed to ``langid.classify``, the first
    element of the result is stored in ``_source_lang``, and that stored
    value is returned.
    """
    known = self._source_lang
    if known:
        return known
    self._source_lang = langid.classify(self.text)[0]
    return self._source_lang

def get_translation_for_rendering(self):
text = self.translation
Expand Down Expand Up @@ -275,7 +280,7 @@ def is_bulleted_list(self):
A determining factor of whether we should be sticking to the strict per textline
text distribution when rendering.
"""
if len(self.text) <= 1:
if len(self.texts) <= 1:
return False

bullet_regexes = [
Expand All @@ -284,7 +289,7 @@ def is_bulleted_list(self):
r'[QA]:', # Q: ... A: ...
]
bullet_type_idx = -1
for line_text in self.text:
for line_text in self.texts:
for i, breg in enumerate(bullet_regexes):
if re.search(r'(?:[\n]|^)((?:' + breg + r')[\s]*)', line_text):
if bullet_type_idx >= 0 and bullet_type_idx != i:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ websockets
protobuf
ctranslate2
colorama
openai
openai==0.28
open_clip_torch
safetensors
pandas
Expand Down

0 comments on commit fedbccf

Please sign in to comment.