
Commit

Merge branch 'main' of github.com:zyddnys/manga-image-translator
zyddnys committed Nov 28, 2023
2 parents c09a8e8 + be068d1 commit 5e2c7aa
Showing 35 changed files with 80 additions and 80 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -413,7 +413,7 @@ THA: Thai
 --selective-translation SELECTIVE_TRANSLATION
 Select a translator based on detected language in
 image. Note the first translation service acts as
-default if the language isnt defined. Example:
+default if the language isn't defined. Example:
 --translator-chain "google:JPN;sugoi:ENG".
 --revert-upscaling Downscales the previously upscaled image after
 translation back to original size (Use with --upscale-
@@ -589,7 +589,7 @@ A list of what needs to be done next, you're welcome to contribute.
 This works for images without speech bubbles, but making it impossible to decide where to put translated English text. I have no idea how to solve this.
 4. [Ryota et al.](https://arxiv.org/abs/2012.14271) proposed using multimodal machine translation, maybe we can add ViT features for building custom NMT models.
 5. Make this project works for video(rewrite code in C++ and use GPU/other hardware NN accelerator).\
-Used for detecting hard subtitles in videos, generting ass file and remove them completetly.
+Used for detecting hard subtitles in videos, generating ass file and remove them completely.
 6. ~~Mask refinement based using non deep learning algorithms, I am currently testing out CRF based algorithm.~~
 7. ~~Angled text region merge is not currently supported~~
 8. Create pip repository
2 changes: 1 addition & 1 deletion README_CN.md
@@ -145,7 +145,7 @@ THA: Thai
 --selective-translation SELECTIVE_TRANSLATION
 Select a translator based on detected language in
 image. Note the first translation service acts as
-default if the language isnt defined. Example:
+default if the language isn't defined. Example:
 --translator-chain "google:JPN;sugoi:ENG".
 --revert-upscaling Downscales the previously upscaled image after
 translation back to original size (Use with --upscale-
4 changes: 2 additions & 2 deletions manga_translator/args.py
@@ -42,7 +42,7 @@ def dir_path(string):
 return s

 # def choice_chain(choices):
-# """Argument type for string chains from choices seperated by ':'. Example: 'choice1:choice2:choice3'"""
+# """Argument type for string chains from choices separated by ':'. Example: 'choice1:choice2:choice3'"""
 # def _func(string):
 # if choices is not None:
 # for s in string.split(':') or ['']:
@@ -113,7 +113,7 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
 g = parser.add_mutually_exclusive_group()
 g.add_argument('--translator', default='google', type=str, choices=TRANSLATORS, help='Language translator to use')
 g.add_argument('--translator-chain', default=None, type=translator_chain, help='Output of one translator goes in another. Example: --translator-chain "google:JPN;sugoi:ENG".')
-g.add_argument('--selective-translation', default=None, type=translator_chain, help='Select a translator based on detected language in image. Note the first translation service acts as default if the language isnt defined. Example: --translator-chain "google:JPN;sugoi:ENG".')
+g.add_argument('--selective-translation', default=None, type=translator_chain, help='Select a translator based on detected language in image. Note the first translation service acts as default if the language isn\'t defined. Example: --translator-chain "google:JPN;sugoi:ENG".')

 parser.add_argument('--revert-upscaling', action='store_true', help='Downscales the previously upscaled image after translation back to original size (Use with --upscale-ratio).')
 parser.add_argument('--detection-size', default=1536, type=int, help='Size of image used for detection')
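Editor's note: the three flags above sit in an argparse mutually exclusive group, so passing any two of them together is rejected at parse time. A minimal runnable sketch of the same pattern (standalone; plain str stands in for the repo's translator_chain type):

    import argparse

    parser = argparse.ArgumentParser()
    g = parser.add_mutually_exclusive_group()
    g.add_argument('--translator', default='google', type=str)
    g.add_argument('--translator-chain', default=None, type=str)       # e.g. "google:JPN;sugoi:ENG"
    g.add_argument('--selective-translation', default=None, type=str)  # per-detected-language choice

    args = parser.parse_args(['--translator-chain', 'google:JPN;sugoi:ENG'])  # accepted
    # parser.parse_args(['--translator', 'google', '--translator-chain', 'x:y'])  # error: mutually exclusive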
6 changes: 3 additions & 3 deletions manga_translator/detection/common.py
@@ -28,8 +28,8 @@ async def detect(self, image: np.ndarray, detect_size: int, text_threshold: floa
 self.logger.debug('Adding border')
 image = self._add_border(image, minimum_image_size)
 if invert:
-self.logger.debug('Adding invertion')
-image = self._add_invertion(image)
+self.logger.debug('Adding inversion')
+image = self._add_inversion(image)
 if gamma_correct:
 self.logger.debug('Adding gamma correction')
 image = self._add_gamma_correction(image)
@@ -112,7 +112,7 @@ def _remove_rotation(self, textlines, raw_mask, mask, img_w, img_h):
 textlines[i] = Quadrilateral(rotated_pts, txtln.text, txtln.prob)
 return textlines, raw_mask, mask

-def _add_invertion(self, image: np.ndarray):
+def _add_inversion(self, image: np.ndarray):
 return cv2.bitwise_not(image)

 def _add_gamma_correction(self, image: np.ndarray):
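Editor's note: the renamed helper simply inverts pixel intensities (cv2.bitwise_not computes 255 - v for uint8), which lets the detector handle light-on-dark text. A one-line demonstration:

    import numpy as np
    import cv2

    img = np.array([[0, 128, 255]], dtype=np.uint8)
    print(cv2.bitwise_not(img))  # [[255 127   0]]; each uint8 value v becomes 255 - v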
8 changes: 4 additions & 4 deletions manga_translator/detection/ctd_utils/textmask.py
@@ -98,9 +98,9 @@ def merge_mask_list(mask_list, pred_mask, blk: Quadrilateral = None, pred_thresh
 continue
 x1, y1, x2, y2 = x, y, x+w, y+h
 label_local = labels[y1: y2, x1: x2]
-label_cordinates = np.where(label_local==label_index)
+label_coordinates = np.where(label_local==label_index)
 tmp_merged = np.zeros_like(label_local, np.uint8)
-tmp_merged[label_cordinates] = 255
+tmp_merged[label_coordinates] = 255
 tmp_merged = cv2.bitwise_or(mask_merged[y1: y2, x1: x2], tmp_merged)
 xor_merged = cv2.bitwise_xor(tmp_merged, pred_mask[y1: y2, x1: x2]).sum()
 xor_origin = cv2.bitwise_xor(mask_merged[y1: y2, x1: x2], pred_mask[y1: y2, x1: x2]).sum()
@@ -121,9 +121,9 @@ def merge_mask_list(mask_list, pred_mask, blk: Quadrilateral = None, pred_thresh
 if area < area_thresh:
 x1, y1, x2, y2 = x, y, x+w, y+h
 label_local = labels[y1: y2, x1: x2]
-label_cordinates = np.where(label_local==label_index)
+label_coordinates = np.where(label_local==label_index)
 tmp_merged = np.zeros_like(label_local, np.uint8)
-tmp_merged[label_cordinates] = 255
+tmp_merged[label_coordinates] = 255
 tmp_merged = cv2.bitwise_or(mask_merged[y1: y2, x1: x2], tmp_merged)
 xor_merged = cv2.bitwise_xor(tmp_merged, pred_mask[y1: y2, x1: x2]).sum()
 xor_origin = cv2.bitwise_xor(mask_merged[y1: y2, x1: x2], pred_mask[y1: y2, x1: x2]).sum()
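Editor's note: around these lines, merge_mask_list accepts a connected component only if OR-ing it into the running mask reduces the XOR disagreement with the network's predicted mask. A self-contained sketch of that acceptance test (hypothetical function name, same idea):

    import numpy as np
    import cv2

    def try_merge_component(mask_merged, component, pred_mask):
        # Tentatively OR the component into the accumulated mask...
        tmp_merged = cv2.bitwise_or(mask_merged, component)
        # ...and keep it only if disagreement with the prediction shrinks.
        xor_merged = cv2.bitwise_xor(tmp_merged, pred_mask).sum()
        xor_origin = cv2.bitwise_xor(mask_merged, pred_mask).sum()
        return tmp_merged if xor_merged < xor_origin else mask_merged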
18 changes: 9 additions & 9 deletions manga_translator/detection/ctd_utils/utils/db_utils.py
@@ -49,7 +49,7 @@ def __call__(self, batch, pred, is_output_polygon=False, height=None, width=None
 pred:
 binary: text region segmentation map, with shape (N, H, W)
 thresh: [if exists] thresh hold prediction with shape (N, H, W)
-thresh_binary: [if exists] binarized with threshhold, (N, H, W)
+thresh_binary: [if exists] binarized with threshold, (N, H, W)
 '''
 pred = pred[:, 0, :, :]
 segmentation = self.binarize(pred)
@@ -522,12 +522,12 @@ def shrink_polygon_pyclipper(polygon, shrink_ratio):
 subject = [tuple(l) for l in polygon]
 padding = pyclipper.PyclipperOffset()
 padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
-shrinked = padding.Execute(-distance)
-if shrinked == []:
-shrinked = np.array(shrinked)
+shrunk = padding.Execute(-distance)
+if shrunk == []:
+shrunk = np.array(shrunk)
 else:
-shrinked = np.array(shrinked[0]).reshape(-1, 2)
-return shrinked
+shrunk = np.array(shrunk[0]).reshape(-1, 2)
+return shrunk

 class MakeShrinkMap():
 r'''
@@ -563,12 +563,12 @@ def __call__(self, data: dict) -> dict:
 cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
 ignore_tags[i] = True
 else:
-shrinked = self.shrink_func(polygon, self.shrink_ratio)
-if shrinked.size == 0:
+shrunk = self.shrink_func(polygon, self.shrink_ratio)
+if shrunk.size == 0:
 cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
 ignore_tags[i] = True
 continue
-cv2.fillPoly(gt, [shrinked.astype(np.int32)], 1)
+cv2.fillPoly(gt, [shrunk.astype(np.int32)], 1)

 data['shrink_map'] = gt
 data['shrink_mask'] = mask
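Editor's note: shrink_polygon_pyclipper offsets a polygon inward, the trick DBNet-style detectors use to build shrink maps for training. A minimal runnable sketch of the same pyclipper calls (fixed offset distance; the real code derives it from the polygon's area, perimeter, and shrink_ratio):

    import numpy as np
    import pyclipper

    polygon = [(0, 0), (100, 0), (100, 100), (0, 100)]
    padding = pyclipper.PyclipperOffset()
    padding.AddPath(polygon, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    shrunk = padding.Execute(-10)              # negative distance offsets inward
    print(np.array(shrunk[0]).reshape(-1, 2))  # approximately the square (10,10)..(90,90)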
6 changes: 3 additions & 3 deletions manga_translator/detection/default_utils/craft_utils.py
@@ -8,12 +8,12 @@
 import cv2
 import math

-""" auxilary functions """
+""" auxiliary functions """
 # unwarp corodinates
 def warpCoord(Minv, pt):
 out = np.matmul(Minv, (pt[0], pt[1], 1))
 return np.array([out[0]/out[2], out[1]/out[2]])
-""" end of auxilary functions """
+""" end of auxiliary functions """


 def getDetBoxes_core(textmap, linkmap, text_threshold, link_threshold, low_text):
@@ -168,7 +168,7 @@ def getPoly_core(boxes, labels, mapper, linkmap):
 # calc median maximum of pivot points
 half_char_h = np.median(seg_height) * expand_ratio / 2

-# calc gradiant and apply to make horizontal pivots
+# calc gradient and apply to make horizontal pivots
 new_pp = []
 for i, (x, cy) in enumerate(pp):
 dx = cp_section[i * 2 + 2][0] - cp_section[i * 2][0]
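Editor's note: warpCoord undoes a perspective warp for a single point by multiplying with the inverse matrix in homogeneous coordinates, then dividing by the last component. A standalone check with assumed toy corner points:

    import numpy as np
    import cv2

    src = np.float32([[0, 0], [100, 0], [100, 100], [0, 100]])
    dst = np.float32([[10, 5], [110, 0], [105, 95], [0, 100]])
    Minv = np.linalg.inv(cv2.getPerspectiveTransform(src, dst))

    def warpCoord(Minv, pt):
        out = np.matmul(Minv, (pt[0], pt[1], 1))
        return np.array([out[0] / out[2], out[1] / out[2]])

    print(warpCoord(Minv, (10, 5)))  # approximately [0, 0], the original corner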
2 changes: 1 addition & 1 deletion manga_translator/detection/default_utils/dbnet_utils.py
@@ -25,7 +25,7 @@ def __call__(self, batch, pred, is_output_polygon=False):
 pred:
 binary: text region segmentation map, with shape (N, H, W)
 thresh: [if exists] thresh hold prediction with shape (N, H, W)
-thresh_binary: [if exists] binarized with threshhold, (N, H, W)
+thresh_binary: [if exists] binarized with threshold, (N, H, W)
 '''
 pred = pred[:, 0, :, :]
 segmentation = self.binarize(pred)
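Editor's note: for reference, binarize in this DBNet-style post-processing is conventionally a fixed threshold on the probability map; a hedged sketch (threshold value assumed):

    import numpy as np

    def binarize(pred: np.ndarray, thresh: float = 0.3) -> np.ndarray:
        # (N, H, W) probability map to boolean segmentation map
        return pred > thresh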
2 changes: 1 addition & 1 deletion manga_translator/inpainting/inpainting_attn.py
@@ -138,7 +138,7 @@ class GlobalAttention(nn.Module):

 def __init__(self, in_dim):
 super(GlobalAttention, self).__init__()
-self.chanel_in = in_dim
+self.channel_in = in_dim

 self.query_conv = ScaledWSConv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)
 self.key_conv = ScaledWSConv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)
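Editor's note: GlobalAttention's 1x1 convolutions produce per-pixel query and key maps for self-attention over all spatial positions. A minimal sketch of that pattern, with plain nn.Conv2d standing in for the repo's ScaledWSConv2d:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class TinyGlobalAttention(nn.Module):
        def __init__(self, in_dim: int):
            super().__init__()
            self.query_conv = nn.Conv2d(in_dim, in_dim, kernel_size=1)
            self.key_conv = nn.Conv2d(in_dim, in_dim, kernel_size=1)
            self.value_conv = nn.Conv2d(in_dim, in_dim, kernel_size=1)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            n, c, h, w = x.shape
            q = self.query_conv(x).flatten(2).transpose(1, 2)  # (N, HW, C)
            k = self.key_conv(x).flatten(2)                    # (N, C, HW)
            v = self.value_conv(x).flatten(2).transpose(1, 2)  # (N, HW, C)
            attn = F.softmax(q @ k / c ** 0.5, dim=-1)         # attention over positions
            return (attn @ v).transpose(1, 2).reshape(n, c, h, w)

    out = TinyGlobalAttention(8)(torch.randn(1, 8, 16, 16))    # same shape as the input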
6 changes: 3 additions & 3 deletions manga_translator/inpainting/inpainting_lama.py
@@ -33,7 +33,7 @@ async def _load(self, device: str):
 self.model = self.model.cuda()


-class DepthWiseSeperableConv(nn.Module):
+class DepthWiseSeparableConv(nn.Module):
 def __init__(self, in_dim, out_dim, *args, **kwargs):
 super().__init__()
 if 'groups' in kwargs:
@@ -91,7 +91,7 @@ def __init__(self, in_dim, out_dim, kernel_size, dilation_num=3, comb_mode='sum'
 self.cat_in = False
 self.in_dims = [in_dim] * dilation_num

-conv_type = DepthWiseSeperableConv if use_depthwise else nn.Conv2d
+conv_type = DepthWiseSeparableConv if use_depthwise else nn.Conv2d
 dilation = min_dilation
 for i in range(dilation_num):
 if isinstance(padding, int):
@@ -158,7 +158,7 @@ def get_conv_block_ctor(kind='default'):
 if kind == 'default':
 return nn.Conv2d
 if kind == 'depthwise':
-return DepthWiseSeperableConv
+return DepthWiseSeparableConv
 if kind == 'multidilated':
 return MultidilatedConv
 raise ValueError(f'Unknown convolutional block kind {kind}')
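Editor's note: a depthwise separable convolution splits a standard convolution into a per-channel (depthwise, groups=in_dim) pass followed by a 1x1 pointwise mix, which is the idea behind the renamed class. A minimal sketch:

    import torch
    import torch.nn as nn

    class DepthWiseSeparableConv(nn.Module):
        def __init__(self, in_dim: int, out_dim: int, kernel_size: int = 3, padding: int = 1):
            super().__init__()
            # one filter per input channel (depthwise)
            self.depthwise = nn.Conv2d(in_dim, in_dim, kernel_size, padding=padding, groups=in_dim)
            # 1x1 convolution mixes channels (pointwise)
            self.pointwise = nn.Conv2d(in_dim, out_dim, kernel_size=1)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.pointwise(self.depthwise(x))

    y = DepthWiseSeparableConv(8, 16)(torch.randn(1, 8, 32, 32))  # -> (1, 16, 32, 32)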
2 changes: 1 addition & 1 deletion manga_translator/inpainting/ldm/models/diffusion/plms.py
@@ -233,7 +233,7 @@ def get_x_prev_and_pred_x0(e_t, index):
 # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
 e_t_prime = (3 * e_t - old_eps[-1]) / 2
 elif len(old_eps) == 2:
-# 3nd order Pseudo Linear Multistep (Adams-Bashforth)
+# 3rd order Pseudo Linear Multistep (Adams-Bashforth)
 e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
 elif len(old_eps) >= 3:
 # 4nd order Pseudo Linear Multistep (Adams-Bashforth)
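Editor's note: these branches apply standard Adams-Bashforth extrapolation weights to the sampler's noise estimates; the 4th-order case, truncated above, conventionally uses the (55, -59, 37, -9)/24 coefficients. A hedged sketch of the ladder (the real sampler's first step uses a separate 2nd-order kickoff not shown here):

    def plms_e_t_prime(e_t, old_eps):
        if len(old_eps) == 1:   # 2nd order
            return (3 * e_t - old_eps[-1]) / 2
        if len(old_eps) == 2:   # 3rd order
            return (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
        # 4th order (assumed standard Adams-Bashforth coefficients)
        return (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24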
6 changes: 3 additions & 3 deletions manga_translator/inpainting/ldm/modules/attention.py
@@ -12,9 +12,9 @@
 try:
 import xformers
 import xformers.ops
-XFORMERS_IS_AVAILBLE = False
+XFORMERS_IS_AVAILABLE = False
 except ImportError:
-XFORMERS_IS_AVAILBLE = False
+XFORMERS_IS_AVAILABLE = False

 # CrossAttn precision handling
 import os
@@ -251,7 +251,7 @@ class BasicTransformerBlock(nn.Module):
 def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
 disable_self_attn=False):
 super().__init__()
-attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax"
+attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILABLE else "softmax"
 assert attn_mode in self.ATTENTION_MODES
 attn_cls = self.ATTENTION_MODES[attn_mode]
 self.disable_self_attn = disable_self_attn
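Editor's note: this fork pins the flag to False in both branches, keeping the xformers path disabled even when the import succeeds. For comparison, the usual shape of the optional-dependency idiom:

    try:
        import xformers
        import xformers.ops
        XFORMERS_IS_AVAILABLE = True   # upstream ldm enables it here; this fork forces False
    except ImportError:
        XFORMERS_IS_AVAILABLE = False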
@@ -11,9 +11,9 @@
 try:
 import xformers
 import xformers.ops
-XFORMERS_IS_AVAILBLE = False
+XFORMERS_IS_AVAILABLE = False
 except:
-XFORMERS_IS_AVAILBLE = False
+XFORMERS_IS_AVAILABLE = False
 print("No module 'xformers'. Proceeding without it.")

@@ -279,7 +279,7 @@ def forward(self, x, context=None, mask=None):

 def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
 assert attn_type in ["vanilla", "vanilla-xformers", "memory-efficient-cross-attn", "linear", "none"], f'attn_type {attn_type} unknown'
-if XFORMERS_IS_AVAILBLE and attn_type == "vanilla":
+if XFORMERS_IS_AVAILABLE and attn_type == "vanilla":
 attn_type = "vanilla-xformers"
 print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
 if attn_type == "vanilla":
@@ -345,7 +345,7 @@ def count_flops_attn(model, _x, y):

 class QKVAttentionLegacy(nn.Module):
 """
-A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
+A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
 """

 def __init__(self, n_heads):
4 changes: 2 additions & 2 deletions manga_translator/inpainting/ldm/modules/ema.py
@@ -3,14 +3,14 @@


 class LitEma(nn.Module):
-def __init__(self, model, decay=0.9999, use_num_upates=True):
+def __init__(self, model, decay=0.9999, use_num_updates=True):
 super().__init__()
 if decay < 0.0 or decay > 1.0:
 raise ValueError('Decay must be between 0 and 1')

 self.m_name2s_name = {}
 self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
-self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int) if use_num_upates
+self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int) if use_num_updates
 else torch.tensor(-1, dtype=torch.int))

 for name, p in model.named_parameters():
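Editor's note: LitEma tracks exponential moving averages of the model weights; with use_num_updates enabled, the effective decay is warmed up so early averages are not dominated by the random init. A hedged sketch of the warm-up rule used by upstream LitEma:

    def ema_update(shadow: float, param: float, decay: float, num_updates: int) -> float:
        # warm-up: effective decay grows toward `decay` as updates accumulate
        decay_eff = min(decay, (1 + num_updates) / (10 + num_updates))
        return decay_eff * shadow + (1.0 - decay_eff) * param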
@@ -170,7 +170,7 @@ def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var
 [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1]))
 Z = np.stack([X, Y], 2)[:, :, :, None]

-# Calcualte Gaussian for every pixel of the kernel
+# Calculate Gaussian for every pixel of the kernel
 ZZ = Z - MU
 ZZ_t = ZZ.transpose(0, 1, 3, 2)
 raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
@@ -613,7 +613,7 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None):
 return example


-# TODO incase there is a pickle error one needs to replace a += x with a = a + x in add_speckle_noise etc...
+# TODO in case there is a pickle error one needs to replace a += x with a = a + x in add_speckle_noise etc...
 def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None):
 """
 This is an extended degradation model by combining
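Editor's note: gen_kernel evaluates a 2-D Gaussian at every kernel pixel through the quadratic form (z - mu)^T Sigma^(-1) (z - mu), exactly the ZZ_t @ INV_SIGMA @ ZZ product above. A compact runnable sketch with assumed toy parameters (isotropic covariance, no noise term):

    import numpy as np

    k = 15
    X, Y = np.meshgrid(range(k), range(k))
    Z = np.stack([X, Y], 2)[:, :, :, None]          # (k, k, 2, 1) pixel coordinates
    MU = np.array([k // 2, k // 2]).reshape(1, 1, 2, 1)
    INV_SIGMA = np.linalg.inv(np.diag([4.0, 4.0]))  # Sigma = diag(4, 4)

    ZZ = Z - MU
    ZZ_t = ZZ.transpose(0, 1, 3, 2)                 # (k, k, 1, 2)
    raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ))
    kernel = raw_kernel / raw_kernel.sum()          # normalize to sum to 1
    print(kernel.shape, round(kernel.sum(), 6))     # (15, 15) 1.0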
@@ -169,7 +169,7 @@ def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var
 [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1]))
 Z = np.stack([X, Y], 2)[:, :, :, None]

-# Calcualte Gaussian for every pixel of the kernel
+# Calculate Gaussian for every pixel of the kernel
 ZZ = Z - MU
 ZZ_t = ZZ.transpose(0, 1, 3, 2)
 raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise)
@@ -59,7 +59,7 @@ def surf(Z, cmap='rainbow', figsize=None):

 '''
 # --------------------------------------------
-# get image pathes
+# get image paths
 # --------------------------------------------
 '''

@@ -122,14 +122,14 @@ def imssave(imgs, img_path):
 cv2.imwrite(new_path, img)


-def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000):
+def split_imageset(original_dataroot, target_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000):
 """
 split the large images from original_dataroot into small overlapped images with size (p_size)x(p_size),
-and save them into taget_dataroot; only the images with larger size than (p_max)x(p_max)
-will be splitted.
+and save them into target_dataroot; only the images with larger size than (p_max)x(p_max)
+will be split.
 Args:
 original_dataroot:
-taget_dataroot:
+target_dataroot:
 p_size: size of small images
 p_overlap: patch size in training is a good choice
 p_max: images with smaller size than (p_max)x(p_max) keep unchanged.
@@ -139,8 +139,8 @@ def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800,
 # img_name, ext = os.path.splitext(os.path.basename(img_path))
 img = imread_uint(img_path, n_channels=n_channels)
 patches = patches_from_image(img, p_size, p_overlap, p_max)
-imssave(patches, os.path.join(taget_dataroot,os.path.basename(img_path)))
-#if original_dataroot == taget_dataroot:
+imssave(patches, os.path.join(target_dataroot,os.path.basename(img_path)))
+#if original_dataroot == target_dataroot:
 #del img_path

 '''
@@ -180,7 +180,7 @@ def mkdir_and_rename(path):


 # --------------------------------------------
-# get uint8 image of size HxWxn_channles (RGB)
+# get uint8 image of size HxWxn_channels (RGB)
 # --------------------------------------------
 def imread_uint(path, n_channels=3):
 # input: path
@@ -215,7 +215,7 @@ def imwrite(img, img_path):


 # --------------------------------------------
-# get single image of size HxWxn_channles (BGR)
+# get single image of size HxWxn_channels (BGR)
 # --------------------------------------------
 def read_img(path):
 # read image by cv2
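Editor's note: split_imageset saves large images as overlapping p_size x p_size tiles. A self-contained sketch of the overlap-tiling idea (hypothetical helper, not the repo's patches_from_image):

    import numpy as np

    def tile_with_overlap(img: np.ndarray, p_size: int = 800, p_overlap: int = 96):
        h, w = img.shape[:2]
        step = p_size - p_overlap
        patches = []
        for y in range(0, max(h - p_overlap, 1), step):
            for x in range(0, max(w - p_overlap, 1), step):
                # clamp so edge tiles stay fully inside the image
                y0 = min(y, max(h - p_size, 0))
                x0 = min(x, max(w - p_size, 0))
                patches.append(img[y0:y0 + p_size, x0:x0 + p_size])
        return patches

    print(len(tile_with_overlap(np.zeros((1000, 1600, 3)))))  # 6 tiles of 800x800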
@@ -125,7 +125,7 @@ def get_size(self, width, height):
 # fit height
 scale_width = scale_height
 elif self.__resize_method == "minimal":
-# scale as least as possbile
+# scale as least as possible
 if abs(1 - scale_width) < abs(1 - scale_height):
 # fit width
 scale_height = scale_width
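Editor's note: the "minimal" method keeps whichever axis scale deviates least from 1.0, so the image is resized as little as possible while preserving aspect. The decision in isolation (simplified from the surrounding method):

    def minimal_scale(scale_width: float, scale_height: float):
        if abs(1 - scale_width) < abs(1 - scale_height):
            scale_height = scale_width   # fit width
        else:
            scale_width = scale_height   # fit height
        return scale_width, scale_height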
2 changes: 1 addition & 1 deletion manga_translator/inpainting/ldm/util.py
@@ -23,7 +23,7 @@ def log_txt_as_img(wh, xc, size=10):
 try:
 draw.text((0, 0), lines, fill="black", font=font)
 except UnicodeEncodeError:
-print("Cant encode string for logging. Skipping.")
+print("Can't encode string for logging. Skipping.")

 txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
 txts.append(txt)
4 changes: 2 additions & 2 deletions manga_translator/manga_translator.py
@@ -23,7 +23,7 @@
 from .args import DEFAULT_ARGS, translator_chain
 from .utils import (
 BASE_PATH,
-LANGAUGE_ORIENTATION_PRESETS,
+LANGUAGE_ORIENTATION_PRESETS,
 ModelWrapper,
 Context,
 PriorityLock,
@@ -543,7 +543,7 @@ async def _run_text_rendering(self, ctx: Context):
 if ctx.renderer == 'none':
 output = ctx.img_inpainted
 # manga2eng currently only supports horizontal left to right rendering
-elif ctx.renderer == 'manga2eng' and ctx.text_regions and LANGAUGE_ORIENTATION_PRESETS.get(
+elif ctx.renderer == 'manga2eng' and ctx.text_regions and LANGUAGE_ORIENTATION_PRESETS.get(
 ctx.text_regions[0].target_lang) == 'h':
 output = await dispatch_eng_render(ctx.img_inpainted, ctx.img_rgb, ctx.text_regions, ctx.font_path, ctx.line_spacing)
 else:
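Editor's note: the renamed constant is a language-to-orientation lookup; the manga2eng renderer is chosen only when the target language renders horizontally. A hedged sketch of the dispatch shape (illustrative entries and helper, not the repo's actual table):

    LANGUAGE_ORIENTATION_PRESETS = {'ENG': 'h', 'JPN': 'v'}  # illustrative entries

    def pick_renderer(renderer: str, target_lang: str) -> str:
        if renderer == 'manga2eng' and LANGUAGE_ORIENTATION_PRESETS.get(target_lang) == 'h':
            return 'manga2eng'  # horizontal left-to-right rendering only
        return 'default'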