Commit alpha-v2.2.1
zyddnys committed May 6, 2021
1 parent 3b3efd6 commit 1e4954e
Showing 7 changed files with 468 additions and 132 deletions.
73 changes: 73 additions & 0 deletions DBHead.py
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 14:54
# @Author : zhoujun
import torch
from torch import nn

class DBHead(nn.Module):
def __init__(self, in_channels, out_channels, k = 50):
super().__init__()
self.k = k
self.binarize = nn.Sequential(
nn.Conv2d(in_channels, in_channels // 4, 3, padding=1),
nn.BatchNorm2d(in_channels // 4),
nn.ReLU(inplace=True),
nn.ConvTranspose2d(in_channels // 4, in_channels // 4, 4, 2, 1),
nn.BatchNorm2d(in_channels // 4),
nn.ReLU(inplace=True),
nn.ConvTranspose2d(in_channels // 4, 1, 4, 2, 1),
)
self.binarize.apply(self.weights_init)

self.thresh = self._init_thresh(in_channels)
self.thresh.apply(self.weights_init)

def forward(self, x):
shrink_maps = self.binarize(x)
threshold_maps = self.thresh(x)
if self.training:
binary_maps = self.step_function(shrink_maps.sigmoid(), threshold_maps)
y = torch.cat((shrink_maps, threshold_maps, binary_maps), dim=1)
else:
y = torch.cat((shrink_maps, threshold_maps), dim=1)
return y

def weights_init(self, m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
nn.init.kaiming_normal_(m.weight.data)
elif classname.find('BatchNorm') != -1:
m.weight.data.fill_(1.)
m.bias.data.fill_(1e-4)

def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
in_channels = inner_channels
if serial:
in_channels += 1
self.thresh = nn.Sequential(
nn.Conv2d(in_channels, inner_channels // 4, 3, padding=1, bias=bias),
nn.BatchNorm2d(inner_channels // 4),
nn.ReLU(inplace=True),
self._init_upsample(inner_channels // 4, inner_channels // 4, smooth=smooth, bias=bias),
nn.BatchNorm2d(inner_channels // 4),
nn.ReLU(inplace=True),
self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
nn.Sigmoid())
return self.thresh

def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
if smooth:
inter_out_channels = out_channels
if out_channels == 1:
inter_out_channels = in_channels
            module_list = [
                nn.Upsample(scale_factor=2, mode='nearest'),
                nn.Conv2d(in_channels, inter_out_channels, 3, 1, 1, bias=bias)]
            if out_channels == 1:
                # 1x1 conv: padding must be 0, otherwise the spatial size would grow by 2
                module_list.append(nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True))
            return nn.Sequential(*module_list)  # unpack: nn.Sequential takes modules, not a list
else:
return nn.ConvTranspose2d(in_channels, out_channels, 4, 2, 1)

def step_function(self, x, y):
return torch.reciprocal(1 + torch.exp(-self.k * (x - y)))
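`step_function` is the differentiable binarization trick from the DB paper: `1 / (1 + exp(-k*(x - y)))` is exactly `torch.sigmoid(self.k * (x - y))`, a sigmoid steepened by `k = 50` on the gap between the shrink map and the threshold map, so it approximates hard binarization while staying differentiable. Below is a minimal smoke test (editor's sketch, not part of the commit; note that `out_channels` is accepted but never used by this head):

```python
import torch
from DBHead import DBHead

head = DBHead(in_channels=256, out_channels=0)
feat = torch.randn(1, 256, 160, 160)   # stand-in for a stride-4 backbone feature map

head.train()                 # training mode: shrink, threshold and binary maps
print(head(feat).shape)      # torch.Size([1, 3, 640, 640]) - both deconvs upsample 2x

head.eval()                  # inference mode: shrink and threshold maps only
with torch.no_grad():
    print(head(feat).shape)  # torch.Size([1, 2, 640, 640])
```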
152 changes: 152 additions & 0 deletions DBNet_resnet101.py
@@ -0,0 +1,152 @@

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchvision.models import resnet101

import DBHead
import einops

class ImageMultiheadSelfAttention(nn.Module) :
def __init__(self, planes):
super(ImageMultiheadSelfAttention, self).__init__()
self.attn = nn.MultiheadAttention(planes, 8)
def forward(self, x) :
res = x
n, c, h, w = x.shape
x = einops.rearrange(x, 'n c h w -> (h w) n c')
x = self.attn(x, x, x)[0]
x = einops.rearrange(x, '(h w) n c -> n c h w', n = n, c = c, h = h, w = w)
return res + x
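    # Editor's note (illustration, not part of the commit): nn.MultiheadAttention
    # expects (seq_len, batch, embed_dim) inputs, so the rearrange above flattens
    # the h*w spatial grid into the sequence axis, runs self-attention over all
    # spatial positions, then restores n c h w, e.g. (2, 256, 8, 8) -> (64, 2, 256)
    # -> (2, 256, 8, 8); planes must be divisible by the 8 attention heads.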

class double_conv(nn.Module):
def __init__(self, in_ch, mid_ch, out_ch, stride = 1, planes = 256):
super(double_conv, self).__init__()
self.planes = planes
# down = None
# if stride > 1 :
# down = nn.Sequential(
# nn.AvgPool2d(2, 2),
# nn.Conv2d(in_ch + mid_ch, self.planes * Bottleneck.expansion, kernel_size=1, stride=1, bias=False),nn.BatchNorm2d(self.planes * Bottleneck.expansion)
# )
self.down = None
if stride > 1 :
self.down = nn.AvgPool2d(2,stride=2)
self.conv = nn.Sequential(
nn.Conv2d(in_ch + mid_ch, mid_ch, kernel_size=3, padding=1, stride = 1, bias=False),
nn.BatchNorm2d(mid_ch),
nn.ReLU(inplace=True),
#Bottleneck(mid_ch, self.planes, stride, down, 2, 1, avd = True, norm_layer = nn.BatchNorm2d),
nn.Conv2d(mid_ch, out_ch, kernel_size=3, stride = 1, padding=1, bias=False),
nn.BatchNorm2d(out_ch),
nn.ReLU(inplace=True),
)

def forward(self, x):
if self.down is not None :
x = self.down(x)
x = self.conv(x)
return x

class double_conv_up(nn.Module):
def __init__(self, in_ch, mid_ch, out_ch, stride = 1, planes = 256):
super(double_conv_up, self).__init__()
self.planes = planes
# down = None
# if stride > 1 :
# down = nn.Sequential(
# nn.AvgPool2d(2, 2),
# nn.Conv2d(in_ch + mid_ch, self.planes * Bottleneck.expansion, kernel_size=1, stride=1, bias=False),nn.BatchNorm2d(self.planes * Bottleneck.expansion)
# )
self.down = None
if stride > 1 :
self.down = nn.AvgPool2d(2,stride=2)
self.conv = nn.Sequential(
nn.Conv2d(in_ch + mid_ch, mid_ch, kernel_size=3, padding=1, stride = 1, bias=False),
nn.BatchNorm2d(mid_ch),
nn.ReLU(inplace=True),
#Bottleneck(mid_ch, self.planes, stride, down, 2, 1, avd = True, norm_layer = nn.BatchNorm2d),
nn.Conv2d(mid_ch, mid_ch, kernel_size=3, stride = 1, padding=1, bias=False),
nn.BatchNorm2d(mid_ch),
nn.ReLU(inplace=True),
nn.ConvTranspose2d(mid_ch, out_ch, kernel_size=4, stride = 2, padding=1, bias=False),
nn.BatchNorm2d(out_ch),
nn.ReLU(inplace=True),
)

def forward(self, x):
if self.down is not None :
x = self.down(x)
x = self.conv(x)
return x
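    # Editor's note: with the default stride == 1 the AvgPool2d is skipped and the
    # trailing ConvTranspose2d doubles the resolution, so each decoder stage below
    # nets a 2x upsample; with stride > 1 the pool would first halve it again.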

class TextDetection(nn.Module) :
def __init__(self, pretrained=None) :
super(TextDetection, self).__init__()
        self.backbone = resnet101(pretrained=bool(pretrained))

self.conv_db = DBHead.DBHead(64, 0)

self.conv_mask = nn.Sequential(
nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True),
nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True),
nn.Conv2d(64, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
nn.Conv2d(32, 1, kernel_size=1),
nn.Sigmoid()
)

self.down_conv1 = double_conv(0, 512, 512, 2)
self.down_conv2 = double_conv(0, 512, 512, 2)
self.down_conv3 = double_conv(0, 512, 512, 2)

self.upconv1 = double_conv_up(0, 512, 256)
self.upconv2 = double_conv_up(256, 512, 256)
self.upconv3 = double_conv_up(256, 512, 256)
self.upconv4 = double_conv_up(256, 512, 256, planes = 128)
self.upconv5 = double_conv_up(256, 256, 128, planes = 64)
self.upconv6 = double_conv_up(128, 128, 64, planes = 32)
self.upconv7 = double_conv_up(64, 64, 64, planes = 16)

self.proj_h4 = nn.Conv2d(64 * 4, 64, 1)
self.proj_h8 = nn.Conv2d(128 * 4, 128, 1)
self.proj_h16 = nn.Conv2d(256 * 4, 256, 1)
self.proj_h32 = nn.Conv2d(512 * 4, 512, 1)
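        # Editor's note: ResNet101 builds layer1..layer4 from Bottleneck blocks with
        # expansion 4, so they emit 256/512/1024/2048 channels; these 1x1 convs
        # project them down to 64/128/256/512 for the U-Net-style decoder.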

def forward(self, x) :
x = self.backbone.conv1(x)
x = self.backbone.bn1(x)
x = self.backbone.relu(x)
x = self.backbone.maxpool(x) # 64@384

        h4 = self.backbone.layer1(x) # 256@384 (projected to 64 below)
        h8 = self.backbone.layer2(h4) # 512@192 (projected to 128 below)
        h16 = self.backbone.layer3(h8) # 1024@96 (projected to 256 below)
        h32 = self.backbone.layer4(h16) # 2048@48 (projected to 512 below)

h4 = self.proj_h4(h4)
h8 = self.proj_h8(h8)
h16 = self.proj_h16(h16)
h32 = self.proj_h32(h32)

h64 = self.down_conv1(h32) # 512@24
h128 = self.down_conv2(h64) # 512@12
h256 = self.down_conv3(h128) # 512@6

        up256 = self.upconv1(h256) # 256@12
        up128 = self.upconv2(torch.cat([up256, h128], dim = 1)) # 256@24
        up64 = self.upconv3(torch.cat([up128, h64], dim = 1)) # 256@48
        up32 = self.upconv4(torch.cat([up64, h32], dim = 1)) # 256@96
up16 = self.upconv5(torch.cat([up32, h16], dim = 1)) # 128@192
up8 = self.upconv6(torch.cat([up16, h8], dim = 1)) # 64@384
up4 = self.upconv7(torch.cat([up8, h4], dim = 1)) # 64@768

return self.conv_db(up8), self.conv_mask(up4)

if __name__ == '__main__' :
device = torch.device("cuda:0")
net = TextDetection().to(device)
img = torch.randn(2, 3, 1024, 1024).to(device)
db, seg = net(img)
print(db.shape)
print(seg.shape)
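    # Editor's note (expected result, assuming enough GPU memory): the net is in its
    # default train mode, so db is torch.Size([2, 3, 1024, 1024]) (shrink, threshold
    # and binary maps at full resolution, since conv_db consumes up8 and DBHead
    # upsamples 4x) and seg is torch.Size([2, 1, 512, 512]) (mask at half resolution
    # from up4).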
62 changes: 12 additions & 50 deletions README.md
@@ -3,10 +3,17 @@ https://touhou.ai/imgtrans/
Note this may sometimes be down because Google GCP keeps restarting my instance. In that case, you can wait for me to restart the service, which may take up to 24 hours.
# English README
[README_EN.md](README_EN.md)
# About the new model
The new model uses DBNet and is currently training; it will support English recognition better. \
The new inpainting model will drop attention to reduce VRAM usage. \
Expected in about one to two weeks.
# Changelogs
### 2021-05-06
1. The detection model is now DBNet with a ResNet101 backbone
2. The OCR model is now deeper
3. Default detection resolution has been increased to 2048

Note: aside from slightly better English text detection, this version is worse than the previous one in every other respect
### 2021-03-04
1. Added inpainting model
### 2021-02-17
1. Initial version released
# One-click translation of text in all kinds of images
Designed for the many images in group chats and on image boards that are unlikely to ever be translated by anyone, so that a Japanese novice like me can roughly understand them \
Primarily supports Japanese, but Chinese and lowercase English can also be recognized \
Expand All @@ -15,7 +22,7 @@ Note this may not work sometimes due to stupid google gcp kept restarting my ins

# Usage
1. Clone this repo
2. [Download](https://github.com/zyddnys/manga-image-translator/releases/tag/alpha-v2.2) ocr.ckpt, detect.ckpt, and inpainting.ckpt, and put them in the root directory of this repo
2. [Download](https://github.com/zyddnys/manga-image-translator/releases/tag/alpha-v2.2.1) ocr.ckpt, detect.ckpt, and inpainting.ckpt, and put them in the root directory of this repo
3. Apply for a Baidu Translate API account and put your appid and key in key.py (a sketch follows this list)
4. Run `python translate_demo.py --image <path_to_image_file> [--use-inpainting] [--use-cuda]`; results are saved in the result folder. Add `--use-inpainting` to enable inpainting and `--use-cuda` to use the GPU.
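A minimal `key.py` sketch for step 3 (the variable names below are hypothetical placeholders, not taken from this repo; match whatever `translate_demo.py` actually imports):

```python
# key.py - hypothetical layout; variable names are illustrative only,
# check translate_demo.py for the identifiers it really imports
APP_ID = '20210000000000000'     # your Baidu Translate appid
SECRET_KEY = 'your_secret_key'   # your Baidu Translate key
```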
# This is an early version; we need your help to improve it
@@ -38,48 +45,3 @@
![Original](original2.jpg "https://twitter.com/mmd_96yuki/status/1320122899005460481")|![Output](result2.png)
![Original](original3.jpg "https://twitter.com/_taroshin_/status/1231099378779082754")|![Output](result3.png)
![Original](original4.jpg "https://amagi.fanbox.cc/posts/1904941")|![Output](result4.png)
60 changes: 12 additions & 48 deletions README_EN.md
@@ -1,8 +1,17 @@
# Online Demo
https://touhou.ai/imgtrans/
Note this may sometimes be down because Google GCP keeps restarting my instance. In that case, you can wait for me to restart the service, which may take up to 24 hours.
# New model delayed
The new model is delayed due to poor results. I am working on fixing it, but there is no guarantee it will be out this week.
# Changelogs
### 2021-05-06
1. The text detection model is now DBNet with a ResNet101 backbone
2. The OCR model is now deeper
3. Default detection resolution has been increased from 1536 to 2048

Note: this version is slightly better at handling English text, but worse in every other way
### 2021-03-04
1. Added inpainting model
### 2021-02-17
1. First version launched
# Translate texts in manga/images
Some manga/images will never be translated, which is why this project was born. \
Primarily designed for translating Japanese text, but it also supports Chinese and sometimes English \
@@ -11,7 +20,7 @@ Successor to https://github.com/PatchyVideo/MMDOCR-HighPerformance

# How to use
1. Clone this repo
2. [Download](https://github.com/zyddnys/manga-image-translator/releases/tag/alpha-v2.2) ocr.ckpt, detect.ckpt, and inpainting.ckpt, and put them in the root directory of this repo
2. [Download](https://github.com/zyddnys/manga-image-translator/releases/tag/alpha-v2.2.1) ocr.ckpt, detect.ckpt, and inpainting.ckpt, and put them in the root directory of this repo
3. Apply for a Baidu Translate API account and put your appid and key in `key.py`
4. Run `python translate_demo.py --image <path_to_image_file> [--use-inpainting] [--use-cuda]`; results can be found in `result/`. Add `--use-inpainting` to enable inpainting and `--use-cuda` to use CUDA.
# This is a hobby project; you are welcome to contribute
@@ -32,48 +41,3 @@ Original | Translated
![Original](original2.jpg "https://twitter.com/mmd_96yuki/status/1320122899005460481")|![Output](result2.png)
![Original](original3.jpg "https://twitter.com/_taroshin_/status/1231099378779082754")|![Output](result3.png)
![Original](original4.jpg "https://amagi.fanbox.cc/posts/1904941")|![Output](result4.png)