Commit alpha-v2.2.1
zyddnys committed May 6, 2021
1 parent 3b3efd6 commit 1e4954e
Showing 7 changed files with 468 additions and 132 deletions.
73 changes: 73 additions & 0 deletions DBHead.py
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 14:54
# @Author : zhoujun
import torch
from torch import nn

class DBHead(nn.Module):
def __init__(self, in_channels, out_channels, k = 50):
super().__init__()
self.k = k
self.binarize = nn.Sequential(
nn.Conv2d(in_channels, in_channels // 4, 3, padding=1),
nn.BatchNorm2d(in_channels // 4),
nn.ReLU(inplace=True),
nn.ConvTranspose2d(in_channels // 4, in_channels // 4, 4, 2, 1),
nn.BatchNorm2d(in_channels // 4),
nn.ReLU(inplace=True),
nn.ConvTranspose2d(in_channels // 4, 1, 4, 2, 1),
)
self.binarize.apply(self.weights_init)

self.thresh = self._init_thresh(in_channels)
self.thresh.apply(self.weights_init)

def forward(self, x):
shrink_maps = self.binarize(x)
threshold_maps = self.thresh(x)
if self.training:
binary_maps = self.step_function(shrink_maps.sigmoid(), threshold_maps)
y = torch.cat((shrink_maps, threshold_maps, binary_maps), dim=1)
else:
y = torch.cat((shrink_maps, threshold_maps), dim=1)
return y

def weights_init(self, m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
nn.init.kaiming_normal_(m.weight.data)
elif classname.find('BatchNorm') != -1:
m.weight.data.fill_(1.)
m.bias.data.fill_(1e-4)

def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
in_channels = inner_channels
if serial:
in_channels += 1
self.thresh = nn.Sequential(
nn.Conv2d(in_channels, inner_channels // 4, 3, padding=1, bias=bias),
nn.BatchNorm2d(inner_channels // 4),
nn.ReLU(inplace=True),
self._init_upsample(inner_channels // 4, inner_channels // 4, smooth=smooth, bias=bias),
nn.BatchNorm2d(inner_channels // 4),
nn.ReLU(inplace=True),
self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
nn.Sigmoid())
return self.thresh

def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
if smooth:
inter_out_channels = out_channels
if out_channels == 1:
inter_out_channels = in_channels
            module_list = [
                nn.Upsample(scale_factor=2, mode='nearest'),
                nn.Conv2d(in_channels, inter_out_channels, 3, 1, 1, bias=bias)]
            if out_channels == 1:
                # 1x1 conv: padding must be 0, otherwise the spatial size would grow by 2
                module_list.append(nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True))
            return nn.Sequential(*module_list)  # unpack: nn.Sequential takes modules, not a list
else:
return nn.ConvTranspose2d(in_channels, out_channels, 4, 2, 1)

def step_function(self, x, y):
return torch.reciprocal(1 + torch.exp(-self.k * (x - y)))
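`step_function` is the differentiable binarization trick from the DB paper: `1 / (1 + exp(-k*(x - y)))` is exactly `torch.sigmoid(self.k * (x - y))`, a sigmoid steepened by `k = 50` on the gap between the shrink map and the threshold map, so it approximates hard binarization while staying differentiable. Below is a minimal smoke test (editor's sketch, not part of the commit; note that `out_channels` is accepted but never used by this head):

```python
import torch
from DBHead import DBHead

head = DBHead(in_channels=256, out_channels=0)
feat = torch.randn(1, 256, 160, 160)   # stand-in for a stride-4 backbone feature map

head.train()                 # training mode: shrink, threshold and binary maps
print(head(feat).shape)      # torch.Size([1, 3, 640, 640]) - both deconvs upsample 2x

head.eval()                  # inference mode: shrink and threshold maps only
with torch.no_grad():
    print(head(feat).shape)  # torch.Size([1, 2, 640, 640])
```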
152 changes: 152 additions & 0 deletions DBNet_resnet101.py
@@ -0,0 +1,152 @@

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchvision.models import resnet101

import DBHead
import einops

class ImageMultiheadSelfAttention(nn.Module) :
def __init__(self, planes):
super(ImageMultiheadSelfAttention, self).__init__()
self.attn = nn.MultiheadAttention(planes, 8)
def forward(self, x) :
res = x
n, c, h, w = x.shape
x = einops.rearrange(x, 'n c h w -> (h w) n c')
x = self.attn(x, x, x)[0]
x = einops.rearrange(x, '(h w) n c -> n c h w', n = n, c = c, h = h, w = w)
return res + x
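    # Editor's note (illustration, not part of the commit): nn.MultiheadAttention
    # expects (seq_len, batch, embed_dim) inputs, so the rearrange above flattens
    # the h*w spatial grid into the sequence axis, runs self-attention over all
    # spatial positions, then restores n c h w, e.g. (2, 256, 8, 8) -> (64, 2, 256)
    # -> (2, 256, 8, 8); planes must be divisible by the 8 attention heads.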

class double_conv(nn.Module):
def __init__(self, in_ch, mid_ch, out_ch, stride = 1, planes = 256):
super(double_conv, self).__init__()
self.planes = planes
# down = None
# if stride > 1 :
# down = nn.Sequential(
# nn.AvgPool2d(2, 2),
# nn.Conv2d(in_ch + mid_ch, self.planes * Bottleneck.expansion, kernel_size=1, stride=1, bias=False),nn.BatchNorm2d(self.planes * Bottleneck.expansion)
# )
self.down = None
if stride > 1 :
self.down = nn.AvgPool2d(2,stride=2)
self.conv = nn.Sequential(
nn.Conv2d(in_ch + mid_ch, mid_ch, kernel_size=3, padding=1, stride = 1, bias=False),
nn.BatchNorm2d(mid_ch),
nn.ReLU(inplace=True),
#Bottleneck(mid_ch, self.planes, stride, down, 2, 1, avd = True, norm_layer = nn.BatchNorm2d),
nn.Conv2d(mid_ch, out_ch, kernel_size=3, stride = 1, padding=1, bias=False),
nn.BatchNorm2d(out_ch),
nn.ReLU(inplace=True),
)

def forward(self, x):
if self.down is not None :
x = self.down(x)
x = self.conv(x)
return x

class double_conv_up(nn.Module):
def __init__(self, in_ch, mid_ch, out_ch, stride = 1, planes = 256):
super(double_conv_up, self).__init__()
self.planes = planes
# down = None
# if stride > 1 :
# down = nn.Sequential(
# nn.AvgPool2d(2, 2),
# nn.Conv2d(in_ch + mid_ch, self.planes * Bottleneck.expansion, kernel_size=1, stride=1, bias=False),nn.BatchNorm2d(self.planes * Bottleneck.expansion)
# )
self.down = None
if stride > 1 :
self.down = nn.AvgPool2d(2,stride=2)
self.conv = nn.Sequential(
nn.Conv2d(in_ch + mid_ch, mid_ch, kernel_size=3, padding=1, stride = 1, bias=False),
nn.BatchNorm2d(mid_ch),
nn.ReLU(inplace=True),
#Bottleneck(mid_ch, self.planes, stride, down, 2, 1, avd = True, norm_layer = nn.BatchNorm2d),
nn.Conv2d(mid_ch, mid_ch, kernel_size=3, stride = 1, padding=1, bias=False),
nn.BatchNorm2d(mid_ch),
nn.ReLU(inplace=True),
nn.ConvTranspose2d(mid_ch, out_ch, kernel_size=4, stride = 2, padding=1, bias=False),
nn.BatchNorm2d(out_ch),
nn.ReLU(inplace=True),
)

def forward(self, x):
if self.down is not None :
x = self.down(x)
x = self.conv(x)
return x
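    # Editor's note: with the default stride == 1 the AvgPool2d is skipped and the
    # trailing ConvTranspose2d doubles the resolution, so each decoder stage below
    # nets a 2x upsample; with stride > 1 the pool would first halve it again.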

class TextDetection(nn.Module) :
def __init__(self, pretrained=None) :
super(TextDetection, self).__init__()
        self.backbone = resnet101(pretrained=bool(pretrained))

self.conv_db = DBHead.DBHead(64, 0)

self.conv_mask = nn.Sequential(
nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True),
nn.Conv2d(64, 64, kernel_size=3, padding=1), nn.ReLU(inplace=True),
nn.Conv2d(64, 32, kernel_size=3, padding=1), nn.ReLU(inplace=True),
nn.Conv2d(32, 1, kernel_size=1),
nn.Sigmoid()
)

self.down_conv1 = double_conv(0, 512, 512, 2)
self.down_conv2 = double_conv(0, 512, 512, 2)
self.down_conv3 = double_conv(0, 512, 512, 2)

self.upconv1 = double_conv_up(0, 512, 256)
self.upconv2 = double_conv_up(256, 512, 256)
self.upconv3 = double_conv_up(256, 512, 256)
self.upconv4 = double_conv_up(256, 512, 256, planes = 128)
self.upconv5 = double_conv_up(256, 256, 128, planes = 64)
self.upconv6 = double_conv_up(128, 128, 64, planes = 32)
self.upconv7 = double_conv_up(64, 64, 64, planes = 16)

self.proj_h4 = nn.Conv2d(64 * 4, 64, 1)
self.proj_h8 = nn.Conv2d(128 * 4, 128, 1)
self.proj_h16 = nn.Conv2d(256 * 4, 256, 1)
self.proj_h32 = nn.Conv2d(512 * 4, 512, 1)
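        # Editor's note: ResNet101 builds layer1..layer4 from Bottleneck blocks with
        # expansion 4, so they emit 256/512/1024/2048 channels; these 1x1 convs
        # project them down to 64/128/256/512 for the U-Net-style decoder.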

def forward(self, x) :
x = self.backbone.conv1(x)
x = self.backbone.bn1(x)
x = self.backbone.relu(x)
x = self.backbone.maxpool(x) # 64@384

        h4 = self.backbone.layer1(x) # 256@384 (projected to 64 below)
        h8 = self.backbone.layer2(h4) # 512@192 (projected to 128 below)
        h16 = self.backbone.layer3(h8) # 1024@96 (projected to 256 below)
        h32 = self.backbone.layer4(h16) # 2048@48 (projected to 512 below)

h4 = self.proj_h4(h4)
h8 = self.proj_h8(h8)
h16 = self.proj_h16(h16)
h32 = self.proj_h32(h32)

h64 = self.down_conv1(h32) # 512@24
h128 = self.down_conv2(h64) # 512@12
h256 = self.down_conv3(h128) # 512@6

        up256 = self.upconv1(h256) # 256@12
        up128 = self.upconv2(torch.cat([up256, h128], dim = 1)) # 256@24
        up64 = self.upconv3(torch.cat([up128, h64], dim = 1)) # 256@48
        up32 = self.upconv4(torch.cat([up64, h32], dim = 1)) # 256@96
up16 = self.upconv5(torch.cat([up32, h16], dim = 1)) # 128@192
up8 = self.upconv6(torch.cat([up16, h8], dim = 1)) # 64@384
up4 = self.upconv7(torch.cat([up8, h4], dim = 1)) # 64@768

return self.conv_db(up8), self.conv_mask(up4)

if __name__ == '__main__' :
device = torch.device("cuda:0")
net = TextDetection().to(device)
img = torch.randn(2, 3, 1024, 1024).to(device)
db, seg = net(img)
print(db.shape)
print(seg.shape)
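    # Editor's note (expected result, assuming enough GPU memory): the net is in its
    # default train mode, so db is torch.Size([2, 3, 1024, 1024]) (shrink, threshold
    # and binary maps at full resolution, since conv_db consumes up8 and DBHead
    # upsamples 4x) and seg is torch.Size([2, 1, 512, 512]) (mask at half resolution
    # from up4).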
62 changes: 12 additions & 50 deletions README.md
@@ -3,10 +3,17 @@ https://touhou.ai/imgtrans/
Note this may sometimes be down because Google GCP keeps restarting my instance. In that case, you can wait for me to restart the service, which may take up to 24 hours.
# English README
[README_EN.md](README_EN.md)
# About the new model
The new model uses DBNet and is currently training; it will support English recognition better. \
The new inpainting model will drop attention to reduce VRAM usage. \
Expected in about one to two weeks.
# Changelogs
### 2021-05-06
1. The detection model is now DBNet with a ResNet101 backbone
2. The OCR model is now deeper
3. Default detection resolution has been increased to 2048

Note: aside from slightly better English text detection, this version is worse than the previous one in every other respect
### 2021-03-04
1. Added inpainting model
### 2021-02-17
1. Initial version released
# One-click translation of text in all kinds of images
Designed for the many images in group chats and on image boards that are unlikely to ever be translated by anyone, so that a Japanese novice like me can roughly understand them \
Primarily supports Japanese, but Chinese and lowercase English can also be recognized \
Expand All @@ -15,7 +22,7 @@ Note this may not work sometimes due to stupid google gcp kept restarting my ins

# Usage
1. Clone this repo
2. [Download](https://github.com/zyddnys/manga-image-translator/releases/tag/alpha-v2.2) ocr.ckpt, detect.ckpt, and inpainting.ckpt, and put them in the root directory of this repo
2. [Download](https://github.com/zyddnys/manga-image-translator/releases/tag/alpha-v2.2.1) ocr.ckpt, detect.ckpt, and inpainting.ckpt, and put them in the root directory of this repo
3. Apply for a Baidu Translate API account and put your appid and key in key.py (a sketch follows this list)
4. Run `python translate_demo.py --image <path_to_image_file> [--use-inpainting] [--use-cuda]`; results are saved in the result folder. Add `--use-inpainting` to enable inpainting and `--use-cuda` to use the GPU.
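A minimal `key.py` sketch for step 3 (the variable names below are hypothetical placeholders, not taken from this repo; match whatever `translate_demo.py` actually imports):

```python
# key.py - hypothetical layout; variable names are illustrative only,
# check translate_demo.py for the identifiers it really imports
APP_ID = '20210000000000000'     # your Baidu Translate appid
SECRET_KEY = 'your_secret_key'   # your Baidu Translate key
```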
# This is an early version; we need your help to improve it
@@ -38,48 +45,3 @@
![Original](original2.jpg "https://twitter.com/mmd_96yuki/status/1320122899005460481")|![Output](result2.png)
![Original](original3.jpg "https://twitter.com/_taroshin_/status/1231099378779082754")|![Output](result3.png)
![Original](original4.jpg "https://amagi.fanbox.cc/posts/1904941")|![Output](result4.png)
60 changes: 12 additions & 48 deletions README_EN.md
@@ -1,8 +1,17 @@
# Online Demo
https://touhou.ai/imgtrans/
Note this may sometimes be down because Google GCP keeps restarting my instance. In that case, you can wait for me to restart the service, which may take up to 24 hours.
# New model delayed
The new model is delayed due to poor results. I am working on fixing it, but there is no guarantee it will be out this week.
# Changelogs
### 2021-05-06
1. The text detection model is now DBNet with a ResNet101 backbone
2. The OCR model is now deeper
3. Default detection resolution has been increased from 1536 to 2048

Note: this version is slightly better at handling English text, but worse in every other way
### 2021-03-04
1. Added inpainting model
### 2021-02-17
1. First version launched
# Translate texts in manga/images
Some manga/images will never be translated, which is why this project was born. \
Primarily designed for translating Japanese text, but it also supports Chinese and sometimes English \
@@ -11,7 +20,7 @@ Successor to https://github.com/PatchyVideo/MMDOCR-HighPerformance

# How to use
1. Clone this repo
2. [Download](https://github.com/zyddnys/manga-image-translator/releases/tag/alpha-v2.2) ocr.ckpt, detect.ckpt, and inpainting.ckpt, and put them in the root directory of this repo
2. [Download](https://github.com/zyddnys/manga-image-translator/releases/tag/alpha-v2.2.1) ocr.ckpt, detect.ckpt, and inpainting.ckpt, and put them in the root directory of this repo
3. Apply for a Baidu Translate API account and put your appid and key in `key.py`
4. Run `python translate_demo.py --image <path_to_image_file> [--use-inpainting] [--use-cuda]`; results can be found in `result/`. Add `--use-inpainting` to enable inpainting and `--use-cuda` to use CUDA.
# This is a hobby project; you are welcome to contribute
@@ -32,48 +41,3 @@ Original | Translated
![Original](original2.jpg "https://twitter.com/mmd_96yuki/status/1320122899005460481")|![Output](result2.png)
![Original](original3.jpg "https://twitter.com/_taroshin_/status/1231099378779082754")|![Output](result3.png)
![Original](original4.jpg "https://amagi.fanbox.cc/posts/1904941")|![Output](result4.png)