Skip to content

Commit

Permalink
Merge pull request #704 from dmMaze/main
Browse files Browse the repository at this point in the history
fix overflow in SegDetectorRepresenter and get_transformed_region
  • Loading branch information
zyddnys authored Sep 18, 2024
2 parents ead6693 + e158e8d commit 37bb4cd
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 16 deletions.
4 changes: 2 additions & 2 deletions manga_translator/detection/ctd_utils/utils/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
height, width = bitmap.shape
contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
num_contours = min(len(contours), self.max_candidates)
boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
boxes = np.zeros((num_contours, 4, 2), dtype=np.int64)
scores = np.zeros((num_contours,), dtype=np.float32)

for index in range(num_contours):
Expand All @@ -166,7 +166,7 @@ def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):

box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height)
boxes[index, :, :] = box.astype(np.int16)
boxes[index, :, :] = box.astype(np.int64)
scores[index] = score
return boxes, scores

Expand Down
2 changes: 1 addition & 1 deletion manga_translator/detection/dbnet_convnext.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: floa
polys, _ = boxes[idx], scores[idx]
polys = polys.astype(np.float64)
polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=1)
polys = polys.astype(np.int16)
polys = polys.astype(np.int64)

textlines = [Quadrilateral(pts.astype(int), '', score) for pts, score in zip(polys, scores)]
textlines = list(filter(lambda q: q.area > 16, textlines))
Expand Down
2 changes: 1 addition & 1 deletion manga_translator/detection/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: floa
polys, _ = boxes[idx], scores[idx]
polys = polys.astype(np.float64)
polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=1)
polys = polys.astype(np.int16)
polys = polys.astype(np.int64)

textlines = [Quadrilateral(pts.astype(int), '', score) for pts, score in zip(polys, scores)]
textlines = list(filter(lambda q: q.area > 16, textlines))
Expand Down
4 changes: 2 additions & 2 deletions manga_translator/detection/default_utils/dbnet_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
except ValueError:
return [], []
num_contours = min(len(contours), self.max_candidates)
boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
boxes = np.zeros((num_contours, 4, 2), dtype=np.int64)
scores = np.zeros((num_contours,), dtype=np.float32)

for index in range(num_contours):
Expand All @@ -139,7 +139,7 @@ def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
startidx = box.sum(axis=1).argmin()
box = np.roll(box, 4-startidx, 0)
box = np.array(box)
boxes[index, :, :] = box.astype(np.int16)
boxes[index, :, :] = box.astype(np.int64)
scores[index] = score
return boxes, scores

Expand Down
21 changes: 18 additions & 3 deletions manga_translator/utils/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,21 +473,36 @@ def get_transformed_region(self, img, direction, textheight) -> np.ndarray:
v_vec = l1b - l1a
h_vec = l2b - l2a
ratio = np.linalg.norm(v_vec) / np.linalg.norm(h_vec)
src_pts = self.pts.astype(np.float32)

src_pts = self.pts.astype(np.int64).copy()
im_h, im_w = img.shape[:2]

x1, y1, x2, y2 = src_pts[:, 0].min(), src_pts[:, 1].min(), src_pts[:, 0].max(), src_pts[:, 1].max()
x1 = np.clip(x1, 0, im_w)
y1 = np.clip(y1, 0, im_h)
x2 = np.clip(x2, 0, im_w)
y2 = np.clip(y2, 0, im_h)
# cv2.warpPerspective could overflow if image size is too large, better crop it here
img_croped = img[y1: y2, x1: x2]


src_pts[:, 0] -= x1
src_pts[:, 1] -= y1

self.assigned_direction = direction
if direction == 'h':
h = max(int(textheight), 2)
w = max(int(round(textheight / ratio)), 2)
dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
region = cv2.warpPerspective(img, M, (w, h))
region = cv2.warpPerspective(img_croped, M, (w, h))
return region
elif direction == 'v':
w = max(int(textheight), 2)
h = max(int(round(textheight * ratio)), 2)
dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
region = cv2.warpPerspective(img, M, (w, h))
region = cv2.warpPerspective(img_croped, M, (w, h))
region = cv2.rotate(region, cv2.ROTATE_90_COUNTERCLOCKWISE)
return region

Expand Down
48 changes: 41 additions & 7 deletions manga_translator/utils/textblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,30 +221,64 @@ def to_dict(self):
return blk_dict

def get_transformed_region(self, img: np.ndarray, line_idx: int, textheight: int, maxwidth: int = None) -> np.ndarray:
    """Perspective-rectify one text line of this block into an axis-aligned image.

    The source quadrilateral is cropped out of ``img`` first and the homography is
    computed against the crop, because ``cv2.warpPerspective`` can overflow /
    allocate excessively when given a very large source image.

    Args:
        img: Source page image, ``(H, W, C)`` uint8.
        line_idx: Index into ``self.lines`` selecting the quadrilateral
            (4 points, ``(x, y)`` order — assumed TL, TR, BR, BL; confirm against caller).
        textheight: Target height of the rectified line in pixels. If ``None``,
            it is derived from the measured extent of the quadrilateral.
        maxwidth: Optional cap on the output width; the region is resized
            (height preserved) when it exceeds this.

    Returns:
        The rectified region as a ``(h, w, C)`` array; for vertical text the
        result is rotated 90° CCW so the text reads horizontally. On a
        degenerate polygon (zero extent, or no homography found) a black
        ``(textheight, textheight, 3)`` uint8 square is returned instead.
    """
    im_h, im_w = img.shape[:2]

    # int64 (not int16) so large page coordinates cannot overflow.
    line = np.round(np.array(self.lines[line_idx])).astype(np.int64)

    # Axis-aligned bounding box of the quad, clipped to the image.
    x1, y1, x2, y2 = line[:, 0].min(), line[:, 1].min(), line[:, 0].max(), line[:, 1].max()
    x1 = np.clip(x1, 0, im_w)
    y1 = np.clip(y1, 0, im_h)
    x2 = np.clip(x2, 0, im_w)
    y2 = np.clip(y2, 0, im_h)
    # cv2.warpPerspective can overflow if the source image is too large,
    # so crop down to the bounding box before warping.
    img_croped = img[y1: y2, x1: x2]

    direction = 'v' if self.src_is_vertical else 'h'

    # Shift the quad into the crop's coordinate frame.
    src_pts = line.copy()
    src_pts[:, 0] -= x1
    src_pts[:, 1] -= y1

    # Midpoints of the four edges give the line's vertical/horizontal axes.
    middle_pnt = (src_pts[[1, 2, 3, 0]] + src_pts) / 2
    vec_v = middle_pnt[2] - middle_pnt[0]   # vertical vector of the textline
    vec_h = middle_pnt[1] - middle_pnt[3]   # horizontal vector of the textline
    norm_v = np.linalg.norm(vec_v)
    norm_h = np.linalg.norm(vec_h)

    if textheight is None:
        if direction == 'h':
            textheight = int(norm_v)
        else:
            textheight = int(norm_h)

    if norm_v <= 0 or norm_h <= 0:
        print('invalid textpolygon to target img')
        return np.zeros((textheight, textheight, 3), dtype=np.uint8)
    ratio = norm_v / norm_h

    if direction == 'h':
        h = int(textheight)
        w = int(round(textheight / ratio))
        dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
        M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
        if M is None:
            # RANSAC failed to fit a homography — degenerate polygon.
            print('invalid textpolygon to target img')
            return np.zeros((textheight, textheight, 3), dtype=np.uint8)
        region = cv2.warpPerspective(img_croped, M, (w, h))
    elif direction == 'v':
        w = int(textheight)
        h = int(round(textheight * ratio))
        dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
        M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
        if M is None:
            print('invalid textpolygon to target img')
            return np.zeros((textheight, textheight, 3), dtype=np.uint8)
        region = cv2.warpPerspective(img_croped, M, (w, h))
        # Rotate vertical text so downstream OCR sees it horizontally.
        region = cv2.rotate(region, cv2.ROTATE_90_COUNTERCLOCKWISE)

    if maxwidth is not None:
        h, w = region.shape[: 2]
        if w > maxwidth:
            region = cv2.resize(region, (maxwidth, h))

    return region

@property
Expand Down

0 comments on commit 37bb4cd

Please sign in to comment.