Skip to content

Commit

Permalink
fix overflow in SegDetectorRepresenter and get_transformed_region
Browse files (browse the repository at this point in the history)
  • Loading branch information
dmMaze committed Sep 5, 2024
1 parent ead6693 commit e158e8d
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 16 deletions.
4 changes: 2 additions & 2 deletions manga_translator/detection/ctd_utils/utils/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
height, width = bitmap.shape
contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
num_contours = min(len(contours), self.max_candidates)
boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
boxes = np.zeros((num_contours, 4, 2), dtype=np.int64)
scores = np.zeros((num_contours,), dtype=np.float32)

for index in range(num_contours):
Expand All @@ -166,7 +166,7 @@ def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):

box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height)
boxes[index, :, :] = box.astype(np.int16)
boxes[index, :, :] = box.astype(np.int64)
scores[index] = score
return boxes, scores

Expand Down
2 changes: 1 addition & 1 deletion manga_translator/detection/dbnet_convnext.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: floa
polys, _ = boxes[idx], scores[idx]
polys = polys.astype(np.float64)
polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=1)
polys = polys.astype(np.int16)
polys = polys.astype(np.int64)

textlines = [Quadrilateral(pts.astype(int), '', score) for pts, score in zip(polys, scores)]
textlines = list(filter(lambda q: q.area > 16, textlines))
Expand Down
2 changes: 1 addition & 1 deletion manga_translator/detection/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ async def _infer(self, image: np.ndarray, detect_size: int, text_threshold: floa
polys, _ = boxes[idx], scores[idx]
polys = polys.astype(np.float64)
polys = craft_utils.adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=1)
polys = polys.astype(np.int16)
polys = polys.astype(np.int64)

textlines = [Quadrilateral(pts.astype(int), '', score) for pts, score in zip(polys, scores)]
textlines = list(filter(lambda q: q.area > 16, textlines))
Expand Down
4 changes: 2 additions & 2 deletions manga_translator/detection/default_utils/dbnet_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
except ValueError:
return [], []
num_contours = min(len(contours), self.max_candidates)
boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
boxes = np.zeros((num_contours, 4, 2), dtype=np.int64)
scores = np.zeros((num_contours,), dtype=np.float32)

for index in range(num_contours):
Expand All @@ -139,7 +139,7 @@ def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
startidx = box.sum(axis=1).argmin()
box = np.roll(box, 4-startidx, 0)
box = np.array(box)
boxes[index, :, :] = box.astype(np.int16)
boxes[index, :, :] = box.astype(np.int64)
scores[index] = score
return boxes, scores

Expand Down
21 changes: 18 additions & 3 deletions manga_translator/utils/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,21 +473,36 @@ def get_transformed_region(self, img, direction, textheight) -> np.ndarray:
v_vec = l1b - l1a
h_vec = l2b - l2a
ratio = np.linalg.norm(v_vec) / np.linalg.norm(h_vec)
src_pts = self.pts.astype(np.float32)

src_pts = self.pts.astype(np.int64).copy()
im_h, im_w = img.shape[:2]

x1, y1, x2, y2 = src_pts[:, 0].min(), src_pts[:, 1].min(), src_pts[:, 0].max(), src_pts[:, 1].max()
x1 = np.clip(x1, 0, im_w)
y1 = np.clip(y1, 0, im_h)
x2 = np.clip(x2, 0, im_w)
y2 = np.clip(y2, 0, im_h)
# cv2.warpPerspective can overflow when the image is very large, so crop the image to the bounding box of the source points first
img_croped = img[y1: y2, x1: x2]


src_pts[:, 0] -= x1
src_pts[:, 1] -= y1

self.assigned_direction = direction
if direction == 'h':
h = max(int(textheight), 2)
w = max(int(round(textheight / ratio)), 2)
dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
region = cv2.warpPerspective(img, M, (w, h))
region = cv2.warpPerspective(img_croped, M, (w, h))
return region
elif direction == 'v':
w = max(int(textheight), 2)
h = max(int(round(textheight * ratio)), 2)
dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
region = cv2.warpPerspective(img, M, (w, h))
region = cv2.warpPerspective(img_croped, M, (w, h))
region = cv2.rotate(region, cv2.ROTATE_90_COUNTERCLOCKWISE)
return region

Expand Down
48 changes: 41 additions & 7 deletions manga_translator/utils/textblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,30 +221,64 @@ def to_dict(self):
return blk_dict

def get_transformed_region(self, img: np.ndarray, line_idx: int, textheight: int, maxwidth: int = None) -> np.ndarray:
src_pts = np.array(self.lines[line_idx], dtype=np.float64)

im_h, im_w = img.shape[:2]

line = np.round(np.array(self.lines[line_idx])).astype(np.int64)

x1, y1, x2, y2 = line[:, 0].min(), line[:, 1].min(), line[:, 0].max(), line[:, 1].max()
x1 = np.clip(x1, 0, im_w)
y1 = np.clip(y1, 0, im_h)
x2 = np.clip(x2, 0, im_w)
y2 = np.clip(y2, 0, im_h)
img_croped = img[y1: y2, x1: x2]

direction = 'v' if self.src_is_vertical else 'h'

src_pts = line.copy()
src_pts[:, 0] -= x1
src_pts[:, 1] -= y1
middle_pnt = (src_pts[[1, 2, 3, 0]] + src_pts) / 2
vec_v = middle_pnt[2] - middle_pnt[0] # vertical vectors of textlines
vec_h = middle_pnt[1] - middle_pnt[3] # horizontal vectors of textlines
ratio = np.linalg.norm(vec_v) / np.linalg.norm(vec_h)
norm_v = np.linalg.norm(vec_v)
norm_h = np.linalg.norm(vec_h)

if ratio < 1:
if textheight is None:
if direction == 'h' :
textheight = int(norm_v)
else:
textheight = int(norm_h)

if norm_v <= 0 or norm_h <= 0:
print('invalid textpolygon to target img')
return np.zeros((textheight, textheight, 3), dtype=np.uint8)
ratio = norm_v / norm_h

if direction == 'h' :
h = int(textheight)
w = int(round(textheight / ratio))
dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
region = cv2.warpPerspective(img, M, (w, h))
else:
if M is None:
print('invalid textpolygon to target img')
return np.zeros((textheight, textheight, 3), dtype=np.uint8)
region = cv2.warpPerspective(img_croped, M, (w, h))
elif direction == 'v' :
w = int(textheight)
h = int(round(textheight * ratio))
dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
region = cv2.warpPerspective(img, M, (w, h))
if M is None:
print('invalid textpolygon to target img')
return np.zeros((textheight, textheight, 3), dtype=np.uint8)
region = cv2.warpPerspective(img_croped, M, (w, h))
region = cv2.rotate(region, cv2.ROTATE_90_COUNTERCLOCKWISE)

if maxwidth is not None:
h, w = region.shape[: 2]
if w > maxwidth:
region = cv2.resize(region, (maxwidth, h))

return region

@property
Expand Down

0 comments on commit e158e8d

Please sign in to comment.