-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove_duplicates.py
70 lines (60 loc) · 2.36 KB
/
remove_duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python
# coding: utf-8
import os
import io
import numpy as np
from wand.image import Image
from wand.display import display
from PyPDF2 import PdfFileReader, PdfFileWriter
# ``get_ipython`` is only defined inside an IPython/Jupyter session. Guard the
# magic call so that running or importing this file with a plain Python
# interpreter does not die with NameError before the definitions below exist.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython: there is no inline backend to configure.
    pass

# Fraction (0.0-1.0) of equal array elements at or above which rem_dup treats
# a page as a duplicate of the page before it.
EQ_THRESHOLD = 0.99
def pdf_page_to_png(page, resolution = 72):
    """
    Load a single PDF page into a wand.image.Image.

    The page is written to an in-memory, one-page PDF which is then read by
    wand. Adapted from https://gist.github.com/jrsmith3/9947838

    NOTE(review): despite the function name, no explicit conversion to PNG is
    performed here; the image is returned exactly as wand read it.

    :param page: Page object accepted by ``PdfFileWriter.addPage``.
    :param int resolution: Value forwarded as ``resolution`` to
        ``wand.image.Image`` (presumably DPI -- confirm against wand docs).
    :return: The ``wand.image.Image`` built from the in-memory PDF.
    """
    buffer = io.BytesIO()
    single_page = PdfFileWriter()
    single_page.addPage(page)
    single_page.write(buffer)
    # Rewind so wand starts reading at the first byte of the PDF.
    buffer.seek(0)
    # If this raises a PolicyError, remove
    # <policy domain="delegate" rights="none" pattern="gs" />
    # from /etc/ImageMagick-7/policy.xml
    return Image(file=buffer, resolution=resolution)
def pdf_page_to_np(page, resolution = 72):
    """
    Render a PDF page and return its pixel data as a numpy array.

    :param page: Page object passed through to ``pdf_page_to_png``.
    :param int resolution: Rendering resolution forwarded to
        ``pdf_page_to_png`` (previously accepted but silently ignored).
    :return: 3-dimensional ``numpy.ndarray`` of the page rendered in wand's
        'gray' format with the alpha channel disabled.
    """
    # https://stackoverflow.com/questions/47599012/how-to-convert-a-wand-image-object-to-numpy-array-without-opencv?rq=1
    image = pdf_page_to_png(page, resolution=resolution)
    image.format = 'gray'  # If rgb image, change this to 'rgb' to get raw values
    image.alpha_channel = False
    image = np.array(image)
    # Swap the lengths of the first two axes. This is a reshape, not a
    # transpose: the element order in memory is unchanged.
    # NOTE(review): presumably compensates for wand's axis ordering -- confirm.
    image = image.reshape(image.shape[1], image.shape[0], image.shape[2])
    return image
def similarity(last, curr):
    """
    Return the fraction of array elements that are equal between two pages.

    Both pages are rendered with ``pdf_page_to_np`` and compared element by
    element.

    :param last: The earlier page.
    :param curr: The later page.
    :return: Ratio in the range 0.0-1.0 of equal elements; 0.0 when the two
        rendered arrays do not have the same shape.
    """
    curr_px = pdf_page_to_np(curr)
    last_px = pdf_page_to_np(last)
    # Pages of different dimensions cannot be duplicates. Without this guard
    # np.equal would either raise on non-broadcastable shapes or silently
    # broadcast and produce a meaningless ratio.
    if curr_px.shape != last_px.shape:
        return 0.0
    eq = np.equal(curr_px, last_px)
    # ratio of matching elements to total elements
    return np.sum(eq) / np.prod(eq.shape)
def rem_dup(in_fn, out_fn=None):
    """
    Write a copy of a PDF in which consecutive duplicate pages are dropped.

    Every page is compared with the page directly before it in the ORIGINAL
    document; it is kept only when ``similarity`` is below ``EQ_THRESHOLD``.
    The first page is always kept. A line is printed for each removed page.

    :param in_fn: Path of the PDF to read.
    :param out_fn: Path of the PDF to write. Defaults to the input path with
        ``.dedup`` inserted before the extension (``a.pdf`` -> ``a.dedup.pdf``).
    :return: None.
    """
    if out_fn is None:
        path, ext = os.path.splitext(in_fn)
        out_fn = path + ".dedup" + ext

    # read original
    orig = PdfFileReader(in_fn, strict=False)
    num_pages = orig.getNumPages()

    # create new pdf with only non-duplicated pages
    pdf_writer = PdfFileWriter()
    # Guard against a document without pages: getPage(0) would fail there.
    if num_pages > 0:
        pdf_writer.addPage(orig.getPage(0))

    for page in range(num_pages - 1):
        last = orig.getPage(page)
        curr = orig.getPage(page + 1)
        simmi = similarity(last, curr)
        if simmi < EQ_THRESHOLD:
            pdf_writer.addPage(curr)
        else:
            # page is a 0-based index of the predecessor, so the removed
            # page's 1-based number is page + 2.
            print(f"Removed page {page+2} with simmilarity {simmi*100}%")

    # write output
    with open(out_fn, 'wb') as out:
        pdf_writer.write(out)
if __name__ == "__main__":
    # Entry point when executed directly: de-duplicate "ml.pdf". No output
    # name is given, so rem_dup derives one from the input path.
    rem_dup("ml.pdf")