# This script provides a quick search over article information, including
# title, authors' names, etc., within a designated root folder.
# It is specifically designed for a "raw" literature search.
# Jul 2 update  -- switched the code to a multiprocessing version
# Oct 30 update -- updated the duplication check
# Nov 1 update  -- added an I/O interface [embedded into article search]
# Dec 12 update -- updated photo comparison [treats the first page as a "picture"]
#               -- currently unused due to computational inefficiency
# Jan 3 update  -- updated article title extraction + renaming
# Author: Zihan Zhang, UW
import warnings
import scipy
import argparse
import os
import time
import json
from datetime import datetime  # used for the database file timestamp below
from helper import *
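
# Example command-line invocations (illustrative only; the keyword and folder
# values below are placeholders, not paths or terms that ship with this repo):
#   python article_search.py --index 0 --key "variational inference"                  # keyword search using the existing database
#   python article_search.py --index 0 --key "Smith" --ioindex 0 --databaseindex 1    # search from scratch and (re)build the database
#   python article_search.py --index 1 --rootfolder ../paper/                         # duplication search under ../paper/
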
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    # Create the parser
    parser = argparse.ArgumentParser(
        description='Process inputs for document handling in a specified folder.')
    # Add arguments
    # filepath of the designated folder
    parser.add_argument('--rootfolder', type=str, default="../paper/",
                        help="Filepath of the designated folder")
    # keywords to search on;
    # for more precise results, keep them short and concise;
    # e.g. an article's title, an author's name, or an article keyword
    # note that special characters, including the question mark (?) and the dash (-), should not be included
    parser.add_argument('--key', type=str, help="Keywords to search")
    # which functionality to choose
    # 0: search content &/ (re)create the database
    # 1: duplication search & remove undesired documents
    parser.add_argument('--index', type=int, choices=[0, 1], required=True,
                        help='Functionality to choose (0: search content/create database, 1: duplication search)')
    ### The following options are only needed when index == 0 ###
    # I/O index [whether to use an existing database to speed up searching (though the result might not be exhaustive)]
    parser.add_argument('--ioindex', type=int,
                        choices=[0, 1], default=1, help='I/O index (0: start from scratch, 1: use existing database)')
    # database index, only used when ioindex == 0 [whether to overwrite the old database]
    parser.add_argument('--databaseindex', type=int,
                        choices=[0, 1], default=0, help='Database index, only used when ioindex==0 (0: no action, 1: generate and overwrite old database)')
    # output image folder
    parser.add_argument('--imgout', type=str, default="image_save/",
                        help="Output path of the first-page image extraction")
    # Parse the arguments
    args = parser.parse_args()
print("=== Program Starts ===")
if args.index == 0 and (args.key is None or not args.key.strip()):
parser.error("--key is required when --index is 0.")
# Assigning values from arguments
rootfolder = args.rootfolder
key = [args.key]
index = args.index
img_output = f"{rootfolder}{args.imgout}"
# Create image output folder
os.makedirs(img_output, exist_ok=True)
print(f"Path '{img_output}' ensured.")
# The following options are only needed when index == 0
if index == 0:
ioindex = args.ioindex
databaseindex = args.databaseindex
else:
ioindex = None
databaseindex = None
print("=== Experiment Start ===")
start_time = time.time()
    try:
        if index == 0:
            file_name = "loaddata.json"
            file_path = os.path.join(os.getcwd(), file_name)
            if os.path.exists(file_path):
                timestamp = os.path.getmtime(file_path)
                formatted_timestamp = datetime.fromtimestamp(
                    timestamp).strftime('%H:%M at %m:%d:%Y')
                # output timestamp info
                print(
                    f"Database file with path: {file_path} generated at {formatted_timestamp}")
            else:
                print("No existing database found")
            if ioindex == 0:
                print("== Starting search from scratch ==")
                # keyword search demo
                cutthreshold = 1000
                keywordresult, allresult, allabstract, alltext, alltitle, renamecnt = article_search_by_words(
                    rootfolder, key, cutthreshold, img_output)
                resultlist = keywordresult
                # sanity check: the per-file lists must have matching lengths
                assert len(allresult) == len(
                    allabstract) == len(alltext) == len(alltitle)
                if databaseindex == 1:
                    generate_json(allresult, allabstract,
                                  alltext, alltitle, renamecnt)
            elif ioindex == 1:
                print("== Using the existing database ==")
                san1 = time.time()
                # keyword search demo, but using the pregenerated database
                # no need for multithreading here, since this path is already efficient
                try:
                    resultlist = []
                    sepkeystrig = splitkey(key)
                    with open(file_path, "r") as json_file:
                        data = json.load(json_file)
                    # make sure the keys stay consistent with the generate_json function in article_search_helper.py
                    allpath = data["the_path"]
                    allabstract = data["this_abstract"]
                    print(f"Total number of files: {len(allpath)}")
                    # searching over a list is efficient, so parallel computation is not needed
                    for i in range(len(allpath)):
                        theabstract = allabstract[i]
                        if all([x in theabstract for x in sepkeystrig]):
                            resultlist.append(allpath[i])
                except Exception as e:
                    print(e)
                san2 = time.time()
                print(f"Sanity Check: {san2 - san1}s")
            # whether to open the selected files
            openindex = input(
                f"Do you want to open these {len(resultlist)} files? (1: yes, 0: no) ")
            # open the files (in the default application) if permitted
            # DON'T DO THIS IF THERE ARE TOO MANY FILES IN THE SEARCH RESULT
            if int(openindex) == 1:
                print(f"Search Files: {resultlist}")
                for i in resultlist:
                    open_pdf_file(i)
        elif index == 1:
            # duplication search demo
            cutthreshold = 1000
            duplicateresult = duplicate_search_by_words_and_photos(
                rootfolder, cutthreshold, img_output, 0.99)
            resultlist = duplicateresult
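            # NOTE: the trailing 0.99 is assumed to be a similarity threshold for the
            # word/photo comparison; see duplicate_search_by_words_and_photos in helper
            # for its exact meaning.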
            # resultlist = merge_tuple(resultlist)
            print(resultlist)
            deleteindex = input("Delete these files? (1: yes, 0: no) ")
            if int(deleteindex) == 1:
                cnt = delete_files(resultlist)
                print(f"Total number of files for deletion: {cnt}")
        end_time = time.time()
        running_time = end_time - start_time
        print(f"Program running time: {int(running_time)}s")
    # user interrupt
    except KeyboardInterrupt:
        print("Interrupted by user")
    finally:
        # Delete cache folder
        delete_pycache(os.getcwd())