-
Notifications
You must be signed in to change notification settings - Fork 1
/
PhrasaFerret.py
executable file
·185 lines (150 loc) · 5.11 KB
/
PhrasaFerret.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#! /usr/bin/python3
import codecs
from lxml import etree # Lets use this as the xml parser, it has some more features :D
import sys
import fnmatch
import ScholarScrape
# This is Phrasal Ferret's collection of global variables
phrases = {} # Dictionary of phrases and frequencies
filename = ""
doc = "" # Holds the plain text vers of the doc
font = "" # keep track of fonts used (for finding italics)
italic_phrase = "" # temp phrase value
last_text_size = 0 # value to track varing font sizes (for finding title)
title = ""
temp = "" # holds the temporary title
size = 0
page = 0
# Phrasal Ferret wants some new phrases! But first it needs plain text, not xml
# It strips the branches of all the useful leaves to get plain text
def extract_xml():
global doc
root = tree.getroot()
allChars = root.findall(".//text")
for char in allChars:
try:
doc += char.text
except:
continue
find_repeats() # this calls the function that raises the error
# Phrasal Ferret liked those italic phrases so much
# Now it wants to find those phrases again!
# This looks for repeats of phrases in the document.
# Note: only accepts plain text, not xml.
def find_repeats():
global doc
global phrases
for key, value in phrases.items():
appearances = doc.count(key) # this is the broken line of code
#if phrase in doc:
phrases[key] += appearances
def get_title(text):
global title
global size
global last_text_size # is -1 if title ended
global temp
# is this a text element with a size?
if text.get("size"):
size = float(text.get("size"))
if size > last_text_size:
last_text_size = size
temp = text.text
elif size == last_text_size:
temp += text.text
elif size < last_text_size:
temp = temp.strip()
if len(temp) > len(title): # assuming titles will be the biggest and longest
title = temp
else: # if there is no char in this element, add a space
temp += " "
# Phrasal Ferret thinks italic phrases are damn tasty
# this function finds all of the italic phrases
def find_italic(textline):
global italic_phrase
global font
global size
global last_text_size
global page
for text in textline:
if not text.tag == "text":
continue
# Grab the title from the first page
if page == '1':
get_title(text)
# Is this char part of the current phrase?
if text.get("font"):
if font == text.get("font"):
italic_phrase += text.text
# is this the start of a a new phrase?
elif "Italic" in text.get("font"):
# save the last phrase before starting a new one
if len(italic_phrase) > 2:
italic_phrase = italic_phrase.strip()
if not italic_phrase in phrases:
phrases[italic_phrase] = 1
else:
phrases[italic_phrase] += 1
# clear old phrase and save the new one
italic_phrase = text.text
font = text.get("font")
else:
continue
elif len(italic_phrase) > 0: # if there is no font, add a space
italic_phrase += " "
# Phrasal Ferret excitedly wiggles it's nose!
# It scuttles along the branches in search for tasty text he can analyse.
def find_text():
global page
root = tree.getroot()
# are there leaves on this tree?
if len(root):
print("lets climb some branches!")
# climb through the first branches
for child in root:
# Grab the page number
if child.tag == "page":
page = child.get("id")
# Get the textbox
for textbox in child:
if not textbox.tag == "textbox":
continue
# Get the textline
for textline in textbox:
if not textline.tag == "textline":
continue
find_italic(textline)
def search_google():
query = raw_input("Would you like to search Google Scholar for this paper and its most common phrase? yes/no\n")
if query == "yes":
print("Searching...")
ScholarScrape.search(title + key)
elif query == "no":
print("done")
raise SystemExit
else:
print("**Please choose 'yes' or 'no'**")
search_google()
# ---- This is Phrasal Ferret's starting point, the first block of the program ---- #
if len(sys.argv) != 2:
print("Usage: FirstFerret.py <xml file> \nPhrasel Ferret likes to climb xml trees. They have the softest leaves and pleasant smelling flowers. Please try again with an xml file >^_^<")
raise SystemExit
elif fnmatch.fnmatch(sys.argv[1], "*.xml"):
filename = sys.argv[1]
try:
parser = etree.XMLParser(recover=True)
tree = etree.parse(codecs.open(filename, encoding = "UTF-8", errors='replace'), parser=parser)
except:
print("OH NO. Phrasal Ferret fell from the branch. (Looks like something went wrong)")
raise SystemExit
find_text()
else:
print("Phrasel Ferret farts in your general direction.")
# convert to plain text and find recurrances
extract_xml()
# What did Phrasal Ferret find? Here are the results
print(str(len(phrases)) + " italic phrases found")
key = max(phrases, key=phrases.get)
print("The most common italic phrase is:")
print(key, phrases[key])
print("title: " + title)
search_google()