urloop.rb · executable file · 350 lines (305 loc) · 8.43 KB
(repository archived by the owner on May 10, 2020; now read-only)
#!/usr/bin/env ruby
# coding: utf-8
require 'rubygems'
require 'bundler/setup'
Bundler.require
require "pp"
VERSION=0.1
# Config
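# Expected config.yml keys, collected from the lookups made later in this
# script; the values here are only illustrative placeholders:
#   debug: true
#   api:
#     user: "someuser"
#     pass: "somepass"
#     url: "https://scuttle.example.org/api/"
#   exclude_users: ["somebot"]
#   exclude_tags: ["nolink"]
#   exclude_no_tags: false
#   add_user_to_tags: true
#   logs:
#     dir: "/var/log/irc/mychannel"
#     format: "%Y-%m-%d.log"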
begin
@config = YAML.load_file('config.yml')
rescue Errno::ENOENT
puts "Please create a config.yml file."
exit 1
end
# Crap list
begin
@craplist = YAML.load_file('crap.yml')
rescue Errno::ENOENT
puts "Please create a crap.yml file, even empty."
exit 1
end
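# crap.yml is read as a YAML list of URL substrings; any URL containing one of
# them is dropped by urlInCrapList below. Illustrative content:
#   - doubleclick.net
#   - localhost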
# Synonyms
begin
@synonyms = YAML.load_file('synonyms.yml')
rescue Errno::ENOENT
puts "synonyms.yml : Why removing this innocent file ?"
exit 1
end
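# synonyms.yml is read as a YAML hash mapping a canonical tag to a list of its
# synonyms, used by fixTagsWithSynonyms below. Illustrative content:
#   ruby:
#     - rb
#     - ruby-lang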
# already parsed logs list
begin
@already_parsed_logs = YAML.load_file('logs_parsed.yml')
rescue Errno::ENOENT
File.open('logs_parsed.yml', 'w') {|file| file.puts([].to_yaml)}
@already_parsed_logs = YAML.load_file('logs_parsed.yml')
end
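# logs_parsed.yml is a plain YAML array of the log filenames already processed;
# it is created empty here and extended by addLogToVarAndSave as logs are handled.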
# lazy debug puts
def dputs(msg)
puts "[DEBUG] #{msg}" if @config['debug']
end
dputs "URloop version #{VERSION} - Debug true"
trap("INT") { itsAtrap("int") }
dputs "Now trap'ing: INT"
# Connect to the SemanticScuttle/Delicious API
d = WWW::Delicious.new(@config['api']['user'], @config['api']['pass'], :base_uri => @config['api']['url'])
begin
d.valid_account?
dputs "[API] Valid account"
rescue WWW::Delicious::ResponseError
# ignore: carry on even if the account check fails
end
def itsAtrap(trapsig)
case trapsig
when 'int'
trapIntSaveParsedLogs
else
trapIntSaveParsedLogs
end
end
def trapIntSaveParsedLogs
puts "Got trapped by a INT signal, going to save the parsed logs file and exit nicely."
File.open('logs_parsed.yml', 'w') {|file| file.puts(@already_parsed_logs.to_yaml)}
exit 0
end
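# Fetch a URL and return its page title. As the rescue clauses below show, the
# return value is nil when the fetch fails outright (the caller skips the URL),
# "" when the page is usable but no title can be extracted, and the title
# string otherwise. Illustrative call (hypothetical result):
#   getUrlTitle("http://example.org/") #=> "Example Domain"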
def getUrlTitle(url)
agent = Mechanize.new
agent.user_agent_alias = 'Mac Safari'
begin
doc = agent.get(url)
rescue Mechanize::ResponseCodeError => e
# raised for 404s and other non-success status codes
dputs "getUrlTitle error: #{e.message} for #{url}"
return nil
rescue OpenSSL::SSL::SSLError => e
dputs "getUrlTitle error: #{e.message} for #{url}"
return ""
rescue SocketError => e
dputs "getUrlTitle error: #{e.message} for #{url}"
return nil # sorry...
rescue Mechanize::UnsupportedSchemeError => e
["http://", "https://", "ftp://", "ftps://", "mailto://", "nntp://", "xmpp://"].each do |format|
if url.include? format
return "" # valid format
end
end
dputs "getUrlTitle error: #{e.message} for #{url}"
return nil # nope
rescue Errno::ETIMEDOUT, Errno::EHOSTUNREACH, Errno::ENETUNREACH, Net::HTTP::Persistent::Error => e
dputs "getUrlTitle error: #{e.message} for #{url}"
return nil
rescue Mechanize::RedirectLimitReachedError => e
dputs "getUrlTitle error: #{e.message} for #{url}"
return nil
rescue URI::InvalidURIError => e
dputs "getUrlTitle error: #{e.message} for #{url}"
return nil
end
if doc
begin
return (doc.title.nil? ? "" : doc.title)
rescue => e
dputs "getUrlTitle 'doc' error: #{e.message} for #{url}"
return ""
end
end
dputs "getUrlTitle, no doc, no title, wtf ? #{url}"
return ""
end
def addLogToVarAndSave(logname)
@already_parsed_logs << logname
File.open('logs_parsed.yml', 'w') {|file| file.puts(@already_parsed_logs.to_yaml)}
end
def urlInCrapList(url)
@craplist.each do |urlcrap|
if url.include? urlcrap
return true
end
end
return false
end
def userExcluded(user)
@config['exclude_users'].include? user
end
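# Strip whitespace and "#" markers from raw tag strings, e.g. (illustrative):
#   cleanTags([" #ruby ", "#web"]) #=> ["ruby", "web"]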
def cleanTags(tags)
tags.map { |tt| tt.strip.gsub("#", "") }
end
def urlHasTagExcluded(tags)
@config['exclude_tags'].each do |tag|
if tags.include? tag
return true
end
end
return false
end
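# Map each tag to its canonical form from synonyms.yml, leaving unknown tags
# untouched. Illustrative example, assuming synonyms.yml contains "ruby: [rb]":
#   fixTagsWithSynonyms(["rb", "web"]) #=> ["ruby", "web"]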
def fixTagsWithSynonyms(tags)
fixed_tags = []
tags.each do |tag|
t = nil
@synonyms.each_pair do |key, vals|
t = key if vals.include? tag
end
fixed_tags << (t || tag)
end
return fixed_tags
end
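# Prompt until an answer is given: returns true for "y", "Y" or "yes", false
# for "n", "N", "no" or "No", and asks again on anything else. Illustrative:
#   askUserYesOrNot("Post it? [y/n]: ")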
def askUserYesOrNot(question)
while true
print question
case gets.strip
when 'Y', 'y', 'yes'
return true
when /\A[nN]o?\Z/
break
end
end
return false
end
# scan for logs to parse: config['logs']['dir']
# scan the directory and exclude any log whose filename is already in the parsed-logs list
# also exclude the log-of-the-day from being parsed
@logs_to_scan = []
@logslist = []
if ARGV.empty?
@logslist = Dir.entries(@config['logs']['dir'])
else
@logslist = ARGV.dup
ARGV.clear
end
@log_of_the_day = Time.now.strftime @config['logs']['format']
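# config['logs']['format'] is a strftime pattern for log filenames; with an
# illustrative "%Y-%m-%d.log" today's log would be e.g. "2020-05-10.log", and it
# is skipped below, presumably because it is still being written to.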
@logslist.each do |log|
if @already_parsed_logs.include?(log) or @log_of_the_day == log or ['..', '.'].include? log
next
else
@logs_to_scan << log
end
end
dputs "Logs to scan: #{@logs_to_scan.join(', ')}"
# parse logs and grab urls
@logs = {}
@logs_to_scan.each do |log|
file = File.join(@config['logs']['dir'], log)
next if !File.readable?(file) or !File.exist?(file)
dputs "Parsing #{file}"
@no_urls = true
@log_urls=[]
l = File.open(file, 'r')
l.each do |line|
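# Each line is expected to look roughly like the following (illustrative,
# inferred from the regexps used below):
#   2013-04-02T14:03:12  <alice> have a look at http://example.org/ #ruby,web#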
line.encode!('UTF-8', 'UTF-8', :invalid => :replace)
local_urls = []
# 1/ extract urls
urls = URI.extract(line)
urls.each do |url|
if url =~ /^(http|gopher|mailto|ftp)/
local_urls << url
end
end
next if local_urls.empty?
# 2/ extract user and timestamp
things = line.match(/^(\d{4}-\d{2}-\d{2}\w\d{2}:\d{2}:\d{2})\s+<(.*)>\s/i)
user = things[2] if things
timestamp = things[1] if things
next if userExcluded(user) # excluded users, like lapool bot
# 3/ extract tags
tags = line.match(/^.*\#(.*),?\#$/)
if tags
tags = tags[1]
tags = tags.split(",")
else
tags = []
end
next if (tags.empty? and @config['exclude_no_tags'])
tags = cleanTags(tags)
next if urlHasTagExcluded(tags)
tags << user if @config["add_user_to_tags"]
# 4/ fill @urls with the new urls
# collect every valid URL found on this line
local_urls.each do |url|
next if urlInCrapList(url)
valid = false
title = getUrlTitle(url) # also used to verify if the url is valid (no 404)
# title is nil (fail), "" (no title, image), or a non empty string
if title.nil?
dputs "URL ignored: #{url} (request failed or returned a non-200 status)"
next
end
title.gsub!("\n", "")
title.gsub!("\r", "")
title.strip!
title = "no title" if title == ""
valid = true
begin
url = PostRank::URI.clean(url)
rescue Addressable::URI::InvalidURIError
valid = false
end
dputs "+++> #{valid ? 'valid' : 'invalid'} url '#{url}' w/ title '#{title}' tags: '#{tags}'"
if valid
@log_urls << {:url => url, :title => title, :user => user, :tags => fixTagsWithSynonyms(tags)}
@no_urls = false
dputs "-> URL found : '#{url}'"
dputs "-> User: '#{user}', tags : #{tags.join(', ')}"
end
end
end
@logs[log] = @log_urls if (@log_urls and !@log_urls.empty?)
addLogToVarAndSave(log) if @no_urls
dputs "Log with no urls :(" if @no_urls
dputs ""
l.close
end
# start working with the user
pp @logs
@old_log = nil
# @logs maps each log filename to the list of URLs found in it, e.g.:
# { "coin.log" => [ { :url => "foobar", :title => "coin", :user => "foo", :tags => ["bar", "baz"] }, ... ] }
@count_posted = 0
@logs.each_pair do |log,urls|
if @old_log != log
# new log to parse, save the previous one in the parsed-logs list
addLogToVarAndSave(@old_log) unless @old_log.nil?
urls.each do |url|
# url{ :user, url, tags, title }
# Show the url, with tag, and user to the user
puts ">>> #{url[:user]} posted #{url[:url]} with tags #{url[:tags].join(", ")}"
puts ">> Title: #{url[:title]}"
# Ask if we upload it
ret = askUserYesOrNot("Post this URL to the remote API? (if not, you will need to upload it manually) [y/n]: ")
if ret
# Adding post to scuttle, replacing if already exists
post = d.posts_get(:url => url[:url])
newpost = d.posts_add(:url => url[:url], :title => url[:title], :tags => url[:tags], :replace => true)
if newpost
puts "=> Post saved !"
@count_posted+=1
else
puts "=> Unsaved, error somewhere :("
end
else
next # dont save, switch to next url
end
end
end
@old_log = log
end
addLogToVarAndSave(@old_log) unless @old_log.nil? # the last one
puts "You have posted: #{@count_posted} links. OMNOMNOMNOM"