# MongoDB Installation Instructions:
=> http://docs.mongodb.org/manual/installation/
Just follow the instructions in the web guide, fairly simple stuff...
Debian => http://docs.mongodb.org/manual/tutorial/install-mongodb-on-debian/
^^ Follow above if you're using Kali
Ubuntu => http://docs.mongodb.org/manual/tutorial/install-mongodb-on-ubuntu/
Red Hat => http://docs.mongodb.org/manual/tutorial/install-mongodb-on-red-hat-centos-or-fedora-linux/
Linux Source => http://docs.mongodb.org/manual/administration/install-on-linux/
OSX => http://docs.mongodb.org/manual/tutorial/install-mongodb-on-os-x/
# OK, Steps to Query MongoDB Before/After Crawler Sessions:
# Guide by HR
# If you are on Debian/Ubuntu you can use the installer script 'mongodb_setup.sh' I slapped together
# It will get MongoDB installed on the local server and ready for the crawler
# This file aims to make you familiar enough with MongoDB interaction via Ruby to work with the results as you like
# Think custom parsers, additional analysis, etc... (a small end-to-end sketch is included at the bottom of this file)
# Start MongoDB
sudo service mongodb start
# Stop MongoDB
sudo service mongodb stop
# Restart MongoDB
sudo service mongodb restart
# Log into IRB in a separate terminal so you can query the MongoDB database instance(s) before, during, and/or after crawling:
irb
# Load mongo driver
require 'mongo'
# Connect to local MongoDB instance
mongo = Mongo::MongoClient.new("localhost", 27017)
# List the available Mongo Databases or Schemas
mongo.database_names
# Before Crawl => ["local", "test"]
# After Crawl => ["local", "anemone", "makeshift", "test"]
# anemone = default crawler results (NO extras, has body)
# makeshift = custom crawler results (Extras, NO body)
# Connect to a database for querying
mongodb = mongo.db('makeshift')
# Check DB Profiling level
puts "Profiling Level: #{mongodb.profiling_level}"
# List currently active/connected database name
mongodb.name
# List available collections in database
mongodb.collection_names
# Anemone Defaults => ["system.indexes", "pages"]
# Makeshift Defaults => ["system.indexes", "crawler_info"]
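# Side note (illustrative, not from the original guide): the same driver calls work against the
# default Anemone results too - the 'pages' collection name comes from the defaults listed above
anemone_db = mongo.db('anemone')
anemone_db.collection_names
anemone_db['pages'].count
# Peek at one stored page (the Anemone results keep the page body, per the notes above)
anemone_db['pages'].find_one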
# Set collection variable
coll = mongodb['crawler_info']
# Determine how large a collection is (i.e. how many entries it has)
mongodb['crawler_info'].size
mongodb['crawler_info'].count
OR
coll.count
coll.size
# List the first entry in the collection
# Response is a hash with all results stored
mongodb['crawler_info'].find_one
OR
coll.find_one
# Example First Entry in Makeshift database, crawler_info collection => {"_id"=>BSON::ObjectId('52c245423dc39f5b14000001'), "url"=>"http://site.com/", "depth"=>"0", "code"=>"200", "time"=>"485", "size"=>26752, "content-length"=>"26752", "redirect_to"=>"N/A", "key_headers"=>{"Server"=>"Apache"}, "cookies"=>["PHPSESSID=d398f893fbe3fd09ca8982411ed79291; path=/"], "title"=>"N/A", "description"=>"N/A", "keywords"=>"N/A", "generator"=>"N/A", "language"=>"N/A", "author"=>"N/A", "copyright"=>"None", "forms"=>4, "scripts"=>7, "iframes"=>0, "phpbb"=>"N/A", "vulnerable"=>"N/A", "emails"=>[]}
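# Illustrative addition: find_one returns a plain Ruby hash, so individual fields can be pulled
# out by key (keys taken from the example entry above)
entry = coll.find_one
puts entry['url']
puts entry['code']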
# Using the default makeshift results db
# Iterate through the specific details/findings for a single link or stored result
mongodb['crawler_info'].find_one.each do |k, v|
  print "#{k}: "
  if v.respond_to?('each')
    print "\n"
    if v.kind_of?(Hash)
      # Nested hash (e.g. key_headers) => print each sub key/value pair indented
      v.each do |a, b|
        puts "  #{a}: #{b}"
      end
    else
      # Array value (e.g. cookies, emails) => print each element indented
      v.each do |something|
        puts "  #{something.to_s.chomp}"
      end
    end
  else
    # Plain scalar value => print it on the same line as the key
    print "#{v}\n"
  end
end
# Iterate through all entries in the collection
# Row by row, display all fields reported on...
mongodb['crawler_info'].find.each do |row|
  row.each do |k, v|
    print "#{k.chomp}: "
    if v.respond_to?('each')
      print "\n"
      if v.kind_of?(Hash)
        # Nested hash (e.g. key_headers)
        v.each do |a, b|
          puts "  #{a}: #{b}".chomp
        end
      else
        # Array value (e.g. cookies, emails)
        v.each do |something|
          puts "  #{something.to_s.chomp}"
        end
      end
    else
      print "#{v}\n"
    end
  end
  puts "\n\n"
end
# Process in chunks of 10 rows at a time
coll.find.each_slice(10) do |slice|
  puts slice.inspect
end
OR
mongodb['crawler_info'].find.each_slice(10) do |slice|
  puts slice.inspect
end
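# Illustrative addition (assumes the 1.x driver cursor API used throughout this guide):
# if you only want a window of results rather than slices of everything, the cursor also
# chains skip/limit, e.g. rows 11-20
coll.find.skip(10).limit(10).to_a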
# Find how many fetched links had a response code of 200
coll.find("code" => '200').to_a.size
OR
mongodb['crawler_info'].find("code" => '200').to_a.size
# Find only broken links which had a response code of 404
# Results are placed in an array which can be enumerated using the methods already shown above
coll.find("code" => '404').to_a
OR
mongodb['crawler_info'].find("code" => '404').to_a
# Example Result => [{"_id"=>BSON::ObjectId('52c245503dc39f5b14000055'), "url"=>"http://site.com/placeholder.gif", "depth"=>"1", "code"=>"404", "time"=>"62", "size"=>11812, "content-length"=>"11812", "redirect_to"=>"N/A", "key_headers"=>{"Server"=>"Apache"}, "cookies"=>nil, "title"=>"N/A", "description"=>"N/A", "keywords"=>"N/A", "generator"=>"N/A", "language"=>"N/A", "author"=>"N/A", "copyright"=>"None", "forms"=>0, "scripts"=>3, "iframes"=>0, "phpbb"=>nil, "vulnerable"=>"N/A", "emails"=>[]}, {"_id"=>BSON::ObjectId('52c245523dc39f5b14000064'), "url"=>"http://site.com/Mango-404-52.html", "depth"=>"2", "code"=>"404", "time"=>"194", "size"=>11812, "content-length"=>"11812", "redirect_to"=>"N/A", "key_headers"=>{"Server"=>"Apache"}, "cookies"=>nil, "title"=>"N/A", "description"=>"N/A", "keywords"=>"N/A", "generator"=>"N/A", "language"=>"N/A", "author"=>"N/A", "copyright"=>"None", "forms"=>0, "scripts"=>3, "iframes"=>0, "phpbb"=>"N/A", "vulnerable"=>"N/A", "emails"=>[]}]
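# Illustrative addition: since the results are plain hashes, plain Ruby can boil the broken
# links down to just their URLs
broken = coll.find("code" => '404').to_a
broken.map { |row| row['url'] }
# => ["http://site.com/placeholder.gif", "http://site.com/Mango-404-52.html"] (from the example above)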
# Sort results based on crawler depth
coll.find("code" => '200').sort('depth').to_a
OR
mongodb['crawler_info'].find("code" => '200').sort('depth').to_a
# Filter results by crawler depth or other field values
# Limit results to those with a response code of 200 where depth = 1
mongodb['crawler_info'].find("code" => '200', "depth" => '1').to_a.size
# Example Result => 88
# Versus all results with a response code of 200
# mongodb['crawler_info'].find("code" => '200').to_a.size
# Example Result => 953
# Get a range of results based on a range of field values
# Get results where the response code is 200 and depth > 0 but <= 2
# Note: depth values are stored as strings, so these comparisons are lexicographic (fine for single-digit depths)
mongodb['crawler_info'].find("code" => '200', "depth" => {"$gt" => '0', "$lte" => '2'}).to_a.size
# Found with a crawl depth of 2 (depths 1 and 2 combined) => 952
mongodb['crawler_info'].find("code" => '200', "depth" => {"$gt" => '1', "$lte" => '2'}).to_a.size
# Found going from crawl depth 1 to crawl depth 2 (depth 2 only) => 864
mongodb['crawler_info'].find("code" => '200', "depth" => {"$gt" => '0', "$lte" => '1'}).to_a.size
# Found with a crawl depth of 1 => 88
mongodb['crawler_info'].find("code" => '200', "depth" => {"$lte" => '0'}).to_a.size
# Base crawl (depth 0) => 1
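# Illustrative addition (not part of the original guide): the same depth breakdown can be built
# in a single pass with plain Ruby by iterating the cursor and tallying a hash
depth_counts = Hash.new(0)
coll.find("code" => '200').each do |row|
  depth_counts[row['depth']] += 1
end
puts depth_counts.inspect
# With the example numbers above this should come out as {"0"=>1, "1"=>88, "2"=>864}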
# Search DB based on field name and regex pattern
mongodb['crawler_info'].find({"url" => /placeholder/}).to_a
# Example Result => [{"_id"=>BSON::ObjectId('52c245503dc39f5b14000055'), "url"=>"http://site.com/placeholder.gif", "depth"=>"1", "code"=>"404", "time"=>"62", "size"=>11812, "content-length"=>"11812", "redirect_to"=>"N/A", "key_headers"=>{"Server"=>"Apache"}, "cookies"=>nil, "title"=>"N/A", "description"=>"N/A", "keywords"=>"N/A", "generator"=>"N/A", "language"=>"N/A", "author"=>"N/A", "copyright"=>"None", "forms"=>0, "scripts"=>3, "iframes"=>0, "phpbb"=>nil, "vulnerable"=>"N/A", "emails"=>[]}]
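# Illustrative addition: regex matching combines with the other filters shown above,
# e.g. all .php pages that answered with a 200
mongodb['crawler_info'].find("url" => /\.php$/, "code" => '200').to_a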
# Delete all entries in the collection (the empty collection itself remains)
mongodb['crawler_info'].remove
OR
coll.remove
# Drop/Delete Collection Entirely
coll.drop
OR
mongodb['crawler_info'].drop
# Drop/Delete Database Entirely
mongo.drop_database("makeshift")
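# Putting it together - an illustrative end-to-end sketch (not part of the crawler itself):
# connect, pull the makeshift results, print a one-line summary per link, then close the
# connection; field names are the ones shown in the example entries above
require 'mongo'

mongo = Mongo::MongoClient.new("localhost", 27017)
coll  = mongo.db('makeshift')['crawler_info']

coll.find.each do |row|
  puts "#{row['code']}  depth=#{row['depth']}  #{row['url']}"
end

mongo.close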