Produce first batch of classifiers #250

Merged · 2 commits · Jul 15, 2024
182 changes: 31 additions & 151 deletions imrs/imrs_classifier.py
@@ -18,118 +18,15 @@
import numpy as np
from sklearn.linear_model import LinearRegression
import random

def file_has_header(imrs_file):
    has_header = False
    # get the first line
    line = ""
    for line in open(imrs_file, "r"):
        break
    if len(line) > 0:
        parts = line.split(",")
        if len(parts) > 1:
            try:
                int(parts[1])
                print("No header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...")
                has_header = False
            except ValueError:
                print("Header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...")
                has_header = True
    return has_header

def load_imrs_to_frame(imrs_file):
    if file_has_header(imrs_file):
        df = pd.read_csv(imrs_file)
    else:
        df = pd.read_csv(imrs_file, header=None, names=imrs.imrs_headers, dtype={"network": str}, index_col=False)
    return df

def protected_ratio(v, d):
    r = 0
    if d > 0:
        r = v/d
    return r

def protected_count(x, r, keys):
    # count the keys whose value exceeds r times the largest value,
    # or simply the non-zero keys when r == 0
    s = r
    if r > 0:
        mx = 0
        for k in keys:
            if x[k] > mx:
                mx = x[k]
        s = r*mx
    count = 0
    for k in keys:
        if x[k] > s:
            count += 1
    return count

def reset_d31(x, keys):
    # day-overflow fix: d31 is whatever the daily columns do not account for
    s = 0
    for k in keys:
        s += x[k]
    d31 = x["queries"] - s
    if d31 < 0:
        d31 = 0
    return d31

def compute_nb_tlds(x):
    tld_count = 0
    for tld in [ "COM", "NET", "ORG", "INFO", "CN", "IN", "DE", "US" ]:
        if x[tld] > 0:
            tld_count += 1
    tld_count += int(x["TLDs"])
    return tld_count

def compute_nb_slds(x):
    sld_count = 0
    for sld in [ "RESOLVER", "EC2", "CLOUD", "WPAD", "CORP", "MAIL", "_TCP", "PROD" ]:
        if x[sld] > 0:
            sld_count += 1
    sld_count += int(x["SLDs"])
    if sld_count < 1:
        sld_count = 1
    return sld_count
import imrs_pandas
from imrs_pandas import print_stats, plot_or_save, example_and_count, print_names, print_mean

def compute_l10_sa(x, y, n, intercept):
    d = float(intercept)
    for i in range(len(y)):
        d += float(x[n[i]]*y[i])
    return d
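# Usage note (a sketch, not part of this diff): compute_l10_sa is applied
# row by row further down with a fitted LinearRegression lr, so
#   compute_l10_sa(row, lr.coef_.T, names, lr.intercept_[0])
# is the per-row equivalent of the vectorized lr.predict(df[names]).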

def print_stats(x_df, name):
    print(name)
    x_des = x_df.describe()
    print(x_des.transpose())
    x_cor = x_df.corr()
    print(x_cor)

def plot_or_save(plot_dir, image_name):
    if plot_dir == "-":
        plt.show()
    else:
        image_path = join(plot_dir, image_name)
        plt.savefig(image_path)

def example_and_count(df, name):
    count = df.shape[0]
    queries = 0
    network = ""
    # sample up to 13 rows; report the busiest network in the sample
    sample = df.sample(min(13, count))
    nb_rows = sample.shape[0]
    # print(name + ": samples = " + str(nb_rows) + ", out of " + str(count))
    sdp = sample[["network", "queries"]]
    sdp_np = sdp.to_numpy()
    # print("Sample shape: " + str(sdp.shape))
    # print("Sdp_np shape: " + str(np.shape(sdp_np)))
    for i in range(np.shape(sdp_np)[0]):
        if sdp_np[i,1] > queries:
            queries = sdp_np[i,1]
            network = str(sdp_np[i,0])
    print(name + ": count=" + str(count) + ", network=" + network)
    return count, network

# main
if len(sys.argv) != 2 and len(sys.argv) != 3:
    for x in range(0, len(sys.argv)):
@@ -141,54 +38,16 @@ def example_and_count(df, name):
if len(sys.argv) == 3:
    plot_dir = sys.argv[2]

full_df = load_imrs_to_frame(imrs_file)
full_df = imrs_pandas.load_imrs_to_frame(imrs_file)
print("Loaded full")
# apply corrections for the day-overflow bug:
# ignore d00, it is always 0
# compute d31 = queries - sum(d01..d30)
# compute arpa = arpa0 - d31
days = [
    "d01", "d02", "d03", "d04", "d05", "d06", "d07", "d08", "d09", "d10",
    "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
    "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
    "d31" ]
full_df["d31"] = full_df.apply(lambda x: reset_d31(x, days[:-1]), axis=1)
full_df["arpa"] = full_df["arpa0"] - full_df["d31"]

print("Computed corrections")
# compute the good column
full_df["good"] = full_df.apply(lambda x: x["queries"] - x["no_such"], axis=1)
# compute the ratio of good over APNIC
full_df["r_good_apnic"] = full_df.apply(lambda x: protected_ratio(x["good"], x["APNIC"]), axis=1)

# compute log10 column of queries and apnic
full_df["l10_q"] = np.log10(full_df["queries"])
full_df["l10_a"] = np.log10(2*full_df["APNIC"] + 1)
full_df["l10_g"] = np.log10(2*full_df["good"] + 1)
full_df["l_tld"] = np.log10(2*full_df["TLDs"] + 1)
full_df["l_sld"] = np.log10(2*full_df["SLDs"] + 1)
# add columns for ratios
for d in [ "no_such", "AAAA", "NS", "PTR", "NSEC", "SOA", "APNIC" ]:
r_d = "r_" + d
full_df[r_d] = full_df[d] / full_df["queries"]

full_df["r_arpa"] = full_df["arpa"] / (2*full_df["queries"])

full_df["r_COM"] = full_df.apply(lambda x: protected_ratio(x["COM"], x["queries"] - x["no_such"]), axis=1)
full_df["r_INFO"] = full_df.apply(lambda x: protected_ratio(x["INFO"], x["queries"] - x["no_such"]), axis=1)
print("Computed ratios")

hours = ["h00", "h01", "h02", "h03", "h04", "h05", "h06", "h07", "h08", "h09",
         "h10", "h11", "h12", "h13", "h14", "h15", "h16", "h17", "h18", "h19",
         "h20", "h21", "h22", "h23" ]

full_df["h_count"] = full_df.apply(lambda x: protected_count(x, 0, hours), axis=1)
full_df["d_count"] = full_df.apply(lambda x: protected_count(x, 0, days), axis=1)
print("Computed hours")

imrs_pandas.imrs_corrections(full_df)
print("Applied corrections")

# First, study the APNIC Data
# get APNIC subset
apnic_df = full_df[full_df["l10_a"] > 0]
apnic_selected = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ]

# select 4 subsets, based on 2 variables
apnic_loneg_df = apnic_df[apnic_df["r_no_such"] < 0.1]
@@ -199,6 +58,13 @@ def example_and_count(df, name):
apnic_hineg_loap_df = apnic_hineg_df[apnic_hineg_df["r_good_apnic"] < 300]
apnic_hineg_hiap_df = apnic_hineg_df[apnic_hineg_df["r_good_apnic"] >= 300]

print_names(apnic_loneg_loap_df)
print_mean(apnic_loneg_loap_df,"apnic_loneg_loap_df", apnic_selected)
print_mean(apnic_loneg_hiap_df,"apnic_loneg_hiap_df", apnic_selected)
print_mean(apnic_hineg_loap_df,"apnic_hineg_loap_df", apnic_selected)
print_mean(apnic_hineg_hiap_df,"apnic_hineg_hiap_df", apnic_selected)
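
# A sketch of the two print helpers (an assumption: both live in
# imrs_pandas.py, which is not part of this diff). print_names presumably
# lists the frame's columns or networks, and print_mean the per-column
# means of a selection, roughly:
def print_mean_sketch(df, name, selected):
    print(name)
    print(df[selected].mean(numeric_only=True))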


# select 4 subsets, based on 2 variables
apnic_loneg_df = apnic_df[apnic_df["r_no_such"] < 0.1]
apnic_hineg_df = apnic_df[apnic_df["r_no_such"] >= 0.1]
@@ -222,6 +88,14 @@ def example_and_count(df, name):
apnic_hineg_hiap_df.plot.scatter(ax=axb, x="queries", y="APNIC", alpha=0.5, color="red")
plot_or_save(plot_dir, "apnic-queries.jpg")

axtld = apnic_df.plot.scatter(x="APNIC", y="TLDs", alpha=0.5, logx=True, logy=False, color="blue")
plot_or_save(plot_dir, "tlds-apnic.jpg")
axcom = apnic_df.plot.scatter(x="APNIC", y="COM", alpha=0.5, logx=True, logy=False, color="blue")
plot_or_save(plot_dir, "com-apnic.jpg")
axnosuch = full_df.plot.scatter(x="queries", y="r_no_such", alpha=0.5, logx=True, logy=False, color="blue")
apnic_df.plot.scatter(ax=axnosuch, x="queries", y="r_no_such", alpha=0.5, color="orange")
plot_or_save(plot_dir, "no_such-queries.jpg")

# plot APNIC/Queries/no_such
axb = apnic_loneg_loap_df.plot.scatter(x="queries", y="r_no_such", alpha=0.5, logx=True, logy=False, color="blue")
apnic_loneg_hiap_df.plot.scatter(ax=axb, x="queries", y="r_no_such", alpha=0.5, color="green")
@@ -236,7 +110,6 @@ def example_and_count(df, name):

# study the APNIC correlations
# get a view of only the important columns
apnic_selected = [ "network", "l10_q", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l_tld", "l_sld", "l10_a" ]
full_selected_df = full_df[apnic_selected]
apnic_selected_df = full_selected_df[full_selected_df["l10_a"] > 0]

@@ -260,6 +133,7 @@ def example_and_count(df, name):
full_df["l10_sa"] = full_df.apply(lambda x: compute_l10_sa(x, lr.coef_.T, apnic_coeffs[:-1], lr.intercept_[0]), axis=1)
full_df["l10_gsa"] = full_df.apply(lambda x: x["l10_g"] - x["l10_sa"], axis=1)
#print(list(full_df))

apnic_coeffs_x = [ "network", "l10_sa", "l10_gsa", "l10_q", "r_no_such", "l10_a", "queries" ]
full_data_x_df = full_df[apnic_coeffs_x]
# print(list(full_data_x_df))
@@ -274,7 +148,7 @@ def example_and_count(df, name):
#print_stats(full_df["l10_sa", "l10_a"], "full_df")

# apply regression to classify not APNIC traffic
notap_df = full_data_x_df[full_data_x_df["l10_a"] == 0]
notap_df = full_df[full_df["l10_a"] == 0]

axp = notap_df.plot.scatter(x="queries", y="r_no_such", alpha=0.1, logx=True, logy=False, color="blue")
apnic_df.plot.scatter(ax=axp, x="queries", y="r_no_such", alpha=0.2, color="orange")
@@ -318,4 +192,10 @@ def example_and_count(df, name):
example_and_count(notap_loneg_loap_df, "notap_loneg_loap_df (blue)")
example_and_count(notap_loneg_hiap_df, "notap_loneg_hiap_df (green)")
example_and_count(notap_hineg_loap_df, "notap_hineg_loap_df (orange)")
example_and_count(notap_hineg_hiap_df, "notap_hineg_hiap_df (red)")
example_and_count(notap_hineg_hiap_df, "notap_hineg_hiap_df (red)")

print_names(notap_loneg_loap_df)
print_mean(notap_loneg_loap_df,"notap_loneg_loap_df", apnic_selected)
print_mean(notap_loneg_hiap_df,"notap_loneg_hiap_df", apnic_selected)
print_mean(notap_hineg_loap_df,"notap_hineg_loap_df", apnic_selected)
print_mean(notap_hineg_hiap_df,"notap_hineg_hiap_df", apnic_selected)
123 changes: 123 additions & 0 deletions imrs/imrs_classifier2.py
@@ -0,0 +1,123 @@
#
# Exploration of the ipstats file for each network
#

import sys
import traceback
import random
import time
import concurrent.futures
import math
import os
from os import listdir
from os.path import isfile, isdir, join
import imrs
from imrs import parse_imrs_volume_only, apnic_record
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
import random
import imrs_pandas
from imrs_pandas import print_stats, save_stats, save_selected_stats, \
    plot_or_save, plot_and_explore, example_and_count, \
    print_names, print_mean


# main
if len(sys.argv) != 2 and len(sys.argv) != 3:
    for x in range(0, len(sys.argv)):
        print(str(x) + ":" + sys.argv[x])
    print("Usage: imrs_classifier2.py <imrs_ratio csv file> [<img_folder>]")
    exit(1)
imrs_file = sys.argv[1]
plot_dir = "-"
out_file = sys.stdout
if len(sys.argv) == 3:
    plot_dir = sys.argv[2]
    csv_path = join(plot_dir, "stats.csv")
    out_file = open(csv_path, "w")
    out_file.write("frame, property, count, mean, std, min, c25%, c50%, c75%, max\n")


full_df = imrs_pandas.load_imrs_to_frame(imrs_file)
print("Loaded full")

imrs_pandas.imrs_corrections(full_df)
print("Applied corrections")

tracked = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ]
save_selected_stats(out_file, full_df, tracked, "full_df")
plot_and_explore(full_df, plot_dir, "full", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
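
# A sketch of plot_and_explore (an assumption: the real helper is in
# imrs_pandas.py, outside this diff). It presumably draws one scatter plot
# of x against each listed y column, with optional log axes, and hands each
# figure to plot_or_save under the given name prefix.
def plot_and_explore_sketch(df, plot_dir, prefix, x, y_list, lx=False, ly=False):
    for y in y_list:
        df.plot.scatter(x=x, y=y, alpha=0.5, logx=lx, logy=ly, color="blue")
        plot_or_save(plot_dir, prefix + "-" + y + ".jpg")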

# First, isolate the "small" nodes, defined
# as sending no more than 100 queries.
small_df = full_df[full_df["queries"] <= 100]
big_df = full_df[full_df["queries"] > 100]

save_selected_stats(out_file, small_df, tracked,"small_df")
save_selected_stats(out_file, big_df, tracked,"big_df")

# then, create three subsets of the big sites:
# ns_low: no-such < 5%
# ns_high: no-such > 90%
# ns_mid: in_between

ns_low = big_df[big_df["r_no_such"] < 0.05]
ns_other = big_df[big_df["r_no_such"] >= 0.05]
ns_high = ns_other[ns_other["r_no_such"] > 0.9]
ns_mid = ns_other[ns_other["r_no_such"] <= 0.9]

save_selected_stats(out_file, ns_other, tracked,"ns_other")
plot_and_explore(ns_other, plot_dir, "other", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

save_selected_stats(out_file, ns_low, tracked,"ns_low")
save_selected_stats(out_file, ns_high, tracked,"ns_high")
save_selected_stats(out_file, ns_mid, tracked,"ns_mid")

# At this stage, we have separated 4 groups.
# We will ignore the "small" group for now, because in the absence of
# traffic it is hard to classify anything.

plot_and_explore(ns_low, plot_dir, "low", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(ns_mid, plot_dir, "mid", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(ns_high, plot_dir, "high", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

# In the "low NS" group, the plot of TLDs versus queries shows a break
# at somewhere between 500 and 1000 TLDs seen. Above that line we find
# very few APNIC servers but many large non APNIC nodes. This could
# be nodes engaged in some kind of scanning process.

low_lt500t = ns_low[ns_low["TLDs"] <= 500]
low_gt500t = ns_low[ns_low["TLDs"] > 500]
save_selected_stats(out_file, low_lt500t, tracked,"low_lt500t")
save_selected_stats(out_file, low_gt500t, tracked,"low_gt500t")
plot_and_explore(low_lt500t, plot_dir, "low_lt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(low_gt500t, plot_dir, "low_gt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

# In the "high NS" group, there seems
# to be two interesting subgroups: more than 500 TLDs, as in the
# "low" case, and more than about 10^6 queries, which separates
# a bunch of high values from the bulk of APNNIC resolvers.

high_lt500t = ns_high[ns_high["TLDs"] <= 500]
high_gt500t = ns_high[ns_high["TLDs"] > 500]
save_selected_stats(out_file, high_lt500t, tracked,"high_lt500t")
save_selected_stats(out_file, high_gt500t, tracked,"high_gt500t")
plot_and_explore(high_lt500t, plot_dir, "high_lt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(high_gt500t, plot_dir, "high_gt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

# In the "mid" group, the pictures are murky. There seems to be
# a separation between resolvers with more than 1 million
# queries and others. (or is it 100K?)

mid_lt1Mq = ns_mid[ns_mid["queries"] <= 1000000]
mid_gt1Mq = ns_mid[ns_mid["queries"] > 1000000]

save_selected_stats(out_file, mid_lt1Mq, tracked,"mid_lt1Mq")
save_selected_stats(out_file, mid_gt1Mq, tracked,"mid_gt1Mq")
plot_and_explore(mid_lt1Mq, plot_dir, "mid_lt1Mq", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
plot_and_explore(mid_gt1Mq, plot_dir, "mid_gt1Mq", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)

if out_file != sys.stdout:
    out_file.close()