Skip to content

Commit

Permalink
Add HTTP traffic repo
Browse files Browse the repository at this point in the history
  • Loading branch information
caesar0301 committed Dec 11, 2015
1 parent d32ebae commit 19cc497
Show file tree
Hide file tree
Showing 22 changed files with 323 additions and 6 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ to add new data sources to Layer0 or Layer1.

## Project structure

* `etlers`: ETL tools for each data repo.
* `etlers`: source code of ETL tools.

* `deploy`: folder to dploy binary ETL tools referred by `porters`.

* `porters`: automatic scripts to port a new repo periodically with ETL tools.

Expand All @@ -49,7 +51,7 @@ to add new data sources to Layer0 or Layer1.

* [WifiSyslog](https://github.com/OMNILab/OmniDataHouse/blob/master/repos/wifi_syslog.md)

* [WifiTraffic](#)
* [WifiTrafficHTTP](https://github.com/OMNILab/OmniDataHouse/blob/master/repos/wifi_traffic_http.md)


## Instructions to add a new repo.
Expand Down
Binary file added arch/OMNILabDataPlatform.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions deploy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This folder contains binary jars in deployment mode.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name := "ArubaSyslog"
name := "WifiToolkit"

version := "1.0"

Expand Down
File renamed without changes.
File renamed without changes.
12 changes: 12 additions & 0 deletions etlers/WifiToolkit/src/main/main.iml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/scala" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
package cn.edu.sjtu.omnilab.odh.spark


import cn.edu.sjtu.omnilab.odh.utils.{HttpSchema}
import org.apache.spark.{SparkContext, SparkConf}


object CleanseHttp {

def main(args: Array[String]): Unit = {

if (args.length < 2) {
println("Usage: CleanseHttp <in> <out>")
sys.exit(-1)
}

val input = args(0)
val output = args(1)

val conf = new SparkConf()
conf.setAppName("Cleanse HTTP logs")
val spark = new SparkContext(conf)

spark.textFile(input)

.map( m => {
val fields = splitLog(m)

if ( fields == null ) {
null
} else {
// Select required fields
"%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s".format(

// connection info
fields(HttpSchema.source_ip),
formatInt(fields(HttpSchema.source_port)),
fields(HttpSchema.dest_ip),
formatInt(fields(HttpSchema.dest_port)),
fields(HttpSchema.conn),

// connection timing metrics
formatDouble(fields(HttpSchema.conn_ts)),
formatDouble(fields(HttpSchema.close_ts)),
formatDouble(fields(HttpSchema.conn_dur)),
formatDouble(fields(HttpSchema.idle_time0)),
formatDouble(fields(HttpSchema.request_ts)),
formatDouble(fields(HttpSchema.request_dur)),
formatDouble(fields(HttpSchema.response_ts)),
formatDouble(fields(HttpSchema.response_dur_b)),
formatDouble(fields(HttpSchema.response_dur_e)),
formatDouble(fields(HttpSchema.idle_time1)),

// request/response sizes
formatInt(fields(HttpSchema.request_size)),
formatInt(fields(HttpSchema.response_size)),

// request keywords
fields(HttpSchema.request_method),
formatString(fields(HttpSchema.request_url)),
formatString(fields(HttpSchema.request_host)),
formatString(fields(HttpSchema.request_user_agent)),
formatString(fields(HttpSchema.request_referrer)),

// response keywords
formatInt(fields(HttpSchema.response_code)),
formatString(fields(HttpSchema.response_server)),
formatString(fields(HttpSchema.response_ctype))
)
}
})

.filter(m => m != null)

.saveAsTextFile(output)

}

/**
* Split each http log entry into fields.
* Null fields will be empty string.
*
* @param line a single entry of HTTP log
* @return an array of separated parts
*/
def splitLog(line: String): Array[String] = {
// get HTTP header fields
val chops = line.split("""\"\s\"""");
if ( chops.length != 21 )
return null

// get timestamps
val timestamps = chops(0).split(" ");
if (timestamps.length != 18 )
return null

val results = timestamps ++ chops.slice(1, 21)

// remove N/A values and extrat quote
results.transform( field => {
var new_field = field.replaceAll("\"", "")
if (new_field == "N/A")
new_field = ""
new_field
})

results
}


def formatDouble(field: String): String = {
try {
"%.3f".format(field.toDouble)
} catch {
case e: Exception => ""
}
}


def formatInt(field: String): String = {
try {
"%d".format(field.toInt)
} catch {
case e: Exception => ""
}
}


def formatString(field: String): String = {
try {
field.replaceAll("|", "")
} catch {
case e: Exception => field
}
}

}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package cn.edu.sjtu.omnilab.odh.spark

import cn.edu.sjtu.omnilab.odh.rawfilter.{WIFILogFilter}
import cn.edu.sjtu.omnilab.odh.utils.Utils
import org.apache.spark.{SparkConf, SparkContext}


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package cn.edu.sjtu.omnilab.odh.utils

/**
* Data Schema of SJTU HTTP logs
*/
object HttpSchema {

val source_ip = 0
val source_port = 1
val dest_ip = 2
val dest_port = 3
val conn = 4
val conn_ts = 5
val close_ts = 6
val conn_dur = 7
val idle_time0 = 8
val request_ts = 9
val request_dur = 10
val response_ts = 11
val response_dur_b = 12
val response_dur_e = 13
val idle_time1 = 14
val request_size = 15
val response_size = 16
val request_method = 17
val request_url = 18
val request_protocol = 19
val request_host = 20
val request_user_agent = 21
val request_referrer = 22
val request_conn = 23
val request_keep_alive = 24
val response_protocol = 25
val response_code = 26
val response_server = 27
val response_clen = 28
val response_ctype = 29
val response_cenc = 30
val response_etag = 31
val response_cache_ctl = 32
val response_last_mod = 33
val response_age = 34
val response_expire = 35
val response_connval = 36
val response_keep_alive = 37

}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package cn.edu.sjtu.omnilab.odh.spark
package cn.edu.sjtu.omnilab.odh.utils

import com.google.common.net._
import org.joda.time.DateTime
Expand Down
2 changes: 1 addition & 1 deletion porters/wifi_syslog.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ BASEDIR=$(dirname $0)/..
source $BASEDIR/global_config.sh

# Cleansing tools
BINJAR=$BASEDIR/etlers/ArubaSyslog/target/scala-2.10/ArubaSyslog-assembly-1.0.jar
BINJAR=$BASEDIR/deploy/WifiToolkit-assembly-1.0.jar
CLSNAME="cn.edu.sjtu.omnilab.odh.spark.CleanseWifiLogs"

# Check root path for raw data
Expand Down
2 changes: 1 addition & 1 deletion porters/wifi_syslog_session.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ BASEDIR=$(dirname $0)/..
source $BASEDIR/global_config.sh

# Cleansing tools
BINJAR=$BASEDIR/etlers/ArubaSyslog/target/scala-2.10/ArubaSyslog-assembly-1.0.jar
BINJAR=$BASEDIR/deploy/WifiToolkit-assembly-1.0.jar
CLSNAME="cn.edu.sjtu.omnilab.odh.spark.MergeWifiSession"

# Check HDFS path for clean wifi logs
Expand Down
56 changes: 56 additions & 0 deletions porters/wifi_traffic_http.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash

function clean_trash () {
hadoop fs -rm -r .Trash/Current > /dev/null
}

function die () {
echo "${@}"
exit 1
}

# Check permission
if [ `whoami` != 'omnilab' ]; then
die "Need permission of OMNILAB to run. Try user omnilab."
fi

# Global vars
BASEDIR=$(dirname $0)/..
source $BASEDIR/global_config.sh

# Cleansing tools
BINJAR=$BASEDIR/deploy/WifiToolkit-assembly-1.0.jar
CLSNAME="cn.edu.sjtu.omnilab.odh.spark.CleanseHttp"

# Check root path for raw data
if [ ! -d $WIFI_TRAFFIC_PATH ]; then
die "Cann't find path for archived traffic data: $WIFI_TRAFFIC_PATH"
fi

year=`date -d "yesterday" "+%Y"`
month=`date -d "yesterday" "+%m"`
day=`date -d "yesterday" "+%d"`

INPUT=$WIFI_TRAFFIC_PATH/$year$month/http/$year$month$day-*.gz
INPUT_TEMP=$HDFS_WIFI_TRAFFIC/HTTP/_temp
OUTPUT=$HDFS_WIFI_TRAFFIC/HTTP/$year$month$day

# Decompress files
for rawfile in `ls $INPUT`; do
echo $rawfile
rfname=${rawfile%.*}

if ! hadoop fs -test -e $INPUT_TEMP/`basename $rfname`; then
gunzip -c $rawfile | hadoop fs -put - $INPUT_TEMP/`basename $rfname`
fi
done

if ! hadoop fs -test -e $OUTPUT/_SUCCESS; then
hadoop fs -rm -r $OUTPUT
spark-submit2 --class $CLSNAME $BINJAR $INPUT_TEMP/$year$month$day* $OUTPUT
hadoop fs -rm -r $INPUT_TEMP
fi

clean_trash

exit 0;
61 changes: 61 additions & 0 deletions repos/wifi_traffic_http.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# WifiSyslog

This repo contains updated Wifi HTTP traffic logs.


## Data path

hdfs://user/omnilab/warehouse/WifiTraffic/HTTP


## Data format

There are 25 fields recorded and each line represents a request-response (RR) pair.

**Connection info.**

* [0] source_ip: string, the source IP address
* [1] source_port: int, the source port number of a connection
* [2] dest_ip: string, the destination IP address
* [3] dest_port: int, the destination port number of a connection
* [4] connection: string, position indicator of an RR pair in a connection

**Connection timings**

* [5] connect_ts: double, the timestamp to start the TCP connection
* [6] close_ts: double, the timestamp to terminate the TCP connection
* [7] connection_dur: double, the duration in seconds of the whole TCP connection
* [8] idle_time0: double, the idle interval between connection success and first byte of request
* [9] request_ts: double, the timestamp to send the first byte of request
* [10] request_dur: double, the duration to sending out request data
* [11] response_ts: double, the timestsamp to receive the first byte of response
* [12] response_dur_b: double, the idle interval between last byte of request and first byte of response
* [13] response_dur_e: double, the idle interval between first byte of response and last byte of response
* [14] idle_time1: double, the idle interval between response end and next new request or TCP connection end.

**RR volumes**

* [15] request_size: int, the size of request (including request header length)
* [16] response_size: int, the size of response (including response header length)

**Request keywords**

* [17] request_method: string, request method, such as GET, POST.
* [18] request_url: string, requested URL
* [19] request_host: string, remote server name
* [20] request_user_agent: string, indicator of user agent
* [21] request_referrer: string, request referrer

**Response keywords**

* [22] response_code: int, response statues
* [23] response_server: string, the remote server type
* [24] response_ctype: string, the content type


## Data sample

10.186.227.204|63851|114.66.198.57|80|start|1446307201.983||0.030|0.000|1446307202.129|0.000|1446307202.413|0.028|0.000|0.004|589|291|GET|/tools/FsPlatformAction?rprotocol=3*_*action=161.Foamii*_*actionresult=13888*_*actionobjectver=0*_*channelid=*_*mac=64D954A9C582*_*guid=83fced5c-ac31-4e78-880e-1a201ee9cff6*_*name=Aptupd*_*version=3.0.7.5*_*actiontime=|1|other*_*pullupname=FunWorks64*_*pullupversion=*_*cid=1799*_*aptid=1|3|stat.funshion.net|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.3; WOW64; Trident/7.0; .NET4.0E; .NET4.0C; .NET CLR 3.5.30729; .NET CLR 2.0.50727; .NET CLR 3.0.30729; Shuame; GWX:RESERVED) Funshion/1.0.0.1||200|nginx/1.2.2|text/plain
10.186.227.204|63851|114.66.198.57|80|continue|||||1446307202.449|0.000|1446307202.729|0.028|0.000|0.006|589|291|GET|/tools/FsPlatformAction?rprotocol=3*_*action=161.Foamii*_*actionresult=13825*_*actionobjectver=0*_*channelid=*_*mac=64D954A9C582*_*guid=83fced5c-ac31-4e78-880e-1a201ee9cff6*_*name=Aptupd*_*version=3.0.7.5*_*actiontime=|1|other*_*pullupname=FunWorks64*_*pullupversion=*_*cid=1799*_*aptid=1|3|stat.funshion.net|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.3; WOW64; Trident/7.0; .NET4.0E; .NET4.0C; .NET CLR 3.5.30729; .NET CLR 2.0.50727; .NET CLR 3.0.30729; Shuame; GWX:RESERVED) Funshion/1.0.0.1||200|nginx/1.2.2|text/plain
10.187.252.52|62241|121.195.187.54|80|unique|1446307202.270|1446307202.382|0.036|0.009|1446307202.315|0.000|1446307202.349|0.034|0.000|0.033|780|1349|POST|/sugg?ifc=4&em=4|s.wisdom.www.sogou.com|SogouPSI||200|nginx|
10.186.227.204|63851|114.66.198.57|80|last||1446307202.400|||1446307202.787|0.000|1446307202.107|0.028|0.000|0.293|579|291|GET|/tools/FsPlatformAction?rprotocol=3*_*action=161.Foamii*_*actionresult=13829*_*actionobjectver=0*_*channelid=*_*mac=64D954A9C582*_*guid=83fced5c-ac31-4e78-880e-1a201ee9cff6*_*name=Aptupd*_*version=3.0.7.5*_*actiontime=*_*pullupname=FunWorks64*_*pullupversion=*_*cid=1799*_*aptid=3|stat.funshion.net|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.3; WOW64; Trident/7.0; .NET4.0E; .NET4.0C; .NET CLR 3.5.30729; .NET CLR 2.0.50727; .NET CLR 3.0.30729; Shuame; GWX:RESERVED) Funshion/1.0.0.1||200|nginx/1.2.2|text/plain

0 comments on commit 19cc497

Please sign in to comment.