diff --git a/etlers/ArubaSyslog/build.sbt b/etlers/ArubaSyslog/build.sbt new file mode 100644 index 0000000..8c4def8 --- /dev/null +++ b/etlers/ArubaSyslog/build.sbt @@ -0,0 +1,15 @@ +name := "ArubaSyslog" + +version := "1.0" + +scalaVersion := "2.10.4" // For Spark 1.3.0 + +conflictManager := ConflictManager.latestRevision + +libraryDependencies += "org.apache.spark" %% "spark-core" % "1.2.0" % "provided" + +libraryDependencies += "joda-time" % "joda-time" % "2.6" + +libraryDependencies += "org.apache.commons" % "commons-math3" % "3.5" + +libraryDependencies += "org.yaml" % "snakeyaml" % "1.15" diff --git a/etlers/ArubaSyslog/project/build.properties b/etlers/ArubaSyslog/project/build.properties new file mode 100644 index 0000000..d638b4f --- /dev/null +++ b/etlers/ArubaSyslog/project/build.properties @@ -0,0 +1 @@ +sbt.version = 0.13.8 \ No newline at end of file diff --git a/etlers/ArubaSyslog/project/plugins.sbt b/etlers/ArubaSyslog/project/plugins.sbt new file mode 100644 index 0000000..58b2aae --- /dev/null +++ b/etlers/ArubaSyslog/project/plugins.sbt @@ -0,0 +1,3 @@ +logLevel := Level.Warn + +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") \ No newline at end of file diff --git a/etlers/ArubaSyslog/src/main/java/cn/edu/sjtu/omnilab/odh/rawfilter/APToBuilding.java b/etlers/ArubaSyslog/src/main/java/cn/edu/sjtu/omnilab/odh/rawfilter/APToBuilding.java new file mode 100644 index 0000000..80a9a82 --- /dev/null +++ b/etlers/ArubaSyslog/src/main/java/cn/edu/sjtu/omnilab/odh/rawfilter/APToBuilding.java @@ -0,0 +1,97 @@ +package cn.edu.sjtu.omnilab.odh.rawfilter; + +import org.yaml.snakeyaml.Yaml; +import org.yaml.snakeyaml.constructor.SafeConstructor; + +import java.io.InputStream; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +/** + * Given a AP name string, this function return the description of the building. + * Specifically for AP name, both full name string (the default mode, e.g. + * BYGTSG-4F-01) and building name (e.g. BYGTSG) can be used. + * If only building name is given, you can save processing time to declare this + * method with @full_apname param. But for accuracy, the full AP name is preferred. + * + * @author chenxm + */ +public class APToBuilding { + private static final String AP_BUILDING_DATABASE = "/APNamesUTF8.yaml"; + private Map> APNameDB; + private boolean full_apname = true; + private Map APBN_RealBN_Cache = new HashMap(); + + public APToBuilding(){ + this(APToBuilding.class.getResourceAsStream(AP_BUILDING_DATABASE), true); + } + + public APToBuilding(boolean full_apname){ + this(APToBuilding.class.getResourceAsStream(AP_BUILDING_DATABASE), full_apname); + } + + @SuppressWarnings("unchecked") + public APToBuilding(InputStream APDBYAML, boolean full_apname) { + this.full_apname = full_apname; + // Load yaml database + Yaml yaml = new Yaml(new SafeConstructor()); + Map regexConfig = (Map) yaml.load(APDBYAML); + APNameDB = (Map>) regexConfig.get("apprefix_sjtu"); + } + + public List parse(String APName){ + List result = null; + if ( APName == null ) + return result; + + if ( full_apname ) { // Given full AP name string + String[] parts = APName.split("-\\d+F-", 2); + String buildName = parts[0]; + + // Remove MH- prefix + if (buildName.startsWith("MH-")) + buildName = buildName.substring(3, buildName.length()); + + // Check cache first + if ( APBN_RealBN_Cache.containsKey(buildName) ) { // Cache hit + String cacheRealBN = APBN_RealBN_Cache.get(buildName); + result = getBuildInfo(cacheRealBN); + } else { // Cache miss + if ( APNameDB.containsKey(buildName)) { + result = getBuildInfo(buildName); + APBN_RealBN_Cache.put(buildName, buildName); + } else { + // Worst case; try to find its longest matched building name + String realBuildName = null; + for ( String BN : APNameDB.keySet()) + if ( buildName.contains(BN) ) + if ( realBuildName == null ) + realBuildName = BN; + else if ( BN.length() > realBuildName.length() ) + realBuildName = BN; // Get the longest match + if ( realBuildName != null ){ + result = getBuildInfo(realBuildName); + // Cache the real building name + APBN_RealBN_Cache.put(buildName, realBuildName); + } + } + } + } else { // Given build name, skip cache actions + if ( APNameDB.containsKey(APName) ) // Have item + result = getBuildInfo(APName); + } + + return result; + } + + private List getBuildInfo(String realBuildName){ + List info = new LinkedList(); + Map buildInfo = APNameDB.get(realBuildName); + info.add(buildInfo.get("dsp")); + info.add(buildInfo.get("typ")); + info.add(buildInfo.get("usr")); + return info; + } +} diff --git a/etlers/ArubaSyslog/src/main/java/cn/edu/sjtu/omnilab/odh/rawfilter/WIFICode.java b/etlers/ArubaSyslog/src/main/java/cn/edu/sjtu/omnilab/odh/rawfilter/WIFICode.java new file mode 100644 index 0000000..613f5ce --- /dev/null +++ b/etlers/ArubaSyslog/src/main/java/cn/edu/sjtu/omnilab/odh/rawfilter/WIFICode.java @@ -0,0 +1,12 @@ +package cn.edu.sjtu.omnilab.odh.rawfilter; + + +public class WIFICode { + public static final int AuthRequest = 0; + public static final int Deauth = 1; + public static final int AssocRequest = 2; + public static final int Disassoc = 3; + public static final int UserAuth = 4; + public static final int IPAllocation = 5; + public static final int IPRecycle = 6; +} diff --git a/etlers/ArubaSyslog/src/main/java/cn/edu/sjtu/omnilab/odh/rawfilter/WIFILogFilter.java b/etlers/ArubaSyslog/src/main/java/cn/edu/sjtu/omnilab/odh/rawfilter/WIFILogFilter.java new file mode 100644 index 0000000..507f0bf --- /dev/null +++ b/etlers/ArubaSyslog/src/main/java/cn/edu/sjtu/omnilab/odh/rawfilter/WIFILogFilter.java @@ -0,0 +1,236 @@ +package cn.edu.sjtu.omnilab.odh.rawfilter; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +/** + * Cleanse and format Aruba Wifi logs by filtering out unwanted log numbers. + * + * Input: + * Raw or original Aruba syslog files. + * + * Output: + * 6c71d96d8c4d,2013-10-11 23:50:53,0,XXY-3F-09 + * 6c71d96d8c4d,2013-10-11 23:50:53,2,XXY-3F-09 + * + * Message encoding: see WifiCode.java + * + * BN: + * Before Oct. 14, 2013, WiFi networks in SJTU deploy the IP address of 111.186.0.0/18. + * But after that (or confirmedly Oct. 17, 2013), local addresses are utlized to save the IP + * resources. New IP addresses deployed in WiFi networks are: + * Local1: 1001 111.186.16.1/20, New: 10.185.0.0/16 + * Local2: 1001 111.186.33.1/21, New: 10.186.0.0/16 + * Local3: 1001 111.186.40.1/21, New: 10.188.0.0/16 + * Local4: 1001 111.186.48.1/20, New: 10.187.0.0/16 + * Local5: 1001 111.186.0.1/20, New: 10.184.0.0/16 + * + * + * @Author chenxm, gwj + */ +public class WIFILogFilter { + + /** + * Filter the raw wifilogs with regex utilities. + * + * @param rawLogEntry + * @throws java.io.FileNotFoundException + * @throws java.io.IOException + */ + public static String filterData(String rawLogEntry) throws IOException { + + // Message CODE + final int[] CODE_AUTHREQ = {501091, 501092, 501109}; + final int[] CODE_AUTHRES = {501093, 501094, 501110}; + final int[] CODE_DEAUTH = {501105, 501080, 501098, 501099, 501106, 501107, 501108, 501111}; // from and to + final int[] CODE_ASSOCREQ = {501095, 501096, 501097}; + final int[] CODE_ASSOCRES = {501100, 501101, 501112}; + final int[] CODE_DISASSOCFROM = {501102, 501104, 501113}; + final int[] CODE_USERAUTH = {522008, 522042, 522038}; // Successful and failed + final int[] CODE_USRSTATUS = {522005, 522006, 522026}; // User Entry added, deleted, and user miss + final int[] CODE_USERROAM = {500010}; + + final String regPrefix = "(\\w+\\s+\\d+\\s+(?:\\d{1,2}:){2}\\d{1,2}\\s+\\d{4})"; + final String regUserMac = "((?:[0-9a-f]{2}:){5}[0-9a-f]{2})"; + final String regApInfo = "((?:\\d{1,3}\\.){3}\\d{1,3})-((?:[0-9a-f]{2}:){5}[0-9a-f]{2})-([\\w-]+)"; + + // time: group(1), usename: group(2), apip: group(3), apmac: group(4), apname: group(5) + final Pattern REG_AUTHREQ = Pattern.compile(String.format("%s(?:.*)Auth\\s+request:\\s+%s:?\\s+(?:.*)AP\\s+%s", regPrefix, regUserMac, regApInfo), Pattern.CASE_INSENSITIVE); + // time: group(1), usename: group(2), apip: group(3), apmac: group(4), apname: group(5) + final Pattern REG_AUTHRES = Pattern.compile(String.format("%s(?:.*)Auth\\s+(success|failure):\\s+%s:?\\s+AP\\s+%s", regPrefix, regUserMac, regApInfo), Pattern.CASE_INSENSITIVE); + // time: group(1), usename: group(2), apip: group(3), apmac: group(4), apname: group(5) + final Pattern REG_DEAUTH = Pattern.compile(String.format("%s(?:.*)Deauth(?:.*):\\s+%s:?\\s+(?:.*)AP\\s+%s", regPrefix, regUserMac, regApInfo), Pattern.CASE_INSENSITIVE); + // time: group(1), usename: group(2), apip: group(3), apmac: group(4), apname: group(5) + final Pattern REG_ASSOCREQ = Pattern.compile(String.format("%s(?:.*)Assoc(?:.*):\\s+%s(?:.*):?\\s+(?:.*)AP\\s+%s", regPrefix, regUserMac, regApInfo), Pattern.CASE_INSENSITIVE); + // time: group(1), usename: group(2), apip: group(3), apmac: group(4), apname: group(5) + final Pattern REG_DISASSOCFROM = Pattern.compile(String.format("%s(?:.*)Disassoc(?:.*):\\s+%s:?\\s+AP\\s+%s", regPrefix, regUserMac, regApInfo), Pattern.CASE_INSENSITIVE); + // time: group(1), usename: group(2), usermac: group(3), userip: group(4), apname: group(5) + final Pattern REG_USERAUTH = Pattern.compile(String.format("%s(?:.*)\\s+username=([^\\s]+)\\s+MAC=%s\\s+IP=((?:\\d{1,3}\\.){3}\\d{1,3})(?:.+)(?:AP=([^\\s]+))?", regPrefix, regUserMac), Pattern.CASE_INSENSITIVE); + // time: group(1), usermac: group(2), userip: group(3) + final Pattern REG_USRSTATUS = Pattern.compile(String.format("%s(?:.*)MAC=%s\\s+IP=((?:111\\.\\d+|10\\.18[4-8])(?:\\.\\d+){2})", regPrefix, regUserMac), Pattern.CASE_INSENSITIVE); + + String cleanLog = null; + String[] chops = new String[0]; + try { + chops = rawLogEntry.split("<", 3); + } catch (Exception e) { + return cleanLog; + } + + if (chops.length < 3 || chops[2].length() == 0 || chops[2].charAt(0) != '5') + return cleanLog; + + int messageCode = Integer.valueOf(chops[2].split(">", 2)[0]); + if (hasCodes(messageCode, CODE_AUTHREQ)) { // Auth request + Matcher matcher = REG_AUTHREQ.matcher(rawLogEntry); + if (matcher.find()) { + String time = formattrans(matcher.group(1)); + String usermac = matcher.group(2).replaceAll(":", ""); + String apname = matcher.group(5); + cleanLog = String.format("%s,%s,%s,%s", usermac, time, WIFICode.AuthRequest, apname); + } + } else if (hasCodes(messageCode, CODE_DEAUTH)) { // Deauth from and to + Matcher matcher = REG_DEAUTH.matcher(rawLogEntry); + if (matcher.find()) { + String time = formattrans(matcher.group(1)); + String usermac = matcher.group(2).replaceAll(":", ""); + String apname = matcher.group(5); + cleanLog = String.format("%s,%s,%s,%s", usermac, time, WIFICode.Deauth, apname); + } + } else if (hasCodes(messageCode, CODE_ASSOCREQ)) { // Association request + Matcher matcher = REG_ASSOCREQ.matcher(rawLogEntry); + if (matcher.find()) { + String time = formattrans(matcher.group(1)); + String usermac = matcher.group(2).replaceAll(":", ""); + String apname = matcher.group(5); + cleanLog = String.format("%s,%s,%s,%s", usermac, time, WIFICode.AssocRequest, apname); + } + } else if (hasCodes(messageCode, CODE_DISASSOCFROM)) { // Disassociation + Matcher matcher = REG_DISASSOCFROM.matcher(rawLogEntry); + if (matcher.find()) { + String time = formattrans(matcher.group(1)); + String usermac = matcher.group(2).replaceAll(":", ""); + String apname = matcher.group(5); + cleanLog = String.format("%s,%s,%s,%s", usermac, time, WIFICode.Disassoc, apname); + } + } else if (hasCodes(messageCode, CODE_USERAUTH)) { //username information, User authentication + Matcher matcher = REG_USERAUTH.matcher(rawLogEntry); + if (matcher.find()) { + String time = formattrans(matcher.group(1)); + String username = matcher.group(2); + String usermac = matcher.group(3).replaceAll(":", ""); + String userip = matcher.group(4); + String apname = null; + try { + apname = matcher.group(5); + } catch (Exception e) { + //apname is null if it is not there + } + cleanLog = String.format("%s,%s,%s,%s,%s,%s", usermac, time, WIFICode.UserAuth, username, userip, apname); + } + } else if (hasCodes(messageCode, CODE_USRSTATUS)) { // User entry update status + Matcher matcher = REG_USRSTATUS.matcher(rawLogEntry); + if (matcher.find()) { + String time = formattrans(matcher.group(1)); + String usermac = matcher.group(2).replaceAll(":", ""); + String userip = matcher.group(3); + int action = WIFICode.IPAllocation; // IP bond + if (messageCode == 522005) { + action = WIFICode.IPRecycle; + } + /* From the output, we see multiple IPAllocation message between + * the first allocation of specific IP and its recycling action. + */ + cleanLog = String.format("%s,%s,%s,%s", usermac, time, action, userip); + } + } + + return cleanLog; + } + + /** + * Helper to check if specific message code is contained. + * + * @param messageCode + * @param codes + * @return + */ + private static boolean hasCodes(int messageCode, int[] codes) { + boolean flag = false; + for (int i : codes) { + if (messageCode == i) { + flag = true; + break; + } + } + return flag; + } + + /** + * Put a user record into a map data structure. + * + * @param userMac + * @param record + * @param map + * @return + */ + private static int putRecordMap(String userMac, String record, Map> map) { + if (!map.containsKey(userMac)) { + map.put(userMac, new LinkedList()); + } + map.get(userMac).add(record); + return 0; + } + + //This function is used to change the date format from "May 4" to "2013-05-04" + private static String formattrans(String date_string){ + //Prepare for the month name for date changing + TreeMap month_tmap = new TreeMap(); + month_tmap.put("Jan", "01"); + month_tmap.put("Feb", "02"); + month_tmap.put("Mar", "03"); + month_tmap.put("Apr", "04"); + month_tmap.put("May", "05"); + month_tmap.put("Jun", "06"); + month_tmap.put("Jul", "07"); + month_tmap.put("Aug", "08"); + month_tmap.put("Sep", "09"); + month_tmap.put("Oct", "10"); + month_tmap.put("Nov", "11"); + month_tmap.put("Dec", "12"); + + //change the date from "May 4" to "2013-05-04" + // month: group(1), day: group(2), time: group(3), year: group(4) + String date_reg = "(\\w+)\\s+(\\d+)\\s+((?:\\d{1,2}:){2}\\d{1,2})\\s+(\\d{4})"; + Pattern date_pattern = Pattern.compile(date_reg); + Matcher date_matcher = date_pattern.matcher(date_string); + if(! date_matcher.find()) + return null; + + String year_string=date_matcher.group(4); + //change the month format + String month_string = date_matcher.group(1); + if(month_tmap.containsKey(month_string)){ + month_string = month_tmap.get(month_string); + }else{ + System.out.println("Can not find the month!!!"); + } + //change the day format + String day_string = date_matcher.group(2); + int day_int = Integer.parseInt(day_string); + if(day_int < 10){ + day_string = "0" + Integer.toString(day_int); + }else{ + day_string = Integer.toString(day_int); + } + String time_string = date_matcher.group(3); + + return String.format("%s-%s-%s %s", year_string, month_string, day_string, time_string); + } +} diff --git a/etlers/ArubaSyslog/src/main/resources/APNamesUTF8.yaml b/etlers/ArubaSyslog/src/main/resources/APNamesUTF8.yaml new file mode 100644 index 0000000..b33288b --- /dev/null +++ b/etlers/ArubaSyslog/src/main/resources/APNamesUTF8.yaml @@ -0,0 +1,644 @@ +%APNAMES 0.2 #Defining the prefixes of AP names in SJTU. @COPYRIGHT OMNILAB +--- +apprefix_sjtu: + BBY: + dsp: 标本园 + typ: SocBldg + usr: PUB + BYGTSG: + dsp: 包玉刚图书馆 + typ: LibBldg + usr: PUB + CL-A: + dsp: 材料A楼 + typ: AcadBldg + usr: MATR + CL-B: + dsp: 材料B楼 + typ: AcadBldg + usr: MATR + CL-C: + dsp: 材料C楼 + typ: AcadBldg + usr: MATR + CL-D: + dsp: 材料D楼 + typ: AcadBldg + usr: MATR + CL-E: + dsp: 材料E楼 + typ: AcadBldg + usr: MATR + CL-F: + dsp: 材料F楼 + typ: AcadBldg + usr: MATR + SXL: + dsp: 数学楼 + typ: AcadBldg + usr: MATH + WLL: + dsp: 物理楼 + typ: AcadBldg + usr: PHY + WLSYL: + dsp: 物理实验楼 + typ: AcadBldg + usr: PHY + GCLXSYZX: + dsp: 工程力学实验中心 + typ: AcadBldg + usr: PHY + STHJCLYJS: + dsp: 生态与环境材料研究所 + typ: AcadBldg + usr: ENV + GCXLZX-SYL: + dsp: 工程训练中心-思源楼 + typ: AcadBldg + usr: PUB + GCXLZX-B: + dsp: 工程训练中心-B楼 + typ: AcadBldg + usr: PUB + HJKXL: + dsp: 环境科学楼 + typ: AcadBldg + usr: ENV + HX-A: + dsp: 化学楼A + typ: AcadBldg + usr: CHEM + WXYXL: + dsp: 文选医学楼 + typ: AcadBldg + usr: MED + YXL-1: + dsp: 药学楼1号楼 + typ: AcadBldg + usr: PHAR + YXL-2: + dsp: 药学楼2号楼 + typ: AcadBldg + usr: PHAR + YXL-3: + dsp: 药学楼3号楼 + typ: AcadBldg + usr: PHAR + YXL-4: + dsp: 药学楼4号楼 + typ: AcadBldg + usr: PHAR + YXL-5: + dsp: 药学楼5号楼 + typ: AcadBldg + usr: PHAR + YXL-6: + dsp: 药学楼6号楼 + typ: AcadBldg + usr: PHAR + YXL-7: + dsp: 药学楼7号楼 + typ: AcadBldg + usr: PHAR + FXCSZX: + dsp: 分析测试中心 + typ: AcadBldg + usr: PUB + FXCSZX-1: + dsp: 分析测试中心1号楼 + typ: AcadBldg + usr: PUB + FXCSZX-2: + dsp: 分析测试中心2号楼 + typ: AcadBldg + usr: PUB + FXCSZX-3: + dsp: 分析测试中心3号楼 + typ: AcadBldg + usr: PUB + JXDLXY: + dsp: 机械与动力工程学院 + typ: AcadBldg + usr: ME + JXDLXY-A-Z: + dsp: 机械与动力工程学院A楼-主楼 + typ: AcadBldg + usr: ME + JXDLXY-A-F: + dsp: 机械与动力工程学院A楼-附楼 + typ: AcadBldg + usr: ME + JXDLXY-B-N: + dsp: 机械与动力工程学院B楼-北楼 + typ: AcadBldg + usr: ME + JXDLXY-B-E: + dsp: 机械与动力工程学院B楼-东楼 + typ: AcadBldg + usr: ME + JXDLXY-B-S: + dsp: 机械与动力工程学院B楼-南楼 + typ: AcadBldg + usr: ME + JXDLXY-C: + dsp: 机械与动力工程学院C楼-附楼 + typ: AcadBldg + usr: ME + WDZXY: + dsp: 微电子学院 + typ: AcadBldg + usr: IC + NXSWXY: + dsp: 农学生物学院 + typ: AcadBldg + usr: ARGI + NXSWXY-F1: + dsp: 农学生物学院-附楼一 + typ: AcadBldg + usr: ARGI + NXSWXY-F2: + dsp: 农学生物学院-附楼二 + typ: AcadBldg + usr: ARGI + NXSWXY-F3: + dsp: 农学生物学院-附楼三 + typ: AcadBldg + usr: ARGI + XTSWYJY: + dsp: 系统生物研究院 + typ: AcadBldg + usr: SCSB + XTSWYJY-A: + dsp: 系统生物研究院A区 + typ: AcadBldg + usr: SCSB + XTSWYJY-B: + dsp: 系统生物研究院B区 + typ: AcadBldg + usr: SCSB + XTSWYJY-C: + dsp: 系统生物研究院C区 + typ: AcadBldg + usr: SCSB + XTSWYJY-D: + dsp: 系统生物研究院D区 + typ: AcadBldg + usr: SCSB + MLXY: + dsp: 木兰学院 + typ: AcadBldg + usr: NAOCE + MLXY-S: + dsp: 木兰学院南楼 + typ: AcadBldg + usr: NAOCE + MLXY-N: + dsp: 木兰学院北楼 + typ: AcadBldg + usr: NAOCE + KYFXY: + dsp: 凯原法学院 + typ: AcadBldg + usr: LAW + MTSJSYL: + dsp: 媒体与设计实验室 + typ: AcadBldg + usr: MEDIA + MTSJSYL-A: + dsp: 媒体与设计实验室A楼 + typ: AcadBldg + usr: MEDIA + MTSJSYL-B: + dsp: 媒体与设计实验室B楼 + typ: AcadBldg + usr: MEDIA + WYL: + dsp: 外语楼 + typ: AcadBldg + usr: FRLAN + RWL: + dsp: 人文楼 + typ: AcadBldg + usr: SS + DXQL: + dsp: 电信群楼 + typ: AcadBldg + usr: SEIEE + DXQL-1: + dsp: 电信群楼1号楼 + typ: AcadBldg + usr: SEIEE + DXQL-2: + dsp: 电信群楼2号楼 + typ: AcadBldg + usr: SEIEE + DXQL-3: + dsp: 电信群楼3号楼 + typ: AcadBldg + usr: SEIEE + DXQL-4-N: + dsp: 电信群楼4号楼北楼 + typ: AcadBldg + usr: SEIEE + DXQL-4-S: + dsp: 电信群楼4号楼南楼 + typ: AcadBldg + usr: SEIEE + DXQL-5: + dsp: 电信群楼5号楼 + typ: AcadBldg + usr: SEIEE + DXQL-LianLang: + dsp: 电信群楼3F连廊 + typ: AcadBldg + usr: SEIEE + LLDCYJS: + dsp: 燃料电池研究所 + typ: AcadBldg + usr: PHY + JZG: + dsp: 建筑馆 + typ: AcadBldg + usr: ARCH + ZYXY: + dsp: 志远学院 + typ: AcadBldg + usr: ZHIYUAN + ZYNYL: + dsp: 中意能源楼 + typ: AcadBldg + usr: PHY + WLXXZX: + dsp: 网络信息中心 + typ: AdmBldg + usr: PUB + CRQ: + dsp: 陈瑞球楼 + typ: AdmBldg + usr: PUB + CRQ-N: + dsp: 陈瑞球北楼 + typ: AdmBldg + usr: PUB + CRQ-S: + dsp: 陈瑞球南楼 + typ: AdmBldg + usr: PUB + LXZL: + dsp: 老行政楼 + typ: AdmBldg + usr: PUB + XXZL: + dsp: 新行政楼 + typ: AdmBldg + usr: PUB + XXZL-B: + dsp: 新行政楼B楼 + typ: AdmBldg + usr: PUB + XXZL-A: + dsp: 新行政楼A楼 + typ: AdmBldg + usr: PUB + LWLZX: + dsp: 闵行老网络中心 + typ: AdmBldg + usr: PUB + XNTYG: + dsp: 西南体育馆-南体 + typ: AthlBldg + usr: PUB + XTYG: + dsp: 新体育馆-近沧源路 + typ: AthlBldg + usr: PUB + BQC: + dsp: 棒球场 + typ: AthlBldg + usr: PUB + DYST: + dsp: 第一食堂 + typ: CantBldg + usr: PUB + DEST: + dsp: 第二食堂 + typ: CantBldg + usr: PUB + D3ST: + dsp: 第三食堂 + typ: CantBldg + usr: PUB + DSST: + dsp: 第四食堂 + typ: CantBldg + usr: PUB + DWST: + dsp: 第五食堂 + typ: CantBldg + usr: PUB + LiuYuan: + dsp: 留园 + typ: CantBldg + usr: PUB + HLGC: + dsp: 哈乐小吃广场 + typ: CantBldg + usr: PUB + XYY: + dsp: 校医院 + typ: HospBldg + usr: PUB + XYY-N: + dsp: 校医院北楼 + typ: HospBldg + usr: PUB + XYY-S: + dsp: 校医院南楼 + typ: HospBldg + usr: PUB + TSG: + dsp: 图书馆 + typ: LibBldg + usr: PUB + TSG-1: + dsp: 图书馆一区 + typ: LibBldg + usr: PUB + TSG-2: + dsp: 图书馆二区 + typ: LibBldg + usr: PUB + TSG-3: + dsp: 图书馆三区 + typ: LibBldg + usr: PUB + TSG-4: + dsp: 图书馆四区 + typ: LibBldg + usr: PUB + JJT: + dsp: 菁菁堂 + typ: SocBldg + usr: PUB + CJMSG: + dsp: 程及美术馆 + typ: SocBldg + usr: PUB + XSSWZX: + dsp: 学生事务中心-铁生馆 + typ: SocBldg + usr: PUB + GBL: + dsp: 光彪楼 + typ: SocBldg + usr: PUB + YFKJL: + dsp: 逸夫科技楼 + typ: SocBldg + usr: PUB + XSFWZX: + dsp: 学生服务中心 + typ: SocBldg + usr: PUB + JJC: + dsp: 基建处 + typ: SuppBldg + usr: PUB + DHZJF: + dsp: 电话总机房 + typ: SuppBldg + usr: PUB + MHHCS: + dsp: 闵行候车室 + typ: SuppBldg + usr: PUB + XSY: + dsp: 西上院 + typ: TeachBldg + usr: PUB + XZY: + dsp: 西中院 + typ: TeachBldg + usr: PUB + XXY: + dsp: 西下院 + typ: TeachBldg + usr: PUB + DSY: + dsp: 东上院 + typ: TeachBldg + usr: PUB + DZY: + dsp: 东中院 + typ: TeachBldg + usr: PUB + DZY-1: + dsp: 东中院一区 + typ: TeachBldg + usr: PUB + DZY-2: + dsp: 东中院二区 + typ: TeachBldg + usr: PUB + DZY-3: + dsp: 东中院三区 + typ: TeachBldg + usr: PUB + DZY-4: + dsp: 东中院四区 + typ: TeachBldg + usr: PUB + DXY: + dsp: 东下院 + typ: TeachBldg + usr: PUB + XSSS: + dsp: 学生宿舍 + typ: DormBldg + usr: PRVT + XSSS-E: + dsp: 学生宿舍东区 + typ: DormBldg + usr: PRVT + XSSS-W: + dsp: 学生宿舍西区 + typ: DormBldg + usr: PRVT + XSSS-S: + dsp: 学生宿舍南区 + typ: DormBldg + usr: PRVT + XSSS-N: + dsp: 学生宿舍北区 + typ: DormBldg + usr: PRVT + student-D: + dsp: 学生宿舍D + typ: DormBldg + usr: PRVT + student-X: + dsp: 学生宿舍X + typ: DormBldg + usr: PRVT + student-W: + dsp: 学生宿舍W + typ: DormBldg + usr: PRVT + student: + dsp: 学生宿舍 + typ: DormBldg + usr: PRVT + southdoor: + dsp: 凯旋门-待定 + typ: GateBldg + usr: PUB + XH-XJL: + dsp: 徐汇新建楼 + typ: AcadBldg + usr: SIPA + XH-XYY: + dsp: 徐汇校医院 + typ: HospBldg + usr: PUB + XH-XBL: + dsp: 徐汇小白楼 + typ: AcadBldg + usr: EBIO + XH-FXL: + dsp: 徐汇法学楼 + typ: AcadBldg + usr: LAW + XH-ZY: + dsp: 徐汇中院 + typ: TeachBldg + usr: PUB + XH-HRDS: + dsp: 徐汇浩然大厦 + typ: AdmBldg + usr: PUB + XH-JGHDZX: + dsp: 徐汇教工活动中心 + typ: SocBldg + usr: PUB + XH-XZY: + dsp: 徐汇新中院 + typ: TeachBldg + usr: PUB + XH-GCG: + dsp: 徐汇工程馆 + typ: AcadBldg + usr: PUB + XH-TSG: + dsp: 徐汇图书馆 + typ: LibBldg + usr: PUB + XH-XINSY: + dsp: 徐汇新上院 + typ: TeachBldg + usr: PUB + XH-TYG: + dsp: 徐汇体育馆 + typ: AthlBldg + usr: PUB + XH-ZBGT: + dsp: 徐汇总办公厅 + typ: AdmBldg + usr: PUB + XH-BWC: + dsp: 徐汇保卫处 + typ: AdmBldg + usr: PUB + HX-LTSG: + dsp: 徐汇老图书馆 + typ: LibBldg + usr: PUB + HX-GCG: + dsp: 徐汇工程馆 + typ: AcadBldg + usr: PUB + HX-HRDS: + dsp: 徐汇浩然大厦 + typ: AdmBldg + usr: PUB + FH-ATL: + dsp: 法华安泰综合楼 + typ: AcadBldg + usr: MNG + FH-BL: + dsp: 法华北楼 + typ: AdmBldg + usr: PUB + FH-NL: + dsp: 法华南楼 + typ: AdmBldg + usr: PUB + FH-1: + dsp: 法华1号楼 + typ: N/A + usr: N/A + FH-2: + dsp: 法华2号楼 + typ: N/A + usr: N/A + FH-3: + dsp: 法华3号楼 + typ: N/A + usr: N/A + FH-4: + dsp: 法华4号楼 + typ: N/A + usr: N/A + FH-5: + dsp: 法华5号楼 + typ: N/A + usr: N/A + FH-6: + dsp: 法华6号楼 + typ: N/A + usr: N/A + FH-7: + dsp: 法华7号楼 + typ: N/A + usr: N/A + FH-8: + dsp: 法华8号楼 + typ: N/A + usr: N/A + FH-9: + dsp: 法华9号楼 + typ: N/A + usr: N/A + FH-10: + dsp: 法华10号楼 + typ: N/A + usr: N/A + QB-WLZX: + dsp: 七宝网络中心 + typ: AdmBldg + usr: PUB + 6c: + dsp: 第六食堂-待定 + typ: CantBldg + usr: PUB + CLAL: + dsp: 材料A楼 + typ: AcadBldg + usr: MATR + NYCL: + dsp: 能源材料 + typ: AcadBldg + usr: MATR + WWQT: + dsp: 伍威权堂 + typ: SocBldg + usr: PUB + TEST: + dsp: 测试点 + typ: N/A + usr: N/A + test: + dsp: 测试点 + typ: N/A + usr: N/A +... \ No newline at end of file diff --git a/etlers/ArubaSyslog/src/main/scala/cn/edu/sjtu/omnilab/odh/spark/CleanseWifiLogs.scala b/etlers/ArubaSyslog/src/main/scala/cn/edu/sjtu/omnilab/odh/spark/CleanseWifiLogs.scala new file mode 100644 index 0000000..fe1a289 --- /dev/null +++ b/etlers/ArubaSyslog/src/main/scala/cn/edu/sjtu/omnilab/odh/spark/CleanseWifiLogs.scala @@ -0,0 +1,61 @@ +package cn.edu.sjtu.omnilab.odh.spark + +import cn.edu.sjtu.omnilab.odh.rawfilter.{WIFILogFilter} +import org.apache.spark.{SparkConf, SparkContext} + +case class CleanWIFILog(MAC: String, time: Long, code: Int, payload: String) // payload as AP or IP + + +/** + * Cleanse raw WIFI syslog into movement data. + */ +object CleanseWifiLogs { + + final val mergeSessionThreshold: Long = 10 * 1000 + + def main(args: Array[String]): Unit = { + + if ( args.length < 2) { + println("Usage: CleanseWifiLogs ") + sys.exit(-1) + } + + val input = args(0) + val output = args(1) + + println(input) + + val conf = new SparkConf() + conf.setAppName("Cleanse WIFI syslog into movement data") + val spark = new SparkContext(conf) + + val inRDD = spark.textFile(input) + .map { m => { + + val filtered = WIFILogFilter.filterData(m) + var cleanLog: CleanWIFILog = null + + if (filtered != null) { + val parts = filtered.split(',') + + cleanLog = CleanWIFILog(MAC = parts(0), + time = Utils.ISOToUnix(parts(1)), + code = parts(2).toInt, + payload = parts(3)) + + } + + cleanLog + + }} + .filter(_ != null) + + .map(m => "%s,%d,%d,%s".format(m.MAC, m.time, m.code, m.payload)) + + .saveAsTextFile(output) + + spark.stop() + + } + +} diff --git a/etlers/ArubaSyslog/src/main/scala/cn/edu/sjtu/omnilab/odh/spark/Utils.scala b/etlers/ArubaSyslog/src/main/scala/cn/edu/sjtu/omnilab/odh/spark/Utils.scala new file mode 100644 index 0000000..a136d53 --- /dev/null +++ b/etlers/ArubaSyslog/src/main/scala/cn/edu/sjtu/omnilab/odh/spark/Utils.scala @@ -0,0 +1,206 @@ +package cn.edu.sjtu.omnilab.odh.spark + +import com.google.common.net._ +import org.joda.time.DateTime +import org.joda.time.format.DateTimeFormat + +import scala.collection.immutable.HashMap + + +object Utils { + + /** + * Parse integer value from a string + * @param s a string + * @return an Some(Int) or None + */ + def parseLong(s: String, default: Long): Long = { + try { + s.toLong + } catch { + case e: Exception => default + } + } + + /** + * Parse double value from a string + * @param s a string + * @return an Some(Double) or None + */ + def parseDouble(s: String, default: Double): Double = { + try { + s.toDouble + } catch { + case e: Exception => default + } + } + + /** + * Remove params of a URL string + * @param s a URL string + * @return a clean URL without params + */ + def stripURLParams(s: String): String = { + if (s == null) return null + + val pattern = """^(?:\w+://)?([^\?&]+)\??""".r + val matched = pattern.findFirstMatchIn(s) + matched match { + case Some(m) => m.group(1) + case None => s + } + } + + /** + * Remove protocol prefix of a URL string + * @param s a URL string + * @return a new string without prefix + */ + def stripURLSchema(s: String): String = { + if (s == null) return null + + val pattern = """^(?:\w+:?//)?(.*)$""".r + val matched = pattern.findFirstMatchIn(s) + matched match { + case Some(m) => m.group(1) + case None => s + } + } + + /** + * Check if a given URL takes schema "http://" + * @param url + * @return + */ + def hasURLSchema(url: String): Boolean = { + if (url == null) return false + val pattern = """^(\w+:?//).*""".r + pattern.findFirstIn(url).nonEmpty + } + + /** + * Remove protocol prefix and params of a URL string + * @param s + * @return + */ + def stripURL(s: String): String = { + if (s == null) return null + stripURLSchema(stripURLParams(s)) + } + + /** + * Concatenate Host and URI fileds in HTTP header into a URL address. + * @param host + * @param uri + */ + def combineURL(host: String, uri: String): String = { + var new_url = uri + if (!hasURLSchema(uri)) + new_url = host + uri + new_url + } + + /** + * Get host name from a given Host or URL string + * @param url a Host or URL string, e.g. http://baidu.com/index.html + * @return the host name, e.g. baidu.com + */ + def getHost(url: String): String = { + if (url == null) return null + + val pattern = """^(?:\w+:?//)?([^:\/\?&]+)""".r + val matched = pattern.findFirstMatchIn(url) + matched match { + case Some(m) => m.group(1) + case None => null + } + } + + /** + * Extract parameters from given URL string + * @param url + * @return + */ + def getURLParams(url: String): Map[String, String] = { + if (url == null) return null + + val mainUrlPattern = """^(?:\w+://)?(?:[^\?&]+)(\?.*)?$""".r + val urlParamPattern = """([^?=&]+)=([^?&=]+)""".r + + // do param extraction + def _extractParams(s: String): Map[String, String] = { + val result = new HashMap[String, String] + urlParamPattern.findAllMatchIn(s).foreach{ + case m => result.updated(m.group(1), m.group(2)) + } + result + } + + val mainUrlMatched = mainUrlPattern.findFirstMatchIn(url) + mainUrlMatched match { + case Some(m) => _extractParams(m.group(1)) + case None => null + } + } + + /** + * Get the top-level domain from URL. + * For example, given url "www.baidu.com", this UDF returns "baidu.com" as results. + * + * @param url + * @return + */ + def getTopPrivateDomain(url: String): String = { + if (url == null) return null + + val host = getHost(url) + try { + val name = InternetDomainName.from(host).topPrivateDomain.toString + // NB: there is a compiling error with `name()` method + val pattern = """name=([^=\\{\\}]+)""".r + pattern.findFirstMatchIn(name).get.group(1) + } catch { + case e: Exception => host + } + } + + /** + * Detect the mobile OS name given the User Agent string. + * "unknown" is returned if not identified. + * + * @param user_agent + * @return + */ + def getMobileName(user_agent: String): String = { + if (user_agent == null) return "unknown" + + val pattern = """android|(bb\\d+|meego).+mobile|avantgo|bada\\/|blackberry|blazer|compal|docomo|dolfin|dolphin|elaine|fennec|hiptop|iemobile|(hpw|web)os|htc( touch)?|ip(hone|od|ad)|iris|j2me|kindle( fire)?|lge |maemo|midp|minimo|mmp|netfront|nokia|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\\/|plucker|playstation|pocket|portalmmm|psp|series(4|6)0|symbian|silk-accelerated|skyfire|sonyericsson|treo|tablet|touch(pad)?|up\\.(browser|link)|vodafone|wap|webos|windows (ce|phone)|wireless|xda|xiino|zune""".r + val matched = pattern.findFirstMatchIn(user_agent) + matched match { + case Some(m) => m.group(0) + case None => "unknown" + } + } + + val timeFormatString = "" + + /** + * Converts ISO8601 datetime strings to Unix Time Longs (milliseconds) + * @param isotime + * @return + */ + def ISOToUnix(isotime: String): Long = { + val fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss") + fmt.parseDateTime(isotime).getMillis + } + + /** + * Converts Unix Time Longs (milliseconds) to ISO8601 datetime strings + * @param unixtime + * @return + */ + def UnixToISO(unixtime: Long): String = { + new DateTime(unixtime).formatted("yyyy-MM-dd HH:mm:ss") + } + +}