Init

WikiPlag · Sep 12, 2016 · b1e7d11 · b1e7d11
commit b1e7d11
Show file tree

Hide file tree

Showing 82 changed files with 56,265 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,23 @@
+**/bin/
+**/target/
+project/project
+.idea
+
+*.iml
+*.classpath
+*.properties
+*.project
+
+# #ignore everything except...->
+# *
+# !*.scala
+# !*.sc
+# !README.md
+# !*.xml
+# !*.iml
+# !*.classpath
+# !*.properties
+# !.gitignore
+# 
+# #even sub dirs
+# */
diff --git a/README.MD b/README.MD
@@ -0,0 +1,13 @@
+# WikiPlag - Plattform zur Erkennung von Wikipediaplagiaten
+
+## Übersicht
+
+Projektstudiumsaufgabe im SS16 an der HTW Berlin, betreut von Prof. Dr.-Ing. Hendrik Gärtner
+
+## try it out
+http://wikiplag.f4.htw-berlin.de/
+
+## Links
+[Redmine](https://studi.f4.htw-berlin.de/redmine/projects/0-wikiplag) 
+
+[Aufgabe](http://puck.f4.htw-berlin.de/hgaertner/veranstaltungen/projektstudium/ss-2016/)
diff --git a/analysisJob/src/main/scala/de/htw/ai/wikiplag/analysisJob/InputJobHandler.scala b/analysisJob/src/main/scala/de/htw/ai/wikiplag/analysisJob/InputJobHandler.scala
@@ -0,0 +1,41 @@
+package de.htw.ai.wikiplag.analysisJob
+
+import com.mongodb.{MongoCredential, ServerAddress}
+
+import de.htw.ai.wikiplag.forwardreferencetable.ForwardReferenceTableImp.buildForwardReferenceTable
+import de.htw.ai.wikiplag.parser.WikiDumpParser.extractPlainText
+import de.htw.ai.wikiplag.connection.MongoDBImpl
+
+object InputJobHandler {
+
+  /**
+    * Runs the similarity lookup for one input text.
+    *
+    * The text is tokenized with `extractPlainText`, the lower-cased tokens are turned into a
+    * forward reference table using `step`, and the keys of that table are looked up through
+    * `MongoDBImpl.findSimilarity`. Every result entry is a pair; the hits in its second
+    * component are regrouped so that their second components are collected per first component.
+    *
+    * @param text the raw input text to analyse
+    * @param step the n-gram step size handed to `buildForwardReferenceTable`
+    * @return the regrouped similarity result (it used to be computed and then discarded,
+    *         so callers only ever received `()`)
+    */
+  def handleJob(text: String, step: Int): Any = {
+    print(text)
+    val tokens = extractPlainText(text)
+    val hashes = buildForwardReferenceTable(tokens.map(_.toLowerCase()), step).toMap
+
+    MongoDBImpl.open(
+      new ServerAddress("hadoop03.f4.htw-berlin.de", 27020),
+      MongoCredential.createCredential("REPLACE-ME", "REPLACE-ME", "REPLACE-ME".toCharArray)
+    )
+
+    // Close the connection even when the lookup or the regrouping throws.
+    try {
+      // NOTE(review): the meaning of the second argument (7) is not visible here - confirm
+      // against MongoDBImpl.findSimilarity.
+      val siml = MongoDBImpl.findSimilarity(hashes.keys.toList, 7)
+      // NOTE(review): assumes findSimilarity returns a strict collection, so the result is
+      // fully built before the connection is closed - TODO confirm.
+      val fitSim = siml.map( hashSim =>
+        (hashSim._1, hashSim._2.groupBy(_._1).mapValues(list=>list.map(x=> x._2)).toList )
+      )
+      fitSim
+    } finally {
+      MongoDBImpl.close()
+    }
+  }
+
+  /**
+    * Filters the matches with [[filterMatches]] and attaches a ratio to every document of each
+    * remaining entry.
+    *
+    * The ratio is the number of documents of the entry divided by the total number of entries
+    * in `ngMatches`; it is computed as a floating point division (the former integer division
+    * truncated the value before it was converted to `Double`).
+    *
+    * @param ngMatches entries of the form (key, List((document, positions)))
+    * @param threshold lower bound (exclusive) for the ratio, see [[filterMatches]]
+    * @return for every kept entry: its key, its documents and a list of (document, ratio)
+    */
+  def generateWhitelist(ngMatches:List[(String, List[(String,List[Int])])], threshold:Double)
+  :List[(String, List[(String, List[Int])], List[(String, Double)])] = {
+    val matchesSize = ngMatches.size
+    filterMatches(ngMatches, threshold).map { item =>
+      // The ratio is the same for every document of the entry, so compute it once.
+      val ratio = item._2.size.toDouble / matchesSize
+      (item._1, item._2, item._2.map(doc => (doc._1, ratio)))
+    }
+  }
+
+  /**
+    * Keeps the entries whose document count divided by the total number of entries is
+    * strictly greater than `threshold`.
+    *
+    * The division is done in floating point; the former integer division truncated the
+    * ratio (to 0 whenever an entry had fewer documents than there were entries).
+    *
+    * @param ngMatches entries of the form (key, List((document, positions)))
+    * @param threshold lower bound (exclusive) for the ratio
+    * @return the entries of `ngMatches` that pass the threshold, in their original order
+    */
+  def filterMatches(ngMatches:List[(String, List[(String,List[Int])])], threshold:Double) :
+  List[(String, List[(String,List[Int])])] = {
+    val matchesSize = ngMatches.size
+    ngMatches.filter(item => item._2.size.toDouble / matchesSize > threshold)
+  }
+
+}
diff --git a/analysisJob/src/main/scala/de/htw/ai/wikiplag/analysisJob/TextInputJob.scala b/analysisJob/src/main/scala/de/htw/ai/wikiplag/analysisJob/TextInputJob.scala
@@ -0,0 +1,28 @@
+package de.htw.ai.wikiplag.analysisJob
+import com.typesafe.config.{Config, ConfigFactory}
+import org.apache.spark._
+import scala.util.Try
+import spark.jobserver._
+
+/**
+  * Spark job-server job that hands the configured `text` and `step` to
+  * `InputJobHandler.handleJob`.
+  */
+object TextInputJob extends SparkJob {
+
+  /**
+    * Local entry point: creates a local Spark context, validates an (empty) configuration and
+    * runs the job only if the validation succeeds. The context is always stopped afterwards.
+    *
+    * @param args command line arguments (not used)
+    */
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setMaster("local[4]").setAppName("TextInputJob")
+    val sc = new SparkContext(conf)
+    try {
+      val config = ConfigFactory.parseString("")
+      // Validate first, so a missing parameter is reported instead of crashing inside runJob.
+      validate(sc, config) match {
+        case SparkJobValid =>
+          val results = runJob(sc, config)
+          println("Result is " + results)
+        case invalid =>
+          println("Invalid job configuration: " + invalid)
+      }
+    } finally {
+      sc.stop()
+    }
+  }
+
+  /**
+    * Checks that `text` is present and that `step` is present and parses as an integer,
+    * because `runJob` converts `step` with `toInt`.
+    *
+    * @param sc     the Spark context (not used by the check)
+    * @param config the job configuration
+    * @return `SparkJobValid` if both parameters are usable, otherwise a `SparkJobInvalid`
+    */
+  override def validate(sc: SparkContext, config: Config): SparkJobValidation = {
+
+    Try {
+      config.getString("text")
+      config.getString("step").toInt
+    }
+      .map(x => SparkJobValid)
+      .getOrElse(SparkJobInvalid("text and step config params should be defined (step must be an integer)"))
+  }
+
+  /**
+    * Reads `text` and `step` from the configuration and delegates to
+    * `InputJobHandler.handleJob`.
+    *
+    * @param sc     the Spark context (not used directly here)
+    * @param config the job configuration; must contain `text` and an integer `step`
+    * @return the string representation of the handler's result
+    */
+  override def runJob(sc: SparkContext, config: Config): Any = {
+    InputJobHandler.handleJob(config.getString("text"), config.getString("step").toInt).toString
+  }
+}
diff --git a/build.sbt b/build.sbt
@@ -0,0 +1,187 @@
+import AssemblyKeys._
+import sbtassembly.Plugin._
+
+import sbt._
+import org.scalatra.sbt._
+import org.scalatra.sbt.PluginKeys._
+import com.earldouglas.xwp.JettyPlugin
+import com.mojolly.scalate.ScalatePlugin._
+import ScalateKeys._
+import com.typesafe.sbt.packager.archetypes.JavaAppPackaging
+
+/*
+ * Dependencies
+ *
+ * Shared library coordinates used by the modules below. Artifacts scoped "provided"
+ * (Spark core/SQL, the job-server API) are compiled against but not bundled.
+ */
+val parserComb = "org.scala-lang.modules" %% "scala-parser-combinators" % "1.0.4" // not referenced by any module visible in this file
+val mongoDBDriverDep = "org.mongodb" %% "casbah" % "3.1.1"
+val sparkCoreDep = "org.apache.spark" %% "spark-core" % "1.3.0" % "provided"
+val sparkSQLDep = "org.apache.spark" %% "spark-sql" % "1.3.0" % "provided"
+val sparkDataBricksDep = "com.databricks" % "spark-xml_2.10" % "0.3.3" // Scala suffix (_2.10) is hard-coded instead of using %%
+val unbescaped = "org.unbescape" % "unbescape" % "1.1.3.RELEASE"
+val commonsCodec = "commons-codec" % "commons-codec" % "1.9"
+val jobserver = "spark.jobserver" %% "job-server-api" % "0.6.2" % "provided"
+val config = "com.typesafe" % "config" % "1.3.0"
+val hadoopClient = ("org.apache.hadoop" % "hadoop-client" % "2.2.0") // only referenced from the commented-out part of sparkApp
+  .exclude("commons-logging", "commons-logging")
+  .exclude("commons-beanutils", "commons-beanutils-core")
+  .exclude("commons-collections", "commons-collections")
+val mongoDBHadoopCore = ("org.mongodb.mongo-hadoop" % "mongo-hadoop-core" % "1.5.1") // only referenced from the commented-out part of sparkApp
+  .exclude("commons-logging", "commons-logging")
+  .exclude("commons-beanutils", "commons-beanutils-core")
+  .exclude("commons-collections", "commons-collections")
+
+
+/*
+ * Test-Dependencies
+ *
+ * Added to every module through commonSettings; all of them are test-scoped.
+ */
+val testDependencies = Seq(
+  "org.slf4j" % "slf4j-simple" % "1.7.21" % "test",
+  "junit" % "junit" % "4.11" % "test",
+  "org.scalatest" % "scalatest_2.10" % "2.2.6" % "test" // Scala suffix hard-coded; matches scalaVersion 2.10.4 below
+)
+
+/*
+ * Settings
+ *
+ * Defaults shared by all modules; each module overrides `name` in its own settings block.
+ */
+lazy val commonSettings = Seq(
+  organization := "HTW Berlin",
+  name := "WikiPlag",
+  version := "0.0.1",
+  scalaVersion := "2.10.4",
+  libraryDependencies ++= testDependencies
+)
+
+/*
+ * Modules
+ *
+ * One sbt project per sub-directory. sparkApp and analysisJob are packaged as assembly jars
+ * and depend on the library modules (mongodb, forwardreferencetable, viewindex, parser).
+ */
+lazy val mongodb = (project in file("mongodb")) // MongoDB connection module (Casbah driver)
+  .settings(commonSettings: _*)
+  .settings(
+    name := "MongoDBConnection",
+    libraryDependencies ++= Seq(
+      mongoDBDriverDep
+    )
+  )
+
+lazy val forwardreferencetable = (project in file("forwardreferencetable")) // forward reference table module; needs commons-codec
+  .settings(commonSettings: _*)
+  .settings(
+    name := "forwardreferencetable",
+    libraryDependencies ++= Seq(
+      commonsCodec
+    )
+  )
+
+lazy val viewindex = (project in file("viewindex")) // no library dependencies beyond the common test ones
+  .settings(commonSettings: _*)
+  .settings(
+    name := "ViewIndex",
+    libraryDependencies ++= Seq(
+    )
+  )
+
+lazy val parser = (project in file("parser")) // parser module; uses unbescape
+  .settings(commonSettings: _*)
+  .settings(
+    name := "Parser",
+    excludeFilter in unmanagedResources := "*", // exclude every unmanaged resource file of this module
+    libraryDependencies ++= Seq(
+      unbescaped
+    )
+  )
+
+lazy val sparkApp = (project in file("sparkapp")) // Spark application, built as an assembly jar
+  .settings(commonSettings: _*)
+  .settings(
+    name := "SparkApp",
+    libraryDependencies ++= Seq(
+      sparkCoreDep, sparkSQLDep, sparkDataBricksDep //, mongoDBHadoopCore, hadoopClient
+    )
+  ).settings(
+    assemblySettings,
+    jarName in assembly := "wikiplag_sparkapp.jar",
+    assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) // leave the Scala library out of the assembly jar
+  )
+  .dependsOn(
+    forwardreferencetable, viewindex, parser, mongodb
+  )
+
+lazy val analysisJob = (project in file("analysisJob")) // job-server job, built as an assembly jar
+  .settings(commonSettings: _*)
+  .settings(
+    name := "AnalysisJob",
+    libraryDependencies ++= Seq(
+      sparkCoreDep, jobserver, config
+    ),
+    resolvers ++= Seq("Job Server Bintray" at "https://dl.bintray.com/spark-jobserver/maven"), // repository for the job-server-api artifact
+    assemblySettings,
+    jarName in assembly := "analysisJob.jar"
+  )
+  .dependsOn(
+    mongodb, forwardreferencetable, viewindex, parser
+  )
+
+lazy val similarity = (project in file("similarity")) // no library dependencies beyond the common test ones
+  .settings(commonSettings: _*)
+  .settings(
+    name := "Similarity",
+    excludeFilter in unmanagedResources := "*", // exclude every unmanaged resource file of this module
+    libraryDependencies ++= Seq(
+    )
+  )
+
+lazy val stopwordfinder = (project in file("stopwordfinder")) // Spark-based module (core + SQL, both provided)
+  .settings(commonSettings: _*)
+  .settings(
+    name := "stopwordfinder",
+    excludeFilter in unmanagedResources := "*", // exclude every unmanaged resource file of this module
+    libraryDependencies ++= Seq(
+       sparkCoreDep, sparkSQLDep
+    )
+  )
+
+// Version shared by all Scalatra artifacts of the web application.
+val ScalatraVersion = "2.4.1"
+
+// Scalatra/Scalate web application, run with Jetty and packaged with JavaAppPackaging.
+lazy val webApp = (project in file("webapp"))
+  .settings(ScalatraPlugin.scalatraSettings: _*)
+  .settings(scalateSettings: _*)
+  .settings(commonSettings: _*)
+  .settings(
+    name := "webapp",
+    resolvers += Classpaths.typesafeReleases,
+    // Resolve over HTTPS (as the job-server resolver does) so artifacts are not fetched in clear text.
+    resolvers += "Scalaz Bintray Repo" at "https://dl.bintray.com/scalaz/releases",
+    libraryDependencies ++= Seq(
+      "org.scalatra" %% "scalatra" % ScalatraVersion,
+      "org.scalatra" %% "scalatra-scalate" % ScalatraVersion,
+      "org.scalatra" %% "scalatra-specs2" % ScalatraVersion % "test",
+      "ch.qos.logback" % "logback-classic" % "1.1.5" % "runtime",
+      "org.eclipse.jetty" % "jetty-webapp" % "9.2.15.v20160210" % "compile;container",
+      "javax.servlet" % "javax.servlet-api" % "3.1.0" % "container;provided",
+      "org.scalatra" %% "scalatra-json" % ScalatraVersion,
+      "org.json4s"   %% "json4s-jackson" % "3.3.0",
+      "org.scalaj" %% "scalaj-http" % "2.3.0",
+      // Reuse the shared coordinates so the versions cannot drift from the other modules.
+      config,
+      commonsCodec
+    ),
+    // Precompile the Scalate templates found under src/main/webapp/WEB-INF/templates.
+    scalateTemplateConfig in Compile <<= (sourceDirectory in Compile){ base =>
+      Seq(
+        TemplateConfig(
+          base / "webapp" / "WEB-INF" / "templates",
+          Seq.empty,  /* default imports should be added here */
+          Seq(
+            Binding("context", "_root_.org.scalatra.scalate.ScalatraRenderContext", importMembers = true, isImplicit = true)
+          ),  /* add extra bindings here */
+          Some("templates")
+        )
+      )
+    }
+  )
+  .dependsOn(
+    mongodb,
+    forwardreferencetable,
+    viewindex,
+    parser
+  )
+  .enablePlugins(JettyPlugin)
+  .enablePlugins(JavaAppPackaging)
+
diff --git a/forwardreferencetable/src/main/frt.iml b/forwardreferencetable/src/main/frt.iml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/scala" isTestSource="false" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
diff --git a/...table/src/main/scala/de/htw/ai/wikiplag/forwardreferencetable/ForwardReferenceTable.scala b/...table/src/main/scala/de/htw/ai/wikiplag/forwardreferencetable/ForwardReferenceTable.scala
@@ -0,0 +1,65 @@
+package de.htw.ai.wikiplag.forwardreferencetable
+
+/**
+  * Created by robertsteiner on 27.05.16.
+  */
+trait ForwardReferenceTable {
+
+  /**
+    * Builds a forward reference table for one page, following the schema
+    * {{{
+    * hash("ngram_1") -> List((page_id, List(ngram_position_1, ngram_position_2, ngram_position_3))),
+    * hash("ngram_2") -> List((page_id, List(ngram_position_1, ...))), ...
+    * }}}
+    *
+    * Example input:
+    * {{{
+    * pageId          = 1
+    * pageWordsAsList = List("kam", "die", "Parodie", "An", "Alan", "Smithee", "Film", "Burn", "Hollywood")
+    * nGramStepSize   = 3
+    * }}}
+    *
+    * Example output (a `collection.mutable.Map[String, List[(Int, List[Int])]]`):
+    * {{{
+    * hash("kam die Parodie") -> List((1, List(0))),
+    * hash("die Parodie An")  -> List((1, List(1))),
+    * hash("Parodie An Alan") -> List((1, List(2))), ...
+    * }}}
+    *
+    * @param pageId          the page id
+    * @param pageWordsAsList the words of the page, one list element per word
+    * @param nGramStepSize   the step length of the n-grams
+    * @return a forward reference table
+    */
+  def buildForwardReferenceTable(
+    pageId: Int,
+    pageWordsAsList: List[String],
+    nGramStepSize: Int
+  ): collection.mutable.Map[String, List[(Int, List[Int])]]
+
+  /**
+    * Builds a forward reference table without page ids, following the schema
+    * {{{
+    * hash("ngram_1") -> List(ngram_position_1, ngram_position_2, ngram_position_3),
+    * hash("ngram_2") -> List(ngram_position_1, ...), ...
+    * }}}
+    *
+    * Example input:
+    * {{{
+    * pageWordsAsList = List("kam", "die", "Parodie", "An", "Alan", "Smithee", "Film", "Burn", "Hollywood")
+    * nGramStepSize   = 3
+    * }}}
+    *
+    * Example output (a `collection.mutable.Map[String, List[Int]]`):
+    * {{{
+    * hash("kam die Parodie") -> List(0),
+    * hash("die Parodie An")  -> List(1),
+    * hash("Parodie An Alan") -> List(2), ...
+    * }}}
+    *
+    * @param pageWordsAsList the words of the page, one list element per word
+    * @param nGramStepSize   the step length of the n-grams
+    * @return a forward reference table
+    */
+  def buildForwardReferenceTable(
+    pageWordsAsList: List[String],
+    nGramStepSize: Int
+  ): collection.mutable.Map[String, List[Int]]
+}