Skip to content

Commit

Permalink
Init
Browse files Browse the repository at this point in the history
  • Loading branch information
maiktheknife committed Sep 12, 2016
0 parents commit b1e7d11
Show file tree
Hide file tree
Showing 82 changed files with 56,265 additions and 0 deletions.
23 changes: 23 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
**/bin/
**/target/
project/project
.idea

*.iml
*.classpath
*.properties
*.project

# #ignore everything except...->
# *
# !*.scala
# !*.sc
# !README.md
# !*.xml
# !*.iml
# !*.classpath
# !*.properties
# !.gitignore
#
# #even sub dirs
# */
13 changes: 13 additions & 0 deletions README.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# WikiPlag - Plattform zur Erkennung von Wikipediaplagiaten

## Übersicht

Projektstudiumsaufgabe im SS16 an der HTW Berlin, betreut von Prof. Dr.-Ing. Hendrik Gärtner

## try it out
http://wikiplag.f4.htw-berlin.de/

## Links
[Redmine](https://studi.f4.htw-berlin.de/redmine/projects/0-wikiplag)

[Aufgabe](http://puck.f4.htw-berlin.de/hgaertner/veranstaltungen/projektstudium/ss-2016/)
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package de.htw.ai.wikiplag.analysisJob

import com.mongodb.{MongoCredential, ServerAddress}

import de.htw.ai.wikiplag.forwardreferencetable.ForwardReferenceTableImp.buildForwardReferenceTable
import de.htw.ai.wikiplag.parser.WikiDumpParser.extractPlainText
import de.htw.ai.wikiplag.connection.MongoDBImpl

object InputJobHandler {

  /**
   * Runs the analysis pipeline for a single input text: tokenizes it, builds
   * the n-gram forward reference table, and looks up similar documents in
   * MongoDB.
   *
   * Fix: the computed similarity grouping is now returned instead of being
   * discarded (the method previously ended on `MongoDBImpl.close()` and
   * returned Unit, so callers always saw "()").
   *
   * @param text the raw input text to analyse
   * @param step the n-gram step size used for hashing
   * @return per-hash matches regrouped by document id:
   *         List[(hash, List[(docId, positions)])]
   */
  def handleJob(text: String, step: Int): Any = {
    print(text)
    val tokens = extractPlainText(text)
    val hashes = buildForwardReferenceTable(tokens.map(_.toLowerCase()), step).toMap

    // NOTE(review): host/port and credentials are hard-coded placeholders;
    // they should come from configuration before this runs anywhere real.
    MongoDBImpl.open(
      new ServerAddress("hadoop03.f4.htw-berlin.de", 27020),
      MongoCredential.createCredential("REPLACE-ME", "REPLACE-ME", "REPLACE-ME".toCharArray)
    )

    val siml = MongoDBImpl.findSimilarity(hashes.keys.toList, 7)
    // Regroup each hash's raw matches by document id, keeping only positions.
    val fitSim = siml.map(hashSim =>
      (hashSim._1, hashSim._2.groupBy(_._1).mapValues(list => list.map(x => x._2)).toList)
    )
    MongoDBImpl.close()
    //generateWhitelist(fitSim, 30)
    fitSim
  }

  /**
   * Attaches a score to every surviving match: the fraction of all n-grams
   * that this entry matched.
   *
   * Fix: score used `(item._2.size / ngMatches.size).toDouble`, which performs
   * INTEGER division first and therefore always produced 0.0 (or 1.0). The
   * conversion now happens before dividing.
   *
   * @param ngMatches n-gram matches: List[(hash, List[(docId, positions)])]
   * @param threshold minimum match fraction (see [[filterMatches]])
   * @return filtered matches, each with a List[(docId, score)] appended
   */
  def generateWhitelist(ngMatches: List[(String, List[(String, List[Int])])], threshold: Double)
  : List[(String, List[(String, List[Int])], List[(String, Double)])] = {
    val filteredNg = filterMatches(ngMatches, threshold)
    filteredNg.map(item =>
      (item._1, item._2, item._2.map(doc => (doc._1, item._2.size.toDouble / ngMatches.size)))
    )
  }

  /**
   * Keeps only entries whose share of the total match count exceeds the
   * threshold.
   *
   * Fix: `item._2.size / matchesSize` was integer division, truncating to 0
   * (or 1), so any fractional threshold either dropped everything or kept
   * everything. The numerator is now converted to Double first.
   *
   * @param ngMatches n-gram matches: List[(hash, List[(docId, positions)])]
   * @param threshold minimum fraction (0.0 - 1.0) an entry must reach
   * @return the entries above the threshold
   */
  def filterMatches(ngMatches: List[(String, List[(String, List[Int])])], threshold: Double)
  : List[(String, List[(String, List[Int])])] = {
    val matchesSize = ngMatches.size
    ngMatches.filter(item => item._2.size.toDouble / matchesSize > threshold)
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package de.htw.ai.wikiplag.analysisJob
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.spark._
import scala.util.Try
import spark.jobserver._

object TextInputJob extends SparkJob {

  /** Local entry point for manual testing: runs the job on a local[4] master
    * with an empty config (which will fail validation by design). */
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[4]").setAppName("TextInputJob")
    val sc = new SparkContext(conf)
    val config = ConfigFactory.parseString("")
    val results = runJob(sc, config)
    println("Result is " + results)
  }

  /**
   * Validates the job configuration before execution.
   *
   * Fix: previously only the PRESENCE of "step" was checked, so a non-numeric
   * value passed validation and `runJob` later crashed with a
   * NumberFormatException. We now also require that "step" parses as an Int.
   */
  override def validate(sc: SparkContext, config: Config): SparkJobValidation = {
    Try {
      config.getString("text")
      config.getString("step").toInt
    }
      .map(_ => SparkJobValid)
      .getOrElse(SparkJobInvalid("config params 'text' (string) and 'step' (integer) must be defined"))
  }

  /** Delegates to the handler with the already-validated parameters. */
  override def runJob(sc: SparkContext, config: Config): Any = {
    InputJobHandler.handleJob(config.getString("text"), config.getString("step").toInt).toString
  }
}
187 changes: 187 additions & 0 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
import AssemblyKeys._
import sbtassembly.Plugin._

import sbt._
import org.scalatra.sbt._
import org.scalatra.sbt.PluginKeys._
import com.earldouglas.xwp.JettyPlugin
import com.mojolly.scalate.ScalatePlugin._
import ScalateKeys._
import com.typesafe.sbt.packager.archetypes.JavaAppPackaging

/*
 * Dependencies
 * Versions are pinned for Scala 2.10 / Spark 1.3 compatibility.
 */
val parserComb = "org.scala-lang.modules" %% "scala-parser-combinators" % "1.0.4"
val mongoDBDriverDep = "org.mongodb" %% "casbah" % "3.1.1"
// Spark is "provided": the cluster supplies it at runtime, so sbt-assembly
// leaves it out of the fat jars.
val sparkCoreDep = "org.apache.spark" %% "spark-core" % "1.3.0" % "provided"
val sparkSQLDep = "org.apache.spark" %% "spark-sql" % "1.3.0" % "provided"
val sparkDataBricksDep = "com.databricks" % "spark-xml_2.10" % "0.3.3"
val unbescaped = "org.unbescape" % "unbescape" % "1.1.3.RELEASE"
val commonsCodec = "commons-codec" % "commons-codec" % "1.9"
val jobserver = "spark.jobserver" %% "job-server-api" % "0.6.2" % "provided"
val config = "com.typesafe" % "config" % "1.3.0"
// commons-* exclusions below: presumably to avoid duplicate-class merge
// conflicts in assembly builds — NOTE(review): confirm.
val hadoopClient = ("org.apache.hadoop" % "hadoop-client" % "2.2.0")
  .exclude("commons-logging", "commons-logging")
  .exclude("commons-beanutils", "commons-beanutils-core")
  .exclude("commons-collections", "commons-collections")
val mongoDBHadoopCore = ("org.mongodb.mongo-hadoop" % "mongo-hadoop-core" % "1.5.1")
  .exclude("commons-logging", "commons-logging")
  .exclude("commons-beanutils", "commons-beanutils-core")
  .exclude("commons-collections", "commons-collections")


/*
 * Test-Dependencies
 */
val testDependencies = Seq(
  "org.slf4j" % "slf4j-simple" % "1.7.21" % "test",
  "junit" % "junit" % "4.11" % "test",
  "org.scalatest" % "scalatest_2.10" % "2.2.6" % "test"
)

/*
 * Settings
 * Shared by every sub-project; each module mixes these in first and then
 * overrides `name` and adds its own libraryDependencies.
 */
lazy val commonSettings = Seq(
  organization := "HTW Berlin",
  name := "WikiPlag",
  version := "0.0.1",
  scalaVersion := "2.10.4",
  libraryDependencies ++= testDependencies
)

/*
 * Modules
 */
// MongoDB connection layer (Casbah driver).
lazy val mongodb = (project in file("mongodb"))
  .settings(commonSettings: _*)
  .settings(
    name := "MongoDBConnection",
    libraryDependencies ++= Seq(
      mongoDBDriverDep
    )
  )

// n-gram hashing / forward reference table construction.
lazy val forwardreferencetable = (project in file("forwardreferencetable"))
  .settings(commonSettings: _*)
  .settings(
    name := "forwardreferencetable",
    libraryDependencies ++= Seq(
      commonsCodec
    )
  )

lazy val viewindex = (project in file("viewindex"))
  .settings(commonSettings: _*)
  .settings(
    name := "ViewIndex",
    libraryDependencies ++= Seq(
    )
  )

// Wikipedia dump text extraction.
lazy val parser = (project in file("parser"))
  .settings(commonSettings: _*)
  .settings(
    name := "Parser",
    excludeFilter in unmanagedResources := "*",
    libraryDependencies ++= Seq(
      unbescaped
    )
  )

// Spark batch application; packaged as a fat jar without Scala itself
// (includeScala = false) because the Spark runtime provides it.
lazy val sparkApp = (project in file("sparkapp"))
  .settings(commonSettings: _*)
  .settings(
    name := "SparkApp",
    libraryDependencies ++= Seq(
      sparkCoreDep, sparkSQLDep, sparkDataBricksDep //, mongoDBHadoopCore, hadoopClient
    )
  ).settings(
    assemblySettings,
    jarName in assembly := "wikiplag_sparkapp.jar",
    assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
  )
  .dependsOn(
    forwardreferencetable, viewindex, parser, mongodb
  )

// spark-jobserver job (TextInputJob); the jobserver API jar comes from the
// spark-jobserver Bintray repository.
lazy val analysisJob = (project in file("analysisJob"))
  .settings(commonSettings: _*)
  .settings(
    name := "AnalysisJob",
    libraryDependencies ++= Seq(
      sparkCoreDep, jobserver, config
    ),
    resolvers ++= Seq("Job Server Bintray" at "https://dl.bintray.com/spark-jobserver/maven"),
    assemblySettings,
    jarName in assembly := "analysisJob.jar"
  )
  .dependsOn(
    mongodb, forwardreferencetable, viewindex, parser
  )

lazy val similarity = (project in file("similarity"))
  .settings(commonSettings: _*)
  .settings(
    name := "Similarity",
    excludeFilter in unmanagedResources := "*",
    libraryDependencies ++= Seq(
    )
  )

lazy val stopwordfinder = (project in file("stopwordfinder"))
  .settings(commonSettings: _*)
  .settings(
    name := "stopwordfinder",
    excludeFilter in unmanagedResources := "*",
    libraryDependencies ++= Seq(
      sparkCoreDep, sparkSQLDep
    )
  )

val ScalatraVersion = "2.4.1"

// Scalatra web frontend, run via Jetty; Scalate templates are configured
// below and packaged with sbt-native-packager (JavaAppPackaging).
lazy val webApp = (project in file("webapp"))
  .settings(ScalatraPlugin.scalatraSettings: _*)
  .settings(scalateSettings: _*)
  .settings(commonSettings: _*)
  .settings(
    name := "webapp",
    resolvers += Classpaths.typesafeReleases,
    resolvers += "Scalaz Bintray Repo" at "http://dl.bintray.com/scalaz/releases",
    libraryDependencies ++= Seq(
      "org.scalatra" %% "scalatra" % ScalatraVersion,
      "org.scalatra" %% "scalatra-scalate" % ScalatraVersion,
      "org.scalatra" %% "scalatra-specs2" % ScalatraVersion % "test",
      "ch.qos.logback" % "logback-classic" % "1.1.5" % "runtime",
      "org.eclipse.jetty" % "jetty-webapp" % "9.2.15.v20160210" % "compile;container",
      "javax.servlet" % "javax.servlet-api" % "3.1.0" % "container;provided",
      "org.scalatra" %% "scalatra-json" % ScalatraVersion,
      "org.json4s" %% "json4s-jackson" % "3.3.0",
      "org.scalaj" %% "scalaj-http" % "2.3.0",
      "com.typesafe" % "config" % "1.3.0",
      "commons-codec" % "commons-codec" % "1.9"
    ),
    scalateTemplateConfig in Compile <<= (sourceDirectory in Compile){ base =>
      Seq(
        TemplateConfig(
          base / "webapp" / "WEB-INF" / "templates",
          Seq.empty, /* default imports should be added here */
          Seq(
            Binding("context", "_root_.org.scalatra.scalate.ScalatraRenderContext", importMembers = true, isImplicit = true)
          ), /* add extra bindings here */
          Some("templates")
        )
      )
    }
  )
  .dependsOn(
    mongodb,
    forwardreferencetable,
    viewindex,
    parser
  )
  .enablePlugins(JettyPlugin)
  .enablePlugins(JavaAppPackaging)

11 changes: 11 additions & 0 deletions forwardreferencetable/src/main/frt.iml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/scala" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package de.htw.ai.wikiplag.forwardreferencetable

/**
 * Created by robertsteiner on 27.05.16.
 */
trait ForwardReferenceTable {
  /**
   * Builds a forward reference table of the form:
   * {
   *  "hash("ngram_1")": List[ ( page_id, List[ ngram_position_1, ngram_position_2, ngram_position_3 ] ) ],
   *  "hash("ngram_2")": List[ ( page_id, List[ ngram_position_1, ... ] ) ], ...
   * }
   *
   * Example:
   *
   * Input:
   * pageId = Int(1)
   * pageWordsAsList = List[String]("kam", "die", "Parodie", "An", "Alan", "Smithee", "Film", "Burn", "Hollywood")
   * stepSize = Int(3)
   *
   * Output:
   * collection.mutable.Map[String, List[(Int, List[Int])]
   * {
   *  "hash("kam die Parodie")": List[ ( 1, List[ 0 ] ) ],
   *  "hash("die Parodie An")": List[ ( 1, List[ 1 ] ) ],
   *  "hash("Parodie An Alan")": List[ ( 1, List[ 2 ] ) ], ...
   * }
   *
   * @param pageId The page ID.
   * @param pageWordsAsList A list whose elements are the words of the page.
   * @param nGramStepSize The step size (number of words) of the n-grams.
   * @return A forward reference table.
   */
  def buildForwardReferenceTable(pageId: Int,
                                 pageWordsAsList: List[String],
                                 nGramStepSize: Int): collection.mutable.Map[String, List[(Int, List[Int])]]

  /**
   * Builds a forward reference table of the form:
   * {
   *  "hash("ngram_1")": List[ ngram_position_1, ngram_position_2, ngram_position_3 ],
   *  "hash("ngram_2")": List[ ngram_position_1, ... ], ...
   * }
   *
   * Example:
   *
   * Input:
   * pageWordsAsList = List[String]("kam", "die", "Parodie", "An", "Alan", "Smithee", "Film", "Burn", "Hollywood")
   * stepSize = Int(3)
   *
   * Output:
   * collection.mutable.Map[String, List[Int]]
   * {
   *  "hash("kam die Parodie")": List[ 0 ],
   *  "hash("die Parodie An")": List[ 1 ],
   *  "hash("Parodie An Alan")": List[ 2 ], ...
   * }
   *
   * @param pageWordsAsList A list whose elements are the words of the page.
   * @param nGramStepSize The step size (number of words) of the n-grams.
   * @return A forward reference table.
   */
  def buildForwardReferenceTable(pageWordsAsList: List[String],
                                 nGramStepSize: Int): collection.mutable.Map[String, List[Int]]
}
Loading

0 comments on commit b1e7d11

Please sign in to comment.