Random sampling combos #23 Subset sampling #25

eHarmony · Aug 18, 2016 · 24247ec · 24247ec
1 parent 95daa98
commit 24247ec
Show file tree

Hide file tree

Showing 27 changed files with 421 additions and 213 deletions.
diff --git a/core/pom.xml b/core/pom.xml
@@ -43,6 +43,10 @@
             <groupId>commons-io</groupId>
             <artifactId>commons-io</artifactId>
         </dependency>
+        <dependency>
+            <groupId>com.github.dpaukov</groupId>
+            <artifactId>combinatoricslib3</artifactId>
+        </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>
             <artifactId>slf4j-api</artifactId>

diff --git a/core/src/main/scala/com/eharmony/spotz/backend/BackendFunctions.scala b/core/src/main/scala/com/eharmony/spotz/backend/BackendFunctions.scala
@@ -1,8 +1,8 @@
 package com.eharmony.spotz.backend
 
 import com.eharmony.spotz.objective.Objective
-import com.eharmony.spotz.optimizer.RandomSampler
 import com.eharmony.spotz.optimizer.grid.Grid
+import com.eharmony.spotz.optimizer.hyperparam.RandomSampler
 
 import scala.reflect.ClassTag
 
@@ -15,13 +15,13 @@ import scala.reflect.ClassTag
   */
 trait BackendFunctions {
   protected def bestRandomPointAndLoss[P, L](
-      startIndex: Long,
-      batchSize: Long,
-      objective: Objective[P, L],
-      reducer: ((P, L), (P, L)) => (P, L),
-      hyperParams: Map[String, RandomSampler[_]],
-      seed: Long = 0,
-      sampleFunction: (Map[String, RandomSampler[_]], Long) => P): (P, L)
+                                              startIndex: Long,
+                                              batchSize: Long,
+                                              objective: Objective[P, L],
+                                              reducer: ((P, L), (P, L)) => (P, L),
+                                              hyperParams: Map[String, RandomSampler[_]],
+                                              seed: Long = 0,
+                                              sampleFunction: (Map[String, RandomSampler[_]], Long) => P): (P, L)
 
   protected def bestGridPointAndLoss[P, L](
       startIndex: Long,

diff --git a/core/src/main/scala/com/eharmony/spotz/backend/ParallelFunctions.scala b/core/src/main/scala/com/eharmony/spotz/backend/ParallelFunctions.scala
@@ -1,8 +1,8 @@
 package com.eharmony.spotz.backend
 
 import com.eharmony.spotz.objective.Objective
-import com.eharmony.spotz.optimizer.RandomSampler
 import com.eharmony.spotz.optimizer.grid.Grid
+import com.eharmony.spotz.optimizer.hyperparam.RandomSampler
 
 import scala.reflect.ClassTag
 
@@ -31,13 +31,13 @@ trait ParallelFunctions extends BackendFunctions {
     * @return the best point with the best loss as a tuple
     */
   protected override def bestRandomPointAndLoss[P, L](
-      startIndex: Long,
-      batchSize: Long,
-      objective: Objective[P, L],
-      reducer: ((P, L), (P, L)) => (P, L),
-      hyperParams: Map[String, RandomSampler[_]],
-      seed: Long = 0,
-      sampleFunction: (Map[String, RandomSampler[_]], Long) => P): (P, L) = {
+                                                       startIndex: Long,
+                                                       batchSize: Long,
+                                                       objective: Objective[P, L],
+                                                       reducer: ((P, L), (P, L)) => (P, L),
+                                                       hyperParams: Map[String, RandomSampler[_]],
+                                                       seed: Long = 0,
+                                                       sampleFunction: (Map[String, RandomSampler[_]], Long) => P): (P, L) = {
 
     val pointsAndLosses = (startIndex until (startIndex + batchSize)).par.map { trial =>
       val point = sampleFunction(hyperParams, seed + trial)

diff --git a/core/src/main/scala/com/eharmony/spotz/backend/SparkFunctions.scala b/core/src/main/scala/com/eharmony/spotz/backend/SparkFunctions.scala
@@ -1,8 +1,8 @@
 package com.eharmony.spotz.backend
 
 import com.eharmony.spotz.objective.Objective
-import com.eharmony.spotz.optimizer.RandomSampler
 import com.eharmony.spotz.optimizer.grid.Grid
+import com.eharmony.spotz.optimizer.hyperparam.RandomSampler
 import org.apache.spark.SparkContext
 
 import scala.reflect.ClassTag
@@ -33,13 +33,13 @@ trait SparkFunctions extends BackendFunctions {
     * @return the best point with the best loss as a tuple
     */
   protected override def bestRandomPointAndLoss[P, L](
-      startIndex: Long,
-      batchSize: Long,
-      objective: Objective[P, L],
-      reducer: ((P, L), (P, L)) => (P, L),
-      hyperParams: Map[String, RandomSampler[_]],
-      seed: Long = 0,
-      sampleFunction: (Map[String, RandomSampler[_]], Long) => P): (P, L) = {
+                                                       startIndex: Long,
+                                                       batchSize: Long,
+                                                       objective: Objective[P, L],
+                                                       reducer: ((P, L), (P, L)) => (P, L),
+                                                       hyperParams: Map[String, RandomSampler[_]],
+                                                       seed: Long = 0,
+                                                       sampleFunction: (Map[String, RandomSampler[_]], Long) => P): (P, L) = {
 
     assert(batchSize > 0, "batchSize must be greater than 0")
 

diff --git a/core/src/main/scala/com/eharmony/spotz/optimizer/HyperParameter.scala b/core/src/main/scala/com/eharmony/spotz/optimizer/HyperParameter.scala
diff --git a/core/src/main/scala/com/eharmony/spotz/optimizer/hyperparam/Combinations.scala b/core/src/main/scala/com/eharmony/spotz/optimizer/hyperparam/Combinations.scala
@@ -0,0 +1,96 @@
+package com.eharmony.spotz.optimizer.hyperparam
+
+import scala.util.Random
+
+
+/** A [[RandomSampler]] whose sampled value is an iterable of iterables of `T`. */
+trait CombinatoricRandomSampler[T] extends RandomSampler[Iterable[Iterable[T]]]
+/** A [[RandomSampler]] whose sampled value is a single iterable of `T`. */
+trait IterableRandomSampler[T] extends RandomSampler[Iterable[T]]
+
+/**
+  * Base class for samplers that draw k-element combinations from a finite iterable.
+  *
+  * All k-combinations of the input are enumerated once, at construction time, and kept in
+  * memory in `combinations`; `combos` then picks `x` entries of that list at random.
+  *
+  * @param iterable the items to choose from; it is materialized once into a sequence
+  * @param k number of items in each combination; asserted to satisfy 0 < k <= number of items
+  * @param x number of combinations returned by each call to `combos`
+  * @param replacement if true, the same combination may be picked more than once per call;
+  *                    if false, every pick comes from a different position of `combinations`
+  * @tparam T element type of the iterable
+  */
+abstract class AbstractCombinations[T](
+    iterable: Iterable[T],
+    k: Int,
+    x: Int = 1,
+    replacement: Boolean = false) extends Serializable {
+
+  import org.paukov.combinatorics3.Generator
+
+  import scala.collection.JavaConverters._
+
+  private val values = iterable.toSeq
+
+  assert(k > 0, "k must be greater than 0")
+  assert(k <= values.length, s"k must be less than or equal to length of the iterable, ${values.length}")
+
+  // TODO: This is hideous!  Rewrite this to be more memory efficient by unranking combinations.  For now, use a Java lib.
+  // Enumerate from the materialized `values` instead of traversing `iterable` a second time, so
+  // the length validated above always describes the collection the combinations are built from.
+  val combinations = Generator.combination(values.asJavaCollection).simple(k).asScala.toIndexedSeq.map(_.asScala.toIndexedSeq)
+
+  /**
+    * Draw combinations from the precomputed list using the supplied generator.
+    *
+    * With replacement, exactly `x` independent draws are made.  Without replacement, the
+    * number of results is capped at the number of available combinations.
+    *
+    * @param rng source of randomness for choosing positions in `combinations`
+    * @return the chosen combinations, in the order they were drawn
+    */
+  def combos(rng: Random): Iterable[Iterable[T]] = {
+    if (replacement) {
+      Seq.fill(x)(combinations(rng.nextInt(combinations.size)))
+    } else {
+      val seen = collection.mutable.Set[Int]()
+      // Cannot hand back more distinct positions than there are combinations.
+      val numElements = scala.math.min(x, combinations.size)
+      val picked = new collection.mutable.ArrayBuffer[Iterable[T]](numElements)
+      // Rejection sampling: keep drawing until enough previously unseen positions are collected.
+      while (seen.size < numElements) {
+        val index = rng.nextInt(combinations.size)
+        // `add` returns true only when the index was not already in the set.
+        if (seen.add(index)) {
+          picked += combinations(index)
+        }
+      }
+      picked.toIndexedSeq
+    }
+  }
+}
+
+
+/**
+  * Sampler that yields one combination of `k` unordered items chosen from `iterable`.
+  *
+  * The base class is asked for a single combination (`x` is fixed at 1) and that one
+  * combination is returned from `apply`.
+  *
+  * @param iterable the items to choose from
+  * @param k number of items in the sampled combination
+  * @param replacement forwarded to [[AbstractCombinations]]; only one combination is drawn either way
+  * @tparam T element type of the iterable
+  */
+case class Combination[T](
+    iterable: Iterable[T],
+    k: Int,
+    replacement: Boolean = false)
+  extends AbstractCombinations[T](iterable, k, x = 1, replacement = replacement)
+  with IterableRandomSampler[T] {
+
+  override def apply(rng: Random): Iterable[T] = {
+    val sampled = combos(rng)
+    sampled.head
+  }
+}
+
+
+/**
+  * Sampler over "N choose K" combinations: K unordered items picked from an iterable of
+  * N items.  Every possible combination is computed up front by the base class, and each
+  * call to `apply` returns `x` of them chosen at random.
+  *
+  * @param iterable an iterable of finite length
+  * @param k the number of items in each combination
+  * @param x the number of combinations to return per sample
+  * @param replacement whether the same combination may be returned more than once in one sample
+  * @tparam T element type of the iterable
+  */
+case class Combinations[T](
+    iterable: Iterable[T],
+    k: Int,
+    x: Int = 1,
+    replacement: Boolean = false)
+  extends AbstractCombinations[T](iterable, k, x, replacement)
+  with CombinatoricRandomSampler[T] {
+
+  override def apply(rng: Random): Iterable[Iterable[T]] = {
+    combos(rng)
+  }
+}
diff --git a/core/src/main/scala/com/eharmony/spotz/optimizer/hyperparam/NormalDistribution.scala b/core/src/main/scala/com/eharmony/spotz/optimizer/hyperparam/NormalDistribution.scala
@@ -0,0 +1,21 @@
+package com.eharmony.spotz.optimizer.hyperparam
+
+import com.eharmony.spotz.optimizer.hyperparam.RandomSampler
+
+import scala.util.Random
+
+/**
+  * Sample from a normal distribution given the mean and standard deviation.
+  *
+  * {{{
+  *   val hyperParamSpace = Map(
+  *     ("x1", NormalDistribution(0, 0.1))
+  *   )
+  * }}}
+  *
+  * @param mean mean of the distribution
+  * @param std standard deviation of the distribution
+  */
+case class NormalDistribution(mean: Double, std: Double) extends RandomSampler[Double] {
+  /**
+    * Draw one value by scaling a standard Gaussian draw by `std` and shifting it by `mean`.
+    *
+    * @param rng source of the underlying standard Gaussian draw
+    * @return the sampled value
+    */
+  override def apply(rng: Random): Double = {
+    std * rng.nextGaussian() + mean
+  }
+}
diff --git a/core/src/main/scala/com/eharmony/spotz/optimizer/hyperparam/RandomChoice.scala b/core/src/main/scala/com/eharmony/spotz/optimizer/hyperparam/RandomChoice.scala
@@ -0,0 +1,24 @@
+package com.eharmony.spotz.optimizer.hyperparam
+
+import scala.util.Random
+
+/**
+  * Pick one element of a fixed-length Iterable, each position being equally likely.
+  *
+  * {{{
+  *   val hyperParamSpace = Map(
+  *     ("x1", RandomChoice(Seq("svm", "logistic")))
+  *   )
+  * }}}
+  *
+  * Construction fails with an IllegalArgumentException when the iterable has no elements.
+  *
+  * @param iterable an iterable of type T
+  * @tparam T type parameter of iterable
+  */
+case class RandomChoice[T](iterable: Iterable[T]) extends RandomSampler[T] {
+  // Materialized once so every draw is a lookup by position.
+  private val values = iterable.toIndexedSeq
+
+  if (values.isEmpty) {
+    throw new IllegalArgumentException("Empty iterable")
+  }
+
+  override def apply(rng: Random): T = {
+    val index = rng.nextInt(values.length)
+    values(index)
+  }
+}