From d9686d4dba4afd6e2c061fd2822d40d9a55d88d4 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 12 Jun 2024 16:26:28 -0500 Subject: [PATCH 01/10] Add in the ability to fingerprint JSON columns (#11002) Signed-off-by: Robert (Bobby) Evans --- .../spark/sql/tests/datagen/bigDataGen.scala | 1181 ++++++++++++++--- 1 file changed, 1002 insertions(+), 179 deletions(-) diff --git a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala index 91335afe4e6..14e0d4e0970 100644 --- a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala +++ b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala @@ -16,21 +16,22 @@ package org.apache.spark.sql.tests.datagen +import com.fasterxml.jackson.core.{JsonFactoryBuilder, JsonParser, JsonToken} +import com.fasterxml.jackson.core.json.JsonReadFeature import java.math.{BigDecimal => JavaBigDecimal} import java.sql.{Date, Timestamp} import java.time.{Duration, Instant, LocalDate, LocalDateTime} import java.util - import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.math.BigDecimal.RoundingMode import scala.util.Random -import org.apache.spark.sql.{Column, DataFrame, SparkSession} +import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, XXH64} import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils} -import org.apache.spark.sql.functions.col +import org.apache.spark.sql.functions.{approx_count_distinct, avg, coalesce, col, count, lit, stddev, struct, transform, udf, when} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.random.XORShiftRandom @@ -79,22 +80,28 @@ class RowLocation(val rowNum: Long, val subRows: Array[Int] = null) { * hash. This makes the generated data correlated for all column/child columns. * @param tableNum a unique ID for the table this is a part of. * @param columnNum the location of the column in the data being generated + * @param substringNum the location of the substring column * @param correlatedKeyGroup the correlated key group this column is a part of, if any. */ -case class ColumnLocation(tableNum: Int, columnNum: Int, correlatedKeyGroup: Option[Long] = None) { - def forNextColumn(): ColumnLocation = ColumnLocation(tableNum, columnNum + 1) +case class ColumnLocation(tableNum: Int, + columnNum: Int, + substringNum: Int, + correlatedKeyGroup: Option[Long] = None) { + def forNextColumn(): ColumnLocation = ColumnLocation(tableNum, columnNum + 1, 0) + def forNextSubstring: ColumnLocation = ColumnLocation(tableNum, columnNum, substringNum + 1) /** * Create a new ColumnLocation that is specifically for a given key group */ def forCorrelatedKeyGroup(keyGroup: Long): ColumnLocation = - ColumnLocation(tableNum, columnNum, Some(keyGroup)) + ColumnLocation(tableNum, columnNum, substringNum, Some(keyGroup)) /** * Hash the location into a single long value. 
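   * Note: with this patch the hash also folds in substringNum (via a nested
   * XXH64.hashLong), so sibling substring generators derive distinct seeds.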
*/ - lazy val hashLoc: Long = XXH64.hashLong(tableNum, correlatedKeyGroup.getOrElse(columnNum)) + lazy val hashLoc: Long = XXH64.hashLong(tableNum, + correlatedKeyGroup.getOrElse(XXH64.hashLong(columnNum, substringNum))) } /** @@ -115,6 +122,9 @@ case class ColumnConf(columnLoc: ColumnLocation, def forNextColumn(nullable: Boolean): ColumnConf = ColumnConf(columnLoc.forNextColumn(), nullable, numTableRows) + def forNextSubstring: ColumnConf = + ColumnConf(columnLoc.forNextSubstring, nullable = true, numTableRows) + /** * Create a new configuration based on this, but for a given correlated key group. */ @@ -303,6 +313,23 @@ case class VarLengthGeneratorFunction(minLength: Int, maxLength: Int) extends } } +case class StdDevLengthGen(mean: Double, + stdDev: Double, + mapping: LocationToSeedMapping = null) extends + LengthGeneratorFunction { + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): LengthGeneratorFunction = + StdDevLengthGen(mean, stdDev, mapping) + + override def apply(rowLoc: RowLocation): Int = { + val r = DataGen.getRandomFor(rowLoc, mapping) + val g = r.nextGaussian() // g has a mean of 0 and a stddev of 1.0 + val adjusted = mean + (g * stdDev) + // If the range of seed is too small compared to the stddev and mean we will + // end up with an invalid distribution, but they asked for it. + math.max(0, math.round(adjusted).toInt) + } +} + /** * Generate nulls with a given probability. * @param prob 0.0 to 1.0 for how often nulls should appear in the output. @@ -562,11 +589,8 @@ case class DataGenExpr(child: Expression, } } -/** - * Base class for generating a column/sub-column. This holds configuration for the column, - * and handles what is needed to convert it into GeneratorFunction - */ -abstract class DataGen(var conf: ColumnConf, +abstract class CommonDataGen( + var conf: ColumnConf, defaultValueRange: Option[(Any, Any)], var seedMapping: LocationToSeedMapping = FlatDistribution(), var nullMapping: LocationToSeedMapping = FlatDistribution(), @@ -576,26 +600,25 @@ abstract class DataGen(var conf: ColumnConf, protected var valueRange: Option[(Any, Any)] = defaultValueRange /** - * Set a value range for this data gen. + * Set a value range */ - def setValueRange(min: Any, max: Any): DataGen = { + def setValueRange(min: Any, max: Any): CommonDataGen = { valueRange = Some((min, max)) this } /** - * Set a custom GeneratorFunction to use for this column. + * Set a custom GeneratorFunction */ - def setValueGen(f: GeneratorFunction): DataGen = { + def setValueGen(f: GeneratorFunction): CommonDataGen = { userProvidedValueGen = Some(f) this } /** - * Set a NullGeneratorFunction for this column. This will not be used - * if the column is not nullable. + * Set a NullGeneratorFunction */ - def setNullGen(f: NullGeneratorFunction): DataGen = { + def setNullGen(f: NullGeneratorFunction): CommonDataGen = { this.userProvidedNullGen = Some(f) this } @@ -604,12 +627,12 @@ abstract class DataGen(var conf: ColumnConf, * Set the probability of a null appearing in the output. The probability should be * 0.0 to 1.0. 
*/ - def setNullProbability(probability: Double): DataGen = { + def setNullProbability(probability: Double): CommonDataGen = { this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability)) this } - def setNullProbabilityRecursively(probability: Double): DataGen = { + def setNullProbabilityRecursively(probability: Double): CommonDataGen = { this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability)) children.foreach { case (_, dataGen) => @@ -621,7 +644,7 @@ abstract class DataGen(var conf: ColumnConf, /** * Set a specific location to seed mapping for the value generation. */ - def setSeedMapping(seedMapping: LocationToSeedMapping): DataGen = { + def setSeedMapping(seedMapping: LocationToSeedMapping): CommonDataGen = { this.seedMapping = seedMapping this } @@ -629,7 +652,7 @@ abstract class DataGen(var conf: ColumnConf, /** * Set a specific location to seed mapping for the null generation. */ - def setNullMapping(nullMapping: LocationToSeedMapping): DataGen = { + def setNullMapping(nullMapping: LocationToSeedMapping): CommonDataGen = { this.nullMapping = nullMapping this } @@ -638,7 +661,7 @@ abstract class DataGen(var conf: ColumnConf, * Set a specific LengthGeneratorFunction to use. This will only be used if * the datatype needs a length. */ - def setLengthGen(lengthGen: LengthGeneratorFunction): DataGen = { + def setLengthGen(lengthGen: LengthGeneratorFunction): CommonDataGen = { this.lengthGen = lengthGen this } @@ -646,25 +669,30 @@ abstract class DataGen(var conf: ColumnConf, /** * Set the length generation to be a fixed length. */ - def setLength(len: Int): DataGen = { + def setLength(len: Int): CommonDataGen = { this.lengthGen = FixedLengthGeneratorFunction(len) this } - def setLength(minLen: Int, maxLen: Int) = { + def setLength(minLen: Int, maxLen: Int): CommonDataGen = { this.lengthGen = VarLengthGeneratorFunction(minLen, maxLen) this } + def setGaussianLength(mean: Double, stdDev: Double): CommonDataGen = { + this.lengthGen = StdDevLengthGen(mean, stdDev) + this + } + /** * Add this column to a specific correlated key group. This should not be * called directly by users. */ def setCorrelatedKeyGroup(keyGroup: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): DataGen = { + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): CommonDataGen = { conf = conf.forCorrelatedKeyGroup(keyGroup) - .forSeedRange(minSeed, maxSeed) + .forSeedRange(minSeed, maxSeed) this.seedMapping = seedMapping this } @@ -672,7 +700,7 @@ abstract class DataGen(var conf: ColumnConf, /** * Set a range of seed values that should be returned by the LocationToSeedMapping */ - def setSeedRange(min: Long, max: Long): DataGen = { + def setSeedRange(min: Long, max: Long): CommonDataGen = { conf = conf.forSeedRange(min, max) this } @@ -681,7 +709,7 @@ abstract class DataGen(var conf: ColumnConf, * Get the default value generator for this specific data gen. */ protected def getValGen: GeneratorFunction - def children: Seq[(String, DataGen)] + def children: Seq[(String, CommonDataGen)] /** * Get the final ready to use GeneratorFunction for the data generator. 
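Aside: the `StdDevLengthGen` added earlier in this patch draws lengths from a Gaussian rather than a fixed or uniform range. A minimal standalone sketch of the same sampling step, assuming only `scala.util.Random` (the `sampleLength` helper name is illustrative, not part of the patch):

```scala
import scala.util.Random

// Mirror StdDevLengthGen.apply: take a standard normal sample g, scale and
// shift it to mean + g * stdDev, round, and clamp at zero so the result is
// always a usable length.
def sampleLength(mean: Double, stdDev: Double, r: Random): Int = {
  val g = r.nextGaussian() // mean 0.0, stddev 1.0
  math.max(0, math.round(mean + g * stdDev).toInt)
}

val r = new Random(42L)
val samples = Seq.fill(5)(sampleLength(10.0, 3.0, r)) // lengths clustered near 10
```

As the patch's own comment notes, a small mean paired with a large stddev clamps many samples to 0 and skews the distribution, but the generator honors the request anyway.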
@@ -690,8 +718,8 @@ abstract class DataGen(var conf: ColumnConf, val sm = seedMapping.withColumnConf(conf) val lg = lengthGen.withLocationToSeedMapping(sm) var valGen = userProvidedValueGen.getOrElse(getValGen) - .withLocationToSeedMapping(sm) - .withLengthGeneratorFunction(lg) + .withLocationToSeedMapping(sm) + .withLengthGeneratorFunction(lg) valueRange.foreach { case (min, max) => valGen = valGen.withValueRange(min, max) @@ -700,35 +728,75 @@ abstract class DataGen(var conf: ColumnConf, val nullColConf = conf.forNulls val nm = nullMapping.withColumnConf(nullColConf) userProvidedNullGen.get - .withWrapped(valGen) - .withLocationToSeedMapping(nm) + .withWrapped(valGen) + .withLocationToSeedMapping(nm) } else { valGen } } - /** - * Get the data type for this column - */ - def dataType: DataType - /** * Is this column nullable or not. */ def nullable: Boolean = conf.nullable /** - * Get a child column for a given name, if it has one. + * Get a child for a given name, if it has one. */ - final def apply(name: String): DataGen = { + final def apply(name: String): CommonDataGen = { get(name).getOrElse{ throw new IllegalStateException(s"Could not find a child $name for $this") } } - def get(name: String): Option[DataGen] = None + def get(name: String): Option[CommonDataGen] = None +} + + +/** + * Base class for generating a column/sub-column. This holds configuration + * for the column, and handles what is needed to convert it into GeneratorFunction + */ +abstract class DataGen( + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)], + seedMapping: LocationToSeedMapping = FlatDistribution(), + nullMapping: LocationToSeedMapping = FlatDistribution(), + lengthGen: LengthGeneratorFunction = FixedLengthGeneratorFunction(10)) extends + CommonDataGen(conf, defaultValueRange, seedMapping, nullMapping, lengthGen) { + + /** + * Get the data type for this column + */ + def dataType: DataType + + override def get(name: String): Option[DataGen] = None + + def getSubstringGen: Option[SubstringDataGen] = None + + def substringGen: SubstringDataGen = + getSubstringGen.getOrElse( + throw new IllegalArgumentException("substring data gen was not set")) + + def setSubstringGen(f : ColumnConf => SubstringDataGen): Unit = + setSubstringGen(Option(f(conf.forNextSubstring))) + + def setSubstringGen(subgen: Option[SubstringDataGen]): Unit = + throw new IllegalArgumentException("substring data gens can only be set for a STRING") } +/** + * Base class for generating a sub-string. This holds configuration + * for the substring, and handles what is needed to convert it into a GeneratorFunction + */ +abstract class SubstringDataGen( + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)], + seedMapping: LocationToSeedMapping = FlatDistribution(), + nullMapping: LocationToSeedMapping = FlatDistribution(), + lengthGen: LengthGeneratorFunction = FixedLengthGeneratorFunction(10)) extends + CommonDataGen(conf, defaultValueRange, seedMapping, nullMapping, lengthGen) {} + /** * A special GeneratorFunction that just returns the computed seed. This is helpful for * debugging distributions or if you want long values without any abstraction in between. 
@@ -1494,155 +1562,866 @@ class FloatGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) override def children: Seq[(String, DataGen)] = Seq.empty } -trait JSONType { - def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit -} +case class JsonPathElement(name: String, is_array: Boolean) +case class JsonLevel(path: Array[JsonPathElement], data_type: String, length: Int, value: String) {} + +object JsonColumnStats { + private def printHelp(): Unit = { + println("JSON Fingerprinting Tool:") + println("PARAMS: ") + println(" is a path to a Spark dataframe to read in") + println(" is a path in a Spark file system to write out fingerprint data to.") + println() + println("OPTIONS:") + println(" --json= where is the name of a top level String column") + println(" --anon= where is a SEED used to anonymize the JSON keys ") + println(" and column names.") + println(" --input_format= where is parquet or ORC. Defaults to parquet.") + println(" --overwrite to enable overwriting the fingerprint output.") + println(" --debug to enable some debug information to be printed out") + println(" --help to print out this help message") + println() + } + + def main(args: Array[String]): Unit = { + var inputPath = Option.empty[String] + var outputPath = Option.empty[String] + val jsonColumns = ArrayBuffer.empty[String] + var anonSeed = Option.empty[Long] + var debug = false + var argsDone = false + var format = "parquet" + var overwrite = false + + args.foreach { + case a if !argsDone && a.startsWith("--json=") => + jsonColumns += a.substring("--json=".length) + case a if !argsDone && a.startsWith("--anon=") => + anonSeed = Some(a.substring("--anon=".length).toLong) + case a if !argsDone && a.startsWith("--input_format=") => + format = a.substring("--input_format=".length).toLowerCase(java.util.Locale.US) + case "--overwrite" if !argsDone => + overwrite = true + case "--debug" if !argsDone => + debug = true + case "--help" if !argsDone => + printHelp() + System.exit(0) + case "--" if !argsDone => + argsDone = true + case a if !argsDone && a.startsWith("--") => // "--" was covered above already + println(s"ERROR $a is not a supported argument") + printHelp() + System.exit(-1) + case a if inputPath.isEmpty => + inputPath = Some(a) + case a if outputPath.isEmpty => + outputPath = Some(a) + case a => + println(s"ERROR only two arguments are supported. 
Found $a") + printHelp() + System.exit(-1) + } + if (outputPath.isEmpty) { + println("ERROR both an inputPath and an outputPath are required") + printHelp() + System.exit(-1) + } + + val spark = SparkSession.builder.getOrCreate() + spark.sparkContext.setLogLevel("WARN") + + val df = spark.read.format(format).load(inputPath.get) + jsonColumns.foreach { column => + val fp = fingerPrint(df, df(column), anonSeed) + val name = anonSeed.map(s => anonymizeString(column, s)).getOrElse(column) + val fullOutPath = s"${outputPath.get}/$name" + var writer = fp.write + if (overwrite) { + writer = writer.mode("overwrite") + } + if (debug) { + anonSeed.foreach { s => + println(s"Keys and columns will be anonymized with seed $s") + } + println(s"Writing $column fingerprint to $fullOutPath") + spark.time(writer.parquet(fullOutPath)) + println(s"Wrote ${spark.read.parquet(fullOutPath).count} rows") + spark.read.parquet(fullOutPath).show() + } else { + writer.parquet(fullOutPath) + } + } + } -object JSONType { - def selectType(depth: Int, - maxDepth: Int, - r: Random): JSONType = { - val toSelectFrom = if (depth < maxDepth) { - Seq(QuotedJSONString, JSONLong, JSONDouble, JSONArray, JSONObject) - } else { - Seq(QuotedJSONString, JSONLong, JSONDouble) - } - val index = r.nextInt(toSelectFrom.length) - toSelectFrom(index) - } -} - -object QuotedJSONString extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - val strValue = r.nextString(r.nextInt(maxStringLength + 1)) - .replace("\\", "\\\\") - .replace("\"", "\\\"") - .replace("\n", "\\n") - .replace("\r", "\\r") - .replace("\b", "\\b") - .replace("\f", "\\f") - sb.append('"') - sb.append(strValue) - sb.append('"') - } -} - -object JSONLong extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - sb.append(r.nextLong()) - } -} - -object JSONDouble extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - sb.append(r.nextDouble() * 4096.0) - } -} - -object JSONArray extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - val childType = JSONType.selectType(depth, maxDepth, r) - val length = r.nextInt(maxArrayLength + 1) - sb.append("[") + case class JsonNodeStats(count: Long, meanLen: Double, stdDevLength: Double, dc: Long) + + class JsonNode() { + private val forDataType = + mutable.HashMap[String, (JsonNodeStats, mutable.HashMap[String, JsonNode])]() + + def getChild(name: String, isArray: Boolean): JsonNode = { + val dt = if (isArray) { "ARRAY" } else { "OBJECT" } + val typed = forDataType.getOrElse(dt, + throw new IllegalArgumentException(s"$dt is not a set data type yet.")) + typed._2.getOrElse(name, + throw new IllegalArgumentException(s"$name is not a child when the type is $dt")) + } + + def contains(name: String, isArray: Boolean): Boolean = { + val dt = if (isArray) { "ARRAY" } else { "OBJECT" } + forDataType.get(dt).exists { children => + children._2.contains(name) + } + } + + def addChild(name: String, isArray: Boolean): JsonNode = { + val 
dt = if (isArray) { "ARRAY" } else { "OBJECT" } + val found = forDataType.getOrElse(dt, + throw new IllegalArgumentException(s"$dt was not already added as a data type")) + if (found._2.contains(name)) { + throw new IllegalArgumentException(s"$dt already has a child named $name") + } + val node = new JsonNode() + found._2.put(name, node) + node + } + + def addChoice(dt: String, stats: JsonNodeStats): Unit = { + if (forDataType.contains(dt)) { + throw new IllegalArgumentException(s"$dt was already added as a data type") + } + forDataType.put(dt, (stats, new mutable.HashMap[String, JsonNode]())) + } + + override def toString: String = { + forDataType.toString() + } + + def totalCount: Long = { + forDataType.values.map{ case (stats, _) => stats.count}.sum + } + + private def makeNoChoiceGenRecursive(dt: String, + children: mutable.HashMap[String, JsonNode], + cc: ColumnConf): (SubstringDataGen, ColumnConf) = { + var c = cc + val ret = dt match { + case "LONG" => new JSONLongGen(c) + case "DOUBLE" => new JSONDoubleGen(c) + case "BOOLEAN" => new JSONBoolGen(c) + case "NULL" => new JSONNullGen(false, c) + case "VALUE_NULL" => new JSONNullGen(true, c) + case "ERROR" => new JSONErrorGen(c) + case "STRING" => new JSONStringGen(c) + case "ARRAY" => + val child = if (children.isEmpty) { + // A corner case, we will just make it a BOOL column and it will be ignored + val tmp = new JSONBoolGen(c) + c = c.forNextSubstring + tmp + } else { + val tmp = children.values.head.makeGenRecursive(c) + c = tmp._2 + tmp._1 + } + new JSONArrayGen(child, c) + case "OBJECT" => + val childGens = if (children.isEmpty) { + Seq.empty + } else { + children.toSeq.map { + case (k, node) => + val tmp = node.makeGenRecursive(c) + c = tmp._2 + (k, tmp._1) + } + } + new JSONObjectGen(childGens, c) + case other => + throw new IllegalArgumentException(s"$other is not a leaf node type") + } + (ret, c.forNextSubstring) + } + + private def makeGenRecursive(cc: ColumnConf): (SubstringDataGen, ColumnConf) = { + var c = cc + // We are going to recursively walk the tree for all of the values. + if (forDataType.size == 1) { + // We don't need a choice at all. This makes it simpler.. + val (dt, (_, children)) = forDataType.head + makeNoChoiceGenRecursive(dt, children, c) + } else { + val totalSum = forDataType.map(f => f._2._1.count).sum.toDouble + var runningSum = 0L + val allChoices = ArrayBuffer[(Double, String, SubstringDataGen)]() + forDataType.foreach { + case (dt, (stats, children)) => + val tmp = makeNoChoiceGenRecursive(dt, children, c) + c = tmp._2 + runningSum += stats.count + allChoices.append((runningSum/totalSum, dt, tmp._1)) + } + + val ret = new JSONChoiceGen(allChoices.toSeq, c) + (ret, c.forNextSubstring) + } + } + + def makeGen(cc: ColumnConf): SubstringDataGen = { + val (ret, _) = makeGenRecursive(cc) + ret + } + + def setStatsSingle(dg: CommonDataGen, + dt: String, + stats: JsonNodeStats, + nullPct: Double): Unit = { + + val includeLength = dt != "OBJECT" && dt != "BOOLEAN" && dt != "NULL" && dt != "VALUE_NULL" + val includeNullPct = nullPct > 0.0 + if (includeLength) { + dg.setGaussianLength(stats.meanLen, stats.stdDevLength) + } + if (includeNullPct) { + dg.setNullProbability(nullPct) + } + dg.setSeedRange(1, stats.dc) + } + + def setStats(dg: CommonDataGen, + parentCount: Option[Long]): Unit = { + // We are going to recursively walk the tree... + if (forDataType.size == 1) { + // We don't need a choice at all. This makes it simpler.. 
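+        // Single-type node: apply the stats straight to this generator (length
+        // distribution, seed range from the distinct count, and a null percent
+        // derived from how often this node appeared vs. its parent), then
+        // recurse into any OBJECT children below.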
+ val (dt, (stats, children)) = forDataType.head + val nullPct = parentCount.map { pc => + (pc - stats.count).toDouble/pc + }.getOrElse(0.0) + setStatsSingle(dg, dt, stats, nullPct) + val myCount = if (dt == "OBJECT") { + Some(totalCount) + } else { + None + } + children.foreach { + case (name, node) => + node.setStats(dg(name), myCount) + } + } else { + // We have choices to make between different types. + // The null percent cannot be calculated for each individual choice + // but is calculated on the group as a whole instead + parentCount.foreach { pc => + val tc = totalCount + val choiceNullPct = (pc - tc).toDouble / pc + if (choiceNullPct > 0.0) { + dg.setNullProbability(choiceNullPct) + } + } + forDataType.foreach { + case (dt, (stats, children)) => + // When there is a choice the name to access it is the data type + val choiceDg = dg(dt) + setStatsSingle(choiceDg, dt, stats, 0.0) + children.foreach { + case (name, node) => + val myCount = if (dt == "OBJECT") { + // Here we only want the count for the OBJECTs + Some(stats.count) + } else { + None + } + node.setStats(choiceDg(name), myCount) + } + } + } + } + } + + private lazy val jsonFactory = new JsonFactoryBuilder() + // The two options below enabled for Hive compatibility + .enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS) + .enable(JsonReadFeature.ALLOW_SINGLE_QUOTES) + .build() + + private def processNext(parser: JsonParser, + currentPath: ArrayBuffer[JsonPathElement], + output: ArrayBuffer[JsonLevel]): Unit = { + parser.currentToken() match { + case JsonToken.START_OBJECT => + parser.nextToken() + while (parser.currentToken() != JsonToken.END_OBJECT) { + processNext(parser, currentPath, output) + } + output.append(JsonLevel(currentPath.toArray, "OBJECT", 0, "")) + parser.nextToken() + case JsonToken.START_ARRAY => + currentPath.append(JsonPathElement("data", is_array = true)) + parser.nextToken() + var length = 0 + while (parser.currentToken() != JsonToken.END_ARRAY) { + length += 1 + processNext(parser, currentPath, output) + } + currentPath.remove(currentPath.length - 1) + output.append(JsonLevel(currentPath.toArray, "ARRAY", length, "")) + parser.nextToken() + case JsonToken.FIELD_NAME => + currentPath.append(JsonPathElement(parser.getCurrentName, is_array = false)) + parser.nextToken() + processNext(parser, currentPath, output) + currentPath.remove(currentPath.length - 1) + case JsonToken.VALUE_NUMBER_INT => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "LONG", length, parser.getValueAsString)) + parser.nextToken() + case JsonToken.VALUE_NUMBER_FLOAT => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "DOUBLE", length, parser.getValueAsString)) + parser.nextToken() + case JsonToken.VALUE_TRUE | JsonToken.VALUE_FALSE => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "BOOLEAN", length, parser.getValueAsString)) + parser.nextToken() + case JsonToken.VALUE_NULL | null => + output.append(JsonLevel(currentPath.toArray, "VALUE_NULL", 4, "NULL")) + parser.nextToken() + case JsonToken.VALUE_STRING => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "STRING", length, parser.getValueAsString)) + parser.nextToken() + case other => + throw new IllegalStateException(s"DON'T KNOW HOW TO DEAL WITH $other") + } + } + + def jsonStatsUdf(json: String): Array[JsonLevel] = { + val output = 
new ArrayBuffer[JsonLevel]() + try { + val currentPath = new ArrayBuffer[JsonPathElement]() + if (json == null) { + output.append(JsonLevel(Array.empty, "NULL", 0, "")) + } else { + val parser = jsonFactory.createParser(json) + try { + parser.nextToken() + processNext(parser, currentPath, output) + } finally { + parser.close() + } + } + } catch { + case _: com.fasterxml.jackson.core.JsonParseException => + output.clear() + output.append(JsonLevel(Array.empty, "ERROR", json.getBytes("UTF-8").length, json)) + } + output.toArray + } + + private lazy val extractPaths = udf(json => jsonStatsUdf(json)) + + def anonymizeString(str: String, seed: Long): String = { + val length = str.length + val data = new Array[Byte](length) + val hash = XXH64.hashLong(str.hashCode, seed) + val r = new Random() + r.setSeed(hash) (0 until length).foreach { i => - if (i > 0) { - sb.append(",") + val tmp = r.nextInt(16) + data(i) = (tmp + 'A').toByte + } + new String(data) + } + + private lazy val anonPath = udf((str, seed) => anonymizeString(str, seed)) + + def anonymizeFingerPrint(df: DataFrame, anonSeed: Long): DataFrame = { + df.withColumn("tmp", transform(col("path"), + o => { + val name = o("name") + val isArray = o("is_array") + val anon = anonPath(name, lit(anonSeed)) + val newName = when(isArray, name).otherwise(anon).alias("name") + struct(newName, isArray) + })) + .drop("path").withColumnRenamed("tmp", "path") + .orderBy("path", "dt") + .selectExpr("path", "dt","c","mean_len","stddev_len","distinct","version") + } + + def fingerPrint(df: DataFrame, column: Column, anonymize: Option[Long] = None): DataFrame = { + val ret = df.select(extractPaths(column).alias("paths")) + .selectExpr("explode_outer(paths) as p") + .selectExpr("p.path as path", "p.data_type as dt", "p.length as len", "p.value as value") + .groupBy(col("path"), col("dt")).agg( + count(lit(1)).alias("c"), + avg(col("len")).alias("mean_len"), + coalesce(stddev(col("len")), lit(0.0)).alias("stddev_len"), + approx_count_distinct(col("value")).alias("distinct")) + .orderBy("path", "dt").withColumn("version", lit("0.1")) + .selectExpr("path", "dt","c","mean_len","stddev_len","distinct","version") + + anonymize.map { anonSeed => + anonymizeFingerPrint(ret, anonSeed) + }.getOrElse(ret) + } + + def apply(aggForColumn: DataFrame, genColumn: ColumnGen): Unit = + apply(aggForColumn, genColumn.dataGen) + + private val expectedSchema = StructType.fromDDL( + "path ARRAY>," + + "dt STRING," + + "c BIGINT," + + "mean_len DOUBLE," + + "stddev_len DOUBLE," + + "distinct BIGINT," + + "version STRING") + + def apply(aggForColumn: DataFrame, gen: DataGen): Unit = { + val aggData = aggForColumn.orderBy("path", "dt").collect() + val rootNode: JsonNode = new JsonNode() + assert(aggData.length > 0) + val schema = aggData.head.schema + assert(schema.length == expectedSchema.length) + schema.fields.zip(expectedSchema.fields).foreach { + case(found, expected) => + assert(found.name == expected.name) + // TODO we can worry about the exact types later if we need to + } + assert(aggData.head.getString(6) == "0.1") + aggData.foreach { row => + val fullPath = row.getAs[mutable.WrappedArray[Row]](0) + val parsedPath = fullPath.map(r => (r.getString(0), r.getBoolean(1))).toList + val dt = row.getString(1) + val count = row.getLong(2) + val meanLen = row.getDouble(3) + val stdLen = row.getDouble(4) + val dc = row.getLong(5) + + val stats = JsonNodeStats(count, meanLen, stdLen, dc) + var currentNode = rootNode + // Find everything up to the last path element + if 
(parsedPath.length > 1) { + parsedPath.slice(0, parsedPath.length - 1).foreach { + case (name, isArray) => + currentNode = currentNode.getChild(name, isArray) + } + } + + if (parsedPath.nonEmpty) { + // For the last path element (that is not the root element) we might need to add it + // as a child + val (name, isArray) = parsedPath.last + if (!currentNode.contains(name, isArray)) { + currentNode.addChild(name, isArray) + } + currentNode = currentNode.getChild(name, isArray) } - childType.appendRandomValue(sb, i, maxStringLength, maxArrayLength, maxObjectLength, - depth + 1, maxDepth, r) + currentNode.addChoice(dt, stats) } - sb.append("]") + + gen.setSubstringGen(cc => rootNode.makeGen(cc)) + rootNode.setStats(gen.substringGen, None) } } -object JSONObject extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - val length = r.nextInt(maxObjectLength) + 1 - sb.append("{") - (0 until length).foreach { i => - if (i > 0) { - sb.append(",") + +case class JSONStringGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = lengthGen(rowLoc) + val r = DataGen.getRandomFor(rowLoc, mapping) + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + // Value range is 32 (Space) to 126 (~) + buffer(at) = (r.nextInt(126 - 31) + 32).toByte + at += 1 + } + val strVal = new String(buffer, 0, len) + .replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\b", "\\b") + .replace("\f", "\\f") + '"' + strVal + '"' + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONStringGenFunc = + JSONStringGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONStringGenFunc = + JSONStringGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONStringGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONStringGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONLongGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = math.max(lengthGen(rowLoc), 1) // We need at least 1 long for a valid value + val r = DataGen.getRandomFor(rowLoc, mapping) + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + if (at == 0) { + // No leading 0's + buffer(at) = (r.nextInt(9) + '1').toByte + } else { + buffer(at) = (r.nextInt(10) + '0').toByte } - sb.append("\"key_") - sb.append(i) - sb.append("_") - sb.append(depth ) - sb.append("\":") - val childType = JSONType.selectType(depth, maxDepth, r) - childType.appendRandomValue(sb, i, maxStringLength, maxArrayLength, maxObjectLength, - depth + 1, maxDepth, r) + at += 1 } - sb.append("}") + new String(buffer, 0, len) } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONLongGenFunc = + JSONLongGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: 
LocationToSeedMapping): JSONLongGenFunc = + JSONLongGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") } -case class JSONGenFunc( - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - maxDepth: Int, - lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { +class JSONLongGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONLongGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONDoubleGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { override def apply(rowLoc: RowLocation): Any = { + val len = math.max(lengthGen(rowLoc), 3) // We have to have at least 3 chars NUM.NUM val r = DataGen.getRandomFor(rowLoc, mapping) - val sb = new StringBuilder() - JSONObject.appendRandomValue(sb, 0, maxStringLength, maxArrayLength, maxObjectLength, - 0, maxDepth, r) - // For now I am going to have some hard coded keys - UTF8String.fromString(sb.toString()) + val beforeLen = if (len == 3) { 1 } else { r.nextInt(len - 3) + 1 } + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + if (at == 0) { + // No leading 0's + buffer(at) = (r.nextInt(9) + '1').toByte + } else if (at == beforeLen) { + buffer(at) = '.' + } else { + buffer(at) = (r.nextInt(10) + '0').toByte + } + at += 1 + } + UTF8String.fromBytes(buffer, 0, len) } - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): GeneratorFunction = - JSONGenFunc(maxStringLength, maxArrayLength, maxObjectLength, maxDepth, lengthGen, mapping) + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONDoubleGenFunc = + JSONDoubleGenFunc(lengthGen, mapping) - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): GeneratorFunction = - JSONGenFunc(maxStringLength, maxArrayLength, maxObjectLength, maxDepth, lengthGen, mapping) + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONDoubleGenFunc = + JSONDoubleGenFunc(lengthGen, mapping) override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for strings") + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONDoubleGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONDoubleGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONBoolGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val r = DataGen.getRandomFor(rowLoc, mapping) + val ret = if (r.nextBoolean()) "true" else "false" + UTF8String.fromString(ret) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONBoolGenFunc = + JSONBoolGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONBoolGenFunc = + JSONBoolGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + 
throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONBoolGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONBoolGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONNullGenFunc(nullAsString: Boolean, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = + if (nullAsString) { + UTF8String.fromString("null") + } else { + null + } + + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONNullGenFunc = + JSONNullGenFunc(nullAsString, lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONNullGenFunc = + JSONNullGenFunc(nullAsString, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONNullGen(nullAsString: Boolean, + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONNullGenFunc(nullAsString) + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONErrorGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = lengthGen(rowLoc) + val r = DataGen.getRandomFor(rowLoc, mapping) + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + // Value range is 32 (Space) to 126 (~) + // But it is almost impossible to show up as valid JSON + buffer(at) = (r.nextInt(126 - 31) + 32).toByte + at += 1 + } + UTF8String.fromBytes(buffer, 0, len) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONErrorGenFunc = + JSONErrorGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONErrorGenFunc = + JSONErrorGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONErrorGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONErrorGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONArrayGenFunc(child: GeneratorFunction, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = lengthGen(rowLoc) + val data = new Array[String](len) + val childRowLoc = rowLoc.withNewChild() + var i = 0 + while (i < len) { + childRowLoc.setLastChildIndex(i) + val v = child(childRowLoc) + if (v == null) { + // A null in an array must look like "null" + data(i) = "null" + } else { + data(i) = v.toString + } + i += 1 + } + val ret = data.mkString("[", ",", "]") + UTF8String.fromString(ret) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONArrayGenFunc = + JSONArrayGenFunc(child, lengthGen, mapping) + + override def 
withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONArrayGenFunc = + JSONArrayGenFunc(child, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONArrayGen(child: SubstringDataGen, + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override def setCorrelatedKeyGroup(keyGroup: Long, + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): SubstringDataGen = { + super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + child.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + this + } + + override protected def getValGen: GeneratorFunction = JSONArrayGenFunc(child.getGen) + + override def get(name: String): Option[SubstringDataGen] = { + if ("data".equalsIgnoreCase(name) || "child".equalsIgnoreCase(name)) { + Some(child) + } else { + None + } + } + + override def children: Seq[(String, SubstringDataGen)] = Seq(("data", child)) +} + +case class JSONObjectGenFunc(childGens: Array[(String, GeneratorFunction)], + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + override def apply(rowLoc: RowLocation): Any = { + // TODO randomize the order of the children??? + // TODO duplicate child values??? + // The row location does not change for a struct/object + val data = childGens.map { + case (k, gen) => + val key = k.replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\b", "\\b") + .replace("\f", "\\f") + val v = gen.apply(rowLoc) + if (v == null) { + "" + } else { + '"' + key + "\":" + v + } + } + val ret = data.filterNot(_.isEmpty).mkString("{",",","}") + UTF8String.fromString(ret) + } + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONObjectGenFunc = + JSONObjectGenFunc(childGens, lengthGen, mapping) + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONObjectGenFunc = + JSONObjectGenFunc(childGens, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONObjectGen(val children: Seq[(String, SubstringDataGen)], + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override def setCorrelatedKeyGroup(keyGroup: Long, + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): SubstringDataGen = { + super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + children.foreach { + case (_, gen) => + gen.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + } + this + } + + override def get(name: String): Option[SubstringDataGen] = + children.collectFirst { + case (childName, dataGen) if childName.equalsIgnoreCase(name) => dataGen + } + + override protected def getValGen: GeneratorFunction = { + val childGens = children.map(c => (c._1, c._2.getGen)).toArray + JSONObjectGenFunc(childGens) + } +} + +case class JSONChoiceGenFunc(choices: List[(Double, GeneratorFunction)], + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + override def apply(rowLoc: RowLocation): Any = { + val r = DataGen.getRandomFor(rowLoc, mapping) + val l = r.nextDouble() + var index = 0 + while 
(choices(index)._1 < l) { + index += 1 + } + val childRowLoc = rowLoc.withNewChild() + choices(index)._2(childRowLoc) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONChoiceGenFunc = + JSONChoiceGenFunc(choices, lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONChoiceGenFunc = + JSONChoiceGenFunc(choices, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONChoiceGen(val choices: Seq[(Double, String, SubstringDataGen)], + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override val children: Seq[(String, SubstringDataGen)] = + choices.map { case (_, name, gen) => (name, gen) } + + override def setCorrelatedKeyGroup(keyGroup: Long, + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): SubstringDataGen = { + super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + children.foreach { + case (_, gen) => + gen.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + } + this + } + + override def get(name: String): Option[SubstringDataGen] = + children.collectFirst { + case (childName, dataGen) if childName.equalsIgnoreCase(name) => dataGen + } + + override protected def getValGen: GeneratorFunction = { + val childGens = choices.map(c => (c._1, c._3.getGen)).toList + JSONChoiceGenFunc(childGens) + } } case class ASCIIGenFunc( @@ -1672,14 +2451,46 @@ case class ASCIIGenFunc( throw new IllegalArgumentException("value ranges are not supported for strings") } -class StringGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) - extends DataGen(conf, defaultValueRange) { +/** + * This is here to wrap the substring gen function so that its length/settings + * are the ones used when generating a string, and not what was set for the string. + */ +case class SubstringGenFunc( + substringGen: GeneratorFunction, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + substringGen(rowLoc) + } + + // The length and location seed mapping are just ignored for this... 
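+  // (The substring generator carries its own length and seed settings, e.g.
+  // the ones derived from a JSON fingerprint, so honoring the parent STRING
+  // column's settings here would distort that distribution.)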
+ override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): GeneratorFunction = + this + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): GeneratorFunction = + this + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for strings") +} + +class StringGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)], + var substringDataGen: Option[SubstringDataGen] = None) + extends DataGen(conf, defaultValueRange) { override def dataType: DataType = StringType - override protected def getValGen: GeneratorFunction = ASCIIGenFunc() + override protected def getValGen: GeneratorFunction = + substringDataGen.map(s => SubstringGenFunc(s.getGen)).getOrElse(ASCIIGenFunc()) override def children: Seq[(String, DataGen)] = Seq.empty + + override def setSubstringGen(subgen: Option[SubstringDataGen]): Unit = + substringDataGen = subgen + + override def getSubstringGen: Option[SubstringDataGen] = substringDataGen } case class StructGenFunc(childGens: Array[GeneratorFunction]) extends GeneratorFunction { @@ -1854,7 +2665,6 @@ class MapGen(key: DataGen, override def children: Seq[(String, DataGen)] = Seq(("key", key), ("value", value)) } - object ColumnGen { private def genInternal(rowNumber: Column, dataType: DataType, @@ -1869,8 +2679,8 @@ object ColumnGen { */ class ColumnGen(val dataGen: DataGen) { def setCorrelatedKeyGroup(kg: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): ColumnGen = { + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): ColumnGen = { dataGen.setCorrelatedKeyGroup(kg, minSeed, maxSeed, seedMapping) this } @@ -1930,6 +2740,11 @@ class ColumnGen(val dataGen: DataGen) { this } + def setGaussianLength(mean: Double, stdDev: Double): ColumnGen = { + dataGen.setGaussianLength(mean, stdDev) + this + } + final def apply(name: String): DataGen = { get(name).getOrElse { throw new IllegalArgumentException(s"$name not a child of $this") @@ -1941,8 +2756,16 @@ class ColumnGen(val dataGen: DataGen) { def gen(rowNumber: Column): Column = { ColumnGen.genInternal(rowNumber, dataGen.dataType, dataGen.nullable, dataGen.getGen) } + + def getSubstring: Option[SubstringDataGen] = dataGen.getSubstringGen + + def substringGen: SubstringDataGen = dataGen.substringGen + + def setSubstringGen(f : ColumnConf => SubstringDataGen): Unit = + dataGen.setSubstringGen(f) } + sealed trait KeyGroupType /** @@ -2192,7 +3015,7 @@ object DBGen { numRows: Long, mapping: OrderedTypeMapping): Seq[(String, ColumnGen)] = { // a bit of a hack with the column num so that we update it before each time... 
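    // (columnNum starts at -1 below so the first forNextColumn() call advances
    // it to 0; the new substringNum component simply starts at 0.)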
- var conf = ColumnConf(ColumnLocation(tableId, -1), true, numRows) + var conf = ColumnConf(ColumnLocation(tableId, -1, 0), true, numRows) st.toArray.map { sf => if (!mapping.canMap(sf.dataType, mapping)) { throw new IllegalArgumentException(s"$sf is not supported at this time") From 73d76cfade9758f7f47357b59daa1dc5fe2743fa Mon Sep 17 00:00:00 2001 From: Feng Jiang <106386742+Feng-Jiang28@users.noreply.github.com> Date: Thu, 13 Jun 2024 13:23:38 +0800 Subject: [PATCH 02/10] Concat() Exception bug fix (#11039) * concat_null_bug_fix Signed-off-by: fejiang * concat_null_bug_fix Signed-off-by: fejiang * Setting modified Signed-off-by: fejiang * remove comment Signed-off-by: fejiang * concat considered as empty string Signed-off-by: fejiang --------- Signed-off-by: fejiang --- .../org/apache/spark/sql/rapids/collectionOperations.scala | 4 ++-- .../apache/spark/sql/rapids/utils/RapidsTestSettings.scala | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala index 7f0a82517c3..41c2e5e3776 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala @@ -49,8 +49,8 @@ case class GpuConcat(children: Seq[Expression]) extends GpuComplexTypeMergingExp override def columnarEval(batch: ColumnarBatch): GpuColumnVector = { val res = dataType match { - // Explicitly return null for empty concat as Spark, since cuDF doesn't support empty concat. - case dt if children.isEmpty => GpuScalar.from(null, dt) + // in Spark concat() will be considered as an empty string here + case dt if children.isEmpty => GpuScalar("", dt) // For single column concat, we pass the result of child node to avoid extra cuDF call. case _ if children.length == 1 => children.head.columnarEval(batch) case StringType => stringConcat(batch) diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index ad93c4dd2e9..4cf155041d9 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -72,7 +72,6 @@ class RapidsTestSettings extends BackendTestSettings { enableSuite[RapidsMathFunctionsSuite] enableSuite[RapidsRegexpExpressionsSuite] enableSuite[RapidsStringExpressionsSuite] - .exclude("concat", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("string substring_index function", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22498: Concat should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22549: ConcatWs should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) From 2bc5ab6ad633b3d4a436d38786009579ef7465da Mon Sep 17 00:00:00 2001 From: Tim Liu Date: Thu, 13 Jun 2024 18:10:26 +0800 Subject: [PATCH 03/10] Add spark343 shim for scala2.13 dist jar (#11052) We missed spark343 shim for the scala2.13 dist jar on branch-24.06. 
Add scala2.13 spark343 shim for v24.06.0 Signed-off-by: Tim Liu --- pom.xml | 1 + scala2.13/pom.xml | 1 + 2 files changed, 2 insertions(+) diff --git a/pom.xml b/pom.xml index 942905ea427..c16a64e1d65 100644 --- a/pom.xml +++ b/pom.xml @@ -886,6 +886,7 @@ 340, 341, 342, + 343, 350, 351 diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 783aa38cf50..db00f77fe16 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -886,6 +886,7 @@ 340, 341, 342, + 343, 350, 351 From 4da4d4a6041077f1fd759de499de8c7f41f7af80 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 13 Jun 2024 18:25:01 +0800 Subject: [PATCH 04/10] Update latest changelog [skip ci] (#11056) Update change log with CLI: \n\n scripts/generate-changelog --token= --releases=24.04,24.06 Signed-off-by: jenkins Co-authored-by: jenkins --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d534d9398b6..788fed0a98f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ # Change log -Generated on 2024-06-10 +Generated on 2024-06-13 ## Release 24.06 @@ -48,6 +48,8 @@ Generated on 2024-06-10 ### PRs ||| |:---|:---| +|[#11052](https://github.com/NVIDIA/spark-rapids/pull/11052)|Add spark343 shim for scala2.13 dist jar| +|[#10981](https://github.com/NVIDIA/spark-rapids/pull/10981)|Update latest changelog [skip ci]| |[#10984](https://github.com/NVIDIA/spark-rapids/pull/10984)|[DOC] Update docs for 24.06.0 release [skip ci]| |[#10974](https://github.com/NVIDIA/spark-rapids/pull/10974)|Update rapids JNI and private dependency to 24.06.0| |[#10947](https://github.com/NVIDIA/spark-rapids/pull/10947)|Prevent contains-PrefixRange optimization if not preceded by wildcards| From cfd8f0023806412b4b0526453cf4798b12fa66d9 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Thu, 13 Jun 2024 09:07:32 -0500 Subject: [PATCH 05/10] Revert "Add in the ability to fingerprint JSON columns (#11002)" This reverts commit d9686d4dba4afd6e2c061fd2822d40d9a55d88d4. 
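Aside: PATCH 05 mechanically undoes PATCH 01, so its hunks mirror the earlier ones. For reference, a hedged sketch of how the reverted fingerprinting API was meant to be driven, based only on the signatures PATCH 01 added (the paths, the `payload` column name, and the `???` column-generator lookup are placeholders):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.tests.datagen.{ColumnGen, JsonColumnStats}

val spark = SparkSession.builder.getOrCreate()

// 1) Fingerprint a JSON string column and persist the aggregate
//    (JsonColumnStats.fingerPrint is the entry point PATCH 01 defines).
val df = spark.read.parquet("/data/input")
val fp = JsonColumnStats.fingerPrint(df, df("payload"))
fp.write.mode("overwrite").parquet("/data/fp/payload")

// 2) Later, feed the stored fingerprint into a STRING column's generator
//    through JsonColumnStats.apply(DataFrame, ColumnGen).
val agg = spark.read.parquet("/data/fp/payload")
val payloadGen: ColumnGen = ??? // obtained from the DBGen table definition
JsonColumnStats(agg, payloadGen)
```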
--- .../spark/sql/tests/datagen/bigDataGen.scala | 1181 +++-------------- 1 file changed, 179 insertions(+), 1002 deletions(-) diff --git a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala index 14e0d4e0970..91335afe4e6 100644 --- a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala +++ b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala @@ -16,22 +16,21 @@ package org.apache.spark.sql.tests.datagen -import com.fasterxml.jackson.core.{JsonFactoryBuilder, JsonParser, JsonToken} -import com.fasterxml.jackson.core.json.JsonReadFeature import java.math.{BigDecimal => JavaBigDecimal} import java.sql.{Date, Timestamp} import java.time.{Duration, Instant, LocalDate, LocalDateTime} import java.util + import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.math.BigDecimal.RoundingMode import scala.util.Random -import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} +import org.apache.spark.sql.{Column, DataFrame, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, XXH64} import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils} -import org.apache.spark.sql.functions.{approx_count_distinct, avg, coalesce, col, count, lit, stddev, struct, transform, udf, when} +import org.apache.spark.sql.functions.col import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.random.XORShiftRandom @@ -80,28 +79,22 @@ class RowLocation(val rowNum: Long, val subRows: Array[Int] = null) { * hash. This makes the generated data correlated for all column/child columns. * @param tableNum a unique ID for the table this is a part of. * @param columnNum the location of the column in the data being generated - * @param substringNum the location of the substring column * @param correlatedKeyGroup the correlated key group this column is a part of, if any. */ -case class ColumnLocation(tableNum: Int, - columnNum: Int, - substringNum: Int, - correlatedKeyGroup: Option[Long] = None) { - def forNextColumn(): ColumnLocation = ColumnLocation(tableNum, columnNum + 1, 0) +case class ColumnLocation(tableNum: Int, columnNum: Int, correlatedKeyGroup: Option[Long] = None) { + def forNextColumn(): ColumnLocation = ColumnLocation(tableNum, columnNum + 1) - def forNextSubstring: ColumnLocation = ColumnLocation(tableNum, columnNum, substringNum + 1) /** * Create a new ColumnLocation that is specifically for a given key group */ def forCorrelatedKeyGroup(keyGroup: Long): ColumnLocation = - ColumnLocation(tableNum, columnNum, substringNum, Some(keyGroup)) + ColumnLocation(tableNum, columnNum, Some(keyGroup)) /** * Hash the location into a single long value. */ - lazy val hashLoc: Long = XXH64.hashLong(tableNum, - correlatedKeyGroup.getOrElse(XXH64.hashLong(columnNum, substringNum))) + lazy val hashLoc: Long = XXH64.hashLong(tableNum, correlatedKeyGroup.getOrElse(columnNum)) } /** @@ -122,9 +115,6 @@ case class ColumnConf(columnLoc: ColumnLocation, def forNextColumn(nullable: Boolean): ColumnConf = ColumnConf(columnLoc.forNextColumn(), nullable, numTableRows) - def forNextSubstring: ColumnConf = - ColumnConf(columnLoc.forNextSubstring, nullable = true, numTableRows) - /** * Create a new configuration based on this, but for a given correlated key group. 
*/ @@ -313,23 +303,6 @@ case class VarLengthGeneratorFunction(minLength: Int, maxLength: Int) extends } } -case class StdDevLengthGen(mean: Double, - stdDev: Double, - mapping: LocationToSeedMapping = null) extends - LengthGeneratorFunction { - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): LengthGeneratorFunction = - StdDevLengthGen(mean, stdDev, mapping) - - override def apply(rowLoc: RowLocation): Int = { - val r = DataGen.getRandomFor(rowLoc, mapping) - val g = r.nextGaussian() // g has a mean of 0 and a stddev of 1.0 - val adjusted = mean + (g * stdDev) - // If the range of seed is too small compared to the stddev and mean we will - // end up with an invalid distribution, but they asked for it. - math.max(0, math.round(adjusted).toInt) - } -} - /** * Generate nulls with a given probability. * @param prob 0.0 to 1.0 for how often nulls should appear in the output. @@ -589,8 +562,11 @@ case class DataGenExpr(child: Expression, } } -abstract class CommonDataGen( - var conf: ColumnConf, +/** + * Base class for generating a column/sub-column. This holds configuration for the column, + * and handles what is needed to convert it into GeneratorFunction + */ +abstract class DataGen(var conf: ColumnConf, defaultValueRange: Option[(Any, Any)], var seedMapping: LocationToSeedMapping = FlatDistribution(), var nullMapping: LocationToSeedMapping = FlatDistribution(), @@ -600,25 +576,26 @@ abstract class CommonDataGen( protected var valueRange: Option[(Any, Any)] = defaultValueRange /** - * Set a value range + * Set a value range for this data gen. */ - def setValueRange(min: Any, max: Any): CommonDataGen = { + def setValueRange(min: Any, max: Any): DataGen = { valueRange = Some((min, max)) this } /** - * Set a custom GeneratorFunction + * Set a custom GeneratorFunction to use for this column. */ - def setValueGen(f: GeneratorFunction): CommonDataGen = { + def setValueGen(f: GeneratorFunction): DataGen = { userProvidedValueGen = Some(f) this } /** - * Set a NullGeneratorFunction + * Set a NullGeneratorFunction for this column. This will not be used + * if the column is not nullable. */ - def setNullGen(f: NullGeneratorFunction): CommonDataGen = { + def setNullGen(f: NullGeneratorFunction): DataGen = { this.userProvidedNullGen = Some(f) this } @@ -627,12 +604,12 @@ abstract class CommonDataGen( * Set the probability of a null appearing in the output. The probability should be * 0.0 to 1.0. */ - def setNullProbability(probability: Double): CommonDataGen = { + def setNullProbability(probability: Double): DataGen = { this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability)) this } - def setNullProbabilityRecursively(probability: Double): CommonDataGen = { + def setNullProbabilityRecursively(probability: Double): DataGen = { this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability)) children.foreach { case (_, dataGen) => @@ -644,7 +621,7 @@ abstract class CommonDataGen( /** * Set a specific location to seed mapping for the value generation. */ - def setSeedMapping(seedMapping: LocationToSeedMapping): CommonDataGen = { + def setSeedMapping(seedMapping: LocationToSeedMapping): DataGen = { this.seedMapping = seedMapping this } @@ -652,7 +629,7 @@ abstract class CommonDataGen( /** * Set a specific location to seed mapping for the null generation. 
*/ - def setNullMapping(nullMapping: LocationToSeedMapping): CommonDataGen = { + def setNullMapping(nullMapping: LocationToSeedMapping): DataGen = { this.nullMapping = nullMapping this } @@ -661,7 +638,7 @@ abstract class CommonDataGen( * Set a specific LengthGeneratorFunction to use. This will only be used if * the datatype needs a length. */ - def setLengthGen(lengthGen: LengthGeneratorFunction): CommonDataGen = { + def setLengthGen(lengthGen: LengthGeneratorFunction): DataGen = { this.lengthGen = lengthGen this } @@ -669,30 +646,25 @@ abstract class CommonDataGen( /** * Set the length generation to be a fixed length. */ - def setLength(len: Int): CommonDataGen = { + def setLength(len: Int): DataGen = { this.lengthGen = FixedLengthGeneratorFunction(len) this } - def setLength(minLen: Int, maxLen: Int): CommonDataGen = { + def setLength(minLen: Int, maxLen: Int) = { this.lengthGen = VarLengthGeneratorFunction(minLen, maxLen) this } - def setGaussianLength(mean: Double, stdDev: Double): CommonDataGen = { - this.lengthGen = StdDevLengthGen(mean, stdDev) - this - } - /** * Add this column to a specific correlated key group. This should not be * called directly by users. */ def setCorrelatedKeyGroup(keyGroup: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): CommonDataGen = { + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): DataGen = { conf = conf.forCorrelatedKeyGroup(keyGroup) - .forSeedRange(minSeed, maxSeed) + .forSeedRange(minSeed, maxSeed) this.seedMapping = seedMapping this } @@ -700,7 +672,7 @@ abstract class CommonDataGen( /** * Set a range of seed values that should be returned by the LocationToSeedMapping */ - def setSeedRange(min: Long, max: Long): CommonDataGen = { + def setSeedRange(min: Long, max: Long): DataGen = { conf = conf.forSeedRange(min, max) this } @@ -709,7 +681,7 @@ abstract class CommonDataGen( * Get the default value generator for this specific data gen. */ protected def getValGen: GeneratorFunction - def children: Seq[(String, CommonDataGen)] + def children: Seq[(String, DataGen)] /** * Get the final ready to use GeneratorFunction for the data generator. @@ -718,8 +690,8 @@ abstract class CommonDataGen( val sm = seedMapping.withColumnConf(conf) val lg = lengthGen.withLocationToSeedMapping(sm) var valGen = userProvidedValueGen.getOrElse(getValGen) - .withLocationToSeedMapping(sm) - .withLengthGeneratorFunction(lg) + .withLocationToSeedMapping(sm) + .withLengthGeneratorFunction(lg) valueRange.foreach { case (min, max) => valGen = valGen.withValueRange(min, max) @@ -728,75 +700,35 @@ abstract class CommonDataGen( val nullColConf = conf.forNulls val nm = nullMapping.withColumnConf(nullColConf) userProvidedNullGen.get - .withWrapped(valGen) - .withLocationToSeedMapping(nm) + .withWrapped(valGen) + .withLocationToSeedMapping(nm) } else { valGen } } + /** + * Get the data type for this column + */ + def dataType: DataType + /** * Is this column nullable or not. */ def nullable: Boolean = conf.nullable /** - * Get a child for a given name, if it has one. + * Get a child column for a given name, if it has one. */ - final def apply(name: String): CommonDataGen = { + final def apply(name: String): DataGen = { get(name).getOrElse{ throw new IllegalStateException(s"Could not find a child $name for $this") } } - def get(name: String): Option[CommonDataGen] = None -} - - -/** - * Base class for generating a column/sub-column. 
This holds configuration - * for the column, and handles what is needed to convert it into GeneratorFunction - */ -abstract class DataGen( - conf: ColumnConf, - defaultValueRange: Option[(Any, Any)], - seedMapping: LocationToSeedMapping = FlatDistribution(), - nullMapping: LocationToSeedMapping = FlatDistribution(), - lengthGen: LengthGeneratorFunction = FixedLengthGeneratorFunction(10)) extends - CommonDataGen(conf, defaultValueRange, seedMapping, nullMapping, lengthGen) { - - /** - * Get the data type for this column - */ - def dataType: DataType - - override def get(name: String): Option[DataGen] = None - - def getSubstringGen: Option[SubstringDataGen] = None - - def substringGen: SubstringDataGen = - getSubstringGen.getOrElse( - throw new IllegalArgumentException("substring data gen was not set")) - - def setSubstringGen(f : ColumnConf => SubstringDataGen): Unit = - setSubstringGen(Option(f(conf.forNextSubstring))) - - def setSubstringGen(subgen: Option[SubstringDataGen]): Unit = - throw new IllegalArgumentException("substring data gens can only be set for a STRING") + def get(name: String): Option[DataGen] = None } -/** - * Base class for generating a sub-string. This holds configuration - * for the substring, and handles what is needed to convert it into a GeneratorFunction - */ -abstract class SubstringDataGen( - conf: ColumnConf, - defaultValueRange: Option[(Any, Any)], - seedMapping: LocationToSeedMapping = FlatDistribution(), - nullMapping: LocationToSeedMapping = FlatDistribution(), - lengthGen: LengthGeneratorFunction = FixedLengthGeneratorFunction(10)) extends - CommonDataGen(conf, defaultValueRange, seedMapping, nullMapping, lengthGen) {} - /** * A special GeneratorFunction that just returns the computed seed. This is helpful for * debugging distributions or if you want long values without any abstraction in between. @@ -1562,866 +1494,155 @@ class FloatGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) override def children: Seq[(String, DataGen)] = Seq.empty } -case class JsonPathElement(name: String, is_array: Boolean) -case class JsonLevel(path: Array[JsonPathElement], data_type: String, length: Int, value: String) {} - -object JsonColumnStats { - private def printHelp(): Unit = { - println("JSON Fingerprinting Tool:") - println("PARAMS: ") - println(" is a path to a Spark dataframe to read in") - println(" is a path in a Spark file system to write out fingerprint data to.") - println() - println("OPTIONS:") - println(" --json= where is the name of a top level String column") - println(" --anon= where is a SEED used to anonymize the JSON keys ") - println(" and column names.") - println(" --input_format= where is parquet or ORC. 
Defaults to parquet.") - println(" --overwrite to enable overwriting the fingerprint output.") - println(" --debug to enable some debug information to be printed out") - println(" --help to print out this help message") - println() - } - - def main(args: Array[String]): Unit = { - var inputPath = Option.empty[String] - var outputPath = Option.empty[String] - val jsonColumns = ArrayBuffer.empty[String] - var anonSeed = Option.empty[Long] - var debug = false - var argsDone = false - var format = "parquet" - var overwrite = false - - args.foreach { - case a if !argsDone && a.startsWith("--json=") => - jsonColumns += a.substring("--json=".length) - case a if !argsDone && a.startsWith("--anon=") => - anonSeed = Some(a.substring("--anon=".length).toLong) - case a if !argsDone && a.startsWith("--input_format=") => - format = a.substring("--input_format=".length).toLowerCase(java.util.Locale.US) - case "--overwrite" if !argsDone => - overwrite = true - case "--debug" if !argsDone => - debug = true - case "--help" if !argsDone => - printHelp() - System.exit(0) - case "--" if !argsDone => - argsDone = true - case a if !argsDone && a.startsWith("--") => // "--" was covered above already - println(s"ERROR $a is not a supported argument") - printHelp() - System.exit(-1) - case a if inputPath.isEmpty => - inputPath = Some(a) - case a if outputPath.isEmpty => - outputPath = Some(a) - case a => - println(s"ERROR only two arguments are supported. Found $a") - printHelp() - System.exit(-1) - } - if (outputPath.isEmpty) { - println("ERROR both an inputPath and an outputPath are required") - printHelp() - System.exit(-1) - } - - val spark = SparkSession.builder.getOrCreate() - spark.sparkContext.setLogLevel("WARN") - - val df = spark.read.format(format).load(inputPath.get) - jsonColumns.foreach { column => - val fp = fingerPrint(df, df(column), anonSeed) - val name = anonSeed.map(s => anonymizeString(column, s)).getOrElse(column) - val fullOutPath = s"${outputPath.get}/$name" - var writer = fp.write - if (overwrite) { - writer = writer.mode("overwrite") - } - if (debug) { - anonSeed.foreach { s => - println(s"Keys and columns will be anonymized with seed $s") - } - println(s"Writing $column fingerprint to $fullOutPath") - spark.time(writer.parquet(fullOutPath)) - println(s"Wrote ${spark.read.parquet(fullOutPath).count} rows") - spark.read.parquet(fullOutPath).show() - } else { - writer.parquet(fullOutPath) - } - } - } - - case class JsonNodeStats(count: Long, meanLen: Double, stdDevLength: Double, dc: Long) - - class JsonNode() { - private val forDataType = - mutable.HashMap[String, (JsonNodeStats, mutable.HashMap[String, JsonNode])]() - - def getChild(name: String, isArray: Boolean): JsonNode = { - val dt = if (isArray) { "ARRAY" } else { "OBJECT" } - val typed = forDataType.getOrElse(dt, - throw new IllegalArgumentException(s"$dt is not a set data type yet.")) - typed._2.getOrElse(name, - throw new IllegalArgumentException(s"$name is not a child when the type is $dt")) - } - - def contains(name: String, isArray: Boolean): Boolean = { - val dt = if (isArray) { "ARRAY" } else { "OBJECT" } - forDataType.get(dt).exists { children => - children._2.contains(name) - } - } - - def addChild(name: String, isArray: Boolean): JsonNode = { - val dt = if (isArray) { "ARRAY" } else { "OBJECT" } - val found = forDataType.getOrElse(dt, - throw new IllegalArgumentException(s"$dt was not already added as a data type")) - if (found._2.contains(name)) { - throw new IllegalArgumentException(s"$dt already has a child named 
$name") - } - val node = new JsonNode() - found._2.put(name, node) - node - } - - def addChoice(dt: String, stats: JsonNodeStats): Unit = { - if (forDataType.contains(dt)) { - throw new IllegalArgumentException(s"$dt was already added as a data type") - } - forDataType.put(dt, (stats, new mutable.HashMap[String, JsonNode]())) - } - - override def toString: String = { - forDataType.toString() - } - - def totalCount: Long = { - forDataType.values.map{ case (stats, _) => stats.count}.sum - } - - private def makeNoChoiceGenRecursive(dt: String, - children: mutable.HashMap[String, JsonNode], - cc: ColumnConf): (SubstringDataGen, ColumnConf) = { - var c = cc - val ret = dt match { - case "LONG" => new JSONLongGen(c) - case "DOUBLE" => new JSONDoubleGen(c) - case "BOOLEAN" => new JSONBoolGen(c) - case "NULL" => new JSONNullGen(false, c) - case "VALUE_NULL" => new JSONNullGen(true, c) - case "ERROR" => new JSONErrorGen(c) - case "STRING" => new JSONStringGen(c) - case "ARRAY" => - val child = if (children.isEmpty) { - // A corner case, we will just make it a BOOL column and it will be ignored - val tmp = new JSONBoolGen(c) - c = c.forNextSubstring - tmp - } else { - val tmp = children.values.head.makeGenRecursive(c) - c = tmp._2 - tmp._1 - } - new JSONArrayGen(child, c) - case "OBJECT" => - val childGens = if (children.isEmpty) { - Seq.empty - } else { - children.toSeq.map { - case (k, node) => - val tmp = node.makeGenRecursive(c) - c = tmp._2 - (k, tmp._1) - } - } - new JSONObjectGen(childGens, c) - case other => - throw new IllegalArgumentException(s"$other is not a leaf node type") - } - (ret, c.forNextSubstring) - } - - private def makeGenRecursive(cc: ColumnConf): (SubstringDataGen, ColumnConf) = { - var c = cc - // We are going to recursively walk the tree for all of the values. - if (forDataType.size == 1) { - // We don't need a choice at all. This makes it simpler.. - val (dt, (_, children)) = forDataType.head - makeNoChoiceGenRecursive(dt, children, c) - } else { - val totalSum = forDataType.map(f => f._2._1.count).sum.toDouble - var runningSum = 0L - val allChoices = ArrayBuffer[(Double, String, SubstringDataGen)]() - forDataType.foreach { - case (dt, (stats, children)) => - val tmp = makeNoChoiceGenRecursive(dt, children, c) - c = tmp._2 - runningSum += stats.count - allChoices.append((runningSum/totalSum, dt, tmp._1)) - } - - val ret = new JSONChoiceGen(allChoices.toSeq, c) - (ret, c.forNextSubstring) - } - } - - def makeGen(cc: ColumnConf): SubstringDataGen = { - val (ret, _) = makeGenRecursive(cc) - ret - } - - def setStatsSingle(dg: CommonDataGen, - dt: String, - stats: JsonNodeStats, - nullPct: Double): Unit = { - - val includeLength = dt != "OBJECT" && dt != "BOOLEAN" && dt != "NULL" && dt != "VALUE_NULL" - val includeNullPct = nullPct > 0.0 - if (includeLength) { - dg.setGaussianLength(stats.meanLen, stats.stdDevLength) - } - if (includeNullPct) { - dg.setNullProbability(nullPct) - } - dg.setSeedRange(1, stats.dc) - } - - def setStats(dg: CommonDataGen, - parentCount: Option[Long]): Unit = { - // We are going to recursively walk the tree... - if (forDataType.size == 1) { - // We don't need a choice at all. This makes it simpler.. 
- val (dt, (stats, children)) = forDataType.head - val nullPct = parentCount.map { pc => - (pc - stats.count).toDouble/pc - }.getOrElse(0.0) - setStatsSingle(dg, dt, stats, nullPct) - val myCount = if (dt == "OBJECT") { - Some(totalCount) - } else { - None - } - children.foreach { - case (name, node) => - node.setStats(dg(name), myCount) - } - } else { - // We have choices to make between different types. - // The null percent cannot be calculated for each individual choice - // but is calculated on the group as a whole instead - parentCount.foreach { pc => - val tc = totalCount - val choiceNullPct = (pc - tc).toDouble / pc - if (choiceNullPct > 0.0) { - dg.setNullProbability(choiceNullPct) - } - } - forDataType.foreach { - case (dt, (stats, children)) => - // When there is a choice the name to access it is the data type - val choiceDg = dg(dt) - setStatsSingle(choiceDg, dt, stats, 0.0) - children.foreach { - case (name, node) => - val myCount = if (dt == "OBJECT") { - // Here we only want the count for the OBJECTs - Some(stats.count) - } else { - None - } - node.setStats(choiceDg(name), myCount) - } - } - } - } - } - - private lazy val jsonFactory = new JsonFactoryBuilder() - // The two options below enabled for Hive compatibility - .enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS) - .enable(JsonReadFeature.ALLOW_SINGLE_QUOTES) - .build() - - private def processNext(parser: JsonParser, - currentPath: ArrayBuffer[JsonPathElement], - output: ArrayBuffer[JsonLevel]): Unit = { - parser.currentToken() match { - case JsonToken.START_OBJECT => - parser.nextToken() - while (parser.currentToken() != JsonToken.END_OBJECT) { - processNext(parser, currentPath, output) - } - output.append(JsonLevel(currentPath.toArray, "OBJECT", 0, "")) - parser.nextToken() - case JsonToken.START_ARRAY => - currentPath.append(JsonPathElement("data", is_array = true)) - parser.nextToken() - var length = 0 - while (parser.currentToken() != JsonToken.END_ARRAY) { - length += 1 - processNext(parser, currentPath, output) - } - currentPath.remove(currentPath.length - 1) - output.append(JsonLevel(currentPath.toArray, "ARRAY", length, "")) - parser.nextToken() - case JsonToken.FIELD_NAME => - currentPath.append(JsonPathElement(parser.getCurrentName, is_array = false)) - parser.nextToken() - processNext(parser, currentPath, output) - currentPath.remove(currentPath.length - 1) - case JsonToken.VALUE_NUMBER_INT => - val length = parser.getValueAsString.getBytes("UTF-8").length - output.append(JsonLevel(currentPath.toArray, "LONG", length, parser.getValueAsString)) - parser.nextToken() - case JsonToken.VALUE_NUMBER_FLOAT => - val length = parser.getValueAsString.getBytes("UTF-8").length - output.append(JsonLevel(currentPath.toArray, "DOUBLE", length, parser.getValueAsString)) - parser.nextToken() - case JsonToken.VALUE_TRUE | JsonToken.VALUE_FALSE => - val length = parser.getValueAsString.getBytes("UTF-8").length - output.append(JsonLevel(currentPath.toArray, "BOOLEAN", length, parser.getValueAsString)) - parser.nextToken() - case JsonToken.VALUE_NULL | null => - output.append(JsonLevel(currentPath.toArray, "VALUE_NULL", 4, "NULL")) - parser.nextToken() - case JsonToken.VALUE_STRING => - val length = parser.getValueAsString.getBytes("UTF-8").length - output.append(JsonLevel(currentPath.toArray, "STRING", length, parser.getValueAsString)) - parser.nextToken() - case other => - throw new IllegalStateException(s"DON'T KNOW HOW TO DEAL WITH $other") - } - } - - def jsonStatsUdf(json: String): Array[JsonLevel] = { - val output = 
new ArrayBuffer[JsonLevel]()
-    try {
-      val currentPath = new ArrayBuffer[JsonPathElement]()
-      if (json == null) {
-        output.append(JsonLevel(Array.empty, "NULL", 0, ""))
-      } else {
-        val parser = jsonFactory.createParser(json)
-        try {
-          parser.nextToken()
-          processNext(parser, currentPath, output)
-        } finally {
-          parser.close()
-        }
-      }
-    } catch {
-      case _: com.fasterxml.jackson.core.JsonParseException =>
-        output.clear()
-        output.append(JsonLevel(Array.empty, "ERROR", json.getBytes("UTF-8").length, json))
-    }
-    output.toArray
-  }
-
-  private lazy val extractPaths = udf(json => jsonStatsUdf(json))
-
-  def anonymizeString(str: String, seed: Long): String = {
-    val length = str.length
-    val data = new Array[Byte](length)
-    val hash = XXH64.hashLong(str.hashCode, seed)
-    val r = new Random()
-    r.setSeed(hash)
-    (0 until length).foreach { i =>
-      val tmp = r.nextInt(16)
-      data(i) = (tmp + 'A').toByte
-    }
-    new String(data)
-  }
-
-  private lazy val anonPath = udf((str, seed) => anonymizeString(str, seed))
-
-  def anonymizeFingerPrint(df: DataFrame, anonSeed: Long): DataFrame = {
-    df.withColumn("tmp", transform(col("path"),
-      o => {
-        val name = o("name")
-        val isArray = o("is_array")
-        val anon = anonPath(name, lit(anonSeed))
-        val newName = when(isArray, name).otherwise(anon).alias("name")
-        struct(newName, isArray)
-      }))
-      .drop("path").withColumnRenamed("tmp", "path")
-      .orderBy("path", "dt")
-      .selectExpr("path", "dt","c","mean_len","stddev_len","distinct","version")
-  }
-
-  def fingerPrint(df: DataFrame, column: Column, anonymize: Option[Long] = None): DataFrame = {
-    val ret = df.select(extractPaths(column).alias("paths"))
-      .selectExpr("explode_outer(paths) as p")
-      .selectExpr("p.path as path", "p.data_type as dt", "p.length as len", "p.value as value")
-      .groupBy(col("path"), col("dt")).agg(
-        count(lit(1)).alias("c"),
-        avg(col("len")).alias("mean_len"),
-        coalesce(stddev(col("len")), lit(0.0)).alias("stddev_len"),
-        approx_count_distinct(col("value")).alias("distinct"))
-      .orderBy("path", "dt").withColumn("version", lit("0.1"))
-      .selectExpr("path", "dt","c","mean_len","stddev_len","distinct","version")
-
-    anonymize.map { anonSeed =>
-      anonymizeFingerPrint(ret, anonSeed)
-    }.getOrElse(ret)
-  }
-
-  def apply(aggForColumn: DataFrame, genColumn: ColumnGen): Unit =
-    apply(aggForColumn, genColumn.dataGen)
-
-  private val expectedSchema = StructType.fromDDL(
-    "path ARRAY<STRUCT<name: STRING, is_array: BOOLEAN>>," +
-      "dt STRING," +
-      "c BIGINT," +
-      "mean_len DOUBLE," +
-      "stddev_len DOUBLE," +
-      "distinct BIGINT," +
-      "version STRING")
-
-  def apply(aggForColumn: DataFrame, gen: DataGen): Unit = {
-    val aggData = aggForColumn.orderBy("path", "dt").collect()
-    val rootNode: JsonNode = new JsonNode()
-    assert(aggData.length > 0)
-    val schema = aggData.head.schema
-    assert(schema.length == expectedSchema.length)
-    schema.fields.zip(expectedSchema.fields).foreach {
-      case(found, expected) =>
-        assert(found.name == expected.name)
-        // TODO we can worry about the exact types later if we need to
-    }
-    assert(aggData.head.getString(6) == "0.1")
-    aggData.foreach { row =>
-      val fullPath = row.getAs[mutable.WrappedArray[Row]](0)
-      val parsedPath = fullPath.map(r => (r.getString(0), r.getBoolean(1))).toList
-      val dt = row.getString(1)
-      val count = row.getLong(2)
-      val meanLen = row.getDouble(3)
-      val stdLen = row.getDouble(4)
-      val dc = row.getLong(5)
-
-      val stats = JsonNodeStats(count, meanLen, stdLen, dc)
-      var currentNode = rootNode
-      // Find everything up to the last path element
-      if (parsedPath.length > 1) {
parsedPath.slice(0, parsedPath.length - 1).foreach { - case (name, isArray) => - currentNode = currentNode.getChild(name, isArray) - } - } - - if (parsedPath.nonEmpty) { - // For the last path element (that is not the root element) we might need to add it - // as a child - val (name, isArray) = parsedPath.last - if (!currentNode.contains(name, isArray)) { - currentNode.addChild(name, isArray) - } - currentNode = currentNode.getChild(name, isArray) - } - currentNode.addChoice(dt, stats) - } - - gen.setSubstringGen(cc => rootNode.makeGen(cc)) - rootNode.setStats(gen.substringGen, None) - } -} - - -case class JSONStringGenFunc(lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { - - override def apply(rowLoc: RowLocation): Any = { - val len = lengthGen(rowLoc) - val r = DataGen.getRandomFor(rowLoc, mapping) - val buffer = new Array[Byte](len) - var at = 0 - while (at < len) { - // Value range is 32 (Space) to 126 (~) - buffer(at) = (r.nextInt(126 - 31) + 32).toByte - at += 1 - } - val strVal = new String(buffer, 0, len) - .replace("\\", "\\\\") - .replace("\"", "\\\"") - .replace("\n", "\\n") - .replace("\r", "\\r") - .replace("\b", "\\b") - .replace("\f", "\\f") - '"' + strVal + '"' - } - - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONStringGenFunc = - JSONStringGenFunc(lengthGen, mapping) - - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONStringGenFunc = - JSONStringGenFunc(lengthGen, mapping) - - override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for JSON") -} - -class JSONStringGen(conf: ColumnConf, - defaultValueRange: Option[(Any, Any)] = None) - extends SubstringDataGen(conf, defaultValueRange) { - - override protected def getValGen: GeneratorFunction = JSONStringGenFunc() - - override def children: Seq[(String, SubstringDataGen)] = Seq.empty -} - -case class JSONLongGenFunc(lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { - - override def apply(rowLoc: RowLocation): Any = { - val len = math.max(lengthGen(rowLoc), 1) // We need at least 1 long for a valid value - val r = DataGen.getRandomFor(rowLoc, mapping) - val buffer = new Array[Byte](len) - var at = 0 - while (at < len) { - if (at == 0) { - // No leading 0's - buffer(at) = (r.nextInt(9) + '1').toByte - } else { - buffer(at) = (r.nextInt(10) + '0').toByte - } - at += 1 - } - new String(buffer, 0, len) - } - - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONLongGenFunc = - JSONLongGenFunc(lengthGen, mapping) - - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONLongGenFunc = - JSONLongGenFunc(lengthGen, mapping) - - override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for JSON") -} - -class JSONLongGen(conf: ColumnConf, - defaultValueRange: Option[(Any, Any)] = None) - extends SubstringDataGen(conf, defaultValueRange) { - - override protected def getValGen: GeneratorFunction = JSONLongGenFunc() - - override def children: Seq[(String, SubstringDataGen)] = Seq.empty -} - -case class JSONDoubleGenFunc(lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { - - override def apply(rowLoc: RowLocation): Any = { - val len = math.max(lengthGen(rowLoc), 3) // We have to have 
at least 3 chars NUM.NUM - val r = DataGen.getRandomFor(rowLoc, mapping) - val beforeLen = if (len == 3) { 1 } else { r.nextInt(len - 3) + 1 } - val buffer = new Array[Byte](len) - var at = 0 - while (at < len) { - if (at == 0) { - // No leading 0's - buffer(at) = (r.nextInt(9) + '1').toByte - } else if (at == beforeLen) { - buffer(at) = '.' - } else { - buffer(at) = (r.nextInt(10) + '0').toByte - } - at += 1 - } - UTF8String.fromBytes(buffer, 0, len) - } - - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONDoubleGenFunc = - JSONDoubleGenFunc(lengthGen, mapping) - - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONDoubleGenFunc = - JSONDoubleGenFunc(lengthGen, mapping) - - override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for JSON") -} - -class JSONDoubleGen(conf: ColumnConf, - defaultValueRange: Option[(Any, Any)] = None) - extends SubstringDataGen(conf, defaultValueRange) { - - override protected def getValGen: GeneratorFunction = JSONDoubleGenFunc() - - override def children: Seq[(String, SubstringDataGen)] = Seq.empty +trait JSONType { + def appendRandomValue(sb: StringBuilder, + index: Int, + maxStringLength: Int, + maxArrayLength: Int, + maxObjectLength: Int, + depth: Int, + maxDepth: Int, + r: Random): Unit } -case class JSONBoolGenFunc(lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { - - override def apply(rowLoc: RowLocation): Any = { - val r = DataGen.getRandomFor(rowLoc, mapping) - val ret = if (r.nextBoolean()) "true" else "false" - UTF8String.fromString(ret) - } - - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONBoolGenFunc = - JSONBoolGenFunc(lengthGen, mapping) - - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONBoolGenFunc = - JSONBoolGenFunc(lengthGen, mapping) - - override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for JSON") -} - -class JSONBoolGen(conf: ColumnConf, - defaultValueRange: Option[(Any, Any)] = None) - extends SubstringDataGen(conf, defaultValueRange) { - - override protected def getValGen: GeneratorFunction = JSONBoolGenFunc() - - override def children: Seq[(String, SubstringDataGen)] = Seq.empty -} - -case class JSONNullGenFunc(nullAsString: Boolean, - lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { - - override def apply(rowLoc: RowLocation): Any = - if (nullAsString) { - UTF8String.fromString("null") +object JSONType { + def selectType(depth: Int, + maxDepth: Int, + r: Random): JSONType = { + val toSelectFrom = if (depth < maxDepth) { + Seq(QuotedJSONString, JSONLong, JSONDouble, JSONArray, JSONObject) } else { - null - } - - - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONNullGenFunc = - JSONNullGenFunc(nullAsString, lengthGen, mapping) - - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONNullGenFunc = - JSONNullGenFunc(nullAsString, lengthGen, mapping) - - override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for JSON") -} - -class JSONNullGen(nullAsString: Boolean, - conf: ColumnConf, - defaultValueRange: Option[(Any, Any)] = None) - extends SubstringDataGen(conf, defaultValueRange) { - - 
override protected def getValGen: GeneratorFunction = JSONNullGenFunc(nullAsString) - - override def children: Seq[(String, SubstringDataGen)] = Seq.empty -} - -case class JSONErrorGenFunc(lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { - - override def apply(rowLoc: RowLocation): Any = { - val len = lengthGen(rowLoc) - val r = DataGen.getRandomFor(rowLoc, mapping) - val buffer = new Array[Byte](len) - var at = 0 - while (at < len) { - // Value range is 32 (Space) to 126 (~) - // But it is almost impossible to show up as valid JSON - buffer(at) = (r.nextInt(126 - 31) + 32).toByte - at += 1 - } - UTF8String.fromBytes(buffer, 0, len) - } - - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONErrorGenFunc = - JSONErrorGenFunc(lengthGen, mapping) - - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONErrorGenFunc = - JSONErrorGenFunc(lengthGen, mapping) - - override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for JSON") -} - -class JSONErrorGen(conf: ColumnConf, - defaultValueRange: Option[(Any, Any)] = None) - extends SubstringDataGen(conf, defaultValueRange) { - - override protected def getValGen: GeneratorFunction = JSONErrorGenFunc() - - override def children: Seq[(String, SubstringDataGen)] = Seq.empty -} - -case class JSONArrayGenFunc(child: GeneratorFunction, - lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { - - override def apply(rowLoc: RowLocation): Any = { - val len = lengthGen(rowLoc) - val data = new Array[String](len) - val childRowLoc = rowLoc.withNewChild() - var i = 0 - while (i < len) { - childRowLoc.setLastChildIndex(i) - val v = child(childRowLoc) - if (v == null) { - // A null in an array must look like "null" - data(i) = "null" - } else { - data(i) = v.toString + Seq(QuotedJSONString, JSONLong, JSONDouble) + } + val index = r.nextInt(toSelectFrom.length) + toSelectFrom(index) + } +} + +object QuotedJSONString extends JSONType { + override def appendRandomValue(sb: StringBuilder, + index: Int, + maxStringLength: Int, + maxArrayLength: Int, + maxObjectLength: Int, + depth: Int, + maxDepth: Int, + r: Random): Unit = { + val strValue = r.nextString(r.nextInt(maxStringLength + 1)) + .replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\b", "\\b") + .replace("\f", "\\f") + sb.append('"') + sb.append(strValue) + sb.append('"') + } +} + +object JSONLong extends JSONType { + override def appendRandomValue(sb: StringBuilder, + index: Int, + maxStringLength: Int, + maxArrayLength: Int, + maxObjectLength: Int, + depth: Int, + maxDepth: Int, + r: Random): Unit = { + sb.append(r.nextLong()) + } +} + +object JSONDouble extends JSONType { + override def appendRandomValue(sb: StringBuilder, + index: Int, + maxStringLength: Int, + maxArrayLength: Int, + maxObjectLength: Int, + depth: Int, + maxDepth: Int, + r: Random): Unit = { + sb.append(r.nextDouble() * 4096.0) + } +} + +object JSONArray extends JSONType { + override def appendRandomValue(sb: StringBuilder, + index: Int, + maxStringLength: Int, + maxArrayLength: Int, + maxObjectLength: Int, + depth: Int, + maxDepth: Int, + r: Random): Unit = { + val childType = JSONType.selectType(depth, maxDepth, r) + val length = r.nextInt(maxArrayLength + 1) + sb.append("[") + (0 until length).foreach { i => + if (i > 0) { + 
sb.append(",") } - i += 1 + childType.appendRandomValue(sb, i, maxStringLength, maxArrayLength, maxObjectLength, + depth + 1, maxDepth, r) } - val ret = data.mkString("[", ",", "]") - UTF8String.fromString(ret) + sb.append("]") } - - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONArrayGenFunc = - JSONArrayGenFunc(child, lengthGen, mapping) - - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONArrayGenFunc = - JSONArrayGenFunc(child, lengthGen, mapping) - - override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for JSON") } -class JSONArrayGen(child: SubstringDataGen, - conf: ColumnConf, - defaultValueRange: Option[(Any, Any)] = None) - extends SubstringDataGen(conf, defaultValueRange) { - - override def setCorrelatedKeyGroup(keyGroup: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): SubstringDataGen = { - super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) - child.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) - this - } - - override protected def getValGen: GeneratorFunction = JSONArrayGenFunc(child.getGen) - - override def get(name: String): Option[SubstringDataGen] = { - if ("data".equalsIgnoreCase(name) || "child".equalsIgnoreCase(name)) { - Some(child) - } else { - None - } - } - - override def children: Seq[(String, SubstringDataGen)] = Seq(("data", child)) -} - -case class JSONObjectGenFunc(childGens: Array[(String, GeneratorFunction)], - lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { - override def apply(rowLoc: RowLocation): Any = { - // TODO randomize the order of the children??? - // TODO duplicate child values??? 
- // The row location does not change for a struct/object - val data = childGens.map { - case (k, gen) => - val key = k.replace("\\", "\\\\") - .replace("\"", "\\\"") - .replace("\n", "\\n") - .replace("\r", "\\r") - .replace("\b", "\\b") - .replace("\f", "\\f") - val v = gen.apply(rowLoc) - if (v == null) { - "" - } else { - '"' + key + "\":" + v - } +object JSONObject extends JSONType { + override def appendRandomValue(sb: StringBuilder, + index: Int, + maxStringLength: Int, + maxArrayLength: Int, + maxObjectLength: Int, + depth: Int, + maxDepth: Int, + r: Random): Unit = { + val length = r.nextInt(maxObjectLength) + 1 + sb.append("{") + (0 until length).foreach { i => + if (i > 0) { + sb.append(",") + } + sb.append("\"key_") + sb.append(i) + sb.append("_") + sb.append(depth ) + sb.append("\":") + val childType = JSONType.selectType(depth, maxDepth, r) + childType.appendRandomValue(sb, i, maxStringLength, maxArrayLength, maxObjectLength, + depth + 1, maxDepth, r) } - val ret = data.filterNot(_.isEmpty).mkString("{",",","}") - UTF8String.fromString(ret) + sb.append("}") } - - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONObjectGenFunc = - JSONObjectGenFunc(childGens, lengthGen, mapping) - - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONObjectGenFunc = - JSONObjectGenFunc(childGens, lengthGen, mapping) - - override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for JSON") } -class JSONObjectGen(val children: Seq[(String, SubstringDataGen)], - conf: ColumnConf, - defaultValueRange: Option[(Any, Any)] = None) - extends SubstringDataGen(conf, defaultValueRange) { - - override def setCorrelatedKeyGroup(keyGroup: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): SubstringDataGen = { - super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) - children.foreach { - case (_, gen) => - gen.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) - } - this - } - - override def get(name: String): Option[SubstringDataGen] = - children.collectFirst { - case (childName, dataGen) if childName.equalsIgnoreCase(name) => dataGen - } - - override protected def getValGen: GeneratorFunction = { - val childGens = children.map(c => (c._1, c._2.getGen)).toArray - JSONObjectGenFunc(childGens) - } -} +case class JSONGenFunc( + maxStringLength: Int, + maxArrayLength: Int, + maxObjectLength: Int, + maxDepth: Int, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { -case class JSONChoiceGenFunc(choices: List[(Double, GeneratorFunction)], - lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { override def apply(rowLoc: RowLocation): Any = { val r = DataGen.getRandomFor(rowLoc, mapping) - val l = r.nextDouble() - var index = 0 - while (choices(index)._1 < l) { - index += 1 - } - val childRowLoc = rowLoc.withNewChild() - choices(index)._2(childRowLoc) + val sb = new StringBuilder() + JSONObject.appendRandomValue(sb, 0, maxStringLength, maxArrayLength, maxObjectLength, + 0, maxDepth, r) + // For now I am going to have some hard coded keys + UTF8String.fromString(sb.toString()) } - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONChoiceGenFunc = - JSONChoiceGenFunc(choices, lengthGen, mapping) + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): 
GeneratorFunction = + JSONGenFunc(maxStringLength, maxArrayLength, maxObjectLength, maxDepth, lengthGen, mapping) - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONChoiceGenFunc = - JSONChoiceGenFunc(choices, lengthGen, mapping) + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): GeneratorFunction = + JSONGenFunc(maxStringLength, maxArrayLength, maxObjectLength, maxDepth, lengthGen, mapping) override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for JSON") -} - -class JSONChoiceGen(val choices: Seq[(Double, String, SubstringDataGen)], - conf: ColumnConf, - defaultValueRange: Option[(Any, Any)] = None) - extends SubstringDataGen(conf, defaultValueRange) { - - override val children: Seq[(String, SubstringDataGen)] = - choices.map { case (_, name, gen) => (name, gen) } - - override def setCorrelatedKeyGroup(keyGroup: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): SubstringDataGen = { - super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) - children.foreach { - case (_, gen) => - gen.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) - } - this - } - - override def get(name: String): Option[SubstringDataGen] = - children.collectFirst { - case (childName, dataGen) if childName.equalsIgnoreCase(name) => dataGen - } - - override protected def getValGen: GeneratorFunction = { - val childGens = choices.map(c => (c._1, c._3.getGen)).toList - JSONChoiceGenFunc(childGens) - } + throw new IllegalArgumentException("value ranges are not supported for strings") } case class ASCIIGenFunc( @@ -2451,46 +1672,14 @@ case class ASCIIGenFunc( throw new IllegalArgumentException("value ranges are not supported for strings") } -/** - * This is here to wrap the substring gen function so that its length/settings - * are the ones used when generating a string, and not what was set for the string. - */ -case class SubstringGenFunc( - substringGen: GeneratorFunction, - lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { - - override def apply(rowLoc: RowLocation): Any = { - substringGen(rowLoc) - } - - // The length and location seed mapping are just ignored for this... 
- override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): GeneratorFunction = - this - - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): GeneratorFunction = - this - - override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for strings") -} - -class StringGen(conf: ColumnConf, - defaultValueRange: Option[(Any, Any)], - var substringDataGen: Option[SubstringDataGen] = None) - extends DataGen(conf, defaultValueRange) { +class StringGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) + extends DataGen(conf, defaultValueRange) { override def dataType: DataType = StringType - override protected def getValGen: GeneratorFunction = - substringDataGen.map(s => SubstringGenFunc(s.getGen)).getOrElse(ASCIIGenFunc()) + override protected def getValGen: GeneratorFunction = ASCIIGenFunc() override def children: Seq[(String, DataGen)] = Seq.empty - - override def setSubstringGen(subgen: Option[SubstringDataGen]): Unit = - substringDataGen = subgen - - override def getSubstringGen: Option[SubstringDataGen] = substringDataGen } case class StructGenFunc(childGens: Array[GeneratorFunction]) extends GeneratorFunction { @@ -2665,6 +1854,7 @@ class MapGen(key: DataGen, override def children: Seq[(String, DataGen)] = Seq(("key", key), ("value", value)) } + object ColumnGen { private def genInternal(rowNumber: Column, dataType: DataType, @@ -2679,8 +1869,8 @@ object ColumnGen { */ class ColumnGen(val dataGen: DataGen) { def setCorrelatedKeyGroup(kg: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): ColumnGen = { + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): ColumnGen = { dataGen.setCorrelatedKeyGroup(kg, minSeed, maxSeed, seedMapping) this } @@ -2740,11 +1930,6 @@ class ColumnGen(val dataGen: DataGen) { this } - def setGaussianLength(mean: Double, stdDev: Double): ColumnGen = { - dataGen.setGaussianLength(mean, stdDev) - this - } - final def apply(name: String): DataGen = { get(name).getOrElse { throw new IllegalArgumentException(s"$name not a child of $this") @@ -2756,16 +1941,8 @@ class ColumnGen(val dataGen: DataGen) { def gen(rowNumber: Column): Column = { ColumnGen.genInternal(rowNumber, dataGen.dataType, dataGen.nullable, dataGen.getGen) } - - def getSubstring: Option[SubstringDataGen] = dataGen.getSubstringGen - - def substringGen: SubstringDataGen = dataGen.substringGen - - def setSubstringGen(f : ColumnConf => SubstringDataGen): Unit = - dataGen.setSubstringGen(f) } - sealed trait KeyGroupType /** @@ -3015,7 +2192,7 @@ object DBGen { numRows: Long, mapping: OrderedTypeMapping): Seq[(String, ColumnGen)] = { // a bit of a hack with the column num so that we update it before each time... - var conf = ColumnConf(ColumnLocation(tableId, -1, 0), true, numRows) + var conf = ColumnConf(ColumnLocation(tableId, -1), true, numRows) st.toArray.map { sf => if (!mapping.canMap(sf.dataType, mapping)) { throw new IllegalArgumentException(s"$sf is not supported at this time") From 531a9f5e6501baeda67bd12fa223726d5e8c3572 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Thu, 13 Jun 2024 14:22:50 -0500 Subject: [PATCH 06/10] Add in the ability to fingerprint JSON columns [databricks] (#11060) Also fixed issue with databricks dependency not being what we said it was. 
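As a rough usage sketch (the paths, the column name "payload", and the jar
name below are illustrative only, not part of this patch), the fingerprinting
step can be run through the bundled CLI:

    spark-submit --class org.apache.spark.sql.tests.datagen.JsonColumnStats \
      datagen.jar --json=payload /data/events /data/fingerprints

or programmatically, after which the saved aggregate can be applied to a
STRING column of a generated table:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.tests.datagen.JsonColumnStats

    val spark = SparkSession.builder.getOrCreate()
    // 1) Fingerprint a top-level JSON string column from real data.
    val real = spark.read.parquet("/data/events")
    val fp = JsonColumnStats.fingerPrint(real, real("payload"))
    fp.write.mode("overwrite").parquet("/data/fingerprints/payload")

    // 2) Shape a generated STRING column with that fingerprint.
    //    `jsonCol` is assumed to be the ColumnGen of a STRING column
    //    from an existing datagen table definition.
    val agg = spark.read.parquet("/data/fingerprints/payload")
    JsonColumnStats(agg, jsonCol)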
Signed-off-by: Robert (Bobby) Evans --- .../spark/sql/tests/datagen/bigDataGen.scala | 1181 ++++++++++++++--- jenkins/databricks/install_deps.py | 4 +- scala2.13/shim-deps/databricks/pom.xml | 8 +- shim-deps/databricks/pom.xml | 8 +- 4 files changed, 1019 insertions(+), 182 deletions(-) diff --git a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala index 91335afe4e6..14e0d4e0970 100644 --- a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala +++ b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala @@ -16,21 +16,22 @@ package org.apache.spark.sql.tests.datagen +import com.fasterxml.jackson.core.{JsonFactoryBuilder, JsonParser, JsonToken} +import com.fasterxml.jackson.core.json.JsonReadFeature import java.math.{BigDecimal => JavaBigDecimal} import java.sql.{Date, Timestamp} import java.time.{Duration, Instant, LocalDate, LocalDateTime} import java.util - import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.math.BigDecimal.RoundingMode import scala.util.Random -import org.apache.spark.sql.{Column, DataFrame, SparkSession} +import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, XXH64} import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils} -import org.apache.spark.sql.functions.col +import org.apache.spark.sql.functions.{approx_count_distinct, avg, coalesce, col, count, lit, stddev, struct, transform, udf, when} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.random.XORShiftRandom @@ -79,22 +80,28 @@ class RowLocation(val rowNum: Long, val subRows: Array[Int] = null) { * hash. This makes the generated data correlated for all column/child columns. * @param tableNum a unique ID for the table this is a part of. * @param columnNum the location of the column in the data being generated + * @param substringNum the location of the substring column * @param correlatedKeyGroup the correlated key group this column is a part of, if any. */ -case class ColumnLocation(tableNum: Int, columnNum: Int, correlatedKeyGroup: Option[Long] = None) { - def forNextColumn(): ColumnLocation = ColumnLocation(tableNum, columnNum + 1) +case class ColumnLocation(tableNum: Int, + columnNum: Int, + substringNum: Int, + correlatedKeyGroup: Option[Long] = None) { + def forNextColumn(): ColumnLocation = ColumnLocation(tableNum, columnNum + 1, 0) + def forNextSubstring: ColumnLocation = ColumnLocation(tableNum, columnNum, substringNum + 1) /** * Create a new ColumnLocation that is specifically for a given key group */ def forCorrelatedKeyGroup(keyGroup: Long): ColumnLocation = - ColumnLocation(tableNum, columnNum, Some(keyGroup)) + ColumnLocation(tableNum, columnNum, substringNum, Some(keyGroup)) /** * Hash the location into a single long value. 
*/ - lazy val hashLoc: Long = XXH64.hashLong(tableNum, correlatedKeyGroup.getOrElse(columnNum)) + lazy val hashLoc: Long = XXH64.hashLong(tableNum, + correlatedKeyGroup.getOrElse(XXH64.hashLong(columnNum, substringNum))) } /** @@ -115,6 +122,9 @@ case class ColumnConf(columnLoc: ColumnLocation, def forNextColumn(nullable: Boolean): ColumnConf = ColumnConf(columnLoc.forNextColumn(), nullable, numTableRows) + def forNextSubstring: ColumnConf = + ColumnConf(columnLoc.forNextSubstring, nullable = true, numTableRows) + /** * Create a new configuration based on this, but for a given correlated key group. */ @@ -303,6 +313,23 @@ case class VarLengthGeneratorFunction(minLength: Int, maxLength: Int) extends } } +case class StdDevLengthGen(mean: Double, + stdDev: Double, + mapping: LocationToSeedMapping = null) extends + LengthGeneratorFunction { + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): LengthGeneratorFunction = + StdDevLengthGen(mean, stdDev, mapping) + + override def apply(rowLoc: RowLocation): Int = { + val r = DataGen.getRandomFor(rowLoc, mapping) + val g = r.nextGaussian() // g has a mean of 0 and a stddev of 1.0 + val adjusted = mean + (g * stdDev) + // If the range of seed is too small compared to the stddev and mean we will + // end up with an invalid distribution, but they asked for it. + math.max(0, math.round(adjusted).toInt) + } +} + /** * Generate nulls with a given probability. * @param prob 0.0 to 1.0 for how often nulls should appear in the output. @@ -562,11 +589,8 @@ case class DataGenExpr(child: Expression, } } -/** - * Base class for generating a column/sub-column. This holds configuration for the column, - * and handles what is needed to convert it into GeneratorFunction - */ -abstract class DataGen(var conf: ColumnConf, +abstract class CommonDataGen( + var conf: ColumnConf, defaultValueRange: Option[(Any, Any)], var seedMapping: LocationToSeedMapping = FlatDistribution(), var nullMapping: LocationToSeedMapping = FlatDistribution(), @@ -576,26 +600,25 @@ abstract class DataGen(var conf: ColumnConf, protected var valueRange: Option[(Any, Any)] = defaultValueRange /** - * Set a value range for this data gen. + * Set a value range */ - def setValueRange(min: Any, max: Any): DataGen = { + def setValueRange(min: Any, max: Any): CommonDataGen = { valueRange = Some((min, max)) this } /** - * Set a custom GeneratorFunction to use for this column. + * Set a custom GeneratorFunction */ - def setValueGen(f: GeneratorFunction): DataGen = { + def setValueGen(f: GeneratorFunction): CommonDataGen = { userProvidedValueGen = Some(f) this } /** - * Set a NullGeneratorFunction for this column. This will not be used - * if the column is not nullable. + * Set a NullGeneratorFunction */ - def setNullGen(f: NullGeneratorFunction): DataGen = { + def setNullGen(f: NullGeneratorFunction): CommonDataGen = { this.userProvidedNullGen = Some(f) this } @@ -604,12 +627,12 @@ abstract class DataGen(var conf: ColumnConf, * Set the probability of a null appearing in the output. The probability should be * 0.0 to 1.0. 
*/ - def setNullProbability(probability: Double): DataGen = { + def setNullProbability(probability: Double): CommonDataGen = { this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability)) this } - def setNullProbabilityRecursively(probability: Double): DataGen = { + def setNullProbabilityRecursively(probability: Double): CommonDataGen = { this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability)) children.foreach { case (_, dataGen) => @@ -621,7 +644,7 @@ abstract class DataGen(var conf: ColumnConf, /** * Set a specific location to seed mapping for the value generation. */ - def setSeedMapping(seedMapping: LocationToSeedMapping): DataGen = { + def setSeedMapping(seedMapping: LocationToSeedMapping): CommonDataGen = { this.seedMapping = seedMapping this } @@ -629,7 +652,7 @@ abstract class DataGen(var conf: ColumnConf, /** * Set a specific location to seed mapping for the null generation. */ - def setNullMapping(nullMapping: LocationToSeedMapping): DataGen = { + def setNullMapping(nullMapping: LocationToSeedMapping): CommonDataGen = { this.nullMapping = nullMapping this } @@ -638,7 +661,7 @@ abstract class DataGen(var conf: ColumnConf, * Set a specific LengthGeneratorFunction to use. This will only be used if * the datatype needs a length. */ - def setLengthGen(lengthGen: LengthGeneratorFunction): DataGen = { + def setLengthGen(lengthGen: LengthGeneratorFunction): CommonDataGen = { this.lengthGen = lengthGen this } @@ -646,25 +669,30 @@ abstract class DataGen(var conf: ColumnConf, /** * Set the length generation to be a fixed length. */ - def setLength(len: Int): DataGen = { + def setLength(len: Int): CommonDataGen = { this.lengthGen = FixedLengthGeneratorFunction(len) this } - def setLength(minLen: Int, maxLen: Int) = { + def setLength(minLen: Int, maxLen: Int): CommonDataGen = { this.lengthGen = VarLengthGeneratorFunction(minLen, maxLen) this } + def setGaussianLength(mean: Double, stdDev: Double): CommonDataGen = { + this.lengthGen = StdDevLengthGen(mean, stdDev) + this + } + /** * Add this column to a specific correlated key group. This should not be * called directly by users. */ def setCorrelatedKeyGroup(keyGroup: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): DataGen = { + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): CommonDataGen = { conf = conf.forCorrelatedKeyGroup(keyGroup) - .forSeedRange(minSeed, maxSeed) + .forSeedRange(minSeed, maxSeed) this.seedMapping = seedMapping this } @@ -672,7 +700,7 @@ abstract class DataGen(var conf: ColumnConf, /** * Set a range of seed values that should be returned by the LocationToSeedMapping */ - def setSeedRange(min: Long, max: Long): DataGen = { + def setSeedRange(min: Long, max: Long): CommonDataGen = { conf = conf.forSeedRange(min, max) this } @@ -681,7 +709,7 @@ abstract class DataGen(var conf: ColumnConf, * Get the default value generator for this specific data gen. */ protected def getValGen: GeneratorFunction - def children: Seq[(String, DataGen)] + def children: Seq[(String, CommonDataGen)] /** * Get the final ready to use GeneratorFunction for the data generator. 
@@ -690,8 +718,8 @@ abstract class DataGen(var conf: ColumnConf, val sm = seedMapping.withColumnConf(conf) val lg = lengthGen.withLocationToSeedMapping(sm) var valGen = userProvidedValueGen.getOrElse(getValGen) - .withLocationToSeedMapping(sm) - .withLengthGeneratorFunction(lg) + .withLocationToSeedMapping(sm) + .withLengthGeneratorFunction(lg) valueRange.foreach { case (min, max) => valGen = valGen.withValueRange(min, max) @@ -700,35 +728,75 @@ abstract class DataGen(var conf: ColumnConf, val nullColConf = conf.forNulls val nm = nullMapping.withColumnConf(nullColConf) userProvidedNullGen.get - .withWrapped(valGen) - .withLocationToSeedMapping(nm) + .withWrapped(valGen) + .withLocationToSeedMapping(nm) } else { valGen } } - /** - * Get the data type for this column - */ - def dataType: DataType - /** * Is this column nullable or not. */ def nullable: Boolean = conf.nullable /** - * Get a child column for a given name, if it has one. + * Get a child for a given name, if it has one. */ - final def apply(name: String): DataGen = { + final def apply(name: String): CommonDataGen = { get(name).getOrElse{ throw new IllegalStateException(s"Could not find a child $name for $this") } } - def get(name: String): Option[DataGen] = None + def get(name: String): Option[CommonDataGen] = None +} + + +/** + * Base class for generating a column/sub-column. This holds configuration + * for the column, and handles what is needed to convert it into GeneratorFunction + */ +abstract class DataGen( + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)], + seedMapping: LocationToSeedMapping = FlatDistribution(), + nullMapping: LocationToSeedMapping = FlatDistribution(), + lengthGen: LengthGeneratorFunction = FixedLengthGeneratorFunction(10)) extends + CommonDataGen(conf, defaultValueRange, seedMapping, nullMapping, lengthGen) { + + /** + * Get the data type for this column + */ + def dataType: DataType + + override def get(name: String): Option[DataGen] = None + + def getSubstringGen: Option[SubstringDataGen] = None + + def substringGen: SubstringDataGen = + getSubstringGen.getOrElse( + throw new IllegalArgumentException("substring data gen was not set")) + + def setSubstringGen(f : ColumnConf => SubstringDataGen): Unit = + setSubstringGen(Option(f(conf.forNextSubstring))) + + def setSubstringGen(subgen: Option[SubstringDataGen]): Unit = + throw new IllegalArgumentException("substring data gens can only be set for a STRING") } +/** + * Base class for generating a sub-string. This holds configuration + * for the substring, and handles what is needed to convert it into a GeneratorFunction + */ +abstract class SubstringDataGen( + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)], + seedMapping: LocationToSeedMapping = FlatDistribution(), + nullMapping: LocationToSeedMapping = FlatDistribution(), + lengthGen: LengthGeneratorFunction = FixedLengthGeneratorFunction(10)) extends + CommonDataGen(conf, defaultValueRange, seedMapping, nullMapping, lengthGen) {} + /** * A special GeneratorFunction that just returns the computed seed. This is helpful for * debugging distributions or if you want long values without any abstraction in between. 
@@ -1494,155 +1562,866 @@ class FloatGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)])
   override def children: Seq[(String, DataGen)] = Seq.empty
 }
 
-trait JSONType {
-  def appendRandomValue(sb: StringBuilder,
-      index: Int,
-      maxStringLength: Int,
-      maxArrayLength: Int,
-      maxObjectLength: Int,
-      depth: Int,
-      maxDepth: Int,
-      r: Random): Unit
-}
+case class JsonPathElement(name: String, is_array: Boolean)
+case class JsonLevel(path: Array[JsonPathElement], data_type: String, length: Int, value: String) {}
+
+object JsonColumnStats {
+  private def printHelp(): Unit = {
+    println("JSON Fingerprinting Tool:")
+    println("PARAMS: <inputPath> <outputPath>")
+    println("  <inputPath> is a path to a Spark dataframe to read in")
+    println("  <outputPath> is a path in a Spark file system to write out fingerprint data to.")
+    println()
+    println("OPTIONS:")
+    println("  --json=<column> where <column> is the name of a top level String column")
+    println("  --anon=<seed> where <seed> is a SEED used to anonymize the JSON keys ")
+    println("      and column names.")
+    println("  --input_format=<format> where <format> is parquet or ORC. Defaults to parquet.")
+    println("  --overwrite to enable overwriting the fingerprint output.")
+    println("  --debug to enable some debug information to be printed out")
+    println("  --help to print out this help message")
+    println()
+  }
+
+  def main(args: Array[String]): Unit = {
+    var inputPath = Option.empty[String]
+    var outputPath = Option.empty[String]
+    val jsonColumns = ArrayBuffer.empty[String]
+    var anonSeed = Option.empty[Long]
+    var debug = false
+    var argsDone = false
+    var format = "parquet"
+    var overwrite = false
+
+    args.foreach {
+      case a if !argsDone && a.startsWith("--json=") =>
+        jsonColumns += a.substring("--json=".length)
+      case a if !argsDone && a.startsWith("--anon=") =>
+        anonSeed = Some(a.substring("--anon=".length).toLong)
+      case a if !argsDone && a.startsWith("--input_format=") =>
+        format = a.substring("--input_format=".length).toLowerCase(java.util.Locale.US)
+      case "--overwrite" if !argsDone =>
+        overwrite = true
+      case "--debug" if !argsDone =>
+        debug = true
+      case "--help" if !argsDone =>
+        printHelp()
+        System.exit(0)
+      case "--" if !argsDone =>
+        argsDone = true
+      case a if !argsDone && a.startsWith("--") => // "--" was covered above already
+        println(s"ERROR $a is not a supported argument")
+        printHelp()
+        System.exit(-1)
+      case a if inputPath.isEmpty =>
+        inputPath = Some(a)
+      case a if outputPath.isEmpty =>
+        outputPath = Some(a)
+      case a =>
+        println(s"ERROR only two arguments are supported.
Found $a") + printHelp() + System.exit(-1) + } + if (outputPath.isEmpty) { + println("ERROR both an inputPath and an outputPath are required") + printHelp() + System.exit(-1) + } + + val spark = SparkSession.builder.getOrCreate() + spark.sparkContext.setLogLevel("WARN") + + val df = spark.read.format(format).load(inputPath.get) + jsonColumns.foreach { column => + val fp = fingerPrint(df, df(column), anonSeed) + val name = anonSeed.map(s => anonymizeString(column, s)).getOrElse(column) + val fullOutPath = s"${outputPath.get}/$name" + var writer = fp.write + if (overwrite) { + writer = writer.mode("overwrite") + } + if (debug) { + anonSeed.foreach { s => + println(s"Keys and columns will be anonymized with seed $s") + } + println(s"Writing $column fingerprint to $fullOutPath") + spark.time(writer.parquet(fullOutPath)) + println(s"Wrote ${spark.read.parquet(fullOutPath).count} rows") + spark.read.parquet(fullOutPath).show() + } else { + writer.parquet(fullOutPath) + } + } + } -object JSONType { - def selectType(depth: Int, - maxDepth: Int, - r: Random): JSONType = { - val toSelectFrom = if (depth < maxDepth) { - Seq(QuotedJSONString, JSONLong, JSONDouble, JSONArray, JSONObject) - } else { - Seq(QuotedJSONString, JSONLong, JSONDouble) - } - val index = r.nextInt(toSelectFrom.length) - toSelectFrom(index) - } -} - -object QuotedJSONString extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - val strValue = r.nextString(r.nextInt(maxStringLength + 1)) - .replace("\\", "\\\\") - .replace("\"", "\\\"") - .replace("\n", "\\n") - .replace("\r", "\\r") - .replace("\b", "\\b") - .replace("\f", "\\f") - sb.append('"') - sb.append(strValue) - sb.append('"') - } -} - -object JSONLong extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - sb.append(r.nextLong()) - } -} - -object JSONDouble extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - sb.append(r.nextDouble() * 4096.0) - } -} - -object JSONArray extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - val childType = JSONType.selectType(depth, maxDepth, r) - val length = r.nextInt(maxArrayLength + 1) - sb.append("[") + case class JsonNodeStats(count: Long, meanLen: Double, stdDevLength: Double, dc: Long) + + class JsonNode() { + private val forDataType = + mutable.HashMap[String, (JsonNodeStats, mutable.HashMap[String, JsonNode])]() + + def getChild(name: String, isArray: Boolean): JsonNode = { + val dt = if (isArray) { "ARRAY" } else { "OBJECT" } + val typed = forDataType.getOrElse(dt, + throw new IllegalArgumentException(s"$dt is not a set data type yet.")) + typed._2.getOrElse(name, + throw new IllegalArgumentException(s"$name is not a child when the type is $dt")) + } + + def contains(name: String, isArray: Boolean): Boolean = { + val dt = if (isArray) { "ARRAY" } else { "OBJECT" } + forDataType.get(dt).exists { children => + children._2.contains(name) + } + } + + def addChild(name: String, isArray: Boolean): JsonNode = { + val 
dt = if (isArray) { "ARRAY" } else { "OBJECT" } + val found = forDataType.getOrElse(dt, + throw new IllegalArgumentException(s"$dt was not already added as a data type")) + if (found._2.contains(name)) { + throw new IllegalArgumentException(s"$dt already has a child named $name") + } + val node = new JsonNode() + found._2.put(name, node) + node + } + + def addChoice(dt: String, stats: JsonNodeStats): Unit = { + if (forDataType.contains(dt)) { + throw new IllegalArgumentException(s"$dt was already added as a data type") + } + forDataType.put(dt, (stats, new mutable.HashMap[String, JsonNode]())) + } + + override def toString: String = { + forDataType.toString() + } + + def totalCount: Long = { + forDataType.values.map{ case (stats, _) => stats.count}.sum + } + + private def makeNoChoiceGenRecursive(dt: String, + children: mutable.HashMap[String, JsonNode], + cc: ColumnConf): (SubstringDataGen, ColumnConf) = { + var c = cc + val ret = dt match { + case "LONG" => new JSONLongGen(c) + case "DOUBLE" => new JSONDoubleGen(c) + case "BOOLEAN" => new JSONBoolGen(c) + case "NULL" => new JSONNullGen(false, c) + case "VALUE_NULL" => new JSONNullGen(true, c) + case "ERROR" => new JSONErrorGen(c) + case "STRING" => new JSONStringGen(c) + case "ARRAY" => + val child = if (children.isEmpty) { + // A corner case, we will just make it a BOOL column and it will be ignored + val tmp = new JSONBoolGen(c) + c = c.forNextSubstring + tmp + } else { + val tmp = children.values.head.makeGenRecursive(c) + c = tmp._2 + tmp._1 + } + new JSONArrayGen(child, c) + case "OBJECT" => + val childGens = if (children.isEmpty) { + Seq.empty + } else { + children.toSeq.map { + case (k, node) => + val tmp = node.makeGenRecursive(c) + c = tmp._2 + (k, tmp._1) + } + } + new JSONObjectGen(childGens, c) + case other => + throw new IllegalArgumentException(s"$other is not a leaf node type") + } + (ret, c.forNextSubstring) + } + + private def makeGenRecursive(cc: ColumnConf): (SubstringDataGen, ColumnConf) = { + var c = cc + // We are going to recursively walk the tree for all of the values. + if (forDataType.size == 1) { + // We don't need a choice at all. This makes it simpler.. + val (dt, (_, children)) = forDataType.head + makeNoChoiceGenRecursive(dt, children, c) + } else { + val totalSum = forDataType.map(f => f._2._1.count).sum.toDouble + var runningSum = 0L + val allChoices = ArrayBuffer[(Double, String, SubstringDataGen)]() + forDataType.foreach { + case (dt, (stats, children)) => + val tmp = makeNoChoiceGenRecursive(dt, children, c) + c = tmp._2 + runningSum += stats.count + allChoices.append((runningSum/totalSum, dt, tmp._1)) + } + + val ret = new JSONChoiceGen(allChoices.toSeq, c) + (ret, c.forNextSubstring) + } + } + + def makeGen(cc: ColumnConf): SubstringDataGen = { + val (ret, _) = makeGenRecursive(cc) + ret + } + + def setStatsSingle(dg: CommonDataGen, + dt: String, + stats: JsonNodeStats, + nullPct: Double): Unit = { + + val includeLength = dt != "OBJECT" && dt != "BOOLEAN" && dt != "NULL" && dt != "VALUE_NULL" + val includeNullPct = nullPct > 0.0 + if (includeLength) { + dg.setGaussianLength(stats.meanLen, stats.stdDevLength) + } + if (includeNullPct) { + dg.setNullProbability(nullPct) + } + dg.setSeedRange(1, stats.dc) + } + + def setStats(dg: CommonDataGen, + parentCount: Option[Long]): Unit = { + // We are going to recursively walk the tree... + if (forDataType.size == 1) { + // We don't need a choice at all. This makes it simpler.. 
+ val (dt, (stats, children)) = forDataType.head + val nullPct = parentCount.map { pc => + (pc - stats.count).toDouble/pc + }.getOrElse(0.0) + setStatsSingle(dg, dt, stats, nullPct) + val myCount = if (dt == "OBJECT") { + Some(totalCount) + } else { + None + } + children.foreach { + case (name, node) => + node.setStats(dg(name), myCount) + } + } else { + // We have choices to make between different types. + // The null percent cannot be calculated for each individual choice + // but is calculated on the group as a whole instead + parentCount.foreach { pc => + val tc = totalCount + val choiceNullPct = (pc - tc).toDouble / pc + if (choiceNullPct > 0.0) { + dg.setNullProbability(choiceNullPct) + } + } + forDataType.foreach { + case (dt, (stats, children)) => + // When there is a choice the name to access it is the data type + val choiceDg = dg(dt) + setStatsSingle(choiceDg, dt, stats, 0.0) + children.foreach { + case (name, node) => + val myCount = if (dt == "OBJECT") { + // Here we only want the count for the OBJECTs + Some(stats.count) + } else { + None + } + node.setStats(choiceDg(name), myCount) + } + } + } + } + } + + private lazy val jsonFactory = new JsonFactoryBuilder() + // The two options below enabled for Hive compatibility + .enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS) + .enable(JsonReadFeature.ALLOW_SINGLE_QUOTES) + .build() + + private def processNext(parser: JsonParser, + currentPath: ArrayBuffer[JsonPathElement], + output: ArrayBuffer[JsonLevel]): Unit = { + parser.currentToken() match { + case JsonToken.START_OBJECT => + parser.nextToken() + while (parser.currentToken() != JsonToken.END_OBJECT) { + processNext(parser, currentPath, output) + } + output.append(JsonLevel(currentPath.toArray, "OBJECT", 0, "")) + parser.nextToken() + case JsonToken.START_ARRAY => + currentPath.append(JsonPathElement("data", is_array = true)) + parser.nextToken() + var length = 0 + while (parser.currentToken() != JsonToken.END_ARRAY) { + length += 1 + processNext(parser, currentPath, output) + } + currentPath.remove(currentPath.length - 1) + output.append(JsonLevel(currentPath.toArray, "ARRAY", length, "")) + parser.nextToken() + case JsonToken.FIELD_NAME => + currentPath.append(JsonPathElement(parser.getCurrentName, is_array = false)) + parser.nextToken() + processNext(parser, currentPath, output) + currentPath.remove(currentPath.length - 1) + case JsonToken.VALUE_NUMBER_INT => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "LONG", length, parser.getValueAsString)) + parser.nextToken() + case JsonToken.VALUE_NUMBER_FLOAT => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "DOUBLE", length, parser.getValueAsString)) + parser.nextToken() + case JsonToken.VALUE_TRUE | JsonToken.VALUE_FALSE => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "BOOLEAN", length, parser.getValueAsString)) + parser.nextToken() + case JsonToken.VALUE_NULL | null => + output.append(JsonLevel(currentPath.toArray, "VALUE_NULL", 4, "NULL")) + parser.nextToken() + case JsonToken.VALUE_STRING => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "STRING", length, parser.getValueAsString)) + parser.nextToken() + case other => + throw new IllegalStateException(s"DON'T KNOW HOW TO DEAL WITH $other") + } + } + + def jsonStatsUdf(json: String): Array[JsonLevel] = { + val output = 
new ArrayBuffer[JsonLevel]()
+    try {
+      val currentPath = new ArrayBuffer[JsonPathElement]()
+      if (json == null) {
+        output.append(JsonLevel(Array.empty, "NULL", 0, ""))
+      } else {
+        val parser = jsonFactory.createParser(json)
+        try {
+          parser.nextToken()
+          processNext(parser, currentPath, output)
+        } finally {
+          parser.close()
+        }
+      }
+    } catch {
+      case _: com.fasterxml.jackson.core.JsonParseException =>
+        output.clear()
+        output.append(JsonLevel(Array.empty, "ERROR", json.getBytes("UTF-8").length, json))
+    }
+    output.toArray
+  }
+
+  private lazy val extractPaths = udf(json => jsonStatsUdf(json))
+
+  def anonymizeString(str: String, seed: Long): String = {
+    val length = str.length
+    val data = new Array[Byte](length)
+    val hash = XXH64.hashLong(str.hashCode, seed)
+    val r = new Random()
+    r.setSeed(hash)
     (0 until length).foreach { i =>
-      if (i > 0) {
-        sb.append(",")
+      val tmp = r.nextInt(16)
+      data(i) = (tmp + 'A').toByte
+    }
+    new String(data)
+  }
+
+  private lazy val anonPath = udf((str, seed) => anonymizeString(str, seed))
+
+  def anonymizeFingerPrint(df: DataFrame, anonSeed: Long): DataFrame = {
+    df.withColumn("tmp", transform(col("path"),
+      o => {
+        val name = o("name")
+        val isArray = o("is_array")
+        val anon = anonPath(name, lit(anonSeed))
+        val newName = when(isArray, name).otherwise(anon).alias("name")
+        struct(newName, isArray)
+      }))
+      .drop("path").withColumnRenamed("tmp", "path")
+      .orderBy("path", "dt")
+      .selectExpr("path", "dt","c","mean_len","stddev_len","distinct","version")
+  }
+
+  def fingerPrint(df: DataFrame, column: Column, anonymize: Option[Long] = None): DataFrame = {
+    val ret = df.select(extractPaths(column).alias("paths"))
+      .selectExpr("explode_outer(paths) as p")
+      .selectExpr("p.path as path", "p.data_type as dt", "p.length as len", "p.value as value")
+      .groupBy(col("path"), col("dt")).agg(
+        count(lit(1)).alias("c"),
+        avg(col("len")).alias("mean_len"),
+        coalesce(stddev(col("len")), lit(0.0)).alias("stddev_len"),
+        approx_count_distinct(col("value")).alias("distinct"))
+      .orderBy("path", "dt").withColumn("version", lit("0.1"))
+      .selectExpr("path", "dt","c","mean_len","stddev_len","distinct","version")
+
+    anonymize.map { anonSeed =>
+      anonymizeFingerPrint(ret, anonSeed)
+    }.getOrElse(ret)
+  }
+
+  def apply(aggForColumn: DataFrame, genColumn: ColumnGen): Unit =
+    apply(aggForColumn, genColumn.dataGen)
+
+  private val expectedSchema = StructType.fromDDL(
+    "path ARRAY<STRUCT<name: STRING, is_array: BOOLEAN>>," +
+      "dt STRING," +
+      "c BIGINT," +
+      "mean_len DOUBLE," +
+      "stddev_len DOUBLE," +
+      "distinct BIGINT," +
+      "version STRING")
+
+  def apply(aggForColumn: DataFrame, gen: DataGen): Unit = {
+    val aggData = aggForColumn.orderBy("path", "dt").collect()
+    val rootNode: JsonNode = new JsonNode()
+    assert(aggData.length > 0)
+    val schema = aggData.head.schema
+    assert(schema.length == expectedSchema.length)
+    schema.fields.zip(expectedSchema.fields).foreach {
+      case(found, expected) =>
+        assert(found.name == expected.name)
+        // TODO we can worry about the exact types later if we need to
+    }
+    assert(aggData.head.getString(6) == "0.1")
+    aggData.foreach { row =>
+      val fullPath = row.getAs[mutable.WrappedArray[Row]](0)
+      val parsedPath = fullPath.map(r => (r.getString(0), r.getBoolean(1))).toList
+      val dt = row.getString(1)
+      val count = row.getLong(2)
+      val meanLen = row.getDouble(3)
+      val stdLen = row.getDouble(4)
+      val dc = row.getLong(5)
+
+      val stats = JsonNodeStats(count, meanLen, stdLen, dc)
+      var currentNode = rootNode
+      // Find everything up to the last path element
+      if
(parsedPath.length > 1) { + parsedPath.slice(0, parsedPath.length - 1).foreach { + case (name, isArray) => + currentNode = currentNode.getChild(name, isArray) + } + } + + if (parsedPath.nonEmpty) { + // For the last path element (that is not the root element) we might need to add it + // as a child + val (name, isArray) = parsedPath.last + if (!currentNode.contains(name, isArray)) { + currentNode.addChild(name, isArray) + } + currentNode = currentNode.getChild(name, isArray) } - childType.appendRandomValue(sb, i, maxStringLength, maxArrayLength, maxObjectLength, - depth + 1, maxDepth, r) + currentNode.addChoice(dt, stats) } - sb.append("]") + + gen.setSubstringGen(cc => rootNode.makeGen(cc)) + rootNode.setStats(gen.substringGen, None) } } -object JSONObject extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - val length = r.nextInt(maxObjectLength) + 1 - sb.append("{") - (0 until length).foreach { i => - if (i > 0) { - sb.append(",") + +case class JSONStringGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = lengthGen(rowLoc) + val r = DataGen.getRandomFor(rowLoc, mapping) + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + // Value range is 32 (Space) to 126 (~) + buffer(at) = (r.nextInt(126 - 31) + 32).toByte + at += 1 + } + val strVal = new String(buffer, 0, len) + .replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\b", "\\b") + .replace("\f", "\\f") + '"' + strVal + '"' + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONStringGenFunc = + JSONStringGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONStringGenFunc = + JSONStringGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONStringGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONStringGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONLongGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = math.max(lengthGen(rowLoc), 1) // We need at least 1 long for a valid value + val r = DataGen.getRandomFor(rowLoc, mapping) + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + if (at == 0) { + // No leading 0's + buffer(at) = (r.nextInt(9) + '1').toByte + } else { + buffer(at) = (r.nextInt(10) + '0').toByte } - sb.append("\"key_") - sb.append(i) - sb.append("_") - sb.append(depth ) - sb.append("\":") - val childType = JSONType.selectType(depth, maxDepth, r) - childType.appendRandomValue(sb, i, maxStringLength, maxArrayLength, maxObjectLength, - depth + 1, maxDepth, r) + at += 1 } - sb.append("}") + new String(buffer, 0, len) } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONLongGenFunc = + JSONLongGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: 
LocationToSeedMapping): JSONLongGenFunc = + JSONLongGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") } -case class JSONGenFunc( - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - maxDepth: Int, - lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { +class JSONLongGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONLongGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONDoubleGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { override def apply(rowLoc: RowLocation): Any = { + val len = math.max(lengthGen(rowLoc), 3) // We have to have at least 3 chars NUM.NUM val r = DataGen.getRandomFor(rowLoc, mapping) - val sb = new StringBuilder() - JSONObject.appendRandomValue(sb, 0, maxStringLength, maxArrayLength, maxObjectLength, - 0, maxDepth, r) - // For now I am going to have some hard coded keys - UTF8String.fromString(sb.toString()) + val beforeLen = if (len == 3) { 1 } else { r.nextInt(len - 3) + 1 } + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + if (at == 0) { + // No leading 0's + buffer(at) = (r.nextInt(9) + '1').toByte + } else if (at == beforeLen) { + buffer(at) = '.' + } else { + buffer(at) = (r.nextInt(10) + '0').toByte + } + at += 1 + } + UTF8String.fromBytes(buffer, 0, len) } - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): GeneratorFunction = - JSONGenFunc(maxStringLength, maxArrayLength, maxObjectLength, maxDepth, lengthGen, mapping) + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONDoubleGenFunc = + JSONDoubleGenFunc(lengthGen, mapping) - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): GeneratorFunction = - JSONGenFunc(maxStringLength, maxArrayLength, maxObjectLength, maxDepth, lengthGen, mapping) + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONDoubleGenFunc = + JSONDoubleGenFunc(lengthGen, mapping) override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for strings") + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONDoubleGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONDoubleGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONBoolGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val r = DataGen.getRandomFor(rowLoc, mapping) + val ret = if (r.nextBoolean()) "true" else "false" + UTF8String.fromString(ret) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONBoolGenFunc = + JSONBoolGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONBoolGenFunc = + JSONBoolGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + 
throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONBoolGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONBoolGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONNullGenFunc(nullAsString: Boolean, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = + if (nullAsString) { + UTF8String.fromString("null") + } else { + null + } + + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONNullGenFunc = + JSONNullGenFunc(nullAsString, lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONNullGenFunc = + JSONNullGenFunc(nullAsString, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONNullGen(nullAsString: Boolean, + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONNullGenFunc(nullAsString) + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONErrorGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = lengthGen(rowLoc) + val r = DataGen.getRandomFor(rowLoc, mapping) + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + // Value range is 32 (Space) to 126 (~) + // But it is almost impossible to show up as valid JSON + buffer(at) = (r.nextInt(126 - 31) + 32).toByte + at += 1 + } + UTF8String.fromBytes(buffer, 0, len) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONErrorGenFunc = + JSONErrorGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONErrorGenFunc = + JSONErrorGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONErrorGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONErrorGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONArrayGenFunc(child: GeneratorFunction, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = lengthGen(rowLoc) + val data = new Array[String](len) + val childRowLoc = rowLoc.withNewChild() + var i = 0 + while (i < len) { + childRowLoc.setLastChildIndex(i) + val v = child(childRowLoc) + if (v == null) { + // A null in an array must look like "null" + data(i) = "null" + } else { + data(i) = v.toString + } + i += 1 + } + val ret = data.mkString("[", ",", "]") + UTF8String.fromString(ret) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONArrayGenFunc = + JSONArrayGenFunc(child, lengthGen, mapping) + + override def 
withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONArrayGenFunc = + JSONArrayGenFunc(child, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONArrayGen(child: SubstringDataGen, + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override def setCorrelatedKeyGroup(keyGroup: Long, + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): SubstringDataGen = { + super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + child.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + this + } + + override protected def getValGen: GeneratorFunction = JSONArrayGenFunc(child.getGen) + + override def get(name: String): Option[SubstringDataGen] = { + if ("data".equalsIgnoreCase(name) || "child".equalsIgnoreCase(name)) { + Some(child) + } else { + None + } + } + + override def children: Seq[(String, SubstringDataGen)] = Seq(("data", child)) +} + +case class JSONObjectGenFunc(childGens: Array[(String, GeneratorFunction)], + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + override def apply(rowLoc: RowLocation): Any = { + // TODO randomize the order of the children??? + // TODO duplicate child values??? + // The row location does not change for a struct/object + val data = childGens.map { + case (k, gen) => + val key = k.replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\b", "\\b") + .replace("\f", "\\f") + val v = gen.apply(rowLoc) + if (v == null) { + "" + } else { + '"' + key + "\":" + v + } + } + val ret = data.filterNot(_.isEmpty).mkString("{",",","}") + UTF8String.fromString(ret) + } + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONObjectGenFunc = + JSONObjectGenFunc(childGens, lengthGen, mapping) + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONObjectGenFunc = + JSONObjectGenFunc(childGens, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONObjectGen(val children: Seq[(String, SubstringDataGen)], + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override def setCorrelatedKeyGroup(keyGroup: Long, + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): SubstringDataGen = { + super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + children.foreach { + case (_, gen) => + gen.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + } + this + } + + override def get(name: String): Option[SubstringDataGen] = + children.collectFirst { + case (childName, dataGen) if childName.equalsIgnoreCase(name) => dataGen + } + + override protected def getValGen: GeneratorFunction = { + val childGens = children.map(c => (c._1, c._2.getGen)).toArray + JSONObjectGenFunc(childGens) + } +} + +case class JSONChoiceGenFunc(choices: List[(Double, GeneratorFunction)], + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + override def apply(rowLoc: RowLocation): Any = { + val r = DataGen.getRandomFor(rowLoc, mapping) + val l = r.nextDouble() + var index = 0 + while 
(choices(index)._1 < l) { + index += 1 + } + val childRowLoc = rowLoc.withNewChild() + choices(index)._2(childRowLoc) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONChoiceGenFunc = + JSONChoiceGenFunc(choices, lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONChoiceGenFunc = + JSONChoiceGenFunc(choices, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONChoiceGen(val choices: Seq[(Double, String, SubstringDataGen)], + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override val children: Seq[(String, SubstringDataGen)] = + choices.map { case (_, name, gen) => (name, gen) } + + override def setCorrelatedKeyGroup(keyGroup: Long, + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): SubstringDataGen = { + super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + children.foreach { + case (_, gen) => + gen.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + } + this + } + + override def get(name: String): Option[SubstringDataGen] = + children.collectFirst { + case (childName, dataGen) if childName.equalsIgnoreCase(name) => dataGen + } + + override protected def getValGen: GeneratorFunction = { + val childGens = choices.map(c => (c._1, c._3.getGen)).toList + JSONChoiceGenFunc(childGens) + } } case class ASCIIGenFunc( @@ -1672,14 +2451,46 @@ case class ASCIIGenFunc( throw new IllegalArgumentException("value ranges are not supported for strings") } -class StringGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) - extends DataGen(conf, defaultValueRange) { +/** + * This is here to wrap the substring gen function so that its length/settings + * are the ones used when generating a string, and not what was set for the string. + */ +case class SubstringGenFunc( + substringGen: GeneratorFunction, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + substringGen(rowLoc) + } + + // The length and location seed mapping are just ignored for this... 
+ override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): GeneratorFunction = + this + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): GeneratorFunction = + this + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for strings") +} + +class StringGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)], + var substringDataGen: Option[SubstringDataGen] = None) + extends DataGen(conf, defaultValueRange) { override def dataType: DataType = StringType - override protected def getValGen: GeneratorFunction = ASCIIGenFunc() + override protected def getValGen: GeneratorFunction = + substringDataGen.map(s => SubstringGenFunc(s.getGen)).getOrElse(ASCIIGenFunc()) override def children: Seq[(String, DataGen)] = Seq.empty + + override def setSubstringGen(subgen: Option[SubstringDataGen]): Unit = + substringDataGen = subgen + + override def getSubstringGen: Option[SubstringDataGen] = substringDataGen } case class StructGenFunc(childGens: Array[GeneratorFunction]) extends GeneratorFunction { @@ -1854,7 +2665,6 @@ class MapGen(key: DataGen, override def children: Seq[(String, DataGen)] = Seq(("key", key), ("value", value)) } - object ColumnGen { private def genInternal(rowNumber: Column, dataType: DataType, @@ -1869,8 +2679,8 @@ object ColumnGen { */ class ColumnGen(val dataGen: DataGen) { def setCorrelatedKeyGroup(kg: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): ColumnGen = { + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): ColumnGen = { dataGen.setCorrelatedKeyGroup(kg, minSeed, maxSeed, seedMapping) this } @@ -1930,6 +2740,11 @@ class ColumnGen(val dataGen: DataGen) { this } + def setGaussianLength(mean: Double, stdDev: Double): ColumnGen = { + dataGen.setGaussianLength(mean, stdDev) + this + } + final def apply(name: String): DataGen = { get(name).getOrElse { throw new IllegalArgumentException(s"$name not a child of $this") @@ -1941,8 +2756,16 @@ class ColumnGen(val dataGen: DataGen) { def gen(rowNumber: Column): Column = { ColumnGen.genInternal(rowNumber, dataGen.dataType, dataGen.nullable, dataGen.getGen) } + + def getSubstring: Option[SubstringDataGen] = dataGen.getSubstringGen + + def substringGen: SubstringDataGen = dataGen.substringGen + + def setSubstringGen(f : ColumnConf => SubstringDataGen): Unit = + dataGen.setSubstringGen(f) } + sealed trait KeyGroupType /** @@ -2192,7 +3015,7 @@ object DBGen { numRows: Long, mapping: OrderedTypeMapping): Seq[(String, ColumnGen)] = { // a bit of a hack with the column num so that we update it before each time... 
-    var conf = ColumnConf(ColumnLocation(tableId, -1), true, numRows)
+    var conf = ColumnConf(ColumnLocation(tableId, -1, 0), true, numRows)
     st.toArray.map { sf =>
       if (!mapping.canMap(sf.dataType, mapping)) {
         throw new IllegalArgumentException(s"$sf is not supported at this time")
diff --git a/jenkins/databricks/install_deps.py b/jenkins/databricks/install_deps.py
index be5cb9bc040..8d21a4f9556 100644
--- a/jenkins/databricks/install_deps.py
+++ b/jenkins/databricks/install_deps.py
@@ -115,8 +115,10 @@ def define_deps(spark_version, scala_version):
         f'{prefix_ws_sp_mvn_hadoop}--org.json4s--json4s-jackson_{scala_version}--org.json4s__json4s-jackson_{scala_version}__*.jar'),
     Artifact('org.javaassist', 'javaassist',
         f'{prefix_ws_sp_mvn_hadoop}--org.javassist--javassist--org.javassist__javassist__*.jar'),
-    Artifact('com.fasterxml.jackson.core', 'jackson-core',
+    Artifact('com.fasterxml.jackson.core', 'jackson-databind',
         f'{prefix_ws_sp_mvn_hadoop}--com.fasterxml.jackson.core--jackson-databind--com.fasterxml.jackson.core__jackson-databind__*.jar'),
+    Artifact('com.fasterxml.jackson.core', 'jackson-core',
+        f'{prefix_ws_sp_mvn_hadoop}--com.fasterxml.jackson.core--jackson-core--com.fasterxml.jackson.core__jackson-core__*.jar'),
     Artifact('com.fasterxml.jackson.core', 'jackson-annotations',
         f'{prefix_ws_sp_mvn_hadoop}--com.fasterxml.jackson.core--jackson-annotations--com.fasterxml.jackson.core__jackson-annotations__*.jar'),
     Artifact('org.apache.spark', f'spark-avro_{scala_version}',
diff --git a/scala2.13/shim-deps/databricks/pom.xml b/scala2.13/shim-deps/databricks/pom.xml
index b342f381c71..a0459901079 100644
--- a/scala2.13/shim-deps/databricks/pom.xml
+++ b/scala2.13/shim-deps/databricks/pom.xml
@@ -105,6 +105,12 @@
       <version>${spark.version}</version>
       <scope>compile</scope>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+      <version>${spark.version}</version>
+      <scope>compile</scope>
+    </dependency>
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-annotations</artifactId>
@@ -286,4 +292,4 @@
       <scope>compile</scope>
     </dependency>
   </dependencies>
-</project>
\ No newline at end of file
+</project>
diff --git a/shim-deps/databricks/pom.xml b/shim-deps/databricks/pom.xml
index bef8a90d227..22842b0f7c0 100644
--- a/shim-deps/databricks/pom.xml
+++ b/shim-deps/databricks/pom.xml
@@ -105,6 +105,12 @@
       <version>${spark.version}</version>
       <scope>compile</scope>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+      <version>${spark.version}</version>
+      <scope>compile</scope>
+    </dependency>
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-annotations</artifactId>
@@ -286,4 +292,4 @@
       <scope>compile</scope>
     </dependency>
   </dependencies>
-</project>
\ No newline at end of file
+</project>

From eb1549c5764e2ceaaac1d9f7f885c0d6c11ae92c Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Thu, 13 Jun 2024 14:23:57 -0700
Subject: [PATCH 07/10] `binary-dedupe` changes for Spark 4.0.0 [databricks] (#10993)

* Binary dedupe changes for Spark 4.0.0

Signed-off-by: Raza Jafri

* updated comments

* Changed the URL for the common classes among shims

* renamed spark34-common to spark-shared and renamed relevant variables

* addressed review comments

* renamed variable from common to shared

---------

Signed-off-by: Raza Jafri
---
 dist/maven-antrun/build-parallel-worlds.xml  |  4 +-
 dist/scripts/binary-dedupe.sh                | 58 +++++++++----------
 .../com/nvidia/spark/rapids/ShimLoader.scala | 10 ++--
 3 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/dist/maven-antrun/build-parallel-worlds.xml b/dist/maven-antrun/build-parallel-worlds.xml
index 524b15addf9..07838616340 100644
--- a/dist/maven-antrun/build-parallel-worlds.xml
+++ b/dist/maven-antrun/build-parallel-worlds.xml
@@ -1,6 +1,6 @@
-
diff --git a/dist/scripts/binary-dedupe.sh b/dist/scripts/binary-dedupe.sh
index 183e86b1524..356b0b4dbae 100755
--- a/dist/scripts/binary-dedupe.sh
+++ b/dist/scripts/binary-dedupe.sh
@@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,10 +34,10 @@ case "$OSTYPE" in esac STEP=0 -export SPARK3XX_COMMON_TXT="$PWD/spark3xx-common.txt" -export SPARK3XX_COMMON_COPY_LIST="$PWD/spark-common-copy-list.txt" +export SPARK_SHARED_TXT="$PWD/spark-shared.txt" +export SPARK_SHARED_COPY_LIST="$PWD/spark-shared-copy-list.txt" export DELETE_DUPLICATES_TXT="$PWD/delete-duplicates.txt" -export SPARK3XX_COMMON_DIR="$PWD/spark3xx-common" +export SPARK_SHARED_DIR="$PWD/spark-shared" # This script de-duplicates .class files at the binary level. # We could also diff classes using scalap / javap outputs. @@ -47,17 +47,17 @@ export SPARK3XX_COMMON_DIR="$PWD/spark3xx-common" # The following pipeline determines identical classes across shims in this build. # - checksum all class files -# - move the varying-prefix spark3xy to the left so it can be easily skipped for uniq and sort +# - move the varying-prefix sparkxyz to the left so it can be easily skipped for uniq and sort # - sort by path, secondary sort by checksum, print one line per group # - produce uniq count for paths # - filter the paths with count=1, the class files without diverging checksums -# - put the path starting with /spark3xy back together for the final list +# - put the path starting with /sparkxyz back together for the final list echo "Retrieving class files hashing to a single value ..." echo "$((++STEP))/ SHA1 of all non-META files > tmp-sha1-files.txt" -find ./parallel-world/spark3* -name META-INF -prune -o \( -type f -print \) | \ - xargs $SHASUM > tmp-sha1-files.txt +find ./parallel-world/spark[34]* -name META-INF -prune -o -name webapps -prune -o \( -type f -print0 \) | \ + xargs --null $SHASUM > tmp-sha1-files.txt echo "$((++STEP))/ make shim column 1 > tmp-shim-sha-package-files.txt" < tmp-sha1-files.txt awk -F/ '$1=$1' | \ @@ -68,10 +68,10 @@ echo "$((++STEP))/ sort by path, sha1; output first from each group > tmp-count- sort -k3 -k2,2 -u tmp-shim-sha-package-files.txt | \ uniq -f 2 -c > tmp-count-shim-sha-package-files.txt -echo "$((++STEP))/ files with unique sha1 > $SPARK3XX_COMMON_TXT" +echo "$((++STEP))/ files with unique sha1 > $SPARK_SHARED_TXT" grep '^\s\+1 .*' tmp-count-shim-sha-package-files.txt | \ awk '{$1=""; $3=""; print $0 }' | \ - tr -s ' ' | sed 's/\ /\//g' > "$SPARK3XX_COMMON_TXT" + tr -s ' ' | sed 's/\ /\//g' > "$SPARK_SHARED_TXT" function retain_single_copy() { set -e @@ -93,10 +93,10 @@ function retain_single_copy() { package_class="${package_class_with_spaces// //}" # get the reference copy out of the way - echo "$package_class" >> "from-$shim-to-spark3xx-common.txt" + echo "$package_class" >> "from-$shim-to-spark-shared.txt" # expanding directories separately because full path # glob is broken for class file name including the "$" character - for pw in ./parallel-world/spark3* ; do + for pw in ./parallel-world/spark[34]* ; do delete_path="$pw/$package_class" [[ -f "$delete_path" ]] && echo "$delete_path" || true done >> "$DELETE_DUPLICATES_TXT" || exit 255 @@ -106,26 +106,26 @@ function retain_single_copy() { # standalone debugging # truncate incremental files : > "$DELETE_DUPLICATES_TXT" -rm -f from-spark3*-to-spark3xx-common.txt -rm -rf "$SPARK3XX_COMMON_DIR" -mkdir -p "$SPARK3XX_COMMON_DIR" +rm -f from-spark[34]*-to-spark-shared.txt +rm -rf "$SPARK_SHARED_DIR" +mkdir -p 
"$SPARK_SHARED_DIR" -echo "$((++STEP))/ retaining a single copy of spark3xx-common classes" +echo "$((++STEP))/ retaining a single copy of spark-shared classes" while read spark_common_class; do retain_single_copy "$spark_common_class" -done < "$SPARK3XX_COMMON_TXT" +done < "$SPARK_SHARED_TXT" -echo "$((++STEP))/ rsyncing common classes to $SPARK3XX_COMMON_DIR" -for copy_list in from-spark3*-to-spark3xx-common.txt; do +echo "$((++STEP))/ rsyncing common classes to $SPARK_SHARED_DIR" +for copy_list in from-spark[34]*-to-spark-shared.txt; do echo Initializing rsync of "$copy_list" IFS='-' <<< "$copy_list" read -ra copy_list_parts # declare -p copy_list_parts shim="${copy_list_parts[1]}" # use rsync to reduce process forking - rsync --files-from="$copy_list" ./parallel-world/"$shim" "$SPARK3XX_COMMON_DIR" + rsync --files-from="$copy_list" ./parallel-world/"$shim" "$SPARK_SHARED_DIR" done -mv "$SPARK3XX_COMMON_DIR" parallel-world/ +mv "$SPARK_SHARED_DIR" parallel-world/ # TODO further dedupe by FEATURE version lines: # spark30x-common @@ -137,9 +137,9 @@ mv "$SPARK3XX_COMMON_DIR" parallel-world/ # # At this point the duplicate classes have not been removed from version-specific jar # locations such as parallel-world/spark312. -# For each unshimmed class file look for all of its copies inside /spark3* and +# For each unshimmed class file look for all of its copies inside /spark[34]* and # and count the number of distinct checksums. There are two representative cases -# 1) The class is contributed to the unshimmed location via the unshimmed-from-each-spark3xx list. These are classes +# 1) The class is contributed to the unshimmed location via the unshimmed-from-each-spark34 list. These are classes # carrying the shim classifier in their package name such as # com.nvidia.spark.rapids.spark312.RapidsShuffleManager. They are unique by construction, # and will have zero copies in any non-spark312 shims. Although such classes are currently excluded from @@ -157,25 +157,25 @@ mv "$SPARK3XX_COMMON_DIR" parallel-world/ # Determine the list of unshimmed class files UNSHIMMED_LIST_TXT=unshimmed-result.txt echo "$((++STEP))/ creating sorted list of unshimmed classes > $UNSHIMMED_LIST_TXT" -find ./parallel-world -name '*.class' -not -path './parallel-world/spark3*' | \ +find ./parallel-world -name '*.class' -not -path './parallel-world/spark[34-]*' | \ cut -d/ -f 3- | sort > "$UNSHIMMED_LIST_TXT" function verify_same_sha_for_unshimmed() { set -e class_file="$1" - # the raw spark3xx-common.txt file list contains all single-sha1 classes + # the raw spark-shared.txt file list contains all single-sha1 classes # including the ones that are unshimmed. Instead of expensively recomputing # sha1 look up if there is an entry with the unshimmed class as a suffix class_file_quoted=$(printf '%q' "$class_file") - # TODO currently RapidsShuffleManager is "removed" from /spark3* by construction in + # TODO currently RapidsShuffleManager is "removed" from /spark* by construction in # dist pom.xml via ant. We could delegate this logic to this script # and make both simmpler - if [[ ! "$class_file_quoted" =~ (com/nvidia/spark/rapids/spark3.*/.*ShuffleManager.class|org/apache/spark/sql/rapids/shims/spark3.*/ProxyRapidsShuffleInternalManager.class) ]]; then + if [[ ! "$class_file_quoted" =~ (com/nvidia/spark/rapids/spark[34].*/.*ShuffleManager.class|org/apache/spark/sql/rapids/shims/spark[34].*/ProxyRapidsShuffleInternalManager.class) ]]; then - if ! grep -q "/spark.\+/$class_file_quoted" "$SPARK3XX_COMMON_TXT"; then + if ! 
grep -q "/spark.\+/$class_file_quoted" "$SPARK_SHARED_TXT"; then echo >&2 "$class_file is not bitwise-identical across shims" exit 255 fi @@ -192,7 +192,7 @@ done < "$UNSHIMMED_LIST_TXT" echo "$((++STEP))/ removing duplicates of unshimmed classes" while read unshimmed_class; do - for pw in ./parallel-world/spark3* ; do + for pw in ./parallel-world/spark[34]* ; do unshimmed_path="$pw/$unshimmed_class" [[ -f "$unshimmed_path" ]] && echo "$unshimmed_path" || true done >> "$DELETE_DUPLICATES_TXT" diff --git a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala index 36abc75ba87..2d7a51c4e43 100644 --- a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala +++ b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala @@ -40,19 +40,19 @@ import org.apache.spark.util.MutableURLClassLoader "parallel worlds" in the JDK's com.sun.istack.internal.tools.ParallelWorldClassLoader parlance 1. a few publicly documented classes in the conventional layout at the top 2. a large fraction of classes whose bytecode is identical under all supported Spark versions - in spark3xx-common + in spark-shared 3. a smaller fraction of classes that differ under one of the supported Spark versions com/nvidia/spark/SQLPlugin.class - spark3xx-common/com/nvidia/spark/rapids/CastExprMeta.class + spark-shared/com/nvidia/spark/rapids/CastExprMeta.class spark311/org/apache/spark/sql/rapids/GpuUnaryMinus.class spark320/org/apache/spark/sql/rapids/GpuUnaryMinus.class Each shim can see a consistent parallel world without conflicts by referencing only one conflicting directory. E.g., Spark 3.2.0 Shim will use - jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark3xx-common/ + jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark-shared/ jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark320/ Spark 3.1.1 will use - jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark3xx-common/ + jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark-shared/ jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark311/ Using these Jar URL's allows referencing different bytecode produced from identical sources by incompatible Scala / Spark dependencies. 
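To make the parallel-world layout described above concrete, here is an illustrative sketch of how the two URLs a shim sees compose. The jar path is made up, and shimRootURL mirrors the private field in ShimLoader; treat the whole snippet as an assumption-laden illustration rather than the loader's actual code path.

    import java.net.URL
    val shimRootURL = new URL("jar:file:/opt/rapids-4-spark_2.12-24.08.0.jar!/")
    val sharedURL   = new URL(s"${shimRootURL.toString}spark-shared/") // common bytecode
    val shimURL     = new URL(s"${shimRootURL.toString}spark320/")     // version-specific
    // A classloader built over (sharedURL, shimURL) never sees a conflicting
    // copy of a class from another shim such as spark311/.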
@@ -67,7 +67,7 @@ object ShimLoader extends Logging { new URL(rootUrlStr) } - private val shimCommonURL = new URL(s"${shimRootURL.toString}spark3xx-common/") + private val shimCommonURL = new URL(s"${shimRootURL.toString}spark-shared/") @volatile private var shimProviderClass: String = _ @volatile private var shimProvider: SparkShimServiceProvider = _ @volatile private var shimURL: URL = _ From 356d5a1a5021c421cef7fbb73734527ba0ac1857 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Fri, 14 Jun 2024 08:45:28 +0800 Subject: [PATCH 08/10] [FEA] Increase parallelism of deltalake test on databricks (#11051) --- integration_tests/run_pyspark_from_build.sh | 6 ++++++ jenkins/databricks/test.sh | 5 +---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index dec93e6f22a..8b10b3debac 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -245,6 +245,12 @@ else DRIVER_EXTRA_JAVA_OPTIONS="-ea -Duser.timezone=$TZ -Ddelta.log.cacheSize=$deltaCacheSize" export PYSP_TEST_spark_driver_extraJavaOptions="$DRIVER_EXTRA_JAVA_OPTIONS $COVERAGE_SUBMIT_FLAGS" export PYSP_TEST_spark_executor_extraJavaOptions="-ea -Duser.timezone=$TZ" + + # Set driver memory to speed up tests such as deltalake + if [[ -n "${DRIVER_MEMORY}" ]]; then + export PYSP_TEST_spark_driver_memory="${DRIVER_MEMORY}" + fi + export PYSP_TEST_spark_ui_showConsoleProgress='false' export PYSP_TEST_spark_sql_session_timeZone=$TZ export PYSP_TEST_spark_sql_shuffle_partitions='4' diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index f71f69844f7..c966d5a92f7 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -66,9 +66,6 @@ TEST_MODE=${TEST_MODE:-'DEFAULT'} # --packages in distributed setups, should be fixed by # https://github.com/NVIDIA/spark-rapids/pull/5646 -# Increase driver memory as Delta Lake tests can slowdown with default 1G (possibly due to caching?) 
-DELTA_LAKE_CONFS="--driver-memory 2g" - # Enable event log for qualification & profiling tools testing export PYSP_TEST_spark_eventLog_enabled=true mkdir -p /tmp/spark-events @@ -105,7 +102,7 @@ if [[ "$(pwd)" == "$SOURCE_PATH" ]]; then if [[ "$TEST_MODE" == "DEFAULT" || $TEST_MODE == "CI_PART2" || "$TEST_MODE" == "DELTA_LAKE_ONLY" ]]; then ## Run Delta Lake tests - SPARK_SUBMIT_FLAGS="$SPARK_CONF $DELTA_LAKE_CONFS" TEST_PARALLEL=1 \ + DRIVER_MEMORY="4g" \ bash integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "delta_lake" --delta_lake --test_type=$TEST_TYPE fi From 599ae17e02c8b88c63fe0133caf19570f332092e Mon Sep 17 00:00:00 2001 From: "Hongbin Ma (Mahone)" Date: Fri, 14 Jun 2024 09:27:03 +0800 Subject: [PATCH 09/10] fix flaky array_item test failures (#11054) * fix flaky array_item test failures Signed-off-by: Hongbin Ma (Mahone) * fix indent Signed-off-by: Hongbin Ma (Mahone) * fix whitespace Signed-off-by: Hongbin Ma (Mahone) --------- Signed-off-by: Hongbin Ma (Mahone) --- integration_tests/src/main/python/data_gen.py | 23 ++++++++++++------- .../src/main/python/parquet_write_test.py | 6 ++++- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 2e6c36b77d9..fb1627af75b 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -159,7 +159,8 @@ def __repr__(self): return super().__repr__() + '(' + str(self._child_gen) + ')' def _cache_repr(self): - return super()._cache_repr() + '(' + self._child_gen._cache_repr() + ')' + return (super()._cache_repr() + '(' + self._child_gen._cache_repr() + + ',' + str(self._func.__code__) + ')' ) def start(self, rand): self._child_gen.start(rand) @@ -667,7 +668,10 @@ def __repr__(self): return super().__repr__() + '(' + str(self._child_gen) + ')' def _cache_repr(self): - return super()._cache_repr() + '(' + self._child_gen._cache_repr() + ')' + return (super()._cache_repr() + '(' + self._child_gen._cache_repr() + + ',' + str(self._min_length) + ',' + str(self._max_length) + ',' + + str(self.all_null) + ',' + str(self.convert_to_tuple) + ')') + def start(self, rand): self._child_gen.start(rand) @@ -701,7 +705,8 @@ def __repr__(self): return super().__repr__() + '(' + str(self._key_gen) + ',' + str(self._value_gen) + ')' def _cache_repr(self): - return super()._cache_repr() + '(' + self._key_gen._cache_repr() + ',' + self._value_gen._cache_repr() + ')' + return (super()._cache_repr() + '(' + self._key_gen._cache_repr() + ',' + self._value_gen._cache_repr() + + ',' + str(self._min_length) + ',' + str(self._max_length) + ')') def start(self, rand): self._key_gen.start(rand) @@ -769,12 +774,13 @@ def __init__(self, min_value=MIN_DAY_TIME_INTERVAL, max_value=MAX_DAY_TIME_INTER self._min_micros = (math.floor(min_value.total_seconds()) * 1000000) + min_value.microseconds self._max_micros = (math.floor(max_value.total_seconds()) * 1000000) + max_value.microseconds fields = ["day", "hour", "minute", "second"] - start_index = fields.index(start_field) - end_index = fields.index(end_field) - if start_index > end_index: + self._start_index = fields.index(start_field) + self._end_index = fields.index(end_field) + if self._start_index > self._end_index: raise RuntimeError('Start field {}, end field {}, valid fields is {}, start field index should <= end ' 'field index'.format(start_field, end_field, fields)) - super().__init__(DayTimeIntervalType(start_index, end_index), 
nullable=nullable, special_cases=special_cases)
+        super().__init__(DayTimeIntervalType(self._start_index, self._end_index), nullable=nullable,
+                         special_cases=special_cases)
 
     def _gen_random(self, rand):
         micros = rand.randint(self._min_micros, self._max_micros)
@@ -784,7 +790,8 @@ def _gen_random(self, rand):
         return timedelta(microseconds=micros)
 
     def _cache_repr(self):
-        return super()._cache_repr() + '(' + str(self._min_micros) + ',' + str(self._max_micros) + ')'
+        return (super()._cache_repr() + '(' + str(self._min_micros) + ',' + str(self._max_micros) +
+                ',' + str(self._start_index) + ',' + str(self._end_index) + ')')
 
     def start(self, rand):
         self._start(rand, lambda: self._gen_random(rand))
diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py
index 99a2d4241e8..38dab9e84a4 100644
--- a/integration_tests/src/main/python/parquet_write_test.py
+++ b/integration_tests/src/main/python/parquet_write_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -224,6 +224,10 @@ def test_all_null_int96(spark_tmp_path):
     class AllNullTimestampGen(TimestampGen):
         def start(self, rand):
             self._start(rand, lambda : None)
+
+        def _cache_repr(self):
+            return super()._cache_repr() + '(all_nulls)'
+
     data_path = spark_tmp_path + '/PARQUET_DATA'
     confs = copy_and_update(writer_confs, {'spark.sql.parquet.outputTimestampType': 'INT96'})
     assert_gpu_and_cpu_writes_are_equal_collect(

From 2f3c0c276dba3e8870f3e3bfa9287e644809f4f6 Mon Sep 17 00:00:00 2001
From: Tim Liu
Date: Fri, 14 Jun 2024 19:47:56 +0800
Subject: [PATCH 10/10] Calculate parallelism to speed up pre-merge CI (#11046)

* Calculate parallelism to speed up pre-merge CI

Calculate parallelism based on GPU memory so that the pre-merge CI runs
with an appropriate amount of parallelism. When TEST_PARALLEL > 8, the
integration tests run more and more slowly as parallelism increases, so
we limit TEST_PARALLEL to at most 8.

With this change, pre-merge CI on powerful nodes ran about one hour
faster than on common nodes:
16 CPU/128G Mem/24G GPU : [2 hours] vs. 8 CPU/64G Mem/16G GPU : [3 hours]

Note: currently we only have 3 fixed powerful nodes for the pre-merge CI
job, so only one pre-merge CI run can be sped up at a time.

Signed-off-by: Tim Liu

* Add a variable to set maximum test parallelism for the integration tests

Signed-off-by: Tim Liu

* Fix typo

Signed-off-by: Tim Liu

---------

Signed-off-by: Tim Liu
---
 integration_tests/run_pyspark_from_build.sh | 5 +++++
 jenkins/spark-premerge-build.sh             | 4 +---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh
index 8b10b3debac..18c26aa26e7 100755
--- a/integration_tests/run_pyspark_from_build.sh
+++ b/integration_tests/run_pyspark_from_build.sh
@@ -171,11 +171,16 @@ else
         TEST_TYPE_PARAM="--test_type $TEST_TYPE"
     fi
 
+    # We found that when parallelism > 8, as it increases, the test speed will become slower and slower. So we set the default maximum parallelism to 8.
+    # Note that MAX_PARALLEL varies with the hardware, OS, and test case. Please overwrite it with an appropriate value if needed.
+ MAX_PARALLEL=${MAX_PARALLEL:-8} if [[ ${TEST_PARALLEL} -lt 2 ]]; then # With xdist 0 and 1 are the same parallelism but # 0 is more efficient TEST_PARALLEL_OPTS=() + elif [[ ${TEST_PARALLEL} -gt ${MAX_PARALLEL} ]]; then + TEST_PARALLEL_OPTS=("-n" "$MAX_PARALLEL") else TEST_PARALLEL_OPTS=("-n" "$TEST_PARALLEL") fi diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 883b3f3acfc..697722c0138 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -78,7 +78,7 @@ mvn_verify() { # Here run Python integration tests tagged with 'premerge_ci_1' only, that would help balance test duration and memory # consumption from two k8s pods running in parallel, which executes 'mvn_verify()' and 'ci_2()' respectively. $MVN_CMD -B $MVN_URM_MIRROR $PREMERGE_PROFILES clean verify -Dpytest.TEST_TAGS="premerge_ci_1" \ - -Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=4 -Dcuda.version=$CLASSIFIER + -Dpytest.TEST_TYPE="pre-commit" -Dcuda.version=$CLASSIFIER # The jacoco coverage should have been collected, but because of how the shade plugin # works and jacoco we need to clean some things up so jacoco will only report for the @@ -162,7 +162,6 @@ ci_2() { $MVN_CMD -U -B $MVN_URM_MIRROR clean package $MVN_BUILD_ARGS -DskipTests=true export TEST_TAGS="not premerge_ci_1" export TEST_TYPE="pre-commit" - export TEST_PARALLEL=5 # Download a Scala 2.12 build of spark prepare_spark $SPARK_VER 2.12 @@ -206,7 +205,6 @@ ci_scala213() { cd .. # Run integration tests in the project root dir to leverage test cases and resource files export TEST_TAGS="not premerge_ci_1" export TEST_TYPE="pre-commit" - export TEST_PARALLEL=5 # SPARK_HOME (and related) must be set to a Spark built with Scala 2.13 SPARK_HOME=$SPARK_HOME PYTHONPATH=$PYTHONPATH \ ./integration_tests/run_pyspark_from_build.sh
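Putting the fingerprinting pieces from the first commit together, an end-to-end flow might look like the sketch below. JsonColumnStats.fingerPrint and JsonColumnStats.apply come from this patch series; the DBGen table setup and the file paths are assumptions for illustration only.

    // 1) Fingerprint a real JSON string column and persist the aggregates.
    val spark = SparkSession.builder.getOrCreate()
    val real = spark.read.parquet("/data/events")                // made-up path
    val fp = JsonColumnStats.fingerPrint(real, real("payload"))  // pass Some(seed) to anonymize
    fp.write.parquet("/fingerprints/payload")

    // 2) Shape a generated column from the saved fingerprint.
    val dbgen = DBGen()                                          // assumed entry point
    val table = dbgen.addTable("events", "payload string", 1000000L)
    JsonColumnStats(spark.read.parquet("/fingerprints/payload"), table("payload"))
    val synthetic = table.toDF(spark)                            // assumed call

This mirrors what the command-line tool in the first commit automates: fingerPrint computes per-path counts, mean and stddev lengths, and approximate distinct counts, and apply replays those statistics onto a string column's substring generator.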