A schema describes the structure of a DataFrame (which is the result of a query over semi-structured data). A schema can be either inferred (implicit) or explicit.
An explicit schema is described using `StructType`, which is a collection of `StructField` objects.
A `StructField` describes a single column — its name, its type, and whether or not it accepts null values.
import org.apache.spark.sql.types.StructType
// Two-column schema; each column's type is given by its type name as a string.
val schema1 = new StructType().add("a", "int").add("b", "string")
scala> schema1.printTreeString
root
|-- a: integer (nullable = true)
|-- b: string (nullable = true)
// schema2 below is equivalent to schema1 above, but passes the column types as objects instead of strings
import org.apache.spark.sql.types.{IntegerType, StringType}
// Two-column schema; each column's type is given as a type object (IntegerType, StringType).
val schema2 = new StructType().add("a", IntegerType).add("b", StringType)
scala> schema2.printTreeString
root
|-- a: integer (nullable = true)
|-- b: string (nullable = true)
scala> println(schema1.prettyJson)
{
"type" : "struct",
"fields" : [ {
"name" : "a",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "b",
"type" : "string",
"nullable" : true,
"metadata" : { }
} ]
}