Read JSON schema - veeraravi/Spark-notes GitHub Wiki

/** One entry of the JSON schema file.
  *
  * The meanings below are read off the field names only; confirm them against
  * the actual schema file.
  *
  * @param trim       presumably whether values of this column should be trimmed
  * @param name       column name
  * @param nullable   whether the column may hold nulls
  * @param id         optional identifier; absent entries are `None`
  * @param position   presumably the ordinal position of the column in its table
  * @param table      name of the table the column belongs to
  * @param _type      type name of the column, as a string
  * @param primaryKey whether the column is part of the primary key
  */
case class JColumn(
    trim: Boolean,
    name: String,
    nullable: Boolean,
    id: Option[String],
    position: BigInt,
    table: String,
    _type: String,
    primaryKey: Boolean
)

// Read the whole schema file into a single string and parse it as JSON.
// The file handle is closed once the text has been read.
val schemaFile = Source.fromFile("/home/tglbvks/schema.json")
// Lines are concatenated without separators, exactly as `getLines.mkString` does.
val source: String =
  try schemaFile.getLines().mkString
  finally schemaFile.close()
val json: JsValue = Json.parse(source)

import org.json4s._
import org.json4s.native.JsonMethods

/** Shape of one element of the JSON schema array.
  *
  * Only `name`, `_type` and `nullable` are read by the code visible in this
  * file; the meaning of the remaining fields is inferred from their names and
  * should be confirmed against the schema file.
  *
  * @param trim       presumably a per-column trim flag
  * @param name       column name
  * @param nullable   whether the column accepts nulls
  * @param id         optional identifier
  * @param position   presumably the column's ordinal position
  * @param table      owning table name
  * @param _type      type name of the column, as a string
  * @param primaryKey whether the column belongs to the primary key
  */
case class JColumn(
    trim: Boolean,
    name: String,
    nullable: Boolean,
    id: Option[String],
    position: BigInt,
    table: String,
    _type: String,
    primaryKey: Boolean
)

// Location of the JSON schema description.
val path = """/dev/veera/schema.json"""

// Open the file and parse its contents into a json4s AST.
val input = scala.io.Source.fromFile(path)
// The parsed value is fully built before the handle is released in `finally`.
val json =
  try JsonMethods.parse(input.reader())
  finally input.close()

// Lookup table from the type names that appear in the schema entries
// (the `_type` field) to the corresponding Spark SQL data types.
val typeMapping = Map(
  "double"  -> DoubleType,
  "integer" -> IntegerType,
  "string"  -> StringType,
  "date"    -> DateType,
  "bool"    -> BooleanType
)

// `extract` needs an implicit json4s Formats in scope.
implicit val formats: Formats = DefaultFormats

// Decode the parsed JSON array into one JColumn per schema entry.
val schema = json.extract[Array[JColumn]]

//schema.foreach(c => println(s"name:${c.name} type:${c._type} isnullable:${c.nullable}"))

// One StructField per schema entry, in file order. An entry whose type name
// has no mapping fails with a message naming the column and the type.
val rddSchema: List[StructField] = schema.toList.map { c =>
  val dataType = typeMapping.getOrElse(
    c._type,
    throw new NoSuchElementException(
      s"No Spark type mapped for '${c._type}' (column '${c.name}')"
    )
  )
  StructField(c.name, dataType, c.nullable, Metadata.empty)
}

// Load the employee CSV using the schema built from the JSON description
// instead of letting Spark infer one.
val in_emp = {
  val employeeSchema = StructType(rddSchema.toList)

  // NOTE(review): `.csv(...)` at the end of the chain appears to select the
  // reader format itself, which would make the earlier `.format(...)` call
  // redundant — confirm against the Spark version in use.
  spark.read
    .format("com.databricks.spark.csv")
    .schema(employeeSchema)
    .option("inferSchema", "false")
    .option("dateFormat", "yyyy.MM.dd")
    .option("header", "false")
    .option("delimiter", ",")
    .option("nullValue", "null")
    .option("treatEmptyValuesAsNulls", "true")
    .csv("""your_path\employee.csv""")
}

// Show the schema that was applied to the CSV.
in_emp.printSchema()

// NOTE(review): collect() brings every row back to the driver and its result
// is not used here; kept to preserve the original behaviour.
in_emp.collect()

// Print the first rows in tabular form.
in_emp.show()