Spark Scan Node - pykello/pykello.github.com GitHub Wiki
Accessing the scan node of a query plan from the Spark shell:
scala> import org.apache.spark.sql.execution._
scala> spark.sql("create table t1(a int) using parquet;")
scala> val df = spark.sql("select count(*) from t1")
scala> df.queryExecution.sparkPlan
res15: org.apache.spark.sql.execution.SparkPlan =
HashAggregate(keys=[], functions=[count(1)], output=[count(1)#20L])
+- HashAggregate(keys=[], functions=[partial_count(1)], output=[count#23L])
   +- FileScan parquet default.t1[] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/hadi/mominag-playground/spark-warehouse/t1], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<>
scala> val scan = df.queryExecution.sparkPlan.asInstanceOf[UnaryExecNode].child.asInstanceOf[UnaryExecNode].child
scala> scan.getClass
res16: Class[_ <: org.apache.spark.sql.execution.SparkPlan] = class org.apache.spark.sql.execution.FileSourceScanExec
scala> val scanExec = scan.asInstanceOf[FileSourceScanExec]
scala> val analyzedScan = df.queryExecution.analyzed.children(0).children(0).asInstanceOf[datasources.LogicalRelation]
scala> val relation = analyzedScan.relation.asInstanceOf[datasources.HadoopFsRelation]
scala> val fileFormat = relation.fileFormat.asInstanceOf[datasources.parquet.ParquetFileFormat]