Spark Scan Node - pykello/pykello.github.com GitHub Wiki

Accessing the `FileSourceScanExec` node of a query plan from the Spark shell (the `org.apache.spark.sql.execution._` import below is what brings `UnaryExecNode`, `FileSourceScanExec`, and the `datasources` package into scope for the casts further down):

scala> import org.apache.spark.sql.execution._

scala> spark.sql("create table t1(a int) using parquet")

scala> val df = spark.sql("select count(*) from t1")

scala> df.queryExecution.sparkPlan
res15: org.apache.spark.sql.execution.SparkPlan =
HashAggregate(keys=[], functions=[count(1)], output=[count(1)#20L])
+- HashAggregate(keys=[], functions=[partial_count(1)], output=[count#23L])
   +- FileScan parquet default.t1[] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/hadi/mominag-playground/spark-warehouse/t1], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<>

scala> val scan = df.queryExecution.sparkPlan.asInstanceOf[UnaryExecNode].child.asInstanceOf[UnaryExecNode].child

scala> scan.getClass
res16: Class[_ <: org.apache.spark.sql.execution.SparkPlan] = class org.apache.spark.sql.execution.FileSourceScanExec

scala> val scanExec = scan.asInstanceOf[FileSourceScanExec]

scala> val analyzedScan = df.queryExecution.analyzed.children(0).children(0).asInstanceOf[datasources.LogicalRelation]

scala> val relation = analyzedScan.relation.asInstanceOf[datasources.HadoopFsRelation]

scala> val fileFormat = relation.fileFormat.asInstanceOf[datasources.parquet.ParquetFileFormat]