Loop in Apache Spark
```scala
val data = Array(Array(1, 2, 3), Array(4, 5, 6, 7, 8))
val rdd = sc.parallelize(data, 2)

// Print the size of each array in the RDD.
rdd.collect().foreach(a => println(a.size))

// Use map() to create an RDD of the array sizes.
val countRDD = rdd.map(a => a.size)

// Print the elements of this new RDD.
countRDD.collect().foreach(a => println(a))

// Use filter() to create an RDD with just the longer arrays.
val bigRDD = rdd.filter(a => a.size > 3)

// Print each remaining array.
bigRDD.collect().foreach { a =>
  a.foreach(e => print(e + " "))
  println()
}
```
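Note that map() and filter() are lazy transformations: Spark only records them, and no job runs until an action such as collect() or count() is called. A minimal sketch of this (not from the original page; it reuses the `rdd` defined above):

```scala
// Transformations are lazy: this line launches no Spark job yet.
val sizes = rdd.map(a => a.size)

// count() is an action, so it triggers the actual computation.
// rdd holds two arrays, so n is 2.
val n = sizes.count()
```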
The three print loops produce:

```
3
5
3
5
4 5 6 7 8
```
The notebook also echoes the types of the values defined above:

```
data: Array[Array[Int]] = Array(Array(1, 2, 3), Array(4, 5, 6, 7, 8))
rdd: org.apache.spark.rdd.RDD[Array[Int]] = ParallelCollectionRDD[0] at parallelize at command-4074328521026683:5
countRDD: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[1] at map at command-4074328521026683:11
bigRDD: org.apache.spark.rdd.RDD[Array[Int]] = MapPartitionsRDD[2] at filter at command-4074328521026683:17
```
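Since this page is about looping over RDDs, here is one more sketch (assumed, not from the original notebook; it reuses the `sc` and `rdd` above). flatMap() flattens the nested arrays into a single RDD[Int], and the choice between collect() and foreach() decides where the loop actually runs:

```scala
// flatMap() flattens the nested arrays into one RDD[Int].
val flatRDD = rdd.flatMap(a => a)

// collect() copies all elements back to the driver first, so a plain
// Scala for-loop works -- safe only when the RDD is small.
for (e <- flatRDD.collect()) println(e)

// foreach() instead runs the loop body on the executors; any println
// output lands in the executor logs, not the driver console.
flatRDD.foreach(e => println(e))
```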