Spark function to explain: coalesce
Returns a new RDD whose data has been redistributed into the requested number of partitions (optionally performing a shuffle).
Function prototype
def coalesce(numPartitions: Int, shuffle: Boolean = false)(implicit ord: Ordering[T] = null): RDD[T]
/**
 * User: 过往记忆
 * Date: 15-03-09
 * Time: 上午06:30
 * blog: https://www.iteblog.com
 * 过往记忆博客,专注于hadoop、hive、spark、shark、flume的技术博客,大量的干货
 * 过往记忆博客微信公共帐号:iteblog_hadoop
 */
scala> var data = sc.parallelize(List(1,2,3,4))
data: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[45] at parallelize at <console>:12

scala> data.partitions.length
res68: Int = 30

scala> val result = data.coalesce(2, false)
result: org.apache.spark.rdd.RDD[Int] = CoalescedRDD[57] at coalesce at <console>:14

scala> result.partitions.length
res77: Int = 2

scala> result.toDebugString
res75: String =
(2) CoalescedRDD[57] at coalesce at <console>:14 []
 |  ParallelCollectionRDD[45] at parallelize at <console>:12 []

scala> val result1 = data.coalesce(2, true)
result1: org.apache.spark.rdd.RDD[Int] = MappedRDD[61] at coalesce at <console>:14

scala> result1.toDebugString
res76: String =
(2) MappedRDD[61] at coalesce at <console>:14 []
 |  CoalescedRDD[60] at coalesce at <console>:14 []
 |  ShuffledRDD[59] at coalesce at <console>:14 []
 +-(30) MapPartitionsRDD[58] at coalesce at <console>:14 []
    |  ParallelCollectionRDD[45] at parallelize at <console>:12 []
Commentaires
Enregistrer un commentaire