Coalesce
def coalesce(numPartitions: Int, shuffle: Boolean = false)(implicit ord: Ordering[T] = null): RDD[T]
- Scala> var data = sc.textFile ("/tmp/lxw1234/1.txt")
- Data: org.apache.spark.rdd.RDD [String] = MapPartitionsRDD [53] at textFile at: 21
-
- Scala> data.collect
- Res37: Array [String] = Array (hello world, hello spark, hello hive, hi spark)
-
- Scala> data.partitions.size
- Res38: Int = 2 // By default, the RDD data has two partitions
-
- Scala> var rdd1 = data.coalesce (1)
- Rdd1: org.apache.spark.rdd.RDD [String] = CoalescedRDD [2] at coalesce at: 23
-
- Scala> rdd1.partitions.size
- Res1: Int = 1 // The number of partitions of rdd1 is 1
-
-
- Scala> var rdd1 = data.coalesce (4)
- Rdd1: org.apache.spark.rdd.RDD [String] = CoalescedRDD [3] at coalesce at: 23
-
- Scala> rdd1.partitions.size
- Res2: Int = 2 // If the requested number of partitions is greater than the original number of partitions, the shuffle parameter must be set to true; otherwise the number of partitions does not change
-
- Scala> var rdd1 = data.coalesce (4, true)
- Rdd1: org.apache.spark.rdd.RDD [String] = MapPartitionsRDD [7] at coalesce at: 23
-
- Scala> rdd1.partitions.size
- Res3: Int = 4
-
Repartition
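def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T]
repartition(numPartitions) is equivalent to coalesce(numPartitions, shuffle = true), so it can both decrease and increase the number of partitions.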
- Scala> var rdd2 = data.repartition (1)
- Rdd2: org.apache.spark.rdd.RDD [String] = MapPartitionsRDD [11] at repartition at: 23
-
- Scala> rdd2.partitions.size
- Res4: Int = 1
-
- Scala> var rdd2 = data.repartition (4)
- Rdd2: org.apache.spark.rdd.RDD [String] = MapPartitionsRDD [15] at repartition at: 23
-
- Scala> rdd2.partitions.size
- Res5: Int = 4
-
Commentaires
Enregistrer un commentaire