Without much introduction, here’s an Apache Spark “word count” example, written in Scala:
import org.apache.spark.sql.SparkSession
import org.apache.spark.rdd.RDD
object WordCount {
  def main(args: Array[String]): Unit = {
    val file = "Gettysburg-Address.txt"

    // create a SparkSession, running Spark locally
    val spark: SparkSession = SparkSession.builder
      .appName("Word Count")
      .config("spark.master", "local")
      .getOrCreate()

    // read the file into an RDD, one element per line
    val fileRdd: RDD[String] = spark.sparkContext.textFile(file)

    // create the (word, count) pairs, sorted by count
    val counts = fileRdd.map(_.replaceAll("[.,]", ""))
                        .map(_.replace("—", " "))
                        .flatMap(line => line.split(" "))
                        .map(word => (word, 1))
                        .reduceByKey(_ + _)
                        .sortBy(_._2)
                        .collect()

    println("------------------------------------------")
    counts.foreach(println)
    println("------------------------------------------")

    spark.stop()
  }
}
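If you want to run this example yourself, a minimal sbt build definition like the following should work; note that the Scala and Spark version numbers shown here are just an example, so adjust them to match whatever versions you’re using:

name := "WordCount"
version := "0.1"
scalaVersion := "2.12.18"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.0"

With the Gettysburg-Address.txt file in the project’s root directory, you can then start the application with “sbt run”, or package it as a JAR file and launch it with spark-submit, passing --class WordCount.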
For more details, see the Scala Cookbook, where I discuss this example along with other Spark examples.