"""Streaming word count over a TCP text source.

Connects to a socket server (e.g. `nc -lk 9999`), splits each incoming
line into words, counts words per 1-second micro-batch, and prints the
first ten counts of every batch to the console.
"""

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Local master with two threads: one to receive the stream, one to process it.
sc = SparkContext("local[2]", "NetworkWordCount")

# Micro-batch interval of 1 second.
ssc = StreamingContext(sc, 1)

# DStream of text lines received from the data server at localhost:9999;
# each record is one line of text.
lines = ssc.socketTextStream("localhost", 9999)

# Tokenize every line on single spaces.
words = lines.flatMap(lambda line: line.split(" "))

# Classic map/reduce word count within each batch.
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda left, right: left + right)

# Print the first ten elements of each RDD generated in this DStream.
wordCounts.pprint()

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate