#!/usr/bin/env python3

# tag::ch09-launch-pyspark[]
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Entry point to Spark: getOrCreate() hands back the already-running session
# when there is one, otherwise it starts a new one under this application name.
spark = (
    SparkSession.builder
    .appName("Counting word occurences from a book, the sequel.")
    .getOrCreate()
)
# end::ch09-launch-pyspark[]

# tag::ch09-results-code[]

# Word-frequency pipeline: one row per distinct cleaned word, with its count.
results = (
    spark.read.text("./data/gutenberg_books/*.txt")
    # Break every line of text into an array of space-separated tokens.
    .select(F.split("value", " ").alias("line"))
    # Flatten the arrays so that each token sits on its own row.
    .select(F.explode("line").alias("word"))
    .select(F.lower("word").alias("word"))
    # Keep the first run of lowercase letters/apostrophes in each token.
    # NOTE(review): the pattern can match the empty string, so a token that
    # starts with any other character appears to come out as "" and is then
    # removed by the filter below -- confirm this is intended.
    .select(F.regexp_extract("word", "[a-z']*", 0).alias("word"))
    .filter(F.col("word") != "")
    .groupBy("word")
    .count()
)

# end::ch09-results-code[]

# tag::ch09-results-job[]

# Print the first 5 rows, without truncating long cell values.
results.show(n=5, truncate=False)

# +------+-----+
# |word  |count|
# +------+-----+
# |online|4    |
# |some  |203  |
# |still |72   |
# |few   |72   |
# |hope  |122  |
# +------+-----+
# only showing top 5 rows

# end::ch09-results-job[]
