"""Book code for chapter 3 of Data Analysis with Python and PySpark"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, lower, regexp_extract, split

# Attach to the Spark session that is already running, or start a new one.
spark = SparkSession.builder.getOrCreate()

# Stand-in so that `words_nonull` is bound before the first listing uses it;
# the frame is rebuilt from the book text in the before/after listing below.
# NOTE(review): "dummy_file" is presumably not a real data set, and a CSV read
# without a header would not produce a `word` column -- confirm this line is
# only a placeholder and is never meant to be executed against real data.
words_nonull = spark.read.format("csv").load("dummy_file")

# tag::ch03-code-frequencies[]

# `col` is the column-reference helper from pyspark.sql.functions.
# groupby() on its own does not aggregate anything: as the printed output
# below shows, it hands back a GroupedData object, not a data frame.
groups = words_nonull.groupby(col("word"))

print(groups)

# <pyspark.sql.group.GroupedData at 0x10ed23da0>

# count() on the grouped data produces a data frame with the `word` column
# and a `count` column. `groups` is reused here so the grouping expression is
# written only once.
results = groups.count()

print(results)

# DataFrame[word: string, count: bigint]

# Print the first rows of the frequency table (sample output follows).
results.show()

# +-------------+-----+
# |         word|count|
# +-------------+-----+
# |       online|    4|
# |         some|  203|
# |        still|   72|
# |          few|   72|
# |         hope|  122|
# [...]
# |       doubts|    2|
# |    destitute|    1|
# |    solemnity|    5|
# |gratification|    1|
# |    connected|   14|
# +-------------+-----+
# only showing top 20 rows

# end::ch03-code-frequencies[]

# tag::ch03-code-orderby[]

# Two spellings of a descending sort on the `count` column; each call prints
# the first ten rows of the sorted frame.
# 1. Column name as a string, direction given by the `ascending` keyword.
results.orderBy("count", ascending=False).show(10)
# 2. Column object, direction given by its `desc()` method.
results.orderBy(col("count").desc()).show(10)

# +----+-----+
# |word|count|
# +----+-----+
# | the| 4480|
# |  to| 4218|
# |  of| 3711|
# | and| 3504|
# | her| 2199|
# |   a| 1982|
# |  in| 1909|
# | was| 1838|
# |   i| 1749|
# | she| 1668|
# +----+-----+
# only showing top 10 rows

# end::ch03-code-orderby[]

# tag::ch03-code-before-and-after[]

# Before: one named intermediate data frame per transformation step.
# Load the book with the text reader.
book = spark.read.text("./data/gutenberg_books/1342-0.txt")

# Split each `value` on single spaces; the resulting column is named `line`.
lines = book.select(split(book.value, " ").alias("line"))

# explode() gives every element of `line` its own row, in a column `word`.
words = lines.select(explode(col("line")).alias("word"))

# Lower-case every word.
words_lower = words.select(lower(col("word")).alias("word"))

# Keep only what the pattern matches (group 0): lowercase letters and
# apostrophes. Because of the `*`, the pattern can match an empty string;
# those rows are removed by the filter that follows.
words_clean = words_lower.select(
    regexp_extract(col("word"), "[a-z']*", 0).alias("word")
)

# Discard rows whose word is the empty string.
words_nonull = words_clean.where(col("word") != "")

# Count how many times each distinct word occurs.
results = words_nonull.groupby("word").count()

# After: the same steps written as a single method chain, with every
# function reached through the `F` module alias instead of a bare name.
import pyspark.sql.functions as F

results = (
    # Load the book with the text reader.
    spark.read.text("./data/gutenberg_books/1342-0.txt")
    # Split each `value` on single spaces into a column named `line`.
    .select(F.split(F.col("value"), " ").alias("line"))
    # One row per element of `line`, in a column named `word`.
    .select(F.explode(F.col("line")).alias("word"))
    # Lower-case every word.
    .select(F.lower(F.col("word")).alias("word"))
    # Keep only what the pattern matches (lowercase letters and apostrophes).
    .select(F.regexp_extract(F.col("word"), "[a-z']*", 0).alias("word"))
    # Discard rows whose word is the empty string.
    .where(F.col("word") != "")
    # Count how many times each distinct word occurs.
    .groupby("word")
    .count()
)

# end::ch03-code-before-and-after[]
