#!/usr/bin/env python3

# Preparation for the exercises

from pyspark.sql import SparkSession

from pyspark.sql.functions import (
    col,
    explode,
    lower,
    regexp_extract,
    split,
)

# Start (or reuse) the Spark session shared by all the exercises below.
spark = (
    SparkSession.builder
    .appName("Analyzing the vocabulary of Pride and Prejudice.")
    .getOrCreate()
)

# Read the text file into a data frame; the text lands in the `value` column.
book = spark.read.text("./data/gutenberg_books/1342-0.txt")

# Break every record into an array of space-separated tokens.
lines = book.select(split(col("value"), " ").alias("line"))

# Flatten the arrays so that each token sits on its own row.
words = lines.select(explode("line").alias("word"))

# Normalize the case before cleaning.
words_lower = words.select(lower(col("word")).alias("word"))

# Keep only the first match of `[a-z']*` found in each token.
# NOTE(review): `*` allows an empty match at position 0, so a token that
# starts with any other character presumably becomes "" and is dropped by
# the filter below -- confirm this is the intended behaviour.
words_clean = words_lower.select(
    regexp_extract("word", "[a-z']*", 0).alias("word")
)

# Discard the tokens that were reduced to an empty string.
words_nonull = words_clean.filter(col("word") != "")

# Number of occurrences of each remaining word (columns: `word`, `count`).
results = words_nonull.groupBy("word").count()

# tag::sol3_1[]
from pyspark.sql.functions import col, length

# Count how many word occurrences there are for each word length and
# display five of the resulting rows.
(
    words_nonull
    .select(length("word").alias("length"))
    .groupBy("length")
    .count()
    .show(5)
)

# +------+-----+
# |length|count|
# +------+-----+
# |    12|  815|
# |     1| 3750|
# |    13|  399|
# |     6| 9121|
# |    16|    5|
# +------+-----+
# only showing top 5 rows
# end::sol3_1[]

# tag::exo3_2[]

# Exercise prompt, kept verbatim: sort `results` (one row per distinct word)
# by descending count, then group by word length and count the rows in each
# group. As the sample output below shows, the rows that come back do not
# follow the ordering requested before the groupby.
(
    results.orderBy("count", ascending=False)
    .groupby(length(col("word")))
    .count()
    .show(5)
)
# +------------+-----+
# |length(word)|count|
# +------------+-----+
# |          12|  199|
# |           1|   10|
# |          13|  113|
# |           6|  908|
# |          16|    4|
# +------------+-----+
# only showing top 5 rows

# end::exo3_2[]

import pyspark.sql.functions as F

# tag::sol3_3[]
# Number of distinct words: aggregate to one row per word, then ask the
# data frame for its row count (an integer, not a data frame).
results = (
    spark.read.text("./data/gutenberg_books/1342-0.txt")
    .select(F.split("value", " ").alias("line"))
    .select(F.explode("line").alias("word"))
    .select(F.lower(F.col("word")).alias("word"))
    .select(F.regexp_extract("word", "[a-z']*", 0).alias("word"))
    .filter(F.col("word") != "")
    .groupBy("word")
    .count()
    .count()  # <1>
)

print(results)  # => 6595
# end::sol3_3[]


# tag::sol3_3a[]
# Same figure as the previous solution, obtained by removing duplicate
# words instead of aggregating them.
results = (
    spark.read.text("./data/gutenberg_books/1342-0.txt")
    .select(F.split("value", " ").alias("line"))
    .select(F.explode("line").alias("word"))
    .select(F.lower(F.col("word")).alias("word"))
    .select(F.regexp_extract("word", "[a-z']*", 0).alias("word"))
    .filter(F.col("word") != "")
    .distinct()  # <1>
    .count()
)

print(results)  # => 6595
# end::sol3_3a[]

# tag::sol3_3b[]
def num_of_distinct_words(file):
    """Return the number of distinct cleaned words found in a text file.

    The file is read with the module-level `spark` session, split on single
    spaces, lowercased, and reduced to the first match of `[a-z']*` in each
    token; tokens left empty are ignored.

    Args:
        file: Path handed to `spark.read.text`.

    Returns:
        The row count of the de-duplicated `word` column.
    """
    # One row per space-separated token of the input.
    tokens = (
        spark.read.text(file)
        .select(F.split("value", " ").alias("line"))
        .select(F.explode("line").alias("word"))
    )

    # Lowercase first so that the `[a-z']*` pattern can match the letters.
    lowered = tokens.select(F.lower(F.col("word")).alias("word"))
    cleaned = lowered.select(
        F.regexp_extract("word", "[a-z']*", 0).alias("word")
    )

    vocabulary = cleaned.filter(F.col("word") != "").distinct()
    return vocabulary.count()


print(num_of_distinct_words("./data/gutenberg_books/1342-0.txt"))  # => 6595
# end::sol3_3b[]

# tag::sol3_4[]
# Words that occur exactly once: count the occurrences of every word, then
# keep the rows whose count is 1.
results = (
    spark.read.text("./data/gutenberg_books/1342-0.txt")
    .select(F.split("value", " ").alias("line"))
    .select(F.explode("line").alias("word"))
    .select(F.lower(F.col("word")).alias("word"))
    .select(F.regexp_extract("word", "[a-z']*", 0).alias("word"))
    .filter(F.col("word") != "")
    .groupBy("word")
    .count()
    .filter(F.col("count") == 1)  # <1>
)

results.show(5)
# +------------+-----+
# |        word|count|
# +------------+-----+
# |   imitation|    1|
# |     solaced|    1|
# |premeditated|    1|
# |     elevate|    1|
# |   destitute|    1|
# +------------+-----+
# only showing top 5 rows
# end::sol3_4[]



# tag::res3_5[]
# Rebuild the per-word occurrence counts (columns: `word`, `count`) that the
# two following solutions start from.
results = (
    spark.read.text("./data/gutenberg_books/1342-0.txt")
    .select(F.split("value", " ").alias("line"))
    .select(F.explode("line").alias("word"))
    .select(F.lower(F.col("word")).alias("word"))
    .select(F.regexp_extract("word", "[a-z']*", 0).alias("word"))
    .filter(F.col("word") != "")
    .groupBy("word")
    .count()
)
# end::res3_5[]

# tag::sol3_5a[]
# Total the word counts per first letter and show the five largest totals.
# `sum()` is called without arguments; per the sample output below it yields
# a single `sum(count)` column, which is then used as the sort key.
(
    results
    .withColumn("first_letter", F.substring("word", 1, 1))
    .groupBy("first_letter")
    .sum()
    .sort("sum(count)", ascending=False)
    .show(5)
)

# +------------+----------+
# |first_letter|sum(count)|
# +------------+----------+
# |           t|     16101|
# |           a|     13684|
# |           h|     10419|
# |           w|      9091|
# |           s|      8791|
# +------------+----------+
# only showing top 5 rows
# end::sol3_5a[]

# tag::sol3_5b[]
# Total the word counts separately for words whose first character is one of
# a, e, i, o, u and for all the other words.
(
    results
    .withColumn(
        "first_letter_vowel",
        F.substring("word", 1, 1).isin("a", "e", "i", "o", "u"),
    )
    .groupBy("first_letter_vowel")
    .sum()
    .show()
)
# +------------------+----------+
# |first_letter_vowel|sum(count)|
# +------------------+----------+
# |              true|     33522|
# |             false|     88653|
# +------------------+----------+
# end::sol3_5b[]
