#!/usr/bin/env python3

from pyspark.sql import SparkSession  # <1>

# Name the application first, then reuse the active session or start a new one.
_session_builder = SparkSession.builder.appName(  # <2>
    "Analyzing the vocabulary of Pride and Prejudice."
)
spark = _session_builder.getOrCreate()  # <3>

# fmt:off
# tag::sol2_1[]
from pyspark.sql.functions import col, explode

# Two rows, each holding one array in a single column called "numbers".
exo_2_1_df = spark.createDataFrame(
    [
        [[1, 2, 3, 4, 5]],
        [[5, 6, 7, 8, 9, 10]],
    ],
    schema=["numbers"],
)

# Exploding the array column yields one record per array element.
solution_2_1_df = exo_2_1_df.select(
    explode(exo_2_1_df["numbers"])
)

print(f"solution_2_1_df contains {solution_2_1_df.count()} records.")
# => solution_2_1_df contains 11 records.

solution_2_1_df.show()
# +---+
# |col|
# +---+
# |  1|
# |  2|
# |  3|
# |  4|
# |  5|
# |  5|
# |  6|
# |  7|
# |  8|
# |  9|
# | 10|
# +---+
# end::sol2_1[]
# fmt:on

# tag::exo2_1[]
# Display the array-valued input rows.
exo_2_1_df.show()

# +-------------------+
# |            numbers|
# +-------------------+
# |    [1, 2, 3, 4, 5]|
# |[5, 6, 7, 8, 9, 10]|
# +-------------------+

# Explode the "numbers" arrays into individual records.
solution_2_1_df = exo_2_1_df.select(
    explode(exo_2_1_df["numbers"])
)

# end::exo2_1[]


# tag::exo2_2[]
# A single row: two strings and one integer, under the names one/two/three.
exo2_2_df = spark.createDataFrame(
    [["test", "more test", 10_000_000_000]],
    schema=["one", "two", "three"],
)

# Print the inferred column types.
exo2_2_df.printSchema()
# root
#  |-- one: string (nullable = true)
#  |-- two: string (nullable = true)
#  |-- three: long (nullable = true)
# end::exo2_2[]


# tag::sol2_2[]
# `dtypes` yields (column name, type name) pairs; count the non-string ones.
print(sum(1 for _, dtype in exo2_2_df.dtypes if dtype != "string"))  # => 1
# end::sol2_2[]

# tag::exo2_3[]
from pyspark.sql.functions import col, length

# The `length` function returns the number of characters in a string column.

# Measure the `value` column of the loaded text, then rename the
# auto-generated "length(value)" column in a separate step.
exo2_3_df = (
    spark.read
    .text("./data/gutenberg_books/1342-0.txt")
    .select(length("value"))
    .withColumnRenamed("length(value)", "number_of_char")
)
# end::exo2_3[]

# fmt:off
# tag::sol2_3[]
# Alias the computed column directly instead of renaming it afterwards.
exo2_3_df = spark.read.text(
    "./data/gutenberg_books/1342-0.txt"
).select(
    length("value").alias("number_of_char")
)
# end::sol2_3[]
# fmt:on

# tag::exo2_4[]
from pyspark.sql.functions import col, greatest

exo2_4_df = spark.createDataFrame(
    [["key", 10_000, 20_000]], ["key", "value1", "value2"]
)

exo2_4_df.printSchema()
# root
#  |-- key: string (nullable = true)
#  |-- value1: long (nullable = true)
#  |-- value2: long (nullable = true)

# `greatest` will return the greatest value among the given columns,
# skipping null values.

# The following statement will return an error (this is the exercise prompt,
# so the faulty statement is kept as-is).
from pyspark.sql.utils import AnalysisException

try:
    exo2_4_mod = exo2_4_df.select(
        greatest(col("value1"), col("value2")).alias("maximum_value")
    ).select("key", "max_value")
except AnalysisException as err:
    # Print the error instead of letting it stop the rest of the script.
    print(err)
# end::exo2_4[]

# tag::exo2_5[]
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode, lower, regexp_extract

# Reuse the running session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Load the book; the steps below read its text through the `value` column.
book = spark.read.text("./data/gutenberg_books/1342-0.txt")

# Split every `value` on single spaces into an array column named "line".
lines = book.select(
    split(col("value"), " ").alias("line")
)

# One record per array element, named "word".
words = lines.select(
    explode(lines["line"]).alias("word")
)

# Lower-case each word.
words_lower = words.select(
    lower(words["word"]).alias("word_lower")
)

# Extract the first match (group 0) of `[a-z]*` from each lower-cased word.
words_clean = words_lower.select(
    regexp_extract(words_lower["word_lower"], "[a-z]*", 0).alias("word")
)

# Discard records whose extracted word is the empty string.
words_nonull = words_clean.filter(words_clean["word"] != "")
# end::exo2_5[]

# tag::sol2_5a[]
# Keep every record whose word differs from "is".
words_without_is = words_nonull.filter(words_nonull["word"] != "is")
# end::sol2_5a[]

# tag::sol2_5b[]
from pyspark.sql.functions import length

# Keep only the words that are longer than three characters.
words_more_than_3_char = words_nonull.filter(
    length(words_nonull["word"]) > 3
)
# end::sol2_5b[]

# tag::sol2_6[]
# Drop the four words spelled out in the variable name: "is", "not", "the"
# and "if". `isin` builds the membership test and `~` negates it, so one
# where() call removes all of them.
words_no_is_not_the_if = words_nonull.where(
    ~col("word").isin(["is", "not", "the", "if"])
)
# end::sol2_6[]

# tag::exo2_7[]
from pyspark.sql.functions import col, split

# Exercise prompt: the statements inside the `try` are intentionally faulty
# and are kept unchanged. `printSchema()` returns None, so the failure shows
# up as a plain Python AttributeError rather than a Spark AnalysisException;
# both are caught so the error is printed instead of aborting the script.
try:
    book = spark.read.text("./data/gutenberg_books/1342-0.txt")
    book = book.printSchema()
    lines = book.select(split(book.value, " ").alias("line"))
    words = lines.select(explode(col("line")).alias("word"))
except (AnalysisException, AttributeError) as err:
    print(err)

# end::exo2_7[]
