"""Book code for chapter 2 of Data Analysis with Python and PySpark"""

# tag::ch02-sparksession[]

from pyspark.sql import SparkSession  # <1>

# Entry point for the whole chapter: obtain a session through the builder,
# labelling the application with a descriptive name.
spark = (
    SparkSession.builder  # <2>
    .appName("Analyzing the vocabulary of Pride and Prejudice.")  # <3>
    .getOrCreate()
)

# end::ch02-sparksession[]

# tag::ch02-code-reading-file[]

# Read the plain-text file into a data frame bound to `book`. The path is
# relative, so it is resolved against the current working directory.
book = spark.read.text("./data/gutenberg_books/1342-0.txt")

# REPL-style inspection: at an interactive prompt this bare expression echoes
# the data frame summary recorded in the comment that follows; when the file is
# run as a script the expression is evaluated and discarded.
book
# DataFrame[value: string]

# end::ch02-code-reading-file[]

# tag::ch02-code-print-schema[]

# Print the schema as a tree: one entry per column, with its type and
# nullability (expected output below).
book.printSchema()

# root  <1>
#  |-- value: string (nullable = true)  <2>

# `dtypes` gives the column names and types as a list of (name, type) tuples.
print(book.dtypes)

# [('value', 'string')] <3>

# end::ch02-code-print-schema[]

# tag::ch02-code-show[]

# Print the data frame as a text table. Called without arguments, the expected
# output below reports the top 20 rows and abbreviates long values with "...".
book.show()

# +--------------------+
# |               value|  <1>
# +--------------------+
# |The Project Guten...|
# |                    |
# |This eBook is for...|
# |almost no restric...|
# |re-use it under t...|
# |with this eBook o...|
# |                    |
# |                    |
# |Title: Pride and ...|
# |                    |
# | [... more records] |
# |Character set enc...|
# |                    |
# +--------------------+
# only showing top 20 rows

# end::ch02-code-show[]

# tag::ch02-code-show2[]

# Same table, but limited to 10 rows and with values cut at 50 characters
# so more of each line is visible.
book.show(n=10, truncate=50)

# +--------------------------------------------------+
# |                                             value|
# +--------------------------------------------------+
# |The Project Gutenberg EBook of Pride and Prejud...|
# |                                                  |
# |This eBook is for the use of anyone anywhere at...|
# |almost no restrictions whatsoever.  You may cop...|
# |re-use it under the terms of the Project Gutenb...|
# |    with this eBook or online at www.gutenberg.org|
# |                                                  |
# |                                                  |
# |                        Title: Pride and Prejudice|
# |                                                  |
# +--------------------------------------------------+
# only showing top 10 rows

# end::ch02-code-show2[]

# tag::ch02-code-split-words[]

from pyspark.sql.functions import split

# Turn each line of text into an array of space-separated tokens and name the
# resulting column "line".
lines = book.select(
    split(book["value"], " ").alias("line")
)

# Preview the first five arrays.
lines.show(n=5)

# +--------------------+
# |                line|
# +--------------------+
# |[The, Project, Gu...|
# |                  []|
# |[This, eBook, is,...|
# |[almost, no, rest...|
# |[re-use, it, unde...|
# +--------------------+
# only showing top 5 rows

# end::ch02-code-split-words[]

# tag::ch02-code-simple-select[]

# Simplest possible selection: keep only the `value` column, referenced as an
# attribute of the data frame. The result is not assigned to anything.
book.select(book.value)

# end::ch02-code-simple-select[]

# tag::ch02-code-multiple-selection[]

from pyspark.sql.functions import col

# Four ways of referring to the same `value` column inside select().
# None of the results is assigned; the statements only illustrate the syntax.
book.select(book.value)  # attribute access on the data frame
book.select(book["value"])  # item access on the data frame
book.select(col("value"))  # col() function, not tied to a data frame variable
book.select("value")  # plain column name as a string

# end::ch02-code-multiple-selection[]

# tag::ch02-code-split-example[]

from pyspark.sql.functions import col, split

# Split every line on a single space. No alias is given, so the resulting
# column carries the name generated from the expression (see output below).
lines = book.select(split(col("value"), " "))

# REPL-style inspection: at an interactive prompt this echoes the summary below.
lines

# DataFrame[split(value,  , -1): array<string>]

# The schema confirms that the new column is an array of strings.
lines.printSchema()

# root
#  |-- split(value,  , -1): array (nullable = true)
#  |    |-- element: string (containsNull = true)

# Preview the first five rows of the split result.
lines.show(5)

# +--------------------+
# | split(value,  , -1)|
# +--------------------+
# |[The, Project, Gu...|
# |                  []|
# |[This, eBook, is,...|
# |[almost, no, rest...|
# |[re-use, it, unde...|
# +--------------------+
# only showing top 5 rows

# end::ch02-code-split-example[]

# tag::ch02-code-simple-alias[]

# Without an alias, the column keeps the name generated from the expression.
book.select(split(col("value"), " ")).printSchema()
# root
#  |-- split(value,  , -1): array (nullable = true)  <1>
#  |    |-- element: string (containsNull = true)

# With .alias("line"), the same column appears in the schema as `line`.
book.select(split(col("value"), " ").alias("line")).printSchema()

# root
#  |-- line: array (nullable = true) <2>
#  |    |-- element: string (containsNull = true)

# end::ch02-code-simple-alias[]

# tag::ch02-code-renaming-examples[]

# This looks a lot cleaner: the column is named in the same expression that
# creates it.
lines = book.select(split(book.value, " ").alias("line"))

# This is messier, and you have to remember the name PySpark assigns
# automatically. The generated name has to be spelled exactly as PySpark
# produced it, including the two consecutive spaces after "value,".
# NOTE(review): the PySpark docs describe withColumnRenamed as a no-op when the
# given column is absent, so a misspelled name would only surface later, when
# `line` is first used. Confirm against the Spark version in use.
lines = book.select(split(book.value, " "))
lines = lines.withColumnRenamed("split(value,  , -1)", "line")

# end::ch02-code-renaming-examples[]

# tag::ch02-code-explode[]

from pyspark.sql.functions import explode, col

# Give every element of the `line` arrays a row of its own, in a column
# called "word".
words = lines.select(
    explode(col("line")).alias("word"),
)

# Preview the first fifteen words.
words.show(n=15)
# +----------+
# |      word|
# +----------+
# |       The|
# |   Project|
# | Gutenberg|
# |     EBook|
# |        of|
# |     Pride|
# |       and|
# |Prejudice,|
# |        by|
# |      Jane|
# |    Austen|
# |          |
# |      This|
# |     eBook|
# |        is|
# +----------+
# only showing top 15 rows

# end::ch02-code-explode[]

# tag::ch02-code-lower-case[]

from pyspark.sql.functions import lower

# Lower-case every word; the result is stored in a new column, "word_lower".
words_lower = words.select(
    lower(words["word"]).alias("word_lower")
)

# Preview the lower-cased words.
words_lower.show()

# +-----------+
# | word_lower|
# +-----------+
# |        the|
# |    project|
# |  gutenberg|
# |      ebook|
# |         of|
# |      pride|
# |        and|
# | prejudice,|
# |         by|
# |       jane|
# |     austen|
# |           |
# |       this|
# |      ebook|
# |         is|
# |        for|
# |        the|
# |        use|
# |         of|
# |     anyone|
# +-----------+
# only showing top 20 rows

# end::ch02-code-lower-case[]

# tag::ch02-regexp-word[]
from pyspark.sql.functions import regexp_extract

# Keep only the first run of lower-case letters found in each word (group 0 is
# the whole match), which drops surrounding punctuation such as trailing commas.
words_clean = words_lower.select(
    regexp_extract("word_lower", "[a-z]+", 0).alias("word")  # <1>
)

# Preview the cleaned words.
words_clean.show()

# +---------+
# |     word|
# +---------+
# |      the|
# |  project|
# |gutenberg|
# |    ebook|
# |       of|
# |    pride|
# |      and|
# |prejudice|
# |       by|
# |     jane|
# |   austen|
# |         |
# |     this|
# |    ebook|
# |       is|
# |      for|
# |      the|
# |      use|
# |       of|
# |   anyone|
# +---------+
# only showing top 20 rows

# end::ch02-regexp-word[]

# tag::ch02-code-filtering[]

# Discard the rows whose word is the empty string.
words_nonull = words_clean.where(
    col("word") != ""
)

# Preview the remaining words.
words_nonull.show()

# +---------+
# |     word|
# +---------+
# |      the|
# |  project|
# |gutenberg|
# |    ebook|
# |       of|
# |    pride|
# |      and|
# |prejudice|
# |       by|
# |     jane|
# |   austen|
# |     this| <1>
# |    ebook|
# |       is|
# |      for|
# |      the|
# |      use|
# |       of|
# |   anyone|
# | anywhere|
# +---------+
# only showing top 20 rows

# end::ch02-code-filtering[]
