#!/usr/bin/env python3

"""Code for Ch11 of the book _Data Analysis with Python and PySpark_"""

# tag::ch11-launch-pyspark-wordcount[]

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Build the session for this listing, naming the application explicitly.
spark = (
    SparkSession.builder
    .appName("Counting word occurences from a book, one more time.")  # <1>
    .getOrCreate()  # <1>
)

# Word-frequency pipeline: split each line on spaces, keep one lowercase
# word per row, strip everything but letters/apostrophes, then count.
results = (  # <2>
    spark.read.text("./data/gutenberg_books/*.txt")
    .select(F.split("value", " ").alias("line"))
    .select(F.explode("line").alias("word"))
    .select(F.lower("word").alias("word"))
    .select(F.regexp_extract("word", "[a-z']+", 0).alias("word"))
    .filter(F.col("word") != "")  # drop tokens with no letters left
    .groupBy("word")
    .count()
)

results.sort(F.desc("count")).show(10)  # <3>

# end::ch11-launch-pyspark-wordcount[]

# Shut this session down so the next listing can build its own session
# with different master/memory settings.
spark.stop()

# tag::ch11-launch-pyspark-bis[]
from pyspark.sql import SparkSession

# Same session entry point, but with the master and driver memory set
# explicitly through the builder.
spark = (
    SparkSession.builder
    .appName("Launching PySpark with custom options")
    .master("local[8]")  # <1>
    .config("spark.driver.memory", "16g")  # <2>
    .getOrCreate()
)

# [... Run the program here ...]

# end::ch11-launch-pyspark-bis[]

# Shut this session down so the next listing can build its own session
# with different master/memory settings.
spark.stop()

# tag::ch11-results-code[]

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Session used by the remaining listings: local master with 4 threads
# and 8g of driver memory requested through the builder.
spark = (
    SparkSession.builder
    .appName("Counting word occurences from a book, one more time.")
    .master("local[4]")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Word-frequency pipeline: split each line on spaces, keep one lowercase
# word per row, strip everything but letters/apostrophes, then count.
results = (
    spark.read.text("./data/gutenberg_books/*.txt")
    .select(F.split("value", " ").alias("line"))
    .select(F.explode("line").alias("word"))
    .select(F.lower("word").alias("word"))
    .select(F.regexp_extract("word", "[a-z']+", 0).alias("word"))
    .filter(F.col("word") != "")  # drop tokens with no letters left
    .groupBy("word")
    .count()
)

# Display the ten most frequent words.
results.sort(F.desc("count")).show(10)


# end::ch11-results-code[]

# tag::ch11-modified-word-count[]

# Word count followed by a second aggregation: keep words longer than
# 8 characters and total their occurrences per word length. The length
# filter is applied after the first count, which yields the two-stage
# aggregation visible in the physical plan reproduced below.
results = (
    spark.read.text("./data/gutenberg_books/*.txt")
    .select(F.split("value", " ").alias("line"))
    .select(F.explode("line").alias("word"))
    .select(F.lower("word").alias("word"))
    .select(F.regexp_extract("word", "[a-z']+", 0).alias("word"))
    .filter(F.col("word") != "")
    .groupBy("word")
    .count()
    .filter(F.length("word") > 8)
    .groupBy(F.length("word"))
    .sum("count")
)

results.show(n=5, truncate=False)
# Output not shown for brevity.

results.explain(mode="formatted")

# == Physical Plan ==
# * HashAggregate (12)
# +- Exchange (11)
#    +- * HashAggregate (10)
#       +- * HashAggregate (9)
#          +- Exchange (8)
#             +- * HashAggregate (7)
#                +- * Project (6)
#                   +- * Filter (5)
#                      +- Generate (4)
#                         +- * Project (3)
#                            +- * Filter (2)
#                               +- Scan text  (1)

# (1) Scan text
# Output [1]: [value#16766]
# Batched: false
# Location: InMemoryFileIndex [file:/.../data/gutenberg_books/11-0.txt, ... 5 entries]
# ReadSchema: struct<value:string>

# (2) Filter [codegen id : 1]
# Input [1]: [value#16766]
# Condition : ((size(split(value#16766,  , -1), true) > 0) AND isnotnull(split(value#16766,  , -1)))

# [...]

# (11) Exchange
# Input [2]: [length(word#16775)#16806, sum#16799L]
# Arguments: hashpartitioning(length(word#16775)#16806, 200), ENSURE_REQUIREMENTS, [id=#2416]

# (12) HashAggregate [codegen id : 4]
# Input [2]: [length(word#16775)#16806, sum#16799L]
# Keys [1]: [length(word#16775)#16806]
# Functions [1]: [sum(count#16779L)]
# Aggregate Attributes [1]: [sum(count#16779L)#16786L]
# Results [2]: [length(word#16775)#16806 AS length(word)#16787, sum(count#16779L)#16786L AS sum(count)#16788L]

# end::ch11-modified-word-count[]

# tag::ch11-optimized-word-count[]

# Variant that filters on word length before aggregating and counts rows
# per word length directly, so only one aggregation is needed (compare
# the physical plan reproduced below).
results_bis = (
    spark.read.text("./data/gutenberg_books/*.txt")
    .select(F.split("value", " ").alias("line"))
    .select(F.explode("line").alias("word"))
    .select(F.lower("word").alias("word"))
    .select(F.regexp_extract("word", "[a-z']+", 0).alias("word"))
    .filter(F.col("word") != "")
    .filter(F.length("word") > 8)
    .groupBy(F.length("word"))
    .count()
)

results_bis.show(n=5, truncate=False)
# Output not shown for brevity.

results_bis.explain(mode="formatted")
# == Physical Plan ==
# * HashAggregate (9)
# +- Exchange (8)
#    +- * HashAggregate (7)
#       +- * Project (6)
#          +- * Filter (5)
#             +- Generate (4)
#                +- * Project (3)
#                   +- * Filter (2)
#                      +- Scan text  (1)


# (1) Scan text
# Output [1]: [value#16935]
# Batched: false
# Location: InMemoryFileIndex [file:/.../data/gutenberg_books/11-0.txt, ... 5 entries]
# ReadSchema: struct<value:string>

# (2) Filter [codegen id : 1]
# Input [1]: [value#16935]
# Condition : ((size(split(value#16935,  , -1), true) > 0) AND isnotnull(split(value#16935,  , -1)))

# [...]

# (5) Filter [codegen id : 2]
# Input [1]: [word#16940]
# Condition : ((isnotnull(word#16940) AND NOT (regexp_extract(lower(word#16940), [a-z']+, 0) = )) AND (length(regexp_extract(lower(word#16940), [a-z']+, 0)) > 8))

# [...]

# (9) HashAggregate [codegen id : 3]
# Input [2]: [length(word#16944)#16965, count#16960L]
# Keys [1]: [length(word#16944)#16965]
# Functions [1]: [count(1)]
# Aggregate Attributes [1]: [count(1)#16947L]
# Results [2]: [length(word#16944)#16965 AS length(word)#16949, count(1)#16947L AS count#16948L]

# end::ch11-optimized-word-count[]
