# tag::ch07-code-imports[]
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException  # <1>
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Reuse the running session if there is one; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()
# end::ch07-code-imports[]

# tag::ch07-code-elements[]

# Read the periodic table CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
elements = (
    spark.read.option("header", True)
    .option("inferSchema", True)
    .csv("./data/elements/Periodic_Table_Of_Elements.csv")
)

# Keep the rows whose phase is "liq", then count them per period.
liquid_elements = elements.filter(F.col("phase") == "liq")
liquid_elements.groupBy("period").count().show()

# end::ch07-code-elements[]

# tag::ch07-code-sql-fail[]
# Nothing named "elements" has been registered with Spark SQL at this point,
# so the query is rejected; print the error rather than stopping the script.
try:
    liquid_by_period = spark.sql(
        "select period, count(*) from elements "
        "where phase='liq' group by period"
    )
    liquid_by_period.show(5)
except AnalysisException as error:
    print(error)

# 'Table or view not found: elements; line 1 pos 29'
# end::ch07-code-sql-fail[]
# tag::ch07-code-sql-win[]
# Expose the data frame to Spark SQL under the name used in the query.
elements.createOrReplaceTempView("elements")  # <1>

liquid_by_period_query = (
    "select period, count(*) from elements where phase='liq' group by period"
)
spark.sql(liquid_by_period_query).show(5)

# +------+--------+
# |period|count(1)|
# +------+--------+
# |     6|       1|
# |     4|       1|
# +------+--------+ <2>
# end::ch07-code-sql-win[]

# tag::ch07-code-catalog[]

# Bare expression: in an interactive session this just displays the
# session's Catalog object (sample output below).
spark.catalog  # <1>

#  <pyspark.sql.catalog.Catalog at 0x117ef0c18>

# List what is currently registered; the sample output shows the temporary
# view "elements".
spark.catalog.listTables()  # <2>

#  [Table(name='elements', database=None, description=None,
#         tableType='TEMPORARY', isTemporary=True)]

# Remove the temporary view named "elements".
spark.catalog.dropTempView("elements")  # <3>

# List again; the sample output shows that nothing is left.
spark.catalog.listTables()  # <4>

# []

# []

# end::ch07-code-catalog[]

# tag::ch07-reading-backblaze-data[]
# Root path of the Backblaze data; each quarter is read from its own entry
# below it.
DATA_DIRECTORY = "./data/backblaze/"

# One data frame per 2019 quarter, each read with a header row and inferred
# column types.
q1, q2, q3, q4 = (
    spark.read.csv(DATA_DIRECTORY + entry, header=True, inferSchema=True)
    for entry in (
        "drive_stats_2019_Q1",
        "data_Q2_2019",
        "data_Q3_2019",
        "data_Q4_2019",
    )
)

# Q4 has two more fields than the rest

# Fields present in Q4 but absent from Q1 (kept for reference).
q4_fields_extra = set(q4.columns) - set(q1.columns)


def _with_missing_fields(frame, fields):
    """Return `frame` with a null string column for each field it lacks.

    The check is made against `frame` itself, so a field the frame already
    has keeps its data instead of being overwritten with nulls.

    Args:
        frame: the data frame to pad.
        fields: the column names the result must contain.

    Returns:
        `frame`, extended with one null `StringType` column per missing name.
    """
    present = set(frame.columns)
    for name in fields:
        if name not in present:
            frame = frame.withColumn(name, F.lit(None).cast(T.StringType()))
    return frame


# Pad each earlier quarter independently so all of them can be selected with
# Q4's column list.
q1 = _with_missing_fields(q1, q4.columns)
q2 = _with_missing_fields(q2, q4.columns)
q3 = _with_missing_fields(q3, q4.columns)


# Two alternatives are listed; as written both run and the second one wins.

# if you are only using the minimal set of data, use this version
backblaze_2019 = q3

# if you are using the full set of data, use this version
# (every quarter is put in Q4's column order before being stacked)
backblaze_2019 = q1.select(q4.columns)
for quarter in (q2.select(q4.columns), q3.select(q4.columns), q4):
    backblaze_2019 = backblaze_2019.union(quarter)

# Give each column its final type: every column whose name starts with
# "smart" becomes a long, all others pass through unchanged.


def _typed_column(name):
    """Return the column `name`, cast to long when it is a "smart" column."""
    column = F.col(name)
    if name.startswith("smart"):
        return column.cast(T.LongType())
    return column


backblaze_2019 = backblaze_2019.select(
    *map(_typed_column, backblaze_2019.columns)
)

# Make the typed frame available to Spark SQL.
backblaze_2019.createOrReplaceTempView("backblaze_stats_2019")
# end::ch07-reading-backblaze-data[]

# tag::ch07-code-where[]
# SQL version: serial numbers of the rows where failure equals 1.
failed_serials_query = (
    "select serial_number from backblaze_stats_2019 where failure = 1"
)
spark.sql(failed_serials_query).show(5)  # <1>

# Data frame version of the same query.
failed_rows = backblaze_2019.filter("failure = 1")
failed_rows.select("serial_number").show(5)

# +-------------+
# |serial_number|
# +-------------+
# |    57GGPD9NT|
# |     ZJV02GJM|
# |     ZJV03Y00|
# |     ZDEB33GK|
# |     Z302T6CW|
# +-------------+
# only showing top 5 rows
# end::ch07-code-where[]

# tag::ch07-code-groupby[]

# SQL version: smallest and largest capacity per model, with bytes divided
# by 1024^3, sorted by the largest capacity, descending.
capacity_range_sql = """SELECT
           model,
           min(capacity_bytes / pow(1024, 3)) min_GB,
           max(capacity_bytes/ pow(1024, 3)) max_GB
        FROM backblaze_stats_2019
        GROUP BY 1
        ORDER BY 3 DESC"""
spark.sql(capacity_range_sql).show(5)

# Data frame version of the same query.
capacity_gb = F.col("capacity_bytes") / F.pow(F.lit(1024), 3)
capacity_range = backblaze_2019.groupBy("model").agg(
    F.min(capacity_gb).alias("min_GB"),
    F.max(capacity_gb).alias("max_GB"),
)
capacity_range.orderBy(F.col("max_GB").desc()).show(5)

# +--------------------+--------------------+-------+
# |               model|              min_GB| max_GB|
# +--------------------+--------------------+-------+
# |       ST16000NM001G|             14902.0|14902.0|
# | TOSHIBA MG07ACA14TA|-9.31322574615478...|13039.0|
# |HGST HUH721212ALE600|             11176.0|11176.0|
# |       ST12000NM0007|-9.31322574615478...|11176.0|
# |       ST12000NM0008|             11176.0|11176.0|
# +--------------------+--------------------+-------+
# only showing top 5 rows
# end::ch07-code-groupby[]

# tag::ch07-code-having[]
# SQL version: same per-model capacity range, keeping only the models whose
# smallest and largest capacity differ.
mixed_capacity_sql = """SELECT
           model,
           min(capacity_bytes / pow(1024, 3)) min_GB,
           max(capacity_bytes/ pow(1024, 3)) max_GB
        FROM backblaze_stats_2019
        GROUP BY 1
        HAVING min_GB != max_GB
        ORDER BY 3 DESC"""
spark.sql(mixed_capacity_sql).show(5)

# Data frame version: the HAVING clause becomes a filter applied after the
# aggregation.
capacity_gb = F.col("capacity_bytes") / F.pow(F.lit(1024), 3)
capacity_range = backblaze_2019.groupBy("model").agg(
    F.min(capacity_gb).alias("min_GB"),
    F.max(capacity_gb).alias("max_GB"),
)
mixed_capacity = capacity_range.filter(F.col("min_GB") != F.col("max_GB"))
mixed_capacity.orderBy(F.col("max_GB").desc()).show(5)

# +--------------------+--------------------+-------+
# |               model|              min_GB| max_GB|
# +--------------------+--------------------+-------+
# | TOSHIBA MG07ACA14TA|-9.31322574615478...|13039.0|
# |       ST12000NM0007|-9.31322574615478...|11176.0|
# |HGST HUH721212ALN604|-9.31322574615478...|11176.0|
# |       ST10000NM0086|-9.31322574615478...| 9314.0|
# |HGST HUH721010ALE600|-9.31322574615478...| 9314.0|
# +--------------------+--------------------+-------+
# only showing top 5 rows
# end::ch07-code-having[]

# tag::ch07-code-create-table[]
# Register the drive data under the name the view definitions read from.
backblaze_2019.createOrReplaceTempView("drive_stats")

# SQL version: one view with the row count per model, and one with the count
# of rows per model where failure equals 1.
drive_days_view_sql = """
    CREATE OR REPLACE TEMP VIEW drive_days AS
        SELECT model, count(*) AS drive_days
        FROM drive_stats
        GROUP BY model"""

failures_view_sql = """CREATE OR REPLACE TEMP VIEW failures AS
           SELECT model, count(*) AS failures
           FROM drive_stats
           WHERE failure = 1
           GROUP BY model"""

for view_sql in (drive_days_view_sql, failures_view_sql):
    spark.sql(view_sql)

# Data frame version of the same two tables.
drive_days = backblaze_2019.groupBy("model").agg(
    F.count("*").alias("drive_days")
)

failed_rows = backblaze_2019.filter(F.col("failure") == 1)
failures = failed_rows.groupBy("model").agg(F.count("*").alias("failures"))
# end::ch07-code-create-table[]


# tag::ch07-code-union[]
# Q4's column names as a comma-separated list for the SELECT clauses.
columns_backblaze = ", ".join(q4.columns)  # <1>

# Register each quarter so the SQL statement can refer to it by name.
for view_name, quarter in (("Q1", q1), ("Q2", q2), ("Q3", q3), ("Q4", q4)):
    quarter.createOrReplaceTempView(view_name)  # <2>

# SQL version: stack the four quarters, each selected in Q4's column order.
union_view_template = """
    CREATE OR REPLACE TEMP VIEW backblaze_2019 AS
    SELECT {col} FROM Q1 UNION ALL
    SELECT {col} FROM Q2 UNION ALL
    SELECT {col} FROM Q3 UNION ALL
    SELECT {col} FROM Q4
"""
spark.sql(union_view_template.format(col=columns_backblaze))

# Data frame version. NOTE: this rebinds backblaze_2019 to the plain union of
# the quarters, without the long casts applied to the "smart" columns
# earlier in the file.
backblaze_2019 = q1.select(q4.columns)  # <3>
for later_quarter in (q2.select(q4.columns), q3.select(q4.columns), q4):
    backblaze_2019 = backblaze_2019.union(later_quarter)
# end::ch07-code-union[]

# tag::ch07-code-join[]

# SQL version: every model from drive_days, with its failure count where the
# failures view has a matching model.
drive_days_failures_sql = """select
           drive_days.model,
           drive_days,
           failures
    from drive_days
    left join failures
    on
        drive_days.model = failures.model"""
spark.sql(drive_days_failures_sql).show(5)

# Data frame version of the same left join.
drive_days_with_failures = drive_days.join(failures, "model", "left")
drive_days_with_failures.show(5)
# end::ch07-code-join[]

# tag::ch07-code-subqueries[]
# Failure rate per model, with both intermediate tables written as inline
# subqueries; sorted by the rate, highest first.
failure_rate_subquery_sql = """
    SELECT
        failures.model,
        failures / drive_days failure_rate
    FROM (
        SELECT
            model,
            count(*) AS drive_days
        FROM drive_stats
        GROUP BY model) drive_days
    INNER JOIN (
        SELECT
            model,
            count(*) AS failures
        FROM drive_stats
        WHERE failure = 1
        GROUP BY model) failures
    ON
        drive_days.model = failures.model
    ORDER BY 2 desc
    """
spark.sql(failure_rate_subquery_sql).show(5)
# end::ch07-code-subqueries[]

# tag::ch07-code-cte[]
# Failure rate per model again, this time naming the two intermediate tables
# up front in a WITH clause instead of nesting them.
failure_rate_cte_sql = """
    WITH drive_days as (
        SELECT
            model,
            count(*) AS drive_days
        FROM drive_stats
        GROUP BY model),
    failures as (
        SELECT
            model,
            count(*) AS failures
        FROM drive_stats
        WHERE failure = 1
        GROUP BY model)
    SELECT
        failures.model,
        failures / drive_days failure_rate
    FROM drive_days
    INNER JOIN failures
    ON
        drive_days.model = failures.model
    ORDER BY 2 desc
    """
spark.sql(failure_rate_cte_sql).show(5)
# end::ch07-code-cte[]

# tag::ch07-code-pyspark-cte[]
def failure_rate(drive_stats):
    """Return the failure rate of each drive model, highest rate first.

    Args:
        drive_stats: data frame with a `model` column and a `failure`
            column; rows whose `failure` equals 1 are counted as failures.

    Returns:
        A data frame with the columns `model`, `drive_days`, `failures` and
        `failure_rate` (`failures` divided by `drive_days`). Because the two
        counts are inner-joined, only models with at least one failure row
        appear.
    """
    # Number of rows recorded for each model.
    drive_days = (  # <1>
        drive_stats.groupBy("model").agg(F.count("*").alias("drive_days"))
    )

    # Number of rows flagged as a failure for each model.
    failed_rows = drive_stats.filter(F.col("failure") == 1)
    failures = failed_rows.groupBy("model").agg(
        F.count("*").alias("failures")
    )

    rate = F.col("failures") / F.col("drive_days")
    answer = (  # <2>
        drive_days.join(failures, "model", "inner")
        .withColumn("failure_rate", rate)
        .orderBy("failure_rate", ascending=False)
    )
    return answer


# Show the five models with the highest failure rate.
top_failure_rates = failure_rate(backblaze_2019)
top_failure_rates.show(5)

# dir() with no argument lists the names in the current (module) scope, so
# this checks for a module-level `drive_days`, not the local variable inside
# failure_rate.
drive_days_in_scope = "drive_days" in dir()
print(drive_days_in_scope)  # <3>
# end::ch07-code-pyspark-cte[]
