#!/usr/bin/env python3

# tag::sol5_5[]
import pyspark.sql.functions as F

# Load the call-sign lookup CSV, treating its first row as the header, and
# discard the UndertakingNo column.
call_signs = (
    spark.read.option("header", True)
    .csv("data/broadcast_logs/Call_Signs.csv")
    .drop("UndertakingNo")
)

# `answer` is defined outside this snippet; print its schema for reference.
answer.printSchema()
# root
#  |-- LogIdentifierID: string (nullable = true) <1>
#  |-- duration_commercial: long (nullable = true)
#  |-- duration_total: long (nullable = true)
#  |-- commercial_ratio: double (nullable = false)

# Schema of the call-sign lookup table loaded above.
call_signs.printSchema()
# root
#  |-- LogIdentifierID: string (nullable = true) <1>
#  |-- Undertaking_Name: string (nullable = true)


# Both frames expose LogIdentifierID, so an inner equi-join on that single
# key appends Undertaking_Name to the matching rows of `answer`.
exo5_5_df = answer.join(call_signs, on=["LogIdentifierID"], how="inner")

exo5_5_df.show(n=10)
# +---------------+-------------------+--------------+--------------------+--------------------+
# |LogIdentifierID|duration_commercial|duration_total|    commercial_ratio|    Undertaking_Name|
# +---------------+-------------------+--------------+--------------------+--------------------+
# |           CJCO|             538455|       3281593| 0.16408341924181336|Rogers Media Inc....|
# |          BRAVO|             701000|       3383060|  0.2072088582525879|              Bravo!|
# |           CFTF|                665|         45780| 0.01452599388379205|Télévision MBS in...|
# |           CKCS|             314774|       3005153|    0.10474475010091|Crossroads Televi...|
# |           CJNT|             796196|       3470359| 0.22942756066447303|Rogers Media Inc....|
# |           CKES|             303945|       2994495|  0.1015012548025627|Crossroads Televi...|
# |           CHBX|             919866|       3316728| 0.27734140393785683|Bell Media Inc., ...|
# |           CASA|             696398|       3374798| 0.20635249872733125|Casa - (formerly ...|
# |           BOOK|             607620|       3292170| 0.18456519560047022|Book Television (...|
# |         MOVIEP|             107888|       2678400|0.040280764635603344|STARZ (formerly T...|
# +---------------+-------------------+--------------+--------------------+--------------------+
# only showing top 10 rows

# end::sol5_5[]

# tag::sol5_6[]
# Column expression giving the commercial duration of one log record:
# the full duration_seconds when the trimmed ProgramClassCD is one of the
# listed codes, 75% of it when the code is "PRC", and 0 for anything else.
PRC_vs_Commercial = (  # <1>
    F.when(
        F.trim(F.col("ProgramClassCD")).isin(
            "COM", "PGI", "PRO", "LOC", "SPO", "MER", "SOL"
        ),
        F.col("duration_seconds"),
    )
    .when(  # <2>
        F.trim(F.col("ProgramClassCD")) == F.lit("PRC"),
        F.col("duration_seconds") * F.lit(0.75),
    )
    .otherwise(F.lit(0))
)

# Per-LogIdentifierID totals: the weighted commercial duration, the overall
# duration, and the ratio of the former to the latter.
exo5_6_df = (
    full_log.groupBy("LogIdentifierID")
    .agg(
        F.sum(PRC_vs_Commercial).alias("duration_commercial"),
        F.sum(F.col("duration_seconds")).alias("duration_total"),
    )
    .select(
        "*",
        (F.col("duration_commercial") / F.col("duration_total")).alias(
            "commercial_ratio"
        ),
    )
)

# Show the five highest ratios without truncating cell contents.
exo5_6_df.sort(F.col("commercial_ratio").desc()).show(n=5, truncate=False)
# end::sol5_6[]

# fmt:off
# tag::sol5_7[]

# Bucket the rows of `answer` by commercial_ratio rounded to one decimal
# place, then count the rows in each bucket as number_of_channels.
exo5_7_df = (
    answer
        .withColumn("commercial_ratio", F.round(F.col("commercial_ratio"), 1))
        .groupby("commercial_ratio")
        .agg(F.count("*").alias("number_of_channels"))
)

# List the buckets from the highest ratio down to the lowest.
exo5_7_df.orderBy(F.col("commercial_ratio").desc()).show()
# +----------------+------------------+
# |commercial_ratio|number_of_channels|
# +----------------+------------------+
# |             1.0|                24|
# |             0.9|                 4|
# |             0.8|                 1|
# |             0.7|                 1|
# |             0.5|                 1|
# |             0.4|                 5|
# |             0.3|                45|
# |             0.2|               141|
# |             0.1|                64|
# |             0.0|                38|
# +----------------+------------------+



# end::sol5_7[]
