#!/usr/bin/env python3

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Reuse the active Spark session when there is one; otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# tag::sol4_3[]
# os.path.join is used just below, so os must be imported before this point;
# relying on an import placed later in the script raises a NameError here.
import os

DIRECTORY = "./data/broadcast_logs"

# Read the file with the CSV reader's default options (no sep, no header):
# as the output below shows, every line lands in a single string column, _c0.
logs_raw = spark.read.csv(
    os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8.CSV")
)

logs_raw.printSchema()
# root
#  |-- _c0: string (nullable = true)

logs_raw.show(5, truncate=50)
# +--------------------------------------------------+
# |                                               _c0|
# +--------------------------------------------------+
# |BroadcastLogID|LogServiceID|LogDate|SequenceNO|...|
# |1196192316|3157|2018-08-01|1|4||13|3|3|||10|19|...|
# |1196192317|3157|2018-08-01|2||||1|||||20|||00:0...|
# |1196192318|3157|2018-08-01|3||||1|||||3|||00:00...|
# |1196192319|3157|2018-08-01|4||||1|||||3|||00:00...|
# +--------------------------------------------------+
# only showing top 5 rows
# end::sol4_3[]

import os

DIRECTORY = "./data/broadcast_logs"

# Read the broadcast log as a pipe-delimited file whose first line is the
# header, inferring column types and parsing timestamps as yyyy-MM-dd.
logs_path = os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8.CSV")  # <1>
logs = spark.read.csv(
    logs_path,
    sep="|",  # <2>
    header=True,  # <3>
    inferSchema=True,  # <4>
    timestampFormat="yyyy-MM-dd",  # <5>
)

# tag::sol4_4[]
# Keep only the columns whose name does not end in "ID".
non_id_columns = [name for name in logs.columns if not name.endswith("ID")]
logs_clean = logs.select(*non_id_columns)

logs_clean.printSchema()
# root
#  |-- LogDate: timestamp (nullable = true)
#  |-- SequenceNO: integer (nullable = true)
#  |-- Duration: string (nullable = true)
#  |-- EndTime: string (nullable = true)
#  |-- LogEntryDate: timestamp (nullable = true)
#  |-- ProductionNO: string (nullable = true)
#  |-- ProgramTitle: string (nullable = true)
#  |-- StartTime: string (nullable = true)
#  |-- Subtitle: string (nullable = true)
#  |-- Producer1: string (nullable = true)
#  |-- Producer2: string (nullable = true)
#  |-- Language1: integer (nullable = true)
#  |-- Language2: integer (nullable = true)
# end::sol4_4[]
