"""Code for the Data Analysis with Python and PySpark book, chapter 6."""

# pylint: disable=C0302, C0413

# tag::ch06-reading-json-python[]
import json  # <1>

# A small JSON document describing a single TV show, held as a Python string.
# It mixes scalars, an array (`genres`) and nested objects (`network`,
# `network.country`).
sample_json = """{
  "id": 143,
  "name": "Silicon Valley",
  "type": "Scripted",
  "language": "English",
  "genres": [
    "Comedy"
  ],
  "network": {
    "id": 8,
    "name": "HBO",
    "country": {
      "name": "United States",
      "code": "US",
      "timezone": "America/New_York"
    }
  }
}"""

# Parse the JSON text into native Python objects.
document = json.loads(sample_json)
print(document)  # <2>
# {'id': 143,
#  'name': 'Silicon Valley',
#  'type': 'Scripted',
#  'language': 'English',
#  'genres': ['Comedy'],
#  'network': {'id': 8,
#   'name': 'HBO',
#   'country': {'name': 'United States',
#    'code': 'US',
#    'timezone': 'America/New_York'}}}

# REPL-style expression: echoes the type interactively, but prints nothing
# when this file is run as a script.
type(document)
# dict  <3>

# end::ch06-reading-json-python[]

# tag::ch06-reading-the-data[]

from pyspark.sql import SparkSession

# Obtain a SparkSession through the builder's getOrCreate().
spark = SparkSession.builder.getOrCreate()

# Read one show's JSON file into a data frame; no schema is supplied here.
shows = spark.read.json("./data/shows/shows-silicon-valley.json")  # <1>

shows.count()
# 1  <2>
# end::ch06-reading-the-data[]

# Sanity check kept outside the tagged listing: the file yields one record.
assert shows.count() == 1

# tag::ch06-json-multi[]

# The glob pattern picks up every `shows-*.json` file in the directory;
# `multiLine=True` is forwarded to the JSON reader.
three_shows = spark.read.json("./data/shows/shows-*.json", multiLine=True)

three_shows.count()
# 3

# Sanity check: the glob is expected to yield exactly three records.
assert three_shows.count() == 3

# end::ch06-json-multi[]

# tag::ch06-json-print-schema[]

# Print the schema of `shows` as a tree (only the first columns are
# reproduced in the expected output below).
shows.printSchema()
# root  <1>
#  |-- _embedded: struct (nullable = true)  <2>
#  |    |-- episodes: array (nullable = true)
#  |    |    |-- element: struct (containsNull = true)
#  |    |    |    |-- _links: struct (nullable = true)
#  |    |    |    |    |-- self: struct (nullable = true)
#  |    |    |    |    |    |-- href: string (nullable = true)
#  |    |    |    |-- airdate: string (nullable = true)
#  |    |    |    |-- airstamp: string (nullable = true)
#  |    |    |    |-- airtime: string (nullable = true)
#  |    |    |    |-- id: long (nullable = true)
#  |    |    |    |-- image: struct (nullable = true)
#  |    |    |    |    |-- medium: string (nullable = true)
#  |    |    |    |    |-- original: string (nullable = true)
#  |    |    |    |-- name: string (nullable = true)
#  |    |    |    |-- number: long (nullable = true)
#  |    |    |    |-- runtime: long (nullable = true)
#  |    |    |    |-- season: long (nullable = true)
#  |    |    |    |-- summary: string (nullable = true)
#  |    |    |    |-- url: string (nullable = true)
#  |-- _links: struct (nullable = true)
#  |    |-- previousepisode: struct (nullable = true)
#  |    |    |-- href: string (nullable = true)
#  |    |-- self: struct (nullable = true)
#  |    |    |-- href: string (nullable = true)
#  |-- externals: struct (nullable = true)
#  |    |-- imdb: string (nullable = true)
#  |    |-- thetvdb: long (nullable = true)
#  |    |-- tvrage: long (nullable = true)
#  |-- genres: array (nullable = true)
#  |    |-- element: string (containsNull = true)
#  |-- id: long (nullable = true)
# [and more columns...]

# end::ch06-json-print-schema[]


# tag::ch06-first-layer-col[]

# List the top-level column names only; nested fields are not included.
print(shows.columns)

# ['_embedded', '_links', 'externals', 'genres', 'id', 'image',
#  'language', 'name', 'network', 'officialSite', 'premiered',
#  'rating', 'runtime', 'schedule', 'status', 'summary', 'type',
#  'updated', 'url', 'webChannel', 'weight']

# end::ch06-first-layer-col[]

# tag::ch06-array-column[]
# Keep the show name next to its array-typed `genres` column.
array_subset = shows.select("name", "genres")

# Show a single row; the second argument (False) turns off cell truncation.
array_subset.show(1, False)
# +--------------+--------+
# |name          |genres  |
# +--------------+--------+
# |Silicon Valley|[Comedy]|
# +--------------+--------+

# end::ch06-array-column[]

# tag::ch06-array-index[]
import pyspark.sql.functions as F

# Extract the first element of `genres` in four ways: the column is
# referenced either as an attribute of the data frame or through F.col(), and
# the element is taken either with brackets or with getItem().
array_subset = array_subset.select(
    "name",
    array_subset.genres[0].alias("dot_and_index"),  # <1>
    F.col("genres")[0].alias("col_and_index"),
    array_subset.genres.getItem(0).alias("dot_and_method"),  # <2>
    F.col("genres").getItem(0).alias("col_and_method"),
)

# All four columns hold the same value (see the expected output below).
array_subset.show()

# +--------------+-------------+-------------+--------------+--------------+
# |          name|dot_and_index|col_and_index|dot_and_method|col_and_method|
# +--------------+-------------+-------------+--------------+--------------+
# |Silicon Valley|       Comedy|       Comedy|        Comedy|        Comedy|
# +--------------+-------------+-------------+--------------+--------------+

# end::ch06-array-index[]

# tag::ch06-array-discovery[]
# Build two array columns next to `name`:
#   - Some_Genres: an array assembled from three literal genre values;
#   - Repeated_Genres: the show's first genre repeated five times.
array_subset_repeated = array_subset.select(
    "name",
    F.array(
        F.lit("Comedy"), F.lit("Horror"), F.lit("Drama")
    ).alias("Some_Genres"),  # <1>
    F.array_repeat(F.col("dot_and_index"), 5).alias("Repeated_Genres"),  # <2>
)

# Display both arrays in full (no truncation).
array_subset_repeated.show(1, False)

# +--------------+-----------------------+----------------------------------------+
# |name          |Some_Genres            |Repeated_Genres                         |
# +--------------+-----------------------+----------------------------------------+
# |Silicon Valley|[Comedy, Horror, Drama]|[Comedy, Comedy, Comedy, Comedy, Comedy]|
# +--------------+-----------------------+----------------------------------------+

# Number of elements in each array.
array_subset_repeated.select(
    "name", F.size("Some_Genres"), F.size("Repeated_Genres")  # <3>
).show()

# +--------------+-----------------+---------------------+
# |          name|size(Some_Genres)|size(Repeated_Genres)|
# +--------------+-----------------+---------------------+
# |Silicon Valley|                3|                    5|
# +--------------+-----------------+---------------------+

# Remove duplicate elements inside each array.
array_subset_repeated.select(
    "name",
    F.array_distinct("Some_Genres"),  # <4>
    F.array_distinct("Repeated_Genres"),  # <4>
).show(1, False)

# +--------------+---------------------------+-------------------------------+
# |name          |array_distinct(Some_Genres)|array_distinct(Repeated_Genres)|
# +--------------+---------------------------+-------------------------------+
# |Silicon Valley|[Comedy, Horror, Drama]    |[Comedy]                       |
# +--------------+---------------------------+-------------------------------+

# Keep only the elements present in both arrays; the result replaces the
# frame bound to `array_subset_repeated`.
array_subset_repeated = array_subset_repeated.select(
    "name",
    F.array_intersect("Some_Genres", "Repeated_Genres").alias(  # <5>
        "Genres"
    ),
)

array_subset_repeated.show()

# +--------------+--------+
# |          name|  Genres|
# +--------------+--------+
# |Silicon Valley|[Comedy]|
# +--------------+--------+

# end::ch06-array-discovery[]

# tag::ch06-array-position[]

# Locate "Comedy" inside `Genres`; the expected output below reports 1 for
# the first element, i.e. the position is 1-based.
array_subset_repeated.select(
    "Genres", F.array_position("Genres", "Comedy")
).show()

# +--------+------------------------------+
# |  Genres|array_position(Genres, Comedy)|
# +--------+------------------------------+
# |[Comedy]|                             1|
# +--------+------------------------------+

# end::ch06-array-position[]

# tag::ch06-map-creation[]

columns = ["name", "language", "type"]

# First pass: one literal column per key, plus a `values` array built from the
# real `name`, `language` and `type` columns of `shows`. Each literal is
# aliased explicitly so the next select can refer to it by name without
# depending on how Spark auto-names un-aliased literal columns.
shows_map = shows.select(
    *[F.lit(column).alias(column) for column in columns],
    F.array(*columns).alias("values"),
)

# Second pass: here `name`, `language` and `type` are the literal columns
# created above, so this array holds the keys.
shows_map = shows_map.select(F.array(*columns).alias("keys"), "values")

shows_map.show(1)
# +--------------------+--------------------+
# |                keys|              values|
# +--------------------+--------------------+
# |[name, language, ...|[Silicon Valley, ...|
# +--------------------+--------------------+

# Pair the two arrays element-wise into a single map column.
shows_map = shows_map.select(
    F.map_from_arrays("keys", "values").alias("mapped")
)

shows_map.printSchema()

# root
#  |-- mapped: map (nullable = false)
#  |    |-- key: string
#  |    |-- value: string (valueContainsNull = true)

shows_map.show(1, False)

# +---------------------------------------------------------------+
# |mapped                                                         |
# +---------------------------------------------------------------+
# |{name -> Silicon Valley, language -> English, type -> Scripted}|
# +---------------------------------------------------------------+

# Three equivalent ways of looking up the value stored under the "name" key.
shows_map.select(
    F.col("mapped.name"),  # <1>
    F.col("mapped")["name"],  # <2>
    shows_map.mapped["name"],  # <3>
).show()

# +--------------+--------------+--------------+
# |          name|  mapped[name]|  mapped[name]|
# +--------------+--------------+--------------+
# |Silicon Valley|Silicon Valley|Silicon Valley|
# +--------------+--------------+--------------+

# end::ch06-map-creation[]

# tag::ch06-struct-subset[]
# Narrow the frame to the `schedule` column and print its schema.
shows.select("schedule").printSchema()

# root
#  |-- schedule: struct (nullable = true)  <1>
#  |    |-- days: array (nullable = true)
#  |    |    |-- element: string (containsNull = true)
#  |    |-- time: string (nullable = true)

# end::ch06-struct-subset[]

# tag::ch06-embedded[]

# Narrow the frame to the `_embedded` column and print its nested schema.
shows.select("_embedded").printSchema()
# root
#  |-- _embedded: struct (nullable = true)  <1>
#  |    |-- episodes: array (nullable = true)  <2>
#  |    |    |-- element: struct (containsNull = true)
#  |    |    |    |-- _links: struct (nullable = true)  <3>
#  |    |    |    |    |-- self: struct (nullable = true)
#  |    |    |    |    |    |-- href: string (nullable = true)
#  |    |    |    |-- airdate: string (nullable = true)
#  |    |    |    |-- airstamp: string (nullable = true)
#  |    |    |    |-- airtime: string (nullable = true)
#  |    |    |    |-- id: long (nullable = true)
#  |    |    |    |-- image: struct (nullable = true)
#  |    |    |    |    |-- medium: string (nullable = true)
#  |    |    |    |    |-- original: string (nullable = true)
#  |    |    |    |-- name: string (nullable = true)
#  |    |    |    |-- number: long (nullable = true)
#  |    |    |    |-- runtime: long (nullable = true)
#  |    |    |    |-- season: long (nullable = true)
#  |    |    |    |-- summary: string (nullable = true)
#  |    |    |    |-- url: string (nullable = true)

# end::ch06-embedded[]

# tag::ch06-promote-to-column[]
# Promote the nested `_embedded.episodes` array to a top-level `episodes`
# column, then drop `_embedded`, whose only field was `episodes`.
shows_clean = shows.withColumn(
    "episodes", F.col("_embedded.episodes")
).drop("_embedded")

shows_clean.printSchema()
# root
#  |-- _links: struct (nullable = true)
#  |    |-- previousepisode: struct (nullable = true)
#  |    |    |-- href: string (nullable = true)
#  |    |-- self: struct (nullable = true)
#  |    |    |-- href: string (nullable = true)
#  |-- externals: struct (nullable = true)
#  |    |-- imdb: string (nullable = true)
#  [...]
#  |-- episodes: array (nullable = true)  <1>
#  |    |-- element: struct (containsNull = true)
#  |    |    |-- _links: struct (nullable = true)
#  |    |    |    |-- self: struct (nullable = true)
#  |    |    |    |    |-- href: string (nullable = true)
#  |    |    |-- airdate: string (nullable = true)
#  |    |    |-- airstamp: string (nullable = true)
#  |    |    |-- airtime: string (nullable = true)
#  |    |    |-- id: long (nullable = true)
#  |    |    |-- image: struct (nullable = true)
#  |    |    |    |-- medium: string (nullable = true)
#  |    |    |    |-- original: string (nullable = true)
# [... rest of schema]
# end::ch06-promote-to-column[]

# tag::ch06-array-struct[]

# Selecting a field through an array of structs yields an array of that field
# (see the schema printed below).
episodes_name = shows_clean.select(F.col("episodes.name"))  # <1>
episodes_name.printSchema()

# root
#  |-- name: array (nullable = true)
#  |    |-- element: string (containsNull = true)

# explode() produces one row per array element; show the first three rows
# without truncation.
episodes_name.select(F.explode("name").alias("name")).show(3, False)  # <2>
# +-------------------------+
# |name                     |
# +-------------------------+
# |Minimum Viable Product   |
# |The Cap Table            |
# |Articles of Incorporation|
# +-------------------------+
# end::ch06-array-struct[]

# tag::ch06-schema-example[]
# Print the schema of `shows` again; the expected output below is abbreviated.
shows.printSchema()
# root  # <1>
#  |-- _links: struct (nullable = true)
#  |    |-- previousepisode: struct (nullable = true)
#  |    |    |-- href: string (nullable = true)
#  |    |-- self: struct (nullable = true)
#  |    |    |-- href: string (nullable = true)
#  |-- externals: struct (nullable = true)
#  |    |-- imdb: string (nullable = true)
#  [... rest of schema]
# end::ch06-schema-example[]

# tag::ch06-first-part-schema[]

import pyspark.sql.types as T

# Schema of an episode's `_links` field: a struct holding `self`, itself a
# struct with a single string field `href`.
episode_links_schema = T.StructType().add(
    "self", T.StructType().add("href", T.StringType())
)  # <1>

# Schema of an episode's `image` field: two string fields.
episode_image_schema = (
    T.StructType()
    .add("medium", T.StringType())
    .add("original", T.StringType())
)  # <2>

# Schema of a single episode record. `airdate` and `airstamp` are typed as a
# date and a timestamp respectively.
episode_schema = (
    T.StructType()
    .add("_links", episode_links_schema)  # <3>
    .add("airdate", T.DateType())
    .add("airstamp", T.TimestampType())
    .add("airtime", T.StringType())
    .add("id", T.StringType())
    .add("image", episode_image_schema)  # <3>
    .add("name", T.StringType())
    .add("number", T.LongType())
    .add("runtime", T.LongType())
    .add("season", T.LongType())
    .add("summary", T.StringType())
    .add("url", T.StringType())
)

# Top-level schema: only `_embedded`, a struct whose `episodes` field is an
# array of episode records.
embedded_schema = T.StructType().add(
    "_embedded",
    T.StructType().add("episodes", T.ArrayType(episode_schema)),  # <4>
)

# end::ch06-first-part-schema[]

# tag::ch06-reread[]

# Re-read the same file, this time imposing the hand-written schema and
# asking the reader to fail fast on records that do not fit it.
shows_with_schema = (
    spark.read.schema(embedded_schema)  # <1>
    .option("mode", "FAILFAST")  # <2>
    .json("./data/shows/shows-silicon-valley.json")
)

# end::ch06-reread[]

# tag::ch06-schema-validation[]
# Validate the typed re-read: explode the per-episode `airdate` and `airstamp`
# arrays of `shows_with_schema` and display the first rows of each. The frame
# read with the explicit schema is used (not the inferred-schema `shows`) so
# the values shown are the parsed date and timestamp, as in the expected
# output below.
for column in ["airdate", "airstamp"]:
    shows_with_schema.select(f"_embedded.episodes.{column}").select(
        F.explode(column)
    ).show(5)

# +----------+
# |       col|
# +----------+
# |2014-04-06|
# |2014-04-13|
# |2014-04-20|
# |2014-04-27|
# |2014-05-04|
# +----------+
# only showing top 5 rows

# +-------------------+
# |                col|
# +-------------------+
# |2014-04-06 22:00:00|
# |2014-04-13 22:00:00|
# |2014-04-20 22:00:00|
# |2014-04-27 22:00:00|
# |2014-05-04 22:00:00|
# +-------------------+
# only showing top 5 rows

# end::ch06-schema-validation[]

# tag::ch06-wrong-schema[]
from py4j.protocol import Py4JJavaError  # <1>

# Deliberately wrong episode schema: `summary` and `url` are declared as longs
# (every other field uses the same type as in the working episode schema).
episode_schema_BAD = (
    T.StructType()
    .add("_links", episode_links_schema)
    .add("airdate", T.DateType())
    .add("airstamp", T.TimestampType())
    .add("airtime", T.StringType())
    .add("id", T.StringType())
    .add("image", episode_image_schema)
    .add("name", T.StringType())
    .add("number", T.LongType())
    .add("runtime", T.LongType())
    .add("season", T.LongType())
    .add("summary", T.LongType())  # <2>
    .add("url", T.LongType())  # <2>
)

# Same `_embedded.episodes` wrapper as before, around the faulty schema.
embedded_schema2 = T.StructType().add(
    "_embedded",
    T.StructType().add("episodes", T.ArrayType(episode_schema_BAD)),
)

shows_with_schema_wrong = (
    spark.read.schema(embedded_schema2)
    .option("mode", "FAILFAST")
    .json("./data/shows/shows-silicon-valley.json")
)

# Displaying the frame is expected to raise (see the stack-trace excerpt
# below); the error is swallowed on purpose so the script keeps running.
try:
    shows_with_schema_wrong.show()
except Py4JJavaError:
    pass

# Huge Spark ERROR stacktrace, relevant bit:
#
# Caused by: java.lang.RuntimeException: Failed to parse a value for data type
#   bigint (current token: VALUE_STRING). <3>

# end::ch06-wrong-schema[]

# tag::ch06-json-schema[]

import pprint  # <1>

# Explode the episodes of the typed frame, keep only `episode.airtime`, and
# pretty-print the dict representation of the resulting one-column schema.
pprint.pprint(
    shows_with_schema.select(
        F.explode("_embedded.episodes").alias("episode")
    )
    .select("episode.airtime")
    .schema.jsonValue()
)
# {'fields': [{'metadata': {},
#             'name': 'airtime',
#             'nullable': True,
#             'type': 'string'}],
#  'type': 'struct'}
# end::ch06-json-schema[]

# tag::ch06-complex-json[]

# Dict representation of a field whose type is an array of strings.
pprint.pprint(
    T.StructField("array_example", T.ArrayType(T.StringType())).jsonValue()
)

# {'metadata': {},
#  'name': 'array_example',
#  'nullable': True,
#  'type': {'containsNull': True, 'elementType': 'string', 'type': 'array'}}  # <1>

# Dict representation of a field whose type is a string-to-long map.
pprint.pprint(
    T.StructField(
        "map_example", T.MapType(T.StringType(), T.LongType())
    ).jsonValue()
)

# {'metadata': {},
#  'name': 'map_example',
#  'nullable': True,
#  'type': {'keyType': 'string',
#           'type': 'map',
#           'valueContainsNull': True,
#           'valueType': 'long'}}  <2>

# Dict representation of a struct combining the two fields above.
pprint.pprint(
    T.StructType(
        [
            T.StructField(
                "map_example", T.MapType(T.StringType(), T.LongType())
            ),
            T.StructField("array_example", T.ArrayType(T.StringType())),
        ]
    ).jsonValue()
)

# {'fields': [{'metadata': {},  <3>
#              'name': 'map_example',
#              'nullable': True,
#              'type': {'keyType': 'string',
#                       'type': 'map',
#                       'valueContainsNull': True,
#                       'valueType': 'long'}},
#             {'metadata': {},
#              'name': 'array_example',
#              'nullable': True,
#              'type': {'containsNull': True,
#                       'elementType': 'string',
#                       'type': 'array'}}],
#  'type': 'struct'}

# end::ch06-complex-json[]

# tag::ch06-json-schema-comparison[]

# Round-trip the schema: serialise it to a JSON string, parse that string back
# into a dict, and rebuild a StructType from the dict.
other_shows_schema = T.StructType.fromJson(
    json.loads(shows_with_schema.schema.json())
)

# The rebuilt schema compares equal to the one it was derived from.
print(other_shows_schema == shows_with_schema.schema)  # True
# end::ch06-json-schema-comparison[]

# tag::ch06-explode[]
# Produce one row per element of `_embedded.episodes`, keeping the show `id`
# next to each episode struct.
episodes = shows.select(
    "id", F.explode("_embedded.episodes").alias("episodes")
)  # <1>
# Show five rows, truncating each cell to 70 characters.
episodes.show(5, truncate=70)

# +---+----------------------------------------------------------------------+
# | id|                                                              episodes|
# +---+----------------------------------------------------------------------+
# |143|{{{http://api.tvmaze.com/episodes/10897}}, 2014-04-06, 2014-04-07T0...|
# |143|{{{http://api.tvmaze.com/episodes/10898}}, 2014-04-13, 2014-04-14T0...|
# |143|{{{http://api.tvmaze.com/episodes/10899}}, 2014-04-20, 2014-04-21T0...|
# |143|{{{http://api.tvmaze.com/episodes/10900}}, 2014-04-27, 2014-04-28T0...|
# |143|{{{http://api.tvmaze.com/episodes/10901}}, 2014-05-04, 2014-05-05T0...|
# +---+----------------------------------------------------------------------+
# only showing top 5 rows

# Total number of exploded rows, i.e. episodes.
episodes.count()  # 53

# end::ch06-explode[]

# tag::ch06-explode-map[]
# Build a map column whose keys are the episode ids and whose values are the
# episode names.
episode_name_id = shows.select(
    F.map_from_arrays(  # <1>
        F.col("_embedded.episodes.id"), F.col("_embedded.episodes.name")
    ).alias("name_id")
)

# posexplode() on the map yields three columns per entry, named here
# `position`, `id` (the key) and `name` (the value).
episode_name_id = episode_name_id.select(
    F.posexplode("name_id").alias("position", "id", "name")  # <2>
)

episode_name_id.show(5)

# +--------+-----+--------------------+
# |position|   id|                name|
# +--------+-----+--------------------+
# |       0|10897|Minimum Viable Pr...|
# |       1|10898|       The Cap Table|
# |       2|10899|Articles of Incor...|
# |       3|10900|    Fiduciary Duties|
# |       4|10901|      Signaling Risk|
# +--------+-----+--------------------+
# only showing top 5 rows

# end::ch06-explode-map[]

# tag::ch06-collect[]
# Reverse the explode: group the per-episode rows by show `id` and gather the
# episode structs back into a single array column.
collected = episodes.groupby("id").agg(
    F.collect_list("episodes").alias("episodes")
)

# One row per show id.
collected.count()  # 1

collected.printSchema()
# |-- id: long (nullable = true)
# |-- episodes: array (nullable = true)
# |    |-- element: struct (containsNull = false)
# |    |    |-- _links: struct (nullable = true)
# |    |    |    |-- self: struct (nullable = true)
# |    |    |    |    |-- href: string (nullable = true)
# |    |    |-- airdate: string (nullable = true)
# |    |    |-- airstamp: string (nullable = true)
# |    |    |-- airtime: string (nullable = true)
# |    |    |-- id: long (nullable = true)
# |    |    |-- image: struct (nullable = true)
# |    |    |    |-- medium: string (nullable = true)
# |    |    |    |-- original: string (nullable = true)
# |    |    |-- name: string (nullable = true)
# |    |    |-- number: long (nullable = true)
# |    |    |-- runtime: long (nullable = true)
# |    |    |-- season: long (nullable = true)
# |    |    |-- summary: string (nullable = true)
# |    |    |-- url: string (nullable = true)
# end::ch06-collect[]

# tag::ch06-struct-in-struct[]
# Assemble a struct column named `info` from two existing columns and one
# literal boolean aliased `has_watched`.
struct_ex = shows.select(
    F.struct(  # <1>
        F.col("status"), F.col("weight"), F.lit(True).alias("has_watched")
    ).alias("info")
)

struct_ex.show(1, False)
# +-----------------+
# |info             |
# +-----------------+
# |{Ended, 96, true}|  <2>
# +-----------------+

struct_ex.printSchema()
# root
#  |-- info: struct (nullable = false)  <2>
#  |    |-- status: string (nullable = true)
#  |    |-- weight: long (nullable = true)
#  |    |-- has_watched: boolean (nullable = false)
# end::ch06-struct-in-struct[]
