#!/usr/bin/env python3

# pylint: disable=C0413,C0411,C0116

"""Code inside the chapter for the book PySpark in Action (chapter 8)."""

# tag::ch09-python-bq[]
from pyspark.sql import SparkSession

# Get (or create) the session, setting "spark.jars.packages" to the Maven
# coordinates of the BigQuery connector.
spark = (
    SparkSession.builder
    .config(
        "spark.jars.packages",
        "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.19.1",  # <1>
    )
    .getOrCreate()
)

# [...]
# com.google.cloud.spark#spark-bigquery-with-dependencies_2.12 added as a dependency
# :: resolving dependencies :: org.apache.spark#spark-submit-parent-77d4bbf3-1fa4-4d43-b5f7-59944801d46c;1.0
# 	confs: [default]
# 	found com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;0.19.1 in central
# downloading https://repo1.maven.org/maven2/com/google/cloud/spark/spark-bigquery-with-dependencies_2.12/0.19.1/spark-bigquery-with-dependencies_2.12-0.19.1.jar ...
# 	[SUCCESSFUL ] com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;0.19.1!spark-bigquery-with-dependencies_2.12.jar (888ms)
# :: resolution report :: resolve 633ms :: artifacts dl 889ms
# 	:: modules in use:
# 	com.google.cloud.spark#spark-bigquery-with-dependencies_2.12;0.19.1 from central in [default]
# 	---------------------------------------------------------------------
# 	|                  |            modules            ||   artifacts   |
# 	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
# 	---------------------------------------------------------------------
# 	|      default     |   1   |   1   |   1   |   0   ||   1   |   1   |
# 	---------------------------------------------------------------------
# :: retrieving :: org.apache.spark#spark-submit-parent-77d4bbf3-1fa4-4d43-b5f7-59944801d46c
# 	confs: [default]
# 	1 artifacts copied, 0 already retrieved (33158kB/23ms)


# end::ch09-python-bq[]

# tag::ch09-read-bq[]

from functools import reduce
import pyspark.sql.functions as F


def read_df_from_bq(year):  # <1>
    """Load the ``gsod{year}`` table of ``bigquery-public-data.noaa_gsod``.

    Reads through the module-level ``spark`` session using the
    ``bigquery`` data source format.

    Args:
        year: Value interpolated as the suffix of the table name
            (e.g. ``2010`` gives ``...noaa_gsod.gsod2010``).

    Returns:
        Whatever ``load()`` returns for the configured reader.
    """
    table_name = f"bigquery-public-data.noaa_gsod.gsod{year}"
    reader = spark.read.format("bigquery")  # <2>
    reader = reader.option("table", table_name)  # <3>
    # .option("credentialsFile", "bq-key.json")  # <4>
    return reader.load()


# Step 1: read one DataFrame per year (2010 through 2020) and fold them into
# a single DataFrame; columns absent from one side are allowed by
# allowMissingColumns=True.
gsod = reduce(
    lambda acc, df: acc.unionByName(df, allowMissingColumns=True),
    [read_df_from_bq(year) for year in range(2010, 2021)],  # <5>
)

# Step 2: drop rows with a null in year/mo/da/temp, drop rows whose temp is
# 9999.9 (presumably the data set's missing-value marker -- confirm), and
# remove the "date" column.
gsod = (
    gsod.dropna(subset=["year", "mo", "da", "temp"])
    .where(F.col("temp") != 9999.9)
    .drop("date")
)

# end::ch09-read-bq[]

# tag::ch09-read-bq-alternate[]
# Loop-based alternative to the reduce() build: start from the 2010 table
# and union each following year onto the running DataFrame.
gsod_alt = read_df_from_bq(2010)  # <1>
# range() excludes its stop value, so the stop must be 2021 for the loop to
# include 2020 and cover the same 2010-2020 span as the reduce()-based read.
for year in range(2011, 2021):
    gsod_alt = gsod_alt.unionByName(
        read_df_from_bq(year), allowMissingColumns=True
    )
gsod_alt = gsod_alt.drop("date")

# end::ch09-read-bq-alternate[]
