#!/usr/bin/env python3

# pylint: disable=C0413,C0411,C0116

# tag::code-window-read-bq[]

from functools import reduce
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

# Get (or create) the session, asking Spark to pull in the BigQuery connector
# package (Scala 2.12 build, version 0.19.1) through "spark.jars.packages".
spark = (
    SparkSession.builder
    .config(
        key="spark.jars.packages",
        value="com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.19.1",
    )
    .getOrCreate()
)


def read_df_from_bq(year, *, credentials_file="bq-key.json"):
    """Load one yearly ``gsod`` table from the BigQuery public dataset.

    Args:
        year: Suffix of the table to read (e.g. ``2017``). It is interpolated
            as-is into ``bigquery-public-data.noaa_gsod.gsod<year>``.
        credentials_file: Path handed to the connector's ``credentialsFile``
            option. Defaults to ``"bq-key.json"``, the value that used to be
            hard-coded, so existing callers behave exactly as before.

    Returns:
        The DataFrame returned by the ``bigquery`` reader for that table.
    """
    return (
        spark.read.format("bigquery")
        .option("table", f"bigquery-public-data.noaa_gsod.gsod{year}")
        .option("credentialsFile", credentials_file)
        .load()
    )


# Stack the 2017, 2018 and 2019 tables into a single frame, matching columns
# by name; allowMissingColumns=True lets the union proceed when the yearly
# schemas do not line up exactly.
gsod = reduce(
    lambda stacked, nxt: stacked.unionByName(nxt, allowMissingColumns=True),
    [read_df_from_bq(yr) for yr in range(2017, 2020)],
)

# Keep only records with a complete date and a temperature reading.
# NOTE(review): 9999.9 is presumably the dataset's "missing value" sentinel
# for temp -- confirm against the GSOD documentation.
gsod = gsod.na.drop(subset=["year", "mo", "da", "temp"]).filter(
    F.col("temp") != 9999.9
)

# Persist the cleaned data locally, then rebind `gsod` to the Parquet copy so
# later steps read from disk.
gsod.write.parquet("./data/Window/gsod.parquet")  # <1>
gsod = spark.read.parquet("./data/Window/gsod.parquet")  # <1>

# end::code-window-read-bq[]

# tag::code-window-gsod-light[]

# Narrow the frame to its first seven columns, minus "wban".
gsod2 = gsod.select(*gsod.columns[:7]).drop("wban")

# Build a tiny sample: up to 3 rows for 2017, 3 for 2018 and 4 for 2019,
# unioned in that order (at most 10 rows overall).
gsod_light = reduce(
    lambda top, bottom: top.union(bottom),
    [
        gsod2.filter("year = 2017").limit(3),
        gsod2.filter("year = 2018").limit(3),
        gsod2.filter("year = 2019").limit(4),
    ],
)

# Collapse to a single partition before writing the sample out as Parquet.
light_writer = gsod_light.coalesce(1).write
light_writer.parquet("./data/Window/gsod_light.parquet")

# end::code-window-gsod-light[]
