#!/usr/bin/env bash
set -euo pipefail

# Create the Dataproc cluster "jrx-pia" in region us-east4 / zone us-east4-b on
# the "default" subnet. Master and workers (2 of them) all use n1-standard-4
# machines with a boot disk size of 500. The flags are collected in an array so
# each one sits on its own line without backslash continuations.
cluster_flags=(
    --enable-component-gateway
    --region us-east4
    --subnet default
    --zone us-east4-b
    # Master node sizing.
    --master-machine-type n1-standard-4
    --master-boot-disk-size 500
    # Worker pool sizing.
    --num-workers 2
    --worker-machine-type n1-standard-4
    --worker-boot-disk-size 500
    # Image and extra software.
    --image-version preview-debian10
    --optional-components ANACONDA
    # 7200 s = 2 h; see the gcloud reference for the exact --max-age semantics.
    --max-age 7200s
)
gcloud dataproc clusters create jrx-pia "${cluster_flags[@]}"

# Launch pyspark with PYSPARK_DRIVER_PYTHON=ipython and the BigQuery connector
# jar passed via --jars. The driver and the executors are given the same
# extraJavaOptions value, so it is defined once and reused.
# NOTE(review): the Python statements that follow this command are not valid
# Bash; presumably they are meant to be entered in the shell started here.
# Confirm before running this file end-to-end as a script.
netty_reflection_opt="-Dio.netty.tryReflectionSetAccessible=true"
pyspark_args=(
    --conf "spark.driver.extraJavaOptions=${netty_reflection_opt}"
    --conf "spark.executor.extraJavaOptions=${netty_reflection_opt}"
    --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar
)
PYSPARK_DRIVER_PYTHON=ipython pyspark "${pyspark_args[@]}"


from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def read_df_from_bq(year):
    """Load the ``gsod{year}`` table of the NOAA GSOD public dataset.

    The table is read through the ``"bigquery"`` data source of the global
    ``spark`` object. ``spark`` is not defined in this file; presumably it is
    the session object supplied by the pyspark shell (confirm).

    Args:
        year: Value interpolated into the table name
            ``bigquery-public-data.noaa_gsod.gsod{year}``.

    Returns:
        The result of ``.load()`` on the configured reader.
    """
    bq_reader = spark.read.format("bigquery")
    table_name = f"bigquery-public-data.noaa_gsod.gsod{year}"
    return bq_reader.option("table", table_name).load()


# Build `gsod`: read one table per year in range(1950, 2020) (1950 through 2019),
# fold them together pairwise with DataFrame.union, drop rows with a null in any
# of the year/mo/da/temp columns, then keep only rows whose temp differs from
# 9999.9.
gsod = (
    reduce(
        lambda stacked, yearly: stacked.union(yearly),
        list(map(read_df_from_bq, range(1950, 2020))),
    )
    .dropna(subset=["year", "mo", "da", "temp"])
    .filter(F.col("temp") != 9999.9)  # filter() and where() are aliases
)
