#!/usr/bin/env bash
#
# Exports the NOAA GSOD tables gsod2010 .. gsod2020 from the BigQuery public
# dataset `bigquery-public-data:noaa_gsod` as Snappy-compressed Parquet files,
# staging them in a temporary Google Cloud Storage bucket, then copies them to
# ./data/gsod_noaa and removes the staged data from Cloud Storage.
#
# Requires the `gsutil` and `bq` command-line tools on PATH.
#
# NOTE: this script needs Bash. `set -o pipefail` and `[[ ... ]]` are not
# available in every POSIX `sh` (e.g. dash), so the interpreter is pinned to
# bash rather than `sh`.

# -e: exit on error, -u: error on unset variables, -x: trace commands,
# -o pipefail: a pipeline fails if any of its stages fails.
set -euox pipefail

# The shell's PID is appended so that separate runs use distinct bucket names.
readonly BUCKET_NAME="gsod_noaa_pyspark_$$"
readonly DATA_DIR="./data/gsod_noaa"

echo "Creating bucket ${BUCKET_NAME}"
gsutil mb "gs://${BUCKET_NAME}"

for year in {2010..2020}; do
    echo "Extracting gsod${year} from Google BigQuery in Parquet format to Google Cloud Storage."
    bq extract \
       --destination_format PARQUET \
       --compression SNAPPY \
       "bigquery-public-data:noaa_gsod.gsod${year}" \
       "gs://${BUCKET_NAME}/gsod${year}.parquet"
done
# Stop tracing: the remaining steps announce themselves via echo.
set +x

echo "Downloading the data to disk and deleting it from Google Cloud Storage."
# Start from an empty target directory so files from a previous run do not
# linger next to the fresh download.
if [[ -d "${DATA_DIR}" ]]; then
    rm -r -- "${DATA_DIR}"
fi
mkdir -p "${DATA_DIR}"
gsutil -m cp -r "gs://${BUCKET_NAME}/*" "${DATA_DIR}"
gsutil -m rm -r "gs://${BUCKET_NAME}/"

echo "Finished!"

