#!/usr/bin/env python3

# Base directory under which every generated dataset below is written.
DIRECTORY = "./data/fake_data/"

import os
import pandas as pd

# Seed data: the integers 0..9999 as a two-column pandas frame. Keeping the
# old index as a column (named "index") leaves the values in a column named 0.
test = pd.Series(range(10_000)).reset_index(drop=False)

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# The pandas frame has two columns; name them "index" and "left", then drop
# the helper "index" column so only the values remain.
df = spark.createDataFrame(test, schema=["index", "left"]).drop("index")

# Add mult_1 .. mult_24 (each "left" times the multiplier) in one projection.
# Chaining withColumn() in a loop stacks one projection per call, which the
# Spark docs warn can create large plans; a single select() avoids that while
# producing the same columns in the same order.
df = df.select(
    "left",
    *[(F.col("left") * x).alias(f"mult_{x}") for x in range(1, 25)]
)

# Experiment 1: CSV vs. Parquet

# Persist the same DataFrame three ways (plain CSV, gzip-compressed CSV and
# Parquet) so the formats can be compared.
#
# mode="overwrite" replaces output left behind by a previous run. With
# Spark's default save mode the write raises when the target directory
# already exists, so the script could only ever be run once.
df.write.csv(
    os.path.join(DIRECTORY, "1_csv"), mode="overwrite", header=True
)
df.write.csv(
    os.path.join(DIRECTORY, "1_csv_gz"),
    mode="overwrite",
    header=True,
    compression="gzip",
)
df.write.parquet(os.path.join(DIRECTORY, "2_parquet"), mode="overwrite")


def read_filter_and_count(directory, fmt):
    """Count the rows under *directory* whose ``left`` value exceeds 1000.

    The data is loaded through the module-level ``spark`` session using the
    reader for *fmt* with no extra reader options, so the loaded data must
    already expose a column named ``left``.

    Args:
        directory: Path handed to the Spark reader's ``load``.
        fmt: Data source format name passed to ``spark.read.format``.

    Returns:
        The number of rows that satisfy ``left > 1000``.
    """
    reader = spark.read.format(fmt)
    loaded = reader.load(directory)
    matching = loaded.where("left > 1000")
    return matching.count()

def read_filter_and_count_csv(directory, fmt="csv"):
    """Count the rows under *directory* whose ``left`` value exceeds 1000.

    Unlike a plain load, the reader is configured with ``header`` and
    ``inferSchema`` both set to ``"True"`` before loading through the
    module-level ``spark`` session.

    Args:
        directory: Path handed to the Spark reader's ``load``.
        fmt: Data source format name; defaults to ``"csv"``.

    Returns:
        The number of rows that satisfy ``left > 1000``.
    """
    reader = spark.read.format(fmt)
    reader = reader.option("header", "True")
    reader = reader.option("inferSchema", "True")
    matching = reader.load(directory).where("left > 1000")
    return matching.count()
