#!/usr/bin/env python3

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# tag::D-long-loop[]
# Without a list comprehension

# NOTE(review): `df` is not defined above this point in this file; it is
# presumably supplied by the surrounding example -- confirm.
# Collect every column name containing "ID" with an explicit loop. The loop
# is kept on purpose: it is the "before" half of the comparison below.
to_delete = []
for col in df.columns:
    if "ID" in col:
        to_delete.append(col)

# The star unpacks the list so that each name is passed as its own argument.
df = df.drop(*to_delete)

# With a list comprehension

df = df.drop(*[col for col in df.columns if "ID" in col])
# end::D-long-loop[]

# tag::appc-list-comp[]
# Sum each ordered pair (x, y) drawn from 0, 1, 2, skipping pairs where x == y.
print(
    [x + y for x in range(3) for y in range(3) if x != y]  # <1>
)

# => [1, 2, 1, 3, 2, 3]
# end::appc-list-comp[]


# tag::appc-create-df[]
# Three rows that differ only in their first value; the second argument
# supplies the column names.
sample = spark.createDataFrame(
    data=[
        [1, 2, 3, 4],
        [2, 2, 3, 4],
        [3, 2, 3, 4],
    ],
    schema=["feat1", "pred1", "pred2", "feat2"],
)
sample.show()
# +-----+-----+-----+-----+
# |feat1|pred1|pred2|feat2|
# +-----+-----+-----+-----+
# |    1|    2|    3|    4|
# |    2|    2|    3|    4|
# |    3|    2|    3|    4|
# +-----+-----+-----+-----+
# end::appc-create-df[]

# tag::appc-filtering-columns[]
# Keep only the column names that begin with "pred".
to_delete = [name for name in sample.columns if name.startswith("pred")]
print(to_delete)  # => ['pred1', 'pred2']
# end::appc-filtering-columns[]

# tag::appc-star-drop[]
# The star unpacks `to_delete`, so each column name reaches `drop` as its own
# positional argument; the schema of the result is then printed.
sample.drop(*to_delete).printSchema()
# root
#  |-- feat1: long (nullable = true)
#  |-- feat2: long (nullable = true)  <1>
# end::appc-star-drop[]


# tag::appc-drop-impl[]
def my_drop(df, *cols):  # <1>
    """Return `df` restricted to the columns whose names are not in `cols`.

    `cols` gathers any number of column names passed positionally. Names
    that are not columns of `df` have no effect, and the remaining columns
    keep the order they have in `df.columns`.
    """
    kept = [name for name in df.columns if name not in cols]
    return df.select(*kept)


# end::appc-drop-impl[]

# tag::appc-pandas-udf[]
import pandas as pd
import pyspark.sql.types as T
import pyspark.sql.functions as F


@F.pandas_udf(T.DoubleType())
def f_to_c(degrees: pd.Series) -> pd.Series:  # <1>
    """Convert temperatures from Fahrenheit to Celsius.

    Takes a pandas Series of degrees Fahrenheit and returns a Series with
    the corresponding degrees Celsius.
    """
    offset_from_freezing = degrees - 32
    return offset_from_freezing * 5 / 9


# end::appc-pandas-udf[]

# tag::appc-function-transformation[]
def modulo_of(df, old_column, new_column, modulo_value):
    """Return `df` with a `new_column` computed from `old_column`.

    The new column holds the value of `old_column` modulo `modulo_value`.
    """
    remainder = F.col(old_column) % modulo_value
    return df.withColumn(new_column, remainder)


# end::appc-function-transformation[]


# tag::appc-modulo-transform[]
from typing import Callable
from pyspark.sql import DataFrame


def modulo_of(
    new_name: str, old_col: str, modulo_value: int
) -> Callable[[DataFrame], DataFrame]:  # <1>
    """Build a function that adds `old_col` modulo `modulo_value` to a frame.

    The returned function takes a data frame and returns it with the result
    stored in a column named `new_name`, so it can be handed to
    `DataFrame.transform`.
    """

    def _inner_func(df: DataFrame) -> DataFrame:  # <2>
        # new_name, old_col and modulo_value come from the enclosing call.
        remainder = F.col(old_col) % modulo_value
        return df.withColumn(new_name, remainder)

    return _inner_func  # <3>


# end::appc-modulo-transform[]

# tag::appc-modulo-application[]
# Four rows; the third value is None in the last two of them.
df = spark.createDataFrame(
    data=[
        [1, 2, 4, 1],
        [3, 6, 5, 0],
        [9, 4, None, 1],
        [11, 17, None, 1],
    ],
    schema=["one", "two", "three", "four"],
)

# Each `modulo_of(...)` call returns a function that `transform` applies to
# the data frame, which lets the two steps be chained.
(
    df.transform(
        modulo_of(new_name="three_mod2", old_col="three", modulo_value=2)
    )
    .transform(
        modulo_of(new_name="one_mod10", old_col="one", modulo_value=10)
    )
    .show()
)

# +---+---+-----+----+----------+---------+
# |one|two|three|four|three_mod2|one_mod10|
# +---+---+-----+----+----------+---------+
# |  1|  2|    4|   1|         0|        1|
# |  3|  6|    5|   0|         1|        3|
# |  9|  4| null|   1|      null|        9|
# | 11| 17| null|   1|      null|        1|
# +---+---+-----+----+----------+---------+

# end::appc-modulo-application[]

# tag::appc-udf-decorator[]
@F.pandas_udf(T.DoubleType())  # <1>
def f_to_c(degrees: pd.Series) -> pd.Series:
    """Convert temperatures from Fahrenheit to Celsius.

    Takes a pandas Series of degrees Fahrenheit and returns a Series with
    the corresponding degrees Celsius.
    """
    offset_from_freezing = degrees - 32
    return offset_from_freezing * 5 / 9
# end::appc-udf-decorator[]


# tag::appc-sample-decorator[]
def record_counter(f):
    """Decorate a data frame function to print record counts around it.

    `f` must take a data frame as its first argument and return an object
    with a `count()` method (normally another data frame). The wrapper
    prints the record count of the input, calls `f`, prints the record
    count of the result and returns that result unchanged. Any additional
    positional or keyword arguments are forwarded to `f`.
    """
    from functools import wraps

    @wraps(f)  # keep f's name and docstring on the decorated function
    def _wrapper(value, *args, **kwargs):  # <1>
        print("Before: {} records".format(value.count()))  # <2>
        applied_f = f(value, *args, **kwargs)  # <3>
        print("After: {} records".format(applied_f.count()))
        return applied_f  # <4>

    return _wrapper  # <5>
# end::appc-sample-decorator[]

# tag::appc-sample-decorator-application[]
@record_counter  # <1>
def modulo_data_frame(df):
    """Add the `three_mod2` and `one_mod10` columns to `df`, show and return it.

    The transformed data frame is displayed as a side effect and then
    returned, so that `record_counter` can count its records afterwards.
    """
    result = (
        df.transform(modulo_of("three_mod2", "three", 2))
        .transform(modulo_of("one_mod10", "one", 10))
    )
    # `show()` returns None; returning its result would make the decorator's
    # `applied_f.count()` fail, so show first and return the frame itself.
    result.show()
    return result
# end::appc-sample-decorator-application[]


# tag::appc-sample-decorator-usage[]
# The name `modulo_data_frame` is bound to the wrapper returned by
# `record_counter`, so this call runs the wrapper around the function body.
modulo_data_frame(df)
# Before: 4 records  <1>
# +---+---+-----+----+----------+---------+
# |one|two|three|four|three_mod2|one_mod10|
# +---+---+-----+----+----------+---------+
# |  1|  2|    4|   1|         0|        1|
# |  3|  6|    5|   0|         1|        3|
# |  9|  4| null|   1|      null|        9|
# | 11| 17| null|   1|      null|        1|
# +---+---+-----+----+----------+---------+

# After: 4 records  <1>
# end::appc-sample-decorator-usage[]

# tag::appc-decorator-as-a-function[]
def modulo_data_frame2(df):
    """Add the `three_mod2` and `one_mod10` columns to `df`, show and return it.

    The transformed data frame is displayed as a side effect and then
    returned, so that a wrapper such as `record_counter` can count its
    records afterwards.
    """
    result = (
        df.transform(modulo_of("three_mod2", "three", 2))
        .transform(modulo_of("one_mod10", "one", 10))
    )
    # `show()` returns None; returning its result would make the wrapper's
    # `applied_f.count()` fail, so show first and return the frame itself.
    result.show()
    return result


# Applying the decorator by hand: same effect as putting `@record_counter`
# above the definition, but the result is bound to a separate name.
modulo_data_frame_d2 = record_counter(modulo_data_frame2)
# end::appc-decorator-as-a-function[]

# tag::appc-decorator-func[]
# Call the function held in the UDF's `func` attribute directly on a pandas
# Series and print what it returns.
print(f_to_c.func(pd.Series([1,2,3])))
# 0   -17.222222
# 1   -16.666667
# 2   -16.111111
# dtype: float64
# end::appc-decorator-func[]
