import numpy as np
import pandas as pd


def makeGenerator( rng, mean, stddev, fractionNull ):
    """Build a zero-argument sampler that sometimes yields a missing value.

    Parameters:
        rng: object providing random() and normal( mean, stddev )
            methods, such as the result of np.random.default_rng().
        mean: first argument handed to rng.normal().
        stddev: second argument handed to rng.normal().
        fractionNull: threshold compared against rng.random(); a draw
            that is not strictly greater than it produces np.nan.

    Returns:
        A function taking no parameters.  Each call returns either
        rng.normal( mean, stddev ) or np.nan.
    """
    def nextValue():
        # The uniform draw comes first; the normal variate is only
        # drawn when the value is kept.
        if rng.random() > fractionNull:
            return rng.normal( mean, stddev )
        return np.nan

    return nextValue
# end of makeGenerator()

def main():
    """Build and print a small demonstration DataFrame.

    The frame holds four numerical columns ('A' through 'D') filled
    with randomly generated values, some of which are missing (NaN),
    plus one categorical column ('Size').  The frame is printed once
    before and once after the categorical column is added.
    """
    # Random number generator; no seed is given, so every run
    # produces different values.
    source = np.random.default_rng()

    # Sampler for the numerical variables: mean = 4.0,
    # standard deviation = 2.0, and about 0.25 of the
    # values come back as missing (NaN).
    draw = makeGenerator( source, 4.0, 2.0, 0.25 )

    # Generate the values as 4 lists of 8 entries each, i.e. one
    # list per future DataFrame column.
    numColumns = 4
    numRows = 8
    drawn = []
    for _ in range( numColumns ):
        drawn.append( [draw() for _ in range( numRows )] )

    # Transpose the 4 x 8 array so that each generated list becomes
    # a column, giving a DataFrame with 4 columns and 8 rows.
    # Because every element is generated the same way, the table
    # could have been built as 8 rows of 4 values directly, which
    # would make the transpose unnecessary.  Generating one list per
    # column is still a useful pattern for projects where different
    # columns are filled in different ways.
    table = np.array( drawn ).T
    data = pd.DataFrame( table, columns = ['A', 'B', 'C', 'D'] )

    # Show only 2 digits to the right of the decimal point.
    pd.set_option( 'display.precision', 2 )

    # Numerical data only.
    print( data )

    # Attach the categorical variable, one label per row.
    sizes = ['high', 'low', 'low', 'medium',
             'high', 'medium', 'low', 'high']
    data['Size'] = sizes

    print( '\n' )

    # Numerical data together with the categorical column.
    print( data )

    # TO-DO: Add code here that...
    #    1) Takes care of the missing values.
    #    2) Scales the numerical values.
    #    3) Applies 'one-hot encoding' to the categorical variable.
# end of main()

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()