"""Build a small example DataFrame for practicing data preparation.

The table has four numerical columns that contain missing values and one
categorical column.
"""

import numpy as np
import pandas as pd


def makeGenerator(rng, mean, stddev, fractionNull):
    """Define and return a function that produces one sample per call.

    Args:
        rng: Random number generator providing ``random()`` and
            ``normal(mean, stddev)`` (main() passes the result of
            ``np.random.default_rng()``).
        mean: Mean passed to ``rng.normal``.
        stddev: Standard deviation passed to ``rng.normal``.
        fractionNull: Threshold compared against ``rng.random()``; roughly
            this fraction of the produced values will be missing.

    Returns:
        A function with no parameters. Each call returns either a draw
        from ``rng.normal(mean, stddev)`` or ``np.nan``.
    """
    def generate():
        # Decide first whether the value is missing; the normal
        # distribution is only sampled when the value is kept.
        if rng.random() > fractionNull:
            return rng.normal(mean, stddev)
        return np.nan

    return generate
# end of makeGenerator()


def main():
    """Create and print the example dataset.

    Creates a DataFrame that contains numerical variables, a categorical
    variable, and missing values in the numerical columns. The DataFrame
    is printed before and after the categorical column is added.
    """
    # Create a random number generator.
    rng = np.random.default_rng()

    # Create a function that will produce values for the numerical
    # variables in our dataset. Here, we are specifying mean = 4.0,
    # standard deviation = 2.0, and the fraction of values that are
    # missing will be about 0.25.
    g = makeGenerator(rng, 4.0, 2.0, 0.25)

    # Create a list of lists.
    # This creates a table with 8 columns and 4 rows.
    samples = [[g() for _ in range(8)] for _ in range(4)]

    # Create a DataFrame with 4 columns and 8 rows.
    # Use numpy's transpose function to translate the
    # 8 column/4 row data structure into a 4 column/8 row
    # data structure.
    # Of course, in this case we could have just created
    # a 4 column/8 row data structure in the first step
    # and saved ourselves the need to transpose.
    # We could have done this because we are generating
    # values for all elements of the table in the same way.
    # In another project, we might want to fill different
    # columns in different ways.
    data = pd.DataFrame(
        np.transpose(np.array(samples)),
        columns=['A', 'B', 'C', 'D'],
    )

    # Print only 2 digits to the right of the decimal point.
    pd.set_option('display.precision', 2)

    # Take a look at the numerical data.
    print(data)

    # Now add a categorical variable.
    # The list needs one label per row of the DataFrame (8 rows).
    data['Size'] = [
        'high', 'low', 'low', 'medium',
        'high', 'medium', 'low', 'high',
    ]

    print('\n')
    print(data)

    # TO-DO: Add code here that...
    # 1) Takes care of the missing values.
    # 2) Scales the numerical values.
    # 3) Applies 'one-hot encoding' to the categorical variable.
# end of main()


if __name__ == '__main__':
    main()