
# Predict housing prices.

# Here are the libraries that we will need.
import numpy as np
import pandas as pd

import os
import tarfile
import urllib.request

from datetime import datetime, timedelta, timezone
import io
import logging

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Where the dataset lives on the Internet, and where a local copy
# of it is kept on this computer.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = "{}datasets/housing/housing.tgz".format(DOWNLOAD_ROOT)
HOUSING_PATH = os.path.join("datasets", "housing")

# Logger used throughout this program; INFO-level messages are sent
# to a stream handler.
console_handler = logging.StreamHandler()
my_logger = logging.getLogger( 'csc316_logger' )
my_logger.addHandler( console_handler )
my_logger.setLevel( logging.INFO )

# Central Daylight Time is 5 hours behind UTC (Coordinated Universal Time)
my_time_zone = timezone( timedelta(hours = -5) )

def timestamp():
    """Print the course banner with the current date and time in my_time_zone."""
    banner = '\nCSC316 Machine Learning %d %B %Y at %H:%M:%S'
    print( datetime.now( tz = my_time_zone ).strftime( banner ) )


  ## (7) Uses the Pandas get_dummies() function for one-hot encoding of
  ## the categorical variable.

  ## (8) Concatenates the DataFrames to create a single DataFrame that
  ## contains all columns (except for median_house_value).

  ## (9) Creates an instance of LinearRegression.

  ## (10) Calls LinearRegression's fit() method.

  ## (11) Calls LinearRegression's predict() method.

  ## (12) Uses the mean_squared_error() function to
  ## produce a measure of how well the model predicts housing
  ## prices.

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into housing_path.

    The archive is saved as ``housing.tgz`` inside housing_path and then
    extracted into that same directory.

    Parameters:
        housing_url: URL of the gzipped tar archive to download.
        housing_path: Directory that receives the archive and its
            contents; it is created if it does not exist.

    Raises:
        Whatever os.makedirs, urllib.request.urlretrieve or tarfile
        raise on failure; nothing is caught here.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive. Only point housing_url at a source you trust.
        housing_tgz.extractall(path=housing_path)
# end of fetch_housing_data()

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from housing_path and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
# end of load_housing_data()

def get_data():
    """Return the housing dataset as loaded by load_housing_data()."""
    # TO-DO: We can put this next function call
    # in a comment after we have run the program
    # once. Explain.
    #fetch_housing_data()

    # TO-DO: We have seen two file formats this week.
    # What is JSON? Where are we using it?
    # What is a tar file? Where are we using it?
    return load_housing_data()
# end of get_data()

def _frame_summary( frame ):
    """Return the report that ``frame.info()`` prints, as a string.

    DataFrame.info() writes its report to a buffer (stdout by default)
    and returns None, so its return value cannot be handed to a logger.
    """
    buffer = io.StringIO()
    frame.info( buf = buffer )
    return buffer.getvalue()
# end of _frame_summary()

def prepare_data( dataset ):
    """Turn a raw housing DataFrame into model inputs and labels.

    The function separates the median_house_value labels, fills missing
    total_bedrooms values with the column median, derives three ratio
    features, standardizes every numerical column, and one-hot encodes
    ocean_proximity. Intermediate results are reported through my_logger.

    The imputer and the scaler are fitted on `dataset` itself each time
    this function is called.

    Parameters:
        dataset: DataFrame with the columns median_house_value,
            ocean_proximity, total_bedrooms, total_rooms, households
            and population. Every column other than median_house_value
            and ocean_proximity is treated as numerical.

    Returns:
        A tuple (blocks_prepared, y). blocks_prepared is a DataFrame
        holding the scaled numerical columns followed by the one-hot
        columns. y is a 1-D NumPy array of median_house_value.
    """
    ## (3) From a single DataFrame that contains all columns except
    ## for median_house_value, create...

    # reset_index() returns a new frame; this avoids in-place changes
    # on objects that were selected out of another DataFrame.
    blocks = dataset.drop( 'median_house_value', axis = 1 )
    blocks = blocks.reset_index( drop = True )

    labels = dataset[['median_house_value']].reset_index( drop = True )

    my_logger.info( '\n All columns except median_house_value. \n' )

    my_logger.info( _frame_summary( blocks ) )

    my_logger.info( '\n Only the median_house_value column. \n' )

    my_logger.info( _frame_summary( labels ) )

      ## (a) A DataFrame that contains only the single categorical
      ## variable in the dataset.

    ocean = blocks[[ 'ocean_proximity' ]]

    my_logger.info( '\n Only the ocean_proximity column. \n' )

    my_logger.info( _frame_summary( ocean ) )

      ## (b) A DataFrame that contains only the single column
      ## that contains missing values. This is the total_bedrooms
      ## column.

    bedrooms = blocks[[ 'total_bedrooms' ]]

    my_logger.info( '\n # of nans in total_bedrooms \n' )
    my_logger.info( bedrooms['total_bedrooms'].isna().sum() )

    my_logger.info( '\n Just before imputing. \n' )
    my_logger.info( _frame_summary( bedrooms ) )

    imputer = SimpleImputer( strategy = "median" )
    data = imputer.fit_transform( bedrooms )

    my_logger.info( '\n Impute \n' )
    my_logger.info( type(data) )
    my_logger.info( data.shape )

    # The imputer returns a bare array; rebuild a DataFrame that shares
    # the row index of blocks so that the concatenations below line up.
    bedrooms = pd.DataFrame( data,
        columns = ['total_bedrooms'], index = blocks.index )

    my_logger.info( '\n Just before putting bedrooms back. \n' )
    my_logger.info( '\n # of nulls \n' )
    my_logger.info( bedrooms.isna().sum() )
    my_logger.info( '\n' )
    my_logger.info( _frame_summary( bedrooms ) )

    # Swap the original total_bedrooms column for the imputed one.
    blocks = pd.concat(
        [blocks.drop( columns = ['total_bedrooms'] ), bedrooms], axis = 1 )

    my_logger.info( '\n Only the total_bedrooms columns. \n' )

    my_logger.info( _frame_summary( blocks ) )

      ## (c) A DataFrame that contains new variables:
      ##       * rooms_per_household,
      ##       * population_per_household,
      ##       * bedrooms_per_room.

    extracted = pd.DataFrame( {
        'rooms_per_household':
            blocks['total_rooms']/blocks['households'],
        'bedrooms_per_room':
            blocks['total_bedrooms']/blocks['total_rooms'],
        'population_per_household':
            blocks['population']/blocks['households'],
    } )

    my_logger.info( '\n New (extracted) variables. \n' )

    my_logger.info( _frame_summary( extracted ) )

    ## (4) Replaces all missing values in total_bedrooms with
    ## the median value in that column.

    ## I did this in part 3.b above.

    ## (5) Concatenates the DataFrames to create a new DataFrame that
    ## contains all numerical variables (except for median_house_value).

    numerical = pd.concat(
        [blocks.drop( columns = ['ocean_proximity', 'total_bedrooms'] ),
         bedrooms, extracted],
        axis = 1 )

    my_logger.info( '\n Numerical variables before scaling. \n' )
    my_logger.info( _frame_summary( numerical ) )

    ## (6) Scales all of the numerical data in a way that makes the mean value
    ## in each column is 0.0 and the standard deviation is 1.0.

    scaler = StandardScaler()
    scaled = pd.DataFrame( scaler.fit_transform( numerical ),
        columns = numerical.columns, index = numerical.index )

    my_logger.info( '\n Scaled numerical data. \n' )
    # Describe the scaled frame, not the unscaled input to the scaler.
    my_logger.info( scaled.describe() )

    my_logger.info( '\n Look at values of the categorical variable. \n' )

    my_logger.info( ocean['ocean_proximity'].value_counts() )

    # Shorter names for the columns that get_dummies() generates.
    ocean_nearness = {
      'ocean_proximity_<1H OCEAN':'HOUR',
      'ocean_proximity_INLAND': 'INLAND',
      'ocean_proximity_NEAR OCEAN': 'OCEAN',
      'ocean_proximity_NEAR BAY': 'BAY',
      'ocean_proximity_ISLAND': 'ISLAND'}

    ocean_onehot = pd.get_dummies( ocean ).rename( columns = ocean_nearness )

    my_logger.info( '\n One-hot encoding of categorical variable. \n' )
    my_logger.info( _frame_summary( ocean_onehot ) )

    my_logger.info( '\n Prepared data. \n' )

    # Use the scaled numerical columns here; combining the unscaled
    # frame would throw away the work done in step (6).
    blocks_prepared = pd.concat( [scaled, ocean_onehot], axis = 1 )

    my_logger.info( _frame_summary( blocks_prepared ) )

    y = labels.to_numpy().ravel()
    my_logger.info( y.shape )

    return (blocks_prepared, y)
# end of prepare_data()

def main():
    """Load the housing data, fit a linear regression, and log its RMSE.

    The dataset is split 80/20 into training and test sets. The model
    is fitted on the prepared training set and the RMSE that is logged
    is computed on that same training set; the test set is not used
    for evaluation yet.
    """

    ## (1) Builds a DataFrame that contains the California
    ## Housing Prices dataset.

    my_logger.info( '\n Get the data. \n' )

    housing = get_data()

    # DataFrame.info() prints its report and returns None, so capture
    # the report in a buffer and log the text instead.
    summary = io.StringIO()
    housing.info( buf = summary )
    my_logger.info( summary.getvalue() )

    ## (2) Splits the dataset into a training set and test set.
    ## Do not bother with stratified sampling.

    my_logger.info( '\n Split dataset. \n' )

    (training, test) = train_test_split( housing, train_size = 0.8 )

    my_logger.info( '\n # of rows in training set. \n' )

    my_logger.info( len(training) )

    my_logger.info( '\n # of rows in test set. \n' )

    my_logger.info( len(test) )

    (blocks_prepared, y) = prepare_data( training )

    #regression = DecisionTreeRegressor()
    regression = LinearRegression()
    #regression = RandomForestRegressor()
    regression.fit( blocks_prepared, y )

    predictions = regression.predict( blocks_prepared )

    # mean_squared_error() takes the true values first, then the
    # predictions.
    sum_of_squares = mean_squared_error( y, predictions )

    my_logger.info( '\n RMSE \n' )
    my_logger.info( np.sqrt( sum_of_squares )  )

    # TO-DO: Add cross validation and a grid search
    # to optimize parameters.

# end of main()

# Run main() only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
