
# Here are the libraries that we will need.
import numpy as np
import pandas as pd

import os
import tarfile
import urllib.request

from datetime import datetime, timedelta, timezone
import io
import logging


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# These constants tell us where to find our dataset on the
# Internet and where to store a copy of that dataset on our
# own computer.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# When True, log records are echoed to the console in addition
# to the log file.
WRITE_LOG_TO_CONSOLE = True

log_file = 'LHT_housing_log.txt'

log_format = '%(message)s'

# mode='w' truncates the log file at the start of every run.
log_handlers = [logging.FileHandler(log_file, mode = 'w')]

if WRITE_LOG_TO_CONSOLE:
    log_handlers.append( logging.StreamHandler() )

# Level NOTSET on the root logger lets every record through.
logging.basicConfig(
    level = logging.NOTSET,
    handlers = log_handlers,
    format = log_format )

logging.info( '\nDone with imports and configuration of logger.\n' )

# Central Daylight Time is 5 hours behind UTC (Coordinated Universal Time).
# NOTE(review): this offset is fixed and does not track DST changes.
my_time_zone = timezone( -timedelta(hours = 5) )


def timestamp():
    """Print the current date and time in the fixed CDT zone."""
    # PEP 8 prefers a def over assigning a lambda to a name (E731).
    print( datetime.now( tz = my_time_zone ).strftime(
        '\nLHT %d %B %Y at %H:%M:%S') )

timestamp()


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing.tgz archive from *housing_url* and
    extract its contents into *housing_path*.

    Creates *housing_path* (and parents) if it does not exist.
    """
    # exist_ok avoids a race between the isdir check and makedirs.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager guarantees the archive is closed even
    # if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
# end of fetch_housing_data()

def load_housing_data(housing_path=HOUSING_PATH):
    """Read housing.csv from *housing_path* into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
# end of load_housing_data()

def get_data():
    """Return the housing dataset as a DataFrame, read from the
    local CSV copy."""
    # TO-DO: We can put this next function call in a comment
    # after we have run the program once. Explain.
    #fetch_housing_data()

    # TO-DO: We have seen two file formats this week.
    # What is JSON? Where are we using it?
    # What is a tar file? Where are we using it?
    return load_housing_data()
# end of get_data()

def examine_data( housing ):
    """Log an overview of the housing DataFrame.

    Logs the first rows, dtype/memory info, the value counts of
    the ocean_proximity column, per-column summary statistics,
    and the correlation of each numeric variable with
    median_house_value.  Returns None; all output goes to the
    logging handlers.
    """
    # TO-DO:
    # We want to predict the price of houses in a neighborhood.
    # What kind of data do we have on which to base those
    # predictions?

    # We have one categorical variable. What is it?

    logging.info( '\n **First few rows of DataFrame.** \n' )

    logging.info( housing.head(4) )

    logging.info( '\n **DataFrame info.** \n' )

    # BUG FIX: DataFrame.info() writes to a stream and returns
    # None, so logging its return value used to log "None".
    # Capture the text in a StringIO buffer and log that.
    buffer = io.StringIO()
    housing.info(buf=buffer)
    logging.info( buffer.getvalue() )

    logging.info( '\n **Count of values in ocean_proximity column.** \n' )

    logging.info( housing['ocean_proximity'].value_counts() )

    logging.info( '\n **DataFrame describe.** \n' )

    description = housing.describe()

    for column in description.columns:
        logging.info( f'\n {column} \n' )
        logging.info( description[column] )

    # Compute the correlation of every numeric variable with
    # every other numeric variable.  numeric_only=True is
    # required in pandas >= 2.0, where corr() no longer silently
    # drops the non-numeric ocean_proximity column.
    correlation_matrix = housing.corr(numeric_only=True)

    # But we are really just interested in the correlation
    # of each variable with the median_house_value variable.
    relevant_numbers = correlation_matrix["median_house_value"]
    relevant_numbers = relevant_numbers.sort_values(ascending=False)

    logging.info( '\n **Correlations.** \n' )
    logging.info( relevant_numbers )

    # Of course, median_house_value correlates perfectly
    # with median_house_value!

    # We could examine the dataset in other ways, too.
    # For example, we could add code that plots the data,
    # draws histograms, graphically shows the correlation
    # of one variable with another, and so on.

# end of examine_data()

def income_cat_proportions(data):
    """Return each income_cat value's share of the rows in *data*.

    Small helper used to compare the distribution of income
    categories between two different splits of the dataset.
    """
    counts = data["income_cat"].value_counts()
    return counts / len(data)
# end of income_cat_proportions()

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends derived ratio columns to a numeric
    housing array: rooms_per_household, population_per_household,
    and (optionally) bedrooms_per_room.

    The source columns are located by fixed positions in the
    incoming array (rooms=3, bedrooms=4, population=5,
    households=6), so the input column order matters.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
        # Fixed column positions of the source attributes.
        self.rooms_ix = 3
        self.bedrooms_ix = 4
        self.population_ix = 5
        self.households_ix = 6

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn."""
        return self

    def transform(self, X):
        """Return X with the derived ratio columns appended."""
        rooms = X[:, self.rooms_ix]
        households = X[:, self.households_ix]
        population = X[:, self.population_ix]
        extra_columns = [rooms / households, population / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, self.bedrooms_ix] / rooms)
        return np.column_stack([X] + extra_columns)
# end of CombinedAttributesAdder class

def split_dataset( housing ):
    """Split *housing* into training and test sets.

    First demonstrates a plain random split, then performs a
    stratified split on a temporary income_cat column derived
    from median_income, logs a comparison of the two sampling
    strategies, and returns the stratified
    (train_set, test_set) pair.

    NOTE(review): this function mutates its argument — it adds
    an income_cat column to *housing* and does not remove it
    from the original frame (only from the returned splits).
    """
    # split dataset without stratification

    # TO-DO:
    # What is the type of the value returned by 
    # train_test_split?

    # Search on the Internet for advice: what should
    # the test_size be?

    # What happens if we assign a different value
    # to random_state? What happens if we call the function
    # without this third argument?

    # random_state=42 makes the split reproducible across runs.
    sets = train_test_split(housing, test_size=0.2, random_state=42)
    training_set = sets[0]
    test_set = sets[1]

    logging.info( f'\n # of rows in training set = {len(training_set)} \n' )
    logging.info( f'\n # of rows in test set = {len(test_set)} \n' )

    # or split dataset with stratification

    # TO-DO:
    # Geron defines stratification with an example
    # in which a data scientist wants proportional
    # numbers of men and women.
    # Which variable are we going to use for stratification?

    # Add a new column to our dataset.
    # This is a temporary addition. We will remove this
    # column (from the splits) later.

    # TO-DO: What is np.inf?

    # Bin median_income into 5 labelled categories; the last bin
    # is open-ended (6.0 to infinity).
    boundaries = [0., 1.5, 3.0, 4.5, 6., np.inf]
    categoryNumbers = [1, 2, 3, 4, 5]
    housing["income_cat"] = pd.cut(housing["median_income"],
                               bins = boundaries,
                               labels = categoryNumbers )

    # TO-DO: Write code that prints the number of records in each
    # of the 5 categories just defined.

    # TO-DO: Write code that calculates and prints the number
    # of blocks in which the median income lies between
    # $15,000 and $45,000.

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    # TO-DO: What type of variable is split?
    logging.info( type(split) )

    # TO-DO: What type of variable does split's split function
    # return to its caller?
    t = split.split(housing, housing["income_cat"])
    logging.info( type(t) )

    # TO-DO: How many times does this loop run?
    # What do we get out of t?
    # Exploratory only: this loop consumes the generator t just
    # to log what it yields (index arrays); the real split is
    # redone below.
    for a, b in t:
        logging.info( type(a) )
        logging.info( len(a) )
        logging.info( type(b) )
        logging.info( len(b) )

    # Okay, now that we have some understanding of what this
    # class is going to give us, let's use it.
    # n_splits=1, so this loop body runs exactly once.
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]

    # Let's make sure this stratified sampling did what
    # we wanted it to do.


    # We need to split the dataset again because housing now has
    # a new column and we want the train_set and test_set to have
    # that new column, too.
    train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

    # TO-DO: Here's another way to create a DataFrame. What is the
    # type of the argument for the constructor in this case?
    compare_props = pd.DataFrame({
        "Overall": income_cat_proportions(housing),
        "Stratified": income_cat_proportions(strat_test_set),
        "Random": income_cat_proportions(test_set),
    })

    # TO-DO: What happens if you leave out this next statement?
    compare_props = compare_props.sort_index()

    logging.info( compare_props )

    # Let's add some columns to the compare_props DataFrame that
    # will make it easier to compare the two ways of splitting
    # our data.

    logging.info( '\n' )

    # Percentage error of each split's category proportions
    # relative to the full dataset's proportions.
    compare_props["Rand. %error"] = \
        (100 * compare_props["Random"] / compare_props["Overall"] - 100)

    compare_props["Strat. %error"] = \
        (100 * compare_props["Stratified"] / compare_props["Overall"] - 100)

    logging.info( '\n' )

    # TO-DO: What do you see now?
    logging.info( compare_props )

    # We added a column to use in making the stratified split.
    # Now that we have split the dataset into a training set and a 
    # test set we no longer need that column. Let's get rid of it.

    # TO-DO: Could we do this without a loop? How could we verify
    # that we have succeeded in removing the column?
    for set_ in (strat_train_set, strat_test_set):
        set_.drop("income_cat", axis=1, inplace=True)

    return (strat_train_set, strat_test_set)
# end of split_dataset()

def extract_features( housing_numerical ):
    """Append derived ratio columns (rooms_per_household,
    bedrooms_per_room, population_per_household) to the numeric
    housing DataFrame and log the correlation of every column
    with median_house_value.

    Returns the augmented DataFrame.
    """
    # Two ways to do this...
    betterWay = False

    if betterWay:
        # BUG FIX: these three statements previously referenced an
        # undefined global name `housing`; they must operate on the
        # function's own argument.  (This branch mutates the
        # caller's DataFrame in place.)
        housing_numerical["rooms_per_household"] = \
            housing_numerical["total_rooms"]/housing_numerical["households"]

        housing_numerical["bedrooms_per_room"] = \
            housing_numerical["total_bedrooms"]/housing_numerical["total_rooms"]

        housing_numerical["population_per_household"] = \
            housing_numerical["population"]/housing_numerical["households"]
    # end if
    else:
        new_columns = ['rooms_per_household',
            'bedrooms_per_room', 'population_per_household']

        # The adder appends the new columns on the right, so the
        # new column list is the old one plus new_columns.
        all_columns = housing_numerical.columns.to_list() + new_columns

        attribute_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)
        data = attribute_adder.transform(housing_numerical.to_numpy())
        housing_numerical = pd.DataFrame( data,
            columns = all_columns, index = housing_numerical.index )

        logging.info( '\n Dataset with new columns \n' )
        # BUG FIX: DataFrame.info() returns None (it writes to a
        # stream), so logging its return value used to log "None".
        buffer = io.StringIO()
        housing_numerical.info(buf=buffer)
        logging.info( buffer.getvalue() )
        logging.info( housing_numerical.head(8) )

        logging.info( '\n Done with new columns \n' )
    # end else

    # Learn how all variables correlate with one another.
    correlation_matrix = housing_numerical.corr()

    # Let's look at the most relevant column: correlation with
    # the target variable.
    correlation_with_price = \
        correlation_matrix['median_house_value'].sort_values(ascending = False)

    logging.info( correlation_with_price )

    return housing_numerical
# end of extract_features()

def process_numerical( housing_numerical ):
    """Add derived feature columns, then fill missing values with
    each column's median.

    Returns a DataFrame with the same index and columns as the
    feature-augmented input, but with no missing values.
    """
    # TO-DO: The next step is to do something about missing values.
    # We have 3 choices. What are they?

    imputer = SimpleImputer(strategy="median")

    housing_numerical = extract_features( housing_numerical )

    # fit_transform = fit() followed by transform(): learn the
    # medians, then substitute them for the missing values.
    X = imputer.fit_transform(housing_numerical)

    logging.info( '\n **housing_num type** \n' )
    logging.info( type(housing_numerical) )

    logging.info( '\n **X type** \n' )
    logging.info( type(X) )

    # Skipped scaling.

    # The imputer returns a bare NumPy array; rebuild a DataFrame
    # with the original column names and row index.
    housing_numerical = pd.DataFrame(X, columns=housing_numerical.columns,
        index=housing_numerical.index)

    logging.info( '\n **housing_numerical type** \n' )
    logging.info( type(housing_numerical) )

    return housing_numerical
# end of process_numerical()

def process_categorical( housing_categorical ):
    """Demonstrate three encodings of the categorical column and
    return the one-hot encoding produced by pandas.

    The sklearn encodings are computed only for illustration;
    the pd.get_dummies() DataFrame is what gets returned.
    """
    # We could replace strings with integers...
    ordinal_encoder = OrdinalEncoder()
    housing_categorical_encoded = \
        ordinal_encoder.fit_transform(housing_categorical)

    # ...or replace the one column that contains a categorical
    # variable that can take several different values with
    # several columns (one for each possible category) that
    # will contain only 0's and 1's
    categorical_encoder = OneHotEncoder()
    housing_categorical_1hot = \
        categorical_encoder.fit_transform(housing_categorical)

    # housing_categorical_1hot is a matrix

    # To get a DataFrame directly, use the get_dummies() function
    df = pd.get_dummies( housing_categorical )

    # BUG FIX: DataFrame.info() returns None (it writes to a
    # stream), so logging.info(df.info()) used to log "None".
    buffer = io.StringIO()
    df.info(buf=buffer)
    logging.info( buffer.getvalue() )

    return df
# end of process_categorical()

def pipeline( housing ):
    """Prepare the feature matrix for model training.

    Numeric columns are median-imputed, augmented with the
    combined attributes, and standard-scaled; the single
    categorical column is one-hot encoded.  Returns the
    transformed array produced by the ColumnTransformer.
    """
    bestWay = True

    # Separate the numeric columns from the one categorical column.
    housing_numerical = housing.drop("ocean_proximity", axis=1)
    # alternatively: housing_num = housing.select_dtypes(include=[np.number])
    housing_categorical = housing[["ocean_proximity"]]

    if bestWay:
        numeric_steps = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('attribs_adder', CombinedAttributesAdder()),
            ('std_scaler', StandardScaler()),
        ])

        # Route each group of columns through its own transformer.
        full_pipeline = ColumnTransformer([
            ("num", numeric_steps, list(housing_numerical)),
            ("cat", OneHotEncoder(), ["ocean_proximity"]),
        ])

        return full_pipeline.fit_transform(housing)

    # Fallback path: hand-rolled processing.  This is an
    # incomplete response — the categorical result is computed
    # but never merged back in.
    housing_numerical = process_numerical( housing_numerical )
    housing_categorical = process_categorical( housing_categorical )
    return housing_numerical
# end of pipeline()

def main():
    """Run the full workflow: load the data, examine it, split it
    with stratification, prepare the features, fit a linear
    regression, and log its training-set RMSE."""

    # We want to create a function y = f(x) that predicts
    # the prices of houses in a neighborhood.
    # Here, x denotes the characteristics of the neighborhood
    # and y is the median price of a home in that neighborhood.

    # x is a vector---it is not a single number, but a collection
    # of numbers that tell us something about geographic location,
    # the incomes of the population, the size of the houses, and
    # so on.

    # Our training set currently contains both x's and y's.
    # We need to pull out the x's and put them in their own
    # DataFrame. Then we need to do the same thing with the y's.

    # We'll put the x's in a DataFrame that we will call "housing":
    # all the columns in strat_train_set except median_house_value.
    # We'll put the y's in a Series called "housing_labels":
    # just the median_house_value column.  (See the code below.)

    # After we take care of missing values, our next task will be
    # to do something with our one categorical variable.

    # Then we will scale the data...

    # Then we will see a way of putting all of these
    # transformations into a pipeline...

    # Then we will be ready to build a model and make
    # some predictions!

    # More to come....

    housing = get_data()

    examine_data( housing )

    (strat_train_set, strat_test_set) = split_dataset( housing )

    # drop labels for training set; keep only the features (x's)
    housing = strat_train_set.drop("median_house_value", axis=1) 

    # the labels (y's) for the training set
    housing_labels = strat_train_set["median_house_value"].copy()

    housing_prepared = pipeline( housing )

    logging.info( '\n housing_prepared shape \n' )
    logging.info( housing_prepared.shape )

    # Wrap the prepared matrix in a DataFrame so we can slice
    # a few rows with .iloc below.
    housing_prepared_df = pd.DataFrame( housing_prepared )

    #logging.info( '\n Type of housing \n' )
    #logging.info( type(housing) )

    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # Sanity check: predict on a few already-prepared training rows
    # and compare against their known labels.
    some_data = housing_prepared_df.iloc[:5]
    some_labels = housing_labels.iloc[:5]

    logging.info('Predictions:' + str(lin_reg.predict(some_data)) )
    logging.info('Labels:' + str(list(some_labels)) )

    # Training-set RMSE — an optimistic estimate of generalization
    # error, since the model was fit on these same rows.
    predictions = lin_reg.predict( housing_prepared )
    rmse = np.sqrt( mean_squared_error( housing_labels, predictions ) )

    logging.info( '\n RMSE \n' )
    logging.info( rmse )



# end of main()

# Run the workflow only when executed as a script, not when
# this file is imported as a module.
if __name__ == '__main__':
    logging.info( 'Guten Tag!' )
    main()
    logging.info( "Auf wiedersehen" )
