
# Here are the libraries that we will need.
import numpy as np
import pandas as pd

import os
import tarfile
import urllib.request

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# These constants tell us where to find our dataset on the
# Internet and where to store a copy of that dataset on our
# own computer.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball from housing_url and extract it.

    Creates housing_path if needed, saves the archive there as
    housing.tgz, and extracts its contents (housing.csv) into the
    same directory. Returns None.
    """
    # exist_ok=True replaces the isdir-then-makedirs pattern, which
    # was race-prone and more verbose.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager guarantees the archive is closed even if
    # extraction raises; the original could leak the open file.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
# end of fetch_housing_data()

def load_housing_data(housing_path=HOUSING_PATH):
    """Read housing.csv from housing_path into a pandas DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))
# end of load_housing_data()

def examine_data( housing ):
    """Print an exploratory summary of the housing DataFrame.

    Expects a DataFrame with an 'ocean_proximity' column (the one
    categorical variable) and a numeric 'median_house_value' column
    (the prediction target). Prints the head, info, value counts,
    per-column describe statistics, and the correlation of each
    numeric variable with median_house_value. Returns None.
    """
    print( '\n **First few rows of DataFrame.** \n' )

    print( housing.head(4) )

    print( '\n **DataFrame info.** \n' )

    # info() prints its report directly and returns None, so do not
    # wrap it in print() -- that added a spurious "None" line.
    housing.info()

    print( '\n **Count of values in ocean_proximity column.** \n' )

    print( housing['ocean_proximity'].value_counts() )

    print( '\n **DataFrame describe.** \n' )

    description = housing.describe()

    for column in description.columns:
        print( f'\n {column} \n' )
        print( description[column] )

    # Compute the correlation of every numeric variable with
    # every other numeric variable. numeric_only=True is required:
    # since pandas 2.0, corr() raises a TypeError when the frame
    # contains a non-numeric column such as ocean_proximity.
    correlation_matrix = housing.corr(numeric_only=True)

    # But we are really just interested in the correlation
    # of each variable with the median_house_value variable.
    relevant_numbers = correlation_matrix["median_house_value"]
    relevant_numbers = relevant_numbers.sort_values(ascending=False)

    print( '\n **Correlations.** \n' )
    print( relevant_numbers )

    # Of course, median_house_value correlates perfectly
    # with median_house_value!

    # We could examine the dataset in other ways, too.
    # For example, we could add code that plots the data,
    # draws histograms, graphically shows the correlation
    # of one variable with another, and so on.
# end of examine_data()

def income_cat_proportions(data):
    """Return the fraction of rows in each income_cat category.

    Small helper for comparing the distribution of income_cat
    values across two different splits of the dataset.
    """
    counts = data["income_cat"].value_counts()
    return counts / len(data)
# end of income_cat_proportions()

def main():
    """Load the housing dataset, explore it, and split it into
    training and test sets both randomly and with stratified
    sampling on an income category, then compare the two splits.
    """
    # TO-DO: We can put this next function call
    # in a comment after we have run the program
    # once. Explain.
    #fetch_housing_data()

    # TO-DO: We have seen two file formats this week.
    # What is JSON? Where are we using it?
    # What is a tar file? Where are we using it?
    housing = load_housing_data()

    # TO-DO:
    # We want to predict the price of houses in a neighborhood.
    # What kind of data do we have on which to base those
    # predictions?

    # We have one categorical variable. What is it?
    examine_data( housing )

    # Split the dataset WITHOUT stratification.

    # TO-DO:
    # What is the type of the value returned by
    # train_test_split?

    # Search on the Internet for advice: what should
    # the test_size be?

    # What happens if we assign a different value
    # to random_state? What happens if we call the function
    # without this third argument?

    sets = train_test_split(housing, test_size=0.2, random_state=42)
    training_set = sets[0]
    test_set = sets[1]

    print( f'\n # of rows in training set = {len(training_set)} \n' )
    print( f'\n # of rows in test set = {len(test_set)} \n' )

    # Or split the dataset WITH stratification.

    # TO-DO:
    # Geron defines stratification with an example
    # in which a data scientist wants proportional
    # numbers of men and women.
    # Which variable are we going to use for stratification?

    # Add a new column to our dataset.
    # This is a temporary addition. We will remove this
    # column later.

    # TO-DO: What is np.inf?
    # (np.inf is used as the open-ended upper bound of the last bin.)

    boundaries = [0., 1.5, 3.0, 4.5, 6., np.inf]
    categoryNumbers = [1, 2, 3, 4, 5]
    housing["income_cat"] = pd.cut(housing["median_income"],
                               bins = boundaries,
                               labels = categoryNumbers )

    # TO-DO: Write code that prints the number of records in each
    # of the 5 categories just defined.

    # TO-DO: Write code that calculates and prints the number
    # of blocks in which the median income lies between
    # $15,000 and $45,000.

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    # TO-DO: What type of variable is split?
    print( type(split) )

    # TO-DO: What type of variable does split's split function
    # return to its caller?
    t = split.split(housing, housing["income_cat"])
    print( type(t) )

    # TO-DO: How many times does this loop run?
    # What do we get out of t?
    # NOTE: this loop consumes the generator t, which is why
    # split.split(...) is called again below.
    for a, b in t:
        print( type(a) )
        print( len(a) )
        print( type(b) )
        print( len(b) )

    # Okay, now that we have some understanding of what this
    # class is going to give us, let's use it.
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]

    # Let's make sure this stratified sampling did what
    # we wanted it to do.


    # We need to split the dataset again because housing now has
    # a new column and we want the train_set and test_set to have
    # that new column, too.
    train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

    # TO-DO: Here's another way to create a DataFrame. What is the
    # type of the argument for the constructor in this case?
    compare_props = pd.DataFrame({
        "Overall": income_cat_proportions(housing),
        "Stratified": income_cat_proportions(strat_test_set),
        "Random": income_cat_proportions(test_set),
    })

    # TO-DO: What happens if you leave out this next statement?
    compare_props = compare_props.sort_index()

    print( compare_props )

    # Let's add some columns to the compare_props DataFrame that
    # will make it easier to compare the two ways of splitting
    # our data.

    print( '\n' )

    compare_props["Rand. %error"] = \
        (100 * compare_props["Random"] / compare_props["Overall"] - 100)

    compare_props["Strat. %error"] = \
        (100 * compare_props["Stratified"] / compare_props["Overall"] - 100)

    print( '\n' )

    # TO-DO: What do you see now?
    print( compare_props )

    # We added a column to use in making the stratified split.
    # Now that we have split the dataset into a training set and a
    # test set we no longer need that column. Let's get rid of it.

    # TO-DO: Could we do this without a loop? How could we verify
    # that we have succeeded in removing the column?
    for set_ in (strat_train_set, strat_test_set):
        set_.drop("income_cat", axis=1, inplace=True)

    # We want to create a function y = f(x) that predicts
    # the prices of houses in a neighborhood.
    # Here, x denotes the characteristics of the neighborhood
    # and y is the median price of a home in that neighborhood.

    # x is a vector---it is not a single number, but a collection
    # of numbers that tell us something about geographic location,
    # the incomes of the population, the size of the houses, and
    # so on.

    # Our training set currently contains both x's and y's.
    # We need to pull out the x's and put them in their own
    # DataFrame. Then we need to do the same thing with the y's.

    # We'll put the x's in a DataFrame that we will call "housing."
    # This DataFrame contains all the columns in strat_train_set
    # except for the median_house_value column.
    housing = strat_train_set.drop("median_house_value", axis=1)

    # We'll put the y's in a Series that we will call "housing_labels."
    # It contains only the median_house_value column.
    housing_labels = strat_train_set["median_house_value"].copy()

    # TO-DO: The next step is to do something about missing values.
    # We have 3 choices. What are they?


    # After we take care of missing values, our next task will
    # be to do something with our one categorical variable.

    # Then we will add some new variables (new columns)
    # that combine information from existing columns...

    # Then we will scale the data...

    # Then we will see a way of putting all of these
    # transformations into a pipeline...

    # Then we will be ready to build a model and make
    # some predictions!

    # More to come....
# end of main()

# Script entry point: greet the user, then run the whole analysis.
if __name__ == '__main__':
    print( 'Guten Tag!' )
    main()

