{
"cells": [
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Capital | \n",
" Africa | \n",
" Asia | \n",
" C_America | \n",
" Europe | \n",
" N_America | \n",
" Oceania | \n",
" S_America | \n",
" Population | \n",
" Area | \n",
" Density | \n",
" GDP_millions | \n",
" GDP_per_capita | \n",
" Birth_rate | \n",
" Death_rate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" India | \n",
" New Delhi | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1373233170 | \n",
" 1283232.0 | \n",
" 1070.136320 | \n",
" 11370978.8 | \n",
" 8280.442862 | \n",
" 18.7 | \n",
" 7.3 | \n",
"
\n",
" \n",
" 1 | \n",
" Andorra | \n",
" Andorra La Vella | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 77204 | \n",
" 180.0 | \n",
" 428.911111 | \n",
" 3327.0 | \n",
" 43093.622092 | \n",
" 7.3 | \n",
" 7.4 | \n",
"
\n",
" \n",
" 2 | \n",
" Sierra Leone | \n",
" Freetown | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 7894921 | \n",
" 27394.0 | \n",
" 288.198912 | \n",
" 11361.6 | \n",
" 1439.102431 | \n",
" 36.0 | \n",
" 10.2 | \n",
"
\n",
" \n",
" 3 | \n",
" Luxembourg | \n",
" Luxembourg | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 620853 | \n",
" 993.0 | \n",
" 625.229607 | \n",
" 63428.4 | \n",
" 102163.314021 | \n",
" 11.6 | \n",
" 7.3 | \n",
"
\n",
" \n",
" 4 | \n",
" Bahamas, The | \n",
" Nassau | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 391369 | \n",
" 3858.0 | \n",
" 101.443494 | \n",
" 11725.0 | \n",
" 29958.939006 | \n",
" 15.1 | \n",
" 7.3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Country Capital Africa Asia C_America Europe N_America \\\n",
"0 India New Delhi 0 1 0 0 0 \n",
"1 Andorra Andorra La Vella 0 0 0 1 0 \n",
"2 Sierra Leone Freetown 1 0 0 0 0 \n",
"3 Luxembourg Luxembourg 0 0 0 1 0 \n",
"4 Bahamas, The Nassau 0 0 1 0 0 \n",
"\n",
" Oceania S_America Population Area Density GDP_millions \\\n",
"0 0 0 1373233170 1283232.0 1070.136320 11370978.8 \n",
"1 0 0 77204 180.0 428.911111 3327.0 \n",
"2 0 0 7894921 27394.0 288.198912 11361.6 \n",
"3 0 0 620853 993.0 625.229607 63428.4 \n",
"4 0 0 391369 3858.0 101.443494 11725.0 \n",
"\n",
" GDP_per_capita Birth_rate Death_rate \n",
"0 8280.442862 18.7 7.3 \n",
"1 43093.622092 7.3 7.4 \n",
"2 1439.102431 36.0 10.2 \n",
"3 102163.314021 11.6 7.3 \n",
"4 29958.939006 15.1 7.3 "
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import xlrd\n",
"\n",
"xlsx = xlrd.open_workbook(\"CSC 357 Week 2 Lesson.xlsx\", on_demand=True)\n",
"with pd.ExcelFile(xlsx) as wb:\n",
" df = pd.read_excel(wb, \"info\")\n",
"\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Africa | \n",
" Asia | \n",
" C_America | \n",
" Europe | \n",
" N_America | \n",
" Oceania | \n",
" S_America | \n",
" Population | \n",
" Area | \n",
" Density | \n",
" GDP_millions | \n",
" GDP_per_capita | \n",
" Birth_rate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1373233170 | \n",
" 1283232.0 | \n",
" 1070.136320 | \n",
" 11370978.8 | \n",
" 8280.442862 | \n",
" 18.7 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 77204 | \n",
" 180.0 | \n",
" 428.911111 | \n",
" 3327.0 | \n",
" 43093.622092 | \n",
" 7.3 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 7894921 | \n",
" 27394.0 | \n",
" 288.198912 | \n",
" 11361.6 | \n",
" 1439.102431 | \n",
" 36.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 620853 | \n",
" 993.0 | \n",
" 625.229607 | \n",
" 63428.4 | \n",
" 102163.314021 | \n",
" 11.6 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 391369 | \n",
" 3858.0 | \n",
" 101.443494 | \n",
" 11725.0 | \n",
" 29958.939006 | \n",
" 15.1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Africa Asia C_America Europe N_America Oceania S_America Population \\\n",
"0 0 1 0 0 0 0 0 1373233170 \n",
"1 0 0 0 1 0 0 0 77204 \n",
"2 1 0 0 0 0 0 0 7894921 \n",
"3 0 0 0 1 0 0 0 620853 \n",
"4 0 0 1 0 0 0 0 391369 \n",
"\n",
" Area Density GDP_millions GDP_per_capita Birth_rate \n",
"0 1283232.0 1070.136320 11370978.8 8280.442862 18.7 \n",
"1 180.0 428.911111 3327.0 43093.622092 7.3 \n",
"2 27394.0 288.198912 11361.6 1439.102431 36.0 \n",
"3 993.0 625.229607 63428.4 102163.314021 11.6 \n",
"4 3858.0 101.443494 11725.0 29958.939006 15.1 "
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"birth_data = df.drop(columns = [\"Country\", \"Capital\", \"Death_rate\"])\n",
"death_data = df.drop(columns = [\"Country\", \"Capital\", \"Birth_rate\"])\n",
"birth_data.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Africa | \n",
" Asia | \n",
" C_America | \n",
" Europe | \n",
" N_America | \n",
" Oceania | \n",
" S_America | \n",
" Population | \n",
" Area | \n",
" Density | \n",
" GDP_millions | \n",
" GDP_per_capita | \n",
" Death_rate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1373233170 | \n",
" 1283232.0 | \n",
" 1070.136320 | \n",
" 11370978.8 | \n",
" 8280.442862 | \n",
" 7.3 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 77204 | \n",
" 180.0 | \n",
" 428.911111 | \n",
" 3327.0 | \n",
" 43093.622092 | \n",
" 7.4 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 7894921 | \n",
" 27394.0 | \n",
" 288.198912 | \n",
" 11361.6 | \n",
" 1439.102431 | \n",
" 10.2 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 620853 | \n",
" 993.0 | \n",
" 625.229607 | \n",
" 63428.4 | \n",
" 102163.314021 | \n",
" 7.3 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 391369 | \n",
" 3858.0 | \n",
" 101.443494 | \n",
" 11725.0 | \n",
" 29958.939006 | \n",
" 7.3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Africa Asia C_America Europe N_America Oceania S_America Population \\\n",
"0 0 1 0 0 0 0 0 1373233170 \n",
"1 0 0 0 1 0 0 0 77204 \n",
"2 1 0 0 0 0 0 0 7894921 \n",
"3 0 0 0 1 0 0 0 620853 \n",
"4 0 0 1 0 0 0 0 391369 \n",
"\n",
" Area Density GDP_millions GDP_per_capita Death_rate \n",
"0 1283232.0 1070.136320 11370978.8 8280.442862 7.3 \n",
"1 180.0 428.911111 3327.0 43093.622092 7.4 \n",
"2 27394.0 288.198912 11361.6 1439.102431 10.2 \n",
"3 993.0 625.229607 63428.4 102163.314021 7.3 \n",
"4 3858.0 101.443494 11725.0 29958.939006 7.3 "
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"death_data.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"birth_train, birth_test = train_test_split(birth_data, test_size=0.2, random_state=42)\n",
"death_train, death_test = train_test_split(death_data, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Birth_rate 1.000000\n",
"Africa 0.742076\n",
"Oceania 0.000850\n",
"Population -0.030827\n",
"C_America -0.068494\n",
"Area -0.080206\n",
"S_America -0.084688\n",
"N_America -0.095789\n",
"Asia -0.104140\n",
"GDP_millions -0.139544\n",
"Density -0.194040\n",
"GDP_per_capita -0.534990\n",
"Europe -0.542382\n",
"Name: Birth_rate, dtype: float64"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"birth_corr_matrix = birth_train.corr()\n",
"birth_corr_matrix[\"Birth_rate\"].sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Death_rate 1.000000\n",
"Europe 0.498187\n",
"Africa 0.094868\n",
"Area 0.089829\n",
"N_America 0.030557\n",
"GDP_millions 0.011293\n",
"Population -0.000442\n",
"GDP_per_capita -0.070903\n",
"Density -0.076134\n",
"C_America -0.115274\n",
"S_America -0.125681\n",
"Oceania -0.148835\n",
"Asia -0.383382\n",
"Name: Death_rate, dtype: float64"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"death_corr_matrix = death_train.corr()\n",
"death_corr_matrix[\"Death_rate\"].sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"birth_train_labels = birth_train[\"Birth_rate\"].copy()\n",
"birth_train = birth_train.drop(\"Birth_rate\", axis=1)\n",
"death_train_labels = death_train[\"Death_rate\"].copy()\n",
"death_train = death_train.drop(\"Death_rate\", axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Africa | \n",
" Asia | \n",
" C_America | \n",
" Europe | \n",
" N_America | \n",
" Oceania | \n",
" S_America | \n",
" Population | \n",
" Area | \n",
" Density | \n",
" GDP_millions | \n",
" GDP_per_capita | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -0.616872 | \n",
" -0.547723 | \n",
" -0.313993 | \n",
" -0.587220 | \n",
" -0.113961 | \n",
" 3.630677 | \n",
" -0.261712 | \n",
" -0.101264 | \n",
" 3.356525 | \n",
" -0.210259 | \n",
" 0.181815 | \n",
" 0.935520 | \n",
"
\n",
" \n",
" 1 | \n",
" -0.616872 | \n",
" -0.547723 | \n",
" -0.313993 | \n",
" -0.587220 | \n",
" -0.113961 | \n",
" 3.630677 | \n",
" -0.261712 | \n",
" -0.257790 | \n",
" -0.362200 | \n",
" 0.016860 | \n",
" -0.254971 | \n",
" -0.243053 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.616872 | \n",
" -0.547723 | \n",
" -0.313993 | \n",
" 1.702939 | \n",
" -0.113961 | \n",
" -0.275431 | \n",
" -0.261712 | \n",
" -0.246636 | \n",
" -0.356950 | \n",
" -0.136961 | \n",
" -0.247924 | \n",
" -0.419685 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.616872 | \n",
" 1.825742 | \n",
" -0.313993 | \n",
" -0.587220 | \n",
" -0.113961 | \n",
" -0.275431 | \n",
" -0.261712 | \n",
" 0.058688 | \n",
" -0.313464 | \n",
" 0.015897 | \n",
" 0.467978 | \n",
" 0.617741 | \n",
"
\n",
" \n",
" 4 | \n",
" -0.616872 | \n",
" -0.547723 | \n",
" -0.313993 | \n",
" 1.702939 | \n",
" -0.113961 | \n",
" -0.275431 | \n",
" -0.261712 | \n",
" -0.249666 | \n",
" -0.341158 | \n",
" -0.198096 | \n",
" -0.239894 | \n",
" 0.343716 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Africa Asia C_America Europe N_America Oceania S_America \\\n",
"0 -0.616872 -0.547723 -0.313993 -0.587220 -0.113961 3.630677 -0.261712 \n",
"1 -0.616872 -0.547723 -0.313993 -0.587220 -0.113961 3.630677 -0.261712 \n",
"2 -0.616872 -0.547723 -0.313993 1.702939 -0.113961 -0.275431 -0.261712 \n",
"3 -0.616872 1.825742 -0.313993 -0.587220 -0.113961 -0.275431 -0.261712 \n",
"4 -0.616872 -0.547723 -0.313993 1.702939 -0.113961 -0.275431 -0.261712 \n",
"\n",
" Population Area Density GDP_millions GDP_per_capita \n",
"0 -0.101264 3.356525 -0.210259 0.181815 0.935520 \n",
"1 -0.257790 -0.362200 0.016860 -0.254971 -0.243053 \n",
"2 -0.246636 -0.356950 -0.136961 -0.247924 -0.419685 \n",
"3 0.058688 -0.313464 0.015897 0.467978 0.617741 \n",
"4 -0.249666 -0.341158 -0.198096 -0.239894 0.343716 "
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"scaler = StandardScaler()\n",
"birth_train = pd.DataFrame(scaler.fit_transform(birth_train))\n",
"death_train = pd.DataFrame(scaler.fit_transform(death_train))\n",
"birth_train.columns = [\"Africa\", \"Asia\", \"C_America\", \"Europe\", \"N_America\", \"Oceania\", \"S_America\",\n",
" \"Population\", \"Area\", \"Density\", \"GDP_millions\", \"GDP_per_capita\"]\n",
"death_train.columns = [\"Africa\", \"Asia\", \"C_America\", \"Europe\", \"N_America\", \"Oceania\", \"S_America\",\n",
" \"Population\", \"Area\", \"Density\", \"GDP_millions\", \"GDP_per_capita\"]\n",
"birth_train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Africa | \n",
" Asia | \n",
" C_America | \n",
" Europe | \n",
" N_America | \n",
" Oceania | \n",
" S_America | \n",
" Population | \n",
" Area | \n",
" Density | \n",
" GDP_millions | \n",
" GDP_per_capita | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -0.616872 | \n",
" -0.547723 | \n",
" -0.313993 | \n",
" -0.587220 | \n",
" -0.113961 | \n",
" 3.630677 | \n",
" -0.261712 | \n",
" -0.101264 | \n",
" 3.356525 | \n",
" -0.210259 | \n",
" 0.181815 | \n",
" 0.935520 | \n",
"
\n",
" \n",
" 1 | \n",
" -0.616872 | \n",
" -0.547723 | \n",
" -0.313993 | \n",
" -0.587220 | \n",
" -0.113961 | \n",
" 3.630677 | \n",
" -0.261712 | \n",
" -0.257790 | \n",
" -0.362200 | \n",
" 0.016860 | \n",
" -0.254971 | \n",
" -0.243053 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.616872 | \n",
" -0.547723 | \n",
" -0.313993 | \n",
" 1.702939 | \n",
" -0.113961 | \n",
" -0.275431 | \n",
" -0.261712 | \n",
" -0.246636 | \n",
" -0.356950 | \n",
" -0.136961 | \n",
" -0.247924 | \n",
" -0.419685 | \n",
"
\n",
" \n",
" 3 | \n",
" -0.616872 | \n",
" 1.825742 | \n",
" -0.313993 | \n",
" -0.587220 | \n",
" -0.113961 | \n",
" -0.275431 | \n",
" -0.261712 | \n",
" 0.058688 | \n",
" -0.313464 | \n",
" 0.015897 | \n",
" 0.467978 | \n",
" 0.617741 | \n",
"
\n",
" \n",
" 4 | \n",
" -0.616872 | \n",
" -0.547723 | \n",
" -0.313993 | \n",
" 1.702939 | \n",
" -0.113961 | \n",
" -0.275431 | \n",
" -0.261712 | \n",
" -0.249666 | \n",
" -0.341158 | \n",
" -0.198096 | \n",
" -0.239894 | \n",
" 0.343716 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Africa Asia C_America Europe N_America Oceania S_America \\\n",
"0 -0.616872 -0.547723 -0.313993 -0.587220 -0.113961 3.630677 -0.261712 \n",
"1 -0.616872 -0.547723 -0.313993 -0.587220 -0.113961 3.630677 -0.261712 \n",
"2 -0.616872 -0.547723 -0.313993 1.702939 -0.113961 -0.275431 -0.261712 \n",
"3 -0.616872 1.825742 -0.313993 -0.587220 -0.113961 -0.275431 -0.261712 \n",
"4 -0.616872 -0.547723 -0.313993 1.702939 -0.113961 -0.275431 -0.261712 \n",
"\n",
" Population Area Density GDP_millions GDP_per_capita \n",
"0 -0.101264 3.356525 -0.210259 0.181815 0.935520 \n",
"1 -0.257790 -0.362200 0.016860 -0.254971 -0.243053 \n",
"2 -0.246636 -0.356950 -0.136961 -0.247924 -0.419685 \n",
"3 0.058688 -0.313464 0.015897 0.467978 0.617741 \n",
"4 -0.249666 -0.341158 -0.198096 -0.239894 0.343716 "
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"death_train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"lin_reg = LinearRegression()\n",
"birth_lr = lin_reg.fit(birth_train, birth_train_labels)\n",
"death_lr = lin_reg.fit(death_train, death_train_labels)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE for Birth Rate: 15.211499774254994 \n",
"RMSE for Death Rate: 2.108474856168463\n"
]
}
],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"birth_pred = birth_lr.predict(birth_train)\n",
"birth_mse = mean_squared_error(birth_train_labels, birth_pred)\n",
"birth_rmse = np.sqrt(birth_mse)\n",
"death_pred = death_lr.predict(death_train)\n",
"death_mse = mean_squared_error(death_train_labels, death_pred)\n",
"death_rmse = np.sqrt(death_mse)\n",
"print(\"RMSE for Birth Rate: \", birth_rmse, \"\\nRMSE for Death Rate: \", death_rmse)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}