{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import xlrd\n", "\n", "xlsx = xlrd.open_workbook(\"CSC 357 Week 1 Lesson.xlsx\", on_demand=True)\n", "with pd.ExcelFile(xlsx) as wb:\n", " basic_info = pd.read_excel(wb, \"basic_info\")\n", " birth_death_rates = pd.read_excel(wb, \"birth_death_rates\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryCapitalRegionPopulationAreaGDP (millions)
0IndiaNew DelhiAsia13732331701283232.011370978.8
1AndorraAndorra La VellaEurope77204180.03327.0
2Sierra LeoneFreetownAfrica789492127394.011361.6
3LuxembourgLuxembourgEurope620853993.063428.4
4Bahamas, TheNassauCentral America3913693858.011725.0
\n", "
" ], "text/plain": [ " Country Capital Region Population Area \\\n", "0 India New Delhi Asia 1373233170 1283232.0 \n", "1 Andorra Andorra La Vella Europe 77204 180.0 \n", "2 Sierra Leone Freetown Africa 7894921 27394.0 \n", "3 Luxembourg Luxembourg Europe 620853 993.0 \n", "4 Bahamas, The Nassau Central America 391369 3858.0 \n", "\n", " GDP (millions) \n", "0 11370978.8 \n", "1 3327.0 \n", "2 11361.6 \n", "3 63428.4 \n", "4 11725.0 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "basic_info.head(5)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryBirth RateDeath Rate
0Afghanistan37.513.2
1Albania13.26.9
2Algeria21.54.3
3Andorra7.37.4
4Angola43.79.0
\n", "
" ], "text/plain": [ " Country Birth Rate Death Rate\n", "0 Afghanistan 37.5 13.2\n", "1 Albania 13.2 6.9\n", "2 Algeria 21.5 4.3\n", "3 Andorra 7.3 7.4\n", "4 Angola 43.7 9.0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "birth_death_rates.head(5)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 198 entries, 0 to 197\n", "Data columns (total 6 columns):\n", "Country 198 non-null object\n", "Capital 198 non-null object\n", "Region 198 non-null object\n", "Population 198 non-null int64\n", "Area 198 non-null float64\n", "GDP (millions) 198 non-null float64\n", "dtypes: float64(2), int64(1), object(3)\n", "memory usage: 9.4+ KB\n" ] } ], "source": [ "basic_info.info()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 198 entries, 0 to 197\n", "Data columns (total 3 columns):\n", "Country 198 non-null object\n", "Birth Rate 195 non-null float64\n", "Death Rate 195 non-null float64\n", "dtypes: float64(2), object(1)\n", "memory usage: 4.8+ KB\n" ] } ], "source": [ "birth_death_rates.info()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PopulationAreaGDP (millions)
count1.980000e+021.980000e+021.980000e+02
mean3.909775e+072.542835e+056.807511e+05
std1.459478e+087.134595e+052.574923e+06
min8.000000e+021.700000e-014.360000e+01
25%1.908340e+068.200500e+031.961620e+04
50%8.605306e+064.378300e+047.260940e+04
75%2.818871e+071.962750e+053.682574e+05
max1.436571e+096.493917e+062.605856e+07
\n", "
" ], "text/plain": [ " Population Area GDP (millions)\n", "count 1.980000e+02 1.980000e+02 1.980000e+02\n", "mean 3.909775e+07 2.542835e+05 6.807511e+05\n", "std 1.459478e+08 7.134595e+05 2.574923e+06\n", "min 8.000000e+02 1.700000e-01 4.360000e+01\n", "25% 1.908340e+06 8.200500e+03 1.961620e+04\n", "50% 8.605306e+06 4.378300e+04 7.260940e+04\n", "75% 2.818871e+07 1.962750e+05 3.682574e+05\n", "max 1.436571e+09 6.493917e+06 2.605856e+07" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "basic_info.describe()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Birth RateDeath Rate
count195.000000195.000000
mean19.4020517.687179
std9.5782132.678184
min6.5000001.600000
25%11.7500005.950000
50%17.2000007.400000
75%23.7500009.400000
max43.70000015.100000
\n", "
" ], "text/plain": [ " Birth Rate Death Rate\n", "count 195.000000 195.000000\n", "mean 19.402051 7.687179\n", "std 9.578213 2.678184\n", "min 6.500000 1.600000\n", "25% 11.750000 5.950000\n", "50% 17.200000 7.400000\n", "75% 23.750000 9.400000\n", "max 43.700000 15.100000" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "birth_death_rates.describe()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 195 entries, 0 to 197\n", "Data columns (total 3 columns):\n", "Country 195 non-null object\n", "Birth Rate 195 non-null float64\n", "Death Rate 195 non-null float64\n", "dtypes: float64(2), object(1)\n", "memory usage: 6.1+ KB\n" ] } ], "source": [ "birth_death_rates = birth_death_rates.dropna()\n", "birth_death_rates.info()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryCapitalRegionPopulationAreaGDP (millions)Birth RateDeath Rate
0IndiaNew DelhiAsia13732331701283232.011370978.818.77.3
1AndorraAndorra La VellaEurope77204180.03327.07.37.4
2Sierra LeoneFreetownAfrica789492127394.011361.636.010.2
3LuxembourgLuxembourgEurope620853993.063428.411.67.3
4Bahamas, TheNassauCentral America3913693858.011725.015.17.3
\n", "
" ], "text/plain": [ " Country Capital Region Population Area \\\n", "0 India New Delhi Asia 1373233170 1283232.0 \n", "1 Andorra Andorra La Vella Europe 77204 180.0 \n", "2 Sierra Leone Freetown Africa 7894921 27394.0 \n", "3 Luxembourg Luxembourg Europe 620853 993.0 \n", "4 Bahamas, The Nassau Central America 391369 3858.0 \n", "\n", " GDP (millions) Birth Rate Death Rate \n", "0 11370978.8 18.7 7.3 \n", "1 3327.0 7.3 7.4 \n", "2 11361.6 36.0 10.2 \n", "3 63428.4 11.6 7.3 \n", "4 11725.0 15.1 7.3 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combined_data = pd.merge(basic_info, birth_death_rates, how=\"inner\", on=\"Country\")\n", "combined_data.head(5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 195 entries, 0 to 194\n", "Data columns (total 8 columns):\n", "Country 195 non-null object\n", "Capital 195 non-null object\n", "Region 195 non-null object\n", "Population 195 non-null int64\n", "Area 195 non-null float64\n", "GDP (millions) 195 non-null float64\n", "Birth Rate 195 non-null float64\n", "Death Rate 195 non-null float64\n", "dtypes: float64(4), int64(1), object(3)\n", "memory usage: 13.7+ KB\n" ] } ], "source": [ "combined_data.info()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "combined_data = combined_data.rename(columns = {\"GDP (millions)\":\"GDP_millions\"})\n", "combined_data = combined_data.rename(columns = {\"Birth Rate\":\"Birth_rate\"})\n", "combined_data = combined_data.rename(columns = {\"Death Rate\":\"Death_rate\"})" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "combined_data[\"Density\"] = combined_data.Population / combined_data.Area\n", "combined_data[\"GDP_per_capita\"] = combined_data.GDP_millions / combined_data.Population * 1000000" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryCapitalRegionPopulationAreaDensityGDP_millionsGDP_per_capitaBirth_rateDeath_rate
0IndiaNew DelhiAsia13732331701283232.01070.13632011370978.88280.44286218.77.3
1AndorraAndorra La VellaEurope77204180.0428.9111113327.043093.6220927.37.4
2Sierra LeoneFreetownAfrica789492127394.0288.19891211361.61439.10243136.010.2
3LuxembourgLuxembourgEurope620853993.0625.22960763428.4102163.31402111.67.3
4Bahamas, TheNassauCentral America3913693858.0101.44349411725.029958.93900615.17.3
\n", "
" ], "text/plain": [ " Country Capital Region Population Area \\\n", "0 India New Delhi Asia 1373233170 1283232.0 \n", "1 Andorra Andorra La Vella Europe 77204 180.0 \n", "2 Sierra Leone Freetown Africa 7894921 27394.0 \n", "3 Luxembourg Luxembourg Europe 620853 993.0 \n", "4 Bahamas, The Nassau Central America 391369 3858.0 \n", "\n", " Density GDP_millions GDP_per_capita Birth_rate Death_rate \n", "0 1070.136320 11370978.8 8280.442862 18.7 7.3 \n", "1 428.911111 3327.0 43093.622092 7.3 7.4 \n", "2 288.198912 11361.6 1439.102431 36.0 10.2 \n", "3 625.229607 63428.4 102163.314021 11.6 7.3 \n", "4 101.443494 11725.0 29958.939006 15.1 7.3 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combined_data = combined_data[[\"Country\", \"Capital\", \"Region\", \"Population\", \"Area\", \"Density\",\n", " \"GDP_millions\", \"GDP_per_capita\", \"Birth_rate\", \"Death_rate\"]]\n", "combined_data.head(5)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryCapitalRegionPopulationAreaDensityGDP_millionsGDP_per_capitaBirth_rateDeath_rate
0IndiaNew Delhi013732331701283232.01070.13632011370978.88280.44286218.77.3
1AndorraAndorra La Vella177204180.0428.9111113327.043093.6220927.37.4
2Sierra LeoneFreetown2789492127394.0288.19891211361.61439.10243136.010.2
3LuxembourgLuxembourg1620853993.0625.22960763428.4102163.31402111.67.3
4Bahamas, TheNassau33913693858.0101.44349411725.029958.93900615.17.3
\n", "
" ], "text/plain": [ " Country Capital Region Population Area Density \\\n", "0 India New Delhi 0 1373233170 1283232.0 1070.136320 \n", "1 Andorra Andorra La Vella 1 77204 180.0 428.911111 \n", "2 Sierra Leone Freetown 2 7894921 27394.0 288.198912 \n", "3 Luxembourg Luxembourg 1 620853 993.0 625.229607 \n", "4 Bahamas, The Nassau 3 391369 3858.0 101.443494 \n", "\n", " GDP_millions GDP_per_capita Birth_rate Death_rate \n", "0 11370978.8 8280.442862 18.7 7.3 \n", "1 3327.0 43093.622092 7.3 7.4 \n", "2 11361.6 1439.102431 36.0 10.2 \n", "3 63428.4 102163.314021 11.6 7.3 \n", "4 11725.0 29958.939006 15.1 7.3 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ml_data = combined_data\n", "labels, uniques = pd.factorize(ml_data.Region)\n", "ml_data.Region = labels\n", "ml_data.head(5)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "ml_birth_data = ml_data[[\"Region\", \"Density\", \"GDP_per_capita\", \"Birth_rate\"]]\n", "ml_death_data = ml_data[[\"Region\", \"Density\", \"GDP_per_capita\", \"Death_rate\"]]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RegionDensityGDP_per_capitaBirth_rate
548.60162848799.22557312.0
13541332.09876515996.29286423.2
1221435.73590811080.14332412.1
16701326.48488439954.5678578.3
85179.47827432327.6989969.9
\n", "
" ], "text/plain": [ " Region Density GDP_per_capita Birth_rate\n", "5 4 8.601628 48799.225573 12.0\n", "135 4 1332.098765 15996.292864 23.2\n", "122 1 435.735908 11080.143324 12.1\n", "167 0 1326.484884 39954.567857 8.3\n", "85 1 79.478274 32327.698996 9.9" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "\n", "birth_train, birth_test = train_test_split(ml_birth_data, test_size=0.2, random_state=42)\n", "death_train, death_test = train_test_split(ml_death_data, test_size=0.2, random_state=42)\n", "birth_train.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RegionDensityGDP_per_capitaDeath_rate
548.60162848799.2255737.3
13541332.09876515996.2928645.9
1221435.73590811080.1433244.5
16701326.48488439954.5678576.3
85179.47827432327.69899612.7
\n", "
" ], "text/plain": [ " Region Density GDP_per_capita Death_rate\n", "5 4 8.601628 48799.225573 7.3\n", "135 4 1332.098765 15996.292864 5.9\n", "122 1 435.735908 11080.143324 4.5\n", "167 0 1326.484884 39954.567857 6.3\n", "85 1 79.478274 32327.698996 12.7" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "death_train.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }