In [46]:
import numpy as np
import pandas as pd

# Toy dataset: two numeric columns and one categorical column.
dictionary = {
    "Price": [10, 20, 40, 80],
    "Weight": [2, 5, 7, np.nan],  # np.nan marks a missing weight in the last row
    "Fruit": ["Apple", "Banana", "Orange", "Orange"],
}
df = pd.DataFrame(data=dictionary)

df
Out[46]:
Price Weight Fruit
0 10 2.0 Apple
1 20 5.0 Banana
2 40 7.0 Orange
3 80 NaN Orange
In [47]:
# Numeric part of the frame: every column except the categorical "Fruit" column.
df_num = df[[name for name in df.columns if name != "Fruit"]]
df_num
Out[47]:
Price Weight
0 10 2.0
1 20 5.0
2 40 7.0
3 80 NaN
In [48]:
# Categorical part of the frame, kept as a one-column DataFrame (not a Series).
df_cat = df.loc[:, ["Fruit"]]
df_cat
Out[48]:
Fruit
0 Apple
1 Banana
2 Orange
3 Orange
In [49]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column into a dense NumPy array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old spelling, so try the current name first and fall back to the
# legacy keyword on older installations (an unknown keyword raises TypeError).
try:
    df_cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    df_cat_encoder = OneHotEncoder(sparse=False)
df_cat_1hot = df_cat_encoder.fit_transform(df_cat)

# Show the encoded matrix and the category order that defines its columns.
print(df_cat_1hot)
print(df_cat_encoder.categories_)
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]]
[array(['Apple', 'Banana', 'Orange'], dtype=object)]
In [50]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


# Numeric preprocessing: fill missing values with the column median, then
# standardize each column with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("stad_scaler", StandardScaler()),
    ]
)

# Fit on the numeric columns and print the transformed array.
print(num_pipeline.fit_transform(df_num))
[[-1.02575529 -1.54030809]
 [-0.65275337  0.14002801]
 [ 0.09325048  1.26025208]
 [ 1.58525817  0.14002801]]
In [51]:
from sklearn.compose import ColumnTransformer

# Column names routed to the numeric and the categorical branch, respectively.
num_attribs = df_num.columns.tolist()
cat_attribs = ["Fruit"]

# Print both lists, one per line.
print(num_attribs, cat_attribs, sep="\n")
['Price', 'Weight']
['Fruit']
In [53]:
# Run the numeric pipeline on the numeric columns and one-hot encode the
# categorical ones; ColumnTransformer concatenates the results column-wise.
#
# sparse_threshold=0 forces a dense ndarray. With the default threshold the
# result silently becomes a SciPy sparse matrix once the one-hot block makes
# the combined output sparse enough, which would break code that expects an
# ndarray (e.g. wrapping the result in a DataFrame).
full_pipeline = ColumnTransformer(
    [
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ],
    sparse_threshold=0,
)

df_prepared = full_pipeline.fit_transform(df)

df_prepared
Out[53]:
array([[-1.02575529, -1.54030809,  1.        ,  0.        ,  0.        ],
       [-0.65275337,  0.14002801,  0.        ,  1.        ,  0.        ],
       [ 0.09325048,  1.26025208,  0.        ,  0.        ,  1.        ],
       [ 1.58525817,  0.14002801,  0.        ,  0.        ,  1.        ]])
In [61]:
# Column labels for the prepared matrix: the numeric column names followed by
# the categories learned by the encoder that full_pipeline itself fitted.
# Reading them from the fitted "cat" transformer (rather than from a separately
# fitted encoder) guarantees the labels match the columns actually produced.
all_attributes = num_attribs + list(
    full_pipeline.named_transformers_["cat"].categories_[0]
)

# Wrap the array in a DataFrame, keeping the row labels of the source frame.
df_prepared = pd.DataFrame(df_prepared, columns=all_attributes, index=df.index)

df_prepared
Out[61]:
Price Weight Apple Banana Orange
0 -1.025755 -1.540308 1.0 0.0 0.0
1 -0.652753 0.140028 0.0 1.0 0.0
2 0.093250 1.260252 0.0 0.0 1.0
3 1.585258 0.140028 0.0 0.0 1.0
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [53]:
 
In [ ]:
 
In [ ]: