import numpy as np
import pandas as pd
# Toy dataset: two numeric columns (one containing a missing value) and one
# categorical column.
dictionary = {
    "Price": [10, 20, 40, 80],
    "Weight": [2, 5, 7, np.nan],
    "Fruit": ["Apple", "Banana", "Orange", "Orange"],
}
df = pd.DataFrame(data=dictionary)
df
# Numeric part: every column except "Fruit".
df_num = df.loc[:, ~df.columns.isin(["Fruit"])]
df_num
# Categorical part: double brackets keep the result a one-column DataFrame
# rather than a Series.
df_cat = df[["Fruit"]]
df_cat
from sklearn.preprocessing import OneHotEncoder
# Build a one-hot encoder that returns a dense array instead of a sparse matrix.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the current keyword first and fall back to
# the legacy one on older installations.
try:
    df_cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    df_cat_encoder = OneHotEncoder(sparse=False)
# Learn the categories from df_cat and encode it in a single step.
df_cat_1hot = df_cat_encoder.fit_transform(df_cat)
print(df_cat_1hot)
# Categories learned during fitting (one array per encoded column).
print(df_cat_encoder.categories_)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Numeric preprocessing: fill missing values with the column median, then
# standardize each column.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('stad_scaler', StandardScaler()),
    ]
)
# Fit the pipeline on the numeric columns and show the transformed array.
print(num_pipeline.fit_transform(df_num))
from sklearn.compose import ColumnTransformer
# Column names routed to each branch of the combined transformer.
num_attribs = df_num.columns.tolist()
cat_attribs = ["Fruit"]
print(num_attribs)
print(cat_attribs)

# Apply the numeric pipeline to the numeric columns and one-hot encoding to
# the categorical column, concatenating the results column-wise.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)
df_prepared = full_pipeline.fit_transform(df)
df_prepared
# Label the transformed columns: numeric names first, then one column per
# category. The categories are read from the encoder fitted inside
# full_pipeline (the "cat" branch) so the labels always describe the columns
# that full_pipeline actually produced, rather than those of a separately
# fitted encoder.
all_attributes = num_attribs + list(
    full_pipeline.named_transformers_["cat"].categories_[0]
)
# ColumnTransformer may return a SciPy sparse matrix when the stacked output
# is sparse enough; convert it to a dense array before building the DataFrame.
if hasattr(df_prepared, "toarray"):
    df_prepared = df_prepared.toarray()
df_prepared = pd.DataFrame(df_prepared, columns=all_attributes)
df_prepared