Data Preparation: prep_iris and prep_titanic
Reproducibility puts the science in data science!
Docstrings are also your friends.
Helper functions work like plastic brick toys. ;)
By the end of the prepare lesson and exercises, you will be able to...
from sklearn.model_selection import train_test_split
# Template: two-step train/validate/test split (80/20, then 70/30 of the rest).
# NOTE(review): `df` and `target_column` are placeholders — substitute your own
# dataframe and target column; this snippet does not run as-is.
train_validate, test = train_test_split(df, test_size=.2,
random_state=123,
stratify=df.target_column)
train, validate = train_test_split(train_validate, test_size=.3,
random_state=123,
stratify=train_validate.target_column)
from sklearn.impute import SimpleImputer
# Template: impute missing values — fit on train only, then transform every split
# so no information leaks from validate/test into the fitted statistic.
# 'desired_strategy' is a placeholder (e.g. 'mean', 'median', 'most_frequent').
imputer = SimpleImputer(strategy = 'desired_strategy')
train['column_name'] = imputer.fit_transform(train[['column_name']])
validate['column_name'] = imputer.transform(validate[['column_name']])
test['column_name'] = imputer.transform(test[['column_name']])
# Template: one-hot encode a column, dropping the first level to avoid redundancy.
pd.get_dummies(df.column, drop_first=True)
from classification_acquire import get_titanic_data, get_iris_data
from classification_prepare import prep_iris, prep_titanic
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
# ignore warnings
import warnings
warnings.filterwarnings("ignore")
from darden_class_acquire import get_titanic_data, get_iris_data
# acquire the raw iris data and take a peek
iris = get_iris_data()
iris.head()
# drop the redundant numeric id and give species a friendlier name
iris = (iris
        .drop(columns='species_id')
        .rename(columns={'species_name': 'species'}))
iris.head(2)
iris.head(2)
# one-hot encode species, dropping the first level to avoid redundancy
dummy_cols = pd.get_dummies(iris.species, drop_first=True)
dummy_cols.head(3)
# bolt the dummy columns onto the dataframe
iris = pd.concat([iris, dummy_cols], axis=1)
iris.head()
def prep_iris(cached=True):
    '''
    Acquire and prepare the iris data.

    Reads from a local csv by default; passing cached=False acquires
    fresh data from the Codeup db and writes it to csv.
    Returns the iris df with dummy variables encoding species.
    '''
    # pull the raw data via the acquire helper
    df = get_iris_data(cached)
    # drop the redundant id column
    df = df.drop(columns='species_id')
    # shorten the species column name
    df = df.rename(columns={'species_name': 'species'})
    # one-hot encode species and attach the dummy columns
    dummy_cols = pd.get_dummies(df.species, drop_first=True)
    return pd.concat([df, dummy_cols], axis=1)
# sanity-check the iris helper
iris = prep_iris()
iris.sample(7)

# acquire the raw titanic data and inspect it
titanic = get_titanic_data()
titanic.head()
# look at the rows with missing embarkation info
titanic[titanic.embark_town.isnull()]
titanic[titanic.embarked.isnull()]
# keep only rows where the port of embarkation is known
titanic = titanic[titanic.embarked.notnull()]
titanic.info()
# deck is mostly missing, so drop the whole column
titanic = titanic.drop(columns='deck')
titanic.info()
# one-hot encode port of embarkation and attach the dummy columns
embarked_dummies = pd.get_dummies(titanic.embarked, drop_first=True)
embarked_dummies.sample(10)
titanic = pd.concat([titanic, embarked_dummies], axis=1)
titanic.head()
# split 80/20, then 70/30 of the remainder, stratifying on survived both times
train_validate, test = train_test_split(
    titanic, test_size=.2, random_state=123, stratify=titanic.survived)
train, validate = train_test_split(
    train_validate, test_size=.3, random_state=123,
    stratify=train_validate.survived)
for name, part in zip(('train', 'validate', 'test'), (train, validate, test)):
    print(f'{name} -> {part.shape}')
def titanic_split(df):
    '''
    Split the titanic dataframe into train, validate, and test sets.

    Splits 80/20 first, then 70/30 on the larger piece, stratifying on
    survived each time so the class balance is preserved in every split.
    Returns (train, validate, test).
    '''
    # carve off the final 20% as the test set
    train_validate, test = train_test_split(
        df, test_size=.2, random_state=123, stratify=df.survived)
    # split the remaining 80% into train (70%) and validate (30%)
    train, validate = train_test_split(
        train_validate, test_size=.3, random_state=123,
        stratify=train_validate.survived)
    return train, validate, test
# redo the split through the helper to confirm it matches the manual split
train, validate, test = titanic_split(titanic)
for name, part in zip(('train', 'validate', 'test'), (train, validate, test)):
    print(f'{name} -> {part.shape}')
train.head(2)

# mean-impute age: fit on train only, then apply to every split
imputer = SimpleImputer(strategy='mean')
train['age'] = imputer.fit_transform(train[['age']])
# sanity check: no nulls should remain in train.age
train['age'].isnull().sum()
validate['age'] = imputer.transform(validate[['age']])
test['age'] = imputer.transform(test[['age']])
def impute_mean_age(train, validate, test):
    '''
    Impute missing values in the age column with the mean age.

    The mean is computed on train only (so no information leaks from
    validate/test into the fitted statistic), then used to fill nulls
    in all three dataframes.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        Each must contain an 'age' column.

    Returns
    -------
    (train, validate, test) with age nulls filled.
    '''
    # learn the fill value from train only; mean() skips NaN by default,
    # matching SimpleImputer(strategy='mean') behavior
    mean_age = train['age'].mean()
    # fill nulls in each split with the train mean
    train['age'] = train['age'].fillna(mean_age)
    validate['age'] = validate['age'].fillna(mean_age)
    test['age'] = test['age'].fillna(mean_age)
    return train, validate, test
def prep_titanic(cached=True):
    '''
    Acquire and prepare the titanic data.

    Reads titanic data into a df from a local csv by default; passing
    cached=False acquires fresh data from the Codeup db.
    Returns prepped train, validate, and test dfs with embarked
    one-hot encoded, the deck column dropped, and age mean-imputed.
    '''
    # use my acquire function to read data into a df from a csv file
    df = get_titanic_data(cached)
    # drop rows where embarked/embark_town are null
    df = df[~df.embarked.isnull()]
    # encode embarked using dummy columns and join them back to df
    titanic_dummies = pd.get_dummies(df.embarked, drop_first=True)
    df = pd.concat([df, titanic_dummies], axis=1)
    # deck is mostly missing, so drop the whole column
    df = df.drop(columns='deck')
    # split data into train, validate, test dfs, stratified on survived
    train, validate, test = titanic_split(df)
    # impute the train mean of age into null values in all three splits
    train, validate, test = impute_mean_age(train, validate, test)
    return train, validate, test
# run the full pipeline and report the resulting split sizes
train, validate, test = prep_titanic()
for name, part in zip(('train', 'validate', 'test'), (train, validate, test)):
    print(f'{name} -> {part.shape}')