Day 17: Explore Machine Learning with Python-2

Today I am checking out “How to get started with Machine Learning in about 10 minutes”. A few excerpts:

IMPORT

import numpy as np import pandas as pdimport matplotlib.pyplot as pltimport seaborn as snsimport warningswarnings.filterwarnings(‘ignore’)%matplotlib inline

LOAD

# Read train.csv (resolved relative to the current working directory).
train_df = pd.read_csv("train.csv")

# Preview the first rows; in a notebook this is shown as the cell output.
train_df.head()

CHECK MISSING

def missingdata(data):
    """Summarize and plot the share of missing values per column.

    Draws a bar chart (one bar per column that has missing values) on a new
    matplotlib figure and returns the underlying summary table.

    Args:
        data: A pandas DataFrame.

    Returns:
        A DataFrame indexed by column name with the columns ``Total``
        (number of missing values) and ``Percent`` (missing values as a
        percentage of the row count), limited to columns with at least one
        missing value and sorted from most to least missing.
    """
    total = data.isnull().sum()
    # isnull().count() is the number of rows per column, so this is the
    # percentage of rows that are missing.
    percent = total / data.isnull().count() * 100

    ms = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    # Sort once, after combining, so both columns stay aligned row by row.
    ms = ms[ms["Percent"] > 0].sort_values("Percent", ascending=False)

    _, ax = plt.subplots(figsize=(8, 6))
    if not ms.empty:
        # Keyword x=/y= is required by seaborn >= 0.12 (positional data
        # arguments are no longer accepted).
        sns.barplot(x=ms.index, y=ms["Percent"], color="green", alpha=0.8, ax=ax)
    # Rotate the column names so long labels do not overlap.
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_xlabel('Features', fontsize=15)
    ax.set_ylabel('Percent of missing values', fontsize=15)
    ax.set_title('Percent missing data by feature', fontsize=15)
    return ms

# Build the missing-value summary for the training data; the call also
# draws the bar chart of missing percentages.
missingdata(train_df)

CLEAN DATA

# Fill missing 'Embarked' values with the column's most frequent value and
# missing 'Age' values with the column's median.
# Assign the result back instead of calling fillna(..., inplace=True) on
# train_df[col]: that chained in-place form warns in pandas 2.x and does not
# modify train_df when copy-on-write is enabled.
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

# Remove the 'Cabin' column from the frame.
drop_column = ['Cabin']
train_df.drop(drop_column, axis=1, inplace=True)

# Show the remaining per-column count of missing values.
print('check the nan value in train data')

print(train_df.isnull().sum())

See the rest of the code and the explanation here.

 

 

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s