Today I am checking out “How to get started with Machine Learning in about 10 minutes”. A few excerpts:
IMPORT
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
LOAD
# Read the training data from train.csv (looked up relative to the current
# working directory) into a DataFrame.
train_df = pd.read_csv("train.csv")
# Preview the first rows; in a notebook this is rendered as the cell output.
train_df.head()
CHECK MISSING
def missingdata(data):
    """Summarise and plot the missing values of a DataFrame.

    Draws a bar chart of the percentage of missing values per column on a
    new 8x6 matplotlib figure and returns the underlying table.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with two columns, ``Total``
        (number of missing values) and ``Percent`` (share of rows that are
        missing, 0-100), restricted to columns with at least one missing
        value and sorted by ``Percent`` in descending order.
    """
    null_counts = data.isnull().sum()
    ms = pd.DataFrame(
        {
            "Total": null_counts,
            # isnull().count() is the number of rows, so this is the share
            # of rows that are missing in each column.
            "Percent": null_counts / data.isnull().count() * 100,
        }
    )
    # Keep only the columns that actually have missing values.
    ms = ms[ms["Percent"] > 0].sort_values("Percent", ascending=False)

    _, ax = plt.subplots(figsize=(8, 6))
    # x and y are passed by keyword: recent seaborn releases no longer
    # accept them positionally.
    sns.barplot(x=ms.index, y=ms["Percent"], color="green", alpha=0.8, ax=ax)
    # Numeric rotation, applied after plotting so it affects the final
    # tick labels (feature names are easier to read vertically).
    ax.tick_params(axis="x", labelrotation=90)
    ax.set_xlabel("Features", fontsize=15)
    ax.set_ylabel("Percent of missing values", fontsize=15)
    ax.set_title("Percent missing data by feature", fontsize=15)
    return ms
# Build the missing-value table and bar chart for the training data.
missingdata(train_df)
CLEAN DATA
# Fill missing values by assigning the result back to the column. Calling
# fillna(..., inplace=True) on a column selection is a chained operation
# that is not guaranteed to update train_df itself.
# 'Embarked' gets its most frequent value, 'Age' gets its median.
train_df["Embarked"] = train_df["Embarked"].fillna(train_df["Embarked"].mode()[0])
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())

# Remove the 'Cabin' column entirely instead of imputing it.
drop_column = ["Cabin"]
train_df.drop(drop_column, axis=1, inplace=True)

# Report the remaining number of missing values per column.
print("check the nan value in train data")
print(train_df.isnull().sum())
See the rest of the code and the explanation here.