Problem statement: Using the independent variables, predict whether an individual is a smoker or not.
Download The Dataset
Download The Code File
Variables:
Independent Variables : age, sex, bmi, children, expenses
Dependent Variable : smoker
#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Preprocessing pipeline: load insurance.csv, impute missing numeric values,
# label-encode the categorical columns, and split into train/test sets.
# Change the working directory and set it as current console's working directory
# Importing the Dataset
dataset = pd.read_csv('insurance.csv')
# Features are every column except the last; the target is read from column
# index 5 (presumably `smoker`, the last column of a 6-column file -- confirm
# against the CSV).
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 5].values
X = pd.DataFrame(X)
y = pd.DataFrame(y)
# Taking care of Missing Data
import sklearn
# `sklearn.preprocessing.Imputer` was removed from scikit-learn; SimpleImputer
# is its replacement and expects np.nan rather than the string 'NaN'.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
# Columns 2 onward are imputed with the column mean. They are cast to float
# explicitly because X was rebuilt from a mixed-type array.
numeric_values = X.iloc[:, 2:].to_numpy(dtype = float)
imputer = imputer.fit(numeric_values)
# Assign through .iloc: writing into `X.values` may modify a temporary copy
# (or fail on a read-only array), leaving X unchanged.
X.iloc[:, 2:] = imputer.transform(numeric_values)
# Encoding Categorical Data
# Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X.iloc[:, 1] = labelencoder_X.fit_transform(X.iloc[:, 1].to_numpy())
# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
# LabelEncoder expects a 1-D array; flatten the single-column frame so no
# DataConversionWarning is raised.
y = labelencoder_y.fit_transform(y.to_numpy().ravel())
# Splitting the dataset into the Training set and Test set
# `sklearn.cross_validation` was removed; train_test_split now lives in
# sklearn.model_selection. No random_state is set, so the split differs
# between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)




















