The Algorithms logo
The Algorithms
AboutDonate

Support Vector Machine

t

Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Read Data

# Lift the column-count display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)
# Load the HR employee-attrition CSV into the frame every later cell works on.
# NOTE(review): the path is absolute (filesystem root) - adjust for your environment.
ibm = pd.read_csv('/WA_Fn-UseC_-HR-Employee-Attrition.csv')

Dataset Information

ibm.shape
(1470, 35)
ibm.describe()
Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
count 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000
mean 36.923810 802.485714 9.192517 2.912925 1.0 1024.865306 2.721769 65.891156 2.729932 2.063946 2.728571 6502.931293 14313.103401 2.693197 15.209524 3.153741 2.712245 80.0 0.793878 11.279592 2.799320 2.761224 7.008163 4.229252 2.187755 4.123129
std 9.135373 403.509100 8.106864 1.024165 0.0 602.024335 1.093082 20.329428 0.711561 1.106940 1.102846 4707.956783 7117.786044 2.498009 3.659938 0.360824 1.081209 0.0 0.852077 7.780782 1.289271 0.706476 6.126525 3.623137 3.222430 3.568136
min 18.000000 102.000000 1.000000 1.000000 1.0 1.000000 1.000000 30.000000 1.000000 1.000000 1.000000 1009.000000 2094.000000 0.000000 11.000000 3.000000 1.000000 80.0 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 30.000000 465.000000 2.000000 2.000000 1.0 491.250000 2.000000 48.000000 2.000000 1.000000 2.000000 2911.000000 8047.000000 1.000000 12.000000 3.000000 2.000000 80.0 0.000000 6.000000 2.000000 2.000000 3.000000 2.000000 0.000000 2.000000
50% 36.000000 802.000000 7.000000 3.000000 1.0 1020.500000 3.000000 66.000000 3.000000 2.000000 3.000000 4919.000000 14235.500000 2.000000 14.000000 3.000000 3.000000 80.0 1.000000 10.000000 3.000000 3.000000 5.000000 3.000000 1.000000 3.000000
75% 43.000000 1157.000000 14.000000 4.000000 1.0 1555.750000 4.000000 83.750000 3.000000 3.000000 4.000000 8379.000000 20461.500000 4.000000 18.000000 3.000000 4.000000 80.0 1.000000 15.000000 3.000000 3.000000 9.000000 7.000000 3.000000 7.000000
max 60.000000 1499.000000 29.000000 5.000000 1.0 2068.000000 4.000000 100.000000 4.000000 5.000000 4.000000 19999.000000 26999.000000 9.000000 25.000000 4.000000 4.000000 80.0 3.000000 40.000000 6.000000 4.000000 40.000000 18.000000 15.000000 17.000000
import statistics

# Print the most frequent value of every column of the dataset.
for i in ibm.columns:
    column_mode = statistics.mode(ibm[i])
    print(i, " mode: ", column_mode)
Age  mode:  35
Attrition  mode:  No
BusinessTravel  mode:  Travel_Rarely
DailyRate  mode:  691
Department  mode:  Research & Development
DistanceFromHome  mode:  2
Education  mode:  3
EducationField  mode:  Life Sciences
EmployeeCount  mode:  1
EmployeeNumber  mode:  1
EnvironmentSatisfaction  mode:  3
Gender  mode:  Male
HourlyRate  mode:  66
JobInvolvement  mode:  3
JobLevel  mode:  1
JobRole  mode:  Sales Executive
JobSatisfaction  mode:  4
MaritalStatus  mode:  Married
MonthlyIncome  mode:  2342
MonthlyRate  mode:  9150
NumCompaniesWorked  mode:  1
Over18  mode:  Y
OverTime  mode:  No
PercentSalaryHike  mode:  11
PerformanceRating  mode:  3
RelationshipSatisfaction  mode:  3
StandardHours  mode:  80
StockOptionLevel  mode:  0
TotalWorkingYears  mode:  10
TrainingTimesLastYear  mode:  2
WorkLifeBalance  mode:  3
YearsAtCompany  mode:  5
YearsInCurrentRole  mode:  2
YearsSinceLastPromotion  mode:  0
YearsWithCurrManager  mode:  2
ibm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB

Data Preprocessing

# Remove columns that are not used as features. describe() shows zero variance
# for EmployeeCount and StandardHours; EmployeeNumber and Over18 are presumably
# a row identifier and a constant flag - verify against the data dictionary.
ibm.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'],
    inplace=True,
)
# drop_duplicates() returns a new frame unless inplace=True, so mutate ibm
# directly to make the de-duplication stick, then display the frame.
ibm.drop_duplicates(inplace=True)
ibm
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Yes 11 3 1 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 No 23 4 4 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Yes 15 3 2 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Yes 11 3 3 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 No 12 3 4 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 36 No Travel_Frequently 884 Research & Development 23 2 Medical 3 Male 41 4 2 Laboratory Technician 4 Married 2571 12290 4 No 17 3 3 1 17 3 3 5 2 0 3
1466 39 No Travel_Rarely 613 Research & Development 6 1 Medical 4 Male 42 2 3 Healthcare Representative 1 Married 9991 21457 4 No 15 3 1 1 9 5 3 7 7 1 7
1467 27 No Travel_Rarely 155 Research & Development 4 3 Life Sciences 2 Male 87 4 2 Manufacturing Director 2 Married 6142 5174 1 Yes 20 4 2 1 6 0 3 6 2 0 3
1468 49 No Travel_Frequently 1023 Sales 2 3 Medical 4 Male 63 2 2 Sales Executive 2 Married 5390 13243 2 No 14 3 4 0 17 3 2 9 6 0 8
1469 34 No Travel_Rarely 628 Research & Development 8 3 Medical 2 Male 82 4 2 Laboratory Technician 3 Married 4404 10228 2 No 12 3 1 0 6 3 4 4 3 1 2

1470 rows × 31 columns

ibm.isnull().sum()
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64
def _insert_dummies(frame, source, start, renames):
    """Swap categorical column *source* for 0/1 indicator columns, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to modify in place.
    source : str
        Name of the categorical column to expand and then drop.
    start : int
        Column position at which the first indicator is inserted; the
        following indicators go to ``start + 1``, ``start + 2``, ...
    renames : dict
        Maps each category value to the name of its indicator column.
        Indicators are inserted in the dict's iteration order.
    """
    # Newer pandas returns bool indicators by default; force integers because
    # the rest of the notebook treats these columns as 0/1 numbers.
    dummies = pd.get_dummies(frame[source], dtype=int)
    for offset, (category, new_name) in enumerate(renames.items()):
        frame.insert(start + offset, new_name, dummies[category])
    frame.drop(columns=source, inplace=True)


# Text-to-integer encodings use Series.map so the result is a numeric column
# regardless of pandas version. A value missing from a mapping becomes NaN.

# Attrition (0 - No, 1 - Yes)
ibm['Attrition'] = ibm['Attrition'].map({'Yes': 1, 'No': 0})
# BusinessTravel (0 - Non-Travel, 1 - Travel_Rarely, 2 - Travel_Frequently)
ibm['BusinessTravel'] = ibm['BusinessTravel'].map(
    {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2}
)

# The insert positions below are fixed column indices; they assume the column
# order left by the earlier column drops.

# Department dummy variables
# NOTE(review): 'Dp_Sales&Development' holds the 'Research & Development'
# indicator; the label is kept so existing column references stay valid.
_insert_dummies(ibm, 'Department', 5, {
    'Research & Development': 'Dp_Sales&Development',
    'Sales': 'Dp_Sales',
    'Human Resources': 'Dp_HumanResources',
})

# EducationField dummy variables
_insert_dummies(ibm, 'EducationField', 11, {
    'Life Sciences': 'EF_Life Sciences',
    'Medical': 'EF_Medical',
    'Marketing': 'EF_Marketing',
    'Technical Degree': 'EF_TechnicalDegree',
    'Human Resources': 'EF_HumanResources',
    'Other': 'EF_Other',
})

# Gender (0 - Male; 1 - Female)
ibm['Gender'] = ibm['Gender'].map({'Male': 0, 'Female': 1})

# JobRole dummy variables
_insert_dummies(ibm, 'JobRole', 23, {
    'Healthcare Representative': 'JR_HealthcareRepresentive',
    'Human Resources': 'JR_HumanResource',
    'Laboratory Technician': 'JR_LaboratoryTechnician',
    'Manager': 'JR_Manager',
    'Manufacturing Director': 'JR_ManufacturingDirector',
    'Research Director': 'JR_ResearchDirector',
    'Research Scientist': 'JR_ResearchScientist',
    'Sales Executive': 'JR_SalesExecutive',
    'Sales Representative': 'JR_SalesRepresentative',
})

# MaritalStatus dummy variables
_insert_dummies(ibm, 'MaritalStatus', 34, {
    'Married': 'MS_Married',
    'Single': 'MS_Single',
    'Divorced': 'MS_Divorced',
})

# OverTime (0 - No; 1 - Yes)
ibm['OverTime'] = ibm['OverTime'].map({'No': 0, 'Yes': 1})

# Over18 (0 - N; 1 - Y). The column is dropped earlier in the preprocessing,
# so this only runs if that drop is removed.
if 'Over18' in ibm.columns:
    ibm['Over18'] = ibm['Over18'].map({'N': 0, 'Y': 1})
def iqr_outliers(data):
    """Return the values of *data* that fall outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are ``data.quantile(0.25)`` and ``data.quantile(0.75)``. Values strictly
    beyond either fence are collected in iteration order (duplicates kept).
    The collected values and their count are printed before being returned.

    Parameters
    ----------
    data : object exposing ``quantile`` and iteration (e.g. a pandas Series)

    Returns
    -------
    list
        The out-of-fence values.
    """
    q1, q3 = data.quantile(0.25), data.quantile(0.75)
    spread = q3 - q1

    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    flagged = [value for value in data if value > high_fence or value < low_fence]

    print("Outliers:",flagged , "\nCount: ", len(flagged), "\n")
    return flagged
# Print each column's name followed by its IQR outlier report.
for c_name in ibm:
    print (c_name)
    column_values = ibm[c_name]
    iqr_outliers(column_values)
Age
Outliers: [] 
Count:  0 

Attrition
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  237 

BusinessTravel
Outliers: [2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2] 
Count:  427 

DailyRate
Outliers: [] 
Count:  0 

Dp_Sales&Development
Outliers: [] 
Count:  0 

Dp_Sales
Outliers: [] 
Count:  0 

Dp_HumanResources
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  63 

DistanceFromHome
Outliers: [] 
Count:  0 

Education
Outliers: [] 
Count:  0 

EnvironmentSatisfaction
Outliers: [] 
Count:  0 

EF_Life Sciences
Outliers: [] 
Count:  0 

EF_Medical
Outliers: [] 
Count:  0 

EF_Marketing
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  159 

EF_TechnicalDegree
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  132 

EF_HumanResources
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  27 

EF_Other
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  82 

Gender
Outliers: [] 
Count:  0 

HourlyRate
Outliers: [] 
Count:  0 

JobInvolvement
Outliers: [] 
Count:  0 

JobLevel
Outliers: [] 
Count:  0 

JobSatisfaction
Outliers: [] 
Count:  0 

JR_HealthcareRepresentive
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  131 

JR_HumanResource
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  52 

JR_LaboratoryTechnician
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  259 

JR_Manager
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  102 

JR_ManufacturingDirector
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  145 

JR_ResearchDirector
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  80 

JR_ResearchScientist
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  292 

JR_SalesExecutive
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  326 

JR_SalesRepresentative
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  83 

MonthlyIncome
Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880] 
Count:  114 

MonthlyRate
Outliers: [] 
Count:  0 

NumCompaniesWorked
Outliers: [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9] 
Count:  52 

MS_Married
Outliers: [] 
Count:  0 

MS_Single
Outliers: [] 
Count:  0 

MS_Divorced
Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 
Count:  327 

OverTime
Outliers: [] 
Count:  0 

PercentSalaryHike
Outliers: [] 
Count:  0 

PerformanceRating
Outliers: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] 
Count:  226 

RelationshipSatisfaction
Outliers: [] 
Count:  0 

StockOptionLevel
Outliers: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 
Count:  85 

TotalWorkingYears
Outliers: [31, 29, 37, 38, 30, 40, 36, 34, 32, 33, 37, 30, 36, 31, 33, 32, 37, 31, 32, 32, 30, 34, 30, 40, 29, 35, 31, 33, 31, 29, 32, 30, 33, 30, 29, 31, 32, 33, 36, 34, 31, 36, 33, 31, 29, 33, 29, 32, 31, 35, 29, 32, 34, 36, 32, 30, 36, 29, 34, 37, 29, 29, 35] 
Count:  63 

TrainingTimesLastYear
Outliers: [0, 5, 5, 5, 6, 5, 5, 5, 6, 6, 0, 0, 0, 5, 0, 5, 5, 5, 6, 6, 5, 0, 6, 5, 5, 0, 5, 5, 6, 5, 5, 5, 0, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5, 0, 0, 5, 5, 5, 6, 6, 5, 0, 5, 0, 5, 5, 0, 6, 0, 5, 5, 6, 6, 5, 6, 5, 0, 5, 5, 5, 5, 0, 6, 5, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5, 0, 5, 0, 5, 5, 6, 5, 6, 5, 0, 5, 5, 0, 6, 6, 5, 6, 0, 5, 0, 6, 6, 6, 6, 5, 5, 0, 5, 0, 0, 6, 0, 6, 5, 6, 5, 5, 0, 5, 6, 6, 5, 5, 0, 0, 6, 0, 0, 5, 0, 5, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 6, 5, 6, 6, 0, 6, 6, 5, 5, 0, 0, 6, 6, 0, 5, 0, 0, 0, 0, 0, 5, 5, 6, 5, 5, 0, 5, 5, 0, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 0, 0, 5, 5, 5, 5, 6, 0, 0, 6, 6, 6, 6, 5, 5, 5, 6, 5, 0, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 0, 5, 5, 5, 5, 5, 0, 0, 0, 6, 5, 6, 6, 5, 6, 0, 6, 6, 5, 6, 6, 5, 5, 5, 0] 
Count:  238 

WorkLifeBalance
Outliers: [] 
Count:  0 

YearsAtCompany
Outliers: [25, 22, 22, 27, 21, 22, 37, 25, 20, 40, 20, 24, 20, 24, 33, 20, 19, 22, 33, 24, 19, 21, 20, 36, 20, 20, 22, 24, 21, 21, 25, 21, 29, 20, 27, 20, 31, 32, 20, 20, 21, 22, 22, 34, 24, 26, 31, 20, 31, 26, 19, 21, 21, 32, 21, 19, 20, 22, 20, 21, 26, 20, 22, 24, 33, 29, 25, 21, 19, 19, 20, 19, 33, 19, 19, 20, 20, 20, 20, 20, 32, 20, 21, 33, 36, 26, 30, 22, 23, 23, 21, 21, 22, 22, 19, 22, 19, 22, 20, 20, 20, 22, 20, 20] 
Count:  104 

YearsInCurrentRole
Outliers: [15, 16, 18, 15, 18, 17, 16, 15, 16, 15, 16, 16, 15, 16, 17, 15, 15, 15, 17, 17, 16] 
Count:  21 

YearsSinceLastPromotion
Outliers: [8, 15, 8, 8, 9, 13, 12, 10, 11, 9, 12, 15, 15, 15, 9, 11, 11, 9, 12, 11, 15, 11, 10, 9, 11, 9, 8, 11, 11, 8, 13, 9, 9, 12, 10, 11, 15, 13, 9, 11, 10, 8, 8, 11, 9, 11, 12, 11, 14, 13, 14, 8, 11, 15, 10, 11, 11, 15, 11, 13, 11, 13, 15, 8, 13, 15, 11, 14, 15, 15, 9, 11, 9, 8, 9, 15, 11, 12, 9, 8, 10, 14, 8, 13, 13, 12, 14, 8, 8, 8, 14, 14, 8, 12, 13, 14, 14, 12, 11, 8, 11, 9, 12, 8, 9, 11, 9] 
Count:  107 

YearsWithCurrManager
Outliers: [17, 15, 15, 15, 15, 17, 16, 17, 15, 17, 17, 17, 17, 16] 
Count:  14 

def remove_outliers(c_name):
    """Drop rows of the global ``ibm`` until column *c_name* has no IQR outliers.

    Each pass asks ``iqr_outliers`` for the current outlier values of the
    column, drops (in place) every row whose value equals one of them, and
    then re-evaluates. Removing rows shifts the quartiles, so new outliers
    can appear; the loop ends once a pass reports none.

    Parameters
    ----------
    c_name : str
        Name of the ``ibm`` column to clean.
    """
    flagged = iqr_outliers(ibm[c_name])

    while flagged:
        for value in flagged:
            matching = ibm[c_name] == value
            ibm.drop(ibm.index[matching], inplace=True)
        flagged = iqr_outliers(ibm[c_name])
remove_outliers('MonthlyIncome')
Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880] 
Count:  114 

Outliers: [15427, 13458, 14756, 13245, 13664, 13503, 13549, 13872, 13734, 13591, 16064, 13675, 13496, 13603, 13525, 16015, 13964, 15992, 14336, 13212, 16555, 14118, 13610, 13237, 16184, 15402, 14814, 13770, 16307, 13826, 14275, 13582, 14852, 13194, 13973, 13726, 13320, 13120, 13499, 13758, 13191, 16124, 13577, 14026, 13142, 13695, 13402, 13247, 14732, 16422, 13757, 16032, 16328, 14411, 16437, 15202, 16413, 13269, 13966, 15972, 15379, 12936, 12965, 13116, 13464, 16291, 15787, 13225, 13348, 13341, 13206, 13744, 13570] 
Count:  73 

Outliers: [11994, 12490, 12185, 11849, 11996, 12061, 11878, 12504, 11935, 12808, 11836, 12742, 11904, 12169, 11916, 11957, 12031] 
Count:  17 

Outliers: [11713, 11691] 
Count:  2 

Outliers: [11631] 
Count:  1 

Outliers: [] 
Count:  0 

ibm
Age Attrition BusinessTravel DailyRate Dp_Sales&Development Dp_Sales Dp_HumanResources DistanceFromHome Education EnvironmentSatisfaction EF_Life Sciences EF_Medical EF_Marketing EF_TechnicalDegree EF_HumanResources EF_Other Gender HourlyRate JobInvolvement JobLevel JobSatisfaction JR_HealthcareRepresentive JR_HumanResource JR_LaboratoryTechnician JR_Manager JR_ManufacturingDirector JR_ResearchDirector JR_ResearchScientist JR_SalesExecutive JR_SalesRepresentative MonthlyIncome MonthlyRate NumCompaniesWorked MS_Married MS_Single MS_Divorced OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 1 1 1102 0 1 0 1 2 2 1 0 0 0 0 0 1 94 3 2 4 0 0 0 0 0 0 0 1 0 5993 19479 8 0 1 0 1 11 3 1 0 8 0 1 6 4 0 5
1 49 0 2 279 1 0 0 8 1 3 1 0 0 0 0 0 0 61 2 2 2 0 0 0 0 0 0 1 0 0 5130 24907 1 1 0 0 0 23 4 4 1 10 3 3 10 7 1 7
2 37 1 1 1373 1 0 0 2 2 4 0 0 0 0 0 1 0 92 2 1 3 0 0 1 0 0 0 0 0 0 2090 2396 6 0 1 0 1 15 3 2 0 7 3 3 0 0 0 0
3 33 0 2 1392 1 0 0 3 4 4 1 0 0 0 0 0 1 56 3 1 3 0 0 0 0 0 0 1 0 0 2909 23159 1 1 0 0 1 11 3 3 0 8 3 3 8 7 3 0
4 27 0 1 591 1 0 0 2 1 1 0 1 0 0 0 0 0 40 3 1 2 0 0 1 0 0 0 0 0 0 3468 16632 9 1 0 0 0 12 3 4 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 36 0 2 884 1 0 0 23 2 3 0 1 0 0 0 0 0 41 4 2 4 0 0 1 0 0 0 0 0 0 2571 12290 4 1 0 0 0 17 3 3 1 17 3 3 5 2 0 3
1466 39 0 1 613 1 0 0 6 1 4 0 1 0 0 0 0 0 42 2 3 1 1 0 0 0 0 0 0 0 0 9991 21457 4 1 0 0 0 15 3 1 1 9 5 3 7 7 1 7
1467 27 0 1 155 1 0 0 4 3 2 1 0 0 0 0 0 0 87 4 2 2 0 0 0 0 1 0 0 0 0 6142 5174 1 1 0 0 1 20 4 2 1 6 0 3 6 2 0 3
1468 49 0 2 1023 0 1 0 2 3 4 0 1 0 0 0 0 0 63 2 2 2 0 0 0 0 0 0 0 1 0 5390 13243 2 1 0 0 0 14 3 4 0 17 3 2 9 6 0 8
1469 34 0 1 628 1 0 0 8 3 2 0 1 0 0 0 0 0 82 4 2 3 0 0 1 0 0 0 0 0 0 4404 10228 2 1 0 0 0 12 3 1 0 6 3 4 4 3 1 2

1263 rows × 48 columns

Classification

Support Vector Machine (prepared by Teh Liang Sean)

# import important library to do SVM
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
# The SVM target is employee attrition: predict whether an employee will stay
# with or leave IBM. Every other column is a candidate feature.
y_svm = ibm['Attrition']
x_svm_find = ibm.drop('Attrition', axis=1)

# Score the candidate features with SelectKBest and the chi-squared (chi²)
# test, which requires non-negative feature values, to find the top 15.
from sklearn.feature_selection import SelectKBest, chi2

best_15_features = SelectKBest(score_func=chi2, k=15)
fit = best_15_features.fit(x_svm_find, y_svm)

# Put feature names and their scores side by side for easier reading.
# Despite its name, the combined frame holds the score of every feature.
dfcolumns = pd.DataFrame(x_svm_find.columns)
dfscores = pd.DataFrame(fit.scores_)
top_15_feature_scores = pd.concat([dfcolumns, dfscores], axis=1)
top_15_feature_scores.columns = ['Features', 'Score']

# Show the 15 highest-scoring features.
print(top_15_feature_scores.nlargest(15, 'Score'))
                     Features         Score
29              MonthlyIncome  26471.159476
30                MonthlyRate   1308.443569
2                   DailyRate   1111.594737
44         YearsInCurrentRole    109.263859
43             YearsAtCompany    103.805057
46       YearsWithCurrManager    100.636711
40          TotalWorkingYears     95.843571
35                   OverTime     60.367656
6            DistanceFromHome     57.197704
0                         Age     46.705340
28     JR_SalesRepresentative     27.299127
33                  MS_Single     26.251695
39           StockOptionLevel     24.376114
20  JR_HealthcareRepresentive     10.935616
24   JR_ManufacturingDirector      9.987076
# Build the SVM feature matrix from the 15 best features identified by the
# chi-squared ranking, listed in descending score order.
top_15_feature_names = [
    'MonthlyIncome',
    'MonthlyRate',
    'DailyRate',
    'YearsInCurrentRole',
    'YearsAtCompany',
    'YearsWithCurrManager',
    'TotalWorkingYears',
    'OverTime',
    'DistanceFromHome',
    'Age',
    'JR_SalesRepresentative',
    'MS_Single',
    'StockOptionLevel',
    'JR_HealthcareRepresentive',
    'JR_ManufacturingDirector',
]
# Selecting by name guarantees every column keeps exactly the label it has in
# `ibm` (no hand-typed labels that can pick up a stray trailing space), and
# the copy keeps later edits to the feature frame from touching `ibm`.
ibm_svm_features_df = ibm[top_15_feature_names].copy()
ibm_svm_features_df
MonthlyIncome MonthlyRate DailyRate YearsInCurrentRole YearsAtCompany YearsWithCurrManager TotalWorkingYears OverTime DistanceFromHome Age JR_SalesRepresentative MS_Single StockOptionLevel JR_HealthcareRepresentive JR_ManufacturingDirector
0 5993 19479 1102 4 6 5 8 1 1 41 0 1 0 0 0
1 5130 24907 279 7 10 7 10 0 8 49 0 0 1 0 0
2 2090 2396 1373 0 0 0 7 1 2 37 0 1 0 0 0
3 2909 23159 1392 7 8 0 8 1 3 33 0 0 0 0 0
4 3468 16632 591 2 2 2 6 0 2 27 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 2571 12290 884 2 5 3 17 0 23 36 0 0 1 0 0
1466 9991 21457 613 7 7 7 9 0 6 39 0 0 1 1 0
1467 6142 5174 155 2 6 3 6 1 4 27 0 0 1 0 1
1468 5390 13243 1023 6 9 8 17 0 2 49 0 0 0 0 0
1469 4404 10228 628 3 4 2 6 0 8 34 0 0 0 0 0

1263 rows × 15 columns

# Use the 15 selected features as the SVM input
x_svm = ibm_svm_features_df

# Standardise every feature so that large-valued columns (e.g. MonthlyIncome)
# do not dominate the distance computations used by Tomek links and the SVM.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the scaling - consider fitting on the
# training portion only.
from sklearn.preprocessing import StandardScaler
s_scaler = StandardScaler()
x_scaled_svm = s_scaler.fit_transform(x_svm)

# Attrition is imbalanced (few positive samples). Use Tomek links to
# under-sample every class except the minority one.
from imblearn.under_sampling import TomekLinks

tl_svm = TomekLinks(sampling_strategy='not minority')
# Resample the *scaled* features; passing the raw `x_svm` here would silently
# discard the scaling step above.
x_tl_svm, y_tl_svm = tl_svm.fit_resample(x_scaled_svm, y_svm)

# Train the models on 80% of the data and test on the remaining 20%,
# stratified so both parts keep the same class proportions.
x_train_svm, x_test_svm, y_train_svm, y_test_svm = train_test_split(
    x_tl_svm, y_tl_svm, test_size=0.2, random_state=40, stratify=y_tl_svm)

# Model 1: SVM with manually chosen hyperparameters
model_1_svm = svm.SVC(C=2, kernel='sigmoid', gamma='scale', coef0=0.6,
                      random_state=40, probability=True)
model_1_svm.fit(x_train_svm, y_train_svm)
y_predict_1_svm = model_1_svm.predict(x_test_svm)
# Model 2 uses GridSearchCV to find the best SVM hyperparameters by
# cross-validation. Only some of the hyperparameters are tuned.
from sklearn.model_selection import GridSearchCV

# Search space:
#   kernel - kernel type: linear / rbf / sigmoid
#   C      - regularization parameter: 1.0 up to (excluding) 3.0, step 0.1
#   coef0  - independent term of the kernel: 0.0 up to (excluding) 1.5, step 0.1
#   degree - degree of the polynomial kernel: 3, 4, 5
#   gamma  - kernel coefficient: 'auto' or 'scale'
# NOTE(review): according to the scikit-learn SVC documentation, `degree` only
# affects the 'poly' kernel (not searched here) and `coef0` only affects
# 'poly' and 'sigmoid', so many of the candidates below are redundant.
param_grid = {
    'kernel': ('linear', 'rbf', 'sigmoid'),
    'C': list(np.arange(1.0, 3.0, 0.1)),
    'coef0': list(np.arange(0.0, 1.5, 0.1)),
    'degree': list(np.arange(3, 6, 1)),
    'gamma': ('auto', 'scale'),
}

# Base estimator with a fixed random state (40)
find_best_para_model = svm.SVC(random_state=40)
Grid_search_svm = GridSearchCV(
    estimator=find_best_para_model,
    param_grid=param_grid,
    cv=4,
    n_jobs=-1,
    verbose=2,
)
# This may take some time to run
Grid_search_svm.fit(x_train_svm, y_train_svm)
Fitting 4 folds for each of 5400 candidates, totalling 21600 fits
# Show the best hyperparameter combination found by the grid search
# (a dict keyed by the parameter names used in param_grid)
Grid_search_svm.best_params_
{'C': 2.8000000000000016,
 'coef0': 0.0,
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf'}
# Build model 2 from the hyperparameters selected by the grid search.
# Reading them from `best_params_` (instead of copying the printed values by
# hand) keeps this model in sync whenever the search is re-run.
# probability=True enables predict_proba, which the ROC curve needs.
model_2_svm = svm.SVC(**Grid_search_svm.best_params_,
                      probability=True, random_state=40)
model_2_svm.fit(x_train_svm, y_train_svm)
y_predict_2_svm = model_2_svm.predict(x_test_svm)
# Evaluate the accuracy of both classifiers on the held-out test set
accuracy_model_1_svm = metrics.accuracy_score(y_test_svm, y_predict_1_svm)
accuracy_model_2_svm = metrics.accuracy_score(y_test_svm, y_predict_2_svm)

print('Accuracy of prediction classification result for 2 model')
print('Hyperparameters that try to tune manually (model 1): ', accuracy_model_1_svm)
print('Best hyperparameters found using GridSearchCV (model 2): ', accuracy_model_2_svm)
Accuracy of prediction classification result for 2 model
Hyperparameters that try to tune manually (model 1):  0.7416666666666667
Best hyperparameters found using GridSearchCV (model 2):  0.8166666666666667
# Evaluate model 2 with a confusion matrix
# (rows = true class, columns = predicted class, in the order of `labels`)
from sklearn.metrics import confusion_matrix
# `labels` must be passed by keyword: passing it positionally raises a
# FutureWarning in scikit-learn 0.24 and an error from version 1.0 onwards.
print(confusion_matrix(y_test_svm, y_predict_2_svm, labels=[0, 1]))

# Evaluate model 2 with precision, recall and F1-measure per class
from sklearn.metrics import classification_report
print(classification_report(y_test_svm, y_predict_2_svm))
[[183  12]
 [ 32  13]]
              precision    recall  f1-score   support

           0       0.85      0.94      0.89       195
           1       0.52      0.29      0.37        45

    accuracy                           0.82       240
   macro avg       0.69      0.61      0.63       240
weighted avg       0.79      0.82      0.79       240

C:\Users\USER\anaconda3\lib\site-packages\sklearn\utils\validation.py:70: FutureWarning: Pass labels=[0, 1] as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
  warnings.warn(f"Pass {args_msg} as keyword args. From version "
from matplotlib import pyplot as plt
# Evaluate model 2 with a ROC curve and its area under the curve (AUC).
# roc_auc_score is imported here as well so that this cell does not depend on
# an import made elsewhere in the notebook.
from sklearn.metrics import roc_auc_score, roc_curve

# Predicted probability of the positive class (second column of predict_proba)
y_pred_prob_svm = model_2_svm.predict_proba(x_test_svm)[:, 1]
fpr, tpr, threshold = roc_curve(y_test_svm, y_pred_prob_svm)

# Dashed diagonal = reference line of a random classifier
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
auc = roc_auc_score(y_test_svm, y_pred_prob_svm)
plt.title(f'AUC: {auc}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()