from sklearn import datasets
import pandas as pd

# Fetch the iris dataset bundle and wrap its `data` attribute in a DataFrame.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']


def abc(k, *val):
    """Compare ``k`` against a threshold and return a 0/1 flag.

    Args:
        k: Value to test.
        *val: Extra positional arguments; only the first one is read and it
            serves as the threshold.

    Returns:
        0 when ``k < val[0]``, otherwise 1.

    Raises:
        IndexError: If no threshold is supplied.
    """
    threshold = val[0]
    return 0 if k < threshold else 1


df.sl.apply(abc, args=(5,))
# Notebook output of the call above starts here (first row); the remaining
# rows of the printed Series follow.
# 0 1
1 0
2 0
3 0
4 1
5 1
6 0
7 1
8 0
9 0
10 1
11 0
12 0
13 0
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22 0
23 1
24 0
25 1
26 1
27 1
28 1
29 0
..
120 1
121 1
122 1
123 1
124 1
125 1
126 1
127 1
128 1
129 1
130 1
131 1
132 1
133 1
134 1
135 1
136 1
137 1
138 1
139 1
140 1
141 1
142 1
143 1
144 1
145 1
146 1
147 1
148 1
149 1
# Name: sl, dtype: int64   (tail of the printed Series output above)


def label(val, *boundaries):
    """Map ``val`` to a letter naming the bucket it falls into.

    The boundaries are checked in the order given. The first boundary that
    ``val`` is strictly below decides the label: 'a' for the first boundary,
    'b' for the second, and so on. A value that is not below any boundary
    gets the next letter after the last boundary's letter, so three
    boundaries yield 'a', 'b', 'c' or 'd'.

    Any number of boundaries from 0 to 25 is accepted, so the function no
    longer fails with IndexError when fewer than three are supplied and no
    longer ignores boundaries past the third.

    Args:
        val: Value to classify; must support ``<`` against the boundaries.
        *boundaries: Upper bounds of the buckets, normally ascending.

    Returns:
        A single lowercase letter.

    Raises:
        ValueError: If more than 25 boundaries are given, because only the
            26 letters 'a'..'z' are available as labels.
    """
    if len(boundaries) > 25:
        raise ValueError("label supports at most 25 boundaries (labels 'a'-'z')")
    for position, upper in enumerate(boundaries):
        if val < upper:
            return chr(ord('a') + position)
    # Not below any boundary (this is also where NaN ends up, since every
    # comparison with NaN is False): use the letter after the last bucket.
    return chr(ord('a') + len(boundaries))
def toLabel(df, old_feature_name):
    """Convert one column of ``df`` into bucket labels.

    Three cut points are derived from the column itself: the midpoint of its
    minimum and mean, the mean, and the midpoint of its maximum and mean.
    Every value is then passed to ``label`` together with those cut points.

    Args:
        df: DataFrame holding the column.
        old_feature_name: Name of the column to convert.

    Returns:
        The result of applying ``label`` element-wise to the column.
    """
    column = df[old_feature_name]
    low, mid, high = column.min(), column.mean(), column.max()
    cut_points = ((low + mid) / 2, mid, (high + mid) / 2)
    return column.apply(label, args=cut_points)


df['sl_labeled'] = toLabel(df, 'sl')
# Label the other three columns the same way, storing each result in a new
# '<column>_labeled' column.
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df

|  | sl | sw | pl | pw | sl_labeled | sw_labeled | pl_labeled | pw_labeled |
|---|---|---|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | b | c | a | a |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | a | b | a | a |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | a | c | a | a |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | a | c | a | a |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | a | c | a | a |
| 5 | 5.4 | 3.9 | 1.7 | 0.4 | b | d | a | a |
| 6 | 4.6 | 3.4 | 1.4 | 0.3 | a | c | a | a |
| 7 | 5.0 | 3.4 | 1.5 | 0.2 | a | c | a | a |
| 8 | 4.4 | 2.9 | 1.4 | 0.2 | a | b | a | a |
| 9 | 4.9 | 3.1 | 1.5 | 0.1 | a | c | a | a |
| 10 | 5.4 | 3.7 | 1.5 | 0.2 | b | c | a | a |
| 11 | 4.8 | 3.4 | 1.6 | 0.2 | a | c | a | a |
| 12 | 4.8 | 3.0 | 1.4 | 0.1 | a | b | a | a |
| 13 | 4.3 | 3.0 | 1.1 | 0.1 | a | b | a | a |
| 14 | 5.8 | 4.0 | 1.2 | 0.2 | b | d | a | a |
| 15 | 5.7 | 4.4 | 1.5 | 0.4 | b | d | a | a |
| 16 | 5.4 | 3.9 | 1.3 | 0.4 | b | d | a | a |
| 17 | 5.1 | 3.5 | 1.4 | 0.3 | b | c | a | a |
| 18 | 5.7 | 3.8 | 1.7 | 0.3 | b | d | a | a |
| 19 | 5.1 | 3.8 | 1.5 | 0.3 | b | d | a | a |
| 20 | 5.4 | 3.4 | 1.7 | 0.2 | b | c | a | a |
| 21 | 5.1 | 3.7 | 1.5 | 0.4 | b | c | a | a |
| 22 | 4.6 | 3.6 | 1.0 | 0.2 | a | c | a | a |
| 23 | 5.1 | 3.3 | 1.7 | 0.5 | b | c | a | a |
| 24 | 4.8 | 3.4 | 1.9 | 0.2 | a | c | a | a |
| 25 | 5.0 | 3.0 | 1.6 | 0.2 | a | b | a | a |
| 26 | 5.0 | 3.4 | 1.6 | 0.4 | a | c | a | a |
| 27 | 5.2 | 3.5 | 1.5 | 0.2 | b | c | a | a |
| 28 | 5.2 | 3.4 | 1.4 | 0.2 | b | c | a | a |
| 29 | 4.7 | 3.2 | 1.6 | 0.2 | a | c | a | a |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 120 | 6.9 | 3.2 | 5.7 | 2.3 | d | c | d | d |
| 121 | 5.6 | 2.8 | 4.9 | 2.0 | b | b | c | d |
| 122 | 7.7 | 2.8 | 6.7 | 2.0 | d | b | d | d |
| 123 | 6.3 | 2.7 | 4.9 | 1.8 | c | b | c | c |
| 124 | 6.7 | 3.3 | 5.7 | 2.1 | c | c | d | d |
| 125 | 7.2 | 3.2 | 6.0 | 1.8 | d | c | d | c |
| 126 | 6.2 | 2.8 | 4.8 | 1.8 | c | b | c | c |
| 127 | 6.1 | 3.0 | 4.9 | 1.8 | c | b | c | c |
| 128 | 6.4 | 2.8 | 5.6 | 2.1 | c | b | d | d |
| 129 | 7.2 | 3.0 | 5.8 | 1.6 | d | b | d | c |
| 130 | 7.4 | 2.8 | 6.1 | 1.9 | d | b | d | d |
| 131 | 7.9 | 3.8 | 6.4 | 2.0 | d | d | d | d |
| 132 | 6.4 | 2.8 | 5.6 | 2.2 | c | b | d | d |
| 133 | 6.3 | 2.8 | 5.1 | 1.5 | c | b | c | c |
| 134 | 6.1 | 2.6 | 5.6 | 1.4 | c | b | d | c |
| 135 | 7.7 | 3.0 | 6.1 | 2.3 | d | b | d | d |
| 136 | 6.3 | 3.4 | 5.6 | 2.4 | c | c | d | d |
| 137 | 6.4 | 3.1 | 5.5 | 1.8 | c | c | d | c |
| 138 | 6.0 | 3.0 | 4.8 | 1.8 | c | b | c | c |
| 139 | 6.9 | 3.1 | 5.4 | 2.1 | d | c | d | d |
| 140 | 6.7 | 3.1 | 5.6 | 2.4 | c | c | d | d |
| 141 | 6.9 | 3.1 | 5.1 | 2.3 | d | c | c | d |
| 142 | 5.8 | 2.7 | 5.1 | 1.9 | b | b | c | d |
| 143 | 6.8 | 3.2 | 5.9 | 2.3 | c | c | d | d |
| 144 | 6.7 | 3.3 | 5.7 | 2.5 | c | c | d | d |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | c | b | c | d |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | c | a | c | d |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | c | b | c | d |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | c | c | d | d |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | c | b | c | c |
150 rows × 8 columns
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)
set(df['sl_labeled'])
{'a', 'b', 'c', 'd'}
df["output"] = iris.target
df

|  | sl_labeled | sw_labeled | pl_labeled | pw_labeled | output |
|---|---|---|---|---|---|
| 0 | b | c | a | a | 0 |
| 1 | a | b | a | a | 0 |
| 2 | a | c | a | a | 0 |
| 3 | a | c | a | a | 0 |
| 4 | a | c | a | a | 0 |
| 5 | b | d | a | a | 0 |
| 6 | a | c | a | a | 0 |
| 7 | a | c | a | a | 0 |
| 8 | a | b | a | a | 0 |
| 9 | a | c | a | a | 0 |
| 10 | b | c | a | a | 0 |
| 11 | a | c | a | a | 0 |
| 12 | a | b | a | a | 0 |
| 13 | a | b | a | a | 0 |
| 14 | b | d | a | a | 0 |
| 15 | b | d | a | a | 0 |
| 16 | b | d | a | a | 0 |
| 17 | b | c | a | a | 0 |
| 18 | b | d | a | a | 0 |
| 19 | b | d | a | a | 0 |
| 20 | b | c | a | a | 0 |
| 21 | b | c | a | a | 0 |
| 22 | a | c | a | a | 0 |
| 23 | b | c | a | a | 0 |
| 24 | a | c | a | a | 0 |
| 25 | a | b | a | a | 0 |
| 26 | a | c | a | a | 0 |
| 27 | b | c | a | a | 0 |
| 28 | b | c | a | a | 0 |
| 29 | a | c | a | a | 0 |
| ... | ... | ... | ... | ... | ... |
| 120 | d | c | d | d | 2 |
| 121 | b | b | c | d | 2 |
| 122 | d | b | d | d | 2 |
| 123 | c | b | c | c | 2 |
| 124 | c | c | d | d | 2 |
| 125 | d | c | d | c | 2 |
| 126 | c | b | c | c | 2 |
| 127 | c | b | c | c | 2 |
| 128 | c | b | d | d | 2 |
| 129 | d | b | d | c | 2 |
| 130 | d | b | d | d | 2 |
| 131 | d | d | d | d | 2 |
| 132 | c | b | d | d | 2 |
| 133 | c | b | c | c | 2 |
| 134 | c | b | d | c | 2 |
| 135 | d | b | d | d | 2 |
| 136 | c | c | d | d | 2 |
| 137 | c | c | d | c | 2 |
| 138 | c | b | c | c | 2 |
| 139 | d | c | d | d | 2 |
| 140 | c | c | d | d | 2 |
| 141 | d | c | c | d | 2 |
| 142 | b | b | c | d | 2 |
| 143 | c | c | d | d | 2 |
| 144 | c | c | d | d | 2 |
| 145 | c | b | c | d | 2 |
| 146 | c | a | c | d | 2 |
| 147 | c | b | c | d | 2 |
| 148 | c | c | d | d | 2 |
| 149 | c | b | c | c | 2 |
150 rows × 5 columns
def fit(data):
    """Tabulate value frequencies of every feature column per output value.

    The last column of ``data`` is treated as the output and all preceding
    columns as features. For each distinct output value the rows carrying it
    are selected, and for each feature the number of those rows holding each
    distinct feature value is counted.

    NOTE(review): this looks like the counting step of a categorical
    naive-Bayes style classifier; confirm against the code that consumes
    the result.

    Args:
        data: DataFrame whose last column is the output.

    Returns:
        A nested dict of the form
        ``{output: {"total_count": rows, feature: {value: rows}}}``.
    """
    target_name = data.columns[-1]
    feature_names = data.columns[0:-1]
    summary = {}
    for outcome in set(data[target_name]):
        subset = data[data[target_name] == outcome]
        per_outcome = {"total_count": len(subset)}
        for feature in feature_names:
            # Only values that occur within this outcome's rows get an entry.
            per_outcome[feature] = {
                level: len(subset[subset[feature] == level])
                for level in set(subset[feature])
            }
        summary[outcome] = per_outcome
    return summary


fit(df)
# Notebook output of the call above starts here; the rest of the printed
# dict follows.
# {0: {'pl_labeled': {'a': 50},
'pw_labeled': {'a': 50},
'sl_labeled': {'a': 28, 'b': 22},
'sw_labeled': {'a': 1, 'b': 7, 'c': 32, 'd': 10},
'total_count': 50},
1: {'pl_labeled': {'b': 7, 'c': 43},
'pw_labeled': {'b': 10, 'c': 40},
'sl_labeled': {'a': 3, 'b': 21, 'c': 24, 'd': 2},
'sw_labeled': {'a': 13, 'b': 29, 'c': 8},
'total_count': 50},
2: {'pl_labeled': {'c': 20, 'd': 30},
'pw_labeled': {'c': 16, 'd': 34},
'sl_labeled': {'a': 1, 'b': 5, 'c': 29, 'd': 15},
'sw_labeled': {'a': 5, 'b': 28, 'c': 15, 'd': 2},
'total_count': 50}}