Here we apply various techniques to perform regression and classification tasks on the famous iris dataset.
See: https://www.kaggle.com/datasets/vikrishnan/iris-dataset
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn import preprocessing
from sklearn.metrics import ConfusionMatrixDisplay
from scipy.stats import multivariate_normal
import numpy as np
import collections
iris = pd.read_csv("iris.csv")
iris
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
for name, group in iris.groupby("species"):
    plt.plot(group.sepal_length, group.petal_length,
             marker="o", linestyle='', label=name)
plt.xlabel("sepal length")
plt.ylabel("petal length")
plt.grid(linestyle="--")
plt.legend()
[Scatter plot: petal length vs. sepal length, one colour per species]
lower_d = PCA(n_components=2) \
.fit_transform(iris.drop(['species'], axis=1))
lower_d = pd.DataFrame(data=lower_d, columns=["pca1", "pca2"])
lower_d["species"] = iris["species"]
lower_d
| | pca1 | pca2 | species |
|---|---|---|---|
| 0 | -2.684207 | 0.326607 | setosa |
| 1 | -2.715391 | -0.169557 | setosa |
| 2 | -2.889820 | -0.137346 | setosa |
| 3 | -2.746437 | -0.311124 | setosa |
| 4 | -2.728593 | 0.333925 | setosa |
| ... | ... | ... | ... |
| 145 | 1.944017 | 0.187415 | virginica |
| 146 | 1.525664 | -0.375021 | virginica |
| 147 | 1.764046 | 0.078519 | virginica |
| 148 | 1.901629 | 0.115877 | virginica |
| 149 | 1.389666 | -0.282887 | virginica |
150 rows × 3 columns
for name, group in lower_d.groupby("species"):
    plt.plot(group.pca1, group.pca2,
             marker="o", linestyle='', label=name)
plt.xlabel("pca1")
plt.ylabel("pca2")
plt.grid(linestyle="--")
plt.legend()
[Scatter plot: pca2 vs. pca1, one colour per species]
iris.species = pd.Categorical(iris.species)
iris_onehot = pd.get_dummies(iris)
X = iris_onehot.drop(["sepal_length"], axis=1)
y = iris_onehot.sepal_length
test_pct = 0.35
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pct, random_state=42)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
y_pred = model.predict(X_test)
print("Test mean squared error:", mean_squared_error(model.predict(X_train), y_train))
print("Mean squared error:", mean_squared_error(y_pred, y_test))
Test mean squared error: 0.09182444089587832 Mean squared error: 0.09148440122003154
def plot_groups(X_test, y_test, y_pred):
    # Group the test-set row indices by species, recovered from the one-hot columns.
    cats = X_test.iloc[:, -3:]
    groups = collections.defaultdict(list)
    for i, row in cats.iterrows():
        col_index = row.index[row == 1].tolist()[0]
        groups[col_index].append(i)
    return groups
y_test.index
Int64Index([ 73, 18, 118, 78, 76, 31, 64, 141, 68, 82, 110, 12, 36, 9, 19, 56, 104, 69, 55, 132, 29, 127, 26, 128, 131, 145, 108, 143, 45, 30, 22, 15, 65, 11, 42, 146, 51, 27, 4, 32, 142, 85, 86, 16, 10, 81, 133, 137, 75, 109, 96, 105, 66], dtype='int64')
Mean squared error doesn't give us a great intuition for how close the predictions were, so let's plot the predictions vs the true value.
i = 0
colours = ["blue", "black", "red"]
markers = ["+", "o", "x"]
test_plottable = iris.iloc[y_test.index].copy()
test_plottable["pred"] = y_pred
for name, group in test_plottable.groupby("species"):
    plt.plot(y_test.loc[group.index],
             group.pred,
             markerfacecolor=colours[i],
             markeredgecolor=colours[i],
             fillstyle="none",
             marker=markers[i],
             linestyle="",
             label=name)
    i += 1
plt.xlabel("Sepal length")
plt.ylabel("Prediction")
plt.grid(linestyle="--")
plt.legend()
[Scatter plot: predicted vs. true sepal length for the test set, one marker style per species]
iris.species = pd.Categorical(iris.species)
X = iris.drop(["species"], axis=1)
y = iris.species
test_pct = 0.35
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pct, random_state=42)
scaler = preprocessing.StandardScaler()
num_cols = [name for name in iris.columns if iris[name].dtype.kind == "f"]
X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
X_test.loc[:, num_cols] = scaler.fit_transform(X_test[num_cols])
/home/kg/.local/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
/home/kg/.local/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/home/kg/.local/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
/home/kg/.local/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
^ Those warnings appear because pandas can't tell whether the DataFrames returned by train_test_split are views or copies of X; a fix is sketched below.
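A version that avoids the warnings (a sketch, not re-run here; it also follows the usual convention of fitting the scaler on the training data only and reusing it for the test data):

# Explicit copies mean the assignments below target standalone DataFrames, not slices.
X_train, X_test = X_train.copy(), X_test.copy()
X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
# Reuse the training-set statistics instead of refitting on the test set.
X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])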
model = linear_model.LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
print(model.classes_, model.n_features_in_, model.n_iter_)
['setosa' 'versicolor' 'virginica'] 4 [24]
print("{}% accuracy".format(round(100*model.score(X_test, y_test), 3)))
98.113% accuracy
cm = confusion_matrix(y_test, model.predict(X_test))
ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=model.classes_) \
.plot()
[Confusion matrix plot for the logistic regression test predictions]
Just 1 mistake, a virginica that got classified as a versicolor. That makes sense because those clusters are close together.
We want to solve for $x$ in

$$Ax = b,$$

where $A$ is our data matrix (rows are samples) and $b$ is the variable we're trying to predict. However, $A$ is generally not invertible. $A^T A$, however, *is* invertible, as long as the columns of $A$ are linearly independent. So we instead solve for $\hat{x}$ in

$$A^T A \hat{x} = A^T b, \qquad \hat{x} = (A^T A)^{-1} A^T b,$$

and this minimises the error because $A\hat{x}$ is the projection of $b$ onto the column space of $A$, i.e. $\hat{x}$ gets us as close to $b$ as possible given $A$.
A = iris.drop(["species", "petal_width"], axis=1).to_numpy()
b = iris["petal_width"].to_numpy()
proj = np.matmul(np.linalg.inv(np.matmul(A.T, A)), A.T)
xhat = np.dot(proj, b)
xhat
array([-0.25010419, 0.20945778, 0.53804957])
bhat = np.dot(A, xhat)
err = (b-bhat)
print("Mean squared error:", np.dot(err, err)/len(b))
Mean squared error: 0.03631971456742261
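As a sanity check (a sketch that wasn't part of the original run), NumPy's least-squares solver minimises $\|Ax - b\|^2$ directly and should recover the same $\hat{x}$ without forming $(A^T A)^{-1}$ explicitly:

# lstsq solves the same least-squares problem, so this should agree with xhat above.
xhat_lstsq, _, _, _ = np.linalg.lstsq(A, b, rcond=None)
print(xhat_lstsq)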
model = linear_model.LinearRegression()
model.fit(A, b)
LinearRegression()
sk_bhat = model.predict(A)
sk_err = (b-sk_bhat)
print("sklearn mean squared error:", np.dot(sk_err, sk_err)/len(b))
sklearn mean squared error: 0.03584110914511173
The sklearn version gets slightly lower error because LinearRegression fits an intercept by default (fit_intercept=True), which is equivalent to adding a column of all-1s to A to allow a constant term.
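To check this (a quick sketch under that assumption, not part of the original run), append a column of ones to A and solve the normal equations again; the resulting MSE should match sklearn's value above:

# The extra all-1s column lets the least-squares solution include a constant term.
A1 = np.hstack([A, np.ones((A.shape[0], 1))])
xhat1 = np.linalg.solve(np.matmul(A1.T, A1), np.matmul(A1.T, b))
err1 = b - np.dot(A1, xhat1)
print("Mean squared error with intercept:", np.dot(err1, err1)/len(b))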
Gaussian Discriminant Analysis (GDA) is a generative classification algorithm. Rather than directly predicting the probability $P(y \mid x)$ for a class $y$ and data $x$, it instead models the data in each class using a Gaussian distribution, giving $P(x \mid y)$. Then, using the prior probabilities $P(y)$ and Bayes' rule, this gives

$$P(y_0 \mid x) = \frac{P(x, y_0)}{P(x)} = \frac{P(y_0)\,P(x \mid y_0)}{\sum_{y} P(y)\,P(x \mid y)}.$$
Let's see how it compares to logistic regression. Assuming a Gaussian distribution might not be a bad idea here, given that phenotypic traits in animals tend to be normally distributed, being the result of many different small effects (genes).
It's also worth noting that GDA can be reduced to logistic regression, but not vice versa.
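To sketch the first direction (the standard two-class argument, assuming a shared covariance matrix $\Sigma$, which is also what the implementation below assumes): the quadratic terms in the two Gaussians cancel, and the posterior reduces to a logistic function of a linear score,

$$P(y = 1 \mid x) = \frac{P(y=1)\,P(x \mid y=1)}{P(y=0)\,P(x \mid y=0) + P(y=1)\,P(x \mid y=1)} = \frac{1}{1 + \exp\!\big(-(w^T x + w_0)\big)},$$

with $w = \Sigma^{-1}(\mu_1 - \mu_0)$ and $w_0$ absorbing the class means and the log prior ratio. Logistic regression fits $w$ and $w_0$ directly without committing to any particular form for $P(x \mid y)$, which is why the reverse reduction doesn't hold.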
class GDA:
    def fit(self, X, y):
        # Class priors P(y) from the empirical class frequencies.
        self.classes, counts = np.unique(y, return_counts=True)
        N = len(y)
        self.P_y = counts / N
        # Per-class feature means.
        self.means = []
        for i, c in enumerate(self.classes):
            mask = y == c
            self.means.append(np.sum(X[mask], axis=0)/np.sum(mask))
        A = X - np.mean(X, axis=0)
        # We share the covariance matrix across all the classes
        # (computed here about the overall mean of X).
        self.cov = np.matmul(A.T, A)/N

    def predict(self, X, return_prob=False):
        # Joint probabilities P(x, y) = P(y) * P(x | y) for each class.
        P_x_and_y = []
        for mean, P_y in zip(self.means, self.P_y):
            P_x_and_y.append(P_y * multivariate_normal.pdf(X, mean=mean, cov=self.cov))
        # Normalise to get the posteriors P(y | x) via Bayes' rule.
        P_y_given_x = np.stack(P_x_and_y).T
        P_y_given_x /= np.sum(P_y_given_x, axis=1)[:, np.newaxis]
        if return_prob:
            return P_y_given_x
        return np.array([self.classes[np.argmax(probs)]
                         for probs in P_y_given_x])
iris.species = pd.Categorical(iris.species)
X = iris.drop(["species"], axis=1).to_numpy()
y = pd.Categorical(iris.species).codes
test_pct = 0.35
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pct, random_state=42)
model = GDA()
model.fit(X_train, y_train)
Take a look at the model parameters. A reminder that the features are: sepal_length, sepal_width, petal_length, petal_width.
print(model.means)
print(model.P_y)
print(model.cov)
[array([4.96451613, 3.36129032, 1.46774194, 0.24516129]), array([5.86363636, 2.71212121, 4.21212121, 1.3030303 ]), array([6.52121212, 2.96969697, 5.51212121, 2.00909091])]
[0.31958763 0.34020619 0.34020619]
[[ 0.6785567  -0.01742268  1.22969072  0.49030928]
 [-0.01742268  0.17654586 -0.26200128 -0.09271761]
 [ 1.22969072 -0.26200128  2.9838155   1.24197258]
 [ 0.49030928 -0.09271761  1.24197258  0.56296312]]
pred_test = model.predict(X_test)
pred_test
array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 2, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 1, 2, 0, 0, 0, 0, 2, 0, 0, 1, 2, 0, 0, 0, 2, 2, 2, 0, 0, 1, 1, 2, 1, 2, 1, 2, 2], dtype=int8)
We see that GDA doesn't perform as well on this dataset.
print("{}% accuracy".format(round(100*np.sum(pred_test==y_test)/len(y_test), 3)))
83.019% accuracy
The confusion matrix gives a hint as to why. It struggles to distinguish between versicolor and virginica because there isn't a clear separating boundary between them. For this reason, the more general-purpose logistic regression model is better in this case. GDA would have the upper hand in cases where its stronger assumptions compensate for a lack of data.
cm = confusion_matrix(y_test, pred_test)
ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=["setosa", "versicolor", "virginica"]) \
.plot()
[Confusion matrix plot for the GDA test predictions]
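To probe that last claim about data efficiency, here is a hypothetical sketch (the compare_models helper and its parameters are not from the original notebook, and no results are shown) that reuses the GDA class and the X, y arrays defined above:

def compare_models(X, y, train_sizes=(0.1, 0.2, 0.5), seed=0):
    # Compare GDA and logistic regression accuracy as the training set shrinks.
    for frac in train_sizes:
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, train_size=frac, random_state=seed, stratify=y)
        gda = GDA()
        gda.fit(X_tr, y_tr)
        gda_acc = np.mean(gda.predict(X_te) == y_te)
        logreg = linear_model.LogisticRegression(max_iter=1000)
        logreg.fit(X_tr, y_tr)
        print("train fraction {}: GDA {:.3f}, logistic regression {:.3f}"
              .format(frac, gda_acc, logreg.score(X_te, y_te)))

compare_models(X, y)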