Here we apply various techniques to perform regression and classification tasks on the famous iris dataset.
See: https://www.kaggle.com/datasets/vikrishnan/iris-dataset
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn import preprocessing
from sklearn.metrics import ConfusionMatrixDisplay
from scipy.stats import multivariate_normal
import numpy as np
import collections
iris = pd.read_csv("iris.csv")
iris
| | sepal_length | sepal_width | petal_length | petal_width | species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
for name, group in iris.groupby("species"):
    plt.plot(group.sepal_length, group.petal_length,
             marker="o", linestyle='', label=name)
plt.xlabel("sepal length")
plt.ylabel("petal length")
plt.grid(linestyle="--")
plt.legend()
[Scatter plot: petal length vs. sepal length, one colour per species]
lower_d = PCA(n_components=2) \
.fit_transform(iris.drop(['species'], axis=1))
lower_d = pd.DataFrame(data=lower_d, columns=["pca1", "pca2"])
lower_d["species"] = iris["species"]
lower_d
| | pca1 | pca2 | species |
|---|---|---|---|
| 0 | -2.684207 | 0.326607 | setosa |
| 1 | -2.715391 | -0.169557 | setosa |
| 2 | -2.889820 | -0.137346 | setosa |
| 3 | -2.746437 | -0.311124 | setosa |
| 4 | -2.728593 | 0.333925 | setosa |
| ... | ... | ... | ... |
| 145 | 1.944017 | 0.187415 | virginica |
| 146 | 1.525664 | -0.375021 | virginica |
| 147 | 1.764046 | 0.078519 | virginica |
| 148 | 1.901629 | 0.115877 | virginica |
| 149 | 1.389666 | -0.282887 | virginica |
150 rows × 3 columns
for name, group in lower_d.groupby("species"):
    plt.plot(group.pca1, group.pca2,
             marker="o", linestyle='', label=name)
plt.xlabel("pca1")
plt.ylabel("pca2")
plt.grid(linestyle="--")
plt.legend()
[Scatter plot: pca2 vs. pca1, one colour per species]
iris.species = pd.Categorical(iris.species)
iris_onehot = pd.get_dummies(iris)
X = iris_onehot.drop(["sepal_length"], axis=1)
y = iris_onehot.sepal_length
test_pct = 0.35
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pct, random_state=42)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
y_pred = model.predict(X_test)
print("Test mean squared error:", mean_squared_error(model.predict(X_train), y_train))
print("Mean squared error:", mean_squared_error(y_pred, y_test))
Test mean squared error: 0.09182444089587832 Mean squared error: 0.09148440122003154
def plot_groups(X_test, y_test, y_pred):
    # Group the test-set row indices by species, recovered from the one-hot columns.
    cats = X_test.iloc[:, -3:]
    groups = collections.defaultdict(list)
    for i, row in cats.iterrows():
        col_index = row.index[row == 1].tolist()[0]
        groups[col_index].append(i)
    return groups
y_test.index
Int64Index([ 73, 18, 118, 78, 76, 31, 64, 141, 68, 82, 110, 12, 36, 9, 19, 56, 104, 69, 55, 132, 29, 127, 26, 128, 131, 145, 108, 143, 45, 30, 22, 15, 65, 11, 42, 146, 51, 27, 4, 32, 142, 85, 86, 16, 10, 81, 133, 137, 75, 109, 96, 105, 66], dtype='int64')
Mean squared error doesn't give us a great intuition for how close the predictions were, so let's plot the predictions vs the true value.
i = 0
colours = ["blue", "black", "red"]
markers = ["+", "o", "x"]
test_plottable = iris.iloc[y_test.index].copy()
test_plottable["pred"] = y_pred
for name, group in test_plottable.groupby("species"):
    plt.plot(y_test.loc[group.index],
             group.pred,
             markerfacecolor=colours[i],
             markeredgecolor=colours[i],
             fillstyle="none",
             marker=markers[i],
             linestyle="",
             label=name)
    i += 1
plt.xlabel("Sepal length")
plt.ylabel("Prediction")
plt.grid(linestyle="--")
plt.legend()
[Scatter plot: predicted vs. true sepal length for the test set, one marker style per species]
iris.species = pd.Categorical(iris.species)
X = iris.drop(["species"], axis=1)
y = iris.species
test_pct = 0.35
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pct, random_state=42)
scaler = preprocessing.StandardScaler()
num_cols = [name for name in iris.columns if iris[name].dtype.kind == "f"]
X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
X_test.loc[:, num_cols] = scaler.fit_transform(X_test[num_cols])
/home/kg/.local/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
/home/kg/.local/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/home/kg/.local/lib/python3.6/site-packages/pandas/core/indexing.py:670: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
/home/kg/.local/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
^ Those warnings appear because pandas can't tell whether the DataFrames returned by train_test_split are views or copies of X; a fix is sketched below.
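A version that avoids the warnings (a sketch, not re-run here; it also follows the usual convention of fitting the scaler on the training data only and reusing it for the test data):

# Explicit copies mean the assignments below target standalone DataFrames, not slices.
X_train, X_test = X_train.copy(), X_test.copy()
X_train.loc[:, num_cols] = scaler.fit_transform(X_train[num_cols])
# Reuse the training-set statistics instead of refitting on the test set.
X_test.loc[:, num_cols] = scaler.transform(X_test[num_cols])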
model = linear_model.LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
print(model.classes_, model.n_features_in_, model.n_iter_)
['setosa' 'versicolor' 'virginica'] 4 [24]
print("{}% accuracy".format(round(100*model.score(X_test, y_test), 3)))
98.113% accuracy
cm = confusion_matrix(y_test, model.predict(X_test))
ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=model.classes_) \
.plot()
[Confusion matrix plot for the logistic regression test predictions]
Just 1 mistake, a virginica that got classified as a versicolor. That makes sense because those clusters are close together.
We want to solve for $x$ in

$$Ax = b,$$

where $A$ is our data matrix (rows are samples) and $b$ is the variable we're trying to predict. However, $A$ is generally not invertible. $A^T A$, however, *is* invertible, as long as the columns of $A$ are linearly independent. So we instead solve for $\hat{x}$ in

$$A^T A \hat{x} = A^T b, \qquad \hat{x} = (A^T A)^{-1} A^T b,$$

and this minimises the error because $A\hat{x}$ is the projection of $b$ onto the column space of $A$, i.e. $\hat{x}$ gets us as close to $b$ as possible given $A$.
A = iris.drop(["species", "petal_width"], axis=1).to_numpy()
b = iris["petal_width"].to_numpy()
proj = np.matmul(np.linalg.inv(np.matmul(A.T, A)), A.T)
xhat = np.dot(proj, b)
xhat
array([-0.25010419, 0.20945778, 0.53804957])
bhat = np.dot(A, xhat)
err = (b-bhat)
print("Mean squared error:", np.dot(err, err)/len(b))
Mean squared error: 0.03631971456742261
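As a sanity check (a sketch that wasn't part of the original run), NumPy's least-squares solver minimises $\|Ax - b\|^2$ directly and should recover the same $\hat{x}$ without forming $(A^T A)^{-1}$ explicitly:

# lstsq solves the same least-squares problem, so this should agree with xhat above.
xhat_lstsq, _, _, _ = np.linalg.lstsq(A, b, rcond=None)
print(xhat_lstsq)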
model = linear_model.LinearRegression()
model.fit(A, b)
LinearRegression()
sk_bhat = model.predict(A)
sk_err = (b-sk_bhat)
print("sklearn mean squared error:", np.dot(sk_err, sk_err)/len(b))
sklearn mean squared error: 0.03584110914511173
The sklearn version gets slightly lower error because LinearRegression fits an intercept by default (fit_intercept=True), which is equivalent to adding a column of all-1s to A to allow a constant term.
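To check this (a quick sketch under that assumption, not part of the original run), append a column of ones to A and solve the normal equations again; the resulting MSE should match sklearn's value above:

# The extra all-1s column lets the least-squares solution include a constant term.
A1 = np.hstack([A, np.ones((A.shape[0], 1))])
xhat1 = np.linalg.solve(np.matmul(A1.T, A1), np.matmul(A1.T, b))
err1 = b - np.dot(A1, xhat1)
print("Mean squared error with intercept:", np.dot(err1, err1)/len(b))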
Gaussian Discriminant Analysis (GDA) is a generative classification algorithm. Rather than directly predicting the probability $P(y \mid x)$ for a class $y$ and data $x$, it instead models the data in each class using a Gaussian distribution, giving $P(x \mid y)$. Then, using the prior probabilities $P(y)$ and Bayes' rule, this gives

$$P(y_0 \mid x) = \frac{P(x, y_0)}{P(x)} = \frac{P(y_0)\,P(x \mid y_0)}{\sum_{y} P(y)\,P(x \mid y)}.$$
Let's see how it compares to logistic regression. Assuming a Gaussian distribution might not be a bad idea here, given that phenotypic traits in animals tend to be normally distributed, being the result of many different small effects (genes).
It's also worth noting that GDA can be reduced to logistic regression, but not vice versa.
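To sketch the first direction (the standard two-class argument, assuming a shared covariance matrix $\Sigma$, which is also what the implementation below assumes): the quadratic terms in the two Gaussians cancel, and the posterior reduces to a logistic function of a linear score,

$$P(y = 1 \mid x) = \frac{P(y=1)\,P(x \mid y=1)}{P(y=0)\,P(x \mid y=0) + P(y=1)\,P(x \mid y=1)} = \frac{1}{1 + \exp\!\big(-(w^T x + w_0)\big)},$$

with $w = \Sigma^{-1}(\mu_1 - \mu_0)$ and $w_0$ absorbing the class means and the log prior ratio. Logistic regression fits $w$ and $w_0$ directly without committing to any particular form for $P(x \mid y)$, which is why the reverse reduction doesn't hold.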
class GDA:
    def fit(self, X, y):
        # Class priors P(y) from the empirical class frequencies.
        self.classes, counts = np.unique(y, return_counts=True)
        N = len(y)
        self.P_y = counts / N
        # Per-class feature means.
        self.means = []
        for i, c in enumerate(self.classes):
            mask = y == c
            self.means.append(np.sum(X[mask], axis=0)/np.sum(mask))
        A = X - np.mean(X, axis=0)
        # We share the covariance matrix across all the classes
        # (computed here about the overall mean of X).
        self.cov = np.matmul(A.T, A)/N

    def predict(self, X, return_prob=False):
        # Joint probabilities P(x, y) = P(y) * P(x | y) for each class.
        P_x_and_y = []
        for mean, P_y in zip(self.means, self.P_y):
            P_x_and_y.append(P_y * multivariate_normal.pdf(X, mean=mean, cov=self.cov))
        # Normalise to get the posteriors P(y | x) via Bayes' rule.
        P_y_given_x = np.stack(P_x_and_y).T
        P_y_given_x /= np.sum(P_y_given_x, axis=1)[:, np.newaxis]
        if return_prob:
            return P_y_given_x
        return np.array([self.classes[np.argmax(probs)]
                         for probs in P_y_given_x])
iris.species = pd.Categorical(iris.species)
X = iris.drop(["species"], axis=1).to_numpy()
y = pd.Categorical(iris.species).codes
test_pct = 0.35
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_pct, random_state=42)
model = GDA()
model.fit(X_train, y_train)
Take a look at the model parameters. A reminder that the features are: sepal_length, sepal_width, petal_length, petal_width.
print(model.means)
print(model.P_y)
print(model.cov)
[array([4.96451613, 3.36129032, 1.46774194, 0.24516129]), array([5.86363636, 2.71212121, 4.21212121, 1.3030303 ]), array([6.52121212, 2.96969697, 5.51212121, 2.00909091])]
[0.31958763 0.34020619 0.34020619]
[[ 0.6785567  -0.01742268  1.22969072  0.49030928]
 [-0.01742268  0.17654586 -0.26200128 -0.09271761]
 [ 1.22969072 -0.26200128  2.9838155   1.24197258]
 [ 0.49030928 -0.09271761  1.24197258  0.56296312]]
pred_test = model.predict(X_test)
pred_test
array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 2, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 1, 2, 0, 0, 0, 0, 2, 0, 0, 1, 2, 0, 0, 0, 2, 2, 2, 0, 0, 1, 1, 2, 1, 2, 1, 2, 2], dtype=int8)
We see that GDA doesn't perform as well on this dataset.
print("{}% accuracy".format(round(100*np.sum(pred_test==y_test)/len(y_test), 3)))
83.019% accuracy
The confusion matrix gives a hint as to why. It struggles to distinguish between versicolor and virginica because there isn't a clear separating boundary between them. For this reason, the more general-purpose logistic regression model is better in this case. GDA would have the upper hand in cases where its stronger assumptions compensate for a lack of data.
cm = confusion_matrix(y_test, pred_test)
ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=["setosa", "versicolor", "virginica"]) \
.plot()
[Confusion matrix plot for the GDA test predictions]
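To probe that last claim about data efficiency, here is a hypothetical sketch (the compare_models helper and its parameters are not from the original notebook, and no results are shown) that reuses the GDA class and the X, y arrays defined above:

def compare_models(X, y, train_sizes=(0.1, 0.2, 0.5), seed=0):
    # Compare GDA and logistic regression accuracy as the training set shrinks.
    for frac in train_sizes:
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, train_size=frac, random_state=seed, stratify=y)
        gda = GDA()
        gda.fit(X_tr, y_tr)
        gda_acc = np.mean(gda.predict(X_te) == y_te)
        logreg = linear_model.LogisticRegression(max_iter=1000)
        logreg.fit(X_tr, y_tr)
        print("train fraction {}: GDA {:.3f}, logistic regression {:.3f}"
              .format(frac, gda_acc, logreg.score(X_te, y_te)))

compare_models(X, y)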