A dataset of countries and various metrics (child mortality, GDP per person, ...) that indicate how "developed" they are. The goal is to identify the countries that are most in need of aid from an NGO.
From Kaggle: https://www.kaggle.com/datasets/rohan0301/unsupervised-learning-on-country-data/data
(The wording is yikes-inducing, as it describes them as "backward" countries).
Here, we implement 2 methods to cluster the countries: k-means and the Gaussian mixture model. To determine the optimal number of clusters, k, we use a combination of the silhouette method and the elbow method. Furthermore, for each value of k, we run the clustering multiple times and pick the "best" clustering (according to the silhouette method), in case any single run is unlucky due to bad random starting parameters.
import pandas as pd
from sklearn.decomposition import PCA
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
import random
Note: I renamed the data files.
df = pd.read_csv("countries/data.csv")
df
 | country | child_mort | exports | health | imports | income | inflation | life_expec | total_fer | gdpp |
---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | 90.2 | 10.0 | 7.58 | 44.9 | 1610 | 9.44 | 56.2 | 5.82 | 553 |
1 | Albania | 16.6 | 28.0 | 6.55 | 48.6 | 9930 | 4.49 | 76.3 | 1.65 | 4090 |
2 | Algeria | 27.3 | 38.4 | 4.17 | 31.4 | 12900 | 16.10 | 76.5 | 2.89 | 4460 |
3 | Angola | 119.0 | 62.3 | 2.85 | 42.9 | 5900 | 22.40 | 60.1 | 6.16 | 3530 |
4 | Antigua and Barbuda | 10.3 | 45.5 | 6.03 | 58.9 | 19100 | 1.44 | 76.8 | 2.13 | 12200 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
162 | Vanuatu | 29.2 | 46.6 | 5.25 | 52.7 | 2950 | 2.62 | 63.0 | 3.50 | 2970 |
163 | Venezuela | 17.1 | 28.5 | 4.91 | 17.6 | 16500 | 45.90 | 75.4 | 2.47 | 13500 |
164 | Vietnam | 23.3 | 72.0 | 6.84 | 80.2 | 4490 | 12.10 | 73.1 | 1.95 | 1310 |
165 | Yemen | 56.3 | 30.0 | 5.18 | 34.4 | 4480 | 23.60 | 67.5 | 4.67 | 1310 |
166 | Zambia | 83.1 | 37.0 | 5.89 | 30.9 | 3280 | 14.00 | 52.0 | 5.40 | 1460 |
167 rows × 10 columns
for i, row in pd.read_csv("countries/dictionary.csv").iterrows():
print(row["Column Name"] + ":")
print(" ", row["Description"])
country:
   Name of the country
child_mort:
   Death of children under 5 years of age per 1000 live births
exports:
   Exports of goods and services per capita. Given as %age of the GDP per capita
health:
   Total health spending per capita. Given as %age of GDP per capita
imports:
   Imports of goods and services per capita. Given as %age of the GDP per capita
Income:
   Net income per person
Inflation:
   The measurement of the annual growth rate of the Total GDP
life_expec:
   The average number of years a new born child would live if the current mortality patterns are to remain the same
total_fer:
   The number of children that would be born to each woman if the current age-fertility rates remain the same.
gdpp:
   The GDP per capita. Calculated as the Total GDP divided by the total population.
Normalise the data.
for name in ["child_mort", "exports", "health", "imports",
"income", "inflation", "life_expec",
"total_fer", "gdpp"]:
df[name] -= df[name].mean()
df[name] /= df[name].std()
print(df["income"].mean(), df["income"].std())
-7.711728793803483e-17 1.0
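For reference, sklearn's StandardScaler does essentially the same thing; a minimal sketch (not used below). Note that StandardScaler divides by the population standard deviation (ddof=0), whereas pandas' .std() uses the sample standard deviation (ddof=1), so the values differ very slightly.
# Equivalent normalisation via sklearn (sketch only; this notebook keeps the
# manual in-place version above). Applied to the raw columns, i.e. the data
# as loaded, before the loop above has standardised df.
from sklearn.preprocessing import StandardScaler

numeric_cols = ["child_mort", "exports", "health", "imports",
                "income", "inflation", "life_expec", "total_fer", "gdpp"]
raw = pd.read_csv("countries/data.csv")
scaled = StandardScaler().fit_transform(raw[numeric_cols])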
Visualise using PCA.
pca = PCA(n_components=2)
plottable = pca.fit_transform(df.drop(["country"], axis=1))
Plotting in 2 dimensions with PCA. There's already a clear gradient from left to right, with poorer countries off to the left. There are also weird, small, tax haven countries off on their own in the top right.
plt.figure(figsize=(8, 6), dpi=80)
plt.scatter(plottable[:,0], plottable[:,1], marker=".")
plt.xlabel("pca 1")
plt.ylabel("pca 2")
plt.grid(linestyle="--")
for name, (x, y) in zip(df["country"], plottable):
plt.text(x, y, name)
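How trustworthy is this 2-D view? A quick check (a sketch; its output isn't part of the original run) is to look at how much of the total variance the first two principal components capture.
# Fraction of the total variance captured by each of the two components.
print("explained variance ratio:", pca.explained_variance_ratio_)
print("total:", pca.explained_variance_ratio_.sum())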
def silhouette_score(X, k, clusters):
"""Calculate the silhouette score for a dataset `X`, `k` clusters and
cluster assignments `clusters`."""
silhouette_score = 0
cluster_ids, gt0_counts = np.unique(clusters, return_counts=True)
counts = np.zeros(k)
    for cluster_id, count in zip(cluster_ids, gt0_counts):
counts[cluster_id] = count
for i in range(k):
cluster = X[clusters == i]
for x in cluster:
silhouette_score += _calc_silhouette_score(x, X, cluster, clusters, counts, i, k)
return silhouette_score/len(X)
def _calc_silhouette_score(x, X, cluster, clusters, counts, i, k):
if counts[i] == 1:
return 0
# Average difference from x to the other points in its cluster.
# It's okay that we don't exclude x from the mask because the
# distance from x to x is 0, obvs.
sim_score = (1/(counts[i]-1)) * np.sum(np.sqrt(np.sum((x-cluster)**2, axis=1)))
dissim_score = min(_calc_dissim_score(x, X, clusters, counts, j)
for j in range(k)
if j != i)
return (dissim_score - sim_score)/max(sim_score, dissim_score)
def _calc_dissim_score(x, X, clusters, counts, j):
if counts[j] == 0:
return 0
# Really need to extract the euclidean distance thing into a function.
return np.sum(np.sqrt(np.sum((x-X[clusters == j])**2, axis=1))) / counts[j]
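As a sanity check on the implementation above, it can be compared against sklearn's silhouette_score on some random data; a sketch (not part of the original notebook), which should agree up to floating-point error.
# Cross-check our silhouette_score against sklearn's on random data/labels.
from sklearn.metrics import silhouette_score as sk_silhouette_score

np.random.seed(0)
X_check = np.random.normal(size=(30, 4))
labels_check = np.random.randint(0, 3, size=30)
print(silhouette_score(X_check, 3, labels_check))
print(sk_silhouette_score(X_check, labels_check))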
class KMeans:
def __init__(self, k):
self.k = k
def fit(self, X, max_rounds=100, starting_centers=None):
minmaxes = list(zip(np.min(X, axis=0), np.max(X, axis=0)))
if starting_centers is None:
self.C = np.array([[random.uniform(minval, maxval)
for minval, maxval in minmaxes]
for _ in range(self.k)])
else:
self.C = starting_centers
self.groups = np.zeros(len(X), dtype=np.uint)
# Repeatedly assign datapoints to clusters and update the
# centroids, until convergence.
self.rounds = 0
while self.rounds < max_rounds:
num_changed = self._update_centroids(X)
self.rounds += 1
if num_changed == 0:
break
self.group_sizes = dict([(group_id, 0) for group_id in range(self.k)])
unique, counts = np.unique(self.groups, return_counts=True)
for group_id, count in zip(unique, counts):
self.group_sizes[group_id] = count
def _update_centroids(self, X):
new_groups = np.zeros_like(self.groups)
for i, x in enumerate(X):
diffs = np.sum((self.C - x)**2, axis=1)
new_groups[i] = np.argmin(diffs)
num_changed = np.sum(self.groups != new_groups)
self.groups[:] = new_groups
if num_changed > 0:
for group_idx in range(self.k):
mask = self.groups == group_idx
if np.sum(mask) != 0:
self.C[group_idx] = np.mean(X[mask], axis=0)
return num_changed
def compute_sq_dist_sum(self, X):
"""Once the clusters have been established via `fit(...)`, this metric
gives the squared Euclidean distance from each point to the center of its cluster.
"""
s = 0
for i, c in enumerate(self.C):
s += np.sum((X[self.groups == i] - c)**2)
return s
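The fit method above seeds the centres uniformly at random within the data's bounding box, which is what makes "unlucky" runs possible. A common alternative is k-means++ seeding; a minimal sketch of a helper (hypothetical, not used in the rest of this notebook) whose output could be passed in via starting_centers:
def kmeans_pp_centers(X, k):
    """k-means++ seeding: spread the initial centres out by sampling each new
    centre with probability proportional to its squared distance from the
    nearest centre chosen so far."""
    centers = [X[np.random.choice(len(X))]]
    for _ in range(k - 1):
        d2 = np.min([np.sum((X - c)**2, axis=1) for c in centers], axis=0)
        centers.append(X[np.random.choice(len(X), p=d2/np.sum(d2))])
    return np.array(centers)

# e.g. model = KMeans(3); model.fit(X, starting_centers=kmeans_pp_centers(X, 3))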
Test it out on sample data
X1 = np.array([-5, 0]) + np.random.normal(size=(10,2), scale=.5)
X2 = np.array([5, 0]) + np.random.normal(size=(10,2), scale=.5)
X = np.vstack([X1, X2])
model = KMeans(2)
model.fit(X)
ranges [(-5.858868315688944, 6.716140336530863), (-0.8580526401389359, 0.6439390434900443)] starting centroids [[5.81492768 0.16871547] [5.09883677 0.08176992]]
plt.scatter(*zip(*X1), label="group 1", marker=".")
plt.scatter(*zip(*X2), label="group 2", marker=".")
plt.scatter(*zip(*model.C), label="centroids")
print("centroids", model.C)
print("rounds", model.rounds)
print("group assignments", model.groups)
plt.grid(linestyle="--")
plt.legend()
centroids [[ 5.25385166 -0.1102428 ] [-5.06643466 -0.12965083]] rounds 3 group assignments [1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]
Fit the model.
model = KMeans(2)
model.fit(df.drop(["country"], axis=1).to_numpy())
print("rounds", model.rounds)
rounds 10
Visualise again with PCA, this time coloured by group.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(df.drop(["country"], axis=1))
plt.figure(figsize=(8, 6), dpi=80)
for group_id in range(model.k):
plt.scatter(*zip(*pca_data[model.groups==group_id]),
marker=".",
label=f"group {group_id+1}")
# Apply PCA transform to centroid as well.
plt.scatter(*pca.transform([model.C[group_id]])[0],
marker="o",
label=f"centroid {group_id+1}")
plt.xlabel("pca 1")
plt.ylabel("pca 2")
plt.grid(linestyle="--")
plt.legend()
We do seem to have captured most of the wealthy countries in Group 2, and most of the poorer countries in Group 1. However, there's a significant amount of overlap, and the cutoff seems somewhat arbitrary. Perhaps we can get a more granular view by increasing the number of groups, k.
Another possibility: we may need to look at algorithms besides k-means that separate "extreme" examples from "normal" examples, as it's the "extremely poor" countries that are most likely to be in need of aid.
for group_id in range(model.k):
print(f"Group {group_id+1}:")
print(" ", ", ".join(list(df["country"][np.where(model.groups == group_id)[0]])))
group 1: Afghanistan, Algeria, Angola, Argentina, Armenia, Azerbaijan, Bangladesh, Belize, Benin, Bhutan, Bolivia, Botswana, Burkina Faso, Burundi, Cambodia, Cameroon, Cape Verde, Central African Republic, Chad, China, Comoros, Congo, Dem. Rep., Congo, Rep., Cote d'Ivoire, Dominican Republic, Ecuador, Egypt, El Salvador, Equatorial Guinea, Eritrea, Fiji, Gabon, Gambia, Ghana, Grenada, Guatemala, Guinea, Guinea-Bissau, Guyana, Haiti, India, Indonesia, Iran, Iraq, Jamaica, Jordan, Kazakhstan, Kenya, Kiribati, Kyrgyz Republic, Lao, Lesotho, Liberia, Madagascar, Malawi, Mali, Mauritania, Micronesia, Fed. Sts., Mongolia, Morocco, Mozambique, Myanmar, Namibia, Nepal, Niger, Nigeria, Pakistan, Paraguay, Peru, Philippines, Russia, Rwanda, Samoa, Senegal, Sierra Leone, Solomon Islands, South Africa, Sri Lanka, St. Vincent and the Grenadines, Sudan, Suriname, Tajikistan, Tanzania, Timor-Leste, Togo, Tonga, Turkmenistan, Uganda, Uzbekistan, Vanuatu, Venezuela, Yemen, Zambia group 2: Albania, Antigua and Barbuda, Australia, Austria, Bahamas, Bahrain, Barbados, Belarus, Belgium, Bosnia and Herzegovina, Brazil, Brunei, Bulgaria, Canada, Chile, Colombia, Costa Rica, Croatia, Cyprus, Czech Republic, Denmark, Estonia, Finland, France, Georgia, Germany, Greece, Hungary, Iceland, Ireland, Israel, Italy, Japan, Kuwait, Latvia, Lebanon, Libya, Lithuania, Luxembourg, Macedonia, FYR, Malaysia, Maldives, Malta, Mauritius, Moldova, Montenegro, Netherlands, New Zealand, Norway, Oman, Panama, Poland, Portugal, Qatar, Romania, Saudi Arabia, Serbia, Seychelles, Singapore, Slovak Republic, Slovenia, South Korea, Spain, Sweden, Switzerland, Thailand, Tunisia, Turkey, Ukraine, United Arab Emirates, United Kingdom, United States, Uruguay, Vietnam
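Following up on the note above about algorithms that separate "extreme" examples from "normal" ones: a sketch (not part of this analysis) using an off-the-shelf outlier detector. Note that it flags extremes at both ends of the spectrum, the poorest countries as well as the tiny rich outliers, so it would still need to be combined with something like a ranking on gdpp or child_mort.
# Flag roughly the 10% most "unusual" countries with an isolation forest.
from sklearn.ensemble import IsolationForest

X_all = df.drop(["country"], axis=1).to_numpy()
flags = IsolationForest(contamination=0.1, random_state=0).fit_predict(X_all)
print(", ".join(df["country"][flags == -1]))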
First, try the elbow method: what's the total squared distance from each point to the centre of its group (the within-cluster sum of squares)? Note: this may be sensitive to the random centres we start with.
X = df.drop(["country"], axis=1).to_numpy()
ks = [2, 3, 4, 5, 6]
sq_dist_sums = []
for k in ks:
    model = KMeans(k)
    model.fit(X)
    sq_dist_sums.append(model.compute_sq_dist_sum(X))
plt.plot(ks, sq_dist_sums)
plt.xlabel("k")
plt.ylabel("Squared Distance Sum")
plt.grid(linestyle="--")
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.ylim(bottom=0)
Eyeballing the plot above, the "elbow" seems to be at k=3 or k=4, though the shape varies wildly between runs, so I think it depends a lot on the centres we start with. Let's try the silhouette method, and also do multiple runs for each k to hopefully mitigate the issue of bad starting centres.
runs_per_k = 5
ks = [2, 3, 4, 5, 6]
best_Cs = []
scores = []
for k in ks:
print("k:", k)
    # Actual scores lie between -1 and +1, so this sentinel will be
    # overwritten immediately.
score = -2
C = None
for _ in range(runs_per_k):
model = KMeans(k)
model.fit(X)
new_score = silhouette_score(X, k, model.groups)
if new_score > score:
C = model.C
score = new_score
best_Cs.append(C)
scores.append(score)
k: 2
k: 3
k: 4
k: 5
k: 6
plt.plot(ks, scores)
plt.xlabel("k")
plt.ylabel("silhouette score")
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.grid(linestyle="--")
plt.ylim(bottom=-1, top=1)
Between the silhouette score and the elbow method, the ideal number of clusters seems to be k=2 or k=3. In any case, let's see what the clustering looks like with k=3 and k=4, reusing the centres that we got from the parameter testing.
def fit_and_plot(X, k, ks, best_Cs):
k_index = ks.index(k)
# Reusing the best centers that we got above, should converge in 1 round.
model = KMeans(k)
model.fit(X, starting_centers=best_Cs[k_index])
pca = PCA(n_components=2)
pca_data = pca.fit_transform(X)
plt.figure(figsize=(8, 6), dpi=80)
for group_id in range(model.k):
plt.scatter(*zip(*pca_data[model.groups==group_id]),
marker=".",
label=f"group {group_id+1}")
plt.scatter(*zip(*pca.transform(model.C)),
marker="o",
facecolor="None",
edgecolor="black",
label=f"centroids")
plt.xlabel("pca 1")
plt.ylabel("pca 2")
plt.grid(linestyle="--")
plt.legend()
for group_id in range(model.k):
print(f"Group {group_id+1}:")
print(" ", ", ".join(list(df["country"][np.where(model.groups == group_id)[0]])))
fit_and_plot(X, 3, ks, best_Cs)
Group 1: Ireland, Luxembourg, Malta, Singapore Group 2: Nigeria Group 3: Afghanistan, Albania, Algeria, Angola, Antigua and Barbuda, Argentina, Armenia, Australia, Austria, Azerbaijan, Bahamas, Bahrain, Bangladesh, Barbados, Belarus, Belgium, Belize, Benin, Bhutan, Bolivia, Bosnia and Herzegovina, Botswana, Brazil, Brunei, Bulgaria, Burkina Faso, Burundi, Cambodia, Cameroon, Canada, Cape Verde, Central African Republic, Chad, Chile, China, Colombia, Comoros, Congo, Dem. Rep., Congo, Rep., Costa Rica, Cote d'Ivoire, Croatia, Cyprus, Czech Republic, Denmark, Dominican Republic, Ecuador, Egypt, El Salvador, Equatorial Guinea, Eritrea, Estonia, Fiji, Finland, France, Gabon, Gambia, Georgia, Germany, Ghana, Greece, Grenada, Guatemala, Guinea, Guinea-Bissau, Guyana, Haiti, Hungary, Iceland, India, Indonesia, Iran, Iraq, Israel, Italy, Jamaica, Japan, Jordan, Kazakhstan, Kenya, Kiribati, Kuwait, Kyrgyz Republic, Lao, Latvia, Lebanon, Lesotho, Liberia, Libya, Lithuania, Macedonia, FYR, Madagascar, Malawi, Malaysia, Maldives, Mali, Mauritania, Mauritius, Micronesia, Fed. Sts., Moldova, Mongolia, Montenegro, Morocco, Mozambique, Myanmar, Namibia, Nepal, Netherlands, New Zealand, Niger, Norway, Oman, Pakistan, Panama, Paraguay, Peru, Philippines, Poland, Portugal, Qatar, Romania, Russia, Rwanda, Samoa, Saudi Arabia, Senegal, Serbia, Seychelles, Sierra Leone, Slovak Republic, Slovenia, Solomon Islands, South Africa, South Korea, Spain, Sri Lanka, St. Vincent and the Grenadines, Sudan, Suriname, Sweden, Switzerland, Tajikistan, Tanzania, Thailand, Timor-Leste, Togo, Tonga, Tunisia, Turkey, Turkmenistan, Uganda, Ukraine, United Arab Emirates, United Kingdom, United States, Uruguay, Uzbekistan, Vanuatu, Venezuela, Vietnam, Yemen, Zambia
fit_and_plot(X, 4, ks, best_Cs)
Group 1: Afghanistan, Angola, Benin, Botswana, Burkina Faso, Burundi, Cameroon, Central African Republic, Chad, Comoros, Congo, Dem. Rep., Congo, Rep., Cote d'Ivoire, Equatorial Guinea, Eritrea, Gabon, Gambia, Ghana, Guinea, Guinea-Bissau, Haiti, India, Iraq, Kenya, Kiribati, Lao, Lesotho, Liberia, Madagascar, Malawi, Mali, Mauritania, Mozambique, Myanmar, Namibia, Nepal, Niger, Pakistan, Rwanda, Senegal, Sierra Leone, Solomon Islands, South Africa, Sudan, Tajikistan, Tanzania, Timor-Leste, Togo, Uganda, Vanuatu, Yemen, Zambia Group 2: Nigeria Group 3: Albania, Algeria, Antigua and Barbuda, Argentina, Armenia, Australia, Austria, Azerbaijan, Bahamas, Bahrain, Bangladesh, Barbados, Belarus, Belgium, Belize, Bhutan, Bolivia, Bosnia and Herzegovina, Brazil, Brunei, Bulgaria, Cambodia, Canada, Cape Verde, Chile, China, Colombia, Costa Rica, Croatia, Cyprus, Czech Republic, Denmark, Dominican Republic, Ecuador, Egypt, El Salvador, Estonia, Fiji, Finland, France, Georgia, Germany, Greece, Grenada, Guatemala, Guyana, Hungary, Iceland, Indonesia, Iran, Ireland, Israel, Italy, Jamaica, Japan, Jordan, Kazakhstan, Kuwait, Kyrgyz Republic, Latvia, Lebanon, Libya, Lithuania, Macedonia, FYR, Malaysia, Maldives, Mauritius, Micronesia, Fed. Sts., Moldova, Mongolia, Montenegro, Morocco, Netherlands, New Zealand, Norway, Oman, Panama, Paraguay, Peru, Philippines, Poland, Portugal, Qatar, Romania, Russia, Samoa, Saudi Arabia, Serbia, Seychelles, Slovak Republic, Slovenia, South Korea, Spain, Sri Lanka, St. Vincent and the Grenadines, Suriname, Sweden, Switzerland, Thailand, Tonga, Tunisia, Turkey, Turkmenistan, Ukraine, United Arab Emirates, United Kingdom, United States, Uruguay, Uzbekistan, Venezuela, Vietnam Group 4: Luxembourg, Malta, Singapore
With k=3, the clusters are: tax havens, Nigeria, and everyone else. With k=4, the clusters are: tax havens, Nigeria, poor countries, and everyone else.
Next, the Gaussian mixture model, an alternative to k-means clustering. Assume the datapoints in each cluster follow a Gaussian distribution, as we saw with Gaussian Discriminative Analysis (iris.ipynb). The difference is that this is an unsupervised learning problem and we don't know the clusters, so we use the Expectation-Maximisation (EM) algorithm to find the parameters of the distributions, as follows:
1. Initialise the priors, means and covariance matrices of the k Gaussians.
2. E-step: for each datapoint, compute the posterior probability that it belongs to each cluster, given the current parameters.
3. M-step: re-estimate the priors, means and covariances as weighted averages over the data, using the posteriors from the E-step as weights.
4. Repeat until the log likelihood stops improving.
Once we've fit the parameters, we know the cluster that each observation is most likely to belong to, which gives us our clustering.
As with k-means, this algorithm is sensitive to randomisation, and may require multiple runs for a good fit.
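For reference, the E- and M-step updates implemented below are the standard ones for a Gaussian mixture. Writing $r_{ij} = P(z=j \mid x_i)$ for the posterior ("responsibility") of cluster $j$ for datapoint $x_i$:
$$r_{ij} = \frac{\pi_j \,\mathcal{N}(x_i \mid \mu_j, \Sigma_j)}{\sum_{m=1}^{k} \pi_m \,\mathcal{N}(x_i \mid \mu_m, \Sigma_m)}$$
$$N_j = \sum_i r_{ij}, \qquad \pi_j = \frac{N_j}{N}, \qquad \mu_j = \frac{1}{N_j}\sum_i r_{ij}\, x_i, \qquad \Sigma_j = \frac{1}{N_j}\sum_i r_{ij}\,(x_i-\mu_j)(x_i-\mu_j)^\top$$
and the log likelihood used for the stopping criterion is $\ell(\theta) = \sum_i \log \sum_{j=1}^{k} \pi_j\, \mathcal{N}(x_i \mid \mu_j, \Sigma_j)$.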
class GaussianMixture:
def __init__(self, k):
self.k = k
def fit(self, X, eps=.01):
self._init_params(X)
l = None
while True:
self._do_steps(X)
l0 = self._log_likelihood(X)
# likelihood should always increase, so difference should
# always be positive. But just in case...
if l is not None and abs(l0 - l) < eps:
break
l = l0
return self.predict(X)
def _do_steps(self, X):
self._expectation_step(X)
self._maximisation_step(X)
def _init_params(self, X):
self.N = len(X)
self.cov_reg = 1e-6*np.eye(len(X[0]))
self._init_model_params(X)
# This array stores the posterior probability distribution for each
# datapoint's latent variable.
self.P_z_given_x = np.zeros((self.N, self.k))
def _init_model_params(self, X):
# Set the means to be random datapoints.
self.means = X[np.random.choice(X.shape[0], self.k, replace=False), :]
# Seed the covariance matrices with the sample covariance matrix over
# all the data.
A = X - np.mean(X, axis=0)
cov = np.matmul(A.T, A) / self.N
self.covs = np.array([cov[:] for _ in range(self.k)])
# Use equal priors.
self.P_z = np.repeat(1/self.k, self.k)
def _expectation_step(self, X):
self._compute_P_z_given_x(X, self.P_z_given_x)
def _compute_P_z_given_x(self, X, storage_matrix):
# If there are m clusters, then...
# P(z=k|x) = P(x,z=k)/P(x)
# = P(x,z=k)/(\sum_{j=1}^{m} P(x,z=j))
# = P(z=k)P(x|z=k)/(\sum_{j=1}^{m} P(z=j)P(x|z=j))
# As with GDA, P(z=k) is the prior and P(x|z=k) is the probability
# density of the gaussian distribution that models the k-th cluster.
for j in range(self.k):
# First calculate P(x,z=j) for all x & j.
storage_matrix[:, j] = self._compute_P_x_and_zj(X, j)
# Then normalise each P(x,z=j) by dividing by the sum over all values of j.
storage_matrix /= np.sum(storage_matrix, axis=1)[:, np.newaxis]
def _compute_P_x_and_zj(self, X, j):
return self.P_z[j]*multivariate_normal.pdf(X, mean=self.means[j], cov=self.covs[j])
def _maximisation_step(self, X):
self.P_z[:] = np.sum(self.P_z_given_x, axis=0) / self.N
# Zero the covariance matrices first 'cause we're going to
# compute the new estimates by summing.
self.covs[:] = 0
for j in range(self.k):
E_samples = np.sum(self.P_z_given_x[:,j])
self.means[j] = (np.sum(self.P_z_given_x[:,j][:,None] * X, axis=0)
/ E_samples)
# There has to be a vectorised way of doing this...
for i in range(self.N):
a = X[i] - self.means[j]
self.covs[j] += self.P_z_given_x[i,j] * np.matmul(a[:,np.newaxis], a[np.newaxis,:])
# This is to make sure we don't get a singular covariance matrix when
# the Gaussian basically collapses down to contain a single datapoint.
# See: https://stats.stackexchange.com/questions/219302/singularity-issues-in-gaussian-mixture-model
self.covs[j] += self.cov_reg
self.covs[j] /= E_samples
def _log_likelihood(self, X):
# If theta represents all the parameters of the model, the log likelihood
# given a set of observations {x} is
# l(theta) = \sum_{x} log P(x|theta).
# = \sum_{x} log(\sum_{j} P(x,z=j))
# I think it's okay to use the density for P(x|z=j).
likelihood_by_x = np.zeros(self.N)
for j in range(self.k):
# Redundant computation here, we're doing the same thing in the
# expectation step.
likelihood_by_x += self._compute_P_x_and_zj(X, j)
return np.sum(np.log(likelihood_by_x))
def predict(self, X):
N = len(X)
max_likelihood = np.zeros(N)
result = np.zeros(N, dtype=np.uint)
for j in range(self.k):
likelihood = self._compute_P_x_and_zj(X, j)
mask = likelihood > max_likelihood
result[mask] = j
max_likelihood[mask] = likelihood[mask]
return result
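On the "there has to be a vectorised way of doing this" comment in _maximisation_step: the weighted sum of outer products can be done in one einsum call. A sketch of an equivalent helper (assuming the same shapes as in the class above; not wired in):
def weighted_scatter(X, mean, resp):
    """Weighted sum of outer products, sum_i resp[i] * (x_i - mean)(x_i - mean)^T."""
    A = X - mean                                  # shape (N, d)
    return np.einsum("i,ij,ik->jk", resp, A, A)   # shape (d, d)

# Inside _maximisation_step this would replace the loop over i:
#   self.covs[j] = weighted_scatter(X, self.means[j], self.P_z_given_x[:, j]) + self.cov_reg
#   self.covs[j] /= E_samples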
Test on some relatively simple data: actual Gaussian distributions.
u1 = np.array([2., 3.])
cov1 = np.eye(2, dtype=float)
N1 = 100
u2 = np.array([-2., -2.])
cov2 = np.array([[2., .3], [.3, 1.]])
N2 = 150
X = np.vstack([
np.random.multivariate_normal(u1, cov1, N1),
np.random.multivariate_normal(u2, cov2, N2),
])
y = np.concatenate([np.repeat(0, N1), np.repeat(1, N2)])
plt.scatter(*zip(*X), c=y)
plt.grid(linestyle="--")
plt.xlabel("x")
plt.ylabel("y")
model = GaussianMixture(2)
model.fit(X)
Oops, the group IDs of the model don't match those we assigned ourselves.
pred = model.predict(X)
correct = (pred == y)
print(np.sum(correct))
1
Fix that.
pred = (pred+1) % 2
correct = (pred == y)
print(np.sum(correct))
249
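Swapping the labels by hand works here because k=2. For general k, a sketch of a label alignment using the Hungarian algorithm on the confusion matrix (scipy's linear_sum_assignment; the align_labels helper is hypothetical, not part of the notebook):
from scipy.optimize import linear_sum_assignment

def align_labels(pred, y, k):
    # confusion[t, p] = number of points with true label t and predicted label p.
    confusion = np.zeros((k, k), dtype=int)
    for t, p in zip(y, pred):
        confusion[t, p] += 1
    # Find the permutation of predicted labels that maximises agreement with y.
    rows, cols = linear_sum_assignment(-confusion)
    mapping = {p: t for t, p in zip(rows, cols)}
    return np.array([mapping[p] for p in pred])

# e.g. pred = align_labels(pred, y, 2)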
Plotting the means of the two clusters as well as any incorrectly-classified points. We see that 1 outlier from the yellow group got classified as part of the purple group. Also, the predicted means came very close to the real ones.
print("Likelihood:", model._log_likelihood(X))
plt.scatter(*zip(*X[correct]), c=y[correct], label="correct")
plt.scatter(*zip(*model.means), label="model means")
plt.scatter(*zip(*np.vstack([u1, u2])), label="real means")
plt.scatter(*zip(*X[~correct]), marker="x", c=pred[~correct], label="incorrect")
plt.grid(linestyle="--")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
Likelihood: -905.1927796905122
X = df.drop(["country"], axis=1).to_numpy()
runs_per_k = 5
ks = [2, 3, 4, 5, 6]
best_clusterings = []
scores = []
for k in ks:
print("k:", k)
score = -2
best_clustering = None
for _ in range(runs_per_k):
model = GaussianMixture(k)
clustering = model.fit(X)
new_score = silhouette_score(X, k, clustering)
if new_score > score:
best_clustering = clustering
score = new_score
best_clusterings.append(best_clustering)
scores.append(score)
k: 2
k: 3
k: 4
k: 5
k: 6
The silhouette score seems to indicate that 2 clusters is optimal.
plt.plot(ks[:len(scores)], scores)
plt.xlabel("k")
plt.ylabel("silhouette score")
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.grid(linestyle="--")
plt.ylim(bottom=-1, top=1)
Interesting! k=2 is the best fit according to the silhouette score, but the actual clustering it produces basically implies that all the datapoints are generated by a single Gaussian. This actually makes sense -- eyeballing the data, it DOES seem to come from a single Gaussian! k=3 is closer to the clustering produced by k-means: haves, have-nots, and tax havens. For k>3, the clusters are harder to distinguish (spatially) in 2 dimensions. An alternative dimensionality reduction method might show the clusters better.
for k, clustering in zip(ks, best_clusterings):
for i in range(k):
plt.scatter(*zip(*pca_data[clustering==i]),
marker=".",
label=f"group {i+1}")
plt.xlabel("pca 1")
plt.ylabel("pca 2")
plt.grid(linestyle="--")
plt.title(f"k={k}")
plt.legend()
plt.show()
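As suggested above, an alternative dimensionality-reduction method might separate the clusters more clearly than PCA. A sketch using t-SNE (sklearn.manifold.TSNE; not run as part of the original notebook, and t-SNE distances are only locally meaningful, so this is for visualisation only):
from sklearn.manifold import TSNE

tsne_data = TSNE(n_components=2, random_state=0).fit_transform(X)
for k, clustering in zip(ks, best_clusterings):
    for i in range(k):
        mask = clustering == i
        if np.any(mask):
            plt.scatter(*zip(*tsne_data[mask]), marker=".", label=f"group {i+1}")
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    plt.grid(linestyle="--")
    plt.title(f"k={k}")
    plt.legend()
    plt.show()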
We've seen how the k-means algorithm and the gaussian mixture model can be applied to clustering of country data. According to the metrics we examined, the optimal number of clusters seems to be 2-4, though I'm not sure that this is an actually useful way to divide countries for the purposes of allocating NGO resources.
Future work: