SMS Spam Collection Dataset

A dataset of 5572 text messages, to be classified as spam or ham (i.e. not spam).

See: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset/data

Our approach is to implement a Naive Bayes classifier, using the training set to estimate the a priori probability of a text being spam or ham, as well as the probability of particular words appearing in spam/ham texts. From there, Bayes' rule gives us the probability of a message being spam or not.

Imports
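
A minimal sketch of the stack the snippets below assume (numpy and pandas; the actual notebook may pull in more):

```python
import re

import numpy as np
import pandas as pd
```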

Read the dataset and do some preprocessing
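
A sketch of the loading step, assuming the Kaggle file layout (spam.csv, latin-1 encoded, label in column v1 and text in column v2):

```python
# Assumed Kaggle layout; the extra unnamed columns in the CSV are dropped.
df = pd.read_csv("spam.csv", encoding="latin-1")[["v1", "v2"]]
df.columns = ["label", "text"]
df["spam"] = (df["label"] == "spam").astype(int)  # 1 = spam, 0 = ham
```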

Split into train/test sets.

75% train, 25% test.
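
One way to do it, as a sketch (a fixed seed keeps the split reproducible; sklearn's train_test_split would be an equivalent one-liner):

```python
shuffled = df.sample(frac=1, random_state=0).reset_index(drop=True)
cut = int(0.75 * len(shuffled))          # 75% train
train, test = shuffled.iloc[:cut], shuffled.iloc[cut:]
```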

Feature selection and extraction.

We use one type of feature only: boolean values representing whether particular strings are in the text. Each of the following substrings corresponds to a single boolean feature:
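
As a purely illustrative stand-in for that list (these particular substrings are hypothetical, apart from the 8+-digit pattern, which the analysis at the end refers to):

```python
# Hypothetical word features -- common spam markers, not the actual list.
WORD_FEATURES = ["free", "win", "winner", "cash", "prize", "urgent",
                 "claim", "contact", "txt", "call"]

# A non-word feature: fires on runs of 8 or more digits (phone numbers).
LONG_NUMBER = re.compile(r"\d{8,}")
```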

This class abstracts over which feature gets mapped to which index in the features vector. Every feature has a label; for words we can use the word itself, and we can give special names to non-words.
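
A sketch of such a class (the name FeatureIndex and its shape are assumptions):

```python
class FeatureIndex:
    """Maps each feature label to a fixed position in the feature vector."""

    def __init__(self, word_features):
        # Words are labelled by the word itself; the digit-run feature
        # gets the special name "8+digits".
        self.labels = list(word_features) + ["8+digits"]
        self.index = {label: i for i, label in enumerate(self.labels)}

    def __len__(self):
        return len(self.labels)
```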

Actually computing the feature vector.
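
One way to compute it, given the sketched FeatureIndex:

```python
def extract_features(text, fi):
    """Boolean vector: x[i] = 1 iff feature i's substring/pattern occurs."""
    x = np.zeros(len(fi), dtype=int)
    for word in WORD_FEATURES:
        if word in text.lower():
            x[fi.index[word]] = 1
    if LONG_NUMBER.search(text):
        x[fi.index["8+digits"]] = 1
    return x
```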

Testing out the feature extraction...
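
For instance, with the hypothetical features above:

```python
fi = FeatureIndex(WORD_FEATURES)
x = extract_features("FREE entry! Call 08001234567 to claim", fi)
print([fi.labels[i] for i in np.flatnonzero(x)])
# ['free', 'claim', 'call', '8+digits']
```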

Naive Bayes implementation.

Here's the math. If y is the label (1 if spam and 0 if not) and x is the vector of features, then the probability of something being spam is

P(y=1|x) = P(x,y=1)/P(x)
         = P(y=1)P(x|y=1)/(P(y=1)P(x|y=1) + P(y=0)P(x|y=0))

P(y) can be estimated by the proportion of spam in the training set.

Note that, by the "naive" conditional-independence assumption, P(x|y=1) ~= \prod_{i=1}^D P(x_i|y=1), where D is the number of features. For each word i, P(x_i|y=1) can be estimated as the rate of occurrence of that word among the spam texts.

Laplace smoothing is used in the estimate of P(x_i|y=1). Counting within the spam class, the estimate is:

(#TextsContainingWord+1)/(#Texts+2)

The +1 ensures that, if #TextsContainingWord is 0, the estimated probability isn't 0 (the word could just be very rare). Similarly, if a word occurs in every single text, the +2 ensures that the probability isn't 1.
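
A sketch of a classifier implementing the math above (the class shape and the log-space computation are my choices, not necessarily the notebook's):

```python
class NaiveBayes:
    """Bernoulli Naive Bayes with Laplace smoothing, per the math above."""

    def fit(self, X, y):
        X, y = np.asarray(X), np.asarray(y)
        self.p_spam = y.mean()  # P(y=1), the a priori spam probability
        n1 = y.sum()
        n0 = len(y) - n1
        # Laplace-smoothed occurrence rate of each feature in each class:
        # (#texts in class containing the word + 1) / (#texts in class + 2)
        self.theta1 = (X[y == 1].sum(axis=0) + 1) / (n1 + 2)
        self.theta0 = (X[y == 0].sum(axis=0) + 1) / (n0 + 2)
        return self

    def predict_proba(self, x):
        """P(y=1|x) for a single boolean feature vector x."""
        x = np.asarray(x)
        # Sum logs instead of multiplying probabilities, to avoid underflow.
        log1 = np.log(self.p_spam) + np.where(
            x == 1, np.log(self.theta1), np.log(1 - self.theta1)).sum()
        log0 = np.log(1 - self.p_spam) + np.where(
            x == 1, np.log(self.theta0), np.log(1 - self.theta0)).sum()
        m = max(log1, log0)  # shift before exponentiating, for stability
        e1, e0 = np.exp(log1 - m), np.exp(log0 - m)
        return e1 / (e1 + e0)

    def predict(self, x):
        return int(self.predict_proba(x) >= 0.5)
```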

Testing out on a sample case. I manually verified the results. It's not the ideal test case because the a priori probabilities are equal.
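
A stand-in case with the same property (equal priors: 2 spam, 2 ham), small enough to check against the formulas by hand:

```python
X_toy = np.array([[1, 0], [1, 1], [0, 0], [0, 1]])
y_toy = np.array([1, 1, 0, 0])
nb_toy = NaiveBayes().fit(X_toy, y_toy)
print(nb_toy.predict_proba(np.array([1, 0])))  # 0.75, checkable by hand
```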

Bringing it all together and evaluating on the test set.
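
Putting the sketches together, the end-to-end run could look like:

```python
fi = FeatureIndex(WORD_FEATURES)
X_train = np.array([extract_features(t, fi) for t in train["text"]])
X_test = np.array([extract_features(t, fi) for t in test["text"]])

nb = NaiveBayes().fit(X_train, train["spam"].to_numpy())
preds = np.array([nb.predict(x) for x in X_test])
accuracy = (preds == test["spam"].to_numpy()).mean()
print(f"test accuracy: {accuracy:.3f}")
```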

Success! An accuracy of almost 99% on the test set.

Analysis of results.

A priori, about 13.5% of the texts are spam. Declaring everything to be not-spam would already give us 86.5% accuracy, so we're doing better than that baseline, though not by as much as the raw 99% figure might suggest.

Precision (fraction of texts classified as spam that are actually spam), recall (fraction of actual spam that is correctly identified), and confusion matrix.
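
These follow directly from the test predictions (sklearn.metrics would give the same numbers; this sketch stays with numpy):

```python
y_true = test["spam"].to_numpy()
tp = int(((preds == 1) & (y_true == 1)).sum())  # spam caught
fp = int(((preds == 1) & (y_true == 0)).sum())  # ham flagged as spam
fn = int(((preds == 0) & (y_true == 1)).sum())  # spam missed
tn = int(((preds == 0) & (y_true == 0)).sum())  # ham passed through

precision = tp / (tp + fp)
recall = tp / (tp + fn)
confusion = np.array([[tn, fp],
                      [fn, tp]])
```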

13 spam texts are incorrectly classified as ham, 2 ham texts are incorrectly classified as spam.

Taking a look at the misclassified texts. One of the false positives (ham incorrectly classified as spam) is a chain text (thanks to ChatGPT for helping me remember that term) and could arguably be considered spam. The other is tricky because it has characteristics of spam, e.g. the word "contact".

On the false negative side, i.e. undetected spam, some of the texts have misspellings that probably help them evade detection ("wining"); we could use basic spelling correction to counter this. Some of the spam has URLs or other types of links, so a feature for those would be helpful. One text splits up the phone number - "0800 505060" - which then doesn't trigger our "8+-digit" feature.

Retroactively going back and improving the feature selection would risk overfitting to this dataset. If we had been more careful, we could have tried the model on a held-out subset of the texts to figure out what sort of spam evades detection, and improved the feature engineering from there. That said, the dataset may not be large enough for that.

Seeing the prediction for texts that consist of a single word. This can tell us which are the "most spammy" words, according to the model.
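
Since only known substrings affect the prediction, a single-word text reduces to a feature vector with at most one entry set, so we can just score each feature on its own and rank:

```python
scores = []
for label in fi.labels:
    x = np.zeros(len(fi), dtype=int)
    x[fi.index[label]] = 1        # a "text" containing only this feature
    scores.append((nb.predict_proba(x), label))
for p, label in sorted(scores, reverse=True):
    print(f"{p:.3f}  {label}")
```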

To explain the strangely low probabilities for single-word texts, we look at P(0^D|y), where we have D features and 0^D is the all-zero vector. This shows us that spam messages are unlikely to have no known words, perhaps because they tend to be longer. A clever spammer could craft a short spam text, which would be harder to detect.
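
Under the model, P(0^D|y) is just the product of (1 - P(x_i=1|y)) over all features; as a sketch:

```python
# Computed in log space, consistent with predict_proba above.
p_zero_spam = np.exp(np.log(1 - nb.theta1).sum())
p_zero_ham = np.exp(np.log(1 - nb.theta0).sum())
print(p_zero_spam, p_zero_ham)  # expect the spam value to be much smaller
```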

Confirming this hunch -- spam messages are longer and tend to have more features present.
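
One way to check, with the pipeline sketched above:

```python
# Average raw length and average number of active features, per class.
print(df.groupby("spam")["text"].apply(lambda s: s.str.len().mean()))
n_feats = pd.Series([extract_features(t, fi).sum() for t in df["text"]])
print(n_feats.groupby(df["spam"]).mean())
```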

So, how can a spammer circumvent the filter? Use a thesaurus to avoid spam-indicative words, and favour words that indicate ham instead. Use misspellings: "cal1", "s3x", "one hundred" instead of "100". And keep messages short!

Future work.