This scikit-learn CountVectorizer example is from PyCon Dublin 2016. For further information, please visit this link. The dataset is the SMS Spam Collection from UCI.
In [2]:
# Read the raw corpus one line per message. The context manager closes the
# file handle as soon as the list is built instead of leaking it.
with open('smsspamcollection/SMSSpamCollection') as sms_file:
    messages = [line.rstrip() for line in sms_file]
In [3]:
print (len(messages))
In [5]:
# Show the first ten raw lines, each prefixed with its position and followed
# by a blank separator line so that long messages are easy to tell apart.
for num, message in enumerate(messages[:10]):
    print(num, message)
    print('\n')
In [6]:
import pandas
In [7]:
# Re-load the corpus as a two-column table. The fields are separated by a tab
# character, so the separator must be the escape '\t' (a bare 't' would split
# every line on the letter "t").
messages = pandas.read_csv('smsspamcollection/SMSSpamCollection',
                           sep='\t', names=['labels', 'message'])
In [9]:
messages.head()
Out[9]:
In [10]:
messages.describe()
Out[10]:
In [11]:
messages.info()
In [12]:
messages.groupby('labels').describe()
Out[12]:
In [13]:
messages['length'] = messages['message'].apply(len)
messages.head()
Out[13]:
In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [16]:
messages['length'].plot(bins=50,kind = 'hist')
Out[16]:
In [17]:
messages['length'].describe()
Out[17]:
In [20]:
messages[messages['length'] == 910]['message'].iloc[0]
Out[20]:
In [22]:
messages.hist(column='length',by ='labels',bins=50,figsize = (10,4))
Out[22]:
In [23]:
import string
In [24]:
mess = 'Sample message ! Notice: it has punctuation'
In [25]:
string.punctuation
Out[25]:
In [26]:
nopunc = [char for char in mess if char not in string.punctuation]
In [27]:
nopunc = ''.join(nopunc)
In [28]:
nopunc
Out[28]:
In [30]:
from nltk.corpus import stopwords
In [31]:
stopwords.words('english')[0:10]
Out[31]:
In [32]:
nopunc.split()
Out[32]:
In [33]:
clean_mess = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
In [34]:
clean_mess
Out[34]:
In [35]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text

    Stopword matching is case-insensitive; the returned words keep their
    original casing.
    """
    # Drop every character listed in string.punctuation and rebuild the string.
    nopunc = ''.join(char for char in mess if char not in string.punctuation)
    # Build the stopword set once per call rather than once per word: the
    # lookup below then costs O(1) instead of re-reading and scanning a list.
    stop_words = set(stopwords.words('english'))
    # Keep only the words that are not stopwords.
    return [word for word in nopunc.split() if word.lower() not in stop_words]
In [36]:
messages.head()
Out[36]:
In [37]:
messages['message'].head(5).apply(text_process)
Out[37]:
In [40]:
from sklearn.feature_extraction.text import CountVectorizer
In [44]:
bow_transformer = CountVectorizer(analyzer=text_process)
In [45]:
bow_transformer.fit(messages['message'])
Out[45]:
In [46]:
message4 = messages['message'][3]
In [47]:
print (message4)
In [48]:
bow4 = bow_transformer.transform([message4])
In [49]:
print (bow4)
In [50]:
print (bow_transformer.get_feature_names()[4073])
In [51]:
print (bow_transformer.get_feature_names()[4068])
In [52]:
print (bow_transformer.get_feature_names()[9554])
In [53]:
messages_bow = bow_transformer.transform(messages['message'])
In [54]:
print ('Shape of Sparse Matrix: ', messages_bow.shape)
print ('Amount of Non-Zero occurences: ', messages_bow.nnz)
print ('sparsity: %.2f%%' % (100.0 * messages_bow.nnz /
(messages_bow.shape[0] * messages_bow.shape[1])))
In [55]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)
In [56]:
tfidf4 = tfidf_transformer.transform(bow4)
In [57]:
print (tfidf4)
In [58]:
print (tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
print (tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])
In [59]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
In [60]:
print (messages_tfidf.shape)
In [61]:
from sklearn.naive_bayes import MultinomialNB
In [62]:
spam_detect_model = MultinomialNB().fit(messages_tfidf,messages['labels'])
In [64]:
print ('Predicted: ',spam_detect_model.predict(tfidf4)[0] )
print ('Expected: ',messages['labels'][3])
In [65]:
all_predictions = spam_detect_model.predict(messages_tfidf)
print (all_predictions)
In [67]:
from sklearn.metrics import classification_report
print (classification_report(messages['labels'], all_predictions))
In [69]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. The call is wrapped in parentheses
# so the assignment can span lines without a trailing backslash.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'], messages['labels'], test_size=0.2)
print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))
In [70]:
from sklearn.pipeline import Pipeline
In [71]:
pipeline = Pipeline([('bow',CountVectorizer(analyzer =text_process)),
('tfidf',TfidfTransformer()),
('classifier',MultinomialNB())])
In [72]:
pipeline.fit(msg_train,label_train)
Out[72]:
In [73]:
predictions = pipeline.predict(msg_test)
In [74]:
# classification_report expects (y_true, y_pred): pass the held-out labels
# first, otherwise precision and recall are reported transposed.
print(classification_report(label_test, predictions))
It was running perfectly when I used it, and the results were up to the mark!
bow_transformer.fit(messages['message']) produces the following error in Python 3.7, expecting the index to be an integer, not a string:
Traceback (most recent call last):
File "C:\Users\NLP\AppData\Local\Programs\Python\Python37-32\NLP_Programs\clean.py", line 39, in <module>
bow_transformer.fit(posts['post'])
File "C:\Users\NLP\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\series.py", line 767, in __getitem__
result = self.index.get_value(self, key)
File "C:\Users\NLP\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\indexes\base.py", line 3118, in get_value
tz=getattr(series.dtype, 'tz', None))
File "pandas\_libs\index.pyx", line 106, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 114, in pandas._libs.index.IndexEngine.get_value
File "pandas\_libs\index.pyx", line 164, in pandas._libs.index.IndexEngine.get_loc
KeyError: 'post'
Error
'utf-8' codec can't decode byte 0xe5 in position 135: invalid continuation byte
after running the line:
messages = pd.read_csv('spam.csv', sep='\t', names=['labels', 'message'])
This has to be corrected by giving the proper encoding, as below:
messages = pd.read_csv('spam.csv', names=['labels', 'message'], encoding='latin1')
You passed the arguments in the wrong order in the last step, "classification report".
It should be: classification_report(y_true=label_test, y_pred=predictions)
so the recall for spam is only 0.71.
Error
NotFittedError: CountVectorizer - Vocabulary wasn't fitted.