Demo entry 6899790

python

   

Submitted by wl884 on Oct 08, 2019 at 23:06
Language: Python 3. Code size: 2.6 kB.

# -*- coding: utf-8 -*-
#download 20 newsgroups dataset
from sklearn.datasets import fetch_20newsgroups
# Fetch the training split; the fixed random_state makes the shuffle repeatable.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

# The values below are printed explicitly: a bare expression is only echoed
# in an interactive session and is silently discarded when run as a script.

# category names of news
print(twenty_train.target_names)
# number of news files
print(len(twenty_train.filenames))
# integer class labels of the first ten documents ...
print(twenty_train.target[:10])
# ... and the category name each of those labels maps to
for label in twenty_train.target[:10]:
    print(twenty_train.target_names[label])

#transform documents to feature vectors
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents and build the
# document-term count matrix in one pass.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train.data)
# Printed explicitly: a bare `.shape` expression is discarded when this file
# runs as a script rather than in an interactive session.
print(x_train_counts.shape)

## compute term frequencies ("tf") and "tfidf" weights

from sklearn.feature_extraction.text import TfidfTransformer
# Fit the tf-idf weighting on the training counts and re-weight them.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
# Printed explicitly: a bare `.shape` expression is discarded when this file
# runs as a script rather than in an interactive session.
print(x_train_tfidf.shape)

##train classifier

from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with additive smoothing alpha=0.01, trained on the
# tf-idf features. fit() returns the estimator itself, so constructing and
# fitting in two steps leaves `clf` bound to the same fitted model.
clf = MultinomialNB(alpha=0.01)
clf.fit(x_train_tfidf, twenty_train.target)

##prediction

# Load the held-out test split (shuffled with a fixed seed).
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

# Run the raw test documents through the already-fitted vectorizer and
# tf-idf transformer. Only transform() is called, so nothing is re-fitted
# on the test data.
d_test = tfidf_transformer.transform(count_vect.transform(twenty_test.data))

# Predicted class index for every test document.
predicted = clf.predict(d_test)

import numpy as np


##classification accuracy

# Fraction of test documents whose predicted label equals the true label.
# The value is stored and printed: as a bare expression it would be
# discarded when this file runs as a script.
accuracy = np.mean(predicted == twenty_test.target)
print('Accuracy: %0.4f' % accuracy)

from sklearn import metrics

# Print the per-class classification report, with rows labelled by
# newsgroup name instead of class index.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

from sklearn.metrics import confusion_matrix
#confusion matrix
# Entry [i, j] counts test documents of true class i predicted as class j.
# The matrix is stored and printed: as a bare expression it would be
# discarded when this file runs as a script.
conf_mat = confusion_matrix(twenty_test.target, predicted)
print(conf_mat)

from sklearn.preprocessing import label_binarize
n_classes = len(twenty_train.target_names)
# Per-class probability estimates for every test document (one column per class).
pro_pred = clf.predict_proba(d_test)
# One-hot encode the true labels so each class can be scored one-vs-rest.
# `classes` is keyword-only in current scikit-learn; passing it positionally
# raises TypeError.
binarized_target = label_binarize(twenty_test.target, classes=np.arange(n_classes))

#compute ROC curve
from sklearn.metrics import roc_curve,auc
# One-vs-rest ROC data for every class, keyed by class index.
fpr, tpr, roc_auc = {}, {}, {}

for class_idx in range(n_classes):
    # Column `class_idx` holds the binary truth / predicted probability
    # for "this class versus all others".
    y_true = binarized_target[:, class_idx]
    y_score = pro_pred[:, class_idx]
    fpr[class_idx], tpr[class_idx], _thresholds = roc_curve(y_true, y_score)
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

import matplotlib.pyplot as plt
# Enable inline figures when running under IPython/Jupyter. `get_ipython`
# is only defined there, so under a plain `python` interpreter the lookup
# raises NameError; in that case keep matplotlib's default backend.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

#plot roc

# Draw one ROC figure per class.
line_width = 2
for class_idx in range(n_classes):
    plt.figure()
    # ROC curve of this class, with its AUC shown in the legend.
    plt.plot(
        fpr[class_idx],
        tpr[class_idx],
        color='darkorange',
        lw=line_width,
        label='ROC curve(area=%0.2f)' % roc_auc[class_idx],
    )
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve - %d' % (class_idx))
    plt.legend(loc="lower right")
    plt.show()

This snippet took 0.00 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).