0
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
import sys
sys.getdefaultencoding()
import os
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
class VoteClassifier(ClassifierI):
 def __int__(self, *classifiers):
 self._classifiers = classifiers
 def classify(self, features):
 votes = [ ]
 for c in self._classifiers:
 v = c.classify(features)
 votes.append(v)
 return mode(votes)
 def confidence(self, features):
 votes = [ ]
 for c in self._classifiers:
 v = c.classify(features)
 votes.append(v)
 choice_votes = votes.count(mode(votes))
 conf = choice_votes / len(votes)
 return conf
short_pos = os.open("positive.txt", os.O_RDONLY).read()
short_neg = os.open("negative.txt", os.O_RDONLY).read()
documents = [ ]
for r in short_pos.split('\n'):
 documents.append( (r, "pos") )
for r in short_neg.split('\n'):
 documents.append( (r, "neg") )
 all_words = [ ]
 short_pos_words = word_tokenize(short_pos)
 short_neg_words = word_tokenize(short_neg)
 for w in short_pos_words:
 all_words.append(w.lower())
 for w in short_neg_words:
 all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:5000]
def find_features(document):
 words = set(document)
 features = {}
 for w in word_features:
 features[w] = (w in words) 
 return features
# print ((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents] random.shuffle(featuresets)
#training with increased data collection 
training_set = featuresets[:10000]
#testing with increased data Collection
testing_set = featuresets[10000:]
#define and train classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)
#testing classifier
#print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
#show the 15 most valuable words when it comes to positive or negative reviews
#classifier.show_most_informative_features(15)
#saving classifier
#save_classifier = open("naivebayes.pickle", "wb")
#pickle.dump(classifier, save_classifier)
#save_classifier.close()
#loading classifier
#classifier_f = open("naivebayes.pickle", "rb")
#classifier = pickle.load(classifier_f)
#classifier_f.close()
print("Original Naive Bayes Alogrithm acurracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
 classifier.show_most_informative_features(15)
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:", nltk.classify.accuracy(MNB_classifier, testing_set))
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:", nltk.classify.accuracy(BernoulliNB_classifier, testing_set))
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDclassifier accuracy percent:", nltk.classify.accuracy(SGDClassifier_classifier, testing_set))
#SVC_classifier = SklearnClassifier(SVC())
#SVC_classifier.train(training_set)
#print("SVC accuracy percent:", nltk.classify.accuracy(SVC_classifier, testing_set))
voted_classifier = VoteClassifier(classifier, 
 SGDClassifier_classifier,
 MNB_classifier, 
 BernoulliNB_classifier )
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, training_set))*100)
#print ("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence%:", voted_classifier.confidence(testing_set[0][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence%:", voted_classifier.confidence(testing_set[1][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence%:", voted_classifier.confidence(testing_set[2][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence%:", voted_classifier.confidence(testing_set[3][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence%:", voted_classifier.confidence(testing_set[4][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence%:", voted_classifier.confidence(testing_set[5][0])*100)

When running the above code I get the error,

short_pos = os.open("positive.txt", os.O_RDONLY).read()
AttributeError: 'int' object has no attribute 'read'

Why is this error occurring, and how do I prevent this error from appearing again?

willeM_ Van Onsem
482k33 gold badges484 silver badges624 bronze badges
asked Feb 14, 2017 at 14:06

1 Answer 1

2

It is because you are trying to call .read() method on a return value from os.open(), which returns an int (file descriptor), not a file-like object.

I think what you meant to do was using a simple

with open('filename.txt', 'r') as f:
 text = f.read()

Or if you really want a one-liner:

text = open('filename.txt', 'r').read()

Those two lines:

short_pos = os.open("positive.txt", os.O_RDONLY).read()
short_neg = os.open("negative.txt", os.O_RDONLY).read()

Should be changed to:

with open("positive.txt", 'r') as f:
 short_pos = f.read()
with open("negative.txt", 'r') as f:
 short_neg = f.read()

Also, instead of reading the contents of the whole file and then splitting those by a \n like this:

for r in short_pos.split('\n'): # This .split()
 documents.append( (r, "pos") )
for r in short_neg.split('\n'): # And this .split()
 documents.append( (r, "neg") )

instead of using str.split() it would be much better idea to read the file using .readlines() instead of read() in the first place. The former will return a list of lines from the file stream, and you don't have to worry about different line ending schemes that different operating systems use.

answered Feb 14, 2017 at 14:08
Sign up to request clarification or add additional context in comments.

2 Comments

Thank You very much. But I now have the error: return codecs.ascii_decode(input, self.errors)[0] UnicodeDecodeError: 'ascii' codec can't decode byte 0xf3 in position 4645: ordinal not in range(128)
@A.Lona It doesn't seem to originate from the code that you posted, therefore I can't really be certain, but I've got a hunch, that you are trying to decode something that already is decoded into ascii/utf. iIf the input variable is something from the file that you are opening, then if you were using os.open() to open this file, the output would be a byte-string, which you would have to decode with some kind of codec. You are now opening your file with a simple open(), therefore the output is already a standard string that doesn't require any decoding. Could you post full stack trace?

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.