|
# Bag-of-words (BoW) demo script.
#
# Reads free text from stdin, lower-cases it, splits it into sentences with
# NLTK, turns the sentences into a document-term count matrix with
# scikit-learn's CountVectorizer, prints the intermediate results and writes
# the final table to an Excel workbook.
#
# NOTE: nltk's sent_tokenize needs the NLTK "punkt" tokenizer data to be
# installed (e.g. via nltk.download); it raises LookupError otherwise.
from pathlib import Path

from sklearn.feature_extraction.text import CountVectorizer
import nltk
import pandas as pd  # tabular (row/column) data storage, analysis and processing
from openpyxl import Workbook

# Destination workbook for the resulting document-term table.
OUTPUT_PATH = './Bag of words model/bowp.xlsx'

sentences = input("Enter your sentences: ")
# eg. My name is sanya. I am caring and loving. I am generous.

# Normalization: lower-case so that "Name" and "name" become the same feature.
sentences = sentences.lower()

# Split the text into individual sentences; each one is a "document" in the BoW.
tokenized_sentences = nltk.tokenize.sent_tokenize(sentences)
print(tokenized_sentences)

# Strip every "." from each sentence before vectorizing.
tokenized_sentences1 = [sentence.replace(".", "") for sentence in tokenized_sentences]
print(tokenized_sentences1)

# CountVectorizer learns the vocabulary and counts word occurrences per sentence.
countVectorizer = CountVectorizer()
# fit_transform takes the list of sentences and returns a sparse count matrix
# with one row per sentence and one column per vocabulary word.
tmpbow = countVectorizer.fit_transform(tokenized_sentences1)
print("tmpbow \n", tmpbow)  # sparse representation of the bag-of-words model

# Dense 2-D array form of the same counts.
bow = tmpbow.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps the printed output a plain list, as it was before.
if hasattr(countVectorizer, "get_feature_names_out"):
    feature_names = list(countVectorizer.get_feature_names_out())
else:
    feature_names = countVectorizer.get_feature_names()

print("Vocabulary = ", countVectorizer.vocabulary_)
# Features are the column names of the document-term matrix.
print("Features = ", feature_names)
print("BOW ", bow)

# A DataFrame is analogous to a spreadsheet: one row per sentence, one column
# per vocabulary word.
cv_dataframe = pd.DataFrame(bow, columns=feature_names)

print("cv_dataframe is below\n", cv_dataframe)

# to_excel does not create missing directories, so make sure the target
# folder exists before writing.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
cv_dataframe.to_excel(OUTPUT_PATH, sheet_name='data')
0 commit comments