
Commit 2bb1705

Image captioning model
1 parent 2717a2c commit 2bb1705

File tree

4 files changed: +291 −39 lines changed

Image-Captioning/image-captioning.py

Lines changed: 269 additions & 0 deletions
# Read text captions
def readTextFile(path):
    with open(path) as f:
        captions = f.read()
    return captions

# Location of captions
captions = readTextFile('files/captions.txt')
captions = captions.split("\n")[1:-1]
print(len(captions))  # Total captions

# Creating dictionary - {"image name": ["caption1", "caption2", ...]}
description = {}
for x in captions:
    parts = x.split(',', 1)  # split on the first comma only; captions themselves may contain commas
    img_name = parts[0][:-4]  # strip the ".jpg" extension
    comment = parts[1]
    if description.get(img_name) is None:
        description[img_name] = []
    description[img_name].append(comment)

# All libraries
import re
import collections
import numpy as np
import matplotlib.pyplot as plt
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.utils import load_img, img_to_array, to_categorical
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, add

# Data cleaning
# Don't remove stopwords: the model has to generate complete, meaningful sentences.
# Stemming is also not applicable, because the generated text needs correctly spelled words.
# Remove numbers and punctuation, and lowercase everything.
def clean_text(sentence):
    sentence = sentence.lower()
    sentence = re.sub("[^a-z]+", " ", sentence)
    sentence = sentence.split()
    sentence = [s for s in sentence if len(s) > 1]  # drop one-letter tokens
    sentence = " ".join(sentence)
    return sentence

# Clean all captions
for key, caption_list in description.items():
    for i in range(len(caption_list)):
        caption_list[i] = clean_text(caption_list[i])
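
# Example (sketch): clean_text("A child, in 2 pink dresses!") returns
# "child in pink dresses" - digits, punctuation and one-letter tokens are dropped.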

# Total number of words across all the captions
total_words = []
for key in description.keys():
    for des in description[key]:
        total_words.extend(des.split())
print(len(total_words))

# Filter words from the vocab according to a threshold frequency
counter = collections.Counter(total_words)
freq_cnt = dict(counter)

# Sort this dictionary according to frequency count
sorted_freq_cnt = sorted(freq_cnt.items(), reverse=True, key=lambda x: x[1])

# Filtering
threshold = 5
sorted_freq_cnt = [x for x in sorted_freq_cnt if x[1] > threshold]
total_words = [x[0] for x in sorted_freq_cnt]

# Prepare train/test data
train_filedata = readTextFile("files/Flickr_8k.trainImages.txt")
test_filedata = readTextFile("files/Flickr_8k.testImages.txt")

train = [row.split(".")[0] for row in train_filedata.split("\n")[:-1]]
test = [row.split(".")[0] for row in test_filedata.split("\n")[:-1]]

# Prepare descriptions for the training data
# Tweak - wrap every training caption in start/end tokens ("startseq"/"endseq")
train_description = {}
for img_id in train:
    train_description[img_id] = []
    for cap in description[img_id]:
        cap_to_append = "startseq " + cap + " endseq"
        train_description[img_id].append(cap_to_append)


# Transfer learning
# Step 1. Image feature extraction
model = ResNet50(weights='imagenet', input_shape=(224, 224, 3))  # pretrained ResNet50 as the image encoder
model.summary()

# Drop the final classification layer; use the 2048-d global-pooling output as the feature vector
new_model = Model(model.input, model.layers[-2].output)
new_model.summary()

def preprocess_img(img):
    img = load_img(img, target_size=(224, 224))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    # normalisation -> preprocess_input
    img = preprocess_input(img)
    return img

def encode_image(img):
    img = preprocess_img(img)
    feature_vector = new_model.predict(img, verbose=0)
    feature_vector = feature_vector.reshape((-1,))  # flatten to (2048,)
    return feature_vector

# Encode all train images: image_id -> feature vector extracted from ResNet
encoding_train = {}
for ix, img_id in enumerate(train):
    img_path = "files/Images/" + img_id + ".jpg"
    encoding_train[img_id] = encode_image(img_path)
    # if ix % 100 == 0:
    #     print(ix)

# Encode all test images: image_id -> feature vector extracted from ResNet
encoding_test = {}
for ix, img_id in enumerate(test):
    img_path = "files/Images/" + img_id + ".jpg"
    encoding_test[img_id] = encode_image(img_path)
    # if ix % 100 == 0:
    #     print(ix)
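
# Note (sketch): each encoded image is a single 2048-d vector from ResNet50's
# penultimate (pooling) layer, e.g. encoding_train[some_train_id].shape == (2048,)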


word_to_idx = {}
idx_to_word = {}
for i, word in enumerate(total_words):
    word_to_idx[word] = i + 1  # reserve index 0 for padding
    idx_to_word[i + 1] = word
# The filtered vocab here contains 2572 words, so the start/end tokens take
# the next two indices (hard-coded to match vocab_size = 2574 below)
word_to_idx['startseq'] = 2573
word_to_idx['endseq'] = 2574
idx_to_word[2573] = 'startseq'
idx_to_word[2574] = 'endseq'

# Model training
# RNN model -> find the max length of any caption to decide the RNN input size
max_len = 0
for key in train_description.keys():
    for cap in train_description[key]:
        max_len = max(max_len, len(cap.split()))  # max length of any caption

# Data loader (generator)
def data_generator(train_description, encoding_train, word_to_idx, max_len, batch_size, vocab_size=2574):
    x1, x2, y = [], [], []
    n = 0

    while True:
        for key, desc_list in train_description.items():
            n += 1
            photo = encoding_train[key]
            for desc in desc_list:
                seq = [word_to_idx[word] for word in desc.split() if word in word_to_idx]
                for i in range(1, len(seq)):
                    xi = seq[0:i]  # partial caption
                    yi = seq[i]    # next word to predict

                    xi = pad_sequences([xi], maxlen=max_len, value=0, padding='post')[0]
                    yi = to_categorical([yi - 1], num_classes=vocab_size)[0]  # shift word ids 1..2574 to classes 0..2573
                    x1.append(photo)  # image feature, 2048
                    x2.append(xi)     # padded partial caption, max_len (35)
                    y.append(yi)      # one-hot next word, vocab_size (2574)

            if n == batch_size:
                # Keras expects an (inputs, targets) tuple from a generator
                yield ([np.array(x1), np.array(x2)], np.array(y))
                x1, x2, y = [], [], []
                n = 0
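
# A minimal sketch (not part of the original script) of how one caption
# expands into (partial sequence -> next word) pairs inside the generator;
# the word ids below are hypothetical.
_toy_seq = [2573, 10, 4, 7, 2574]  # "startseq dog runs outside endseq"
for _i in range(1, len(_toy_seq)):
    print(_toy_seq[:_i], "->", _toy_seq[_i])  # e.g. [2573] -> 10, [2573, 10] -> 4, ...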

# WORD EMBEDDINGS
# The text input must be embedded before it is passed to the RNN/LSTM layer
embedding_index = {}
with open("files/glove.6B.50d.txt", encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        word_embedding = np.array(values[1:], dtype='float')
        embedding_index[word] = word_embedding

def get_embedding_matrix(vocab_size=2574):
    emb_dim = 50
    # +1 row because word ids run 1..vocab_size and row 0 is the padding index
    matrix = np.zeros((vocab_size + 1, emb_dim))
    for word, idx in word_to_idx.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            matrix[idx] = embedding_vector
    return matrix


embedding_matrix = get_embedding_matrix()

vocab_size = 2574

input_img_features = Input(shape=(2048,))
input_img1 = Dropout(0.3)(input_img_features)
input_img2 = Dense(256, activation="relu")(input_img1)

# Captions as input => batch_size*35 -> batch_size*35*50 -> 256
input_captions = Input(shape=(max_len,))
# The Embedding layer starts with random weights; the GloVe matrix is loaded into it below
# (input_dim is vocab_size + 1 to cover padding index 0 plus word ids 1..vocab_size)
input_cap1 = Embedding(input_dim=vocab_size + 1, output_dim=50, mask_zero=True)(input_captions)
input_cap2 = Dropout(0.3)(input_cap1)
input_cap3 = LSTM(256)(input_cap2)

# Add the two branches and decode them
decoder1 = add([input_img2, input_cap3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# COMBINED MODEL
model = Model(inputs=[input_img_features, input_captions], outputs=outputs)

# Important: load the precomputed GloVe matrix into the Embedding layer and freeze it
model.layers[2].set_weights([embedding_matrix])  # layers[2] is the Embedding layer in this graph
model.layers[2].trainable = False
model.compile(loss="categorical_crossentropy", optimizer="adam")

print(model.summary())
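
# Shape flow for reference: image (2048,) -> Dense -> 256; caption (max_len,)
# -> Embedding -> (max_len, 50) -> LSTM -> 256; add -> Dense 256 -> softmax
# over the 2574-word vocabulary.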

# Training of the model
epochs = 10
batch_size = 3  # number of images per batch
steps = len(train_description) // batch_size

def train():
    for i in range(epochs):
        generator = data_generator(train_description, encoding_train, word_to_idx, max_len, batch_size)
        model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
        model.save("models/" + "9" + '.h5')  # overwrites the same checkpoint after every epoch

train()

# Prediction function (greedy decoding)
def predict_caption(photo):
    in_text = "startseq"
    for i in range(max_len):
        sequence = [word_to_idx[w] for w in in_text.split() if w in word_to_idx]
        sequence = pad_sequences([sequence], maxlen=max_len, padding='post')
        ypred = model.predict([photo, sequence], verbose=0)
        ypred = ypred.argmax()  # word with max probability -> greedy sampling
        word = idx_to_word[ypred + 1]  # shift class 0..2573 back to word id 1..2574
        in_text += (' ' + word)
        if word == 'endseq':
            break
    final_caption = in_text.split()[1:-1]  # strip the startseq/endseq tokens
    final_caption = ' '.join(final_caption)
    return final_caption


# Pick some random test images and caption them
all_img_names = list(encoding_test.keys())
for i in range(15):
    no = np.random.randint(0, len(all_img_names))
    img_name = all_img_names[no]
    photo_2048 = encoding_test[img_name].reshape((1, 2048))

    caption = predict_caption(photo_2048)

    img = plt.imread("files/Images/" + img_name + ".jpg")
    print(caption)
    plt.imshow(img)
    plt.axis("off")
    plt.show()

Image-Captioning/models/9.h5

18.5 MB
Binary file not shown.

Image-Captioning/readme.md

Lines changed: 22 additions & 0 deletions
# Image Captioning Model

* Uses a pretrained ResNet50 model and GloVe embeddings to caption any image

<pre>
Model ARCHITECTURE

img feature --------> MODEL --> next word in sequence --+
partial sequence ---> MODEL                             |
        ^_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _+

Partial caption ----> RNN -------\
                                  \  feed-forward network      predicted word, next
                                  /  ending with softmax  ---> in the sequence of
Image vector --------------------/                             the partial caption
</pre>

## <a href="https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8">Link</a> for this model

## Installation
pip install tensorflow
pip install keras
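
## Data (assumed layout)
The script expects the Flickr8k captions and image lists under `files/`
(`captions.txt`, `Flickr_8k.trainImages.txt`, `Flickr_8k.testImages.txt`, and the
`Images/` folder), plus the GloVe vectors `glove.6B.50d.txt`, available from
https://nlp.stanford.edu/projects/glove/.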

Keylogger/keylogger.py

Lines changed: 0 additions & 39 deletions
This file was deleted.

0 commit comments