# Import necessary libraries
import os # Operating system-specific functionality
import pickle # Serialization and deserialization of Python objects
import numpy as np # Numerical operations
import matplotlib.pyplot as plt # Plotting library
from PIL import Image # Python Imaging Library for image processing
from math import ceil # Ceiling function for calculating steps_per_epoch
from collections import defaultdict # Default dictionary for convenient data structure
# Import the TensorFlow library
import tensorflow as tf # High-level machine learning library
# Import the VGG16 model and related modules from Keras
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input # VGG16 model and preprocessing functions
from tensorflow.keras.preprocessing.image import load_img, img_to_array # Image loading and conversion utilities
# Import the Tokenizer class from Keras for text tokenization
from tensorflow.keras.preprocessing.text import Tokenizer # Tokenizer for text data
# Import the pad_sequences function from Keras for sequence padding
from tensorflow.keras.preprocessing.sequence import pad_sequences # Function for padding sequences to a fixed length
# Import the Model and load_model classes from Keras for model building and loading
from tensorflow.keras.models import Model, load_model # Model-related functionalities
# Import utilities from Keras for one-hot encoding and model visualization
from tensorflow.keras.utils import to_categorical, plot_model # One-hot encoding utility and plot_model (used below for architecture diagrams)
# Import the specific layers from Keras used to build the captioning model
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional, RepeatVector, Dot, Activation, Lambda, concatenate # Layers for the encoder, attention, and decoder
# Import the corpus_bleu function from NLTK for calculating BLEU scores
from nltk.translate.bleu_score import corpus_bleu # BLEU score calculation using NLTK
The Flickr8k Dataset is a widely used collection in the field of image captioning. It consists of 8,000 images gathered from the photo-sharing platform Flickr. Each image is accompanied by five descriptive captions, providing rich and diverse textual descriptions for various scenes and subjects.
The dataset covers a broad range of subjects, including people, animals, objects, and scenic views. Each image is a snapshot of a moment, and the five captions associated with it capture different aspects and interpretations.
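The captions ship as a single captions.txt file in which each line pairs an image filename with one caption, separated by the first comma (the header line is skipped when parsing). A couple of illustrative lines, assuming the standard Flickr8k layout that the parsing code further below expects:
image,caption
1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg,A girl going into a wooden building .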
# Specify the input directory path
INPUT_DIR = './dataset/' # Directory where the input data is stored
# Specify the output directory path
OUTPUT_DIR = './models/' # Directory where the output models will be saved
# Utilizing a pre-trained VGG16 model for feature extraction
# Loading the VGG16 model with pre-trained weights
# Specify the parameters
weights = 'imagenet' # Use pre-trained weights from ImageNet
include_top = True # Include the fully connected layers at the top of the network
input_shape = (224, 224, 3) # Input shape of the images
# Create the VGG16 model with the specified parameters
model = VGG16(weights=weights, include_top=include_top, input_shape=input_shape)
# Modifying the model structure to exclude the final classification layer, enabling access to the model's output features
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# Displaying a summary of the modified model
model.summary()
Model: "model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_1 (InputLayer) [(None, 224, 224, 3)] 0 block1_conv1 (Conv2D) (None, 224, 224, 64) 1792 block1_conv2 (Conv2D) (None, 224, 224, 64) 36928 block1_pool (MaxPooling2D) (None, 112, 112, 64) 0 block2_conv1 (Conv2D) (None, 112, 112, 128) 73856 block2_conv2 (Conv2D) (None, 112, 112, 128) 147584 block2_pool (MaxPooling2D) (None, 56, 56, 128) 0 block3_conv1 (Conv2D) (None, 56, 56, 256) 295168 block3_conv2 (Conv2D) (None, 56, 56, 256) 590080 block3_conv3 (Conv2D) (None, 56, 56, 256) 590080 block3_pool (MaxPooling2D) (None, 28, 28, 256) 0 block4_conv1 (Conv2D) (None, 28, 28, 512) 1180160 block4_conv2 (Conv2D) (None, 28, 28, 512) 2359808 block4_conv3 (Conv2D) (None, 28, 28, 512) 2359808 block4_pool (MaxPooling2D) (None, 14, 14, 512) 0 block5_conv1 (Conv2D) (None, 14, 14, 512) 2359808 block5_conv2 (Conv2D) (None, 14, 14, 512) 2359808 block5_conv3 (Conv2D) (None, 14, 14, 512) 2359808 block5_pool (MaxPooling2D) (None, 7, 7, 512) 0 flatten (Flatten) (None, 25088) 0 fc1 (Dense) (None, 4096) 102764544 fc2 (Dense) (None, 4096) 16781312 ================================================================= Total params: 134,260,544 Trainable params: 134,260,544 Non-trainable params: 0 _________________________________________________________________ None
# Visualizing the architecture of the specified model
# Plotting the model with shapes included for a clearer understanding of layer dimensions
plot_model(model, show_shapes=True)
As I delve into the depths of this VGG16 model, it's fascinating to see the progression of features through convolutional layers and the richness of parameters contributing to its understanding of diverse visual patterns.
VGG16, short for the Visual Geometry Group's 16-layer model, stands out for its simplicity and effectiveness in image classification tasks. Comprising 16 weight layers, it has a straightforward structure with small, 3x3 convolutional filters stacked on top of each other.
Here's a breakdown of its key components:
Even with the final 1,000-way classification layer removed, the feature extractor used here contains 134,260,544 parameters. Every parameter contributes to the model's ability to recognize and differentiate between visual patterns.
I appreciate the elegance of VGG16's architecture. The repeated pattern of small convolutional filters facilitates feature extraction, making it easy to comprehend and implement.
While VGG16 may not be the most lightweight model, its performance on image classification tasks, especially when pre-trained on large datasets like ImageNet, is remarkable. The learned features can be valuable for diverse computer vision applications.
The versatility of VGG16 extends beyond classification. Its hierarchical feature extraction makes it suitable for tasks like feature extraction and transfer learning, aligning well with the goals of my current project.
As I navigate through the intricacies of deep learning, VGG16 serves as a reliable companion, offering both simplicity in design and strength in performance. Its contribution to my image captioning endeavor lies in its ability to comprehend the intricate details of images, laying the foundation for crafting rich and meaningful captions.
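Since every layer of the truncated network remains addressable by name, the same pipeline could just as easily expose an earlier layer if spatial feature maps were preferred over the flat 4,096-dimensional fc2 vector. A minimal sketch of that alternative (an assumption on my part, not something used later in this project):
# Optional sketch: point the extractor at block5_pool for spatial feature maps instead of fc2
spatial_extractor = Model(inputs=model.inputs, outputs=model.get_layer('block5_pool').output)
print(spatial_extractor.output_shape) # Expected: (None, 7, 7, 512)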
# Create an empty dictionary to hold the extracted features of images
image_features = {}
# Specify the directory path where the images are stored
img_dir = os.path.join(INPUT_DIR, 'Images')
# Iterate through each image in the directory
for img_name in os.listdir(img_dir):
# Load the image from the file
img_path = os.path.join(img_dir, img_name)
loaded_image = load_img(img_path, target_size=(224, 224))
# Convert the image pixels to a NumPy array
image_array = img_to_array(loaded_image)
# Reshape the data to match the model's input requirements
reshaped_image = image_array.reshape((1, image_array.shape[0], image_array.shape[1], image_array.shape[2]))
# Perform preprocessing specific to the VGG16 model
preprocessed_image = preprocess_input(reshaped_image)
# Extract features using the pre-trained VGG16 model
extracted_feature = model.predict(preprocessed_image, verbose=0)
# Obtain the image ID by removing the file extension
image_id = img_name.split('.')[0]
# Store the extracted feature in the dictionary with the image ID as the key
image_features[image_id] = extracted_feature
# Serialize and save the extracted image features using the pickle module
# Combine the output directory path with the desired file name
output_file_path = os.path.join(OUTPUT_DIR, 'new_img_features.pkl')
# Open the file for writing in binary mode
with open(output_file_path, 'wb') as file:
# Use pickle to store the image features in the specified file
pickle.dump(image_features, file)
# Specify the file path for the pickled image features
pickle_file_path = os.path.join(OUTPUT_DIR, 'new_img_features.pkl')
# Open the file for reading in binary mode
with open(pickle_file_path, 'rb') as file:
# Load the stored image features from the specified file using pickle
loaded_features = pickle.load(file)
# Open and read the contents of the 'captions.txt' file located in the input directory
with open(os.path.join(INPUT_DIR, 'captions.txt'), 'r') as file:
# Skip the header line and read the remaining content
next(file)
captions_doc = file.read()
# Create a defaultdict to store lists of captions associated with each image
image_to_captions_mapping = defaultdict(list)
# Iterate through each line in the captions document
for line in captions_doc.split('\n'):
# Split the line into tokens using a comma as a delimiter
tokens = line.split(',')
# Check if the number of tokens is less than 2, and continue to the next iteration if true
if len(tokens) < 2:
continue
# Extract image ID and captions from the tokens
image_id, *captions = tokens
# Remove the file extension from the image ID
image_id = image_id.split('.')[0]
# Join the remaining tokens to form the complete caption
caption = " ".join(captions)
# Append the caption to the list associated with the image ID in the mapping
image_to_captions_mapping[image_id].append(caption)
# Calculate and print the total number of captions
total_captions = sum(len(captions) for captions in image_to_captions_mapping.values())
print("Total Number of Captions:", total_captions)
Total Number of Captions: 40455
def clean(mapping):
"""
Clean and preprocess captions in the given mapping.
Args:
- mapping (dict): A dictionary mapping image IDs to lists of captions.
Returns:
- None: The function modifies the captions in-place.
"""
for key, captions in mapping.items():
for i in range(len(captions)):
caption = captions[i]
# Convert to lowercase
caption = caption.lower()
# Remove non-alphabetic characters except spaces
caption = ''.join(char for char in caption if char.isalpha() or char.isspace())
# Collapse multiple whitespace characters into a single space (str.replace does not interpret regex patterns like '\s+')
caption = ' '.join(caption.split())
# Add start and end tokens to the caption
caption = 'startseq ' + ' '.join([word for word in caption.split() if len(word) > 1]) + ' endseq'
# Update the caption in the list
captions[i] = caption
# Before Text Preprocessing
image_to_captions_mapping['3637013_c675de7705']
["A couple stands close at the water 's edge .", 'The two people stand by a body of water and in front of bushes in fall .', 'Two people hold each other near a pond .', 'Two people stand by the water .', 'Two people stand together on the edge of the water on the grass .']
# Text Preprocessing Step
clean(image_to_captions_mapping)
image_to_captions_mapping['3637013_c675de7705']
['startseq couple stands close at the water edge endseq', 'startseq the two people stand by body of water and in front of bushes in fall endseq', 'startseq two people hold each other near pond endseq', 'startseq two people stand by the water endseq', 'startseq two people stand together on the edge of the water on the grass endseq']
all_captions = [caption for captions in image_to_captions_mapping.values() for caption in captions]
all_captions[:10]
['startseq child in pink dress is climbing up set of stairs in an entry way endseq', 'startseq girl going into wooden building endseq', 'startseq little girl climbing into wooden playhouse endseq', 'startseq little girl climbing the stairs to her playhouse endseq', 'startseq little girl in pink dress going into wooden cabin endseq', 'startseq black dog and spotted dog are fighting endseq', 'startseq black dog and tricolored dog playing with each other on the road endseq', 'startseq black dog and white dog with brown spots are staring at each other in the street endseq', 'startseq two dogs of different breeds looking at each other on the road endseq', 'startseq two dogs on pavement moving toward each other endseq']
# Create a Tokenizer object
tokenizer = Tokenizer()
# Fit the Tokenizer on the list of all captions to build the vocabulary
tokenizer.fit_on_texts(all_captions)
# Serialize and save the Tokenizer object using pickle
with open('tokenizer.pkl', 'wb') as tokenizer_file:
pickle.dump(tokenizer, tokenizer_file)
# Deserialize and load the Tokenizer object from the saved file using pickle
with open('tokenizer.pkl', 'rb') as tokenizer_file:
tokenizer = pickle.load(tokenizer_file)
# Determine the maximum caption length among all captions using the Tokenizer
max_caption_length = max(len(tokenizer.texts_to_sequences([caption])[0]) for caption in all_captions)
# Calculate the vocabulary size based on the Tokenizer's word index
vocab_size = len(tokenizer.word_index) + 1
# Print the calculated vocabulary size and maximum caption length
print("Vocabulary Size:", vocab_size)
print("Maximum Caption Length:", max_caption_length)
Vocabulary Size: 8768
Maximum Caption Length: 34
# Create a list of image IDs from the keys of the image_to_captions_mapping dictionary
image_ids = list(image_to_captions_mapping.keys())
# Determine the split index for creating training and test sets
split = int(len(image_ids) * 0.90)
# Create the training set by selecting the first 90% of image IDs
train = image_ids[:split]
# Create the test set by selecting the remaining 10% of image IDs
test = image_ids[split:]
# Generator function for training data
def data_generator(train_keys, img_to_caps_mapping, img_features, tokenizer, max_cap_length, vocab_size, batch_size):
"""
Generates batches of training data for the image captioning model.
Args:
- train_keys (list): List of image keys for the training set.
- img_to_caps_mapping (dict): Dictionary mapping image keys to associated captions.
- img_features (dict): Dictionary containing image features extracted using a pre-trained model.
- tokenizer (Tokenizer): Tokenizer object for text tokenization.
- max_cap_length (int): Maximum length of captions after tokenization.
- vocab_size (int): Size of the vocabulary.
- batch_size (int): Size of each training batch.
Yields:
- tuple: Tuple containing input data (image features and input sequences) and target output sequences.
"""
# Lists to store batch data
X1_batch, X2_batch, y_batch = [], [], []
# Counter for the current batch size
batch_count = 0
while True:
# Loop through each image in the current batch
for img_key in train_keys:
# Get the captions associated with the current image
captions = img_to_caps_mapping[img_key]
# Loop through each caption for the current image
for caption in captions:
# Convert the caption to a sequence of token IDs
caption_seq = tokenizer.texts_to_sequences([caption])[0]
# Loop through the tokens in the caption sequence
for i in range(1, len(caption_seq)):
# Split the sequence into input and output pairs
in_seq, out_seq = caption_seq[:i], caption_seq[i]
# Pad the input sequence to the specified maximum caption length
in_seq = pad_sequences([in_seq], maxlen=max_cap_length)[0]
# Convert the output sequence to one-hot encoded format
out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
# Append data to batch lists
X1_batch.append(img_features[img_key][0]) # Image features
X2_batch.append(in_seq) # Input sequence
y_batch.append(out_seq) # Output sequence
# Increase the batch counter
batch_count += 1
# If the batch is complete, yield the batch and reset lists and counter
if batch_count == batch_size:
X1_batch, X2_batch, y_batch = np.array(X1_batch), np.array(X2_batch), np.array(y_batch)
yield [X1_batch, X2_batch], y_batch
X1_batch, X2_batch, y_batch = [], [], []
batch_count = 0
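Before wiring this generator into training, a quick sanity check confirms the batch shapes. A small sketch, assuming the features, tokenizer, and train split prepared above (the batch size of 4 here is arbitrary and for illustration only):
# Sanity-check sketch: draw one batch from the generator and inspect the array shapes
sample_generator = data_generator(train, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size=4)
(X1_sample, X2_sample), y_sample = next(sample_generator)
print(X1_sample.shape) # Expected: (4, 4096) image feature vectors
print(X2_sample.shape) # Expected: (4, 34) padded input token sequences
print(y_sample.shape) # Expected: (4, 8768) one-hot next-word targets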
# Encoder model
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
fe2_projected = RepeatVector(max_caption_length)(fe2)
fe2_projected = Bidirectional(LSTM(256, return_sequences=True))(fe2_projected)
# Sequence feature layers
inputs2 = Input(shape=(max_caption_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = Bidirectional(LSTM(256, return_sequences=True))(se2)
# Apply attention mechanism using Dot product
attention = Dot(axes=[2, 2])([fe2_projected, se3]) # Calculate attention scores
# Softmax attention scores
attention_scores = Activation('softmax')(attention)
# Apply attention scores to sequence embeddings
attention_context = Lambda(lambda x: tf.einsum('ijk,ijl->ikl', x[0], x[1]))([attention_scores, se3])
# Sum the attended sequence embeddings along the time axis
context_vector = tf.reduce_sum(attention_context, axis=1)
# Decoder model
decoder_input = concatenate([context_vector, fe2], axis=-1)
decoder1 = Dense(256, activation='relu')(decoder_input)
outputs = Dense(vocab_size, activation='softmax')(decoder1)
# Create the model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
# Visualize the model
model.summary()
Model: "model_1" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== input_3 (InputLayer) [(None, 4096)] 0 [] dropout_1 (Dropout) (None, 4096) 0 ['input_3[0][0]'] input_4 (InputLayer) [(None, 34)] 0 [] dense_1 (Dense) (None, 256) 1048832 ['dropout_1[0][0]'] embedding (Embedding) (None, 34, 256) 2244608 ['input_4[0][0]'] repeat_vector (RepeatVector) (None, 34, 256) 0 ['dense_1[0][0]'] dropout_2 (Dropout) (None, 34, 256) 0 ['embedding[0][0]'] bidirectional (Bidirectional) (None, 34, 512) 1050624 ['repeat_vector[0][0]'] bidirectional_1 (Bidirectional (None, 34, 512) 1050624 ['dropout_2[0][0]'] ) dot (Dot) (None, 34, 34) 0 ['bidirectional[0][0]', 'bidirectional_1[0][0]'] activation (Activation) (None, 34, 34) 0 ['dot[0][0]'] lambda (Lambda) (None, 34, 512) 0 ['activation[0][0]', 'bidirectional_1[0][0]'] tf.math.reduce_sum (TFOpLambda (None, 512) 0 ['lambda[0][0]'] ) concatenate (Concatenate) (None, 768) 0 ['tf.math.reduce_sum[0][0]', 'dense_1[0][0]'] dense_2 (Dense) (None, 256) 196864 ['concatenate[0][0]'] dense_3 (Dense) (None, 8768) 2253376 ['dense_2[0][0]'] ================================================================================================== Total params: 7,844,928 Trainable params: 7,844,928 Non-trainable params: 0 __________________________________________________________________________________________________
# Visualize the model architecture with shapes
plot_model(model, show_shapes=True)
In the development of my custom image captioning model (named "model_1"), I have carefully crafted a neural network that combines both convolutional and recurrent layers. This model is designed to generate descriptive captions for images.
Input Layers (input_3, input_4): input_3 takes the 4,096-dimensional VGG16 feature vector of an image; input_4 takes the caption generated so far as a padded sequence of 34 token IDs.
Dropout Layer (dropout_1): applies 50% dropout to the image features to reduce overfitting.
Dense Layer (dense_1): projects the image features down to a 256-dimensional embedding with ReLU activation.
Embedding Layer (embedding): maps each of the 8,768 vocabulary tokens to a 256-dimensional vector, with zero-padding masked out.
RepeatVector Layer (repeat_vector): repeats the 256-dimensional image embedding 34 times so it can be aligned with every caption position.
Dropout Layer (dropout_2): applies 50% dropout to the word embeddings.
Bidirectional LSTM Layers (bidirectional, bidirectional_1): encode the repeated image embedding and the word embeddings, respectively, into two (34, 512) sequences.
Dot Layer (dot): computes pairwise similarity between the two sequences, producing a (34, 34) matrix of raw attention scores.
Activation Layer (activation): applies a softmax to turn the raw scores into attention weights.
Lambda Layer (lambda): uses an einsum to apply the attention weights to the caption-side sequence.
Reduce Sum Layer (tf.math.reduce_sum): sums the attended sequence over the time axis to obtain a single 512-dimensional context vector.
Concatenate Layer (concatenate): joins the context vector with the 256-dimensional image embedding into a 768-dimensional decoder input.
Dense Layers (dense_2, dense_3): a 256-unit ReLU layer followed by an 8,768-unit softmax layer that predicts the next word.
This architecture is carefully designed to combine image features and sequential information for accurate and context-aware caption generation. The total parameters reflect the model's capacity to learn and generalize from diverse image and caption data during training.
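To make the attention step concrete, here is a small standalone sketch using dummy tensors with the shapes reported in the summary above; it mirrors the Dot, softmax, and einsum chain and shows how two (batch, 34, 512) sequences collapse into a single (batch, 512) context vector (illustration only, not part of the training code):
# Standalone shape walk-through of the attention block (dummy data)
dummy_image_seq = tf.random.normal((2, 34, 512)) # Stand-in for fe2_projected (BiLSTM over the repeated image embedding)
dummy_text_seq = tf.random.normal((2, 34, 512)) # Stand-in for se3 (BiLSTM over the caption embeddings)
scores = tf.nn.softmax(tf.matmul(dummy_image_seq, dummy_text_seq, transpose_b=True)) # (2, 34, 34) attention weights
attended = tf.einsum('ijk,ijl->ikl', scores, dummy_text_seq) # (2, 34, 512) attention applied to the text sequence
context = tf.reduce_sum(attended, axis=1) # (2, 512) context vector after summing over the time axis
print(context.shape) # Expected: (2, 512)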
# Set the number of training epochs and batch size
epochs = 50
batch_size = 32
# Approximate the number of batches drawn from the generator per training epoch and per validation pass
steps_per_epoch = ceil(len(train) / batch_size)
validation_steps = ceil(len(test) / batch_size) # Steps for the validation generator
# Loop through the epochs for training
for epoch in range(epochs):
print(f"Epoch {epoch + 1}/{epochs}")
# Set up data generators for training and validation
train_generator = data_generator(train, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size)
validation_generator = data_generator(test, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size)
# Train the model for one epoch using the data generators
model.fit(train_generator, epochs=1, steps_per_epoch=steps_per_epoch,
validation_data=validation_generator, validation_steps=validation_steps,
verbose=1)
Epoch  1/50 - 274s 1s/step - loss: 6.4268 - val_loss: 6.4850
Epoch  2/50 - 253s 1s/step - loss: 5.2290 - val_loss: 6.4891
Epoch  3/50 - 226s 989ms/step - loss: 4.8122 - val_loss: 6.2731
Epoch  4/50 - 246s 1s/step - loss: 4.4898 - val_loss: 6.6783
Epoch  5/50 - 237s 1s/step - loss: 4.2175 - val_loss: 6.7722
Epoch  6/50 - 232s 1s/step - loss: 3.9323 - val_loss: 7.4398
Epoch  7/50 - 275s 1s/step - loss: 3.5772 - val_loss: 7.3053
Epoch  8/50 - 249s 1s/step - loss: 3.2714 - val_loss: 6.9621
Epoch  9/50 - 296s 1s/step - loss: 3.0953 - val_loss: 7.4834
Epoch 10/50 - 240s 1s/step - loss: 2.8158 - val_loss: 6.7651
Epoch 11/50 - 221s 968ms/step - loss: 2.6980 - val_loss: 7.0226
Epoch 12/50 - 158s 691ms/step - loss: 2.5395 - val_loss: 7.1167
Epoch 13/50 - 152s 668ms/step - loss: 2.4294 - val_loss: 6.8429
Epoch 14/50 - 154s 676ms/step - loss: 2.2601 - val_loss: 6.8003
Epoch 15/50 - 151s 664ms/step - loss: 2.1455 - val_loss: 7.3354
Epoch 16/50 - 154s 677ms/step - loss: 1.9821 - val_loss: 7.0470
Epoch 17/50 - 150s 659ms/step - loss: 1.8604 - val_loss: 7.3459
Epoch 18/50 - 150s 658ms/step - loss: 1.7827 - val_loss: 7.6326
Epoch 19/50 - 149s 652ms/step - loss: 1.7168 - val_loss: 7.7972
Epoch 20/50 - 149s 652ms/step - loss: 1.6248 - val_loss: 7.8400
Epoch 21/50 - 152s 668ms/step - loss: 1.5278 - val_loss: 8.2000
Epoch 22/50 - 153s 672ms/step - loss: 1.4666 - val_loss: 8.2232
Epoch 23/50 - 153s 672ms/step - loss: 1.3989 - val_loss: 7.7666
Epoch 24/50 - 153s 673ms/step - loss: 1.3431 - val_loss: 8.5297
Epoch 25/50 - 155s 678ms/step - loss: 1.2546 - val_loss: 9.2400
Epoch 26/50 - 159s 695ms/step - loss: 1.2188 - val_loss: 9.3097
Epoch 27/50 - 155s 679ms/step - loss: 1.1183 - val_loss: 9.1880
Epoch 28/50 - 156s 682ms/step - loss: 1.1076 - val_loss: 10.1482
Epoch 29/50 - 155s 678ms/step - loss: 1.0801 - val_loss: 9.9377
Epoch 30/50 - 152s 669ms/step - loss: 1.0158 - val_loss: 10.2713
Epoch 31/50 - 153s 671ms/step - loss: 0.9524 - val_loss: 11.0540
Epoch 32/50 - 154s 677ms/step - loss: 0.8777 - val_loss: 11.4524
Epoch 33/50 - 165s 725ms/step - loss: 0.8454 - val_loss: 10.8839
Epoch 34/50 - 172s 756ms/step - loss: 0.8011 - val_loss: 10.9346
Epoch 35/50 - 172s 755ms/step - loss: 0.7754 - val_loss: 11.8426
Epoch 36/50 - 168s 735ms/step - loss: 0.7140 - val_loss: 12.4560
Epoch 37/50 - 171s 751ms/step - loss: 0.6856 - val_loss: 11.8752
Epoch 38/50 - 172s 756ms/step - loss: 0.6648 - val_loss: 12.7536
Epoch 39/50 - 171s 748ms/step - loss: 0.6507 - val_loss: 14.1370
Epoch 40/50 - 178s 780ms/step - loss: 0.6400 - val_loss: 12.7257
Epoch 41/50 - 177s 777ms/step - loss: 0.6346 - val_loss: 12.7481
Epoch 42/50 - 176s 774ms/step - loss: 0.5966 - val_loss: 13.0265
Epoch 43/50 - 178s 780ms/step - loss: 0.5656 - val_loss: 13.6092
Epoch 44/50 - 218s 955ms/step - loss: 0.5219 - val_loss: 13.2933
Epoch 45/50 - 209s 917ms/step - loss: 0.5004 - val_loss: 14.4468
Epoch 46/50 - 170s 745ms/step - loss: 0.4939 - val_loss: 13.7727
Epoch 47/50 - 171s 749ms/step - loss: 0.5200 - val_loss: 14.9553
Epoch 48/50 - 180s 788ms/step - loss: 0.5434 - val_loss: 13.8833
Epoch 49/50 - 179s 786ms/step - loss: 0.5359 - val_loss: 15.6298
Epoch 50/50 - 187s 819ms/step - loss: 0.5167 - val_loss: 14.1596
# Save the trained model to a file
model.save(os.path.join(OUTPUT_DIR, 'mymodel.h5'))
# Load the trained model from the saved file
model = load_model(os.path.join(OUTPUT_DIR, 'mymodel.h5'))
def get_word_from_index(index, tokenizer):
"""
Retrieve the word corresponding to a given index from the tokenizer's vocabulary.
Args:
- index (int): Index of the word.
- tokenizer (Tokenizer): Tokenizer object.
Returns:
- str or None: The word corresponding to the index, or None if not found.
"""
return next((word for word, idx in tokenizer.word_index.items() if idx == index), None)
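As a side note, Keras's Tokenizer also maintains an index_word dictionary, so the linear scan above could be replaced with a constant-time lookup; a minimal alternative sketch:
def get_word_from_index_fast(index, tokenizer):
    # O(1) reverse lookup using the index_word dictionary built by fit_on_texts
    return tokenizer.index_word.get(index)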
def predict_caption(model, image_features, tokenizer, max_caption_length):
"""
Generate a descriptive caption for a given image using the provided image captioning model.
Args:
- model (Model): Trained image captioning model.
- image_features (numpy.ndarray): Extracted features of the input image.
- tokenizer (Tokenizer): Tokenizer object used for text tokenization.
- max_caption_length (int): Maximum length of the generated caption.
Returns:
- str: Predicted caption for the input image.
"""
# Initialize the caption sequence
caption = 'startseq'
# Generate the caption
for _ in range(max_caption_length):
# Convert the current caption to a sequence of token indices
sequence = tokenizer.texts_to_sequences([caption])[0]
# Pad the sequence to match the maximum caption length
sequence = pad_sequences([sequence], maxlen=max_caption_length)
# Predict the probability distribution for the next word
y_hat = model.predict([image_features, sequence], verbose=0)
# Get the index with the highest predicted probability
predicted_index = np.argmax(y_hat)
# Convert the index to a word
predicted_word = get_word_from_index(predicted_index, tokenizer)
# Append the predicted word to the caption
caption += " " + predicted_word
# Stop if the predicted word is None or if the end sequence tag is encountered
if predicted_word is None or predicted_word == 'endseq':
break
return caption
# Initialize lists to store actual and predicted captions
actual_captions_list = []
predicted_captions_list = []
# Loop through the test data
for key in test:
# Retrieve actual captions for the current image
actual_captions = image_to_captions_mapping[key]
# Generate a caption for the image using the trained model
predicted_caption = predict_caption(model, loaded_features[key], tokenizer, max_caption_length)
# Split actual captions into individual words
actual_captions_words = [caption.split() for caption in actual_captions]
# Split the predicted caption into words
predicted_caption_words = predicted_caption.split()
# Append the word lists to their respective lists
actual_captions_list.append(actual_captions_words)
predicted_captions_list.append(predicted_caption_words)
# Calculate BLEU scores for unigram and bigram precision
bleu_score_1 = corpus_bleu(actual_captions_list, predicted_captions_list, weights=(1.0, 0, 0, 0))
bleu_score_2 = corpus_bleu(actual_captions_list, predicted_captions_list, weights=(0.5, 0.5, 0, 0))
print("BLEU-1 Score: %f" % bleu_score_1)
print("BLEU-2 Score: %f" % bleu_score_2)
BLEU-1 Score: 0.547920
BLEU-2 Score: 0.244377
# Save the lists of actual captions using pickle
with open('actual_captions_list.pkl', 'wb') as actual_file:
pickle.dump(actual_captions_list, actual_file)
# Save the lists of predicted captions using pickle
with open('predicted_captions_list.pkl', 'wb') as predicted_file:
pickle.dump(predicted_captions_list, predicted_file)
# Load actual captions list
with open('actual_captions_list.pkl', 'rb') as file:
loaded_actual_captions_list = pickle.load(file)
# Load predicted captions list
with open('predicted_captions_list.pkl', 'rb') as file:
loaded_predicted_captions_list = pickle.load(file)
def generate_caption(image_name):
"""
Display actual and predicted captions for a given image along with its visual representation.
Args:
- image_name (str): Name of the image file.
Returns:
- None
"""
# Extract image ID from the image name
image_id = image_name.split('.')[0]
# Get the full path of the image file
img_path = os.path.join(INPUT_DIR, "Images", image_name)
# Open the image using the PIL library
image = Image.open(img_path)
# Retrieve actual captions for the image
captions = image_to_captions_mapping[image_id]
print('---------------------Actual---------------------')
# Display actual captions
for caption in captions:
print(caption)
# Generate and display the predicted caption
y_pred = predict_caption(model, loaded_features[image_id], tokenizer, max_caption_length)
print('--------------------Predicted--------------------')
print(y_pred)
# Display the image
plt.imshow(image)
generate_caption("101669240_b2d3e7f17b.jpg")
---------------------Actual---------------------
startseq man in hat is displaying pictures next to skier in blue hat endseq
startseq man skis past another man displaying paintings in the snow endseq
startseq person wearing skis looking at framed pictures set up in the snow endseq
startseq skier looks at framed pictures in the snow next to trees endseq
startseq man on skis looking at artwork for sale in the snow endseq
--------------------Predicted--------------------
startseq man on skis looking at artwork for sale in the snow endseq
generate_caption("1077546505_a4f6c4daa9.jpg")
---------------------Actual---------------------
startseq boy in blue shorts slides down slide into pool endseq
startseq boy in blue swimming trunks slides down yellow slide into wading pool with inflatable toys floating in the water endseq
startseq boy rides down slide into small backyard pool endseq
startseq boy sliding down slide into pool with colorful tubes endseq
startseq child is falling off slide onto colored balloons floating on pool of water endseq
--------------------Predicted--------------------
startseq boy rides down slide into small backyard pool endseq
generate_caption("106490881_5a2dd9b7bd.jpg")
---------------------Actual---------------------
startseq boy in his blue swim shorts at the beach endseq
startseq boy smiles for the camera at beach endseq
startseq young boy in swimming trunks is walking with his arms outstretched on the beach endseq
startseq children playing on the beach endseq
startseq the boy is playing on the shore of an ocean endseq
--------------------Predicted--------------------
startseq boy in his blue swim shorts at the beach endseq
I've also created a user-friendly version of the Image Caption Generator using Streamlit, which lets users generate a caption for an image with a single click.
To explore the Streamlit version, click the button below: