Deep Learning: Image Captioning using VGG16 and Customized LSTM Neural Network Model¶

Importing Necessary Libraries¶

In [1]:
# Import necessary libraries
import os  # Operating system-specific functionality
import re  # Regular expressions for caption text cleaning
import pickle  # Serialization and deserialization of Python objects
import numpy as np  # Numerical operations
import matplotlib.pyplot as plt  # Plotting library
from PIL import Image  # Python Imaging Library for image processing
from math import ceil  # Ceiling function for calculating steps_per_epoch
from collections import defaultdict  # Default dictionary for convenient data structure
In [2]:
# Import the TensorFlow library
import tensorflow as tf  # High-level machine learning library

# Import the VGG16 model and related modules from Keras
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input  # VGG16 model and preprocessing functions
from tensorflow.keras.preprocessing.image import load_img, img_to_array  # Image loading and conversion utilities
In [3]:
# Import the Tokenizer class from Keras for text tokenization
from tensorflow.keras.preprocessing.text import Tokenizer  # Tokenizer for text data

# Import the pad_sequences function from Keras for sequence padding
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Function for padding sequences to a fixed length
In [4]:
# Import the Model and load_model classes from Keras for model building and loading
from tensorflow.keras.models import Model, load_model  # Model-related functionalities

# Import the to_categorical function from Keras for one-hot encoding
from tensorflow.keras.utils import to_categorical, plot_model  # One-hot encoding and model visualization utilities

# Import the specific Keras layers used by the captioning model
from tensorflow.keras.layers import (Input, Dense, Dropout, Embedding, LSTM, Bidirectional,
                                     RepeatVector, Dot, Activation, Lambda, concatenate)
In [5]:
# Import the corpus_bleu function from NLTK for calculating BLEU scores
from nltk.translate.bleu_score import corpus_bleu  # BLEU score calculation using NLTK

Flickr8k Dataset Overview¶

The Flickr8k Dataset is a widely used collection in the field of image captioning. It consists of roughly 8,000 images (8,091 in the commonly used release) gathered from the photo-sharing platform Flickr. Each image is paired with five descriptive captions, 40,455 in total, providing rich and diverse textual descriptions of various scenes and subjects.

The dataset covers a broad range of subjects, including people, animals, objects, and scenic views. Each image is a snapshot of a moment, and the five captions associated with it capture different aspects and interpretations.
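
For context, the captions file parsed later in this notebook (captions.txt in the Kaggle distribution of Flickr8k) is a simple comma-separated text file with a header row. The snippet below is only illustrative of its layout; the exact header text and captions come from the downloaded file itself:

image,caption
1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg,A girl going into a wooden building .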

Setting the Input and Output Directory Path¶

In [10]:
# Specify the input directory path
INPUT_DIR = './dataset/'  # Directory where the input data is stored

# Specify the output directory path
OUTPUT_DIR = './models/'  # Directory where the output models will be saved

Transfer Learning using Pre-trained VGG16 Model¶

In [6]:
# Utilizing a pre-trained VGG16 model for feature extraction
# Loading the VGG16 model with pre-trained weights
# Specify the parameters
weights = 'imagenet'  # Use pre-trained weights from ImageNet
include_top = True     # Include the fully connected layers at the top of the network
input_shape = (224, 224, 3)  # Input shape of the images

# Create the VGG16 model with the specified parameters
model = VGG16(weights=weights, include_top=include_top, input_shape=input_shape)

# Modifying the model structure to exclude the final classification layer, enabling access to the model's output features
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# Displaying a summary of the modified model
model.summary()
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 56, 56, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 28, 28, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 28, 28, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 28, 28, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 28, 28, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 14, 14, 512)       0         
                                                                 
 block5_conv1 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 7, 7, 512)         0         
                                                                 
 flatten (Flatten)           (None, 25088)             0         
                                                                 
 fc1 (Dense)                 (None, 4096)              102764544 
                                                                 
 fc2 (Dense)                 (None, 4096)              16781312  
                                                                 
=================================================================
Total params: 134,260,544
Trainable params: 134,260,544
Non-trainable params: 0
_________________________________________________________________
In [7]:
# Visualizing the architecture of the specified model
# Plotting the model with shapes included for a clearer understanding of layer dimensions
plot_model(model, show_shapes=True)
Out[7]:
(VGG16 architecture diagram rendered by plot_model)

Model Architecture¶

  • The model takes images of size (224, 224, 3) as input.
  • The convolutional layers (block1_conv1, block1_conv2, ..., block5_conv3) extract hierarchical features from the input images.
  • MaxPooling layers (block1_pool, block2_pool, ..., block5_pool) reduce spatial dimensions while retaining essential features.
  • The Flatten layer transforms the 3D output to a 1D vector for the fully connected layers.
  • The fully connected layers (fc1, fc2) contribute to the classification process with a large number of parameters.
  • The model has a total of 134,260,544 parameters, all of which are trainable.

As I delve into the depths of this VGG16 model, it's fascinating to see the progression of features through convolutional layers and the richness of parameters contributing to its understanding of diverse visual patterns.

The VGG16 Model¶

VGG16, short for the Visual Geometry Group's 16-layer model, stands out for its simplicity and effectiveness in image classification tasks. Comprising 16 weight layers, it has a straightforward structure with small, 3x3 convolutional filters stacked on top of each other.

Here's a breakdown of its key components:

Convolutional Blocks¶

  • Block 1 to Block 5: These sequential blocks consist of convolutional layers (blockX_convY) and max-pooling layers (blockX_pool). Each block gradually abstracts higher-level features from the input image.

Fully Connected Layers¶

  • Flatten Layer: Flattens the output from convolutional layers into a one-dimensional vector.
  • Fully Connected Layers (fc1, fc2): Dense layers that contribute to the final classification. These layers contain a large number of parameters, enabling the model to learn intricate patterns.

Total Parameters¶

Even after removing the final 1000-class softmax layer, the feature extractor used here carries 134,260,544 parameters (the full ImageNet classifier has roughly 138 million). This capacity underpins the model's ability to recognize and differentiate a wide range of visual patterns.
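
To see where these figures come from, here is a small, purely illustrative sanity check of the per-layer parameter formulas (the expected values match the model summary above):

# Illustrative parameter-count check (values correspond to the summary above)
# Conv2D: kernel_h * kernel_w * in_channels * filters + filters (biases)
print(3 * 3 * 3 * 64 + 64)        # block1_conv1 -> 1,792
print(3 * 3 * 512 * 512 + 512)    # block5_conv3 -> 2,359,808
# Dense: inputs * units + units (biases)
print(25088 * 4096 + 4096)        # fc1 -> 102,764,544
print(4096 * 4096 + 4096)         # fc2 -> 16,781,312
# The dropped ImageNet classifier layer would add 4096 * 1000 + 1000 = 4,097,000 parameters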

Why VGG16?¶

Simplicity and Elegance¶

I appreciate the elegance of VGG16's architecture. The repeated pattern of small convolutional filters facilitates feature extraction, making it easy to comprehend and implement.

Strong Performance¶

While VGG16 may not be the most lightweight model, its performance on image classification tasks, especially when pre-trained on large datasets like ImageNet, is remarkable. The learned features can be valuable for diverse computer vision applications.

Versatility¶

The versatility of VGG16 extends beyond classification. Its hierarchical feature extraction makes it suitable for tasks like feature extraction and transfer learning, aligning well with the goals of my current project.

As I navigate the intricacies of deep learning, VGG16 serves as a reliable companion, offering both simplicity in design and strength in performance. Its contribution to my image captioning project lies in its ability to capture the fine-grained details of images, laying the foundation for crafting rich and meaningful captions.

Image Preprocessing and Image Feature Extraction¶

(illustrative image: fluffy-cat.jpg)

In [74]:
# Create an empty dictionary to hold the extracted features of images
image_features = {}

# Specify the directory path where the images are stored
img_dir = os.path.join(INPUT_DIR, 'Images')

# Iterate through each image in the directory
for img_name in os.listdir(img_dir):
    # Load the image from the file
    img_path = os.path.join(img_dir, img_name)
    loaded_image = load_img(img_path, target_size=(224, 224))
    
    # Convert the image pixels to a NumPy array
    image_array = img_to_array(loaded_image)
    
    # Reshape the data to match the model's input requirements
    reshaped_image = image_array.reshape((1, image_array.shape[0], image_array.shape[1], image_array.shape[2]))
    
    # Perform preprocessing specific to the VGG16 model
    preprocessed_image = preprocess_input(reshaped_image)
    
    # Extract features using the pre-trained VGG16 model
    extracted_feature = model.predict(preprocessed_image, verbose=0)
    
    # Obtain the image ID by removing the file extension
    image_id = img_name.split('.')[0]
    
    # Store the extracted feature in the dictionary with the image ID as the key
    image_features[image_id] = extracted_feature
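
As a quick sanity check (a sketch assuming the loop above has completed), each stored feature is the fc2 activation with shape (1, 4096):

# Verify the number of processed images and the shape of one extracted feature vector
print(len(image_features))                        # one entry per image in the Images folder
print(next(iter(image_features.values())).shape)  # (1, 4096)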

Saving the Image Features Using Pickle¶

In [75]:
# Serialize and save the extracted image features using the pickle module
# Combine the output directory path with the desired file name
output_file_path = os.path.join(OUTPUT_DIR, 'new_img_features.pkl')
# Open the file for writing in binary mode
with open(output_file_path, 'wb') as file:
    # Use pickle to store the image features in the specified file
    pickle.dump(image_features, file)

Loading the Image Features Using Pickle¶

In [11]:
# Specify the file path for the pickled image features
pickle_file_path = os.path.join(OUTPUT_DIR, 'new_img_features.pkl')

# Open the file for reading in binary mode
with open(pickle_file_path, 'rb') as file:
    # Load the stored image features from the specified file using pickle
    loaded_features = pickle.load(file)

Image Caption Preprocessing¶

In [12]:
# Open and read the contents of the 'captions.txt' file located in the input directory
with open(os.path.join(INPUT_DIR, 'captions.txt'), 'r') as file:
    # Skip the header line and read the remaining content
    next(file)
    captions_doc = file.read()
In [13]:
# Create a defaultdict to store lists of captions associated with each image
image_to_captions_mapping = defaultdict(list)
In [14]:
# Iterate through each line in the captions document
for line in captions_doc.split('\n'):
    # Split the line into tokens using a comma as a delimiter
    tokens = line.split(',')
    
    # Check if the number of tokens is less than 2, and continue to the next iteration if true
    if len(tokens) < 2:
        continue
    
    # Extract image ID and captions from the tokens
    image_id, *captions = tokens
    
    # Remove the file extension from the image ID
    image_id = image_id.split('.')[0]
    
    # Join the remaining tokens to form the complete caption
    caption = " ".join(captions)
    
    # Append the caption to the list associated with the image ID in the mapping
    image_to_captions_mapping[image_id].append(caption)

# Calculate and print the total number of captions
total_captions = sum(len(captions) for captions in image_to_captions_mapping.values())
print("Total Number of Captions:", total_captions)
Total Number of Captions: 40455
In [15]:
def clean(mapping):
    """
    Clean and preprocess captions in the given mapping.

    Args:
    - mapping (dict): A dictionary mapping image IDs to lists of captions.

    Returns:
    - None: The function modifies the captions in-place.
    """
    for key, captions in mapping.items():
        for i in range(len(captions)):
            caption = captions[i]

            # Convert to lowercase
            caption = caption.lower()

            # Remove non-alphabetic characters except spaces
            caption = ''.join(char for char in caption if char.isalpha() or char.isspace())

            # Collapse runs of whitespace into a single space (str.replace does not interpret regex patterns)
            caption = re.sub(r'\s+', ' ', caption)

            # Add start and end tokens to the caption
            caption = 'startseq ' + ' '.join([word for word in caption.split() if len(word) > 1]) + ' endseq'
            
            # Update the caption in the list
            captions[i] = caption
In [16]:
# Before Text Preprocessing
image_to_captions_mapping['3637013_c675de7705']
Out[16]:
["A couple stands close at the water 's edge .",
 'The two people stand by a body of water and in front of bushes in fall .',
 'Two people hold each other near a pond .',
 'Two people stand by the water .',
 'Two people stand together on the edge of the water on the grass .']
In [17]:
# Text Preprocessing Step
clean(image_to_captions_mapping)
In [18]:
image_to_captions_mapping['3637013_c675de7705']
Out[18]:
['startseq couple stands close at the water edge endseq',
 'startseq the two people stand by body of water and in front of bushes in fall endseq',
 'startseq two people hold each other near pond endseq',
 'startseq two people stand by the water endseq',
 'startseq two people stand together on the edge of the water on the grass endseq']
In [19]:
all_captions = [caption for captions in image_to_captions_mapping.values() for caption in captions]
In [20]:
all_captions[:10]
Out[20]:
['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq',
 'startseq black dog and spotted dog are fighting endseq',
 'startseq black dog and tricolored dog playing with each other on the road endseq',
 'startseq black dog and white dog with brown spots are staring at each other in the street endseq',
 'startseq two dogs of different breeds looking at each other on the road endseq',
 'startseq two dogs on pavement moving toward each other endseq']

Tokenizing Image Caption¶

In [18]:
# Create a Tokenizer object
tokenizer = Tokenizer()

# Fit the Tokenizer on the list of all captions to build the vocabulary
tokenizer.fit_on_texts(all_captions)
In [19]:
# Serialize and save the Tokenizer object using pickle
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
In [21]:
# Deserialize and load the Tokenizer object from the saved file using pickle
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
In [22]:
# Determine the maximum caption length among all captions using the Tokenizer
max_caption_length = max(len(tokenizer.texts_to_sequences([caption])[0]) for caption in all_captions)

# Calculate the vocabulary size based on the Tokenizer's word index
vocab_size = len(tokenizer.word_index) + 1

# Print the calculated vocabulary size and maximum caption length
print("Vocabulary Size:", vocab_size)
print("Maximum Caption Length:", max_caption_length)
Vocabulary Size: 8768
Maximum Caption Length: 34
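
As an optional check (assuming the tokenizer fitted above), the vocabulary can be inspected directly, e.g. the indices assigned to the start/end tokens and the encoding of one sample caption:

# Inspect the start/end token indices and encode the first cleaned caption
print(tokenizer.word_index['startseq'], tokenizer.word_index['endseq'])
print(tokenizer.texts_to_sequences([all_captions[0]])[0])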

Splitting the Dataset into the Training Set and The Testing Set¶

In [23]:
# Create a list of image IDs from the keys of the image_to_captions_mapping dictionary
image_ids = list(image_to_captions_mapping.keys())
In [24]:
# Determine the split index for creating training and test sets
split = int(len(image_ids) * 0.90)

# Create the training set by selecting the first 90% of image IDs
train = image_ids[:split]

# Create the test set by selecting the remaining 10% of image IDs
test = image_ids[split:]
In [24]:
# Generator function for training data
def data_generator(train_keys, img_to_caps_mapping, img_features, tokenizer, max_cap_length, vocab_size, batch_size):
    """
    Generates batches of training data for the image captioning model.

    Args:
    - train_keys (list): List of image keys for the training set.
    - img_to_caps_mapping (dict): Dictionary mapping image keys to associated captions.
    - img_features (dict): Dictionary containing image features extracted using a pre-trained model.
    - tokenizer (Tokenizer): Tokenizer object for text tokenization.
    - max_cap_length (int): Maximum length of captions after tokenization.
    - vocab_size (int): Size of the vocabulary.
    - batch_size (int): Size of each training batch.

    Yields:
    - tuple: Tuple containing input data (image features and input sequences) and target output sequences.
    """
    # Lists to store batch data
    X1_batch, X2_batch, y_batch = [], [], []
    # Counter for the current batch size
    batch_count = 0

    while True:
        # Loop through each image in the current batch
        for img_key in train_keys:
            # Get the captions associated with the current image
            captions = img_to_caps_mapping[img_key]

            # Loop through each caption for the current image
            for caption in captions:
                # Convert the caption to a sequence of token IDs
                caption_seq = tokenizer.texts_to_sequences([caption])[0]

                # Loop through the tokens in the caption sequence
                for i in range(1, len(caption_seq)):
                    # Split the sequence into input and output pairs
                    in_seq, out_seq = caption_seq[:i], caption_seq[i]

                    # Pad the input sequence to the specified maximum caption length
                    in_seq = pad_sequences([in_seq], maxlen=max_cap_length)[0]

                    # Convert the output sequence to one-hot encoded format
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    # Append data to batch lists
                    X1_batch.append(img_features[img_key][0])  # Image features
                    X2_batch.append(in_seq)  # Input sequence
                    y_batch.append(out_seq)  # Output sequence

                    # Increase the batch counter
                    batch_count += 1

                    # If the batch is complete, yield the batch and reset lists and counter
                    if batch_count == batch_size:
                        X1_batch, X2_batch, y_batch = np.array(X1_batch), np.array(X2_batch), np.array(y_batch)
                        yield [X1_batch, X2_batch], y_batch
                        X1_batch, X2_batch, y_batch = [], [], []
                        batch_count = 0
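
Before training, the generator can be smoke-tested with a minimal sketch like the one below (assuming the variables created earlier in this notebook):

# Pull a single batch and confirm the shapes expected by the model
gen = data_generator(train, image_to_captions_mapping, loaded_features,
                     tokenizer, max_caption_length, vocab_size, batch_size=32)
(X1_sample, X2_sample), y_sample = next(gen)
print(X1_sample.shape)  # (32, 4096) image feature vectors
print(X2_sample.shape)  # (32, 34) padded input token sequences
print(y_sample.shape)   # (32, 8768) one-hot encoded next-word targets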

Building Customized LSTM Neural Network¶

In [25]:
# Encoder model
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
fe2_projected = RepeatVector(max_caption_length)(fe2)
fe2_projected = Bidirectional(LSTM(256, return_sequences=True))(fe2_projected)

# Sequence feature layers
inputs2 = Input(shape=(max_caption_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = Bidirectional(LSTM(256, return_sequences=True))(se2)

# Apply attention mechanism using Dot product
attention = Dot(axes=[2, 2])([fe2_projected, se3])  # Calculate attention scores

# Softmax attention scores
attention_scores = Activation('softmax')(attention)

# Apply attention scores to sequence embeddings
attention_context = Lambda(lambda x: tf.einsum('ijk,ijl->ikl', x[0], x[1]))([attention_scores, se3])

# Sum the attended sequence embeddings along the time axis
context_vector = tf.reduce_sum(attention_context, axis=1)

# Decoder model
decoder_input = concatenate([context_vector, fe2], axis=-1)
decoder1 = Dense(256, activation='relu')(decoder_input)
outputs = Dense(vocab_size, activation='softmax')(decoder1)

# Create the model
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Visualize the model
model.summary()
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
==================================================================================================
 input_3 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 dropout_1 (Dropout)            (None, 4096)         0           ['input_3[0][0]']                
                                                                                                  
 input_4 (InputLayer)           [(None, 34)]         0           []                               
                                                                                                  
 dense_1 (Dense)                (None, 256)          1048832     ['dropout_1[0][0]']              
                                                                                                  
 embedding (Embedding)          (None, 34, 256)      2244608     ['input_4[0][0]']                
                                                                                                  
 repeat_vector (RepeatVector)   (None, 34, 256)      0           ['dense_1[0][0]']                
                                                                                                  
 dropout_2 (Dropout)            (None, 34, 256)      0           ['embedding[0][0]']              
                                                                                                  
 bidirectional (Bidirectional)  (None, 34, 512)      1050624     ['repeat_vector[0][0]']          
                                                                                                  
 bidirectional_1 (Bidirectional  (None, 34, 512)     1050624     ['dropout_2[0][0]']              
 )                                                                                                
                                                                                                  
 dot (Dot)                      (None, 34, 34)       0           ['bidirectional[0][0]',          
                                                                  'bidirectional_1[0][0]']        
                                                                                                  
 activation (Activation)        (None, 34, 34)       0           ['dot[0][0]']                    
                                                                                                  
 lambda (Lambda)                (None, 34, 512)      0           ['activation[0][0]',             
                                                                  'bidirectional_1[0][0]']        
                                                                                                  
 tf.math.reduce_sum (TFOpLambda  (None, 512)         0           ['lambda[0][0]']                 
 )                                                                                                
                                                                                                  
 concatenate (Concatenate)      (None, 768)          0           ['tf.math.reduce_sum[0][0]',     
                                                                  'dense_1[0][0]']                
                                                                                                  
 dense_2 (Dense)                (None, 256)          196864      ['concatenate[0][0]']            
                                                                                                  
 dense_3 (Dense)                (None, 8768)         2253376     ['dense_2[0][0]']                
                                                                                                  
==================================================================================================
Total params: 7,844,928
Trainable params: 7,844,928
Non-trainable params: 0
__________________________________________________________________________________________________
In [25]:
# Visualize the model architecture with shapes
plot_model(model, show_shapes=True)
Out[25]:
(Captioning model architecture diagram rendered by plot_model)

In developing my custom image captioning model (named "model_1"), I have carefully crafted a network that combines pre-extracted convolutional (VGG16) image features with recurrent and attention layers. The model is designed to generate descriptive captions for images.

Layers and Connections¶

  1. Input Layers:

    • input_3 (4096): Represents the image features as a vector of size 4096.
    • input_4 (34): Represents the tokenized sequence of words in the caption.
  2. Dropout Layer (dropout_1):

    • Applies dropout regularization to the image features.
  3. Dense Layer (dense_1):

    • Outputs a vector of size 256 from the processed image features.
  4. Embedding Layer (embedding):

    • Converts the tokenized word sequence into vectors of size 256.
  5. RepeatVector Layer (repeat_vector):

    • Repeats the vector representation of image features to match the length of the caption sequence.
  6. Dropout Layer (dropout_2):

    • Applies dropout regularization to the embedded word sequence.
  7. Bidirectional LSTM Layers (bidirectional, bidirectional_1):

    • Processes the repeated image features and the embedded word sequence bidirectionally, capturing context.
  8. Dot Layer (dot):

    • Calculates dot product attention scores between the two bidirectional LSTM outputs.
  9. Activation Layer (activation):

    • Applies softmax activation to obtain attention weights.
  10. Lambda Layer (lambda):

    • Computes the weighted sum of bidirectional LSTM outputs based on attention weights.
  11. Reduce Sum Layer (tf.math.reduce_sum):

    • Reduces the sequence of weighted sums along the time axis.
  12. Concatenate Layer (concatenate):

    • Concatenates the reduced sum with the vector output from the dense layer.
  13. Dense Layers (dense_2, dense_3):

    • Further processes the concatenated vector to generate the final output, predicting the next word in the sequence.

Total Parameters¶

  • Total parameters: 7,844,928
  • Trainable parameters: 7,844,928
  • Non-trainable parameters: 0

This architecture is carefully designed to combine image features and sequential information for accurate and context-aware caption generation. The total parameters reflect the model's capacity to learn and generalize from diverse image and caption data during training.
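
To make the attention step concrete, here is a small, illustrative shape check (not part of the original training code) of what the Lambda layer's einsum computes:

# einsum('ijk,ijl->ikl') applies a (time x time) attention matrix to a (time x features) sequence
scores = tf.random.uniform((2, 34, 34))      # (batch, query time, key time) attention weights
seq = tf.random.uniform((2, 34, 512))        # (batch, time, features) BiLSTM outputs
context = tf.einsum('ijk,ijl->ikl', scores, seq)
print(context.shape)                         # (2, 34, 512) attended sequence
print(tf.reduce_sum(context, axis=1).shape)  # (2, 512) context vector fed to the decoder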

Training the LSTM Neural Network¶

In [26]:
# Set the number of training epochs and batch size
epochs = 50
batch_size = 32

# Size each epoch by the number of training/validation images per batch
# (the generator yields word-level samples, so this is a per-epoch budget rather than one full pass)
steps_per_epoch = ceil(len(train) / batch_size)
validation_steps = ceil(len(test) / batch_size)  # Steps drawn from the validation generator each epoch

# Loop through the epochs for training
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    
    # Set up data generators for training and validation
    train_generator = data_generator(train, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size)
    validation_generator = data_generator(test, image_to_captions_mapping, loaded_features, tokenizer, max_caption_length, vocab_size, batch_size)
    
    # Train the model for one epoch using the data generators
    model.fit(train_generator, epochs=1, steps_per_epoch=steps_per_epoch,
              validation_data=validation_generator, validation_steps=validation_steps,
              verbose=1)
Epoch 1/50
228/228 [==============================] - 274s 1s/step - loss: 6.4268 - val_loss: 6.4850
Epoch 2/50
228/228 [==============================] - 253s 1s/step - loss: 5.2290 - val_loss: 6.4891
Epoch 3/50
228/228 [==============================] - 226s 989ms/step - loss: 4.8122 - val_loss: 6.2731
Epoch 4/50
228/228 [==============================] - 246s 1s/step - loss: 4.4898 - val_loss: 6.6783
Epoch 5/50
228/228 [==============================] - 237s 1s/step - loss: 4.2175 - val_loss: 6.7722
Epoch 6/50
228/228 [==============================] - 232s 1s/step - loss: 3.9323 - val_loss: 7.4398
Epoch 7/50
228/228 [==============================] - 275s 1s/step - loss: 3.5772 - val_loss: 7.3053
Epoch 8/50
228/228 [==============================] - 249s 1s/step - loss: 3.2714 - val_loss: 6.9621
Epoch 9/50
228/228 [==============================] - 296s 1s/step - loss: 3.0953 - val_loss: 7.4834
Epoch 10/50
228/228 [==============================] - 240s 1s/step - loss: 2.8158 - val_loss: 6.7651
Epoch 11/50
228/228 [==============================] - 221s 968ms/step - loss: 2.6980 - val_loss: 7.0226
Epoch 12/50
228/228 [==============================] - 158s 691ms/step - loss: 2.5395 - val_loss: 7.1167
Epoch 13/50
228/228 [==============================] - 152s 668ms/step - loss: 2.4294 - val_loss: 6.8429
Epoch 14/50
228/228 [==============================] - 154s 676ms/step - loss: 2.2601 - val_loss: 6.8003
Epoch 15/50
228/228 [==============================] - 151s 664ms/step - loss: 2.1455 - val_loss: 7.3354
Epoch 16/50
228/228 [==============================] - 154s 677ms/step - loss: 1.9821 - val_loss: 7.0470
Epoch 17/50
228/228 [==============================] - 150s 659ms/step - loss: 1.8604 - val_loss: 7.3459
Epoch 18/50
228/228 [==============================] - 150s 658ms/step - loss: 1.7827 - val_loss: 7.6326
Epoch 19/50
228/228 [==============================] - 149s 652ms/step - loss: 1.7168 - val_loss: 7.7972
Epoch 20/50
228/228 [==============================] - 149s 652ms/step - loss: 1.6248 - val_loss: 7.8400
Epoch 21/50
228/228 [==============================] - 152s 668ms/step - loss: 1.5278 - val_loss: 8.2000
Epoch 22/50
228/228 [==============================] - 153s 672ms/step - loss: 1.4666 - val_loss: 8.2232
Epoch 23/50
228/228 [==============================] - 153s 672ms/step - loss: 1.3989 - val_loss: 7.7666
Epoch 24/50
228/228 [==============================] - 153s 673ms/step - loss: 1.3431 - val_loss: 8.5297
Epoch 25/50
228/228 [==============================] - 155s 678ms/step - loss: 1.2546 - val_loss: 9.2400
Epoch 26/50
228/228 [==============================] - 159s 695ms/step - loss: 1.2188 - val_loss: 9.3097
Epoch 27/50
228/228 [==============================] - 155s 679ms/step - loss: 1.1183 - val_loss: 9.1880
Epoch 28/50
228/228 [==============================] - 156s 682ms/step - loss: 1.1076 - val_loss: 10.1482
Epoch 29/50
228/228 [==============================] - 155s 678ms/step - loss: 1.0801 - val_loss: 9.9377
Epoch 30/50
228/228 [==============================] - 152s 669ms/step - loss: 1.0158 - val_loss: 10.2713
Epoch 31/50
228/228 [==============================] - 153s 671ms/step - loss: 0.9524 - val_loss: 11.0540
Epoch 32/50
228/228 [==============================] - 154s 677ms/step - loss: 0.8777 - val_loss: 11.4524
Epoch 33/50
228/228 [==============================] - 165s 725ms/step - loss: 0.8454 - val_loss: 10.8839
Epoch 34/50
228/228 [==============================] - 172s 756ms/step - loss: 0.8011 - val_loss: 10.9346
Epoch 35/50
228/228 [==============================] - 172s 755ms/step - loss: 0.7754 - val_loss: 11.8426
Epoch 36/50
228/228 [==============================] - 168s 735ms/step - loss: 0.7140 - val_loss: 12.4560
Epoch 37/50
228/228 [==============================] - 171s 751ms/step - loss: 0.6856 - val_loss: 11.8752
Epoch 38/50
228/228 [==============================] - 172s 756ms/step - loss: 0.6648 - val_loss: 12.7536
Epoch 39/50
228/228 [==============================] - 171s 748ms/step - loss: 0.6507 - val_loss: 14.1370
Epoch 40/50
228/228 [==============================] - 178s 780ms/step - loss: 0.6400 - val_loss: 12.7257
Epoch 41/50
228/228 [==============================] - 177s 777ms/step - loss: 0.6346 - val_loss: 12.7481
Epoch 42/50
228/228 [==============================] - 176s 774ms/step - loss: 0.5966 - val_loss: 13.0265
Epoch 43/50
228/228 [==============================] - 178s 780ms/step - loss: 0.5656 - val_loss: 13.6092
Epoch 44/50
228/228 [==============================] - 218s 955ms/step - loss: 0.5219 - val_loss: 13.2933
Epoch 45/50
228/228 [==============================] - 209s 917ms/step - loss: 0.5004 - val_loss: 14.4468
Epoch 46/50
228/228 [==============================] - 170s 745ms/step - loss: 0.4939 - val_loss: 13.7727
Epoch 47/50
228/228 [==============================] - 171s 749ms/step - loss: 0.5200 - val_loss: 14.9553
Epoch 48/50
228/228 [==============================] - 180s 788ms/step - loss: 0.5434 - val_loss: 13.8833
Epoch 49/50
228/228 [==============================] - 179s 786ms/step - loss: 0.5359 - val_loss: 15.6298
Epoch 50/50
228/228 [==============================] - 187s 819ms/step - loss: 0.5167 - val_loss: 14.1596

Saving and Loading the LSTM Model¶

In [27]:
# Save the trained model to a file
model.save(os.path.join(OUTPUT_DIR, 'mymodel.h5'))
In [26]:
# Load the trained model from the saved file
model = load_model(os.path.join(OUTPUT_DIR, 'mymodel.h5'))

Predicting Image Caption Given the Image¶

In [27]:
def get_word_from_index(index, tokenizer):
    """
    Retrieve the word corresponding to a given index from the tokenizer's vocabulary.

    Args:
    - index (int): Index of the word.
    - tokenizer (Tokenizer): Tokenizer object.

    Returns:
    - str or None: The word corresponding to the index, or None if not found.
    """
    return next((word for word, idx in tokenizer.word_index.items() if idx == index), None)
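
Side note: the Keras Tokenizer also keeps an index_word reverse mapping, so the same lookup can be done without scanning word_index on every call. A hypothetical equivalent helper would be:

def get_word_from_index_fast(index, tokenizer):
    # Direct dictionary lookup in the Tokenizer's built-in index -> word mapping
    return tokenizer.index_word.get(index)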
In [28]:
def predict_caption(model, image_features, tokenizer, max_caption_length):
    """
    Generate a descriptive caption for a given image using the provided image captioning model.

    Args:
    - model (Model): Trained image captioning model.
    - image_features (numpy.ndarray): Extracted features of the input image.
    - tokenizer (Tokenizer): Tokenizer object used for text tokenization.
    - max_caption_length (int): Maximum length of the generated caption.

    Returns:
    - str: Predicted caption for the input image.
    """
    # Initialize the caption sequence
    caption = 'startseq'
    
    # Generate the caption
    for _ in range(max_caption_length):
        # Convert the current caption to a sequence of token indices
        sequence = tokenizer.texts_to_sequences([caption])[0]
        # Pad the sequence to match the maximum caption length
        sequence = pad_sequences([sequence], maxlen=max_caption_length)
        # Predict the probability distribution for the next word
        y_hat = model.predict([image_features, sequence], verbose=0)
        # Get the index with the highest predicted probability
        predicted_index = np.argmax(y_hat)
        # Convert the index to a word
        predicted_word = get_word_from_index(predicted_index, tokenizer)

        # Stop if no word maps to the predicted index (avoids concatenating None)
        if predicted_word is None:
            break

        # Append the predicted word to the caption
        caption += " " + predicted_word

        # Stop if the end sequence token is generated
        if predicted_word == 'endseq':
            break
    
    return caption
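
A minimal usage sketch (assuming the model, image features, and tokenizer loaded earlier in this notebook):

# Generate a caption for the first test image and strip the start/end markers for display
sample_id = test[0]
raw_caption = predict_caption(model, loaded_features[sample_id], tokenizer, max_caption_length)
print(' '.join(word for word in raw_caption.split() if word not in ('startseq', 'endseq')))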

Evaluating BLEU Scores¶

In [50]:
# Initialize lists to store actual and predicted captions
actual_captions_list = []
predicted_captions_list = []

# Loop through the test data
for key in test:
    # Retrieve actual captions for the current image
    actual_captions = image_to_captions_mapping[key]
    # Generate a caption for the image using the trained model
    predicted_caption = predict_caption(model, loaded_features[key], tokenizer, max_caption_length)
    
    # Split actual captions into individual words
    actual_captions_words = [caption.split() for caption in actual_captions]
    # Split the predicted caption into words
    predicted_caption_words = predicted_caption.split()
    
    # Append the word lists to their respective lists
    actual_captions_list.append(actual_captions_words)
    predicted_captions_list.append(predicted_caption_words)

# Calculate corpus-level BLEU scores (BLEU-1: unigram precision; BLEU-2: cumulative 1- and 2-gram precision)
bleu_score_1 = corpus_bleu(actual_captions_list, predicted_captions_list, weights=(1.0, 0, 0, 0))
bleu_score_2 = corpus_bleu(actual_captions_list, predicted_captions_list, weights=(0.5, 0.5, 0, 0))

print("BLEU-1: %f" % bleu_score_1)
print("BLEU-2: %f" % bleu_score_2)
BLEU-1: 0.547920
BLEU-2: 0.244377
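
For readers unfamiliar with the weights argument, the toy example below (made-up sentences, not from the dataset) shows how corpus_bleu is called and what the weight tuples mean:

# corpus_bleu expects a list of reference lists and a list of hypotheses (all pre-tokenized)
references = [[['a', 'dog', 'runs', 'on', 'the', 'grass'], ['the', 'dog', 'is', 'running', 'outside']]]
hypotheses = [['a', 'dog', 'runs', 'on', 'grass']]
print(corpus_bleu(references, hypotheses, weights=(1.0, 0, 0, 0)))    # unigram precision only
print(corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0)))  # cumulative 1- and 2-gram score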

Saving and Loading the Actual and Predicted Image Caption¶

In [48]:
# Save the lists of actual captions using pickle
with open('actual_captions_list.pkl', 'wb') as actual_file:
    pickle.dump(actual_captions_list, actual_file)

# Save the lists of predicted captions using pickle
with open('predicted_captions_list.pkl', 'wb') as predicted_file:
    pickle.dump(predicted_captions_list, predicted_file)
In [49]:
# Load actual captions list
with open('actual_captions_list.pkl', 'rb') as file:
    loaded_actual_captions_list = pickle.load(file)

# Load predicted captions list
with open('predicted_captions_list.pkl', 'rb') as file:
    loaded_predicted_captions_list = pickle.load(file)
In [30]:
def generate_caption(image_name):
    """
    Display actual and predicted captions for a given image along with its visual representation.

    Args:
    - image_name (str): Name of the image file.

    Returns:
    - None
    """
    # Extract image ID from the image name
    image_id = image_name.split('.')[0]
    # Get the full path of the image file
    img_path = os.path.join(INPUT_DIR, "Images", image_name)
    # Open the image using the PIL library
    image = Image.open(img_path)
    # Retrieve actual captions for the image
    captions = image_to_captions_mapping[image_id]
    
    print('---------------------Actual---------------------')
    # Display actual captions
    for caption in captions:
        print(caption)
    
    # Generate and display the predicted caption
    y_pred = predict_caption(model, loaded_features[image_id], tokenizer, max_caption_length)
    print('--------------------Predicted--------------------')
    print(y_pred)
    
    # Display the image
    plt.imshow(image)

Displaying Predicted Image Caption¶

In [31]:
generate_caption("101669240_b2d3e7f17b.jpg")
---------------------Actual---------------------
startseq man in hat is displaying pictures next to skier in blue hat endseq
startseq man skis past another man displaying paintings in the snow endseq
startseq person wearing skis looking at framed pictures set up in the snow endseq
startseq skier looks at framed pictures in the snow next to trees endseq
startseq man on skis looking at artwork for sale in the snow endseq
--------------------Predicted--------------------
startseq man on skis looking at artwork for sale in the snow endseq
In [37]:
generate_caption("1077546505_a4f6c4daa9.jpg")
---------------------Actual---------------------
startseq boy in blue shorts slides down slide into pool endseq
startseq boy in blue swimming trunks slides down yellow slide into wading pool with inflatable toys floating in the water endseq
startseq boy rides down slide into small backyard pool endseq
startseq boy sliding down slide into pool with colorful tubes endseq
startseq child is falling off slide onto colored balloons floating on pool of water endseq
--------------------Predicted--------------------
startseq boy rides down slide into small backyard pool endseq
In [47]:
generate_caption("106490881_5a2dd9b7bd.jpg")
---------------------Actual---------------------
startseq boy in his blue swim shorts at the beach endseq
startseq boy smiles for the camera at beach endseq
startseq young boy in swimming trunks is walking with his arms outstretched on the beach endseq
startseq children playing on the beach endseq
startseq the boy is playing on the shore of an ocean endseq
--------------------Predicted--------------------
startseq boy in his blue swim shorts at the beach endseq

Streamlit User-Friendly Version¶

I've also created a user-friendly version of the Image Caption Generator using Streamlit, which lets users generate a caption for an image with a single click.

To explore the Streamlit version, click the button below:

Generate Image Caption