In [1]:
import numpy as np
import os
import re
import pandas as pd
from scipy.io import loadmat
import json

### Download the image dataset and metadata files

The dog breed image dataset is made available by Stanford and can be downloaded from [here](http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar)  
The dataset can be split into train and test sets using the list files, which can be downloaded from [here](http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar)  

In [2]:
# Read the three MATLAB list files (complete, train and test listings)
full_list, train_list, test_list = (
    loadmat(mat_path)
    for mat_path in ("data/file_list.mat", "data/train_list.mat", "data/test_list.mat")
)

In [3]:
#function to convert the information files to a useful format
def create_img_info_df(mat_dict_file):
    """Build a table of image paths and breed names from a loaded list file.

    Parameters
    ----------
    mat_dict_file : dict
        Mapping (as loaded from one of the ``.mat`` list files) whose
        ``'file_list'`` entry yields the i-th relative image path when indexed
        as ``[i][0][0]``. Paths are assumed to look like
        ``<id>-<breed>/<image file>`` -- TODO confirm against the dataset.

    Returns
    -------
    pandas.DataFrame
        One row per image, with columns ``file_path`` and ``breed``.

    Raises
    ------
    ValueError
        If a path contains no ``-`` from which a breed name can be taken.
    """
    image_info_list = []
    for entry in mat_dict_file['file_list']:
        file_path = str(entry[0][0])
        # Take everything after the first '-' up to the next '/'. The previous
        # pattern (\w*) stopped at the next hyphen and therefore truncated
        # hyphenated breed names such as "soft-coated_wheaten_terrier".
        match = re.search(r'(?<=-)[^/]*', file_path)
        if match is None:
            raise ValueError(
                "cannot extract breed from file path {!r}".format(file_path)
            )
        image_info_list.append([file_path, match.group()])
    return pd.DataFrame(image_info_list, columns=["file_path", "breed"])

In [4]:
# Build one (file_path, breed) table per loaded list file
full_set_info_df, train_info_df, test_info_df = (
    create_img_info_df(mat_contents)
    for mat_contents in (full_list, train_list, test_list)
)

In [5]:
# Report how many images each table holds
for message, split_df in (
    ("The complete dataset has {} images", full_set_info_df),
    ("the train set has {} images", train_info_df),
    ("the test set has {} images", test_info_df),
):
    print(message.format(len(split_df)))

The complete dataset has 20580 images
the train set has 12000 images
the test set has 8580 images


In [6]:
# Sorted list of the distinct breed names found in the complete listing
breeds = sorted(full_set_info_df["breed"].dropna().unique())

In [7]:
# Show how many distinct breeds were found
breed_count = len(breeds)
print("Total number of unique breeds: {}".format(breed_count))

Total number of unique breeds: 120


In [8]:
# Lookup tables: breed name -> class id (position in `breeds`) and the reverse
breed_to_class_id = {breed: class_id for class_id, breed in enumerate(breeds)}
class_id_to_breed = dict(zip(breed_to_class_id.values(), breed_to_class_id.keys()))

In [None]:
# Persist both lookup tables as JSON files for use during prediction.
# Note: JSON object keys are always strings, so the integer class ids in
# class_id_to_breed are written out as "0", "1", ...
for json_path, mapping in (
    ("data/breed_to_class_id.json", breed_to_class_id),
    ("data/class_id_to_breed.json", class_id_to_breed),
):
    with open(json_path, "w") as filehandler:
        json.dump(mapping, filehandler)

### Convert and save the train and test images in HDF5 format to make reading fast during the training process

In [9]:
import h5py
import numpy as np
import pandas as pd
from PIL import Image

In [10]:
# Open the two HDF5 output files in 'w' mode (creates the file, truncating any existing one)
h5_train_file, h5_test_file = (
    h5py.File(h5_path, 'w') for h5_path in ("data/train.h5", "data/test.h5")
)

In [11]:
# Pre-allocate the uint8 datasets in each file: one 224 x 224 x 3 image slot per row
# (224 by 224 is the size to be used with resnet), zero-filled, plus one label per row
image_shape = (224, 224, 3)
n_train, n_test = len(train_info_df), len(test_info_df)

train_features = h5_train_file.create_dataset(
    'data', shape=(n_train,) + image_shape, dtype=np.uint8, fillvalue=0
)
train_labels = h5_train_file.create_dataset('labels', shape=(n_train, 1), dtype=np.uint8)
test_features = h5_test_file.create_dataset(
    'data', shape=(n_test,) + image_shape, dtype=np.uint8, fillvalue=0
)
test_labels = h5_test_file.create_dataset('labels', shape=(n_test, 1), dtype=np.uint8)

In [12]:
#function to convert all the images to a defined width and height
def resizeImage(size, image):
    """Shrink ``image`` to fit inside ``size`` and centre it on a black RGB canvas.

    Parameters
    ----------
    size : tuple of int
        Target ``(width, height)`` of the returned image.
    image : PIL.Image.Image
        Image to fit. It is resized IN PLACE by ``thumbnail`` (aspect ratio is
        preserved), so the caller's object is modified.

    Returns
    -------
    PIL.Image.Image
        A new ``'RGB'`` image of exactly ``size`` with the shrunken input
        pasted in the middle; uncovered area stays black.
    """
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter.
    image.thumbnail(size, Image.LANCZOS)
    background = Image.new('RGB', size, (0, 0, 0))
    # Top-left corner that centres the (possibly smaller) image on the canvas.
    offset = ((size[0] - image.size[0]) // 2, (size[1] - image.size[1]) // 2)
    background.paste(image, offset)
    return background

In [13]:
# write image and label to h5_train_file and h5_test_file
def _write_split_to_h5(info_df, features, labels, image_dir="data/Images/", size=(224, 224)):
    """Resize every image listed in ``info_df`` and store it with its class id.

    Parameters
    ----------
    info_df : pandas.DataFrame
        Table with ``file_path`` and ``breed`` columns.
    features : h5 dataset
        Destination for the image arrays; row ``i`` receives image ``i``.
    labels : h5 dataset
        Destination for the class ids; row ``i`` receives label ``i``.
    image_dir : str
        Prefix prepended to each ``file_path`` to locate the image on disk.
    size : tuple of int
        ``(width, height)`` passed to ``resizeImage``.
    """
    # Iterate the two columns directly instead of building a row Series with
    # .iloc[i] twice per image.
    for i, (file_path, breed) in enumerate(zip(info_df['file_path'], info_df['breed'])):
        # The context manager releases the image file handle once the resized
        # copy has been produced.
        with Image.open(image_dir + file_path) as img:
            resized = resizeImage(size, img)
        features[i] = np.array(resized)  # write to h5 dataset
        labels[i] = breed_to_class_id[breed]  # write to h5 dataset


_write_split_to_h5(train_info_df, train_features, train_labels)
_write_split_to_h5(test_info_df, test_features, test_labels)

In [15]:
# Close both HDF5 files now that all images and labels have been written
for h5_file in (h5_train_file, h5_test_file):
    h5_file.close()