From 23a0591630c4bfb95ae3c82abb92cb95cbbcc6f2 Mon Sep 17 00:00:00 2001 From: Nigel Barink Date: Thu, 30 Mar 2023 19:41:46 +0200 Subject: [PATCH] Dataloaders and datasets --- .gitignore | 2 +- data.ipynb | 267 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 +- 3 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 data.ipynb diff --git a/.gitignore b/.gitignore index 2603ec8..a712188 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ data -venv +.venv .vscode diff --git a/data.ipynb b/data.ipynb new file mode 100644 index 0000000..40614f8 --- /dev/null +++ b/data.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Code for processing data samples can get messy and hard to maintain; we ideally want our dataset code to be decoupled from our model training code for better readability and modularity. PyTorch provides two data primitives: torch.utils.data.DataLoader and torch.utils.data.Dataset that allow you to use pre-loaded datasets as well as your own data. Dataset stores the samples and their corresponding labels, and DataLoader wraps an iterable around the Dataset to enable easy access to the samples." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch.utils.data import Dataset\n", + "from torchvision import datasets\n", + "from torchvision.transforms import ToTensor\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Using FashionMNIST dataset as an example again!\n", + "\n", + "training_data = datasets.FashionMNIST(\n", + " root=\"data\", # Path where the data will be / is stored \n", + " train=True, # Specify whether this is for training or test\n", + " download=True, # Should we download the data if it's not available in the previously specified `root` path\n", + " transform=ToTensor() # Feature (image) transformation; labels would use `target_transform`\n", + ")\n", + "\n", + "test_data = datasets.FashionMNIST(\n", + " root=\"data\",\n", + " train=False,\n", + " download=True,\n", + " transform=ToTensor()\n", + ")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Iterating and Visulalizing the Dataset\n", + "\"\"\"\n", + "We can index Datasets manually like a list: training_data[index].\n", + "We use matplotlib to visualize some samples in our training data.\n", + "\"\"\"\n", + "labels_map = {\n", + " 0: \"T-Shirt\",\n", + " 1: \"Trouser\",\n", + " 2: \"Pullover\",\n", + " 3: \"Dress\",\n", + " 4: \"Coat\",\n", + " 5: \"Sandal\",\n", + " 6: \"Shirt\",\n", + " 7: \"Sneaker\",\n", + " 8: \"Bag\",\n", + " 9: \"Ankle Boot\",\n", + "}\n", + "\n", + "figure = plt.figure(figsize=(8,8))\n", + "cols, rows = 3, 3\n", + "for i in range(1, cols * rows + 1):\n", + " sample_idx = torch.randint(len(training_data), size=(1,)).item()\n", + " img, label = training_data[sample_idx]\n", + " figure.add_subplot(rows, cols, i)\n", + " plt.title(labels_map[label])\n", + " plt.axis(\"off\")\n", + " plt.imshow(img.squeeze(), cmap=\"gray\")\n", + "\n", + "plt.show()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A custom Dataset class must implement three functions: __init__, __len__, and __getitem__. Take a look at this implementation; the FashionMNIST images are stored in a directory img_dir, and their labels are stored separately in a CSV file annotations_file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "from torchvision.io import read_image\n", + "\n", + "class CustomImageDataset(Dataset):\n", + " def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):\n", + " self.img_labels = pd.read_csv(annotations_file)\n", + " self.img_dir = img_dir\n", + " self.transform = transform\n", + " self.target_transform = target_transform\n", + "\n", + "\n", + " def __len__(self):\n", + " return len(self.img_labels)\n", + " \n", + " def __getitem__(self, idx):\n", + " img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx,0])  # column 0: image file name\n", + " image = read_image(img_path)\n", + " label = self.img_labels.iloc[idx,1]  # column 1: label\n", + " if self.transform:\n", + " image = self.transform(image)\n", + " if self.target_transform:\n", + " label = self.target_transform(label)\n", + " return image, label\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The __init__ function is run once when instantiating the Dataset object. We initialize the directory containing the images, the annotations file, and both transforms\n", + "\n", + "The labels.csv file looks like:\n", + "\n", + "```csv\n", + "\n", + "tshirt1.jpg, 0\n", + "tshirt2.jpg, 0\n", + "......\n", + "ankleboot999.jpg, 9\n", + "\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The __len__ function returns the number of samples in our dataset." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The __getitem__ function loads and returns a sample from the dataset at the given index idx.
Based on the index, it identifies the image’s location on disk, converts that to a tensor using read_image, retrieves the corresponding label from the csv data in self.img_labels, calls the transform functions on them (if applicable), and returns the tensor image and corresponding label in a tuple." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preparing data for training with DataLoaders\n", + "\n", + "The Dataset retrieves our dataset’s features and labels one sample at a time. While training a model, we typically want to pass samples in “minibatches”, reshuffle the data at every epoch to reduce model overfitting, and use Python’s multiprocessing to speed up data retrieval.\n", + "\n", + "DataLoader is an iterable that abstracts this complexity for us in an easy API." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.utils.data import DataLoader\n", + "\n", + "train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)\n", + "test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have loaded that dataset into the DataLoader and can iterate through the dataset as needed. Each iteration below returns a batch of train_features and train_labels (containing batch_size=64 features and labels respectively). Because we specified shuffle=True, after we iterate over all batches the data is shuffled (for finer-grained control over the data loading order, take a look at Samplers)." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature batch shape; torch.Size([64, 1, 28, 28])\n", + "Labels batch shape: torch.Size([64])\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Label: 6\n" + ] + } + ], + "source": [ + "# Display image and label\n", + "train_features, train_labels = next(iter(train_dataloader))\n", + "print(f\"Feature batch shape; {train_features.size()}\")\n", + "print(f\"Labels batch shape: {train_labels.size()}\")\n", + "img = train_features[0].squeeze()\n", + "label = train_labels[0]\n", + "plt.imshow(img, cmap=\"gray\")\n", + "plt.show()\n", + "print(f\"Label: {label}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "4944a85e4459d92b06dc1c94852b4e8e8e6d0531f16bd543c843a0ca37cdfcdb" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index 1579240..b68e154 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ torch torchvision --index-url https://download.pytorch.org/whl/cu117 torchaudio -numpy \ No newline at end of file +numpy +pandas \ No newline at end of file