diff --git a/Makefile b/Makefile
index 5e994b01141..0a36670dd6c 100644
--- a/Makefile
+++ b/Makefile
@@ -86,6 +86,9 @@ download:
 	wget -nv -N https://www.manythings.org/anki/deu-eng.zip -P $(DATADIR)
 	unzip -o $(DATADIR)/deu-eng.zip -d beginner_source/data/
 
+	# Download PennFudanPed dataset for intermediate_source/torchvision_tutorial.py
+	wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P $(DATADIR)
+	unzip -o $(DATADIR)/PennFudanPed.zip -d intermediate_source/data/
 
 docs:
 	make download
@@ -103,3 +106,5 @@ html-noplot:
 clean-cache:
 	make clean
 	rm -rf advanced beginner intermediate recipes
+	# remove additional python files downloaded for torchvision_tutorial.py
+	rm -rf intermediate_source/engine.py intermediate_source/utils.py intermediate_source/transforms.py intermediate_source/coco_eval.py intermediate_source/coco_utils.py
diff --git a/_static/img/tv_tutorial/tv_image01.png b/_static/img/tv_tutorial/tv_image01.png
deleted file mode 100644
index bb47d27d24e..00000000000
Binary files a/_static/img/tv_tutorial/tv_image01.png and /dev/null differ
diff --git a/_static/img/tv_tutorial/tv_image02.png b/_static/img/tv_tutorial/tv_image02.png
deleted file mode 100644
index 8717199010b..00000000000
Binary files a/_static/img/tv_tutorial/tv_image02.png and /dev/null differ
diff --git a/_static/img/tv_tutorial/tv_image05.png b/_static/img/tv_tutorial/tv_image05.png
deleted file mode 100644
index 3961033693a..00000000000
Binary files a/_static/img/tv_tutorial/tv_image05.png and /dev/null differ
diff --git a/_static/img/tv_tutorial/tv_image06.png b/_static/img/tv_tutorial/tv_image06.png
deleted file mode 100644
index 4c20d89026a..00000000000
Binary files a/_static/img/tv_tutorial/tv_image06.png and /dev/null differ
diff --git a/en-wordlist.txt b/en-wordlist.txt
index addeb78ebfb..c520d6360ad 100644
--- a/en-wordlist.txt
+++ b/en-wordlist.txt
@@ -16,6 +16,7 @@ RRef
 OOM
 subfolder
 Dialogs
+PennFudan
 performant
 multithreading
 linearities
@@ -36,6 +37,8 @@ breakpoint
 MobileNet
 DeepLabV
 Resampling
+RCNN
+RPN
 APIs
 ATen
 AVX
@@ -145,6 +148,7 @@ LRSchedulers
 Lua
 Luong
 macos
+mAP
 MLP
 MLPs
 MNIST
@@ -178,10 +182,12 @@ OU
 PIL
 PPO
 Plotly
+pre
 Prec
 Profiler
 PyTorch's
 RGB
+RGBA
 RL
 RNN
 RNNs
@@ -345,6 +351,7 @@ jit
 jitter
 jpg
 judgements
+keypoint
 kwargs
 labelled
 learnable
@@ -425,6 +432,7 @@ reinitializes
 relu
 reproducibility
 rescale
+rescaling
 resnet
 restride
 rewinded
@@ -476,10 +484,12 @@ torchscriptable
 torchtext
 torchtext's
 torchvision
+TorchVision
 torchviz
 traceback
 tradeoff
 tradeoffs
+uint
 uncomment
 uncommented
 underflowing
diff --git a/_static/tv-training-code.py b/intermediate_source/torchvision_tutorial.py
similarity index 91%
rename from _static/tv-training-code.py
rename to intermediate_source/torchvision_tutorial.py
index bdd93760a7d..f1562d71a47 100644
--- a/_static/tv-training-code.py
+++ b/intermediate_source/torchvision_tutorial.py
@@ -6,17 +6,10 @@
 ######################################################################
 #
-# .. tip::
-#
-#    To get the most of this tutorial, we suggest using this
-#    `Colab Version `__.
-#    This will allow you to experiment with the information presented below.
-#
-#
 # For this tutorial, we will be finetuning a pre-trained `Mask
-# R-CNN `__ model on the `Penn-Fudan
+# R-CNN `_ model on the `Penn-Fudan
 # Database for Pedestrian Detection and
-# Segmentation `__. It contains
+# Segmentation `_. It contains
 # 170 images with 345 instances of pedestrians, and we will use it to
 # illustrate how to use the new features in torchvision in order to train
 # an object detection and instance segmentation model on a custom dataset.
@@ -35,7 +28,7 @@
 # The reference scripts for training object detection, instance
 # segmentation and person keypoint detection allow for easily supporting
 # new custom datasets. The dataset should inherit from the standard
-# ``torch.utils.data.Dataset`` class, and implement ``__len__`` and
+# :class:`torch.utils.data.Dataset` class, and implement ``__len__`` and
 # ``__getitem__``.
 #
 # The only specificity that we require is that the dataset ``__getitem__``
@@ -65,7 +58,7 @@
 # ``pycocotools`` which can be installed with ``pip install pycocotools``.
 #
 # .. note ::
-#   For Windows, please install ``pycocotools`` from `gautamchitnis `__ with command
+#   For Windows, please install ``pycocotools`` from `gautamchitnis `_ with command
 #
 # ``pip install git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI``
 #
@@ -85,10 +78,16 @@
 # Writing a custom dataset for PennFudan
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
-# Let’s write a dataset for the PennFudan dataset. After `downloading and
-# extracting the zip
-# file `__, we
-# have the following folder structure:
+# Let’s write a dataset class for the PennFudan dataset. First, let's download the dataset and
+# extract the `zip file `_:
+#
+# .. code:: sh
+#
+#    wget https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P data
+#    cd data && unzip PennFudanPed.zip
+#
+#
+# We have the following folder structure:
 #
 # ::
 #
 #    PennFudanPed/
 #      PedMasks/
 #        FudanPed00001_mask.png
 #        FudanPed00002_mask.png
 #        FudanPed00003_mask.png
 #        FudanPed00004_mask.png
 #        ...
 #      PNGImages/
 #        FudanPed00001.png
 #        FudanPed00002.png
 #        FudanPed00003.png
@@ -106,21 +105,33 @@
 #        FudanPed00004.png
 #
 # Here is one example of a pair of images and segmentation masks
-#
-# .. image:: ../../_static/img/tv_tutorial/tv_image01.png
-#
-# .. image:: ../../_static/img/tv_tutorial/tv_image02.png
-#
+
+import matplotlib.pyplot as plt
+from torchvision.io import read_image
+
+
+image = read_image("data/PennFudanPed/PNGImages/FudanPed00046.png")
+mask = read_image("data/PennFudanPed/PedMasks/FudanPed00046_mask.png")
+
+plt.figure(figsize=(16, 8))
+plt.subplot(121)
+plt.title("Image")
+plt.imshow(image.permute(1, 2, 0))
+plt.subplot(122)
+plt.title("Mask")
+plt.imshow(mask.permute(1, 2, 0))
+
+######################################################################
 # So each image has a corresponding
 # segmentation mask, where each color corresponds to a different instance.
 # Let’s write a :class:`torch.utils.data.Dataset` class for this dataset.
 # In the code below, we are wrapping images, bounding boxes and masks into
-# ``torchvision.TVTensor`` classes so that we will be able to apply torchvision
+# :class:`torchvision.tv_tensors.TVTensor` classes so that we will be able to apply torchvision
 # built-in transformations (`new Transforms API `_)
 # for the given object detection and segmentation task.
 # Namely, image tensors will be wrapped by :class:`torchvision.tv_tensors.Image`, bounding boxes into
 # :class:`torchvision.tv_tensors.BoundingBoxes` and masks into :class:`torchvision.tv_tensors.Mask`.
-# As ``torchvision.TVTensor`` are :class:`torch.Tensor` subclasses, wrapped objects are also tensors and inherit the plain
+# As :class:`torchvision.tv_tensors.TVTensor` classes are :class:`torch.Tensor` subclasses, wrapped objects are also tensors and inherit the plain
 # :class:`torch.Tensor` API. For more information about torchvision ``tv_tensors`` see
 # `this documentation `_.
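To make the ``tv_tensors`` wrapping concrete, here is a minimal sketch (not part of the patch; it assumes torchvision >= 0.16 and uses made-up shapes and boxes):

    import torch
    from torchvision import tv_tensors

    # a hypothetical 3-channel image with two instance masks
    img = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)
    masks = torch.zeros((2, 480, 640), dtype=torch.uint8)
    masks[0, 100:200, 100:200] = 1
    masks[1, 250:400, 300:450] = 1

    # wrap image, boxes and masks so the v2 transforms know how to handle them
    img = tv_tensors.Image(img)
    boxes = tv_tensors.BoundingBoxes(
        torch.tensor([[100, 100, 200, 200], [300, 250, 450, 400]]),
        format="XYXY",
        canvas_size=(480, 640),  # (H, W) of the image the boxes live in
    )
    masks = tv_tensors.Mask(masks)

    # wrapped objects are still plain tensors and keep the torch.Tensor API
    print(isinstance(img, torch.Tensor), img.shape, boxes.canvas_size)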
@@ -196,8 +207,8 @@ def __len__(self):
 # -------------------
 #
 # In this tutorial, we will be using `Mask
-# R-CNN `__, which is based on top of
-# `Faster R-CNN `__. Faster R-CNN is a
+# R-CNN `_, which is built on top of
+# `Faster R-CNN `_. Faster R-CNN is a
 # model that predicts both bounding boxes and class scores for potential
 # objects in the image.
 #
@@ -345,6 +356,7 @@ def get_model_instance_segmentation(num_classes):
     os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py")
     os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py")
 
+######################################################################
 # Since v0.15.0 torchvision provides `new Transforms API `_
 # to easily write data augmentation pipelines for Object Detection and Segmentation tasks.
 #
@@ -362,7 +374,7 @@ def get_transform(train):
     transforms.append(T.ToPureTensor())
     return T.Compose(transforms)
 
-
+######################################################################
 # Testing ``forward()`` method (Optional)
 # ---------------------------------------
 #
@@ -455,8 +467,8 @@ def get_transform(train):
     gamma=0.1
 )
 
-# let's train it for 5 epochs
-num_epochs = 5
+# let's train it just for 2 epochs
+num_epochs = 2
 
 for epoch in range(num_epochs):
     # train for one epoch, printing every 10 iterations
@@ -477,14 +489,12 @@ def get_transform(train):
 # But what do the predictions look like? Let’s take one image in the
 # dataset and verify
 #
-# .. image:: ../../_static/img/tv_tutorial/tv_image05.png
-#
 
 import matplotlib.pyplot as plt
 from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks
 
-image = read_image("../_static/img/tv_tutorial/tv_image05.png")
+image = read_image("data/PennFudanPed/PNGImages/FudanPed00046.png")
 eval_transform = get_transform(train=False)
 
 model.eval()
@@ -517,7 +527,7 @@ def get_transform(train):
 #
 # In this tutorial, you have learned how to create your own training
 # pipeline for object detection models on a custom dataset. For
-# that, you wrote a ``torch.utils.data.Dataset`` class that returns the
+# that, you wrote a :class:`torch.utils.data.Dataset` class that returns the
 # images and the ground truth boxes and segmentation masks. You also
 # leveraged a Mask R-CNN model pre-trained on COCO train2017 in order to
 # perform transfer learning on this new dataset.
@@ -526,5 +536,3 @@ def get_transform(train):
 # training, check ``references/detection/train.py``, which is present in
 # the torchvision repository.
 #
-# You can download a full source file for this tutorial
-# `here `__.
\ No newline at end of file
diff --git a/intermediate_source/torchvision_tutorial.rst b/intermediate_source/torchvision_tutorial.rst
deleted file mode 100644
index a3856c16a11..00000000000
--- a/intermediate_source/torchvision_tutorial.rst
+++ /dev/null
@@ -1,638 +0,0 @@
-TorchVision Object Detection Finetuning Tutorial
-====================================================
-
-.. tip::
-
-   To get the most of this tutorial, we suggest using this
-   `Colab Version `__.
-   This will allow you to experiment with the information presented below.
-
-
-For this tutorial, we will be finetuning a pre-trained `Mask
-R-CNN `__ model on the `Penn-Fudan
-Database for Pedestrian Detection and
-Segmentation `__. It contains
-170 images with 345 instances of pedestrians, and we will use it to
-illustrate how to use the new features in torchvision in order to train
-an object detection and instance segmentation model on a custom dataset.
-
-
-.. note ::
-
-  This tutorial works only with torchvision version >=0.16 or nightly.
-  If you're using torchvision<=0.15, please follow
-  `this tutorial instead `_.
-
-
-Defining the Dataset
---------------------
-
-The reference scripts for training object detection, instance
-segmentation and person keypoint detection allows for easily supporting
-adding new custom datasets. The dataset should inherit from the standard
-``torch.utils.data.Dataset`` class, and implement ``__len__`` and
-``__getitem__``.
-
-The only specificity that we require is that the dataset ``__getitem__``
-should return a tuple:
-
-- image: :class:`torchvision.tv_tensors.Image` of shape ``[3, H, W]``, a pure tensor, or a PIL Image of size ``(H, W)``
-- target: a dict containing the following fields
-
-    - ``boxes``, :class:`torchvision.tv_tensors.BoundingBoxes` of shape ``[N, 4]``:
-      the coordinates of the ``N`` bounding boxes in ``[x0, y0, x1, y1]`` format, ranging from ``0``
-      to ``W`` and ``0`` to ``H``
-    - ``labels``, integer :class:`torch.Tensor` of shape ``[N]``: the label for each bounding box.
-      ``0`` represents always the background class.
-    - ``image_id``, int: an image identifier. It should be
-      unique between all the images in the dataset, and is used during
-      evaluation
-    - ``area``, float :class:`torch.Tensor` of shape ``[N]``: the area of the bounding box. This is used
-      during evaluation with the COCO metric, to separate the metric
-      scores between small, medium and large boxes.
-    - ``iscrowd``, uint8 :class:`torch.Tensor` of shape ``[N]``: instances with ``iscrowd=True`` will be
-      ignored during evaluation.
-    - (optionally) ``masks``, :class:`torchvision.tv_tensors.Mask` of shape ``[N, H, W]``: the segmentation
-      masks for each one of the objects
-
-If your dataset is compliant with above requirements then it will work for both
-training and evaluation codes from the reference script. Evaluation code will use scripts from
-``pycocotools`` which can be installed with ``pip install pycocotools``.
-
-.. note ::
-  For Windows, please install ``pycocotools`` from `gautamchitnis `__ with command
-
-  ``pip install git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI``
-
-One note on the ``labels``. The model considers class ``0`` as background. If your dataset does not contain the background class,
-you should not have ``0`` in your ``labels``. For example, assuming you have just two classes, *cat* and *dog*, you can
-define ``1`` (not ``0``) to represent *cats* and ``2`` to represent *dogs*. So, for instance, if one of the images has both
-classes, your ``labels`` tensor should look like ``[1, 2]``.
-
-Additionally, if you want to use aspect ratio grouping during training
-(so that each batch only contains images with similar aspect ratios),
-then it is recommended to also implement a ``get_height_and_width``
-method, which returns the height and the width of the image. If this
-method is not provided, we query all elements of the dataset via
-``__getitem__`` , which loads the image in memory and is slower than if
-a custom method is provided.
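For illustration, here is a minimal sketch of such a method (not part of the patch; it assumes the ``root`` and ``imgs`` attributes of the dataset class below, and that PIL is installed):

    import os
    from PIL import Image

    def get_height_and_width(self, idx):
        # cheap: PIL's Image.open is lazy and reads only the PNG header,
        # not the pixel data, so no image is decoded into memory
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        with Image.open(img_path) as img:
            width, height = img.size
        return height, width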
-
-Writing a custom dataset for PennFudan
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Let’s write a dataset for the PennFudan dataset. After `downloading and
-extracting the zip
-file `__, we
-have the following folder structure:
-
-::
-
-  PennFudanPed/
-    PedMasks/
-      FudanPed00001_mask.png
-      FudanPed00002_mask.png
-      FudanPed00003_mask.png
-      FudanPed00004_mask.png
-      ...
-    PNGImages/
-      FudanPed00001.png
-      FudanPed00002.png
-      FudanPed00003.png
-      FudanPed00004.png
-
-Here is one example of a pair of images and segmentation masks
-
-.. image:: ../../_static/img/tv_tutorial/tv_image01.png
-
-.. image:: ../../_static/img/tv_tutorial/tv_image02.png
-
-So each image has a corresponding
-segmentation mask, where each color correspond to a different instance.
-Let’s write a :class:`torch.utils.data.Dataset` class for this dataset.
-In the code below, we are wrapping images, bounding boxes and masks into
-``torchvision.TVTensor`` classes so that we will be able to apply torchvision
-built-in transformations (`new Transforms API `_)
-for the given object detection and segmentation task.
-Namely, image tensors will be wrapped by :class:`torchvision.tv_tensors.Image`, bounding boxes into
-:class:`torchvision.tv_tensors.BoundingBoxes` and masks into :class:`torchvision.tv_tensors.Mask`.
-As ``torchvision.TVTensor`` are :class:`torch.Tensor` subclasses, wrapped objects are also tensors and inherit the plain
-:class:`torch.Tensor` API. For more information about torchvision ``tv_tensors`` see
-`this documentation `_.
-
-.. code:: python
-
-    import os
-    import torch
-
-    from torchvision.io import read_image
-    from torchvision.ops.boxes import masks_to_boxes
-    from torchvision import tv_tensors
-    from torchvision.transforms.v2 import functional as F
-
-
-    class PennFudanDataset(torch.utils.data.Dataset):
-        def __init__(self, root, transforms):
-            self.root = root
-            self.transforms = transforms
-            # load all image files, sorting them to
-            # ensure that they are aligned
-            self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
-            self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))
-
-        def __getitem__(self, idx):
-            # load images and masks
-            img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
-            mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
-            img = read_image(img_path)
-            mask = read_image(mask_path)
-            # instances are encoded as different colors
-            obj_ids = torch.unique(mask)
-            # first id is the background, so remove it
-            obj_ids = obj_ids[1:]
-            num_objs = len(obj_ids)
-
-            # split the color-encoded mask into a set
-            # of binary masks
-            masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8)
-
-            # get bounding box coordinates for each mask
-            boxes = masks_to_boxes(masks)
-
-            # there is only one class
-            labels = torch.ones((num_objs,), dtype=torch.int64)
-
-            image_id = idx
-            area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
-            # suppose all instances are not crowd
-            iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
-
-            # Wrap sample and targets into torchvision tv_tensors:
-            img = tv_tensors.Image(img)
-
-            target = {}
-            target["boxes"] = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img))
-            target["masks"] = tv_tensors.Mask(masks)
-            target["labels"] = labels
-            target["image_id"] = image_id
-            target["area"] = area
-            target["iscrowd"] = iscrowd
-
-            if self.transforms is not None:
-                img, target = self.transforms(img, target)
-
-            return img, target
-
-        def __len__(self):
-            return len(self.imgs)
-
-
-That’s all for the dataset. Now let’s define a model that can perform
-predictions on this dataset.
-
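As a quick sanity check of the dataset class, one sample can be pulled out and inspected (an illustrative snippet, not part of the patch; it assumes the archive was extracted to ``data/PennFudanPed``):

    dataset = PennFudanDataset("data/PennFudanPed", transforms=None)
    img, target = dataset[0]
    print(img.shape)              # [3, H, W] uint8 image
    print(target["boxes"].shape)  # [N, 4] boxes, one per pedestrian
    print(target["masks"].shape)  # [N, H, W] binary instance masks
    print(target["labels"])       # all ones: a single "pedestrian" class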
-Defining your model
--------------------
-
-In this tutorial, we will be using `Mask
-R-CNN `__, which is based on top of
-`Faster R-CNN `__. Faster R-CNN is a
-model that predicts both bounding boxes and class scores for potential
-objects in the image.
-
-.. image:: ../../_static/img/tv_tutorial/tv_image03.png
-
-Mask R-CNN adds an extra branch
-into Faster R-CNN, which also predicts segmentation masks for each
-instance.
-
-.. image:: ../../_static/img/tv_tutorial/tv_image04.png
-
-There are two common
-situations where one might want
-to modify one of the available models in TorchVision Model Zoo. The first
-is when we want to start from a pre-trained model, and just finetune the
-last layer. The other is when we want to replace the backbone of the
-model with a different one (for faster predictions, for example).
-
-Let’s go see how we would do one or another in the following sections.
-
-1 - Finetuning from a pretrained model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Let’s suppose that you want to start from a model pre-trained on COCO
-and want to finetune it for your particular classes. Here is a possible
-way of doing it:
-
-
-.. code:: python
-
-    import torchvision
-    from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
-
-    # load a model pre-trained on COCO
-    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
-
-    # replace the classifier with a new one, that has
-    # num_classes which is user-defined
-    num_classes = 2  # 1 class (person) + background
-    # get number of input features for the classifier
-    in_features = model.roi_heads.box_predictor.cls_score.in_features
-    # replace the pre-trained head with a new one
-    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
-
-
-2 - Modifying the model to add a different backbone
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: python
-
-    import torchvision
-    from torchvision.models.detection import FasterRCNN
-    from torchvision.models.detection.rpn import AnchorGenerator
-
-    # load a pre-trained model for classification and return
-    # only the features
-    backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features
-    # ``FasterRCNN`` needs to know the number of
-    # output channels in a backbone. For mobilenet_v2, it's 1280
-    # so we need to add it here
-    backbone.out_channels = 1280
-
-    # let's make the RPN generate 5 x 3 anchors per spatial
-    # location, with 5 different sizes and 3 different aspect
-    # ratios. We have a Tuple[Tuple[int]] because each feature
-    # map could potentially have different sizes and
-    # aspect ratios
-    anchor_generator = AnchorGenerator(
-        sizes=((32, 64, 128, 256, 512),),
-        aspect_ratios=((0.5, 1.0, 2.0),)
-    )
-
-    # let's define what are the feature maps that we will
-    # use to perform the region of interest cropping, as well as
-    # the size of the crop after rescaling.
-    # if your backbone returns a Tensor, featmap_names is expected to
-    # be [0]. More generally, the backbone should return an
-    # ``OrderedDict[Tensor]``, and in ``featmap_names`` you can choose which
-    # feature maps to use.
-    roi_pooler = torchvision.ops.MultiScaleRoIAlign(
-        featmap_names=['0'],
-        output_size=7,
-        sampling_ratio=2,
-    )
-
-    # put the pieces together inside a Faster-RCNN model
-    model = FasterRCNN(
-        backbone,
-        num_classes=2,
-        rpn_anchor_generator=anchor_generator,
-        box_roi_pool=roi_pooler,
-    )
-
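A quick smoke test can confirm the assembled pieces fit together (illustrative only, not part of the patch; it reuses the ``model`` built just above):

    import torch

    model.eval()
    with torch.no_grad():
        # one dummy 3-channel image is enough to exercise the full pipeline
        predictions = model([torch.rand(3, 300, 400)])
    print(predictions[0]["boxes"].shape)  # [num_detections, 4]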
-
-Object detection and instance segmentation model for PennFudan Dataset
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In our case, we want to finetune from a pre-trained model, given that
-our dataset is very small, so we will be following approach number 1.
-
-Here we want to also compute the instance segmentation masks, so we will
-be using Mask R-CNN:
-
-.. code:: python
-
-    import torchvision
-    from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
-    from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
-
-
-    def get_model_instance_segmentation(num_classes):
-        # load an instance segmentation model pre-trained on COCO
-        model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")
-
-        # get number of input features for the classifier
-        in_features = model.roi_heads.box_predictor.cls_score.in_features
-        # replace the pre-trained head with a new one
-        model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
-
-        # now get the number of input features for the mask classifier
-        in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
-        hidden_layer = 256
-        # and replace the mask predictor with a new one
-        model.roi_heads.mask_predictor = MaskRCNNPredictor(
-            in_features_mask,
-            hidden_layer,
-            num_classes,
-        )
-
-        return model
-
-
-That’s it, this will make ``model`` be ready to be trained and evaluated
-on your custom dataset.
-
-Putting everything together
----------------------------
-
-In ``references/detection/``, we have a number of helper functions to
-simplify training and evaluating detection models. Here, we will use
-``references/detection/engine.py`` and ``references/detection/utils.py``.
-Just download everything under ``references/detection`` to your folder and use them here.
-On Linux if you have ``wget``, you can download them using below commands:
-
-.. code:: python
-
-    os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py")
-    os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py")
-    os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py")
-    os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py")
-    os.system("wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py")
-
-
-Since v0.15.0 torchvision provides `new Transforms API `_
-to easily write data augmentation pipelines for Object Detection and Segmentation tasks.
-
-Let’s write some helper functions for data augmentation /
-transformation:
-
-.. code:: python
-
-    from torchvision.transforms import v2 as T
-
-
-    def get_transform(train):
-        transforms = []
-        if train:
-            transforms.append(T.RandomHorizontalFlip(0.5))
-        transforms.append(T.ToDtype(torch.float, scale=True))
-        transforms.append(T.ToPureTensor())
-        return T.Compose(transforms)
-
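Because these are v2 transforms, the pipeline can be applied to an (image, target) pair, so boxes and masks are flipped together with the image (an illustrative snippet, not part of the patch; it assumes the ``PennFudanDataset`` defined earlier):

    dataset = PennFudanDataset("data/PennFudanPed", transforms=None)
    img, target = dataset[0]
    img, target = get_transform(train=True)(img, target)
    # image is now float32 scaled to [0, 1]; target boxes/masks stay in sync
    print(img.dtype, img.min().item(), img.max().item())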
-
-Testing ``forward()`` method (Optional)
----------------------------------------
-
-Before iterating over the dataset, it's good to see what the model
-expects during training and inference time on sample data.
-
-.. code:: python
-
-    import utils
-
-
-    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
-    dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True))
-    data_loader = torch.utils.data.DataLoader(
-        dataset,
-        batch_size=2,
-        shuffle=True,
-        num_workers=4,
-        collate_fn=utils.collate_fn
-    )
-
-    # For Training
-    images, targets = next(iter(data_loader))
-    images = list(image for image in images)
-    targets = [{k: v for k, v in t.items()} for t in targets]
-    output = model(images, targets)  # Returns losses and detections
-    print(output)
-
-    # For inference
-    model.eval()
-    x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
-    predictions = model(x)  # Returns predictions
-    print(predictions[0])
-
-::
-
-    {'loss_classifier': tensor(0.0820, grad_fn=), 'loss_box_reg': tensor(0.0278, grad_fn=), 'loss_objectness': tensor(0.0027, grad_fn=), 'loss_rpn_box_reg': tensor(0.0036, grad_fn=)}
-    {'boxes': tensor([], size=(0, 4), grad_fn=), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=)}
-
-
-Let’s now write the main function which performs the training and the
-validation:
-
-.. code:: python
-
-    from engine import train_one_epoch, evaluate
-
-    # train on the GPU or on the CPU, if a GPU is not available
-    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-
-    # our dataset has two classes only - background and person
-    num_classes = 2
-    # use our dataset and defined transformations
-    dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True))
-    dataset_test = PennFudanDataset('data/PennFudanPed', get_transform(train=False))
-
-    # split the dataset in train and test set
-    indices = torch.randperm(len(dataset)).tolist()
-    dataset = torch.utils.data.Subset(dataset, indices[:-50])
-    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])
-
-    # define training and validation data loaders
-    data_loader = torch.utils.data.DataLoader(
-        dataset,
-        batch_size=2,
-        shuffle=True,
-        num_workers=4,
-        collate_fn=utils.collate_fn
-    )
-
-    data_loader_test = torch.utils.data.DataLoader(
-        dataset_test,
-        batch_size=1,
-        shuffle=False,
-        num_workers=4,
-        collate_fn=utils.collate_fn
-    )
-
-    # get the model using our helper function
-    model = get_model_instance_segmentation(num_classes)
-
-    # move model to the right device
-    model.to(device)
-
-    # construct an optimizer
-    params = [p for p in model.parameters() if p.requires_grad]
-    optimizer = torch.optim.SGD(
-        params,
-        lr=0.005,
-        momentum=0.9,
-        weight_decay=0.0005
-    )
-
-    # and a learning rate scheduler
-    lr_scheduler = torch.optim.lr_scheduler.StepLR(
-        optimizer,
-        step_size=3,
-        gamma=0.1
-    )
-
-    # let's train it for 5 epochs
-    num_epochs = 5
-
-    for epoch in range(num_epochs):
-        # train for one epoch, printing every 10 iterations
-        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
-        # update the learning rate
-        lr_scheduler.step()
-        # evaluate on the test dataset
-        evaluate(model, data_loader_test, device=device)
-
-    print("That's it!")
-
-::
-
-    Epoch: [0] [ 0/60] eta: 0:02:43 lr: 0.000090 loss: 2.8181 (2.8181) loss_classifier: 0.5218 (0.5218) loss_box_reg: 0.1272 (0.1272) loss_mask: 2.1324 (2.1324) loss_objectness: 0.0346 (0.0346) loss_rpn_box_reg: 0.0022 (0.0022) time: 2.7332 data: 0.4483 max mem: 1984
-    Epoch: [0] [10/60] eta: 0:00:24 lr: 0.000936 loss: 1.3190 (1.6752) loss_classifier: 0.4611 (0.4213) loss_box_reg: 0.2928 (0.3031) loss_mask: 0.6962 (0.9183) loss_objectness: 0.0238 (0.0253) loss_rpn_box_reg: 0.0074 (0.0072) time: 0.4944 data: 0.0439 max mem: 2762
-    Epoch: [0] [20/60] eta: 0:00:13 lr: 0.001783 loss: 0.9419 (1.2621) loss_classifier: 0.2171 (0.3037) loss_box_reg: 0.2906 (0.3064) loss_mask: 0.4174 (0.6243) loss_objectness: 0.0190 (0.0210) loss_rpn_box_reg: 0.0059 (0.0068) time: 0.2108 data: 0.0042 max mem: 2823
-    Epoch: [0] [30/60] eta: 0:00:08 lr: 0.002629 loss: 0.6349 (1.0344) loss_classifier: 0.1184 (0.2339) loss_box_reg: 0.2706 (0.2873) loss_mask: 0.2276 (0.4897) loss_objectness: 0.0065 (0.0168) loss_rpn_box_reg: 0.0059 (0.0067) time: 0.1650 data: 0.0051 max mem: 2823
-    Epoch: [0] [40/60] eta: 0:00:05 lr: 0.003476 loss: 0.4631 (0.8771) loss_classifier: 0.0650 (0.1884) loss_box_reg: 0.1924 (0.2604) loss_mask: 0.1734 (0.4084) loss_objectness: 0.0029 (0.0135) loss_rpn_box_reg: 0.0051 (0.0063) time: 0.1760 data: 0.0052 max mem: 2823
-    Epoch: [0] [50/60] eta: 0:00:02 lr: 0.004323 loss: 0.3261 (0.7754) loss_classifier: 0.0368 (0.1606) loss_box_reg: 0.1424 (0.2366) loss_mask: 0.1479 (0.3599) loss_objectness: 0.0022 (0.0116) loss_rpn_box_reg: 0.0051 (0.0067) time: 0.1775 data: 0.0052 max mem: 2823
-    Epoch: [0] [59/60] eta: 0:00:00 lr: 0.005000 loss: 0.3261 (0.7075) loss_classifier: 0.0415 (0.1433) loss_box_reg: 0.1114 (0.2157) loss_mask: 0.1573 (0.3316) loss_objectness: 0.0020 (0.0103) loss_rpn_box_reg: 0.0052 (0.0066) time: 0.2064 data: 0.0049 max mem: 2823
-    Epoch: [0] Total time: 0:00:14 (0.2412 s / it)
-    creating index...
-    index created!
-    Test: [ 0/50] eta: 0:00:25 model_time: 0.1576 (0.1576) evaluator_time: 0.0029 (0.0029) time: 0.5063 data: 0.3452 max mem: 2823
-    Test: [49/50] eta: 0:00:00 model_time: 0.0335 (0.0701) evaluator_time: 0.0025 (0.0038) time: 0.0594 data: 0.0025 max mem: 2823
-    Test: Total time: 0:00:04 (0.0862 s / it)
-    Averaged stats: model_time: 0.0335 (0.0701) evaluator_time: 0.0025 (0.0038)
-    Accumulating evaluation results...
-    DONE (t=0.01s).
-    Accumulating evaluation results...
-    DONE (t=0.01s).
-    IoU metric: bbox
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.722
-     Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.987
-     Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.938
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.359
-     Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.752
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.730
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.353
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.762
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.762
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.500
-     Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.775
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.769
-    IoU metric: segm
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.726
-     Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.993
-     Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.913
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.344
-     Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.593
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.743
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.360
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.760
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.760
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.633
-     Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.662
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.772
-
-    ...
-
-    Epoch: [4] [ 0/60] eta: 0:00:32 lr: 0.000500 loss: 0.1593 (0.1593) loss_classifier: 0.0194 (0.0194) loss_box_reg: 0.0272 (0.0272) loss_mask: 0.1046 (0.1046) loss_objectness: 0.0044 (0.0044) loss_rpn_box_reg: 0.0037 (0.0037) time: 0.5369 data: 0.3801 max mem: 3064
-    Epoch: [4] [10/60] eta: 0:00:10 lr: 0.000500 loss: 0.1609 (0.1870) loss_classifier: 0.0194 (0.0236) loss_box_reg: 0.0272 (0.0383) loss_mask: 0.1140 (0.1190) loss_objectness: 0.0005 (0.0023) loss_rpn_box_reg: 0.0029 (0.0037) time: 0.2016 data: 0.0378 max mem: 3064
-    Epoch: [4] [20/60] eta: 0:00:08 lr: 0.000500 loss: 0.1652 (0.1826) loss_classifier: 0.0224 (0.0242) loss_box_reg: 0.0286 (0.0374) loss_mask: 0.1075 (0.1165) loss_objectness: 0.0003 (0.0016) loss_rpn_box_reg: 0.0016 (0.0029) time: 0.1866 data: 0.0044 max mem: 3064
-    Epoch: [4] [30/60] eta: 0:00:06 lr: 0.000500 loss: 0.1676 (0.1884) loss_classifier: 0.0245 (0.0264) loss_box_reg: 0.0286 (0.0401) loss_mask: 0.1075 (0.1175) loss_objectness: 0.0003 (0.0013) loss_rpn_box_reg: 0.0018 (0.0030) time: 0.2106 data: 0.0055 max mem: 3064
-    Epoch: [4] [40/60] eta: 0:00:03 lr: 0.000500 loss: 0.1726 (0.1884) loss_classifier: 0.0245 (0.0265) loss_box_reg: 0.0283 (0.0394) loss_mask: 0.1187 (0.1184) loss_objectness: 0.0003 (0.0011) loss_rpn_box_reg: 0.0020 (0.0029) time: 0.1897 data: 0.0056 max mem: 3064
-    Epoch: [4] [50/60] eta: 0:00:01 lr: 0.000500 loss: 0.1910 (0.1938) loss_classifier: 0.0273 (0.0280) loss_box_reg: 0.0414 (0.0418) loss_mask: 0.1177 (0.1198) loss_objectness: 0.0003 (0.0010) loss_rpn_box_reg: 0.0022 (0.0031) time: 0.1623 data: 0.0056 max mem: 3064
-    Epoch: [4] [59/60] eta: 0:00:00 lr: 0.000500 loss: 0.1732 (0.1888) loss_classifier: 0.0273 (0.0278) loss_box_reg: 0.0327 (0.0405) loss_mask: 0.0993 (0.1165) loss_objectness: 0.0003 (0.0010) loss_rpn_box_reg: 0.0023 (0.0030) time: 0.1732 data: 0.0056 max mem: 3064
-    Epoch: [4] Total time: 0:00:11 (0.1920 s / it)
-    creating index...
-    index created!
-    Test: [ 0/50] eta: 0:00:21 model_time: 0.0589 (0.0589) evaluator_time: 0.0032 (0.0032) time: 0.4269 data: 0.3641 max mem: 3064
-    Test: [49/50] eta: 0:00:00 model_time: 0.0515 (0.0521) evaluator_time: 0.0020 (0.0031) time: 0.0579 data: 0.0024 max mem: 3064
-    Test: Total time: 0:00:03 (0.0679 s / it)
-    Averaged stats: model_time: 0.0515 (0.0521) evaluator_time: 0.0020 (0.0031)
-    Accumulating evaluation results...
-    DONE (t=0.01s).
-    Accumulating evaluation results...
-    DONE (t=0.01s).
-    IoU metric: bbox
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.846
-     Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.997
-     Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.978
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.412
-     Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.689
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.864
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.417
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.876
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.876
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.567
-     Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.750
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.896
-    IoU metric: segm
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.777
-     Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.997
-     Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.961
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.424
-     Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.631
-     Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.791
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.373
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.814
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.814
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.633
-     Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.713
-     Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.827
-
-    That's it!
-
-
-So after one epoch of training, we obtain a COCO-style mAP > 50, and
-a mask mAP of 65.
-
-But what do the predictions look like? Let’s take one image in the
-dataset and verify
-
-.. image:: ../../_static/img/tv_tutorial/tv_image05.png
-
-.. code:: python
-
-    import matplotlib.pyplot as plt
-    from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks
-
-    image = read_image("../_static/img/tv_tutorial/tv_image05.png")
-    eval_transform = get_transform(train=False)
-
-    model.eval()
-    with torch.no_grad():
-        x = eval_transform(image)
-        # convert RGBA -> RGB and move to device
-        x = x[:3, ...].to(device)
-        predictions = model([x, ])
-        pred = predictions[0]
-
-    image = (255.0 * (image - image.min()) / (image.max() - image.min())).to(torch.uint8)
-    image = image[:3, ...]
-    pred_labels = [f"pedestrian: {score:.3f}" for label, score in zip(pred["labels"], pred["scores"])]
-    pred_boxes = pred["boxes"].long()
-    output_image = draw_bounding_boxes(image, pred_boxes, pred_labels, colors="red")
-
-    masks = (pred["masks"] > 0.7).squeeze(1)
-    output_image = draw_segmentation_masks(output_image, masks, alpha=0.5, colors="blue")
-
-    plt.figure(figsize=(12, 12))
-    plt.imshow(output_image.permute(1, 2, 0))
-
-
-.. image:: ../../_static/img/tv_tutorial/tv_image06.png
-
-
-The results look good!
-
-Wrapping up
------------
-
-In this tutorial, you have learned how to create your own training
-pipeline for object detection models on a custom dataset. For
-that, you wrote a ``torch.utils.data.Dataset`` class that returns the
-images and the ground truth boxes and segmentation masks. You also
-leveraged a Mask R-CNN model pre-trained on COCO train2017 in order to
-perform transfer learning on this new dataset.
-
-For a more complete example, which includes multi-machine / multi-GPU
-training, check ``references/detection/train.py``, which is present in
-the torchvision repository.
-
-You can download a full source file for this tutorial
-`here `__.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index fd1dfe1b0cb..8a7fa10a4f1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -59,6 +59,5 @@ gymnasium[mujoco]==0.27.0
 timm
 iopath
 pygame==2.1.2
+pycocotools
 semilearn==0.3.2
-
-
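Since ``pycocotools`` is newly added to ``requirements.txt``, a quick round-trip through its RLE mask utilities can verify the dependency resolves (an illustrative snippet, not part of the patch):

    import numpy as np
    from pycocotools import mask as mask_utils

    # encode() expects a Fortran-contiguous uint8 binary mask
    m = np.zeros((240, 320), dtype=np.uint8, order="F")
    m[60:120, 80:160] = 1
    rle = mask_utils.encode(m)
    print(mask_utils.area(rle), mask_utils.toBbox(rle))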