From d803b2aab0452fc72e3919b4f333e7ff74193df6 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Wed, 21 Jan 2026 22:57:44 -0800 Subject: [PATCH 1/8] New demo: cat detector This will detect if a cat is on the screen. By which I mean displayed on the screen, not sitting on your laptop. This is meant as a simple demo of using MSS for AI. It works as-is, but needs to be documented, and there's some bits that could do with cleanup. There are a lot of additional features that could be added, such as showing a window with bounding boxes, but that's probably more complexity than is called for here. --- demos/cat-detector.py | 153 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 demos/cat-detector.py diff --git a/demos/cat-detector.py b/demos/cat-detector.py new file mode 100644 index 0000000..330f6fb --- /dev/null +++ b/demos/cat-detector.py @@ -0,0 +1,153 @@ +#! /usr/bin/env python3 + +import itertools +import time + +# You'll need to "pip install mss numpy pillow". Additionally, you'll +# need to install PyTorch and TorchVision, and the best way to do that +# can vary depending on your system. Often, "pip install torch +# torchvision" will be sufficient, but you can get specific +# instructions at . +import numpy as np +from PIL import Image +import torch +import torchvision.models.detection +import torchvision.transforms.v2 + +import mss + + +def top_unique_labels(labels, scores): + """Return the unique labels, ordered by score descending. + + In other words, if you have a person (0.67), dog (0.98), tv + (0.88), dog (0.71), you'll get back the labels for dog, tv, + person, in that order. + + The labels are a 1d tensor of integers, which are identifiers for + model-specific categories, such as indices into + weights.meta["categories"]. + + The scores are a parallel 1d tensor of the same size of floats: in + other words, score[0] is the score of label[0]. + """ + uniq, inv = torch.unique(labels, return_inverse=True) + max_per = torch.full((uniq.numel(),), -torch.inf, device=scores.device, dtype=scores.dtype) + max_per.scatter_reduce_(0, inv, scores, reduce="amax") + order = torch.argsort(max_per, descending=True) + return uniq[order] + + +# We run the entire program in inference mode. This is telling +# PyTorch to not bother tracking data that's only useful for training +# a neural net. +@torch.inference_mode() +def main(): + # Use CUDA if it's installed and available. This is much faster + # than doing all the work on the CPU. + device = "cuda" if torch.cuda.is_available() else "cpu" + + # The first time you run this demo, Torchvision will download a + # 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints + # on Unix; not sure where it's cached on other platforms. 
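+    # (Optional aside, not needed by the demo: the cache root can be
+    # moved by setting the TORCH_HOME environment variable before
+    # starting Python, and you can print the directory Torch Hub is
+    # currently using with:
+    #
+    #     print(torch.hub.get_dir())
+    #
+    # The checkpoint itself lands in a "checkpoints" subdirectory of
+    # that path.)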
+ weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT + model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=weights).to(device).eval() + preprocess = weights.transforms() + + model_labels = weights.meta["categories"] + cat_label = model_labels.index("cat") + + score_thresh = 0.60 + img_long_side = 960 + min_area_frac = 0.001 # Fraction of image + + with mss.mss() as sct: + monitor = sct.monitors[1] # primary monitor + + img_area = monitor["width"] * monitor["height"] + min_box_area = min_area_frac * img_area + + cat_has_been_visible = False + elapsed_per_frame_running_avg = None + time_last_frame = None + + for frame_number in itertools.count(): + time_this_frame = time.monotonic() + if time_last_frame is not None: + elapsed_this_frame = time_this_frame - time_last_frame + if frame_number < 5: + # We don't try to keep a moving average until the + # pipeline has warmed up. + elapsed_per_frame_running_avg = elapsed_this_frame + else: + elapsed_per_frame_running_avg = elapsed_per_frame_running_avg * 0.9 + elapsed_this_frame * 0.1 + time_last_frame = time_this_frame + + sct_img = sct.grab(monitor) + img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") + # We explicitly convert it to a tensor here, even though + # Torchvision can also convert it in the preprocess step. + # This is so that we send it to the GPU to do the + # preprocessing; PIL images are always on the CPU. + img_tensor = torchvision.transforms.v2.functional.pil_to_tensor(img).to(device) + + x = preprocess(img_tensor) # tensor CxHxW + pred = model([x])[0] + + labels = pred["labels"] + scores = pred["scores"] + boxes = pred["boxes"] + + areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + # Find the score of the highest-scoring cat that's large + # enough, even if it's not high enough to register the + # detector. We always log that. + cat_mask = (labels == cat_label) & (areas >= min_box_area) + if cat_mask.any(): + cat_score = scores[cat_mask].max().item() + else: + cat_score = 0.0 + + cat_in_frame = cat_score >= score_thresh + cat_status_changed = cat_in_frame != cat_has_been_visible + if cat_status_changed: + cat_has_been_visible = cat_in_frame + + if not cat_in_frame: + # Find all objects that score sufficiently well. We + # log them if there's no cat to talk about. + mask = (scores >= score_thresh) & (areas >= min_box_area) + if mask.any(): + show_labels = top_unique_labels(labels[mask], scores[mask]) + else: + show_labels = torch.empty((0,), dtype=labels.dtype) + + if elapsed_per_frame_running_avg is not None: + # Record the score of the most cat-like image for + # logging purposes + cat_scores = scores[labels == cat_label] + if cat_scores.any(): + best = float(cat_scores.max()) + else: + best = 0.0 + + status_line_time = time.strftime("%H:%M:%S", time.localtime()) + if cat_in_frame: + status_line_msg = f"Meow! Hello kitty-cat!" + else: + status_line_msg = "no cats" + if show_labels.shape[0] != 0: + label_words = [model_labels[i] for i in show_labels.cpu()] + label_words = [w for w in label_words if w != "N/A"] + status_line_msg += f":{','.join(label_words)}" + if len(status_line_msg) > 31: + status_line_msg = status_line_msg[:28] + "..." 
+ status_line = (f"{status_line_time} {frame_number:4d} " + f"{elapsed_per_frame_running_avg * 1000:5.0f} ms/frame " + f"| {status_line_msg:31s} (cat score={best:.2f})") + print(f"\r{status_line}", end="\n" if cat_status_changed else "") + + +if __name__ == "__main__": + main() From 84c80763f1a65027a17b7a569580c1ca58c13b7f Mon Sep 17 00:00:00 2001 From: Joel Holveck Date: Thu, 22 Jan 2026 10:37:04 +0000 Subject: [PATCH 2/8] Start on front-of-file comments --- demos/cat-detector.py | 145 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 7 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index 330f6fb..29fc634 100644 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -1,5 +1,128 @@ #! /usr/bin/env python3 +# This demo shows how to use MSS for artificial intelligence. For +# this demo, we'll be using a simple object detection task: see if +# there's a cat on your monitor. I mean, displayed on the monitor, +# not sitting on your laptop. +# +# This demo is not meant to be an introduction to AI or computer +# vision. We assume you have an understanding of the basics of AI, +# and of PyTorch. +# +# Object Detection +# ================ +# +# An object detector is a different beast than an object classifier. +# Object classifiers are a common introduction to computer vision. +# These will look at a picture that has a single foreground object, +# front and center, and try to identify what type of object this is: a +# cat, person, bicycle, etc. +# +# An object detector looks at an image and identifies _multiple +# objects_ within it. Instead of assigning a single label to the +# whole image, saying "this is a picture of a cat", it might say +# "there is a cat here, and a bicycle over there," and provide some +# basic information about each one. This is, for instance, what a +# self-driving car uses to identify what it's seeing on its cameras. +# +# For this demo, we want to tell if a cat is anywhere on the screen, +# not if the whole screen is a picture of a cat. That means that we +# want to use an detector, not a classifier. +# +# The detector will find any number of objects. For each object it +# detects, a typical detector produces three pieces of information: +# +# - A *label*, which identifies _what kind of object_ the detector +# believes it has found. Labels are represented internally as +# integers that map to a fixed list of categories the model was +# trained on (for example, "cat," "bicycle," or "person"). +# +# - A *position*, usually given as a bounding box. A bounding box +# describes _where_ the object appears in the image, using a small +# set of numbers that define a rectangle around it. +# +# - A *score*, which indicates how confident the model is in that +# detection. Higher scores mean the model is more confident; lower +# scores mean it is less confident. The score is a relative +# confidence signal, not a calibrated probability, and it should not +# be interpreted as a percentage or compared across different +# models. +# +# Most modern object detectors follow this same basic pattern, even if +# their internal architectures differ. In the Torchvision model used +# in this demo, these results are returned as parallel one-dimensional +# tensors: one tensor of labels, one tensor of bounding boxes, and one +# tensor of scores. Each index across these tensors refers to the +# same detected object. 
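+#
+# As a concrete (made-up) illustration, a frame containing one cat and
+# one bicycle might come back roughly like this, where index 0 of each
+# tensor describes the first detection and index 1 the second:
+#
+#     labels: tensor([17,  2])                   # e.g. "cat", "bicycle"
+#     scores: tensor([0.93, 0.71])
+#     boxes:  tensor([[ 40.,  60., 300., 280.],  # each box is
+#                     [350., 100., 600., 380.]]) # [x1, y1, x2, y2]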
+# +# The Model We're Using +# ===================== +# +# In this demo, we use a pre-trained object-detection model provided +# by PyTorch's Torchvision library: `fasterrcnn_resnet50_fpn_v2`, with +# weights `FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1`. +# +# This name is long, but each part reflects a piece of a larger system +# built up over many years of research and engineering. +# +# *Faster R-CNN* is the overall object-detection architecture. +# Introduced in 2015, it builds on earlier R-CNN variants and +# established the now-common two-stage approach to detection: first +# proposing regions that might contain objects, then classifying and +# refining those regions. This basic structure is still widely used +# today. +# +# *ResNet-50* refers to the convolutional neural network used as the +# _backbone_. ResNet itself was originally developed for image +# classification, but its feature-extraction layers proved broadly +# useful and are now reused in many vision systems. In this model, +# ResNet-50 converts raw pixels into _features_ - numerical +# representations that capture visual patterns such as edges, +# textures, shapes, and object parts - while the original +# classification layers are replaced by the detection-specific +# components of Faster R-CNN. +# +# *FPN*, or Feature Pyramid Network, is a later addition that +# addresses one of the main challenges in object detection: scale. It +# combines high-level, semantically rich features (good at recognizing +# _what_ is present) with lower-level, higher-resolution features +# (better at preserving _where_ things are). By layering these ideas +# on top of the backbone, the model can detect both large and small +# objects more reliably. +# +# The *v2* suffix indicates a newer Torchvision implementation that +# incorporates refinements from more recent research and practice. In +# particular, it follows a standardized training and configuration +# setup described in the 2021 paper "Benchmarking Detection Transfer +# Learning with Vision Transformers". Despite the paper's title, this +# model does *not* use Transformers; it uses a ResNet-50 backbone, but +# benefits from the same modernized training approach. +# +# Finally, *COCO_V1* indicates that the model was trained on the COCO +# dataset, a widely used community benchmark for object detection. +# COCO contains hundreds of thousands of labeled images covering 80 +# common object categories (such as people, animals, and vehicles), +# along with a small number of additional placeholder categories that +# appear as "N/A" in the model metadata. +# +# Performance +# =========== +# +# The biggest determinant of performance is whether the model runs on +# a GPU or on the CPU. GPUs are extremely well-suited to AI +# workloads, and PyTorch’s strongest and most mature GPU support today +# is through NVIDIA’s CUDA platform. +# +# With a CUDA-capable GPU, this demo’s main loop typically runs in +# around 100 ms per frame (about 10 fps). When run on the CPU, the +# same work takes roughly 5000 ms per frame (about 0.2 fps). +# +# Screen size has little effect on performance. The preprocessing +# stage scales the captured image to a fixed size, so the slow part - +# running the neural network - takes roughly the same amount of time +# regardless of the original screen resolution. + + import itertools import time @@ -18,11 +141,10 @@ def top_unique_labels(labels, scores): - """Return the unique labels, ordered by score descending. + """Return the unique labels, ordered by descending score. 
- In other words, if you have a person (0.67), dog (0.98), tv - (0.88), dog (0.71), you'll get back the labels for dog, tv, - person, in that order. + If you have a person (0.67), dog (0.98), tv (0.88), dog (0.71), + you'll get back the labels for dog, tv, person, in that order. The labels are a 1d tensor of integers, which are identifiers for model-specific categories, such as indices into @@ -43,9 +165,18 @@ def top_unique_labels(labels, scores): # a neural net. @torch.inference_mode() def main(): - # Use CUDA if it's installed and available. This is much faster - # than doing all the work on the CPU. - device = "cuda" if torch.cuda.is_available() else "cpu" + # Prefer CUDA if available. PyTorch’s CUDA backend is the most + # mature and consistently supported option, and can be tens of + # times faster than running the same model on the CPU. + # + # Other GPU backends (such as Apple’s MPS, AMD ROCm, or Intel XPU) + # exist, but support and configuration vary widely across systems. + # Since this demo hasn’t been tested on those platforms, it + # conservatively falls back to the CPU when CUDA is not available. + if torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" # The first time you run this demo, Torchvision will download a # 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints From bc2ce7a76421925a544fa4338f700549d9d9a1b8 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Fri, 23 Jan 2026 02:09:08 -0800 Subject: [PATCH 3/8] Add many more comments --- demos/cat-detector.py | 249 ++++++++++++++++++++++++++++++++---------- 1 file changed, 194 insertions(+), 55 deletions(-) mode change 100644 => 100755 demos/cat-detector.py diff --git a/demos/cat-detector.py b/demos/cat-detector.py old mode 100644 new mode 100755 index 29fc634..162f325 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -110,28 +110,35 @@ # # The biggest determinant of performance is whether the model runs on # a GPU or on the CPU. GPUs are extremely well-suited to AI -# workloads, and PyTorch’s strongest and most mature GPU support today -# is through NVIDIA’s CUDA platform. -# -# With a CUDA-capable GPU, this demo’s main loop typically runs in -# around 100 ms per frame (about 10 fps). When run on the CPU, the -# same work takes roughly 5000 ms per frame (about 0.2 fps). +# workloads, and PyTorch's strongest and most mature GPU support today +# is through NVIDIA's CUDA platform. # # Screen size has little effect on performance. The preprocessing # stage scales the captured image to a fixed size, so the slow part - # running the neural network - takes roughly the same amount of time # regardless of the original screen resolution. +# +# With a CUDA-capable GPU, this demo's main loop typically runs in +# around 100 ms per frame (about 10 fps). When run on the CPU, the +# same work takes roughly 5000 ms per frame (about 0.2 fps). +# +# FIXME Categorize +# ================ +# +# The first time you run this demo, Torchvision will download a +# 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints +# on Unix. I'm not sure where it's cached on other platforms, but +# it will tell you. import itertools import time -# You'll need to "pip install mss numpy pillow". Additionally, you'll +# You'll need to "pip install mss pillow". Additionally, you'll # need to install PyTorch and TorchVision, and the best way to do that # can vary depending on your system. Often, "pip install torch # torchvision" will be sufficient, but you can get specific # instructions at . 
-import numpy as np from PIL import Image import torch import torchvision.models.detection @@ -139,24 +146,58 @@ import mss +# The model will identify objects even if they only vaguely look like +# something. It also tell us a score of how certain it is, on a scale +# from 0 to 1. To prevent false positives, we set a threshold and +# ignore any results below it. The score doesn't have any real +# external meaning: to pick the cutoff, you just try different images +# and get a sense of what seems about right. +SCORE_THRESH = 0.60 + +# If an image is too small, then it's got a pretty decent chance of +# being a false positive: it's hard to tell if a Discord or Slack +# reaction icon is a cat or something different. We ignore any +# results that are too small to be reliable. Here, this cutoff is +# 0.1% of the whole monitor (about 1.5 cm square on a 27" monitor). +MIN_AREA_FRAC = 0.001 # Fraction of image def top_unique_labels(labels, scores): """Return the unique labels, ordered by descending score. If you have a person (0.67), dog (0.98), tv (0.88), dog (0.71), you'll get back the labels for dog, tv, person, in that order. - - The labels are a 1d tensor of integers, which are identifiers for - model-specific categories, such as indices into - weights.meta["categories"]. - - The scores are a parallel 1d tensor of the same size of floats: in - other words, score[0] is the score of label[0]. """ + + # Find the set of unique labels. + # `uniq` contains each distinct label once. + # `inv` maps each original label to its index in `uniq`. + # + # Example: + # labels = [person, dog, tv, dog] + # uniq = [person, dog, tv] + # inv = [0, 1, 2, 1] uniq, inv = torch.unique(labels, return_inverse=True) - max_per = torch.full((uniq.numel(),), -torch.inf, device=scores.device, dtype=scores.dtype) + + # Create a tensor to hold the maximum score seen for each unique + # label. We initialize to -inf so any real score will replace it. + max_per = torch.full( + (uniq.numel(),), + -torch.inf, + device=scores.device, + dtype=scores.dtype, + ) + + # For each element in `scores`, reduce it into `max_per` using + # `inv` as an index map, taking the maximum score per label. + # + # After this, max_per[i] is the highest score associated with + # uniq[i]. max_per.scatter_reduce_(0, inv, scores, reduce="amax") + + # Sort the unique labels by their maximum score, highest first. order = torch.argsort(max_per, descending=True) + + # Return the unique labels in score-ranked order. return uniq[order] @@ -165,104 +206,202 @@ def top_unique_labels(labels, scores): # a neural net. @torch.inference_mode() def main(): - # Prefer CUDA if available. PyTorch’s CUDA backend is the most + # Prefer CUDA if available. PyTorch's CUDA backend is the most # mature and consistently supported option, and can be tens of # times faster than running the same model on the CPU. # - # Other GPU backends (such as Apple’s MPS, AMD ROCm, or Intel XPU) + # Other GPU backends (such as Apple's MPS, AMD ROCm, or Intel XPU) # exist, but support and configuration vary widely across systems. - # Since this demo hasn’t been tested on those platforms, it + # Since this demo hasn't been tested on those platforms, it # conservatively falls back to the CPU when CUDA is not available. if torch.cuda.is_available(): device = "cuda" else: device = "cpu" - # The first time you run this demo, Torchvision will download a - # 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints - # on Unix; not sure where it's cached on other platforms. 
- weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT + # Neural networks, often just called *models*, have two aspects to + # them: the *architecture*, and the *weights*. The architecture + # is the layout of the neural network: what the different units + # are, how they're connected, and so forth. The weights are the + # results of training that neural network; they're numbers saying + # how much the units in the network influence each other. + # + # The same architecture can be trained on different data sets for + # different purposes. Different companies might use the exact + # same object detector architecture for different purposes: a + # company making a photo editing app might train the model to + # recognize faces, smiles, or closed eyes for auto-enhancement, + # while a wildlife research group could train the same + # architecture to identify animals in wilderness camera photos. + # + # The weights are specific to the architecture: you can't plug + # weights from a training run with the ResNet50 architecture into + # a Visual Transformers architecture. + # + # As described in the comments at the top of the file, we're using + # the fasterrcnn_resnet50_fpn_v2 architecture, and the weights + # obtained by training it with the COCO dataset. Plugging those + # weights into the architecture produces our model. + weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1 model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=weights).to(device).eval() + + # When you train a model, you almost always want to pre-process + # your input data. It's important that when you use that model + # later, you do the same kind of pre-processing. Otherwise, it'd + # be like learning a language from slow, carefully-enunciated + # speech, and then getting dropped right into conversations on a + # subway. + # + # For the model we're using, the preprocessing is to scale the + # input image to a consistent size, and to normalize its range + # (kinda similar to the "Auto" filter on your phone). + # Fortunately, for its pretrained models, Torchvision gives us + # an easy way to get the correct preprocessing function. preprocess = weights.transforms() + # The labels ("what type of object is this") that the model gives + # us are just integers; for this model, they're from 0 to 90. The + # English words describing them ("cat") are in a list, stored in + # the weight's metadata. model_labels = weights.meta["categories"] cat_label = model_labels.index("cat") - score_thresh = 0.60 - img_long_side = 960 - min_area_frac = 0.001 # Fraction of image - with mss.mss() as sct: - monitor = sct.monitors[1] # primary monitor + monitor = sct.monitors[1] img_area = monitor["width"] * monitor["height"] - min_box_area = min_area_frac * img_area + # FIXME verify whether the ROI boxes are relative to the + # original or preprocessed image. + min_box_area = MIN_AREA_FRAC * img_area + # We start a new line of the log if the cat visibility status + # changes. That way, your terminal will show essentially a + # log of all the times when a cat appeared or vanished. cat_has_been_visible = False - elapsed_per_frame_running_avg = None + + # Track an exponential moving average of how long each frame + # takes, essentially an FPS counter. + elapsed_per_frame_moving_avg = None + + # When was the last frame? time_last_frame = None for frame_number in itertools.count(): + # Do all the work to keep the frame timer. 
time_this_frame = time.monotonic() - if time_last_frame is not None: + if time_last_frame is not None: # Skip the first loop elapsed_this_frame = time_this_frame - time_last_frame if frame_number < 5: # We don't try to keep a moving average until the - # pipeline has warmed up. - elapsed_per_frame_running_avg = elapsed_this_frame + # pipeline has warmed up for a few frames: the + # times are too variable before PyTorch has gotten + # a sense of the inputs we're sending it, and we + # don't want those initial outlies to affect the + # EMA. Instead, we just show the most recent + # frame's number. + elapsed_per_frame_moving_avg = elapsed_this_frame else: - elapsed_per_frame_running_avg = elapsed_per_frame_running_avg * 0.9 + elapsed_this_frame * 0.1 + # The exponential moving average we track is based + # 90% on the old average, and 10% on the most + # recent frame. If you do the math, each frame's + # timing has half as much influence every 7 frames + # or so. + elapsed_per_frame_moving_avg = elapsed_per_frame_moving_avg * 0.9 + elapsed_this_frame * 0.1 time_last_frame = time_this_frame + # Grab the screenshot. sct_img = sct.grab(monitor) + + # We transfer the image from MSS to PyTorch by going + # through a Pillow Image. There are faster ways to do + # this transfer, but here, the vast bulk of the time is + # occupied by the AI work, so we just use the most + # convenient mechanism. img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") # We explicitly convert it to a tensor here, even though # Torchvision can also convert it in the preprocess step. - # This is so that we send it to the GPU to do the - # preprocessing; PIL images are always on the CPU. + # This is so that we send it to the GPU before we do the + # preprocessing: PIL Images are always on the CPU, and + # doing the preprocessing on the GPU is much faster. + # + # Most image APIs, including MSS, use an array layout of + # [height, width, channels]. In MSS, the ScreenShot.bgra + # data follows this convention, even though it's exposed + # as a flat bytes object. + # + # In contrast, most AI frameworks expect images in + # [channels, height, width] order. The pil_to_tensor + # helper performs this rearrangement for us. img_tensor = torchvision.transforms.v2.functional.pil_to_tensor(img).to(device) - x = preprocess(img_tensor) # tensor CxHxW - pred = model([x])[0] + # Do the preprocessing stages that the trained model + # expects; see the comment where we define preprocess. + # The traditional name for inputs to a neural net is "x", + # because AI programmers aren't terribly imaginative. + x = preprocess(img_tensor) + # In most AI networks, the model expects to take an array + # of inputs, and will return an array of outputs. This is + # because it's _much_ more efficient to operate on batches + # of inputs than on individual inputs, because of how the + # matrix math works. For instance, banks will use batches + # of transactions in AIs to flag transactions for review + # as potentially fraudulent. Because of that design, we + # need to provide the model our input as a batch of one + # image, rather than a single image by itself. That's + # what the unsqueeze does: it adds a new dimension of + # length 1 to the beginning of the input. Also, the + # output will be in a batch, so we just take the first + # element, hence the [0]. + pred = model(x.unsqueeze(0))[0] + # The value of pred is a dict, giving us the labels, + # scores, and bounding boxes. See the comments at the top + # of the file for more information. 
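+            # (As an aside: once the next few lines unpack these
+            # tensors, you could also loop over the detections one at a
+            # time, e.g.
+            #
+            #     for lbl, score, box in zip(labels.tolist(),
+            #                                scores.tolist(),
+            #                                boxes.tolist()):
+            #         print(model_labels[lbl], score, box)
+            #
+            # but the vectorized masks used below are faster, so that's
+            # what this demo does.)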
labels = pred["labels"] scores = pred["scores"] boxes = pred["boxes"] + # We only want to allow detections that are large enough + # to be reliable; see the comments on MIN_AREA_FRAC for + # more information. Here, we compute the areas of all the + # boxes we got, using operations that work on all the + # detected objects in parallel. areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) # Find the score of the highest-scoring cat that's large - # enough, even if it's not high enough to register the - # detector. We always log that. + # enough, even if it's not high enough to register as + # sufficiently certain for our program. We always log + # that, as the "cat score". cat_mask = (labels == cat_label) & (areas >= min_box_area) if cat_mask.any(): cat_score = scores[cat_mask].max().item() else: cat_score = 0.0 - cat_in_frame = cat_score >= score_thresh + # Is there a cat on the screen? + cat_in_frame = cat_score >= SCORE_THRESH + # Did a cat just appear or disappear? We create a new log + # line when this happens, so the user gets a log of cat + # appearances and disappearances. cat_status_changed = cat_in_frame != cat_has_been_visible if cat_status_changed: cat_has_been_visible = cat_in_frame if not cat_in_frame: - # Find all objects that score sufficiently well. We - # log them if there's no cat to talk about. - mask = (scores >= score_thresh) & (areas >= min_box_area) + # Find all objects that score sufficiently well. + # We're going to log them if there's no cat to talk + # about. + mask = (scores >= SCORE_THRESH) & (areas >= min_box_area) if mask.any(): show_labels = top_unique_labels(labels[mask], scores[mask]) else: show_labels = torch.empty((0,), dtype=labels.dtype) - if elapsed_per_frame_running_avg is not None: - # Record the score of the most cat-like image for - # logging purposes - cat_scores = scores[labels == cat_label] - if cat_scores.any(): - best = float(cat_scores.max()) - else: - best = 0.0 - + # Give the user our results. We only do this if the + # per-frame durations have been initialized (we're on at + # least the second frame), just to simplify the layout + # logic. + if elapsed_per_frame_moving_avg is not None: status_line_time = time.strftime("%H:%M:%S", time.localtime()) if cat_in_frame: status_line_msg = f"Meow! Hello kitty-cat!" @@ -275,8 +414,8 @@ def main(): if len(status_line_msg) > 31: status_line_msg = status_line_msg[:28] + "..." status_line = (f"{status_line_time} {frame_number:4d} " - f"{elapsed_per_frame_running_avg * 1000:5.0f} ms/frame " - f"| {status_line_msg:31s} (cat score={best:.2f})") + f"{elapsed_per_frame_moving_avg * 1000:5.0f} ms/frame " + f"| {status_line_msg:31s} (cat score={cat_score:.2f})") print(f"\r{status_line}", end="\n" if cat_status_changed else "") From cda4c2e83f4b2d6de8794eeeac7a53506b329279 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Fri, 23 Jan 2026 16:36:40 -0800 Subject: [PATCH 4/8] Improve comments, and add screenshot_to_tensor --- demos/cat-detector.py | 519 +++++++++++++++++++----------------------- 1 file changed, 240 insertions(+), 279 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index 162f325..e3acdd0 100755 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -1,167 +1,153 @@ #! /usr/bin/env python3 -# This demo shows how to use MSS for artificial intelligence. For -# this demo, we'll be using a simple object detection task: see if -# there's a cat on your monitor. I mean, displayed on the monitor, -# not sitting on your laptop. 
+# This demo shows how to use MSS for artificial intelligence. For this demo, we'll be using a simple object detection +# task: see if there's a cat on your monitor. I mean, displayed on the monitor, not sitting on your laptop. # -# This demo is not meant to be an introduction to AI or computer -# vision. We assume you have an understanding of the basics of AI, -# and of PyTorch. +# This demo is not meant to be an introduction to AI or computer vision. We assume you have an understanding of the +# basics of AI, and of PyTorch. # # Object Detection # ================ # -# An object detector is a different beast than an object classifier. -# Object classifiers are a common introduction to computer vision. -# These will look at a picture that has a single foreground object, -# front and center, and try to identify what type of object this is: a -# cat, person, bicycle, etc. +# An object detector is a different beast than an object classifier. Object classifiers are a common introduction to +# computer vision. These will look at a picture that has a single foreground object, front and center, and try to +# identify what type of object this is: a cat, person, bicycle, etc. # -# An object detector looks at an image and identifies _multiple -# objects_ within it. Instead of assigning a single label to the -# whole image, saying "this is a picture of a cat", it might say -# "there is a cat here, and a bicycle over there," and provide some -# basic information about each one. This is, for instance, what a -# self-driving car uses to identify what it's seeing on its cameras. +# An object detector looks at an image and identifies _multiple objects_ within it. Instead of assigning a single +# label to the whole image, saying "this is a picture of a cat", it might say "there is a cat here, and a bicycle over +# there," and provide some basic information about each one. This is, for instance, what a self-driving car uses to +# identify what it's seeing on its cameras. # -# For this demo, we want to tell if a cat is anywhere on the screen, -# not if the whole screen is a picture of a cat. That means that we -# want to use an detector, not a classifier. +# For this demo, we want to tell if a cat is anywhere on the screen, not if the whole screen is a picture of a cat. +# That means that we want to use an detector, not a classifier. # -# The detector will find any number of objects. For each object it -# detects, a typical detector produces three pieces of information: +# The detector will find any number of objects. For each object it detects, a typical detector produces three pieces +# of information: # -# - A *label*, which identifies _what kind of object_ the detector -# believes it has found. Labels are represented internally as -# integers that map to a fixed list of categories the model was -# trained on (for example, "cat," "bicycle," or "person"). +# - A *label*, which identifies _what kind of object_ the detector believes it has found. Labels are represented +# internally as integers that map to a fixed list of categories the model was trained on (for example, "cat," +# "bicycle," or "person"). # -# - A *position*, usually given as a bounding box. A bounding box -# describes _where_ the object appears in the image, using a small -# set of numbers that define a rectangle around it. +# - A *position*, usually given as a bounding box. A bounding box describes _where_ the object appears in the image, +# using a small set of numbers that define a rectangle around it. 
# -# - A *score*, which indicates how confident the model is in that -# detection. Higher scores mean the model is more confident; lower -# scores mean it is less confident. The score is a relative -# confidence signal, not a calibrated probability, and it should not -# be interpreted as a percentage or compared across different -# models. +# - A *score*, which indicates how confident the model is in that detection. Higher scores mean the model is more +# confident; lower scores mean it is less confident. The score is a relative confidence signal, not a calibrated +# probability, and it should not be interpreted as a percentage or compared across different models. # -# Most modern object detectors follow this same basic pattern, even if -# their internal architectures differ. In the Torchvision model used -# in this demo, these results are returned as parallel one-dimensional -# tensors: one tensor of labels, one tensor of bounding boxes, and one -# tensor of scores. Each index across these tensors refers to the -# same detected object. +# Most modern object detectors follow this same basic pattern, even if their internal architectures differ. In the +# Torchvision model used in this demo, these results are returned as parallel one-dimensional tensors: one tensor of +# labels, one tensor of bounding boxes, and one tensor of scores. Each index across these tensors refers to the same +# detected object. # # The Model We're Using # ===================== # -# In this demo, we use a pre-trained object-detection model provided -# by PyTorch's Torchvision library: `fasterrcnn_resnet50_fpn_v2`, with -# weights `FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1`. +# In this demo, we use a pre-trained object-detection model provided by PyTorch's Torchvision library: +# `fasterrcnn_resnet50_fpn_v2`, with weights `FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1`. # -# This name is long, but each part reflects a piece of a larger system -# built up over many years of research and engineering. +# This name is long, but each part reflects a piece of a larger system built up over many years of research and +# engineering. # -# *Faster R-CNN* is the overall object-detection architecture. -# Introduced in 2015, it builds on earlier R-CNN variants and -# established the now-common two-stage approach to detection: first -# proposing regions that might contain objects, then classifying and -# refining those regions. This basic structure is still widely used -# today. +# *Faster R-CNN* is the overall object-detection architecture. Introduced in 2015, it builds on earlier R-CNN +# variants and established the now-common two-stage approach to detection: first proposing regions that might contain +# objects, then classifying and refining those regions. This basic structure is still widely used today. # -# *ResNet-50* refers to the convolutional neural network used as the -# _backbone_. ResNet itself was originally developed for image -# classification, but its feature-extraction layers proved broadly -# useful and are now reused in many vision systems. In this model, -# ResNet-50 converts raw pixels into _features_ - numerical -# representations that capture visual patterns such as edges, -# textures, shapes, and object parts - while the original -# classification layers are replaced by the detection-specific -# components of Faster R-CNN. +# *ResNet-50* refers to the convolutional neural network used as the _backbone_. 
ResNet itself was originally +# developed for image classification, but its feature-extraction layers proved broadly useful and are now reused in +# many vision systems. In this model, ResNet-50 converts raw pixels into _features_ - numerical representations that +# capture visual patterns such as edges, textures, shapes, and object parts - while the original classification layers +# are replaced by the detection-specific components of Faster R-CNN. # -# *FPN*, or Feature Pyramid Network, is a later addition that -# addresses one of the main challenges in object detection: scale. It -# combines high-level, semantically rich features (good at recognizing -# _what_ is present) with lower-level, higher-resolution features -# (better at preserving _where_ things are). By layering these ideas -# on top of the backbone, the model can detect both large and small -# objects more reliably. +# *FPN*, or Feature Pyramid Network, is a later addition that addresses one of the main challenges in object +# detection: scale. It combines high-level, semantically rich features (good at recognizing _what_ is present) with +# lower-level, higher-resolution features (better at preserving _where_ things are). By layering these ideas on top +# of the backbone, the model can detect both large and small objects more reliably. # -# The *v2* suffix indicates a newer Torchvision implementation that -# incorporates refinements from more recent research and practice. In -# particular, it follows a standardized training and configuration -# setup described in the 2021 paper "Benchmarking Detection Transfer -# Learning with Vision Transformers". Despite the paper's title, this -# model does *not* use Transformers; it uses a ResNet-50 backbone, but -# benefits from the same modernized training approach. +# The *v2* suffix indicates a newer Torchvision implementation that incorporates refinements from more recent research +# and practice. In particular, it follows a standardized training and configuration setup described in the 2021 paper +# "Benchmarking Detection Transfer Learning with Vision Transformers". Despite the paper's title, this model does +# *not* use Transformers; it uses a ResNet-50 backbone, but benefits from the same modernized training approach. # -# Finally, *COCO_V1* indicates that the model was trained on the COCO -# dataset, a widely used community benchmark for object detection. -# COCO contains hundreds of thousands of labeled images covering 80 -# common object categories (such as people, animals, and vehicles), -# along with a small number of additional placeholder categories that -# appear as "N/A" in the model metadata. +# Finally, *COCO_V1* indicates that the model was trained on the COCO dataset, a widely used community benchmark for +# object detection. COCO contains hundreds of thousands of labeled images covering 80 common object categories (such +# as people, animals, and vehicles), along with a small number of additional placeholder categories that appear as +# "N/A" in the model metadata. # # Performance # =========== # -# The biggest determinant of performance is whether the model runs on -# a GPU or on the CPU. GPUs are extremely well-suited to AI -# workloads, and PyTorch's strongest and most mature GPU support today -# is through NVIDIA's CUDA platform. +# The biggest determinant of performance is whether the model runs on a GPU or on the CPU. 
GPUs are extremely +# well-suited to AI workloads, and PyTorch's strongest and most mature GPU support today is through NVIDIA's CUDA +# platform. # -# Screen size has little effect on performance. The preprocessing -# stage scales the captured image to a fixed size, so the slow part - -# running the neural network - takes roughly the same amount of time -# regardless of the original screen resolution. +# Screen size has little effect on performance. The model starts by scaling the captured image to a consistent size +# (fitting it within 1333x800 px), so the slow part - running the neural network - takes roughly the same amount of +# time regardless of the original screen resolution. # -# With a CUDA-capable GPU, this demo's main loop typically runs in -# around 100 ms per frame (about 10 fps). When run on the CPU, the -# same work takes roughly 5000 ms per frame (about 0.2 fps). +# With a CUDA-capable GPU, this demo's main loop typically runs in around 100 ms per frame (about 10 fps). When run +# on the CPU, the same work takes roughly 5000 ms per frame (about 0.2 fps). # -# FIXME Categorize -# ================ +# Cached Data +# =========== # -# The first time you run this demo, Torchvision will download a -# 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints -# on Unix. I'm not sure where it's cached on other platforms, but -# it will tell you. +# The first time you run this demo, Torchvision will download a 167 MByte DNN. This is cached in +# ~/.cache/torch/hub/checkpoints on Unix. I'm not sure where it's cached on other platforms, but it will tell you. +from __future__ import annotations import itertools import time -# You'll need to "pip install mss pillow". Additionally, you'll -# need to install PyTorch and TorchVision, and the best way to do that -# can vary depending on your system. Often, "pip install torch -# torchvision" will be sufficient, but you can get specific -# instructions at . -from PIL import Image +# You'll need to install PyTorch and TorchVision, and the best way to do that can vary depending on your system. +# Often, "pip install torch torchvision" will be sufficient, but you can get specific instructions at +# . import torch import torchvision.models.detection import torchvision.transforms.v2 +# You'll also need to "pip install mss pillow". +from PIL import Image + import mss -# The model will identify objects even if they only vaguely look like -# something. It also tell us a score of how certain it is, on a scale -# from 0 to 1. To prevent false positives, we set a threshold and -# ignore any results below it. The score doesn't have any real -# external meaning: to pick the cutoff, you just try different images -# and get a sense of what seems about right. +# The model will identify objects even if they only vaguely look like something. It also tell us a score of how +# certain it is, on a scale from 0 (not a cat) to 1 (very confidently a cat). To prevent false positives, we set a +# threshold and ignore any results below it. The score doesn't have any real external meaning: to pick the cutoff, +# you just try different images, look at the scores, and get a sense of what seems about right. SCORE_THRESH = 0.60 -# If an image is too small, then it's got a pretty decent chance of -# being a false positive: it's hard to tell if a Discord or Slack -# reaction icon is a cat or something different. We ignore any -# results that are too small to be reliable. Here, this cutoff is -# 0.1% of the whole monitor (about 1.5 cm square on a 27" monitor). 
-MIN_AREA_FRAC = 0.001 # Fraction of image - -def top_unique_labels(labels, scores): +# If an image is too small, then it's got a pretty decent chance of being a false positive: it's hard to tell if a +# Discord or Slack reaction icon is a cat or something different. We ignore any results that are too small to be +# reliable. Here, this cutoff is 0.1% of the whole monitor (about 1.5 cm square on a 27" monitor, the diameter of a +# AA battery). Like the score threshold, this is just something you try and see what the model seems to be able to +# recognize reliably. +MIN_AREA_FRAC = 0.001 + + +# This function is here for illustrative purposes: the demo doesn't currently call it, but there's a commented-out +# line in the main loop that shows how you might use it. +def screenshot_to_tensor(sct_img: mss.ScreenShot, device: str | torch.device) -> torch.Tensor: + """Convert an MSS ScreenShot to a CHW PyTorch tensor.""" + + # Get a 1d tensor of BGRA values. PyTorch will issue a warning at this step: the ScreenShot's bgra object is + # read-only, but PyTorch doesn't support read-only tensors. However, this is harmless in our case: we'll end up + # copying the data anyway when we run contiguous(). + img = torch.frombuffer(sct_img.bgra, dtype=torch.uint8) + # Do the rest of this on the GPU, if desired. + img = img.to(device) + # Convert to an HWC view: (H, W, 4) + img = img.view(sct_img.height, sct_img.width, 4) + # Drop alpha and reorder BGR -> RGB + rgb_hwc = img[..., [2, 1, 0]] + # HWC -> CHW + rgb_chw = rgb_hwc.permute(2, 0, 1) + # Copy this into contiguous memory, for improved performance. (Some models might be faster with + # .to(memory_format=torch.channels_last) instead.) + return rgb_chw.contiguous() + + +def top_unique_labels(labels: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: """Return the unique labels, ordered by descending score. If you have a person (0.67), dog (0.98), tv (0.88), dog (0.71), @@ -178,8 +164,8 @@ def top_unique_labels(labels, scores): # inv = [0, 1, 2, 1] uniq, inv = torch.unique(labels, return_inverse=True) - # Create a tensor to hold the maximum score seen for each unique - # label. We initialize to -inf so any real score will replace it. + # Create a tensor to hold the maximum score seen for each unique label. We initialize to -inf so any real score + # will replace it. max_per = torch.full( (uniq.numel(),), -torch.inf, @@ -187,11 +173,10 @@ def top_unique_labels(labels, scores): dtype=scores.dtype, ) - # For each element in `scores`, reduce it into `max_per` using - # `inv` as an index map, taking the maximum score per label. + # For each element in `scores`, reduce it into `max_per` using `inv` as an index map, taking the maximum score per + # label. # - # After this, max_per[i] is the highest score associated with - # uniq[i]. + # After this, max_per[i] is the highest score associated with uniq[i]. max_per.scatter_reduce_(0, inv, scores, reduce="amax") # Sort the unique labels by their maximum score, highest first. @@ -201,195 +186,158 @@ def top_unique_labels(labels, scores): return uniq[order] -# We run the entire program in inference mode. This is telling -# PyTorch to not bother tracking data that's only useful for training -# a neural net. +# We run the entire program in inference mode. This is telling PyTorch to not bother tracking data that's only useful +# for training a neural net. @torch.inference_mode() -def main(): - # Prefer CUDA if available. 
PyTorch's CUDA backend is the most - # mature and consistently supported option, and can be tens of - # times faster than running the same model on the CPU. +def main() -> None: + # Prefer CUDA if available. PyTorch's CUDA backend is the most mature and consistently supported option, and can + # be tens of times faster than running the same model on the CPU. # - # Other GPU backends (such as Apple's MPS, AMD ROCm, or Intel XPU) - # exist, but support and configuration vary widely across systems. - # Since this demo hasn't been tested on those platforms, it - # conservatively falls back to the CPU when CUDA is not available. - if torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # Neural networks, often just called *models*, have two aspects to - # them: the *architecture*, and the *weights*. The architecture - # is the layout of the neural network: what the different units - # are, how they're connected, and so forth. The weights are the - # results of training that neural network; they're numbers saying - # how much the units in the network influence each other. + # Other GPU backends (such as Apple's MPS, AMD ROCm, or Intel XPU) exist, but support and configuration vary + # widely across systems. Since this demo hasn't been tested on those platforms, it conservatively falls back to + # the CPU when CUDA is not available. + device = "cuda" if torch.cuda.is_available() else "cpu" + + # Neural networks, often just called *models*, have two aspects to them: the *architecture*, and the *weights*. + # The architecture is the layout of the neural network: what the different units are, how they're connected, and + # so forth. The weights are the results of training that neural network; they're numbers saying how much the + # units in the network influence each other. # - # The same architecture can be trained on different data sets for - # different purposes. Different companies might use the exact - # same object detector architecture for different purposes: a - # company making a photo editing app might train the model to - # recognize faces, smiles, or closed eyes for auto-enhancement, - # while a wildlife research group could train the same - # architecture to identify animals in wilderness camera photos. + # The same architecture can be trained on different data sets for different purposes. Different companies might + # use the exact same object detector architecture for different purposes: a company making a photo editing app + # might train the model to recognize faces, smiles, or closed eyes for auto-enhancement, while a wildlife research + # group could train the same architecture to identify animals in wilderness camera photos. # - # The weights are specific to the architecture: you can't plug - # weights from a training run with the ResNet50 architecture into - # a Visual Transformers architecture. + # The weights are specific to the architecture: you can't plug weights from a training run with the ResNet50 + # architecture into a Visual Transformers architecture. # - # As described in the comments at the top of the file, we're using - # the fasterrcnn_resnet50_fpn_v2 architecture, and the weights - # obtained by training it with the COCO dataset. Plugging those - # weights into the architecture produces our model. + # As described in the comments at the top of the file, we're using the fasterrcnn_resnet50_fpn_v2 architecture, + # and the weights obtained by training it with the COCO dataset. Plugging those weights into the architecture + # produces our model. 
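+    # (As an aside, the same constructor also accepts weights=None, which gives you the untrained architecture - for
+    # example, fasterrcnn_resnet50_fpn_v2(weights=None, num_classes=3) would be a starting point for training a
+    # three-class detector of your own.  This demo always uses the published COCO weights.)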
weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1 - model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=weights).to(device).eval() - - # When you train a model, you almost always want to pre-process - # your input data. It's important that when you use that model - # later, you do the same kind of pre-processing. Otherwise, it'd - # be like learning a language from slow, carefully-enunciated - # speech, and then getting dropped right into conversations on a - # subway. + model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=weights) + # Move the model to the GPU, if we've selected that, and put it in evaluation mode (as opposed to training mode). + # Training mode often uses features meant to make the training more robust, such as randomly ignoring some + # connections to make sure the model learns some redundancy. Evaluation mode puts it in a mode to perform the + # best it can. + model = model.to(device).eval() + + # When you train a model, you almost always want to pre-process your input data. It's important that when you use + # that model later, you do the same kind of pre-processing. Otherwise, it'd be like learning a language from + # slow, carefully-enunciated speech, and then getting dropped right into conversations on a subway. + # + # For the model we're using, the preprocessing is simply to standardize the representation: it will convert PIL + # images to a tensor representation, and convert all images to floating-point 0.0-1.0 instead of integer 0-255. + # Some other models do more preprocessing. # - # For the model we're using, the preprocessing is to scale the - # input image to a consistent size, and to normalize its range - # (kinda similar to the "Auto" filter on your phone). - # Fortunately, for its pretrained models, Torchvision gives us - # an easy way to get the correct preprocessing function. + # Fortunately, for its pretrained models, Torchvision gives us an easy way to get the correct preprocessing + # function. preprocess = weights.transforms() - # The labels ("what type of object is this") that the model gives - # us are just integers; for this model, they're from 0 to 90. The - # English words describing them ("cat") are in a list, stored in - # the weight's metadata. + # The labels ("what type of object is this") that the model gives us are just integers; for this model, they're + # from 0 to 90. The English words describing them ("cat") are in a list, stored in the weight's metadata. model_labels = weights.meta["categories"] cat_label = model_labels.index("cat") with mss.mss() as sct: monitor = sct.monitors[1] + # Compute the minimum size, in square pixels, that we'll consider reliable. img_area = monitor["width"] * monitor["height"] - # FIXME verify whether the ROI boxes are relative to the - # original or preprocessed image. min_box_area = MIN_AREA_FRAC * img_area - # We start a new line of the log if the cat visibility status - # changes. That way, your terminal will show essentially a - # log of all the times when a cat appeared or vanished. + # We start a new line of the log if the cat visibility status changes. That way, your terminal will show + # essentially a log of all the times when a cat appeared or vanished. cat_has_been_visible = False - # Track an exponential moving average of how long each frame - # takes, essentially an FPS counter. - elapsed_per_frame_moving_avg = None + # Track an exponential moving average of how long each frame takes, essentially an FPS counter. 
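+        # (With the 0.9/0.1 weighting used in the loop below, a frame's influence on this average halves roughly
+        # every ln(0.5) / ln(0.9), about 6.6 frames, which is where the "every ~7 frames" figure in that comment
+        # comes from.)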
+ frame_duration_avg = None # When was the last frame? - time_last_frame = None + prev_frame_start = None + # We run forever, or until the user interrupts us. + print("Looking for kitty cats! Press Ctrl-C to stop.") for frame_number in itertools.count(): # Do all the work to keep the frame timer. - time_this_frame = time.monotonic() - if time_last_frame is not None: # Skip the first loop - elapsed_this_frame = time_this_frame - time_last_frame + frame_start = time.monotonic() + if prev_frame_start is not None: # Skip the first loop + frame_duration = frame_start - prev_frame_start + # Track frame timing with exponential moving average. Skip the first few frames while PyTorch + # optimizes its computations. if frame_number < 5: - # We don't try to keep a moving average until the - # pipeline has warmed up for a few frames: the - # times are too variable before PyTorch has gotten - # a sense of the inputs we're sending it, and we - # don't want those initial outlies to affect the - # EMA. Instead, we just show the most recent - # frame's number. - elapsed_per_frame_moving_avg = elapsed_this_frame + frame_duration_avg = frame_duration else: - # The exponential moving average we track is based - # 90% on the old average, and 10% on the most - # recent frame. If you do the math, each frame's - # timing has half as much influence every 7 frames - # or so. - elapsed_per_frame_moving_avg = elapsed_per_frame_moving_avg * 0.9 + elapsed_this_frame * 0.1 - time_last_frame = time_this_frame + # Exponential moving average: weight recent frame 10%, historical average 90%. This means each + # frame's influence halves every ~7 frames. + assert frame_duration_avg is not None + frame_duration_avg = frame_duration_avg * 0.9 + frame_duration * 0.1 + prev_frame_start = frame_start # Grab the screenshot. sct_img = sct.grab(monitor) - # We transfer the image from MSS to PyTorch by going - # through a Pillow Image. There are faster ways to do - # this transfer, but here, the vast bulk of the time is - # occupied by the AI work, so we just use the most - # convenient mechanism. + # We transfer the image from MSS to PyTorch via a Pillow Image. Faster approaches exist (see below) but + # PIL is more readable. The bulk of the time in this program is spent doing the AI work, so we just use + # the most convenient mechanism. img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") - # We explicitly convert it to a tensor here, even though - # Torchvision can also convert it in the preprocess step. - # This is so that we send it to the GPU before we do the - # preprocessing: PIL Images are always on the CPU, and - # doing the preprocessing on the GPU is much faster. + + # We explicitly convert it to a tensor here, even though Torchvision can also convert it in the preprocess + # step. This is so that we send it to the GPU before we do the preprocessing: PIL Images are always on + # the CPU, and doing the preprocessing on the GPU is much faster. # - # Most image APIs, including MSS, use an array layout of - # [height, width, channels]. In MSS, the ScreenShot.bgra - # data follows this convention, even though it's exposed - # as a flat bytes object. + # Most image APIs, including MSS, use an array layout of [height, width, channels]. In MSS, the + # ScreenShot.bgra data follows this convention, even though it's exposed as a flat bytes object. # - # In contrast, most AI frameworks expect images in - # [channels, height, width] order. 
The pil_to_tensor + # helper performs this rearrangement for us. img_tensor = torchvision.transforms.v2.functional.pil_to_tensor(img).to(device) - # Do the preprocessing stages that the trained model - # expects; see the comment where we define preprocess. - # The traditional name for inputs to a neural net is "x", - # because AI programmers aren't terribly imaginative. + # An alternative to using PIL is shown in screenshot_to_tensor. In one test, this saves about 20 ms per + # frame if using a GPU, but is actually slower if using the CPU. This would replace the "img=" and + # "img_tensor=" lines above. + # + #img_tensor = screenshot_to_tensor(sct_img, device) + + # Do the preprocessing stages that the trained model expects; see the comment where we define preprocess. + # The traditional name for inputs to a neural net is "x", because AI programmers aren't terribly + # imaginative. x = preprocess(img_tensor) - # In most AI networks, the model expects to take an array - # of inputs, and will return an array of outputs. This is - # because it's _much_ more efficient to operate on batches - # of inputs than on individual inputs, because of how the - # matrix math works. For instance, banks will use batches - # of transactions in AIs to flag transactions for review - # as potentially fraudulent. Because of that design, we - # need to provide the model our input as a batch of one - # image, rather than a single image by itself. That's - # what the unsqueeze does: it adds a new dimension of - # length 1 to the beginning of the input. Also, the - # output will be in a batch, so we just take the first - # element, hence the [0]. + # In most AI networks, the model expects to take a batch of inputs, and will return a batch of outputs. + # This is because it's _much_ more efficient to operate on batches of inputs than on individual inputs + # when you're doing matrix math. For instance, banks will use batches of transactions in AIs to flag + # transactions for review as potentially fraudulent. Because of that design, we need to provide the model + # our input as a batch of one image, rather than a single image by itself. That's what the unsqueeze + # does: it adds a new dimension of length 1 to the beginning of the input. Also, the output will be in a + # batch, so we just take the first element, hence the [0]. pred = model(x.unsqueeze(0))[0] - # The value of pred is a dict, giving us the labels, - # scores, and bounding boxes. See the comments at the top - # of the file for more information. + # The value of pred is a dict, giving us the labels, scores, and bounding boxes. See the comments at the + # top of the file for more information. labels = pred["labels"] scores = pred["scores"] boxes = pred["boxes"] - # We only want to allow detections that are large enough - # to be reliable; see the comments on MIN_AREA_FRAC for - # more information. Here, we compute the areas of all the - # boxes we got, using operations that work on all the - # detected objects in parallel. + # We only want to allow detections that are large enough to be reliable; see the comments on MIN_AREA_FRAC + # for more information. Here, we compute the areas of all the boxes we got, using operations that work on + # all the detected objects in parallel.
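+ # (A quick note on the box format: torchvision's detection models give each box as [x1, y1, x2, y2] in pixel
+ # coordinates, so the line below is just width * height, computed for every detection at once. The library
+ # helper torchvision.ops.box_area(boxes) should produce the same result, if you prefer a named function; the
+ # explicit arithmetic keeps the demo easy to follow.)
+ #areas = torchvision.ops.box_area(boxes)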
areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - # Find the score of the highest-scoring cat that's large - # enough, even if it's not high enough to register as - # sufficiently certain for our program. We always log - # that, as the "cat score". + # Find the score of the highest-scoring cat that's large enough, even if it's not high enough to register + # as sufficiently certain for our program. We always log that, as the "cat score". cat_mask = (labels == cat_label) & (areas >= min_box_area) - if cat_mask.any(): - cat_score = scores[cat_mask].max().item() - else: - cat_score = 0.0 + cat_score = scores[cat_mask].max().item() if cat_mask.any() else 0.0 # Is there a cat on the screen? cat_in_frame = cat_score >= SCORE_THRESH - # Did a cat just appear or disappear? We create a new log - # line when this happens, so the user gets a log of cat - # appearances and disappearances. + # Did a cat just appear or disappear? We create a new log line when this happens, so the user gets a log + # of cat appearances and disappearances. cat_status_changed = cat_in_frame != cat_has_been_visible if cat_status_changed: cat_has_been_visible = cat_in_frame if not cat_in_frame: - # Find all objects that score sufficiently well. - # We're going to log them if there's no cat to talk + # Find all objects that score sufficiently well. We're going to log them if there's no cat to talk # about. mask = (scores >= SCORE_THRESH) & (areas >= min_box_area) if mask.any(): @@ -397,26 +345,39 @@ def main(): else: show_labels = torch.empty((0,), dtype=labels.dtype) - # Give the user our results. We only do this if the - # per-frame durations have been initialized (we're on at - # least the second frame), just to simplify the layout - # logic. - if elapsed_per_frame_moving_avg is not None: - status_line_time = time.strftime("%H:%M:%S", time.localtime()) - if cat_in_frame: - status_line_msg = f"Meow! Hello kitty-cat!" - else: - status_line_msg = "no cats" - if show_labels.shape[0] != 0: - label_words = [model_labels[i] for i in show_labels.cpu()] - label_words = [w for w in label_words if w != "N/A"] - status_line_msg += f":{','.join(label_words)}" - if len(status_line_msg) > 31: - status_line_msg = status_line_msg[:28] + "..." - status_line = (f"{status_line_time} {frame_number:4d} " - f"{elapsed_per_frame_moving_avg * 1000:5.0f} ms/frame " - f"| {status_line_msg:31s} (cat score={cat_score:.2f})") - print(f"\r{status_line}", end="\n" if cat_status_changed else "") + # Give the user our results. + status_line_time = time.strftime("%H:%M:%S", time.localtime()) + if cat_in_frame: + status_line_msg = "Meow! Hello kitty-cat!" + else: + status_line_msg = "no cats" + # If there isn't a cat, but there are other objects, list them. + if show_labels.shape[0] != 0: + label_words = [model_labels[i] for i in show_labels.cpu()] + # Filter out anything marked as "N/A": these are non-objects (like "sky"), and the training for + # this model doesn't really cover them. + label_words = [w for w in label_words if w != "N/A"] + # Build these into a comma-separated list. Make sure the whole string is at most 31 characters, + # the width we provide for it in the message. + status_line_msg += f":{','.join(label_words)}" + if len(status_line_msg) > 31: + status_line_msg = status_line_msg[:28] + "..." + # The frame_duration_avg will be None in the first iteration, since there isn't yet a full iteration to + # measure. 
+ duration_avg_str = ( + f"{frame_duration_avg * 1000:5.0f}" if frame_duration_avg is not None else "-----" + ) + + # Build the whole status line. It's a constant width, so that when we overwrite it each frame, the new + # status line will completely overwrite the previous one. + status_line = ( + f"{status_line_time} {frame_number:4d} " + f"{duration_avg_str} ms/frame " + f"| {status_line_msg:31s} (cat score={cat_score:.2f})" + ) + # If a cat just appeared or disappeared, start a new line after this status line. This lets the user see + # a history of all the cat status changes. + print(f"\r{status_line}", end="\n" if cat_status_changed else "") if __name__ == "__main__": From 45b57948f0207c2bc26cf7ef44c95acc018af893 Mon Sep 17 00:00:00 2001 From: Joel Holveck Date: Tue, 27 Jan 2026 16:41:30 -0800 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Halldor Fannar --- demos/cat-detector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index e3acdd0..2cb6e10 100755 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -19,7 +19,7 @@ # identify what it's seeing on its cameras. # # For this demo, we want to tell if a cat is anywhere on the screen, not if the whole screen is a picture of a cat. -# That means that we want to use an detector, not a classifier. +# That means that we want to use a detector, not a classifier. # # The detector will find any number of objects. For each object it detects, a typical detector produces three pieces # of information: @@ -92,7 +92,7 @@ # =========== # # The first time you run this demo, Torchvision will download a 167 MByte DNN. This is cached in -# ~/.cache/torch/hub/checkpoints on Unix. I'm not sure where it's cached on other platforms, but it will tell you. +# ~/.cache/torch/hub/checkpoints on Unix. If you want to know where the cache is stored on other platforms, this information will be displayed after downloading the DNN. from __future__ import annotations @@ -120,7 +120,7 @@ # If an image is too small, then it's got a pretty decent chance of being a false positive: it's hard to tell if a # Discord or Slack reaction icon is a cat or something different. We ignore any results that are too small to be # reliable. Here, this cutoff is 0.1% of the whole monitor (about 1.5 cm square on a 27" monitor, the diameter of a -# AA battery). Like the score threshold, this is just something you try and see what the model seems to be able to +# AA battery). Like the score threshold, this is just something you try and see what the model is able to # recognize reliably. MIN_AREA_FRAC = 0.001 @@ -235,7 +235,7 @@ def main() -> None: preprocess = weights.transforms() # The labels ("what type of object is this") that the model gives us are just integers; for this model, they're - # from 0 to 90. The English words describing them ("cat") are in a list, stored in the weight's metadata. + # from 0 to 90. The English words describing them (like "cat") are in a list, stored in the weight's metadata. 
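+ # (For example, with the COCO category list these weights ship with, model_labels.index("cat") should come out
+ # to 17 -- but the code below always looks the label up by name rather than relying on that number.)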
model_labels = weights.meta["categories"] cat_label = model_labels.index("cat") From 98a6ada40b424246ae6f6b12e98ca78bd2c82219 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 27 Jan 2026 17:01:37 -0800 Subject: [PATCH 6/8] Add changes per review comments --- demos/cat-detector.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index 2cb6e10..3c63bbd 100755 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -77,9 +77,11 @@ # Performance # =========== # -# The biggest determinant of performance is whether the model runs on a GPU or on the CPU. GPUs are extremely -# well-suited to AI workloads, and PyTorch's strongest and most mature GPU support today is through NVIDIA's CUDA -# platform. +# This demo can run the model on either the CPU or a GPU. The single biggest factor affecting performance is which +# one you use. Modern neural networks are designed around large amounts of parallel computation, which GPUs handle +# much more efficiently than CPUs. In practice, that means the same model runs dramatically faster on a GPU than on +# the CPU, even though the underlying math is identical. PyTorch's strongest and most mature GPU support today is +# through Nvidia's CUDA platform, so that is the only GPU supported by this demo. # # Screen size has little effect on performance. The model starts by scaling the captured image to a consistent size # (fitting it within 1333x800 px), so the slow part - running the neural network - takes roughly the same amount of @@ -92,7 +94,8 @@ # =========== # # The first time you run this demo, Torchvision will download a 167 MByte DNN. This is cached in -# ~/.cache/torch/hub/checkpoints on Unix. If you want to know where the cache is stored on other platforms, this information will be displayed after downloading the DNN. +# ~/.cache/torch/hub/checkpoints on Unix. If you want to know where the cache is stored on other platforms, it will +# be displayed while downloading the DNN. from __future__ import annotations @@ -106,7 +109,7 @@ import torchvision.models.detection import torchvision.transforms.v2 -# You'll also need to "pip install mss pillow". +# You'll also need to install MSS and Pillow, such as with "pip install mss pillow". from PIL import Image import mss From 24eef572820e5b0ea63a981457ad30508128a523 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 27 Jan 2026 18:38:02 -0800 Subject: [PATCH 7/8] Improve screenshot_to_tensor My last version accidentally had an intermediate copy. This version prevents that. --- demos/cat-detector.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index 3c63bbd..c8ff117 100755 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -135,19 +135,17 @@ def screenshot_to_tensor(sct_img: mss.ScreenShot, device: str | torch.device) -> # Get a 1d tensor of BGRA values. PyTorch will issue a warning at this step: the ScreenShot's bgra object is # read-only, but PyTorch doesn't support read-only tensors. However, this is harmless in our case: we'll end up - # copying the data anyway when we run contiguous(). + # copying the data anyway. img = torch.frombuffer(sct_img.bgra, dtype=torch.uint8) - # Do the rest of this on the GPU, if desired. + # Bring everything to the desired device. This is still just a linear buffer of BGRA bytes. 
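+ # (When device is "cpu", this .to() call simply returns the same tensor; with a CUDA device it performs the
+ # host-to-GPU copy, after which the reshaping below runs on the GPU.)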
img = img.to(device) - # Convert to an HWC view: (H, W, 4) - img = img.view(sct_img.height, sct_img.width, 4) - # Drop alpha and reorder BGR -> RGB - rgb_hwc = img[..., [2, 1, 0]] - # HWC -> CHW - rgb_chw = rgb_hwc.permute(2, 0, 1) - # Copy this into contiguous memory, for improved performance. (Some models might be faster with - # .to(memory_format=torch.channels_last) instead.) - return rgb_chw.contiguous() + # The next two steps both just create views of the original tensor, without copying the data. + img = img.view(sct_img.height, sct_img.width, 4) # Interpret as BGRA HWC + img = img.permute(2, 0, 1) # Permute the axes: BGRA CHW + # This final step will create a copy. Copying the data is required to reorder the channels. This also has the + # advantage of making the tensor contiguous, for more efficient access. + img = img[[2, 1, 0], ...] # Reorder the channels: RGB CHW + return img def top_unique_labels(labels: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: @@ -280,9 +278,9 @@ def main() -> None: # Grab the screenshot. sct_img = sct.grab(monitor) - # We transfer the image from MSS to PyTorch via a Pillow Image. Faster approaches exist (see below) but - # PIL is more readable. The bulk of the time in this program is spent doing the AI work, so we just use - # the most convenient mechanism. + # We transfer the image from MSS to PyTorch via a Pillow Image. Faster approaches exist (see + # screenshot_to_tensor), but PIL is more readable. The bulk of the time in this program is spent doing + # the AI work, so we just use the most convenient mechanism. img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") # We explicitly convert it to a tensor here, even though Torchvision can also convert it in the preprocess @@ -297,8 +295,8 @@ def main() -> None: img_tensor = torchvision.transforms.v2.functional.pil_to_tensor(img).to(device) # An alternative to using PIL is shown in screenshot_to_tensor. In one test, this saves about 20 ms per - # frame if using a GPU, but is actually slower if using the CPU. This would replace the "img=" and - # "img_tensor=" lines above. + # frame if using a GPU, and about 200 ms if using a CPU. This would replace the "img=" and "img_tensor=" + # lines above. # #img_tensor = screenshot_to_tensor(sct_img, device) @@ -367,9 +365,7 @@ def main() -> None: status_line_msg = status_line_msg[:28] + "..." # The frame_duration_avg will be None in the first iteration, since there isn't yet a full iteration to # measure. - duration_avg_str = ( - f"{frame_duration_avg * 1000:5.0f}" if frame_duration_avg is not None else "-----" - ) + duration_avg_str = f"{frame_duration_avg * 1000:5.0f}" if frame_duration_avg is not None else "-----" # Build the whole status line. It's a constant width, so that when we overwrite it each frame, the new # status line will completely overwrite the previous one. From 848ad6e5d48248453694e6af818a1871615dc3ec Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 27 Jan 2026 23:07:08 -0800 Subject: [PATCH 8/8] Add CHANGELOG and docs entries --- CHANGELOG.md | 2 +- docs/source/examples.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 004252a..9c8397f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ See Git commit messages for full history.
- Windows: improve error checking and messages for Win32 API calls (#448) - Mac: fix memory leak (#450, #453) - improve multithreading: allow multiple threads to use the same MSS object, allow multiple MSS objects to concurrently take screenshots, and document multithreading guarantees (#446, #452) -- Add full demos for different ways to use MSS (#444, #456) +- Add full demos for different ways to use MSS (#444, #456, #465) - :heart: contributors: @jholveck, @halldorfannar ## 10.1.0 (2025-08-16) diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 7b636bb..ce54014 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -225,3 +225,4 @@ scenarios. These include: - MP4 video capture with encoding using PyAV (FFmpeg bindings) - Live streaming to a TinyTV as MJPEG +- Detecting images of cats on the screen