From d803b2aab0452fc72e3919b4f333e7ff74193df6 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Wed, 21 Jan 2026 22:57:44 -0800 Subject: [PATCH 1/8] New demo: cat detector This will detect if a cat is on the screen. By which I mean displayed on the screen, not sitting on your laptop. This is meant as a simple demo of using MSS for AI. It works as-is, but needs to be documented, and there's some bits that could do with cleanup. There are a lot of additional features that could be added, such as showing a window with bounding boxes, but that's probably more complexity than is called for here. --- demos/cat-detector.py | 153 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 demos/cat-detector.py diff --git a/demos/cat-detector.py b/demos/cat-detector.py new file mode 100644 index 0000000..330f6fb --- /dev/null +++ b/demos/cat-detector.py @@ -0,0 +1,153 @@ +#! /usr/bin/env python3 + +import itertools +import time + +# You'll need to "pip install mss numpy pillow". Additionally, you'll +# need to install PyTorch and TorchVision, and the best way to do that +# can vary depending on your system. Often, "pip install torch +# torchvision" will be sufficient, but you can get specific +# instructions at . +import numpy as np +from PIL import Image +import torch +import torchvision.models.detection +import torchvision.transforms.v2 + +import mss + + +def top_unique_labels(labels, scores): + """Return the unique labels, ordered by score descending. + + In other words, if you have a person (0.67), dog (0.98), tv + (0.88), dog (0.71), you'll get back the labels for dog, tv, + person, in that order. + + The labels are a 1d tensor of integers, which are identifiers for + model-specific categories, such as indices into + weights.meta["categories"]. + + The scores are a parallel 1d tensor of the same size of floats: in + other words, score[0] is the score of label[0]. + """ + uniq, inv = torch.unique(labels, return_inverse=True) + max_per = torch.full((uniq.numel(),), -torch.inf, device=scores.device, dtype=scores.dtype) + max_per.scatter_reduce_(0, inv, scores, reduce="amax") + order = torch.argsort(max_per, descending=True) + return uniq[order] + + +# We run the entire program in inference mode. This is telling +# PyTorch to not bother tracking data that's only useful for training +# a neural net. +@torch.inference_mode() +def main(): + # Use CUDA if it's installed and available. This is much faster + # than doing all the work on the CPU. + device = "cuda" if torch.cuda.is_available() else "cpu" + + # The first time you run this demo, Torchvision will download a + # 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints + # on Unix; not sure where it's cached on other platforms. 
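+    # (Optional aside, not needed by the demo: the cache root can be
+    # moved by setting the TORCH_HOME environment variable before
+    # starting Python, and you can print the directory Torch Hub is
+    # currently using with:
+    #
+    #     print(torch.hub.get_dir())
+    #
+    # The checkpoint itself lands in a "checkpoints" subdirectory of
+    # that path.)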
+ weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT + model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=weights).to(device).eval() + preprocess = weights.transforms() + + model_labels = weights.meta["categories"] + cat_label = model_labels.index("cat") + + score_thresh = 0.60 + img_long_side = 960 + min_area_frac = 0.001 # Fraction of image + + with mss.mss() as sct: + monitor = sct.monitors[1] # primary monitor + + img_area = monitor["width"] * monitor["height"] + min_box_area = min_area_frac * img_area + + cat_has_been_visible = False + elapsed_per_frame_running_avg = None + time_last_frame = None + + for frame_number in itertools.count(): + time_this_frame = time.monotonic() + if time_last_frame is not None: + elapsed_this_frame = time_this_frame - time_last_frame + if frame_number < 5: + # We don't try to keep a moving average until the + # pipeline has warmed up. + elapsed_per_frame_running_avg = elapsed_this_frame + else: + elapsed_per_frame_running_avg = elapsed_per_frame_running_avg * 0.9 + elapsed_this_frame * 0.1 + time_last_frame = time_this_frame + + sct_img = sct.grab(monitor) + img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") + # We explicitly convert it to a tensor here, even though + # Torchvision can also convert it in the preprocess step. + # This is so that we send it to the GPU to do the + # preprocessing; PIL images are always on the CPU. + img_tensor = torchvision.transforms.v2.functional.pil_to_tensor(img).to(device) + + x = preprocess(img_tensor) # tensor CxHxW + pred = model([x])[0] + + labels = pred["labels"] + scores = pred["scores"] + boxes = pred["boxes"] + + areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + # Find the score of the highest-scoring cat that's large + # enough, even if it's not high enough to register the + # detector. We always log that. + cat_mask = (labels == cat_label) & (areas >= min_box_area) + if cat_mask.any(): + cat_score = scores[cat_mask].max().item() + else: + cat_score = 0.0 + + cat_in_frame = cat_score >= score_thresh + cat_status_changed = cat_in_frame != cat_has_been_visible + if cat_status_changed: + cat_has_been_visible = cat_in_frame + + if not cat_in_frame: + # Find all objects that score sufficiently well. We + # log them if there's no cat to talk about. + mask = (scores >= score_thresh) & (areas >= min_box_area) + if mask.any(): + show_labels = top_unique_labels(labels[mask], scores[mask]) + else: + show_labels = torch.empty((0,), dtype=labels.dtype) + + if elapsed_per_frame_running_avg is not None: + # Record the score of the most cat-like image for + # logging purposes + cat_scores = scores[labels == cat_label] + if cat_scores.any(): + best = float(cat_scores.max()) + else: + best = 0.0 + + status_line_time = time.strftime("%H:%M:%S", time.localtime()) + if cat_in_frame: + status_line_msg = f"Meow! Hello kitty-cat!" + else: + status_line_msg = "no cats" + if show_labels.shape[0] != 0: + label_words = [model_labels[i] for i in show_labels.cpu()] + label_words = [w for w in label_words if w != "N/A"] + status_line_msg += f":{','.join(label_words)}" + if len(status_line_msg) > 31: + status_line_msg = status_line_msg[:28] + "..." 
+ status_line = (f"{status_line_time} {frame_number:4d} " + f"{elapsed_per_frame_running_avg * 1000:5.0f} ms/frame " + f"| {status_line_msg:31s} (cat score={best:.2f})") + print(f"\r{status_line}", end="\n" if cat_status_changed else "") + + +if __name__ == "__main__": + main() From 84c80763f1a65027a17b7a569580c1ca58c13b7f Mon Sep 17 00:00:00 2001 From: Joel Holveck Date: Thu, 22 Jan 2026 10:37:04 +0000 Subject: [PATCH 2/8] Start on front-of-file comments --- demos/cat-detector.py | 145 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 7 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index 330f6fb..29fc634 100644 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -1,5 +1,128 @@ #! /usr/bin/env python3 +# This demo shows how to use MSS for artificial intelligence. For +# this demo, we'll be using a simple object detection task: see if +# there's a cat on your monitor. I mean, displayed on the monitor, +# not sitting on your laptop. +# +# This demo is not meant to be an introduction to AI or computer +# vision. We assume you have an understanding of the basics of AI, +# and of PyTorch. +# +# Object Detection +# ================ +# +# An object detector is a different beast than an object classifier. +# Object classifiers are a common introduction to computer vision. +# These will look at a picture that has a single foreground object, +# front and center, and try to identify what type of object this is: a +# cat, person, bicycle, etc. +# +# An object detector looks at an image and identifies _multiple +# objects_ within it. Instead of assigning a single label to the +# whole image, saying "this is a picture of a cat", it might say +# "there is a cat here, and a bicycle over there," and provide some +# basic information about each one. This is, for instance, what a +# self-driving car uses to identify what it's seeing on its cameras. +# +# For this demo, we want to tell if a cat is anywhere on the screen, +# not if the whole screen is a picture of a cat. That means that we +# want to use an detector, not a classifier. +# +# The detector will find any number of objects. For each object it +# detects, a typical detector produces three pieces of information: +# +# - A *label*, which identifies _what kind of object_ the detector +# believes it has found. Labels are represented internally as +# integers that map to a fixed list of categories the model was +# trained on (for example, "cat," "bicycle," or "person"). +# +# - A *position*, usually given as a bounding box. A bounding box +# describes _where_ the object appears in the image, using a small +# set of numbers that define a rectangle around it. +# +# - A *score*, which indicates how confident the model is in that +# detection. Higher scores mean the model is more confident; lower +# scores mean it is less confident. The score is a relative +# confidence signal, not a calibrated probability, and it should not +# be interpreted as a percentage or compared across different +# models. +# +# Most modern object detectors follow this same basic pattern, even if +# their internal architectures differ. In the Torchvision model used +# in this demo, these results are returned as parallel one-dimensional +# tensors: one tensor of labels, one tensor of bounding boxes, and one +# tensor of scores. Each index across these tensors refers to the +# same detected object. 
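+#
+# As a concrete (made-up) illustration, a frame containing one cat and
+# one bicycle might come back roughly like this, where index 0 of each
+# tensor describes the first detection and index 1 the second:
+#
+#     labels: tensor([17,  2])                   # e.g. "cat", "bicycle"
+#     scores: tensor([0.93, 0.71])
+#     boxes:  tensor([[ 40.,  60., 300., 280.],  # each box is
+#                     [350., 100., 600., 380.]]) # [x1, y1, x2, y2]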
+# +# The Model We're Using +# ===================== +# +# In this demo, we use a pre-trained object-detection model provided +# by PyTorch's Torchvision library: `fasterrcnn_resnet50_fpn_v2`, with +# weights `FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1`. +# +# This name is long, but each part reflects a piece of a larger system +# built up over many years of research and engineering. +# +# *Faster R-CNN* is the overall object-detection architecture. +# Introduced in 2015, it builds on earlier R-CNN variants and +# established the now-common two-stage approach to detection: first +# proposing regions that might contain objects, then classifying and +# refining those regions. This basic structure is still widely used +# today. +# +# *ResNet-50* refers to the convolutional neural network used as the +# _backbone_. ResNet itself was originally developed for image +# classification, but its feature-extraction layers proved broadly +# useful and are now reused in many vision systems. In this model, +# ResNet-50 converts raw pixels into _features_ - numerical +# representations that capture visual patterns such as edges, +# textures, shapes, and object parts - while the original +# classification layers are replaced by the detection-specific +# components of Faster R-CNN. +# +# *FPN*, or Feature Pyramid Network, is a later addition that +# addresses one of the main challenges in object detection: scale. It +# combines high-level, semantically rich features (good at recognizing +# _what_ is present) with lower-level, higher-resolution features +# (better at preserving _where_ things are). By layering these ideas +# on top of the backbone, the model can detect both large and small +# objects more reliably. +# +# The *v2* suffix indicates a newer Torchvision implementation that +# incorporates refinements from more recent research and practice. In +# particular, it follows a standardized training and configuration +# setup described in the 2021 paper "Benchmarking Detection Transfer +# Learning with Vision Transformers". Despite the paper's title, this +# model does *not* use Transformers; it uses a ResNet-50 backbone, but +# benefits from the same modernized training approach. +# +# Finally, *COCO_V1* indicates that the model was trained on the COCO +# dataset, a widely used community benchmark for object detection. +# COCO contains hundreds of thousands of labeled images covering 80 +# common object categories (such as people, animals, and vehicles), +# along with a small number of additional placeholder categories that +# appear as "N/A" in the model metadata. +# +# Performance +# =========== +# +# The biggest determinant of performance is whether the model runs on +# a GPU or on the CPU. GPUs are extremely well-suited to AI +# workloads, and PyTorch’s strongest and most mature GPU support today +# is through NVIDIA’s CUDA platform. +# +# With a CUDA-capable GPU, this demo’s main loop typically runs in +# around 100 ms per frame (about 10 fps). When run on the CPU, the +# same work takes roughly 5000 ms per frame (about 0.2 fps). +# +# Screen size has little effect on performance. The preprocessing +# stage scales the captured image to a fixed size, so the slow part - +# running the neural network - takes roughly the same amount of time +# regardless of the original screen resolution. + + import itertools import time @@ -18,11 +141,10 @@ def top_unique_labels(labels, scores): - """Return the unique labels, ordered by score descending. + """Return the unique labels, ordered by descending score. 
- In other words, if you have a person (0.67), dog (0.98), tv - (0.88), dog (0.71), you'll get back the labels for dog, tv, - person, in that order. + If you have a person (0.67), dog (0.98), tv (0.88), dog (0.71), + you'll get back the labels for dog, tv, person, in that order. The labels are a 1d tensor of integers, which are identifiers for model-specific categories, such as indices into @@ -43,9 +165,18 @@ def top_unique_labels(labels, scores): # a neural net. @torch.inference_mode() def main(): - # Use CUDA if it's installed and available. This is much faster - # than doing all the work on the CPU. - device = "cuda" if torch.cuda.is_available() else "cpu" + # Prefer CUDA if available. PyTorch’s CUDA backend is the most + # mature and consistently supported option, and can be tens of + # times faster than running the same model on the CPU. + # + # Other GPU backends (such as Apple’s MPS, AMD ROCm, or Intel XPU) + # exist, but support and configuration vary widely across systems. + # Since this demo hasn’t been tested on those platforms, it + # conservatively falls back to the CPU when CUDA is not available. + if torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" # The first time you run this demo, Torchvision will download a # 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints From bc2ce7a76421925a544fa4338f700549d9d9a1b8 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Fri, 23 Jan 2026 02:09:08 -0800 Subject: [PATCH 3/8] Add many more comments --- demos/cat-detector.py | 249 ++++++++++++++++++++++++++++++++---------- 1 file changed, 194 insertions(+), 55 deletions(-) mode change 100644 => 100755 demos/cat-detector.py diff --git a/demos/cat-detector.py b/demos/cat-detector.py old mode 100644 new mode 100755 index 29fc634..162f325 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -110,28 +110,35 @@ # # The biggest determinant of performance is whether the model runs on # a GPU or on the CPU. GPUs are extremely well-suited to AI -# workloads, and PyTorch’s strongest and most mature GPU support today -# is through NVIDIA’s CUDA platform. -# -# With a CUDA-capable GPU, this demo’s main loop typically runs in -# around 100 ms per frame (about 10 fps). When run on the CPU, the -# same work takes roughly 5000 ms per frame (about 0.2 fps). +# workloads, and PyTorch's strongest and most mature GPU support today +# is through NVIDIA's CUDA platform. # # Screen size has little effect on performance. The preprocessing # stage scales the captured image to a fixed size, so the slow part - # running the neural network - takes roughly the same amount of time # regardless of the original screen resolution. +# +# With a CUDA-capable GPU, this demo's main loop typically runs in +# around 100 ms per frame (about 10 fps). When run on the CPU, the +# same work takes roughly 5000 ms per frame (about 0.2 fps). +# +# FIXME Categorize +# ================ +# +# The first time you run this demo, Torchvision will download a +# 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints +# on Unix. I'm not sure where it's cached on other platforms, but +# it will tell you. import itertools import time -# You'll need to "pip install mss numpy pillow". Additionally, you'll +# You'll need to "pip install mss pillow". Additionally, you'll # need to install PyTorch and TorchVision, and the best way to do that # can vary depending on your system. Often, "pip install torch # torchvision" will be sufficient, but you can get specific # instructions at . 
-import numpy as np from PIL import Image import torch import torchvision.models.detection @@ -139,24 +146,58 @@ import mss +# The model will identify objects even if they only vaguely look like +# something. It also tell us a score of how certain it is, on a scale +# from 0 to 1. To prevent false positives, we set a threshold and +# ignore any results below it. The score doesn't have any real +# external meaning: to pick the cutoff, you just try different images +# and get a sense of what seems about right. +SCORE_THRESH = 0.60 + +# If an image is too small, then it's got a pretty decent chance of +# being a false positive: it's hard to tell if a Discord or Slack +# reaction icon is a cat or something different. We ignore any +# results that are too small to be reliable. Here, this cutoff is +# 0.1% of the whole monitor (about 1.5 cm square on a 27" monitor). +MIN_AREA_FRAC = 0.001 # Fraction of image def top_unique_labels(labels, scores): """Return the unique labels, ordered by descending score. If you have a person (0.67), dog (0.98), tv (0.88), dog (0.71), you'll get back the labels for dog, tv, person, in that order. - - The labels are a 1d tensor of integers, which are identifiers for - model-specific categories, such as indices into - weights.meta["categories"]. - - The scores are a parallel 1d tensor of the same size of floats: in - other words, score[0] is the score of label[0]. """ + + # Find the set of unique labels. + # `uniq` contains each distinct label once. + # `inv` maps each original label to its index in `uniq`. + # + # Example: + # labels = [person, dog, tv, dog] + # uniq = [person, dog, tv] + # inv = [0, 1, 2, 1] uniq, inv = torch.unique(labels, return_inverse=True) - max_per = torch.full((uniq.numel(),), -torch.inf, device=scores.device, dtype=scores.dtype) + + # Create a tensor to hold the maximum score seen for each unique + # label. We initialize to -inf so any real score will replace it. + max_per = torch.full( + (uniq.numel(),), + -torch.inf, + device=scores.device, + dtype=scores.dtype, + ) + + # For each element in `scores`, reduce it into `max_per` using + # `inv` as an index map, taking the maximum score per label. + # + # After this, max_per[i] is the highest score associated with + # uniq[i]. max_per.scatter_reduce_(0, inv, scores, reduce="amax") + + # Sort the unique labels by their maximum score, highest first. order = torch.argsort(max_per, descending=True) + + # Return the unique labels in score-ranked order. return uniq[order] @@ -165,104 +206,202 @@ def top_unique_labels(labels, scores): # a neural net. @torch.inference_mode() def main(): - # Prefer CUDA if available. PyTorch’s CUDA backend is the most + # Prefer CUDA if available. PyTorch's CUDA backend is the most # mature and consistently supported option, and can be tens of # times faster than running the same model on the CPU. # - # Other GPU backends (such as Apple’s MPS, AMD ROCm, or Intel XPU) + # Other GPU backends (such as Apple's MPS, AMD ROCm, or Intel XPU) # exist, but support and configuration vary widely across systems. - # Since this demo hasn’t been tested on those platforms, it + # Since this demo hasn't been tested on those platforms, it # conservatively falls back to the CPU when CUDA is not available. if torch.cuda.is_available(): device = "cuda" else: device = "cpu" - # The first time you run this demo, Torchvision will download a - # 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints - # on Unix; not sure where it's cached on other platforms. 
- weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT + # Neural networks, often just called *models*, have two aspects to + # them: the *architecture*, and the *weights*. The architecture + # is the layout of the neural network: what the different units + # are, how they're connected, and so forth. The weights are the + # results of training that neural network; they're numbers saying + # how much the units in the network influence each other. + # + # The same architecture can be trained on different data sets for + # different purposes. Different companies might use the exact + # same object detector architecture for different purposes: a + # company making a photo editing app might train the model to + # recognize faces, smiles, or closed eyes for auto-enhancement, + # while a wildlife research group could train the same + # architecture to identify animals in wilderness camera photos. + # + # The weights are specific to the architecture: you can't plug + # weights from a training run with the ResNet50 architecture into + # a Visual Transformers architecture. + # + # As described in the comments at the top of the file, we're using + # the fasterrcnn_resnet50_fpn_v2 architecture, and the weights + # obtained by training it with the COCO dataset. Plugging those + # weights into the architecture produces our model. + weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1 model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=weights).to(device).eval() + + # When you train a model, you almost always want to pre-process + # your input data. It's important that when you use that model + # later, you do the same kind of pre-processing. Otherwise, it'd + # be like learning a language from slow, carefully-enunciated + # speech, and then getting dropped right into conversations on a + # subway. + # + # For the model we're using, the preprocessing is to scale the + # input image to a consistent size, and to normalize its range + # (kinda similar to the "Auto" filter on your phone). + # Fortunately, for its pretrained models, Torchvision gives us + # an easy way to get the correct preprocessing function. preprocess = weights.transforms() + # The labels ("what type of object is this") that the model gives + # us are just integers; for this model, they're from 0 to 90. The + # English words describing them ("cat") are in a list, stored in + # the weight's metadata. model_labels = weights.meta["categories"] cat_label = model_labels.index("cat") - score_thresh = 0.60 - img_long_side = 960 - min_area_frac = 0.001 # Fraction of image - with mss.mss() as sct: - monitor = sct.monitors[1] # primary monitor + monitor = sct.monitors[1] img_area = monitor["width"] * monitor["height"] - min_box_area = min_area_frac * img_area + # FIXME verify whether the ROI boxes are relative to the + # original or preprocessed image. + min_box_area = MIN_AREA_FRAC * img_area + # We start a new line of the log if the cat visibility status + # changes. That way, your terminal will show essentially a + # log of all the times when a cat appeared or vanished. cat_has_been_visible = False - elapsed_per_frame_running_avg = None + + # Track an exponential moving average of how long each frame + # takes, essentially an FPS counter. + elapsed_per_frame_moving_avg = None + + # When was the last frame? time_last_frame = None for frame_number in itertools.count(): + # Do all the work to keep the frame timer. 
time_this_frame = time.monotonic() - if time_last_frame is not None: + if time_last_frame is not None: # Skip the first loop elapsed_this_frame = time_this_frame - time_last_frame if frame_number < 5: # We don't try to keep a moving average until the - # pipeline has warmed up. - elapsed_per_frame_running_avg = elapsed_this_frame + # pipeline has warmed up for a few frames: the + # times are too variable before PyTorch has gotten + # a sense of the inputs we're sending it, and we + # don't want those initial outlies to affect the + # EMA. Instead, we just show the most recent + # frame's number. + elapsed_per_frame_moving_avg = elapsed_this_frame else: - elapsed_per_frame_running_avg = elapsed_per_frame_running_avg * 0.9 + elapsed_this_frame * 0.1 + # The exponential moving average we track is based + # 90% on the old average, and 10% on the most + # recent frame. If you do the math, each frame's + # timing has half as much influence every 7 frames + # or so. + elapsed_per_frame_moving_avg = elapsed_per_frame_moving_avg * 0.9 + elapsed_this_frame * 0.1 time_last_frame = time_this_frame + # Grab the screenshot. sct_img = sct.grab(monitor) + + # We transfer the image from MSS to PyTorch by going + # through a Pillow Image. There are faster ways to do + # this transfer, but here, the vast bulk of the time is + # occupied by the AI work, so we just use the most + # convenient mechanism. img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") # We explicitly convert it to a tensor here, even though # Torchvision can also convert it in the preprocess step. - # This is so that we send it to the GPU to do the - # preprocessing; PIL images are always on the CPU. + # This is so that we send it to the GPU before we do the + # preprocessing: PIL Images are always on the CPU, and + # doing the preprocessing on the GPU is much faster. + # + # Most image APIs, including MSS, use an array layout of + # [height, width, channels]. In MSS, the ScreenShot.bgra + # data follows this convention, even though it's exposed + # as a flat bytes object. + # + # In contrast, most AI frameworks expect images in + # [channels, height, width] order. The pil_to_tensor + # helper performs this rearrangement for us. img_tensor = torchvision.transforms.v2.functional.pil_to_tensor(img).to(device) - x = preprocess(img_tensor) # tensor CxHxW - pred = model([x])[0] + # Do the preprocessing stages that the trained model + # expects; see the comment where we define preprocess. + # The traditional name for inputs to a neural net is "x", + # because AI programmers aren't terribly imaginative. + x = preprocess(img_tensor) + # In most AI networks, the model expects to take an array + # of inputs, and will return an array of outputs. This is + # because it's _much_ more efficient to operate on batches + # of inputs than on individual inputs, because of how the + # matrix math works. For instance, banks will use batches + # of transactions in AIs to flag transactions for review + # as potentially fraudulent. Because of that design, we + # need to provide the model our input as a batch of one + # image, rather than a single image by itself. That's + # what the unsqueeze does: it adds a new dimension of + # length 1 to the beginning of the input. Also, the + # output will be in a batch, so we just take the first + # element, hence the [0]. + pred = model(x.unsqueeze(0))[0] + # The value of pred is a dict, giving us the labels, + # scores, and bounding boxes. See the comments at the top + # of the file for more information. 
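+            # (As an aside: once the next few lines unpack these
+            # tensors, you could also loop over the detections one at a
+            # time, e.g.
+            #
+            #     for lbl, score, box in zip(labels.tolist(),
+            #                                scores.tolist(),
+            #                                boxes.tolist()):
+            #         print(model_labels[lbl], score, box)
+            #
+            # but the vectorized masks used below are faster, so that's
+            # what this demo does.)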
labels = pred["labels"] scores = pred["scores"] boxes = pred["boxes"] + # We only want to allow detections that are large enough + # to be reliable; see the comments on MIN_AREA_FRAC for + # more information. Here, we compute the areas of all the + # boxes we got, using operations that work on all the + # detected objects in parallel. areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) # Find the score of the highest-scoring cat that's large - # enough, even if it's not high enough to register the - # detector. We always log that. + # enough, even if it's not high enough to register as + # sufficiently certain for our program. We always log + # that, as the "cat score". cat_mask = (labels == cat_label) & (areas >= min_box_area) if cat_mask.any(): cat_score = scores[cat_mask].max().item() else: cat_score = 0.0 - cat_in_frame = cat_score >= score_thresh + # Is there a cat on the screen? + cat_in_frame = cat_score >= SCORE_THRESH + # Did a cat just appear or disappear? We create a new log + # line when this happens, so the user gets a log of cat + # appearances and disappearances. cat_status_changed = cat_in_frame != cat_has_been_visible if cat_status_changed: cat_has_been_visible = cat_in_frame if not cat_in_frame: - # Find all objects that score sufficiently well. We - # log them if there's no cat to talk about. - mask = (scores >= score_thresh) & (areas >= min_box_area) + # Find all objects that score sufficiently well. + # We're going to log them if there's no cat to talk + # about. + mask = (scores >= SCORE_THRESH) & (areas >= min_box_area) if mask.any(): show_labels = top_unique_labels(labels[mask], scores[mask]) else: show_labels = torch.empty((0,), dtype=labels.dtype) - if elapsed_per_frame_running_avg is not None: - # Record the score of the most cat-like image for - # logging purposes - cat_scores = scores[labels == cat_label] - if cat_scores.any(): - best = float(cat_scores.max()) - else: - best = 0.0 - + # Give the user our results. We only do this if the + # per-frame durations have been initialized (we're on at + # least the second frame), just to simplify the layout + # logic. + if elapsed_per_frame_moving_avg is not None: status_line_time = time.strftime("%H:%M:%S", time.localtime()) if cat_in_frame: status_line_msg = f"Meow! Hello kitty-cat!" @@ -275,8 +414,8 @@ def main(): if len(status_line_msg) > 31: status_line_msg = status_line_msg[:28] + "..." status_line = (f"{status_line_time} {frame_number:4d} " - f"{elapsed_per_frame_running_avg * 1000:5.0f} ms/frame " - f"| {status_line_msg:31s} (cat score={best:.2f})") + f"{elapsed_per_frame_moving_avg * 1000:5.0f} ms/frame " + f"| {status_line_msg:31s} (cat score={cat_score:.2f})") print(f"\r{status_line}", end="\n" if cat_status_changed else "") From cda4c2e83f4b2d6de8794eeeac7a53506b329279 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Fri, 23 Jan 2026 16:36:40 -0800 Subject: [PATCH 4/8] Improve comments, and add screenshot_to_tensor --- demos/cat-detector.py | 519 +++++++++++++++++++----------------------- 1 file changed, 240 insertions(+), 279 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index 162f325..e3acdd0 100755 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -1,167 +1,153 @@ #! /usr/bin/env python3 -# This demo shows how to use MSS for artificial intelligence. For -# this demo, we'll be using a simple object detection task: see if -# there's a cat on your monitor. I mean, displayed on the monitor, -# not sitting on your laptop. 
+# This demo shows how to use MSS for artificial intelligence. For this demo, we'll be using a simple object detection +# task: see if there's a cat on your monitor. I mean, displayed on the monitor, not sitting on your laptop. # -# This demo is not meant to be an introduction to AI or computer -# vision. We assume you have an understanding of the basics of AI, -# and of PyTorch. +# This demo is not meant to be an introduction to AI or computer vision. We assume you have an understanding of the +# basics of AI, and of PyTorch. # # Object Detection # ================ # -# An object detector is a different beast than an object classifier. -# Object classifiers are a common introduction to computer vision. -# These will look at a picture that has a single foreground object, -# front and center, and try to identify what type of object this is: a -# cat, person, bicycle, etc. +# An object detector is a different beast than an object classifier. Object classifiers are a common introduction to +# computer vision. These will look at a picture that has a single foreground object, front and center, and try to +# identify what type of object this is: a cat, person, bicycle, etc. # -# An object detector looks at an image and identifies _multiple -# objects_ within it. Instead of assigning a single label to the -# whole image, saying "this is a picture of a cat", it might say -# "there is a cat here, and a bicycle over there," and provide some -# basic information about each one. This is, for instance, what a -# self-driving car uses to identify what it's seeing on its cameras. +# An object detector looks at an image and identifies _multiple objects_ within it. Instead of assigning a single +# label to the whole image, saying "this is a picture of a cat", it might say "there is a cat here, and a bicycle over +# there," and provide some basic information about each one. This is, for instance, what a self-driving car uses to +# identify what it's seeing on its cameras. # -# For this demo, we want to tell if a cat is anywhere on the screen, -# not if the whole screen is a picture of a cat. That means that we -# want to use an detector, not a classifier. +# For this demo, we want to tell if a cat is anywhere on the screen, not if the whole screen is a picture of a cat. +# That means that we want to use an detector, not a classifier. # -# The detector will find any number of objects. For each object it -# detects, a typical detector produces three pieces of information: +# The detector will find any number of objects. For each object it detects, a typical detector produces three pieces +# of information: # -# - A *label*, which identifies _what kind of object_ the detector -# believes it has found. Labels are represented internally as -# integers that map to a fixed list of categories the model was -# trained on (for example, "cat," "bicycle," or "person"). +# - A *label*, which identifies _what kind of object_ the detector believes it has found. Labels are represented +# internally as integers that map to a fixed list of categories the model was trained on (for example, "cat," +# "bicycle," or "person"). # -# - A *position*, usually given as a bounding box. A bounding box -# describes _where_ the object appears in the image, using a small -# set of numbers that define a rectangle around it. +# - A *position*, usually given as a bounding box. A bounding box describes _where_ the object appears in the image, +# using a small set of numbers that define a rectangle around it. 
# -# - A *score*, which indicates how confident the model is in that -# detection. Higher scores mean the model is more confident; lower -# scores mean it is less confident. The score is a relative -# confidence signal, not a calibrated probability, and it should not -# be interpreted as a percentage or compared across different -# models. +# - A *score*, which indicates how confident the model is in that detection. Higher scores mean the model is more +# confident; lower scores mean it is less confident. The score is a relative confidence signal, not a calibrated +# probability, and it should not be interpreted as a percentage or compared across different models. # -# Most modern object detectors follow this same basic pattern, even if -# their internal architectures differ. In the Torchvision model used -# in this demo, these results are returned as parallel one-dimensional -# tensors: one tensor of labels, one tensor of bounding boxes, and one -# tensor of scores. Each index across these tensors refers to the -# same detected object. +# Most modern object detectors follow this same basic pattern, even if their internal architectures differ. In the +# Torchvision model used in this demo, these results are returned as parallel one-dimensional tensors: one tensor of +# labels, one tensor of bounding boxes, and one tensor of scores. Each index across these tensors refers to the same +# detected object. # # The Model We're Using # ===================== # -# In this demo, we use a pre-trained object-detection model provided -# by PyTorch's Torchvision library: `fasterrcnn_resnet50_fpn_v2`, with -# weights `FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1`. +# In this demo, we use a pre-trained object-detection model provided by PyTorch's Torchvision library: +# `fasterrcnn_resnet50_fpn_v2`, with weights `FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1`. # -# This name is long, but each part reflects a piece of a larger system -# built up over many years of research and engineering. +# This name is long, but each part reflects a piece of a larger system built up over many years of research and +# engineering. # -# *Faster R-CNN* is the overall object-detection architecture. -# Introduced in 2015, it builds on earlier R-CNN variants and -# established the now-common two-stage approach to detection: first -# proposing regions that might contain objects, then classifying and -# refining those regions. This basic structure is still widely used -# today. +# *Faster R-CNN* is the overall object-detection architecture. Introduced in 2015, it builds on earlier R-CNN +# variants and established the now-common two-stage approach to detection: first proposing regions that might contain +# objects, then classifying and refining those regions. This basic structure is still widely used today. # -# *ResNet-50* refers to the convolutional neural network used as the -# _backbone_. ResNet itself was originally developed for image -# classification, but its feature-extraction layers proved broadly -# useful and are now reused in many vision systems. In this model, -# ResNet-50 converts raw pixels into _features_ - numerical -# representations that capture visual patterns such as edges, -# textures, shapes, and object parts - while the original -# classification layers are replaced by the detection-specific -# components of Faster R-CNN. +# *ResNet-50* refers to the convolutional neural network used as the _backbone_. 
ResNet itself was originally +# developed for image classification, but its feature-extraction layers proved broadly useful and are now reused in +# many vision systems. In this model, ResNet-50 converts raw pixels into _features_ - numerical representations that +# capture visual patterns such as edges, textures, shapes, and object parts - while the original classification layers +# are replaced by the detection-specific components of Faster R-CNN. # -# *FPN*, or Feature Pyramid Network, is a later addition that -# addresses one of the main challenges in object detection: scale. It -# combines high-level, semantically rich features (good at recognizing -# _what_ is present) with lower-level, higher-resolution features -# (better at preserving _where_ things are). By layering these ideas -# on top of the backbone, the model can detect both large and small -# objects more reliably. +# *FPN*, or Feature Pyramid Network, is a later addition that addresses one of the main challenges in object +# detection: scale. It combines high-level, semantically rich features (good at recognizing _what_ is present) with +# lower-level, higher-resolution features (better at preserving _where_ things are). By layering these ideas on top +# of the backbone, the model can detect both large and small objects more reliably. # -# The *v2* suffix indicates a newer Torchvision implementation that -# incorporates refinements from more recent research and practice. In -# particular, it follows a standardized training and configuration -# setup described in the 2021 paper "Benchmarking Detection Transfer -# Learning with Vision Transformers". Despite the paper's title, this -# model does *not* use Transformers; it uses a ResNet-50 backbone, but -# benefits from the same modernized training approach. +# The *v2* suffix indicates a newer Torchvision implementation that incorporates refinements from more recent research +# and practice. In particular, it follows a standardized training and configuration setup described in the 2021 paper +# "Benchmarking Detection Transfer Learning with Vision Transformers". Despite the paper's title, this model does +# *not* use Transformers; it uses a ResNet-50 backbone, but benefits from the same modernized training approach. # -# Finally, *COCO_V1* indicates that the model was trained on the COCO -# dataset, a widely used community benchmark for object detection. -# COCO contains hundreds of thousands of labeled images covering 80 -# common object categories (such as people, animals, and vehicles), -# along with a small number of additional placeholder categories that -# appear as "N/A" in the model metadata. +# Finally, *COCO_V1* indicates that the model was trained on the COCO dataset, a widely used community benchmark for +# object detection. COCO contains hundreds of thousands of labeled images covering 80 common object categories (such +# as people, animals, and vehicles), along with a small number of additional placeholder categories that appear as +# "N/A" in the model metadata. # # Performance # =========== # -# The biggest determinant of performance is whether the model runs on -# a GPU or on the CPU. GPUs are extremely well-suited to AI -# workloads, and PyTorch's strongest and most mature GPU support today -# is through NVIDIA's CUDA platform. +# The biggest determinant of performance is whether the model runs on a GPU or on the CPU. 
GPUs are extremely +# well-suited to AI workloads, and PyTorch's strongest and most mature GPU support today is through NVIDIA's CUDA +# platform. # -# Screen size has little effect on performance. The preprocessing -# stage scales the captured image to a fixed size, so the slow part - -# running the neural network - takes roughly the same amount of time -# regardless of the original screen resolution. +# Screen size has little effect on performance. The model starts by scaling the captured image to a consistent size +# (fitting it within 1333x800 px), so the slow part - running the neural network - takes roughly the same amount of +# time regardless of the original screen resolution. # -# With a CUDA-capable GPU, this demo's main loop typically runs in -# around 100 ms per frame (about 10 fps). When run on the CPU, the -# same work takes roughly 5000 ms per frame (about 0.2 fps). +# With a CUDA-capable GPU, this demo's main loop typically runs in around 100 ms per frame (about 10 fps). When run +# on the CPU, the same work takes roughly 5000 ms per frame (about 0.2 fps). # -# FIXME Categorize -# ================ +# Cached Data +# =========== # -# The first time you run this demo, Torchvision will download a -# 167 MByte DNN. This is cached in ~/.cache/torch/hub/checkpoints -# on Unix. I'm not sure where it's cached on other platforms, but -# it will tell you. +# The first time you run this demo, Torchvision will download a 167 MByte DNN. This is cached in +# ~/.cache/torch/hub/checkpoints on Unix. I'm not sure where it's cached on other platforms, but it will tell you. +from __future__ import annotations import itertools import time -# You'll need to "pip install mss pillow". Additionally, you'll -# need to install PyTorch and TorchVision, and the best way to do that -# can vary depending on your system. Often, "pip install torch -# torchvision" will be sufficient, but you can get specific -# instructions at . -from PIL import Image +# You'll need to install PyTorch and TorchVision, and the best way to do that can vary depending on your system. +# Often, "pip install torch torchvision" will be sufficient, but you can get specific instructions at +# . import torch import torchvision.models.detection import torchvision.transforms.v2 +# You'll also need to "pip install mss pillow". +from PIL import Image + import mss -# The model will identify objects even if they only vaguely look like -# something. It also tell us a score of how certain it is, on a scale -# from 0 to 1. To prevent false positives, we set a threshold and -# ignore any results below it. The score doesn't have any real -# external meaning: to pick the cutoff, you just try different images -# and get a sense of what seems about right. +# The model will identify objects even if they only vaguely look like something. It also tell us a score of how +# certain it is, on a scale from 0 (not a cat) to 1 (very confidently a cat). To prevent false positives, we set a +# threshold and ignore any results below it. The score doesn't have any real external meaning: to pick the cutoff, +# you just try different images, look at the scores, and get a sense of what seems about right. SCORE_THRESH = 0.60 -# If an image is too small, then it's got a pretty decent chance of -# being a false positive: it's hard to tell if a Discord or Slack -# reaction icon is a cat or something different. We ignore any -# results that are too small to be reliable. Here, this cutoff is -# 0.1% of the whole monitor (about 1.5 cm square on a 27" monitor). 
-MIN_AREA_FRAC = 0.001 # Fraction of image - -def top_unique_labels(labels, scores): +# If an image is too small, then it's got a pretty decent chance of being a false positive: it's hard to tell if a +# Discord or Slack reaction icon is a cat or something different. We ignore any results that are too small to be +# reliable. Here, this cutoff is 0.1% of the whole monitor (about 1.5 cm square on a 27" monitor, the diameter of a +# AA battery). Like the score threshold, this is just something you try and see what the model seems to be able to +# recognize reliably. +MIN_AREA_FRAC = 0.001 + + +# This function is here for illustrative purposes: the demo doesn't currently call it, but there's a commented-out +# line in the main loop that shows how you might use it. +def screenshot_to_tensor(sct_img: mss.ScreenShot, device: str | torch.device) -> torch.Tensor: + """Convert an MSS ScreenShot to a CHW PyTorch tensor.""" + + # Get a 1d tensor of BGRA values. PyTorch will issue a warning at this step: the ScreenShot's bgra object is + # read-only, but PyTorch doesn't support read-only tensors. However, this is harmless in our case: we'll end up + # copying the data anyway when we run contiguous(). + img = torch.frombuffer(sct_img.bgra, dtype=torch.uint8) + # Do the rest of this on the GPU, if desired. + img = img.to(device) + # Convert to an HWC view: (H, W, 4) + img = img.view(sct_img.height, sct_img.width, 4) + # Drop alpha and reorder BGR -> RGB + rgb_hwc = img[..., [2, 1, 0]] + # HWC -> CHW + rgb_chw = rgb_hwc.permute(2, 0, 1) + # Copy this into contiguous memory, for improved performance. (Some models might be faster with + # .to(memory_format=torch.channels_last) instead.) + return rgb_chw.contiguous() + + +def top_unique_labels(labels: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: """Return the unique labels, ordered by descending score. If you have a person (0.67), dog (0.98), tv (0.88), dog (0.71), @@ -178,8 +164,8 @@ def top_unique_labels(labels, scores): # inv = [0, 1, 2, 1] uniq, inv = torch.unique(labels, return_inverse=True) - # Create a tensor to hold the maximum score seen for each unique - # label. We initialize to -inf so any real score will replace it. + # Create a tensor to hold the maximum score seen for each unique label. We initialize to -inf so any real score + # will replace it. max_per = torch.full( (uniq.numel(),), -torch.inf, @@ -187,11 +173,10 @@ def top_unique_labels(labels, scores): dtype=scores.dtype, ) - # For each element in `scores`, reduce it into `max_per` using - # `inv` as an index map, taking the maximum score per label. + # For each element in `scores`, reduce it into `max_per` using `inv` as an index map, taking the maximum score per + # label. # - # After this, max_per[i] is the highest score associated with - # uniq[i]. + # After this, max_per[i] is the highest score associated with uniq[i]. max_per.scatter_reduce_(0, inv, scores, reduce="amax") # Sort the unique labels by their maximum score, highest first. @@ -201,195 +186,158 @@ def top_unique_labels(labels, scores): return uniq[order] -# We run the entire program in inference mode. This is telling -# PyTorch to not bother tracking data that's only useful for training -# a neural net. +# We run the entire program in inference mode. This is telling PyTorch to not bother tracking data that's only useful +# for training a neural net. @torch.inference_mode() -def main(): - # Prefer CUDA if available. 
PyTorch's CUDA backend is the most - # mature and consistently supported option, and can be tens of - # times faster than running the same model on the CPU. +def main() -> None: + # Prefer CUDA if available. PyTorch's CUDA backend is the most mature and consistently supported option, and can + # be tens of times faster than running the same model on the CPU. # - # Other GPU backends (such as Apple's MPS, AMD ROCm, or Intel XPU) - # exist, but support and configuration vary widely across systems. - # Since this demo hasn't been tested on those platforms, it - # conservatively falls back to the CPU when CUDA is not available. - if torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # Neural networks, often just called *models*, have two aspects to - # them: the *architecture*, and the *weights*. The architecture - # is the layout of the neural network: what the different units - # are, how they're connected, and so forth. The weights are the - # results of training that neural network; they're numbers saying - # how much the units in the network influence each other. + # Other GPU backends (such as Apple's MPS, AMD ROCm, or Intel XPU) exist, but support and configuration vary + # widely across systems. Since this demo hasn't been tested on those platforms, it conservatively falls back to + # the CPU when CUDA is not available. + device = "cuda" if torch.cuda.is_available() else "cpu" + + # Neural networks, often just called *models*, have two aspects to them: the *architecture*, and the *weights*. + # The architecture is the layout of the neural network: what the different units are, how they're connected, and + # so forth. The weights are the results of training that neural network; they're numbers saying how much the + # units in the network influence each other. # - # The same architecture can be trained on different data sets for - # different purposes. Different companies might use the exact - # same object detector architecture for different purposes: a - # company making a photo editing app might train the model to - # recognize faces, smiles, or closed eyes for auto-enhancement, - # while a wildlife research group could train the same - # architecture to identify animals in wilderness camera photos. + # The same architecture can be trained on different data sets for different purposes. Different companies might + # use the exact same object detector architecture for different purposes: a company making a photo editing app + # might train the model to recognize faces, smiles, or closed eyes for auto-enhancement, while a wildlife research + # group could train the same architecture to identify animals in wilderness camera photos. # - # The weights are specific to the architecture: you can't plug - # weights from a training run with the ResNet50 architecture into - # a Visual Transformers architecture. + # The weights are specific to the architecture: you can't plug weights from a training run with the ResNet50 + # architecture into a Visual Transformers architecture. # - # As described in the comments at the top of the file, we're using - # the fasterrcnn_resnet50_fpn_v2 architecture, and the weights - # obtained by training it with the COCO dataset. Plugging those - # weights into the architecture produces our model. + # As described in the comments at the top of the file, we're using the fasterrcnn_resnet50_fpn_v2 architecture, + # and the weights obtained by training it with the COCO dataset. Plugging those weights into the architecture + # produces our model. 
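+    # (As an aside, the same constructor also accepts weights=None, which gives you the untrained architecture - for
+    # example, fasterrcnn_resnet50_fpn_v2(weights=None, num_classes=3) would be a starting point for training a
+    # three-class detector of your own.  This demo always uses the published COCO weights.)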
weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1 - model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=weights).to(device).eval() - - # When you train a model, you almost always want to pre-process - # your input data. It's important that when you use that model - # later, you do the same kind of pre-processing. Otherwise, it'd - # be like learning a language from slow, carefully-enunciated - # speech, and then getting dropped right into conversations on a - # subway. + model = torchvision.models.detection.fasterrcnn_resnet50_fpn_v2(weights=weights) + # Move the model to the GPU, if we've selected that, and put it in evaluation mode (as opposed to training mode). + # Training mode often uses features meant to make the training more robust, such as randomly ignoring some + # connections to make sure the model learns some redundancy. Evaluation mode puts it in a mode to perform the + # best it can. + model = model.to(device).eval() + + # When you train a model, you almost always want to pre-process your input data. It's important that when you use + # that model later, you do the same kind of pre-processing. Otherwise, it'd be like learning a language from + # slow, carefully-enunciated speech, and then getting dropped right into conversations on a subway. + # + # For the model we're using, the preprocessing is simply to standardize the representation: it will convert PIL + # images to a tensor representation, and convert all images to floating-point 0.0-1.0 instead of integer 0-255. + # Some other models do more preprocessing. # - # For the model we're using, the preprocessing is to scale the - # input image to a consistent size, and to normalize its range - # (kinda similar to the "Auto" filter on your phone). - # Fortunately, for its pretrained models, Torchvision gives us - # an easy way to get the correct preprocessing function. + # Fortunately, for its pretrained models, Torchvision gives us an easy way to get the correct preprocessing + # function. preprocess = weights.transforms() - # The labels ("what type of object is this") that the model gives - # us are just integers; for this model, they're from 0 to 90. The - # English words describing them ("cat") are in a list, stored in - # the weight's metadata. + # The labels ("what type of object is this") that the model gives us are just integers; for this model, they're + # from 0 to 90. The English words describing them ("cat") are in a list, stored in the weight's metadata. model_labels = weights.meta["categories"] cat_label = model_labels.index("cat") with mss.mss() as sct: monitor = sct.monitors[1] + # Compute the minimum size, in square pixels, that we'll consider reliable. img_area = monitor["width"] * monitor["height"] - # FIXME verify whether the ROI boxes are relative to the - # original or preprocessed image. min_box_area = MIN_AREA_FRAC * img_area - # We start a new line of the log if the cat visibility status - # changes. That way, your terminal will show essentially a - # log of all the times when a cat appeared or vanished. + # We start a new line of the log if the cat visibility status changes. That way, your terminal will show + # essentially a log of all the times when a cat appeared or vanished. cat_has_been_visible = False - # Track an exponential moving average of how long each frame - # takes, essentially an FPS counter. - elapsed_per_frame_moving_avg = None + # Track an exponential moving average of how long each frame takes, essentially an FPS counter. 
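+        # (With the 0.9/0.1 weighting used in the loop below, a frame's influence on this average halves roughly
+        # every ln(0.5) / ln(0.9), about 6.6 frames, which is where the "every ~7 frames" figure in that comment
+        # comes from.)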
+ frame_duration_avg = None # When was the last frame? - time_last_frame = None + prev_frame_start = None + # We run forever, or until the user interrupts us. + print("Looking for kitty cats! Press Ctrl-C to stop.") for frame_number in itertools.count(): # Do all the work to keep the frame timer. - time_this_frame = time.monotonic() - if time_last_frame is not None: # Skip the first loop - elapsed_this_frame = time_this_frame - time_last_frame + frame_start = time.monotonic() + if prev_frame_start is not None: # Skip the first loop + frame_duration = frame_start - prev_frame_start + # Track frame timing with exponential moving average. Skip the first few frames while PyTorch + # optimizes its computations. if frame_number < 5: - # We don't try to keep a moving average until the - # pipeline has warmed up for a few frames: the - # times are too variable before PyTorch has gotten - # a sense of the inputs we're sending it, and we - # don't want those initial outlies to affect the - # EMA. Instead, we just show the most recent - # frame's number. - elapsed_per_frame_moving_avg = elapsed_this_frame + frame_duration_avg = frame_duration else: - # The exponential moving average we track is based - # 90% on the old average, and 10% on the most - # recent frame. If you do the math, each frame's - # timing has half as much influence every 7 frames - # or so. - elapsed_per_frame_moving_avg = elapsed_per_frame_moving_avg * 0.9 + elapsed_this_frame * 0.1 - time_last_frame = time_this_frame + # Exponential moving average: weight recent frame 10%, historical average 90%. This means each + # frame's influence halves every ~7 frames. + assert frame_duration_avg is not None + frame_duration_avg = frame_duration_avg * 0.9 + frame_duration * 0.1 + prev_frame_start = frame_start # Grab the screenshot. sct_img = sct.grab(monitor) - # We transfer the image from MSS to PyTorch by going - # through a Pillow Image. There are faster ways to do - # this transfer, but here, the vast bulk of the time is - # occupied by the AI work, so we just use the most - # convenient mechanism. + # We transfer the image from MSS to PyTorch via a Pillow Image. Faster approaches exist (see below) but + # PIL is more readable. The bulk of the time in this program is spent doing the AI work, so we just use + # the most convenient mechanism. img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") - # We explicitly convert it to a tensor here, even though - # Torchvision can also convert it in the preprocess step. - # This is so that we send it to the GPU before we do the - # preprocessing: PIL Images are always on the CPU, and - # doing the preprocessing on the GPU is much faster. + + # We explicitly convert it to a tensor here, even though Torchvision can also convert it in the preprocess + # step. This is so that we send it to the GPU before we do the preprocessing: PIL Images are always on + # the CPU, and doing the preprocessing on the GPU is much faster. # - # Most image APIs, including MSS, use an array layout of - # [height, width, channels]. In MSS, the ScreenShot.bgra - # data follows this convention, even though it's exposed - # as a flat bytes object. + # Most image APIs, including MSS, use an array layout of [height, width, channels]. In MSS, the + # ScreenShot.bgra data follows this convention, even though it's exposed as a flat bytes object. # - # In contrast, most AI frameworks expect images in - # [channels, height, width] order. 
The pil_to_tensor + # helper performs this rearrangement for us. img_tensor = torchvision.transforms.v2.functional.pil_to_tensor(img).to(device) - # Do the preprocessing stages that the trained model - # expects; see the comment where we define preprocess. - # The traditional name for inputs to a neural net is "x", - # because AI programmers aren't terribly imaginative. + # An alternative to using PIL is shown in screenshot_to_tensor. In one test, this saves about 20 ms per + # frame if using a GPU, but is actually slower if using the CPU. This would replace the "img=" and + # "img_tensor=" lines above. + # + #img_tensor = screenshot_to_tensor(sct_img, device) + + # Do the preprocessing stages that the trained model expects; see the comment where we define preprocess. + # The traditional name for inputs to a neural net is "x", because AI programmers aren't terribly + # imaginative. x = preprocess(img_tensor) - # In most AI networks, the model expects to take an array - # of inputs, and will return an array of outputs. This is - # because it's _much_ more efficient to operate on batches - # of inputs than on individual inputs, because of how the - # matrix math works. For instance, banks will use batches - # of transactions in AIs to flag transactions for review - # as potentially fraudulent. Because of that design, we - # need to provide the model our input as a batch of one - # image, rather than a single image by itself. That's - # what the unsqueeze does: it adds a new dimension of - # length 1 to the beginning of the input. Also, the - # output will be in a batch, so we just take the first - # element, hence the [0]. + # In most AI networks, the model expects to take a batch of inputs, and will return a batch of outputs. + # This is because it's _much_ more efficient to operate on batches of inputs than on individual inputs + # when you're doing matrix math. For instance, banks will use batches of transactions in AIs to flag + # transactions for review as potentially fraudulent. Because of that design, we need to provide the model + # our input as a batch of one image, rather than a single image by itself. That's what the unsqueeze + # does: it adds a new dimension of length 1 to the beginning of the input. Also, the output will be in a + # batch, so we just take the first element, hence the [0]. pred = model(x.unsqueeze(0))[0] - # The value of pred is a dict, giving us the labels, - # scores, and bounding boxes. See the comments at the top - # of the file for more information. + # The value of pred is a dict, giving us the labels, scores, and bounding boxes. See the comments at the + # top of the file for more information. labels = pred["labels"] scores = pred["scores"] boxes = pred["boxes"] - # We only want to allow detections that are large enough - # to be reliable; see the comments on MIN_AREA_FRAC for - # more information. Here, we compute the areas of all the - # boxes we got, using operations that work on all the - # detected objects in parallel. + # We only want to allow detections that are large enough to be reliable; see the comments on MIN_AREA_FRAC + # for more information. Here, we compute the areas of all the boxes we got, using operations that work on + # all the detected objects in parallel.
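+ # (A quick note on the box format: torchvision's detection models give each box as [x1, y1, x2, y2] in pixel
+ # coordinates, so the line below is just width * height, computed for every detection at once. The library
+ # helper torchvision.ops.box_area(boxes) should produce the same result, if you prefer a named function; the
+ # explicit arithmetic keeps the demo easy to follow.)
+ #areas = torchvision.ops.box_area(boxes)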
areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - # Find the score of the highest-scoring cat that's large - # enough, even if it's not high enough to register as - # sufficiently certain for our program. We always log - # that, as the "cat score". + # Find the score of the highest-scoring cat that's large enough, even if it's not high enough to register + # as sufficiently certain for our program. We always log that, as the "cat score". cat_mask = (labels == cat_label) & (areas >= min_box_area) - if cat_mask.any(): - cat_score = scores[cat_mask].max().item() - else: - cat_score = 0.0 + cat_score = scores[cat_mask].max().item() if cat_mask.any() else 0.0 # Is there a cat on the screen? cat_in_frame = cat_score >= SCORE_THRESH - # Did a cat just appear or disappear? We create a new log - # line when this happens, so the user gets a log of cat - # appearances and disappearances. + # Did a cat just appear or disappear? We create a new log line when this happens, so the user gets a log + # of cat appearances and disappearances. cat_status_changed = cat_in_frame != cat_has_been_visible if cat_status_changed: cat_has_been_visible = cat_in_frame if not cat_in_frame: - # Find all objects that score sufficiently well. - # We're going to log them if there's no cat to talk + # Find all objects that score sufficiently well. We're going to log them if there's no cat to talk # about. mask = (scores >= SCORE_THRESH) & (areas >= min_box_area) if mask.any(): @@ -397,26 +345,39 @@ def main(): else: show_labels = torch.empty((0,), dtype=labels.dtype) - # Give the user our results. We only do this if the - # per-frame durations have been initialized (we're on at - # least the second frame), just to simplify the layout - # logic. - if elapsed_per_frame_moving_avg is not None: - status_line_time = time.strftime("%H:%M:%S", time.localtime()) - if cat_in_frame: - status_line_msg = f"Meow! Hello kitty-cat!" - else: - status_line_msg = "no cats" - if show_labels.shape[0] != 0: - label_words = [model_labels[i] for i in show_labels.cpu()] - label_words = [w for w in label_words if w != "N/A"] - status_line_msg += f":{','.join(label_words)}" - if len(status_line_msg) > 31: - status_line_msg = status_line_msg[:28] + "..." - status_line = (f"{status_line_time} {frame_number:4d} " - f"{elapsed_per_frame_moving_avg * 1000:5.0f} ms/frame " - f"| {status_line_msg:31s} (cat score={cat_score:.2f})") - print(f"\r{status_line}", end="\n" if cat_status_changed else "") + # Give the user our results. + status_line_time = time.strftime("%H:%M:%S", time.localtime()) + if cat_in_frame: + status_line_msg = "Meow! Hello kitty-cat!" + else: + status_line_msg = "no cats" + # If there isn't a cat, but there are other objects, list them. + if show_labels.shape[0] != 0: + label_words = [model_labels[i] for i in show_labels.cpu()] + # Filter out anything marked as "N/A": these are non-objects (like "sky"), and the training for + # this model doesn't really cover them. + label_words = [w for w in label_words if w != "N/A"] + # Build these into a comma-separated list. Make sure the whole string is at most 31 characters, + # the width we provide for it in the message. + status_line_msg += f":{','.join(label_words)}" + if len(status_line_msg) > 31: + status_line_msg = status_line_msg[:28] + "..." + # The frame_duration_avg will be None in the first iteration, since there isn't yet a full iteration to + # measure. 
+ duration_avg_str = ( + f"{frame_duration_avg * 1000:5.0f}" if frame_duration_avg is not None else "-----" + ) + + # Build the whole status line. It's a constant width, so that when we overwrite it each frame, the new + # status line will completely overwrite the previous one. + status_line = ( + f"{status_line_time} {frame_number:4d} " + f"{duration_avg_str} ms/frame " + f"| {status_line_msg:31s} (cat score={cat_score:.2f})" + ) + # If a cat just appeared or disappeared, start a new line after this status line. This lets the user see + # a history of all the cat status changes. + print(f"\r{status_line}", end="\n" if cat_status_changed else "") if __name__ == "__main__": From 45b57948f0207c2bc26cf7ef44c95acc018af893 Mon Sep 17 00:00:00 2001 From: Joel Holveck Date: Tue, 27 Jan 2026 16:41:30 -0800 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Halldor Fannar --- demos/cat-detector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index e3acdd0..2cb6e10 100755 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -19,7 +19,7 @@ # identify what it's seeing on its cameras. # # For this demo, we want to tell if a cat is anywhere on the screen, not if the whole screen is a picture of a cat. -# That means that we want to use an detector, not a classifier. +# That means that we want to use a detector, not a classifier. # # The detector will find any number of objects. For each object it detects, a typical detector produces three pieces # of information: @@ -92,7 +92,7 @@ # =========== # # The first time you run this demo, Torchvision will download a 167 MByte DNN. This is cached in -# ~/.cache/torch/hub/checkpoints on Unix. I'm not sure where it's cached on other platforms, but it will tell you. +# ~/.cache/torch/hub/checkpoints on Unix. If you want to know where the cache is stored on other platforms, this information will be displayed after downloading the DNN. from __future__ import annotations @@ -120,7 +120,7 @@ # If an image is too small, then it's got a pretty decent chance of being a false positive: it's hard to tell if a # Discord or Slack reaction icon is a cat or something different. We ignore any results that are too small to be # reliable. Here, this cutoff is 0.1% of the whole monitor (about 1.5 cm square on a 27" monitor, the diameter of a -# AA battery). Like the score threshold, this is just something you try and see what the model seems to be able to +# AA battery). Like the score threshold, this is just something you try and see what the model is able to # recognize reliably. MIN_AREA_FRAC = 0.001 @@ -235,7 +235,7 @@ def main() -> None: preprocess = weights.transforms() # The labels ("what type of object is this") that the model gives us are just integers; for this model, they're - # from 0 to 90. The English words describing them ("cat") are in a list, stored in the weight's metadata. + # from 0 to 90. The English words describing them (like "cat") are in a list, stored in the weight's metadata. 
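+ # (For example, with the COCO category list these weights ship with, model_labels.index("cat") should come out
+ # to 17 -- but the code below always looks the label up by name rather than relying on that number.)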
model_labels = weights.meta["categories"] cat_label = model_labels.index("cat") From 98a6ada40b424246ae6f6b12e98ca78bd2c82219 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 27 Jan 2026 17:01:37 -0800 Subject: [PATCH 6/8] Add changes per review comments --- demos/cat-detector.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index 2cb6e10..3c63bbd 100755 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -77,9 +77,11 @@ # Performance # =========== # -# The biggest determinant of performance is whether the model runs on a GPU or on the CPU. GPUs are extremely -# well-suited to AI workloads, and PyTorch's strongest and most mature GPU support today is through NVIDIA's CUDA -# platform. +# This demo can run the model on either the CPU or a GPU. The single biggest factor affecting performance is which +# one you use. Modern neural networks are designed around large amounts of parallel computation, which GPUs handle +# much more efficiently than CPUs. In practice, that means the same model runs dramatically faster on a GPU than on +# the CPU, even though the underlying math is identical. PyTorch's strongest and most mature GPU support today is +# through Nvidia's CUDA platform, so that is the only GPU supported by this demo. # # Screen size has little effect on performance. The model starts by scaling the captured image to a consistent size # (fitting it within 1333x800 px), so the slow part - running the neural network - takes roughly the same amount of @@ -92,7 +94,8 @@ # =========== # # The first time you run this demo, Torchvision will download a 167 MByte DNN. This is cached in -# ~/.cache/torch/hub/checkpoints on Unix. If you want to know where the cache is stored on other platforms, this information will be displayed after downloading the DNN. +# ~/.cache/torch/hub/checkpoints on Unix. If you want to know where the cache is stored on other platforms, it will +# be displayed while downloading the DNN. from __future__ import annotations @@ -106,7 +109,7 @@ import torchvision.models.detection import torchvision.transforms.v2 -# You'll also need to "pip install mss pillow". +# You'll also need to install MSS and Pillow, such as with "pip install mss pillow". from PIL import Image import mss From 24eef572820e5b0ea63a981457ad30508128a523 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 27 Jan 2026 18:38:02 -0800 Subject: [PATCH 7/8] Improve screenshot_to_tensor My last version accidentally had an intermediate copy. This version prevents that. --- demos/cat-detector.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/demos/cat-detector.py b/demos/cat-detector.py index 3c63bbd..c8ff117 100755 --- a/demos/cat-detector.py +++ b/demos/cat-detector.py @@ -135,19 +135,17 @@ def screenshot_to_tensor(sct_img: mss.ScreenShot, device: str | torch.device) -> # Get a 1d tensor of BGRA values. PyTorch will issue a warning at this step: the ScreenShot's bgra object is # read-only, but PyTorch doesn't support read-only tensors. However, this is harmless in our case: we'll end up - # copying the data anyway when we run contiguous(). + # copying the data anyway. img = torch.frombuffer(sct_img.bgra, dtype=torch.uint8) - # Do the rest of this on the GPU, if desired. + # Bring everything to the desired device. This is still just a linear buffer of BGRA bytes. 
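+ # (When device is "cpu", this .to() call simply returns the same tensor; with a CUDA device it performs the
+ # host-to-GPU copy, after which the reshaping below runs on the GPU.)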
img = img.to(device) - # Convert to an HWC view: (H, W, 4) - img = img.view(sct_img.height, sct_img.width, 4) - # Drop alpha and reorder BGR -> RGB - rgb_hwc = img[..., [2, 1, 0]] - # HWC -> CHW - rgb_chw = rgb_hwc.permute(2, 0, 1) - # Copy this into contiguous memory, for improved performance. (Some models might be faster with - # .to(memory_format=torch.channels_last) instead.) - return rgb_chw.contiguous() + # The next two steps both just create views of the original tensor, without copying the data. + img = img.view(sct_img.height, sct_img.width, 4) # Interpret as BGRA HWC + img = img.permute(2, 0, 1) # Permute the axes: BGRA CHW + # This final step will create a copy. Copying the data is required to reorder the channels. This also has the + # advantage of making the tensor contiguous, for more efficient access. + img = img[[2, 1, 0], ...] # Reorder the channels: RGB CHW + return img def top_unique_labels(labels: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: @@ -280,9 +278,9 @@ def main() -> None: # Grab the screenshot. sct_img = sct.grab(monitor) - # We transfer the image from MSS to PyTorch via a Pillow Image. Faster approaches exist (see below) but - # PIL is more readable. The bulk of the time in this program is spent doing the AI work, so we just use - # the most convenient mechanism. + # We transfer the image from MSS to PyTorch via a Pillow Image. Faster approaches exist (see + # screenshot_to_tensor), but PIL is more readable. The bulk of the time in this program is spent doing + # the AI work, so we just use the most convenient mechanism. img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX") # We explicitly convert it to a tensor here, even though Torchvision can also convert it in the preprocess @@ -297,8 +295,8 @@ def main() -> None: img_tensor = torchvision.transforms.v2.functional.pil_to_tensor(img).to(device) # An alternative to using PIL is shown in screenshot_to_tensor. In one test, this saves about 20 ms per - # frame if using a GPU, but is actually slower if using the CPU. This would replace the "img=" and - # "img_tensor=" lines above. + # frame if using a GPU, and about 200 ms if using a CPU. This would replace the "img=" and "img_tensor=" + # lines above. # #img_tensor = screenshot_to_tensor(sct_img, device) @@ -367,9 +365,7 @@ def main() -> None: status_line_msg = status_line_msg[:28] + "..." # The frame_duration_avg will be None in the first iteration, since there isn't yet a full iteration to # measure. - duration_avg_str = ( - f"{frame_duration_avg * 1000:5.0f}" if frame_duration_avg is not None else "-----" - ) + duration_avg_str = f"{frame_duration_avg * 1000:5.0f}" if frame_duration_avg is not None else "-----" # Build the whole status line. It's a constant width, so that when we overwrite it each frame, the new # status line will completely overwrite the previous one. From 848ad6e5d48248453694e6af818a1871615dc3ec Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 27 Jan 2026 23:07:08 -0800 Subject: [PATCH 8/8] Add CHANGELOG and docs entries --- CHANGELOG.md | 2 +- docs/source/examples.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 004252a..9c8397f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ See Git commit messages for full history.
- Windows: improve error checking and messages for Win32 API calls (#448) - Mac: fix memory leak (#450, #453) - improve multithreading: allow multiple threads to use the same MSS object, allow multiple MSS objects to concurrently take screenshots, and document multithreading guarantees (#446, #452) -- Add full demos for different ways to use MSS (#444, #456) +- Add full demos for different ways to use MSS (#444, #456, #465) - :heart: contributors: @jholveck, @halldorfannar ## 10.1.0 (2025-08-16) diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 7b636bb..ce54014 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -225,3 +225,4 @@ scenarios. These include: - MP4 video capture with encoding using PyAV (FFmpeg bindings) - Live streaming to a TinyTV as MJPEG +- Detecting images of cats on the screen