From 7cadb41a75c52eefa187057479700f10e3551eeb Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Sat, 20 Dec 2025 23:14:19 -0800 Subject: [PATCH 01/16] Basic draft of the video capture code. I'll probably break this into a simple and advanced version too. I may have to take out the audio code. This also currently uses some of my work in the (unmerged) feat-buffer branch, so I'll need to switch it to use what's available now. --- demos/video-capture.py | 199 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100755 demos/video-capture.py diff --git a/demos/video-capture.py b/demos/video-capture.py new file mode 100755 index 0000000..c8a565d --- /dev/null +++ b/demos/video-capture.py @@ -0,0 +1,199 @@ +#! /usr/bin/env python3 + +from fractions import Fraction +import queue +import threading +import time + +import av +import numpy as np +import soundcard as sc +from tqdm.auto import trange + +import mss + +CODEC_OPTIONS_GLOBAL = { + "g": "60", # GOP size: aim for about 2 sec + "bf": "2", # enable bframes + "b": "6M", # nominal average bitrate target + "maxrate": "12M", # peak + "bufsize": "24M", # VBV buffer; 1-4 seconds +} + +# Some options are, of course, implementation-dependent. I've +# tried to make these basically similar, but for all I know, they +# might actually produce significantly different output quality. +CODECS = { + "h264_nvenc": { + "rc": "vbr", + "tune": "hq", + "cq": "23", # quality; similar spirit to CRF, but different + # The modern presets are the p# ones. The others are + # deprecated, often aliases. + "preset": "p4", # p1..p7 (higher = slower/better) + "rc-lookahead": "40", + "spatial-aq": "1", + "temporal-aq": "1", + "b_ref_mode": "1", + }, + "libx264": { + # I think that with VBR enabled (as in the global options), + # libx264 ignores CRF. + "crf": "23", # quality; lower=better/larger + "preset": "medium", # speed/quality trade-off + "rc-lookahead": "40", + "aq-mode": "3", + }, +} + + +def main(): + av.logging.set_level(av.logging.VERBOSE) + + fps = 60 + monitor_id = 1 + duration_secs = 30 + codec = None + + if codec is None: + for codec in CODECS: + try: + # This normalizes the name. + av.codec.Codec(codec, "w") + break + except av.codec.codec.UnknownCodecError: + pass + else: + raise RuntimeError("No viable H.264 codec found") + else: + # Normalize the name, for the options lookup. + codec = av.codec.Codec(codec, "w").name + + mic = sc.get_microphone("loopback") + + with mss.mss() as sct: + monitor = sct.monitors[monitor_id] + + with av.open("capture.mp4", "w", format="mp4") as avmux: + time_denom = 90000 # This is a widely-used standard + time_base = Fraction(1, time_denom) + + audio_stream = avmux.add_stream("opus", options={"b": "64k"}) + audio_stream.time_base = time_base + # We pre-open the codec, to make sure there's not a warmup frame. + audio_stream.open() + + options = dict(CODEC_OPTIONS_GLOBAL) + if codec in CODECS: + options.update(CODECS[codec]) + video_stream = avmux.add_stream(codec, rate=fps, options=options) + video_stream.width = monitor["width"] + video_stream.height = monitor["height"] + video_stream.time_base = time_base + if any(f.name == "bgra" for f in video_stream.codec.video_formats): + video_stream.pix_fmt = "bgra" + # We pre-open the codec, to make sure there's not a warmup frame. 
+ video_stream.open() + + def pipeline(q_input, fn, q_output): + try: + while True: + try: + val_input = q_input.get(timeout=5) + except queue.ShutDown: + break + val_output = fn(val_input) + if q_output is not None: + q_output.put(val_output, timeout=5) + finally: + q_input.shutdown() + if q_output is not None: + q_output.shutdown() + + q_audio_preprocess = queue.Queue(1) + q_audio_encode = queue.Queue(1) + q_video_preprocess = queue.Queue(1) + q_video_encode = queue.Queue(1) + q_mux = queue.Queue(1) + + def video_capture(): + try: + next_frame_at = first_frame_at + for i in trange(duration_secs * fps): + while ((now := time.clock_gettime(time.CLOCK_MONOTONIC)) < next_frame_at): + time.sleep(next_frame_at - now) + # I think there's an easy way to make this a leaky bucket, but can't quite + # think through the math right now. + next_frame_at = next_frame_at + 1/fps + screenshot = sct.grab(monitor) + q_video_preprocess.put((screenshot, now), timeout=5) + finally: + q_video_preprocess.shutdown() + + def video_preprocess(screenshot_and_timestamp): + (screenshot, timestamp) = screenshot_and_timestamp + + ndarray = np.frombuffer(screenshot.buffer(), dtype=np.uint8) + ndarray = ndarray.reshape(monitor["height"], monitor["width"], 4) + # from_numpy_buffer isn't documented. from_ndarray is, + # but that copies the data. That's slow enough to + # slow things down to the point of being a bottleneck! + frame = av.VideoFrame.from_numpy_buffer(ndarray, format="bgra") + + frame.pts = int((timestamp - first_frame_at) * 90000) + frame.time_base = Fraction(1, 90000) + return frame + + video_encode = video_stream.encode + + def audio_preprocess(audio_and_timestamp): + (audio, timestamp) = audio_and_timestamp + audio = audio.reshape(1, -1) + frame = av.AudioFrame.from_ndarray(audio, format='flt', layout='stereo') + frame.sample_rate = 48000 + frame.pts = int((timestamp - first_frame_at) * 90000) + frame.time_base = Fraction(1, 90000) + return frame + + audio_encode = audio_stream.encode + + t_video_capture = threading.Thread(target=video_capture, name="video_capture") + t_video_preprocess = threading.Thread(target=pipeline, args=(q_video_preprocess, video_preprocess, q_video_encode), name="video_preprocess") + t_video_encode = threading.Thread(target=pipeline, args=(q_video_encode, video_encode, q_mux), name="video_encode") + t_audio_preprocess = threading.Thread(target=pipeline, args=(q_audio_preprocess, audio_preprocess, q_audio_encode), name="audio_preprocess") + t_audio_encode = threading.Thread(target=pipeline, args=(q_audio_encode, audio_encode, q_mux), name="audio_encode") + t_mux = threading.Thread(target=pipeline, args=(q_mux, avmux.mux, None), name="mux") + + first_frame_at = time.clock_gettime(time.CLOCK_MONOTONIC) + t_mux.start() + t_video_encode.start() + t_video_preprocess.start() + t_audio_encode.start() + t_audio_preprocess.start() + t_video_capture.start() + + print("Capture: ", t_video_capture.native_id) + print("Preprocess:", t_video_preprocess.native_id) + print("Encode: ", t_video_encode.native_id) + print("Mux: ", t_mux.native_id) + + with mic.recorder(samplerate=48000) as audio_recorder: + while t_video_capture.is_alive(): + data = audio_recorder.record() + now = time.clock_gettime(time.CLOCK_MONOTONIC) + timestamp = now - audio_recorder.latency + q_audio_preprocess.put((data, timestamp)) + + t_video_capture.join() + t_video_preprocess.join() + t_video_encode.join() + t_audio_preprocess.join() + t_audio_encode.join() + t_mux.join() + + print(f"Used format {video_stream.format}, " + 
f"reformatter {video_stream.reformatter}") + + +if __name__ == "__main__": + main() From ca91fe515b1d14a6b6a693e4129266526b071904 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Sat, 10 Jan 2026 23:40:52 -0800 Subject: [PATCH 02/16] Work the video demo to a more viable form --- demos/common/__init__.py | 0 demos/common/pipeline.py | 300 ++++++++++++++++++++++ demos/tinytv-stream.py | 303 +--------------------- demos/video-capture.py | 524 ++++++++++++++++++++++++++------------- 4 files changed, 658 insertions(+), 469 deletions(-) create mode 100644 demos/common/__init__.py create mode 100644 demos/common/pipeline.py diff --git a/demos/common/__init__.py b/demos/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/demos/common/pipeline.py b/demos/common/pipeline.py new file mode 100644 index 0000000..6ae9d67 --- /dev/null +++ b/demos/common/pipeline.py @@ -0,0 +1,300 @@ +from __future__ import annotations + +import contextlib +import itertools +from collections.abc import Callable, Generator, Iterable, Iterator +from threading import Condition, Lock, Thread +from typing import Generic, TypeVar, overload + +T = TypeVar("T") +U = TypeVar("U") + + +class MailboxShutDown(Exception): # noqa: N818 (An exception, but not an error) + """Exception to indicate that a Mailbox has been shut down. + + This will be raised if Mailbox.get() or Mailbox.put() is run on a + mailbox after its .shutdown() method has been called, or if it is + called while waiting. + """ + + def __init__(self, mailbox: Mailbox) -> None: + #: The mailbox that was shut down + self.mailbox = mailbox + + def __str__(self) -> str: + return f"Mailbox shut down: {self.mailbox}" + + +class Mailbox(Generic[T]): + """Thread-safe container to pass a single object at a time between threads. + + A Mailbox can be shut down to indicate that it is no longer + available. This can be used by a producer to indicate that no + more items will be forthcoming, or by a consumer to indicate that + it is no longer able to accept more objects. + + In Python 3.13, this has the same basic functionality as + queue.Queue(1). Prior to 3.13, there was no + queue.Queue.shutdown() method. The mechanisms for using mailboxes + as iterables, or adding items from iterables, are also not part of + queue.Queue in any version of Python. + """ + + def __init__(self) -> None: + #: Lock to protect mailbox state + self.lock = Lock() + self._condition = Condition(lock=self.lock) + #: Indicates whether an item is present in the mailbox + self.has_item = False + self._item: T | None = None + #: Indicates whether the mailbox has been shut down + self.is_shutdown = False + + def get(self) -> T: + """Return and remove the item being held by the mailbox. + + If an item is not presently available, block until another + thread calls .put(). + """ + with self._condition: + while True: + # We test to see if an item is present before testing if the queue is shut down. This is so that a + # non-immediate shutdown allows the mailbox to be drained. + if self.has_item: + rv = self._item + self._item = None # Don't hold an unnecessary reference + self.has_item = False + self._condition.notify_all() + return rv # type:ignore[return-value] + if self.is_shutdown: + raise MailboxShutDown(self) + self._condition.wait() + + def get_many(self) -> Iterable[T]: + """Yield items as they appear in the mailbox. + + The iterator exits the mailbox is shut down; MailboxShutDown + is not raised into the caller. 
+ """ + return iter(self) + + def put(self, item: T) -> None: + """Store an item in the mailbox. + + If an item is already in the mailbox, block until another + thread calls .get(). + """ + with self._condition: + while True: + if self.is_shutdown: + raise MailboxShutDown(self) + if not self.has_item: + self._item = item + self.has_item = True + self._condition.notify() + return + self._condition.wait() + + def put_many(self, items: Iterable[T]) -> Iterator[T]: + """Put the elements of iterable in the mailbox, one at a time. + + If the mailbox is shut down before all the elements can be put + into it, a MailboxShutDown exception is _not_ raised. + + Returns an iterator containing any remaining items, including + the one that was being processed when the mailbox was shut + down. The first item (if any) of this iterator can be + immediately accessed with next; subsequent items defer to the + input iterable, so may block. + """ + iterator = iter(items) + for item in iterator: + # We put this try/except inside the for loop, to make sure we don't accidentally filter out an exception + # that escaped the items iterator. + try: + self.put(item) + except MailboxShutDown: + return itertools.chain([item], iterator) + # Remove references to the value once it's not needed. This lets objects with advanced buffer semantics + # reclaim the object's memory immediately, without waiting for the next iteration of the iterable. + del item + return iter([]) + + def shutdown(self, *, immediate: bool = False) -> None: + """Shut down the mailbox, marking it as unavailable for future use. + + Any callers currently blocked in .get or .put, or any future + caller to those methods, will recieve a MailboxShutDown + exception. Callers using .get_many or iterating over the + mailbox will see the iteration end. Callers to .put_many will + stop adding items. + + If immediate is False (the default), and an item is currently + in the mailbox, it will be returned by the next call to + .get(), and the one after that will raise MailboxShutDown. + + It is safe to call this method multiple times, including to + promote a non-immediate shutdown to an immediate one. + """ + with self._condition: + # We don't actually need to check whether we've been called already. + self.is_shutdown = True + if immediate: + self._item = None + self.has_item = False + self._condition.notify_all() + + def __iter__(self) -> Iterator[T]: + """Yield items as they appear in the mailbox. + + The iterator exits when the mailbox is shut down; + MailboxShutDown is not raised into the caller. + """ + with contextlib.suppress(MailboxShutDown): + while True: + yield self.get() + + +class PipelineStage(Thread, Generic[T, U]): + """A stage of a multi-threaded pipeline. + + The target function will be called once, and should yield one + value for each element. + + If an in_mailbox is provided, the function will get an iterable of + its successive elements. If an out_mailbox is provided, it will + be supplied with the successive outputs of the target function. + + If the either mailbox is shut down, the target function's loop + will stop being called. Both mailboxes will be shut down when the + target function ends. + + Note to readers adapting this class to their own programs: + + This is designed for linear pipelines: it is not meant to support + fan-in (multiple stages feeding one mailbox) or fan-out (one + mailbox feeding multiple stages). 
The shutdown semantics of these + sorts of pipelines will depend heavily on what it's used for, and + this demo only needs a simple pipeline. + """ + + # Source stage + @overload + def __init__( + self, + target: Callable[[], Generator[U]], + *, + out_mailbox: Mailbox[U], + name: str | None = None, + ) -> None: ... + + # Transformer stage + @overload + def __init__( + self, + target: Callable[[Iterable[T]], Generator[U]], + *, + in_mailbox: Mailbox[T], + out_mailbox: Mailbox[U], + name: str | None = None, + ) -> None: ... + + # Sink stage + @overload + def __init__( + self, + target: Callable[[Iterable[T]], None], + *, + in_mailbox: Mailbox[T], + name: str | None = None, + ) -> None: ... + + def __init__( + self, + target: Callable[[], Generator[U]] | Callable[[Iterable[T]], Generator[U]] | Callable[[Iterable[T]], None], + *, + in_mailbox: Mailbox[T] | None = None, + out_mailbox: Mailbox[U] | None = None, + name: str | None = None, + ) -> None: + """Initialize the PipelineStage. + + Either :param:`in_mailbox` or :param:`out_mailbox` is + required. Otherwise, it would be a pipeline stage that can't + connect to anything else. (You can always use + :class:`threading.Thread` directly if you need that behavior.) + + :param target: Function to run during the stage. This will be + called once, in a separate thread. This should take one + argument if :param:`in_mailbox` is provided, or no + arguments otherwise. If you want additional arguments + (such as configuration), use :func:`functools.partial`. + :param in_mailbox: An optional :class:`Mailbox` to provide + inputs to the target function. The target function will + be called with one argument, an iterable that you can use + in a for loop or similar construct, to get the successive + values. + :param out_mailbox: An optional :class:`Mailbox` to receive + outputs from the target function. If this is provided, + the target function must be a generator (a function that + uses ``yield`` instead of ``return``). The successive + outputs from the function will be placed in + :param:`out_mailbox`. + :param name: An optional name for debugging purposes; see + :attr:`threading.Thread.name`. + """ + if in_mailbox is None and out_mailbox is None: + msg = "Cannot have a pipeline stage with neither inputs nor outputs" + raise ValueError(msg) + self.in_mailbox = in_mailbox + self.out_mailbox = out_mailbox + self.target = target + #: The exception (if any) raised by the target function + self.exc: Exception | None = None + super().__init__(name=name, daemon=True) + + def run(self) -> None: + """Execute the pipeline stage. + + This should not be run directly. Instead, use the start() + method (inherited from threading.Thread) to run this in a + background thread. + + This will run the target function, managing input and output + mailboxes. When the stage completes, whether normally or with + an error, the mailboxes will be shut down. + """ + try: + if self.out_mailbox is None: + # This is a sink function, the easiest to deal with. Since a mailbox is iterable, we can just pass it + # to the target function. + assert self.in_mailbox is not None # noqa: S101 + self.target(self.in_mailbox) # type:ignore[call-arg] + return + # This is a source or transformation function. + out_iterable = self.target() if self.in_mailbox is None else self.target(self.in_mailbox) # type:ignore[call-arg] + if not isinstance(out_iterable, Generator): + msg = ( + "Pipeline target function was expected to be a generator; " + f"instead, it returned a {type(out_iterable)}." 
+ ) + raise TypeError(msg) # noqa: TRY301 + # Once a generator is closed, the yield call (where they block when they send an object downstream) will + # raise GeneratorExit. That lets finally: blocks, with: exits, etc. run. This happens automatically when + # out_iterable is garbage-collected. We still close it explicitly to so it gets the GeneratorExit, in case + # something (like an exception object) is holding a reference to out_iterable. + with contextlib.closing(out_iterable): + self.out_mailbox.put_many(out_iterable) + except Exception as e: + # We store the exception, so that our caller can choose what to do about it after they call join. + self.exc = e + raise + finally: + if self.in_mailbox is not None: + self.in_mailbox.shutdown() + if self.out_mailbox is not None: + self.out_mailbox.shutdown() + + def __str__(self) -> str: + return f"" diff --git a/demos/tinytv-stream.py b/demos/tinytv-stream.py index dcbcb18..a399389 100755 --- a/demos/tinytv-stream.py +++ b/demos/tinytv-stream.py @@ -132,19 +132,15 @@ from __future__ import annotations import argparse -import contextlib import functools import io -import itertools import logging import os import re import sys import time from collections import deque -from collections.abc import Generator, Iterable, Iterator -from threading import Condition, Lock, Thread -from typing import TYPE_CHECKING, Generic, Literal, TypeVar, overload +from typing import TYPE_CHECKING, Literal import serial from PIL import Image, ImageOps @@ -153,8 +149,10 @@ import mss +from common.pipeline import Mailbox, PipelineStage + if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Generator, Iterable # The keys in this are substrings in the tvType query. Make sure that they're all distinct: having both "TinyTV2" and # "TinyTV2.1" in here would mean that a 2.1 might be misidentified as a 2. We use substrings instead of parsing the @@ -191,302 +189,9 @@ DEFAULT_JPEG_QUALITY = 75 -T = TypeVar("T") -U = TypeVar("U") - LOGGER = logging.getLogger("tinytv-stream") -class MailboxShutDown(Exception): # noqa: N818 (An exception, but not an error) - """Exception to indicate that a Mailbox has been shut down. - - This will be raised if Mailbox.get() or Mailbox.put() is run on a - mailbox after its .shutdown() method has been called, or if it is - called while waiting. - """ - - def __init__(self, mailbox: Mailbox) -> None: - #: The mailbox that was shut down - self.mailbox = mailbox - - def __str__(self) -> str: - return f"Mailbox shut down: {self.mailbox}" - - -class Mailbox(Generic[T]): - """Thread-safe container to pass a single object at a time between threads. - - A Mailbox can be shut down to indicate that it is no longer - available. This can be used by a producer to indicate that no - more items will be forthcoming, or by a consumer to indicate that - it is no longer able to accept more objects. - - In Python 3.13, this has the same basic functionality as - queue.Queue(1). Prior to 3.13, there was no - queue.Queue.shutdown() method. The mechanisms for using mailboxes - as iterables, or adding items from iterables, are also not part of - queue.Queue in any version of Python. 
- """ - - def __init__(self) -> None: - #: Lock to protect mailbox state - self.lock = Lock() - self._condition = Condition(lock=self.lock) - #: Indicates whether an item is present in the mailbox - self.has_item = False - self._item: T | None = None - #: Indicates whether the mailbox has been shut down - self.is_shutdown = False - - def get(self) -> T: - """Return and remove the item being held by the mailbox. - - If an item is not presently available, block until another - thread calls .put(). - """ - with self._condition: - while True: - # We test to see if an item is present before testing if the queue is shut down. This is so that a - # non-immediate shutdown allows the mailbox to be drained. - if self.has_item: - rv = self._item - self._item = None # Don't hold an unnecessary reference - self.has_item = False - self._condition.notify_all() - return rv # type:ignore[return-value] - if self.is_shutdown: - raise MailboxShutDown(self) - self._condition.wait() - - def get_many(self) -> Iterable[T]: - """Yield items as they appear in the mailbox. - - The iterator exits the mailbox is shut down; MailboxShutDown - is not raised into the caller. - """ - return iter(self) - - def put(self, item: T) -> None: - """Store an item in the mailbox. - - If an item is already in the mailbox, block until another - thread calls .get(). - """ - with self._condition: - while True: - if self.is_shutdown: - raise MailboxShutDown(self) - if not self.has_item: - self._item = item - self.has_item = True - self._condition.notify() - return - self._condition.wait() - - def put_many(self, items: Iterable[T]) -> Iterator[T]: - """Put the elements of iterable in the mailbox, one at a time. - - If the mailbox is shut down before all the elements can be put - into it, a MailboxShutDown exception is _not_ raised. - - Returns an iterator containing any remaining items, including - the one that was being processed when the mailbox was shut - down. The first item (if any) of this iterator can be - immediately accessed with next; subsequent items defer to the - input iterable, so may block. - """ - iterator = iter(items) - for item in iterator: - # We put this try/except inside the for loop, to make sure we don't accidentally filter out an exception - # that escaped the items iterator. - try: - self.put(item) - except MailboxShutDown: - return itertools.chain([item], iterator) - # Remove references to the value once it's not needed. This lets objects with advanced buffer semantics - # reclaim the object's memory immediately, without waiting for the next iteration of the iterable. - del item - return iter([]) - - def shutdown(self, *, immediate: bool = False) -> None: - """Shut down the mailbox, marking it as unavailable for future use. - - Any callers currently blocked in .get or .put, or any future - caller to those methods, will recieve a MailboxShutDown - exception. Callers using .get_many or iterating over the - mailbox will see the iteration end. Callers to .put_many will - stop adding items. - - If immediate is False (the default), and an item is currently - in the mailbox, it will be returned by the next call to - .get(), and the one after that will raise MailboxShutDown. - - It is safe to call this method multiple times, including to - promote a non-immediate shutdown to an immediate one. - """ - with self._condition: - # We don't actually need to check whether we've been called already. 
- self.is_shutdown = True - if immediate: - self._item = None - self.has_item = False - self._condition.notify_all() - - def __iter__(self) -> Iterator[T]: - """Yield items as they appear in the mailbox. - - The iterator exits when the mailbox is shut down; - MailboxShutDown is not raised into the caller. - """ - with contextlib.suppress(MailboxShutDown): - while True: - yield self.get() - - -class PipelineStage(Thread, Generic[T, U]): - """A stage of a multi-threaded pipeline. - - The target function will be called once, and should yield one - value for each element. - - If an in_mailbox is provided, the function will get an iterable of - its successive elements. If an out_mailbox is provided, it will - be supplied with the successive outputs of the target function. - - If the either mailbox is shut down, the target function's loop - will stop being called. Both mailboxes will be shut down when the - target function ends. - - Note to readers adapting this class to their own programs: - - This is designed for linear pipelines: it is not meant to support - fan-in (multiple stages feeding one mailbox) or fan-out (one - mailbox feeding multiple stages). The shutdown semantics of these - sorts of pipelines will depend heavily on what it's used for, and - this demo only needs a simple pipeline. - """ - - # Source stage - @overload - def __init__( - self, - target: Callable[[], Generator[U]], - *, - out_mailbox: Mailbox[U], - name: str | None = None, - ) -> None: ... - - # Transformer stage - @overload - def __init__( - self, - target: Callable[[Iterable[T]], Generator[U]], - *, - in_mailbox: Mailbox[T], - out_mailbox: Mailbox[U], - name: str | None = None, - ) -> None: ... - - # Sink stage - @overload - def __init__( - self, - target: Callable[[Iterable[T]], None], - *, - in_mailbox: Mailbox[T], - name: str | None = None, - ) -> None: ... - - def __init__( - self, - target: Callable[[], Generator[U]] | Callable[[Iterable[T]], Generator[U]] | Callable[[Iterable[T]], None], - *, - in_mailbox: Mailbox[T] | None = None, - out_mailbox: Mailbox[U] | None = None, - name: str | None = None, - ) -> None: - """Initialize the PipelineStage. - - Either :param:`in_mailbox` or :param:`out_mailbox` is - required. Otherwise, it would be a pipeline stage that can't - connect to anything else. (You can always use - :class:`threading.Thread` directly if you need that behavior.) - - :param target: Function to run during the stage. This will be - called once, in a separate thread. This should take one - argument if :param:`in_mailbox` is provided, or no - arguments otherwise. If you want additional arguments - (such as configuration), use :func:`functools.partial`. - :param in_mailbox: An optional :class:`Mailbox` to provide - inputs to the target function. The target function will - be called with one argument, an iterable that you can use - in a for loop or similar construct, to get the successive - values. - :param out_mailbox: An optional :class:`Mailbox` to receive - outputs from the target function. If this is provided, - the target function must be a generator (a function that - uses ``yield`` instead of ``return``). The successive - outputs from the function will be placed in - :param:`out_mailbox`. - :param name: An optional name for debugging purposes; see - :attr:`threading.Thread.name`. 
- """ - if in_mailbox is None and out_mailbox is None: - msg = "Cannot have a pipeline stage with neither inputs nor outputs" - raise ValueError(msg) - self.in_mailbox = in_mailbox - self.out_mailbox = out_mailbox - self.target = target - #: The exception (if any) raised by the target function - self.exc: Exception | None = None - super().__init__(name=name, daemon=True) - - def run(self) -> None: - """Execute the pipeline stage. - - This should not be run directly. Instead, use the start() - method (inherited from threading.Thread) to run this in a - background thread. - - This will run the target function, managing input and output - mailboxes. When the stage completes, whether normally or with - an error, the mailboxes will be shut down. - """ - try: - if self.out_mailbox is None: - # This is a sink function, the easiest to deal with. Since a mailbox is iterable, we can just pass it - # to the target function. - assert self.in_mailbox is not None # noqa: S101 - self.target(self.in_mailbox) # type:ignore[call-arg] - return - # This is a source or transformation function. - out_iterable = self.target() if self.in_mailbox is None else self.target(self.in_mailbox) # type:ignore[call-arg] - if not isinstance(out_iterable, Generator): - msg = ( - "Pipeline target function was expected to be a generator; " - f"instead, it returned a {type(out_iterable)}." - ) - raise TypeError(msg) # noqa: TRY301 - # Once a generator is closed, the yield call (where they block when they send an object downstream) will - # raise GeneratorExit. That lets finally: blocks, with: exits, etc. run. This happens automatically when - # out_iterable is garbage-collected. We still close it explicitly to so it gets the GeneratorExit, in case - # something (like an exception object) is holding a reference to out_iterable. - with contextlib.closing(out_iterable): - self.out_mailbox.put_many(out_iterable) - except Exception as e: - # We store the exception, so that our caller can choose what to do about it after they call join. - self.exc = e - raise - finally: - if self.in_mailbox is not None: - self.in_mailbox.shutdown() - if self.out_mailbox is not None: - self.out_mailbox.shutdown() - - def __str__(self) -> str: - return f"" - - def list_devices() -> None: """Display all USB serial ports in a formatted table.""" ports = list(list_ports.comports()) diff --git a/demos/video-capture.py b/demos/video-capture.py index c8a565d..b048ee9 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -1,198 +1,382 @@ #! /usr/bin/env python3 -from fractions import Fraction -import queue -import threading +# In one test, here's some numbers this program could achieve. This +# is just meant as a rough guide; your results will almost certainly +# vary significantly. 
+# - libx264, 1920x1080: 80 fps +# - libx264, 3840x2160: 18 fps +# - h264_nvenc, 1920x1080: 190 fps +# - h264_nvenc, 3840x2160: 41 fps + +import argparse +import logging +import signal import time +from collections import deque +from collections.abc import Generator, Iterable, Sequence +from fractions import Fraction +from functools import partial +from math import floor +from threading import Event +from typing import Any import av import numpy as np -import soundcard as sc -from tqdm.auto import trange +from common.pipeline import Mailbox, PipelineStage +from si_prefix import si_format import mss -CODEC_OPTIONS_GLOBAL = { - "g": "60", # GOP size: aim for about 2 sec - "bf": "2", # enable bframes - "b": "6M", # nominal average bitrate target - "maxrate": "12M", # peak - "bufsize": "24M", # VBV buffer; 1-4 seconds -} +# These are the options you'd give to ffmpeg that would affect the +# video codec. +CODEC_OPTIONS = { + # The "high" profile means that the encoder can use some H.264 + # features that are widely supported, but not mandatory. + "profile": "high", + + # The "medium" preset is as good of a preset as any for a demo + # like this. Different codecs have different presets; the the + # h264_nvenc actually prefers "p4", but accepts "medium" as a + # similar preset. + "preset": "medium", -# Some options are, of course, implementation-dependent. I've -# tried to make these basically similar, but for all I know, they -# might actually produce significantly different output quality. -CODECS = { - "h264_nvenc": { - "rc": "vbr", - "tune": "hq", - "cq": "23", # quality; similar spirit to CRF, but different - # The modern presets are the p# ones. The others are - # deprecated, often aliases. - "preset": "p4", # p1..p7 (higher = slower/better) - "rc-lookahead": "40", - "spatial-aq": "1", - "temporal-aq": "1", - "b_ref_mode": "1", - }, - "libx264": { - # I think that with VBR enabled (as in the global options), - # libx264 ignores CRF. - "crf": "23", # quality; lower=better/larger - "preset": "medium", # speed/quality trade-off - "rc-lookahead": "40", - "aq-mode": "3", - }, + # 6 Mbit/sec is vaguely the ballpark for a good-quality video at + # 1080p and 30 fps, but there's a lot of variation. We're just + # giving the target bitrate: the second-to-second bitrate will + # vary a lot, and slowly approach this bitrate. If you're trying + # this on a nearly-still screen, though, then the actual bitrate + # will be much lower, since there's not much motion to encode! + "b": "6M", + + # Let the encoder hold some frames for analysis, and flush them + # later. This especially helps with the hardware-accelerated + # codecs. + "rc-lookahead": "40", } +# There are a lot of different places in a video encoding pipeline +# where time_base matters, and they don't necessarily have to be the +# same, so the time base has to be set on several objects. In this +# program, we do use a common time base of 1/90000 seconds everywhere. +# This is a common standard, from the MPEG world. +TIME_BASE = Fraction(1, 90000) -def main(): - av.logging.set_level(av.logging.VERBOSE) +LOGGER = logging.getLogger("video-capture") - fps = 60 - monitor_id = 1 - duration_secs = 30 - codec = None - - if codec is None: - for codec in CODECS: - try: - # This normalizes the name. - av.codec.Codec(codec, "w") - break - except av.codec.codec.UnknownCodecError: - pass - else: - raise RuntimeError("No viable H.264 codec found") - else: - # Normalize the name, for the options lookup. 
- codec = av.codec.Codec(codec, "w").name +def video_capture( + fps: int, + sct: mss.base.MSSBase, + monitor: mss.models.Monitor, + shutdown_requested: Event, +) -> Generator[tuple[mss.screenshot.ScreenShot, float], None, None]: + next_frame_at = time.monotonic() + capture_period = 1 / fps + while not shutdown_requested.is_set(): + # Wait until we're ready. + while (now := time.monotonic()) < next_frame_at: + time.sleep(next_frame_at - now) - mic = sc.get_microphone("loopback") + # Capture and yield a frame. + screenshot = sct.grab(monitor) + yield screenshot, now - with mss.mss() as sct: - monitor = sct.monitors[monitor_id] + # We try to keep the capture rate at the desired fps on + # average. If we can't quite keep up for a moment (such as if + # the computer is a little overloaded), then we'll accumulate + # a bit of "timing debt" in next_frame_at: it'll be a little + # sooner than now + one frame. We'll hopefully be able to + # catch up soon. + next_frame_at = next_frame_at + capture_period + + # If we've accumulated over one frame's worth of catch-up, + # then that will say that next_frame_at is sooner than now. + # If we're accumulating too much debt, we want to wipe it out, + # rather than having a huge burst of closely-spaced captures + # as soon as we can get back to our desired capture rate. + # When we wipe that out, we still try to preserve the timing + # cycle's phase to keep the capture cadence smooth, rather + # than having a jittery burst of closely-spaced captures. In + # other words, we increment next_frame_at by a multiple of the + # desired capture period. + if next_frame_at < now: + missed_frames = floor((now - next_frame_at) * fps) + next_frame_at += (missed_frames + 1) * capture_period + + +def video_process( + screenshot_and_timestamp: Iterable[ + tuple[mss.screenshot.ScreenShot, float] + ], +) -> Generator[av.VideoFrame, None, None]: + first_frame_at: float | None = None + for screenshot, timestamp in screenshot_and_timestamp: + ndarray = np.frombuffer(screenshot.bgra, dtype=np.uint8) + ndarray = ndarray.reshape(screenshot.height, screenshot.width, 4) + # from_numpy_buffer isn't documented. from_ndarray is, but + # that copies the data. That's slow enough to slow things + # down to the point of being a real bottleneck! + frame = av.VideoFrame.from_numpy_buffer(ndarray, format="bgra") + if first_frame_at is None: + first_frame_at = timestamp + frame.pts = int((timestamp - first_frame_at) / TIME_BASE) + frame.time_base = TIME_BASE + yield frame + + +def video_encode( + video_stream: av.video.stream.VideoStream, frames: Iterable[av.VideoFrame] +) -> Generator[Sequence[av.Packet], None, None]: + for frame in frames: + yield video_stream.encode(frame) + # Our input has run out. Flush the frames that the encoder still + # is holding internally (such as to compute B-frames). + yield video_stream.encode(None) + + +def show_stats(packet_batches: Iterable[Sequence[av.Packet]]) -> Iterable[Sequence[av.Packet]]: + """Display streaming statistics (FPS and throughput). + + Statistics are displayed over a 100-frame sliding window. + + FPS indicates how fast the entire pipeline can run as a whole, not + any individual stage. + """ + # The start time is only used for showing the clock. The actual + # timing stats all use the times we put in the captured frames. 
+ start_time = time.monotonic() + time_deque: deque[int] = deque(maxlen=100) + bit_count_deque: deque[int] = deque(maxlen=100) + next_display_update = 0.0 + last_status_len = 0 + + for frame_count, packet_batch in enumerate(packet_batches): + # Yield the packet data immediately, so the mux gets it as + # soon as possible, while we update our stats. + yield packet_batch - with av.open("capture.mp4", "w", format="mp4") as avmux: - time_denom = 90000 # This is a widely-used standard - time_base = Fraction(1, time_denom) + for packet in packet_batch: + # The PTS would make more sense for logging FPS than the + # DTS, but because of frame reordering, it makes the stats + # unstable. Using DTS consistently makes the timing quite + # stable, and over the 100-frame window, still quite + # precise. + time_deque.append(packet.dts) + bit_count = packet.size * 8 + bit_count_deque.append(bit_count) - audio_stream = avmux.add_stream("opus", options={"b": "64k"}) - audio_stream.time_base = time_base - # We pre-open the codec, to make sure there's not a warmup frame. - audio_stream.open() + now = time.monotonic() + if now >= next_display_update and len(time_deque) > 1: + next_display_update = now + 0.1 + running_time = now - start_time + running_minutes = int(running_time / 60) + running_seconds = int(running_time % 60) + window_secs = (time_deque[-1] - time_deque[0]) * TIME_BASE + # We can't use the last frame in the window when we divide + # by window_secs; that would be a fencepost error. + window_frames = len(time_deque) - 1 + window_bits = sum(bit_count_deque) - bit_count_deque[-1] + fps = window_frames / window_secs + bits_per_sec = int(window_bits / window_secs) + line = (f"{running_minutes:02d}:{running_seconds:02d} " + f"frame {frame_count}: {fps:.2f} fps, " + f"{si_format(bits_per_sec, precision=2)}bps") + this_status_len = len(line) + full_line = f"\r{line}{' ' * (last_status_len - this_status_len)}" + print(full_line, end="") + last_status_len = this_status_len + # It's difficult to correctly print the fps and bitrate near the + # tail, since we get the last many frames as a big batch. Instead + # of leaving misleading information on the screen, we erase the + # status display. + print(f"\r{' ' * last_status_len}\r", end="") - options = dict(CODEC_OPTIONS_GLOBAL) - if codec in CODECS: - options.update(CODECS[codec]) - video_stream = avmux.add_stream(codec, rate=fps, options=options) + +def mux(avmux: av.container.OutputContainer, packet_batches: Iterable[Sequence[av.Packet]]) -> None: + for packet_batch in packet_batches: + avmux.mux(packet_batch) + + +def parse_region(s: str) -> tuple[int, int, int, int]: + """Parse comma-separated region string into (left, top, right, bottom).""" + parts = s.split(",") + if len(parts) != 4: + msg = "region must be four comma-separated integers" + raise argparse.ArgumentTypeError(msg) + try: + return tuple(int(p.strip()) for p in parts) # type: ignore[return-value] + except ValueError as e: + msg = "region values must be integers" + raise argparse.ArgumentTypeError(msg) from e + + +def main() -> None: + logging.basicConfig(level=logging.DEBUG) + # If we don't enable PyAV's own logging, a lot of important error + # messages from libav won't be shown. 
+ av.logging.set_level(av.logging.VERBOSE) + + parser = argparse.ArgumentParser( + description="Capture screen video to MP4 file" + ) + parser.add_argument( + "--fps", + type=int, + default=30, + help="frames per second (default: 30)" + ) + monitor_group = parser.add_mutually_exclusive_group() + monitor_group.add_argument( + "--monitor", + type=int, + default=1, + help="monitor ID to capture (default: 1)" + ) + monitor_group.add_argument( + "--region", + type=parse_region, + metavar="LEFT,TOP,RIGHT,BOTTOM", + help="region to capture as comma-separated coordinates" + ) + parser.add_argument( + "--codec", + default="libx264", + help="video codec (default: libx264; try h264_nvenc for Nvidia hardware encoding)" + ) + parser.add_argument( + "--output", + default="capture.mp4", + help="output filename (default: capture.mp4)" + ) + args = parser.parse_args() + + fps = args.fps + codec = args.codec + filename = args.output + + with mss.mss() as sct: + if args.region: + left, top, right, bottom = args.region + monitor = { + "left": left, + "top": top, + "width": right - left, + "height": bottom - top, + } + else: + monitor = sct.monitors[args.monitor] + + with av.open(filename, "w") as avmux: + # We could initialize video_stream in video_encode, but + # doing it here means that we can open it before starting + # the capture thread, which avoids a warmup frame (one + # that takes longer to encode because the encoder is just + # starting). + # + # The rate= parameter here is just the nominal frame rate: + # some tools (like file browsers) might display this as + # the frame rate. But we actually control timing via the + # pts and time_base values on the frames themselves. + video_stream = avmux.add_stream( + codec, rate=fps, options=CODEC_OPTIONS + ) video_stream.width = monitor["width"] video_stream.height = monitor["height"] - video_stream.time_base = time_base + # Setting the time_base on the stream is possible, but + # isn't what we need (for reasons I'm unclear on): we need + # to set it on the codec context. + video_stream.codec_context.time_base = TIME_BASE + # Assigning the pix_fmt is telling the video encoder what + # we'll be sending it, not necessarily what it will + # output. If the codec supports BGRx inputs, then that's + # the most efficient way for us to send it our frames. + # Otherwise, there will be a software, CPU-side conversion + # step when we send it our BGRx frames. We're actually + # probably sending it frames in BGR0, not BGRA, but PyAV + # doesn't support reading frames in BGR0, only BGRA. + # H.264 doesn't support an alpha channel anyway, so we can + # just send it BGR0 frames and tell it they're BGRA. if any(f.name == "bgra" for f in video_stream.codec.video_formats): video_stream.pix_fmt = "bgra" - # We pre-open the codec, to make sure there's not a warmup frame. + # We open (initialize) the codec explicitly here. PyAV + # will automatically open it the first time we call + # video_stream.encode, but the time it takes to set the + # codec up means the first frame would be particularly + # slow. 
video_stream.open() - def pipeline(q_input, fn, q_output): - try: - while True: - try: - val_input = q_input.get(timeout=5) - except queue.ShutDown: - break - val_output = fn(val_input) - if q_output is not None: - q_output.put(val_output, timeout=5) - finally: - q_input.shutdown() - if q_output is not None: - q_output.shutdown() - - q_audio_preprocess = queue.Queue(1) - q_audio_encode = queue.Queue(1) - q_video_preprocess = queue.Queue(1) - q_video_encode = queue.Queue(1) - q_mux = queue.Queue(1) - - def video_capture(): - try: - next_frame_at = first_frame_at - for i in trange(duration_secs * fps): - while ((now := time.clock_gettime(time.CLOCK_MONOTONIC)) < next_frame_at): - time.sleep(next_frame_at - now) - # I think there's an easy way to make this a leaky bucket, but can't quite - # think through the math right now. - next_frame_at = next_frame_at + 1/fps - screenshot = sct.grab(monitor) - q_video_preprocess.put((screenshot, now), timeout=5) - finally: - q_video_preprocess.shutdown() - - def video_preprocess(screenshot_and_timestamp): - (screenshot, timestamp) = screenshot_and_timestamp - - ndarray = np.frombuffer(screenshot.buffer(), dtype=np.uint8) - ndarray = ndarray.reshape(monitor["height"], monitor["width"], 4) - # from_numpy_buffer isn't documented. from_ndarray is, - # but that copies the data. That's slow enough to - # slow things down to the point of being a bottleneck! - frame = av.VideoFrame.from_numpy_buffer(ndarray, format="bgra") - - frame.pts = int((timestamp - first_frame_at) * 90000) - frame.time_base = Fraction(1, 90000) - return frame - - video_encode = video_stream.encode - - def audio_preprocess(audio_and_timestamp): - (audio, timestamp) = audio_and_timestamp - audio = audio.reshape(1, -1) - frame = av.AudioFrame.from_ndarray(audio, format='flt', layout='stereo') - frame.sample_rate = 48000 - frame.pts = int((timestamp - first_frame_at) * 90000) - frame.time_base = Fraction(1, 90000) - return frame - - audio_encode = audio_stream.encode - - t_video_capture = threading.Thread(target=video_capture, name="video_capture") - t_video_preprocess = threading.Thread(target=pipeline, args=(q_video_preprocess, video_preprocess, q_video_encode), name="video_preprocess") - t_video_encode = threading.Thread(target=pipeline, args=(q_video_encode, video_encode, q_mux), name="video_encode") - t_audio_preprocess = threading.Thread(target=pipeline, args=(q_audio_preprocess, audio_preprocess, q_audio_encode), name="audio_preprocess") - t_audio_encode = threading.Thread(target=pipeline, args=(q_audio_encode, audio_encode, q_mux), name="audio_encode") - t_mux = threading.Thread(target=pipeline, args=(q_mux, avmux.mux, None), name="mux") - - first_frame_at = time.clock_gettime(time.CLOCK_MONOTONIC) - t_mux.start() - t_video_encode.start() - t_video_preprocess.start() - t_audio_encode.start() - t_audio_preprocess.start() - t_video_capture.start() - - print("Capture: ", t_video_capture.native_id) - print("Preprocess:", t_video_preprocess.native_id) - print("Encode: ", t_video_encode.native_id) - print("Mux: ", t_mux.native_id) - - with mic.recorder(samplerate=48000) as audio_recorder: - while t_video_capture.is_alive(): - data = audio_recorder.record() - now = time.clock_gettime(time.CLOCK_MONOTONIC) - timestamp = now - audio_recorder.latency - q_audio_preprocess.put((data, timestamp)) - - t_video_capture.join() - t_video_preprocess.join() - t_video_encode.join() - t_audio_preprocess.join() - t_audio_encode.join() - t_mux.join() - - print(f"Used format {video_stream.format}, " - 
f"reformatter {video_stream.reformatter}") + shutdown_requested = Event() + def sigint_handler(_signum: int, _frame: Any) -> None: + # The status line will typically be visible, so start + # a fresh line for this message. + print("\nShutting down") + shutdown_requested.set() + signal.signal(signal.SIGINT, sigint_handler) + + mailbox_screenshot: Mailbox[ + tuple[mss.screenshot.ScreenShot, float] + ] = Mailbox() + mailbox_frame: Mailbox[av.VideoFrame] = Mailbox() + mailbox_packet_to_stats: Mailbox[Sequence[av.Packet]] = Mailbox() + mailbox_packet_to_mux: Mailbox[Sequence[av.Packet]] = Mailbox() + + stage_video_capture = PipelineStage( + name="video_capture", + target=partial( + video_capture, + fps, + sct, + monitor, + shutdown_requested, + ), + out_mailbox=mailbox_screenshot, + ) + stage_video_process = PipelineStage( + name="video_process", + in_mailbox=mailbox_screenshot, + target=partial(video_process), + out_mailbox=mailbox_frame, + ) + stage_video_encode = PipelineStage( + name="video_encode", + in_mailbox=mailbox_frame, + target=partial(video_encode, video_stream), + out_mailbox=mailbox_packet_to_stats, + ) + stage_show_stats = PipelineStage( + name="show_stats", + in_mailbox=mailbox_packet_to_stats, + target=show_stats, + out_mailbox=mailbox_packet_to_mux, + ) + stage_mux = PipelineStage( + name="stream_mux", + in_mailbox=mailbox_packet_to_mux, + target=partial(mux, avmux), + ) + + stage_mux.start() + stage_show_stats.start() + stage_video_process.start() + stage_video_encode.start() + stage_video_capture.start() + + LOGGER.debug("Thread IDs:") + LOGGER.debug(" Capture: %s", stage_video_capture.native_id) + LOGGER.debug(" Preprocess: %s", stage_video_process.native_id) + LOGGER.debug(" Encode: %s", stage_video_encode.native_id) + LOGGER.debug(" Mux: %s", stage_mux.native_id) + + print("Starting video capture. Press Ctrl-C to stop.") + + stage_video_capture.join() + stage_video_process.join() + stage_video_encode.join() + stage_show_stats.join() + stage_mux.join() + + if codec != "libx264" and video_stream.reformatter is not None: + LOGGER.warning("Software encoder is in a hardware encoding " + "path; this may slow things down") if __name__ == "__main__": From fba9bdabf885403a603415cfa07866502b481e30 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Thu, 15 Jan 2026 23:16:16 -0800 Subject: [PATCH 03/16] Add more docs to the video capture demo Very much incomplete, sometimes stopping mid-sentence. But I've written enough that I don't want to lose it, so here's an intermediate commit. --- demos/video-capture.py | 425 +++++++++++++++++++++++++++++++++++------ 1 file changed, 371 insertions(+), 54 deletions(-) diff --git a/demos/video-capture.py b/demos/video-capture.py index b048ee9..ed43db7 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -1,8 +1,256 @@ #! /usr/bin/env python3 -# In one test, here's some numbers this program could achieve. This -# is just meant as a rough guide; your results will almost certainly -# vary significantly. +# This demo isn't meant to be a comprehensive explanation of video +# encoding. There are, however, some concepts that are unavoidable +# when converting from a sequence of snapshots to a video file. We'll +# go over some of those here. +# +# The descriptions given here are simplified. It doesn't go into the +# more obscure details, like H.264 switching frames or the AAC priming +# delay. Nevertheless, this should be enough to get the concepts +# you'll need to understand and build on this demo. 
+# +# +# libav +# ----- +# +# If you care enough about video files to be reading this, you've +# probably used ffmpeg. This is a Swiss Army Knife of video file +# manipulation. +# +# The ffmpeg tool is based on several libraries, which are part of +# ffmpeg, and widely used elsewhere: +# +# - libavcodec: Encoding/decoding library +# - libavfilter: Graph-based frame editing library +# - libavformat: I/O and muxing/demuxing library +# - libavdevice: Special devices muxing/demuxing library +# - libavutil: Common utility library +# - libswresample: Audio resampling, format conversion and mixing +# - libswscale: Color conversion and scaling library +# +# In this demo, I just refer to these collectively as "libav". Think +# of these as the library version of ffmpeg. We mostly use libavcodec +# and libavformat, but that detail isn't something we see in Python: +# all these libraries are essentially one giant bundle as far as we +# care. +# +# The libav libaries are in C. We use the PyAV library. This is not +# simply bindings or a direct translation of the libav C API to +# Python, but rather, a library that's based on libav, but meant to be +# more Pythonic. +# +# [note: it's important to include the fact that pyav.org has outdated +# docs, since they show up prominently in Google searches. The link +# to the GitHub issue is just to tell people that the ] +# +# The docs for PyAV are at . +# The older docs at pyav.org are outdated; see +# . +# +# There was briefly a fork called basewood-av, but it has since been +# discontinued and merged back into PyAV; see +# . Despite the +# domain name, pyav.basswood-io.com hosts the current official PyAV +# documentation, not fork-specific docs. +# +# The PyAV developers are separate from ffmpeg, and there is a bit of +# a difference in the approaches that PyAV takes. See also +# https://pyav.basswood-io.com/docs/stable/overview/caveats.html +# +# +# Container Files +# --------------- +# +# A single file, like kittycat.mp4, is called a "container" in media +# file terms. This is a collection of "streams" (sometimes called +# "elementary streams"), all woven together. +# +# It might contain just a video stream (like we do here), just an +# audio stream (like in a .m4a file, which is just a renamed .mp4 +# file), or both (most common). It might also contain several of +# each; for instance, different languages will usually be in separate +# audio streams. There are other stream types, like subtitles, as +# well. +# +# Weaving these streams together is called "multiplexing", or "muxing" +# for short. Each stream's data gets bundled into "chunks" that are +# typically called "packets". The container keeps packets from the +# same time, but different streams, close to each other in the file. +# +# (By the way, the term "packet" is a holdover from MPEG-2 and before. +# Technically, MP4 files don't have packets: they have chunks, which +# can hold AAC frames, H.264 NALs, etc. The data that used to be in +# MPEG-2 packet headers is now in MP4 tables. To keep the terminology +# consistent between codecs and container formats, libav refers to the +# objects encapsulating all this as packets, regardless of the codec +# or container format.) +# +# For instance, at the beginning of the file, you might have one audio +# packet covering the first 21 ms, then a subtitle packet covering the +# first several seconds, then seven video packets each covering 3 ms, +# followed by another audio packet for the next 21 ms, and so on. 
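+#
+# You can see this interleaving for yourself by demuxing an existing
+# file with PyAV.  This is a rough sketch; kittycat.mp4 is just the
+# placeholder name from above:
+#
+#     import av
+#
+#     with av.open("kittycat.mp4") as container:
+#         for stream in container.streams:
+#             print(stream.index, stream.type, stream.codec_context.name)
+#         for packet in container.demux():
+#             print(packet.stream.type, packet.dts, packet.size)
+#
+# The packets printed by the second loop come out roughly in file
+# order, so you'll see audio and video packets woven together.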
+# +# +# Video Codecs +# ------------ +# +# Within an MP4 file, the video can be stored in a lot of different +# formats. These are the most common: +# +# - MPEG-2: used by DVDs, not much else anymore. +# - MPEG-4 Part 2: also known as DivX. Very popular in the early 2000s, +# not seen much anymore except older archives. +# - H.264: commonly used by BluRay, many streaming services, and many +# MP4 files in the wild. +# - H.265: increasingly used, but not supported by older hardware. +# - AV1 and VP9: used by some streaming services; hardware support +# varies, so these are typically offered alongside H.264 (or H.265) +# as fallbacks. +# +# These are all stream formats. There are many libraries that can +# create these files. These libraries are known as "codecs". In +# some contexts, the word "codec" is also used to name the stream +# format itself, so "H.264" might sometimes be called a codec. +# +# In this demo, we use H.264, since it's the most common. You can +# also specify other codecs. +# +# In ffmpeg, and the av libraries that we use here, the best codec for +# H.264 that doesn't require any specific hardware is libx264. There +# are also faster ones that are hardware-accelerated, such as +# h264_nvenc which uses specialized chips on Nvidia video cards. +# +# +# Frame Types +# ----------- +# +# Reference: https://en.wikipedia.org/wiki/Video_compression_picture_types +# +# [Note: We can probably just give a brief description of the frame +# types.] +# +# The reason that video files can compress so well, much better than +# storing a JPEG for each frame, is that the file often can describe +# just the motion. In a video of a cat meowing, the first frame will +# have everything that's visible: the room in the background, the +# entire cat, the whole thing. We call a video frame that stores the +# whole picture an "I-frame". +# +# But the second frame just has to talk about what's changed in that +# 1/30 sec: it can just say that the tail moved this much to the left, +# the eyes closed slightly, what the now-visible bits of the eyelids +# look like, what's changed about the ear when it moved, etc. We call +# this sort of frame, one that just stores the differences from a +# previous frame, a "P-frame". +# +# We still want to refresh the whole picture from scratch from time to +# time. Since the differences between video frames are compressed, +# they're also imperfect. Over time, these imperfections can +# accumulate. Also, sometimes a frame may have been lost between when +# we store it and when the viewer sees it, such as if we made a DVD +# that later got scratched; we want to let the viewer recover from +# such a situation. To keep things clean, we sometimes send out a new +# I-frame, redrawing the whole picture anew. This normally happens +# about every 0.5 to 2 seconds, depending on the program's purpose. +# The group of pictures starting with a fresh I-frame is, +# straightforwardly enough, called a "group of pictures" (GOP). +# +# Sometimes, it's useful for a frame to give motion based not just on +# the past, but also the future. For instance, when the cat's mouth +# first starts to open, you might want to say "look ahead at how the +# inside of the mouth looks when it's totally open, and draw just this +# tiny sliver of it now." These are called "B-frames". +# +# A GOP usually arranges these frame types in a cadence, like +# IBBPBBPBB.... The specifics are up to the encoder, but the user can +# normally configure it to some degree. 
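+#
+# If you want to see the cadence the encoder actually chose, you can
+# decode the finished capture and print each frame's picture type.
+# This is a small sketch, assuming the demo's default output name:
+#
+#     import av
+#
+#     with av.open("capture.mp4") as container:
+#         for frame in container.decode(video=0):
+#             print(frame.pts, frame.pict_type)  # I, P, or B
+#
+# The encoder options "g" (GOP length, in frames) and "bf" (maximum
+# consecutive B-frames) let you steer this cadence.  They aren't set
+# in CODEC_OPTIONS below, so the encoder's own defaults apply.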
+# +# +# Timestamps +# ---------- +# +# [note: Managing the PTS is a big part of the code, so I want to +# describe it. The DTS is also worth at least highlighting, as is the +# fact that packets from the encoder may be in a different order than +# presentation order.] +# +# In a video file, time is very important. It's used to synchronize +# audio and video, to prevent frame timing quantization from causing +# the clock to drift, and many other purposes. +# +# The time at which each frame should be shown is called its +# "presentation time stamp", or "PTS". Normally, the PTS of the first +# frame is 0, and the rest of the video file is based on that. +# +# Because B-frames can require future frames to interpret, the future +# frames they depend on have to be decoded first. That means that the +# order in which frames are decoded can be different from the order in +# which they are presented. This leads to a second timestamp on each +# frame: the "decoding time stamp", or "DTS". +# +# Different container formats store the timestamps in different +# places: the container's structures, the packet headers, the streams, +# etc. Because of this, there are multiple places that carry +# timestamps. You can just set the timestamp on the video frame, and +# libav will propagate it from there to the packets and so forth. +# +# +# Time Base +# --------- +# +# [note: Most people new to video encoding may assume that timestamps +# are in float or integer nanoseconds or something, so the concept of +# the time base is significant. We also attach it to multiple +# objects: the container object, the video stream context object, and +# each frame. So, the reason we do that is worth noting. Preserve the +# link to the PyAV docs.] +# +# In most video file formats, the time isn't specified in predefined +# units like nanoseconds. Instead, in your video file, you specify +# the time units you're using, a fraction of a second. This is called +# your time base. +# +# There are a lot of different places in a video encoding pipeline +# where you set a time base: everywhere that might need to encode a +# timestamp. They don't necessarily have to be the same (PyAV will +# convert between the different time bases as needed), so the time +# base has to be set on several objects. See also +# +# +# In this demo, we use a common time base of 1/90000 sec everywhere. +# This is a common standard, from the MPEG world. It became a +# standard because it can exactly represent 24 fps (film), 25 fps +# (European TV), 30 fps (US TV, nominally), and 30000/1001 fps (about +# 29.97, US broadcast TV). +# +# +# Performance +# ----------- +# +# This demo uses multiple threads to improve performance. These +# threads are pipelined; see the comments at the start of +# common/pipeline.py for information about that concept. +# +# In a pipelined design, the slowest stage usually sets the overall +# rate. Suppose you and your roommates are all doing the dishes: +# Alice collects dishes and scrapes off food, Bob washes the dishes, +# Carol rinses them, Dave dries them, and Evelyn puts them away. +# If the +# +# [note: A detailed description of pipelining threads is in +# common/pipeline.py. This section should discuss the stages +# we're using, and note that the encoding stage is usually the +# bottleneck.] +# +# +# +# [note: Not sure where to integrate this, but make sure the numbers +# are somewhere.] +# +# In one test, here's some numbers this program could achieve, on an +# idle system. 
This is just meant as a rough guide; your results will +# almost certainly vary significantly. # - libx264, 1920x1080: 80 fps # - libx264, 3840x2160: 18 fps # - h264_nvenc, 1920x1080: 190 fps @@ -22,10 +270,11 @@ import av import numpy as np -from common.pipeline import Mailbox, PipelineStage from si_prefix import si_format import mss +from common.pipeline import Mailbox, PipelineStage + # These are the options you'd give to ffmpeg that would affect the # video codec. @@ -33,13 +282,11 @@ # The "high" profile means that the encoder can use some H.264 # features that are widely supported, but not mandatory. "profile": "high", - # The "medium" preset is as good of a preset as any for a demo # like this. Different codecs have different presets; the the # h264_nvenc actually prefers "p4", but accepts "medium" as a # similar preset. "preset": "medium", - # 6 Mbit/sec is vaguely the ballpark for a good-quality video at # 1080p and 30 fps, but there's a lot of variation. We're just # giving the target bitrate: the second-to-second bitrate will @@ -47,36 +294,46 @@ # this on a nearly-still screen, though, then the actual bitrate # will be much lower, since there's not much motion to encode! "b": "6M", - # Let the encoder hold some frames for analysis, and flush them # later. This especially helps with the hardware-accelerated # codecs. "rc-lookahead": "40", } -# There are a lot of different places in a video encoding pipeline -# where time_base matters, and they don't necessarily have to be the -# same, so the time base has to be set on several objects. In this -# program, we do use a common time base of 1/90000 seconds everywhere. -# This is a common standard, from the MPEG world. + TIME_BASE = Fraction(1, 90000) LOGGER = logging.getLogger("video-capture") + def video_capture( fps: int, sct: mss.base.MSSBase, monitor: mss.models.Monitor, shutdown_requested: Event, ) -> Generator[tuple[mss.screenshot.ScreenShot, float], None, None]: + # Keep track of the time when we want to get the next frame. We + # limit the frame time this way instead of sleeping 1/30 sec each + # frame, since we want to also account for the time taken to get + # the screenshot and other overhead. + # + # Repeatedly adding small floating-point numbers to a total does + # cause some numeric inaccuracies, but it's small enough for our + # purposes. The program would have to run for three months to + # accumulate one millisecond of inaccuracy. next_frame_at = time.monotonic() - capture_period = 1 / fps + + # Keep running this loop until the main thread says we should + # stop. while not shutdown_requested.is_set(): - # Wait until we're ready. + + # Wait until we're ready. This should, ideally, happen every + # 1/30 second. while (now := time.monotonic()) < next_frame_at: + # time.sleep(next_frame_at - now) - # Capture and yield a frame. + # Capture a frame, and send it to the next processing stage. screenshot = sct.grab(monitor) yield screenshot, now @@ -86,9 +343,9 @@ def video_capture( # a bit of "timing debt" in next_frame_at: it'll be a little # sooner than now + one frame. We'll hopefully be able to # catch up soon. - next_frame_at = next_frame_at + capture_period + next_frame_at = next_frame_at + (1 / fps) - # If we've accumulated over one frame's worth of catch-up, + # If we've accumulated over one frame's worth of timing debt, # then that will say that next_frame_at is sooner than now. 
# If we're accumulating too much debt, we want to wipe it out, # rather than having a huge burst of closely-spaced captures @@ -100,7 +357,7 @@ def video_capture( # desired capture period. if next_frame_at < now: missed_frames = floor((now - next_frame_at) * fps) - next_frame_at += (missed_frames + 1) * capture_period + next_frame_at += (missed_frames + 1) / fps def video_process( @@ -108,14 +365,43 @@ def video_process( tuple[mss.screenshot.ScreenShot, float] ], ) -> Generator[av.VideoFrame, None, None]: + # We track when the first first_frame_at: float | None = None + for screenshot, timestamp in screenshot_and_timestamp: + # A screenshot's pixel data can take a long time to copy. + # Just for the CPU to copy the bytes, on my hardware, takes + # about 3ms for a 4k screenshot. This means we want to be + # very careful about how we want to get the data from the + # ScreenShot object to the VideoFrame. + # + # In Python, there's a concept called a "buffer". This is a + # range of memory that can be shared between objects, so the + # objects don't have to copy the data. This is very common in + # libraries like NumPy that work with very large datasets, and + # interpret that data in different ways. + # + # The most common buffers are in extensions written in C, but + # Python objects of type memoryview, bytes, bytearray, and + # array.array are all buffers. The screenshot.bgra attribute + # is also a buffer. (Currently, it's a bytes object, but this + # may change in the future.) + # + # PyAV doesn't let you create a VideoFrame object directly + # from pixel data in a buffer. (It is possible to update the + # data in a VideoFrame to point to a different buffer, but + # that still allocates the memory first.) + # + # However, while it's not documented, PyAV does have the + # from_numpy_buffer method (separately from the from_ndarray + # method). This creates a VideoFrame that shares memory with + # a NumPy array. We tell NumPy to create a new ndarray that + # shares the screenshot's buffer, and create a VideoFrame that + # uses that buffer. ndarray = np.frombuffer(screenshot.bgra, dtype=np.uint8) ndarray = ndarray.reshape(screenshot.height, screenshot.width, 4) - # from_numpy_buffer isn't documented. from_ndarray is, but - # that copies the data. That's slow enough to slow things - # down to the point of being a real bottleneck! frame = av.VideoFrame.from_numpy_buffer(ndarray, format="bgra") + # Set the PTS and time base for the frame. if first_frame_at is None: first_frame_at = timestamp frame.pts = int((timestamp - first_frame_at) / TIME_BASE) @@ -133,7 +419,9 @@ def video_encode( yield video_stream.encode(None) -def show_stats(packet_batches: Iterable[Sequence[av.Packet]]) -> Iterable[Sequence[av.Packet]]: +def show_stats( + packet_batches: Iterable[Sequence[av.Packet]], +) -> Iterable[Sequence[av.Packet]]: """Display streaming statistics (FPS and throughput). Statistics are displayed over a 100-frame sliding window. @@ -157,9 +445,9 @@ def show_stats(packet_batches: Iterable[Sequence[av.Packet]]) -> Iterable[Sequen for packet in packet_batch: # The PTS would make more sense for logging FPS than the # DTS, but because of frame reordering, it makes the stats - # unstable. Using DTS consistently makes the timing quite - # stable, and over the 100-frame window, still quite - # precise. + # a little bit unstable. Using DTS consistently makes the + # timing quite stable, and over the 100-frame window, + # still quite precise. 
time_deque.append(packet.dts) bit_count = packet.size * 8 bit_count_deque.append(bit_count) @@ -177,9 +465,11 @@ def show_stats(packet_batches: Iterable[Sequence[av.Packet]]) -> Iterable[Sequen window_bits = sum(bit_count_deque) - bit_count_deque[-1] fps = window_frames / window_secs bits_per_sec = int(window_bits / window_secs) - line = (f"{running_minutes:02d}:{running_seconds:02d} " - f"frame {frame_count}: {fps:.2f} fps, " - f"{si_format(bits_per_sec, precision=2)}bps") + line = ( + f"{running_minutes:02d}:{running_seconds:02d} " + f"frame {frame_count}: {fps:.2f} fps, " + f"{si_format(bits_per_sec, precision=2)}bps" + ) this_status_len = len(line) full_line = f"\r{line}{' ' * (last_status_len - this_status_len)}" print(full_line, end="") @@ -191,7 +481,10 @@ def show_stats(packet_batches: Iterable[Sequence[av.Packet]]) -> Iterable[Sequen print(f"\r{' ' * last_status_len}\r", end="") -def mux(avmux: av.container.OutputContainer, packet_batches: Iterable[Sequence[av.Packet]]) -> None: +def mux( + avmux: av.container.OutputContainer, + packet_batches: Iterable[Sequence[av.Packet]], +) -> None: for packet_batch in packet_batches: avmux.mux(packet_batch) @@ -219,39 +512,42 @@ def main() -> None: description="Capture screen video to MP4 file" ) parser.add_argument( - "--fps", - type=int, - default=30, - help="frames per second (default: 30)" + "-f", "--fps", type=int, default=30, help="frames per second (default: 30)" ) monitor_group = parser.add_mutually_exclusive_group() monitor_group.add_argument( - "--monitor", + "-m", "--monitor", type=int, default=1, - help="monitor ID to capture (default: 1)" + help="monitor ID to capture (default: 1)", ) monitor_group.add_argument( - "--region", + "-r", "--region", type=parse_region, metavar="LEFT,TOP,RIGHT,BOTTOM", - help="region to capture as comma-separated coordinates" + help="region to capture as comma-separated coordinates", ) parser.add_argument( - "--codec", + "-c", "--codec", default="libx264", - help="video codec (default: libx264; try h264_nvenc for Nvidia hardware encoding)" + help="video codec (default: libx264; try h264_nvenc for Nvidia hardware encoding)", ) parser.add_argument( - "--output", + "-d", "--duration-secs", + type=float, + help="Duration to record (default: no limit)", + ) + parser.add_argument( + "-o", "--output", default="capture.mp4", - help="output filename (default: capture.mp4)" + help="output filename (default: capture.mp4)", ) args = parser.parse_args() fps = args.fps codec = args.codec filename = args.output + duration_secs = args.duration_secs with mss.mss() as sct: if args.region: @@ -292,10 +588,12 @@ def main() -> None: # Otherwise, there will be a software, CPU-side conversion # step when we send it our BGRx frames. We're actually # probably sending it frames in BGR0, not BGRA, but PyAV - # doesn't support reading frames in BGR0, only BGRA. - # H.264 doesn't support an alpha channel anyway, so we can - # just send it BGR0 frames and tell it they're BGRA. - if any(f.name == "bgra" for f in video_stream.codec.video_formats): + # doesn't claim to support reading frames in BGR0, only + # BGRA. H.264 doesn't support an alpha channel anyway, so + # we can just send it BGR0 frames and tell it they're BGRA. + if any( + f.name == "bgra" for f in video_stream.codec.video_formats + ): video_stream.pix_fmt = "bgra" # We open (initialize) the codec explicitly here. 
PyAV # will automatically open it the first time we call @@ -305,12 +603,6 @@ def main() -> None: video_stream.open() shutdown_requested = Event() - def sigint_handler(_signum: int, _frame: Any) -> None: - # The status line will typically be visible, so start - # a fresh line for this message. - print("\nShutting down") - shutdown_requested.set() - signal.signal(signal.SIGINT, sigint_handler) mailbox_screenshot: Mailbox[ tuple[mss.screenshot.ScreenShot, float] @@ -360,7 +652,7 @@ def sigint_handler(_signum: int, _frame: Any) -> None: stage_video_encode.start() stage_video_capture.start() - LOGGER.debug("Thread IDs:") + LOGGER.debug("Native thread IDs:") LOGGER.debug(" Capture: %s", stage_video_capture.native_id) LOGGER.debug(" Preprocess: %s", stage_video_process.native_id) LOGGER.debug(" Encode: %s", stage_video_encode.native_id) @@ -368,6 +660,29 @@ def sigint_handler(_signum: int, _frame: Any) -> None: print("Starting video capture. Press Ctrl-C to stop.") + old_sigint_handler = None + def sigint_handler(_signum: int, _frame: Any) -> None: + # Restore the default behavior, so if our shutdown + # doesn't work because of a bug in our code, the user + # can still press ^C again to terminate the program. + # (The default handler is also in + # signal.default_int_handler, but that's not + # documented.) + signal.signal(signal.SIGINT, old_sigint_handler) + # The status line will typically be visible, so start + # a fresh line for this message. + print("\nShutting down") + shutdown_requested.set() + signal.signal(signal.SIGINT, sigint_handler) + + if duration_secs is not None: + stage_video_capture.join(timeout=duration_secs) + # Either the join timed out, or we processed a ^C and + # requested it exit. Either way, it's safe to set the + # shutdown event again, and return to our normal + # processing loop. + shutdown_requested.set() + stage_video_capture.join() stage_video_process.join() stage_video_encode.join() @@ -375,8 +690,10 @@ def sigint_handler(_signum: int, _frame: Any) -> None: stage_mux.join() if codec != "libx264" and video_stream.reformatter is not None: - LOGGER.warning("Software encoder is in a hardware encoding " - "path; this may slow things down") + LOGGER.warning( + "Software encoder is in a hardware encoding " + "path; this may slow things down" + ) if __name__ == "__main__": From e788690a5feacbbf8d1c5490c34a845ab5a20858 Mon Sep 17 00:00:00 2001 From: Joel Holveck Date: Fri, 16 Jan 2026 09:55:36 +0000 Subject: [PATCH 04/16] Improve comments --- demos/video-capture.py | 495 ++++++++++++++++------------------------- 1 file changed, 195 insertions(+), 300 deletions(-) diff --git a/demos/video-capture.py b/demos/video-capture.py index ed43db7..ea28c02 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -1,260 +1,92 @@ #! /usr/bin/env python3 -# This demo isn't meant to be a comprehensive explanation of video -# encoding. There are, however, some concepts that are unavoidable -# when converting from a sequence of snapshots to a video file. We'll -# go over some of those here. +# This demo shows one common use case for MSS: capture the screen and +# write a real video file (MP4) rather than saving individual images. # -# The descriptions given here are simplified. It doesn't go into the -# more obscure details, like H.264 switching frames or the AAC priming -# delay. Nevertheless, this should be enough to get the concepts -# you'll need to understand and build on this demo. +# It’s intentionally not a full “video encoding” course. 
The goal is +# to explain the few concepts that show up throughout the program so +# you can read, tweak, and extend it. # +# What tools are we using? +# ------------------------ # -# libav -# ----- +# Most people first meet video encoding through the `ffmpeg` command. +# Under the hood, ffmpeg is built on the “libav*” C libraries. In this +# demo we use PyAV (`import av`), which is a Pythonic wrapper around +# those libraries. # -# If you care enough about video files to be reading this, you've -# probably used ffmpeg. This is a Swiss Army Knife of video file -# manipulation. -# -# The ffmpeg tool is based on several libraries, which are part of -# ffmpeg, and widely used elsewhere: -# -# - libavcodec: Encoding/decoding library -# - libavfilter: Graph-based frame editing library -# - libavformat: I/O and muxing/demuxing library -# - libavdevice: Special devices muxing/demuxing library -# - libavutil: Common utility library -# - libswresample: Audio resampling, format conversion and mixing -# - libswscale: Color conversion and scaling library -# -# In this demo, I just refer to these collectively as "libav". Think -# of these as the library version of ffmpeg. We mostly use libavcodec -# and libavformat, but that detail isn't something we see in Python: -# all these libraries are essentially one giant bundle as far as we -# care. -# -# The libav libaries are in C. We use the PyAV library. This is not -# simply bindings or a direct translation of the libav C API to -# Python, but rather, a library that's based on libav, but meant to be -# more Pythonic. -# -# [note: it's important to include the fact that pyav.org has outdated -# docs, since they show up prominently in Google searches. The link -# to the GitHub issue is just to tell people that the ] -# -# The docs for PyAV are at . -# The older docs at pyav.org are outdated; see +# PyAV docs: +# Note: the older docs at pyav.org are outdated; see # . +# Caveats: # -# There was briefly a fork called basewood-av, but it has since been -# discontinued and merged back into PyAV; see -# . Despite the -# domain name, pyav.basswood-io.com hosts the current official PyAV -# documentation, not fork-specific docs. -# -# The PyAV developers are separate from ffmpeg, and there is a bit of -# a difference in the approaches that PyAV takes. See also -# https://pyav.basswood-io.com/docs/stable/overview/caveats.html -# -# -# Container Files -# --------------- -# -# A single file, like kittycat.mp4, is called a "container" in media -# file terms. This is a collection of "streams" (sometimes called -# "elementary streams"), all woven together. -# -# It might contain just a video stream (like we do here), just an -# audio stream (like in a .m4a file, which is just a renamed .mp4 -# file), or both (most common). It might also contain several of -# each; for instance, different languages will usually be in separate -# audio streams. There are other stream types, like subtitles, as -# well. -# -# Weaving these streams together is called "multiplexing", or "muxing" -# for short. Each stream's data gets bundled into "chunks" that are -# typically called "packets". The container keeps packets from the -# same time, but different streams, close to each other in the file. -# -# (By the way, the term "packet" is a holdover from MPEG-2 and before. -# Technically, MP4 files don't have packets: they have chunks, which -# can hold AAC frames, H.264 NALs, etc. The data that used to be in -# MPEG-2 packet headers is now in MP4 tables. 
To keep the terminology -# consistent between codecs and container formats, libav refers to the -# objects encapsulating all this as packets, regardless of the codec -# or container format.) -# -# For instance, at the beginning of the file, you might have one audio -# packet covering the first 21 ms, then a subtitle packet covering the -# first several seconds, then seven video packets each covering 3 ms, -# followed by another audio packet for the next 21 ms, and so on. -# +# Containers, streams, and codecs +# ------------------------------- # -# Video Codecs -# ------------ +# A file like `capture.mp4` is a *container*: it holds one or more +# *streams* (usually video and/or audio). This demo writes one video +# stream. # -# Within an MP4 file, the video can be stored in a lot of different -# formats. These are the most common: +# The container interleaves (“muxes”) stream data so players can read +# everything in timestamp order. libav calls those pieces “packets”. +# (In MP4 they’re not literally network-style packets; the term is a +# longstanding libav abstraction.) # -# - MPEG-2: used by DVDs, not much else anymore. -# - MPEG-4 Part 2: also known as DivX. Very popular in the early 2000s, -# not seen much anymore except older archives. -# - H.264: commonly used by BluRay, many streaming services, and many -# MP4 files in the wild. -# - H.265: increasingly used, but not supported by older hardware. -# - AV1 and VP9: used by some streaming services; hardware support -# varies, so these are typically offered alongside H.264 (or H.265) -# as fallbacks. +# A *codec* is the algorithm that compresses/decompresses a stream. +# For MP4 video, common codecs include H.264 and H.265. This demo +# defaults to H.264 via `libx264`, because it’s widely supported. You +# can switch to hardware encoders (e.g. `h264_nvenc`) if available. # -# These are all stream formats. There are many libraries that can -# create these files. These libraries are known as "codecs". In -# some contexts, the word "codec" is also used to name the stream -# format itself, so "H.264" might sometimes be called a codec. +# Frames and frame reordering (I/P/B) +# ---------------------------------- # -# In this demo, we use H.264, since it's the most common. You can -# also specify other codecs. +# Video is encoded as a sequence of frames: +# - I-frames: complete images. +# - P-frames: changes from previous frames. +# - B-frames: changes predicted using both past *and future* frames. # -# In ffmpeg, and the av libraries that we use here, the best codec for -# H.264 that doesn't require any specific hardware is libx264. There -# are also faster ones that are hardware-accelerated, such as -# h264_nvenc which uses specialized chips on Nvidia video cards. +# B-frames are why “the order frames are encoded/decoded” can differ +# from “the order frames are shown”. That leads directly to timestamps. # +# Timestamps (PTS/DTS) +# -------------------- # -# Frame Types -# ----------- +# Every frame has a *presentation timestamp* (PTS): when the viewer +# should see it. # -# Reference: https://en.wikipedia.org/wiki/Video_compression_picture_types +# Encoders may output packets in a different order due to B-frames. +# Those packets also have a *decode timestamp* (DTS): when the decoder +# must decode them so the PTS schedule can be met. # -# [Note: We can probably just give a brief description of the frame -# types.] +# In this demo we set PTS on `VideoFrame`s and let libav/PyAV propagate +# timestamps into the encoded packets. 
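+#
+# A quick way to see the PTS/DTS distinction (a hedged sketch, not
+# part of this demo) is to demux a file that has B-frames and compare
+# the two timestamps on each packet:
+#
+#     import av
+#     with av.open("capture.mp4") as container:
+#         for packet in container.demux(video=0):
+#             print(packet.dts, packet.pts)
+#
+# The DTS column increases monotonically, while the PTS column jumps
+# around wherever frames were reordered.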
# -# The reason that video files can compress so well, much better than -# storing a JPEG for each frame, is that the file often can describe -# just the motion. In a video of a cat meowing, the first frame will -# have everything that's visible: the room in the background, the -# entire cat, the whole thing. We call a video frame that stores the -# whole picture an "I-frame". -# -# But the second frame just has to talk about what's changed in that -# 1/30 sec: it can just say that the tail moved this much to the left, -# the eyes closed slightly, what the now-visible bits of the eyelids -# look like, what's changed about the ear when it moved, etc. We call -# this sort of frame, one that just stores the differences from a -# previous frame, a "P-frame". -# -# We still want to refresh the whole picture from scratch from time to -# time. Since the differences between video frames are compressed, -# they're also imperfect. Over time, these imperfections can -# accumulate. Also, sometimes a frame may have been lost between when -# we store it and when the viewer sees it, such as if we made a DVD -# that later got scratched; we want to let the viewer recover from -# such a situation. To keep things clean, we sometimes send out a new -# I-frame, redrawing the whole picture anew. This normally happens -# about every 0.5 to 2 seconds, depending on the program's purpose. -# The group of pictures starting with a fresh I-frame is, -# straightforwardly enough, called a "group of pictures" (GOP). -# -# Sometimes, it's useful for a frame to give motion based not just on -# the past, but also the future. For instance, when the cat's mouth -# first starts to open, you might want to say "look ahead at how the -# inside of the mouth looks when it's totally open, and draw just this -# tiny sliver of it now." These are called "B-frames". -# -# A GOP usually arranges these frame types in a cadence, like -# IBBPBBPBB.... The specifics are up to the encoder, but the user can -# normally configure it to some degree. -# -# -# Timestamps -# ---------- -# -# [note: Managing the PTS is a big part of the code, so I want to -# describe it. The DTS is also worth at least highlighting, as is the -# fact that packets from the encoder may be in a different order than -# presentation order.] -# -# In a video file, time is very important. It's used to synchronize -# audio and video, to prevent frame timing quantization from causing -# the clock to drift, and many other purposes. -# -# The time at which each frame should be shown is called its -# "presentation time stamp", or "PTS". Normally, the PTS of the first -# frame is 0, and the rest of the video file is based on that. -# -# Because B-frames can require future frames to interpret, the future -# frames they depend on have to be decoded first. That means that the -# order in which frames are decoded can be different from the order in -# which they are presented. This leads to a second timestamp on each -# frame: the "decoding time stamp", or "DTS". -# -# Different container formats store the timestamps in different -# places: the container's structures, the packet headers, the streams, -# etc. Because of this, there are multiple places that carry -# timestamps. You can just set the timestamp on the video frame, and -# libav will propagate it from there to the packets and so forth. 
-# -# -# Time Base +# Time base # --------- # -# [note: Most people new to video encoding may assume that timestamps -# are in float or integer nanoseconds or something, so the concept of -# the time base is significant. We also attach it to multiple -# objects: the container object, the video stream context object, and -# each frame. So, the reason we do that is worth noting. Preserve the -# link to the PyAV docs.] -# -# In most video file formats, the time isn't specified in predefined -# units like nanoseconds. Instead, in your video file, you specify -# the time units you're using, a fraction of a second. This is called -# your time base. +# Timestamps are integers, and their unit is a fraction of a second +# called the *time base*. For example, with a time base of 1/90000, +# a timestamp of 90000 means “1 second”. PyAV will convert between time +# bases when needed, but you must set them consistently where you +# generate timestamps. # -# There are a lot of different places in a video encoding pipeline -# where you set a time base: everywhere that might need to encode a -# timestamp. They don't necessarily have to be the same (PyAV will -# convert between the different time bases as needed), so the time -# base has to be set on several objects. See also -# +# See # -# In this demo, we use a common time base of 1/90000 sec everywhere. -# This is a common standard, from the MPEG world. It became a -# standard because it can exactly represent 24 fps (film), 25 fps -# (European TV), 30 fps (US TV, nominally), and 30000/1001 fps (about -# 29.97, US broadcast TV). +# This demo uses a time base of 1/90000 (a common MPEG-derived choice). # +# Performance (why multiple threads?) +# ---------------------------------- # -# Performance -# ----------- +# Capturing frames, converting them to `VideoFrame`s, encoding, and +# muxing are separate stages. This demo pipelines those stages across +# threads so that (for example) encoding can run while the next screen +# grab is happening. The slowest stage typically limits overall FPS. # -# This demo uses multiple threads to improve performance. These -# threads are pipelined; see the comments at the start of -# common/pipeline.py for information about that concept. -# -# In a pipelined design, the slowest stage usually sets the overall -# rate. Suppose you and your roommates are all doing the dishes: -# Alice collects dishes and scrapes off food, Bob washes the dishes, -# Carol rinses them, Dave dries them, and Evelyn puts them away. -# If the -# -# [note: A detailed description of pipelining threads is in -# common/pipeline.py. This section should discuss the stages -# we're using, and note that the encoding stage is usually the -# bottleneck.] -# -# -# -# [note: Not sure where to integrate this, but make sure the numbers -# are somewhere.] -# -# In one test, here's some numbers this program could achieve, on an -# idle system. This is just meant as a rough guide; your results will -# almost certainly vary significantly. -# - libx264, 1920x1080: 80 fps -# - libx264, 3840x2160: 18 fps -# - h264_nvenc, 1920x1080: 190 fps -# - h264_nvenc, 3840x2160: 41 fps +# On an idle system (rough guide; will vary widely): +# - libx264, 1920x1080: ~80 fps +# - libx264, 3840x2160: ~18 fps +# - h264_nvenc, 1920x1080: ~190 fps +# - h264_nvenc, 3840x2160: ~41 fps import argparse import logging @@ -283,7 +115,7 @@ # features that are widely supported, but not mandatory. "profile": "high", # The "medium" preset is as good of a preset as any for a demo - # like this. 
Different codecs have different presets; the the + # like this. Different codecs have different presets; the # h264_nvenc actually prefers "p4", but accepts "medium" as a # similar preset. "preset": "medium", @@ -313,7 +145,7 @@ def video_capture( shutdown_requested: Event, ) -> Generator[tuple[mss.screenshot.ScreenShot, float], None, None]: # Keep track of the time when we want to get the next frame. We - # limit the frame time this way instead of sleeping 1/30 sec each + # limit the frame time this way instead of sleeping 1/fps sec each # frame, since we want to also account for the time taken to get # the screenshot and other overhead. # @@ -326,11 +158,9 @@ def video_capture( # Keep running this loop until the main thread says we should # stop. while not shutdown_requested.is_set(): - # Wait until we're ready. This should, ideally, happen every - # 1/30 second. + # 1/fps second. while (now := time.monotonic()) < next_frame_at: - # time.sleep(next_frame_at - now) # Capture a frame, and send it to the next processing stage. @@ -365,42 +195,51 @@ def video_process( tuple[mss.screenshot.ScreenShot, float] ], ) -> Generator[av.VideoFrame, None, None]: - # We track when the first + # We track when the first frame happened so we can make PTS start at 0. + # Many video players and other tools expect that. first_frame_at: float | None = None for screenshot, timestamp in screenshot_and_timestamp: - # A screenshot's pixel data can take a long time to copy. - # Just for the CPU to copy the bytes, on my hardware, takes - # about 3ms for a 4k screenshot. This means we want to be - # very careful about how we want to get the data from the - # ScreenShot object to the VideoFrame. + # Avoiding extra pixel copies + # --------------------------- # - # In Python, there's a concept called a "buffer". This is a - # range of memory that can be shared between objects, so the - # objects don't have to copy the data. This is very common in - # libraries like NumPy that work with very large datasets, and - # interpret that data in different ways. + # Copying a full frame of pixels is expensive. On typical + # hardware, a plain CPU memcpy of a 4K BGRA image can cost on + # the order of ~3ms by itself, which is a big chunk of a + # 30fps budget (33ms) and an even bigger chunk of a 60fps + # budget (16.7ms). # - # The most common buffers are in extensions written in C, but - # Python objects of type memoryview, bytes, bytearray, and - # array.array are all buffers. The screenshot.bgra attribute - # is also a buffer. (Currently, it's a bytes object, but this - # may change in the future.) + # So we want to be careful about the *conversion* step from an + # MSS `ScreenShot` to a PyAV `VideoFrame`. Ideally, that step + # should reuse the same underlying bytes rather than creating + # additional intermediate copies. # - # PyAV doesn't let you create a VideoFrame object directly - # from pixel data in a buffer. (It is possible to update the - # data in a VideoFrame to point to a different buffer, but - # that still allocates the memory first.) + # Buffers in Python + # ----------------- # - # However, while it's not documented, PyAV does have the - # from_numpy_buffer method (separately from the from_ndarray - # method). This creates a VideoFrame that shares memory with - # a NumPy array. We tell NumPy to create a new ndarray that - # shares the screenshot's buffer, and create a VideoFrame that - # uses that buffer. + # Many Python objects expose their underlying memory via the + # "buffer protocol". 
A buffer is just a view of raw bytes that + # other libraries can interpret without copying. + # + # Common buffer objects include: `bytes`, `bytearray`, + # `memoryview`, and `array.array`. `screenshot.bgra` is also a + # buffer (currently it is a `bytes` object, though that detail + # may change in the future). + # + # Minimum-copy path: ScreenShot -> NumPy -> VideoFrame + # -------------------------------------------------- + # + # `np.frombuffer()` creates an ndarray *view* of an existing + # buffer (no copy). Reshaping also stays as a view. + # + # PyAV's `VideoFrame.from_ndarray()` always copies the data + # into a new frame-owned buffer. For this demo we use + # the undocumented `VideoFrame.from_numpy_buffer()`, which creates a + # `VideoFrame` that shares memory with the ndarray. ndarray = np.frombuffer(screenshot.bgra, dtype=np.uint8) ndarray = ndarray.reshape(screenshot.height, screenshot.width, 4) frame = av.VideoFrame.from_numpy_buffer(ndarray, format="bgra") + # Set the PTS and time base for the frame. if first_frame_at is None: first_frame_at = timestamp @@ -430,7 +269,8 @@ def show_stats( any individual stage. """ # The start time is only used for showing the clock. The actual - # timing stats all use the times we put in the captured frames. + # timing stats use packet timestamps (ultimately derived from the + # frame PTS we compute during capture). start_time = time.monotonic() time_deque: deque[int] = deque(maxlen=100) bit_count_deque: deque[int] = deque(maxlen=100) @@ -443,11 +283,20 @@ def show_stats( yield packet_batch for packet in packet_batch: - # The PTS would make more sense for logging FPS than the - # DTS, but because of frame reordering, it makes the stats - # a little bit unstable. Using DTS consistently makes the - # timing quite stable, and over the 100-frame window, - # still quite precise. + # FPS from timestamps: why DTS, not PTS? + # + # Intuitively, you'd expect to compute FPS from PTS (the + # time the viewer should *see* each frame). But encoders + # can reorder frames internally (especially with B-frames), + # so packets may come out in a different order than PTS. + # + # If we update a sliding window with out-of-order PTS + # values, the window start/end can "wiggle" even when the + # pipeline is steady, which makes the displayed FPS noisy. + # + # DTS is the time order the decoder must process packets. + # Packets are emitted in DTS order, so using DTS gives a + # stable, monotonic timeline for the sliding window. time_deque.append(packet.dts) bit_count = packet.size * 8 bit_count_deque.append(bit_count) @@ -474,9 +323,11 @@ def show_stats( full_line = f"\r{line}{' ' * (last_status_len - this_status_len)}" print(full_line, end="") last_status_len = this_status_len - # It's difficult to correctly print the fps and bitrate near the - # tail, since we get the last many frames as a big batch. Instead - # of leaving misleading information on the screen, we erase the + # Near shutdown the encoder flush can emit packets in large bursts, + # and we also throttle status updates (to avoid spamming the + # terminal). That combination means the last displayed line may be + # stale or not representative of the final frames. Rather than + # leaving potentially misleading numbers on screen, erase the # status display. 
print(f"\r{' ' * last_status_len}\r", end="") @@ -512,33 +363,42 @@ def main() -> None: description="Capture screen video to MP4 file" ) parser.add_argument( - "-f", "--fps", type=int, default=30, help="frames per second (default: 30)" + "-f", + "--fps", + type=int, + default=30, + help="frames per second (default: 30)", ) monitor_group = parser.add_mutually_exclusive_group() monitor_group.add_argument( - "-m", "--monitor", + "-m", + "--monitor", type=int, default=1, help="monitor ID to capture (default: 1)", ) monitor_group.add_argument( - "-r", "--region", + "-r", + "--region", type=parse_region, metavar="LEFT,TOP,RIGHT,BOTTOM", help="region to capture as comma-separated coordinates", ) parser.add_argument( - "-c", "--codec", + "-c", + "--codec", default="libx264", help="video codec (default: libx264; try h264_nvenc for Nvidia hardware encoding)", ) parser.add_argument( - "-d", "--duration-secs", + "-d", + "--duration-secs", type=float, help="Duration to record (default: no limit)", ) parser.add_argument( - "-o", "--output", + "-o", + "--output", default="capture.mp4", help="output filename (default: capture.mp4)", ) @@ -577,23 +437,32 @@ def main() -> None: ) video_stream.width = monitor["width"] video_stream.height = monitor["height"] - # Setting the time_base on the stream is possible, but - # isn't what we need (for reasons I'm unclear on): we need - # to set it on the codec context. + # There are multiple time bases in play (stream, + # codec context, per-frame). Depending on the container + # and codec, some of these might be ignored or overridden. + # We set the desired time base consistently everywhere, + # so that the saved timestamps are correct regardless of what + # format we're saving to. + video_stream.time_base = TIME_BASE video_stream.codec_context.time_base = TIME_BASE - # Assigning the pix_fmt is telling the video encoder what - # we'll be sending it, not necessarily what it will - # output. If the codec supports BGRx inputs, then that's - # the most efficient way for us to send it our frames. - # Otherwise, there will be a software, CPU-side conversion - # step when we send it our BGRx frames. We're actually - # probably sending it frames in BGR0, not BGRA, but PyAV - # doesn't claim to support reading frames in BGR0, only - # BGRA. H.264 doesn't support an alpha channel anyway, so - # we can just send it BGR0 frames and tell it they're BGRA. - if any( - f.name == "bgra" for f in video_stream.codec.video_formats - ): + # `pix_fmt` here describes the pixel format we will *feed* + # into the encoder (not necessarily what the encoder will + # store in the bitstream). H.264 encoders ultimately + # convert to a YUV format internally. + # + # If the encoder accepts BGRA input (e.g., h264_nvenc), we + # can hand it MSS's BGRA frames directly and avoid an extra + # pre-conversion step on our side. + # + # If the encoder doesn't accept BGRA input (e.g., libx264), + # PyAV will insert a conversion step automatically. In that + # case, we let the codec choose the pix_fmt it's going to + # expect. + # + # Note: the alpha channel is ignored by H.264. We may + # effectively be sending BGRx/BGR0. But PyAV's VideoFrame + # only exposes "bgra" as the closest supported format. + if any(f.name == "bgra" for f in video_stream.codec.video_formats): video_stream.pix_fmt = "bgra" # We open (initialize) the codec explicitly here. 
PyAV # will automatically open it the first time we call @@ -658,9 +527,8 @@ def main() -> None: LOGGER.debug(" Encode: %s", stage_video_encode.native_id) LOGGER.debug(" Mux: %s", stage_mux.native_id) - print("Starting video capture. Press Ctrl-C to stop.") + old_sigint_handler = signal.getsignal(signal.SIGINT) - old_sigint_handler = None def sigint_handler(_signum: int, _frame: Any) -> None: # Restore the default behavior, so if our shutdown # doesn't work because of a bug in our code, the user @@ -673,7 +541,10 @@ def sigint_handler(_signum: int, _frame: Any) -> None: # a fresh line for this message. print("\nShutting down") shutdown_requested.set() - signal.signal(signal.SIGINT, sigint_handler) + + old_sigint_handler = signal.signal(signal.SIGINT, sigint_handler) + + print("Starting video capture. Press Ctrl-C to stop.") if duration_secs is not None: stage_video_capture.join(timeout=duration_secs) @@ -682,17 +553,41 @@ def sigint_handler(_signum: int, _frame: Any) -> None: # shutdown event again, and return to our normal # processing loop. shutdown_requested.set() - + stage_video_capture.join() stage_video_process.join() stage_video_encode.join() stage_show_stats.join() stage_mux.join() - if codec != "libx264" and video_stream.reformatter is not None: + # PyAV may insert an implicit conversion step between the frames we + # provide and what the encoder actually accepts (pixel format, + # colorspace, etc.). When that happens, `video_stream.reformatter` + # gets set. + # + # This is useful to know for performance: those conversions are + # typically CPU-side work and can become a bottleneck. + # Hardware-accelerated encoders, such as `h264_nvenc`, often accept + # BGRx, and can perform the conversion using specialized hardware. + # + # We already know that libx264 doesn't accept RGB input, so + # we don't warn about that. (There is a libx264rgb, but that + # uses a H.264 format that is not widely supported.) + # We just want to warn about other + # codecs, since some of them might have ways to use BGRx input, + # and the programmer might want to investigate. + # + # Note: `reformatter` is created lazily, so it may only be set after + # frames have been sent through the encoder, which is why we check + # it at the end. + if video_stream.reformatter is not None and codec != "libx264": LOGGER.warning( - "Software encoder is in a hardware encoding " - "path; this may slow things down" + "PyAV inserted a CPU-side pixel-format/colorspace conversion " + "step (video_stream.reformatter is set) while encoding with %s; " + "this can reduce FPS. Check the acceptable pix_fmts for this codec, " + "and see if one of them can accept some variation of BGRx input " + "directly.", + codec, ) From eed9245badec16caf170c4d05c874b49c8a2b4dc Mon Sep 17 00:00:00 2001 From: Joel Holveck Date: Fri, 16 Jan 2026 10:02:09 +0000 Subject: [PATCH 05/16] Add a comment about color spaces --- demos/video-capture.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/demos/video-capture.py b/demos/video-capture.py index ea28c02..836037d 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -435,6 +435,13 @@ def main() -> None: video_stream = avmux.add_stream( codec, rate=fps, options=CODEC_OPTIONS ) + # Ideally, we would set attributes such as colorspace, + # color_range, color_primaries, and color_trc here to + # describe the colorspace accurately. This would be + # significant if we're capturing on a Display P3 Mac, while + # the video file is on an sRGB Windows machine. 
Currently, + # MSS doesn't give us that information, so we skip it for + # now. video_stream.width = monitor["width"] video_stream.height = monitor["height"] # There are multiple time bases in play (stream, From 03da28dce335fc452037c28296ddfe65bab04561 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Fri, 16 Jan 2026 17:29:15 -0800 Subject: [PATCH 06/16] Add notes about colorspace tagging Also reformat the comments. --- demos/video-capture.py | 194 ++++++++++++++++++++++++----------------- 1 file changed, 116 insertions(+), 78 deletions(-) diff --git a/demos/video-capture.py b/demos/video-capture.py index 836037d..3ffd1e8 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -3,7 +3,7 @@ # This demo shows one common use case for MSS: capture the screen and # write a real video file (MP4) rather than saving individual images. # -# It’s intentionally not a full “video encoding” course. The goal is +# It's intentionally not a full "video encoding" course. The goal is # to explain the few concepts that show up throughout the program so # you can read, tweak, and extend it. # @@ -11,9 +11,9 @@ # ------------------------ # # Most people first meet video encoding through the `ffmpeg` command. -# Under the hood, ffmpeg is built on the “libav*” C libraries. In this -# demo we use PyAV (`import av`), which is a Pythonic wrapper around -# those libraries. +# Under the hood, ffmpeg is built on the "libav*" C libraries. In +# this demo we use PyAV (`import av`), which is a Pythonic wrapper +# around those libraries. # # PyAV docs: # Note: the older docs at pyav.org are outdated; see @@ -24,17 +24,17 @@ # ------------------------------- # # A file like `capture.mp4` is a *container*: it holds one or more -# *streams* (usually video and/or audio). This demo writes one video +# *streams* (usually video and/or audio). This demo writes one video # stream. # -# The container interleaves (“muxes”) stream data so players can read -# everything in timestamp order. libav calls those pieces “packets”. -# (In MP4 they’re not literally network-style packets; the term is a +# The container interleaves ("muxes") stream data so players can read +# everything in timestamp order. libav calls those pieces "packets". +# (In MP4 they're not literally network-style packets; the term is a # longstanding libav abstraction.) # # A *codec* is the algorithm that compresses/decompresses a stream. -# For MP4 video, common codecs include H.264 and H.265. This demo -# defaults to H.264 via `libx264`, because it’s widely supported. You +# For MP4 video, common codecs include H.264 and H.265. This demo +# defaults to H.264 via `libx264`, because it's widely supported. You # can switch to hardware encoders (e.g. `h264_nvenc`) if available. # # Frames and frame reordering (I/P/B) @@ -45,8 +45,9 @@ # - P-frames: changes from previous frames. # - B-frames: changes predicted using both past *and future* frames. # -# B-frames are why “the order frames are encoded/decoded” can differ -# from “the order frames are shown”. That leads directly to timestamps. +# B-frames are why "the order frames are encoded/decoded" can differ +# from "the order frames are shown". That leads directly to +# timestamps. # # Timestamps (PTS/DTS) # -------------------- @@ -58,15 +59,15 @@ # Those packets also have a *decode timestamp* (DTS): when the decoder # must decode them so the PTS schedule can be met. # -# In this demo we set PTS on `VideoFrame`s and let libav/PyAV propagate -# timestamps into the encoded packets. 
+# In this demo we set PTS on `VideoFrame`s and let libav/PyAV +# propagate timestamps into the encoded packets. # # Time base # --------- # # Timestamps are integers, and their unit is a fraction of a second -# called the *time base*. For example, with a time base of 1/90000, -# a timestamp of 90000 means “1 second”. PyAV will convert between time +# called the *time base*. For example, with a time base of 1/90000, a +# timestamp of 90000 means "1 second". PyAV will convert between time # bases when needed, but you must set them consistently where you # generate timestamps. # @@ -78,9 +79,9 @@ # ---------------------------------- # # Capturing frames, converting them to `VideoFrame`s, encoding, and -# muxing are separate stages. This demo pipelines those stages across +# muxing are separate stages. This demo pipelines those stages across # threads so that (for example) encoding can run while the next screen -# grab is happening. The slowest stage typically limits overall FPS. +# grab is happening. The slowest stage typically limits overall FPS. # # On an idle system (rough guide; will vary widely): # - libx264, 1920x1080: ~80 fps @@ -135,6 +136,10 @@ TIME_BASE = Fraction(1, 90000) +# Currently, MSS doesn't give us information about the display's +# colorspace. See where this is used below for more information. +DISPLAY_IS_SRGB = False + LOGGER = logging.getLogger("video-capture") @@ -195,8 +200,8 @@ def video_process( tuple[mss.screenshot.ScreenShot, float] ], ) -> Generator[av.VideoFrame, None, None]: - # We track when the first frame happened so we can make PTS start at 0. - # Many video players and other tools expect that. + # We track when the first frame happened so we can make PTS start + # at 0. Many video players and other tools expect that. first_frame_at: float | None = None for screenshot, timestamp in screenshot_and_timestamp: @@ -205,9 +210,9 @@ def video_process( # # Copying a full frame of pixels is expensive. On typical # hardware, a plain CPU memcpy of a 4K BGRA image can cost on - # the order of ~3ms by itself, which is a big chunk of a - # 30fps budget (33ms) and an even bigger chunk of a 60fps - # budget (16.7ms). + # the order of ~3ms by itself, which is a big chunk of a 30fps + # budget (33ms) and an even bigger chunk of a 60fps budget + # (16.7ms). # # So we want to be careful about the *conversion* step from an # MSS `ScreenShot` to a PyAV `VideoFrame`. Ideally, that step @@ -218,24 +223,24 @@ def video_process( # ----------------- # # Many Python objects expose their underlying memory via the - # "buffer protocol". A buffer is just a view of raw bytes that - # other libraries can interpret without copying. + # "buffer protocol". A buffer is just a view of raw bytes + # that other libraries can interpret without copying. # # Common buffer objects include: `bytes`, `bytearray`, - # `memoryview`, and `array.array`. `screenshot.bgra` is also a - # buffer (currently it is a `bytes` object, though that detail - # may change in the future). + # `memoryview`, and `array.array`. `screenshot.bgra` is also + # a buffer (currently it is a `bytes` object, though that + # detail may change in the future). # # Minimum-copy path: ScreenShot -> NumPy -> VideoFrame - # -------------------------------------------------- + # ---------------------------------------------------- # # `np.frombuffer()` creates an ndarray *view* of an existing # buffer (no copy). Reshaping also stays as a view. 
# # PyAV's `VideoFrame.from_ndarray()` always copies the data - # into a new frame-owned buffer. For this demo we use - # the undocumented `VideoFrame.from_numpy_buffer()`, which creates a - # `VideoFrame` that shares memory with the ndarray. + # into a new frame-owned buffer. For this demo we use the + # undocumented `VideoFrame.from_numpy_buffer()`, which creates + # a `VideoFrame` that shares memory with the ndarray. ndarray = np.frombuffer(screenshot.bgra, dtype=np.uint8) ndarray = ndarray.reshape(screenshot.height, screenshot.width, 4) frame = av.VideoFrame.from_numpy_buffer(ndarray, format="bgra") @@ -245,6 +250,14 @@ def video_process( first_frame_at = timestamp frame.pts = int((timestamp - first_frame_at) / TIME_BASE) frame.time_base = TIME_BASE + + # If we know the colorspace of our frames, mark them + # accordingly. See the comment where we set these attributes + # on video_stream for details. + if DISPLAY_IS_SRGB: + frame.colorspace = av.video.reformatter.Colorspace.ITU709 + frame.color_range = av.video.reformatter.ColorRange.JPEG + yield frame @@ -287,8 +300,9 @@ def show_stats( # # Intuitively, you'd expect to compute FPS from PTS (the # time the viewer should *see* each frame). But encoders - # can reorder frames internally (especially with B-frames), - # so packets may come out in a different order than PTS. + # can reorder frames internally (especially with + # B-frames), so packets may come out in a different order + # than PTS. # # If we update a sliding window with out-of-order PTS # values, the window start/end can "wiggle" even when the @@ -323,11 +337,11 @@ def show_stats( full_line = f"\r{line}{' ' * (last_status_len - this_status_len)}" print(full_line, end="") last_status_len = this_status_len - # Near shutdown the encoder flush can emit packets in large bursts, - # and we also throttle status updates (to avoid spamming the - # terminal). That combination means the last displayed line may be - # stale or not representative of the final frames. Rather than - # leaving potentially misleading numbers on screen, erase the + # Near shutdown the encoder flush can emit packets in large + # bursts, and we also throttle status updates (to avoid spamming + # the terminal). That combination means the last displayed line + # may be stale or not representative of the final frames. Rather + # than leaving potentially misleading numbers on screen, erase the # status display. print(f"\r{' ' * last_status_len}\r", end="") @@ -435,20 +449,44 @@ def main() -> None: video_stream = avmux.add_stream( codec, rate=fps, options=CODEC_OPTIONS ) + # Ideally, we would set attributes such as colorspace, # color_range, color_primaries, and color_trc here to - # describe the colorspace accurately. This would be - # significant if we're capturing on a Display P3 Mac, while - # the video file is on an sRGB Windows machine. Currently, - # MSS doesn't give us that information, so we skip it for - # now. + # describe the colorspace accurately. Otherwise, the + # player has to guess whether this was recorded on an sRGB + # Windows machine, a Display P3 Mac, or if it's using + # linear RGB. Currently, MSS doesn't give us colorspace + # information (DISPLAY_IS_SRGB is always False in this + # demo), so we don't try to specify a particular + # colorspace. However, if your application knows the + # colorspace you're recording from, then you can set those + # attributes on the stream and the frames accordingly. 
+ # + # These properties on the stream (actually, they're + # attached to its CodecContext) are used to tell the + # stream and container how to label the video stream's + # colorspace. There are similar attributes on the frame + # itself; those are used to identify its colorspace, so + # the codec can do the correct RGB to YUV conversion. + if DISPLAY_IS_SRGB: + video_stream.color_primaries = 1 # libavutil's AVCOL_PRI_BT709; PyAV doesn't define constants for color primaries. + video_stream.colorspace = av.video.reformatter.Colorspace.ITU709 # More commonly called BT.709 + # The "JPEG" color range is saying that we're using a + # color range like a computer, not like broadcast TV. + video_stream.color_range = av.video.reformatter.ColorRange.JPEG + # Technically, sRGB's transformation characteristic is + # AVCOL_TRC_IEC61966_2_1. It's nearly the same as + # BT.709's TRC, so some video encoders will tag it as + # AVCOL_TRC_BT709 (1) instead. + video_stream.color_trc = 13 # libavutil's AVCOL_TRC_IEC61966_2_1; PyAV doesn't define constants for TRCs. + video_stream.width = monitor["width"] video_stream.height = monitor["height"] - # There are multiple time bases in play (stream, - # codec context, per-frame). Depending on the container - # and codec, some of these might be ignored or overridden. - # We set the desired time base consistently everywhere, - # so that the saved timestamps are correct regardless of what + # There are multiple time bases in play (stream, codec + # context, per-frame). Depending on the container and + # codec, some of these might be ignored or overridden. We + # set the desired time base consistently everywhere, so + # that the saved timestamps are correct regardless of what # format we're saving to. video_stream.time_base = TIME_BASE video_stream.codec_context.time_base = TIME_BASE @@ -458,13 +496,13 @@ def main() -> None: # convert to a YUV format internally. # # If the encoder accepts BGRA input (e.g., h264_nvenc), we - # can hand it MSS's BGRA frames directly and avoid an extra - # pre-conversion step on our side. + # can hand it MSS's BGRA frames directly and avoid an + # extra pre-conversion step on our side. # - # If the encoder doesn't accept BGRA input (e.g., libx264), - # PyAV will insert a conversion step automatically. In that - # case, we let the codec choose the pix_fmt it's going to - # expect. + # If the encoder doesn't accept BGRA input (e.g., + # libx264), PyAV will insert a conversion step + # automatically. In that case, we let the codec choose + # the pix_fmt it's going to expect. # # Note: the alpha channel is ignored by H.264. We may # effectively be sending BGRx/BGR0. But PyAV's VideoFrame @@ -567,34 +605,34 @@ def sigint_handler(_signum: int, _frame: Any) -> None: stage_show_stats.join() stage_mux.join() - # PyAV may insert an implicit conversion step between the frames we - # provide and what the encoder actually accepts (pixel format, - # colorspace, etc.). When that happens, `video_stream.reformatter` - # gets set. + # PyAV may insert an implicit conversion step between the + # frames we provide and what the encoder actually accepts + # (pixel format, colorspace, etc.). When that happens, + # `video_stream.reformatter` gets set. # - # This is useful to know for performance: those conversions are - # typically CPU-side work and can become a bottleneck. - # Hardware-accelerated encoders, such as `h264_nvenc`, often accept - # BGRx, and can perform the conversion using specialized hardware. 
+ # This is useful to know for performance: those + # conversions are typically CPU-side work and can become a + # bottleneck. Hardware-accelerated encoders, such as + # `h264_nvenc`, often accept BGRx, and can perform the + # conversion using specialized hardware. # - # We already know that libx264 doesn't accept RGB input, so - # we don't warn about that. (There is a libx264rgb, but that - # uses a H.264 format that is not widely supported.) - # We just want to warn about other - # codecs, since some of them might have ways to use BGRx input, - # and the programmer might want to investigate. + # We already know that libx264 doesn't accept RGB input, + # so we don't warn about that. (There is a libx264rgb, + # but that writes to a different H.264 format.) We just + # want to warn about other codecs, since some of them + # might have ways to use BGRx input, and the programmer + # might want to investigate. # - # Note: `reformatter` is created lazily, so it may only be set after - # frames have been sent through the encoder, which is why we check - # it at the end. + # Note: `reformatter` is created lazily, so it may only be + # set after frames have been sent through the encoder, + # which is why we check it at the end. if video_stream.reformatter is not None and codec != "libx264": LOGGER.warning( - "PyAV inserted a CPU-side pixel-format/colorspace conversion " - "step (video_stream.reformatter is set) while encoding with %s; " - "this can reduce FPS. Check the acceptable pix_fmts for this codec, " - "and see if one of them can accept some variation of BGRx input " - "directly.", - codec, + "PyAV inserted a CPU-side pixel-format/colorspace " + "conversion step; this can reduce FPS. Check the " + "acceptable pix_fmts for this codec, and see if one " + "of them can accept some variation of BGRx input " + "directly." ) From 883f365174bf8c5aed1d6d1d193d1493f94de673 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Fri, 16 Jan 2026 17:43:46 -0800 Subject: [PATCH 07/16] Add a pointer to the comments in pipeline.py --- demos/video-capture.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/demos/video-capture.py b/demos/video-capture.py index 3ffd1e8..83ee4eb 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -81,7 +81,11 @@ # Capturing frames, converting them to `VideoFrame`s, encoding, and # muxing are separate stages. This demo pipelines those stages across # threads so that (for example) encoding can run while the next screen -# grab is happening. The slowest stage typically limits overall FPS. +# grab is happening. The comments at the top of common/pipeline.py +# describe pipelining in detail. +# +# The slowest stage typically limits overall FPS. Usually, that's the +# encoder. # # On an idle system (rough guide; will vary widely): # - libx264, 1920x1080: ~80 fps From dfe27840b4371ce0c88844e9df23ae077f8c5f76 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Sat, 17 Jan 2026 01:49:59 -0800 Subject: [PATCH 08/16] Add comments and help strings about using other codecs --- demos/video-capture.py | 44 +++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/demos/video-capture.py b/demos/video-capture.py index 83ee4eb..979bebe 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -113,11 +113,17 @@ from common.pipeline import Mailbox, PipelineStage -# These are the options you'd give to ffmpeg that would affect the -# video codec. 
+# These are the options you'd give to ffmpeg that it sends to the +# video codec. The options you can use here can be listed with +# `ffmpeg -help encoder=libx264`, or whatever encoder you're using for +# this demo's `--codec` flag. The options for each encoder are described +# in more detail in `man ffmpeg-codecs`. CODEC_OPTIONS = { # The "high" profile means that the encoder can use some H.264 - # features that are widely supported, but not mandatory. + # features that are widely supported, but not mandatory. If + # you're using a codec other than H.264, you'll need to comment + # out this line: the relevant features are already part of the + # main profile in later codecs like H.265, VP8, VP9, and AV1. "profile": "high", # The "medium" preset is as good of a preset as any for a demo # like this. Different codecs have different presets; the @@ -406,7 +412,12 @@ def main() -> None: "-c", "--codec", default="libx264", - help="video codec (default: libx264; try h264_nvenc for Nvidia hardware encoding)", + help=( + 'video codec implementation, same as the ffmpeg "-c:v" flag. ' + 'Run "python3 -m av --codecs" for a full list. ' + "(default: libx264. Try h264_nvenc for Nvidia " + "hardware encoding.)" + ), ) parser.add_argument( "-d", @@ -439,6 +450,8 @@ def main() -> None: else: monitor = sct.monitors[args.monitor] + # We don't pass the container format to av.open here, so it + # will choose it based on the extension: .mp4, .mkv, etc. with av.open(filename, "w") as avmux: # We could initialize video_stream in video_encode, but # doing it here means that we can open it before starting @@ -473,16 +486,25 @@ def main() -> None: # itself; those are used to identify its colorspace, so # the codec can do the correct RGB to YUV conversion. if DISPLAY_IS_SRGB: - video_stream.color_primaries = 1 # libavutil's AVCOL_PRI_BT709; PyAV doesn't define constants for color primaries. - video_stream.colorspace = av.video.reformatter.Colorspace.ITU709 # More commonly called BT.709 + # color_primaries=1 is libavutil's AVCOL_PRI_BT709; + # PyAV doesn't define named constants for color + # primaries. + video_stream.color_primaries = 1 + # What PyAV refers to as ITU709 is more commonly known + # as BT.709. + video_stream.colorspace = ( + av.video.reformatter.Colorspace.ITU709 + ) # The "JPEG" color range is saying that we're using a # color range like a computer, not like broadcast TV. video_stream.color_range = av.video.reformatter.ColorRange.JPEG - # Technically, sRGB's transformation characteristic is - # AVCOL_TRC_IEC61966_2_1. It's nearly the same as - # BT.709's TRC, so some video encoders will tag it as - # AVCOL_TRC_BT709 (1) instead. - video_stream.color_trc = 13 # libavutil's AVCOL_TRC_IEC61966_2_1; PyAV doesn't define constants for TRCs. + # PyAV doesn't define named constants for TRCs, so we + # pass it a numeric value. Technically, sRGB's + # transformation characteristic is + # AVCOL_TRC_IEC61966_2_1 (13). It's nearly the same + # as BT.709's TRC, so some video encoders will tag it + # as AVCOL_TRC_BT709 (1) instead. 
+ video_stream.color_trc = 13 video_stream.width = monitor["width"] video_stream.height = monitor["height"] From 8705054dfa1d9f8eed760e4d91b0887283dfffff Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Sun, 18 Jan 2026 16:59:35 -0800 Subject: [PATCH 09/16] Add a simple version --- demos/video-capture-simple.py | 225 ++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100755 demos/video-capture-simple.py diff --git a/demos/video-capture-simple.py b/demos/video-capture-simple.py new file mode 100755 index 0000000..aa17a8d --- /dev/null +++ b/demos/video-capture-simple.py @@ -0,0 +1,225 @@ +#! /usr/bin/env python3 + +# A lot of people want to use MSS to record a video of the screen. +# Doing it really well can be difficult - there's a reason OBS is such +# a significant program - but the basics are surprisingly easy! +# +# There's a more advanced example, video-capture-stream.py, that has +# more features, and better performance. But this simple demo is +# easier to understand, because it does everything in a +# straightforward way, without any complicated features. +# +# Here, we're going to record the screen for 10 seconds, and save the +# result in capture.mp4, as an H.264 video stream. +# +# Sometimes, in film, cameramen will "undercrank", filming the action +# at a slower frame rate than how it will eventually be projected. In +# that case, motion appears artificially sped up, either for comedy +# (like the Benny Hill TV show), or for fast and frenetic action (like +# Mad Max: Fury Road). +# +# In this demo, we put in the file a marker saying that it's at 30 +# fps. But since this is a simple demo, your computer might not be +# able to keep up with writing video frames at that speed. In that +# case, you'll see the same effect: sped-up motion. +# +# The advanced demo has several techniques to mitigate that. First, +# it uses pipelined threads to let the video encoder use a full CPU +# core (often more, internally), rather than having to share a CPU +# core with all the other tasks. Second, it puts a timestamp marker +# on each frame saying exactly when it's supposed to be shown, rather +# than just saying to show all the frames at 30 fps. +# +# For this simple demo, though, we just record the frames and add them +# to the file one at a time. +# +# We use three libraries that don't come with Python: Pillow, PyAV, +# and (of course) MSS. You'll need to install those with "pip install +# pillow av mss". Normally, you'll want to install these into a venv; +# if you don't know about those, there are lots of great tutorials +# online. + +import logging +import time + +import av +from PIL import Image + +import mss + +# These are the options you'd give to ffmpeg that would affect the way +# the video is encoded. There are comments in the advanced demo that +# go into more detail. +CODEC_OPTIONS = { + "profile": "high", + "preset": "medium", + "b": "6M", + "rc-lookahead": "40", +} + +# We'll try to capture at 30 fps, if the system can keep up with it +# (typically, that's possible at 1080p, but not at 4k). Regardless of +# what the system can keep up with, we'll mark the file as being at 30 +# fps. +FPS = 30 + +# The program will exit after 10 seconds of recording. +CAPTURE_SECONDS = 10 + +# Within an MP4 file, the video can be stored in a lot of different +# formats. In this demo, we use H.264, since it's the most widely +# supported. 
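If you want to try a hardware encoder instead of libx264, you can probe for one and fall back. This is only a sketch: the candidate names are common FFmpeg encoder names, and having one listed doesn't guarantee the matching hardware is present (opening the stream can still fail).

    import av

    def pick_h264_encoder() -> str:
        # Ordered roughly from "fastest if available" to "always there".
        for name in ("h264_nvenc", "h264_qsv", "h264_videotoolbox", "libx264"):
            try:
                av.codec.Codec(name, "w")  # raises if not built into FFmpeg
                return name
            except av.codec.codec.UnknownCodecError:
                continue
        raise RuntimeError("no H.264 encoder available")

    print("Would encode with:", pick_h264_encoder())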
+# +# In ffmpeg, and the av libraries that we use here, the best codec for +# H.264 that doesn't require any specific hardware is libx264. There +# are faster ones that are hardware-accelerated, such as h264_nvenc +# which uses specialized chips on Nvidia video cards. +CODEC = "libx264" + +FILENAME = "capture.mp4" + + +def main() -> None: + logging.basicConfig(level=logging.DEBUG) + # If we don't enable PyAV's own logging, a lot of important error + # messages from libav won't be shown. + av.logging.set_level(av.logging.VERBOSE) + + with mss.mss() as sct: + monitor = sct.monitors[1] + + with av.open(FILENAME, "w") as avmux: + # The "avmux" object we get back from "av.open" represents + # the MP4 file. That's a container that holds the video, + # as well as possibly audio and more. These are each + # called "streams". We only create one stream here, since + # we're just recording video. + video_stream = avmux.add_stream( + CODEC, rate=FPS, options=CODEC_OPTIONS + ) + video_stream.width = monitor["width"] + video_stream.height = monitor["height"] + # There are more options you can set on the video stream; + # the advanced demo uses some of those. + + # Count how many frames we're capturing, so we can log + # the FPS later. + frame_count = 0 + + # Mark the times when we start and end the recording. + capture_start_time = time.monotonic() + capture_end_time = capture_start_time + CAPTURE_SECONDS + + # MSS can capture very fast, and libav can encode very + # fast, depending on your hardware and screen size. We + # don't want to capture faster than 30 fps (or whatever + # you set FPS to). To slow down to our desired rate, we + # keep a variable "next_frame_time" to track when it's + # time to track the next frame. + # + # Some programs will just sleep for 1/30 sec in each loop. + # But by tracking the time when we want to capture the + # next frame, instead of always sleeping for 1/30 sec, the + # time that is spent doing the capture and encode (which + # can be substantial) is counted as part of the total time + # we need to delay. + next_frame_time = capture_start_time + + print("Capturing to", FILENAME, "for", CAPTURE_SECONDS, "seconds") + while True: + # Wait until we reach the time for the next frame. + while (now := time.monotonic()) < next_frame_time: + time.sleep(next_frame_time - now) + + # Try to capture the next frame 1/30 sec after our + # target time for this frame. We update this based on + # the target time instead of the actual time so that, + # if we were a little slow capturing this frame, we'll + # be a little fast capturing the next one, and even + # things out. (There's a slightly better, but more + # complex, way to update next_frame_time in the + # advanced demo.) + next_frame_time = next_frame_time + 1 / FPS + + # See if we've finished the requested capture + # duration. + if now > capture_end_time: + break + + # Print dots for each frame, so you know it's not + # frozen. + print(".", end="", flush=True) + + # Grab a screenshot. + screenshot = sct.grab(monitor) + frame_count += 1 + + # There are a few ways to get the screenshot into a + # VideoFrame. The highest-performance way isn't hard, + # and is shown in the advanced demo: search for + # from_numpy_buffer. But the most obvious way is to + # use PIL: you can create an Image from the + # screenshot, and create a VideoFrame from that. That + # said, if you want to boost the fps rate by about + # 50%, check out the advanced demo, and search for + # from_numpy_buffer. 
+ img = Image.frombytes( + "RGB", screenshot.size, screenshot.bgra, "raw", "BGRX" + ) + frame = av.VideoFrame.from_image(img) + + # When we encode frames, we get back a list of + # packets. Often, we'll get no packets at first: the + # video encoder wants to wait and see the motion + # before it decides how it wants to encode the frames. + # Later, once it's decided about the earlier frames, + # we'll start getting those packets, while it's + # holding on to later frames. + # + # You can imagine that the encoder is a factory. + # You're providing it frames, one at a time, each as a + # box of raw materials. It cranks out packets as its + # finished product. But there's some delay while it's + # working. You can imagine these on a conveyor belt + # moving left to right as time progresses: + # + # FRAMES ENCODER PACKETS + # [1]________-> (Factory) ->____________ + # [3]_[2]_[1]-> (Factory) ->____________ + # [6]_[5]_[4]-> (Factory) ->{1}_________ + # [8]_[7]_[6]-> (Factory) ->{3}_{2}_{1}_ + # + # Sometimes, when you send in a frame, you'll get no + # packets, sometimes you'll get one, and sometimes + # you'll get a batch of several. It depends on how + # the encoder works. + # + # The point is, the packets you're getting back from + # this call are whatever the encoder is ready to give + # you, not necessarily the packets related to the + # frame you're handing it right now. + packets = video_stream.encode(frame) + + # As we said, the MP4 file is a bunch of packets from + # possibly many streams, all woven (or "muxed") + # together. So the ultimate destination of the data + # is to send it to the MP4 file, avmux. + avmux.mux(packets) + + # Print an empty line to end our line of dots. + print() + + # Earlier, we mentioned that the encoder might hold onto + # some frames, while it decides how to encode them based + # on future frames. Now that we're done sending it + # frames, we need to get the packets for any frames it's + # still holding onto. We do this by sending None instead + # of a frame object. + packets = video_stream.encode(None) + avmux.mux(packets) + + print(f"Capture complete: {frame_count / CAPTURE_SECONDS:.1f} fps") + + +if __name__ == "__main__": + main() From 76d3b853725e0c5c88619021c4eec5111f73e049 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Sun, 18 Jan 2026 17:50:00 -0800 Subject: [PATCH 10/16] Add information about VFR --- demos/video-capture.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/demos/video-capture.py b/demos/video-capture.py index 979bebe..5363bfd 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -75,6 +75,35 @@ # # This demo uses a time base of 1/90000 (a common MPEG-derived choice). # +# Constant Frame Rate (CFR) and Variable Frame Rate (VFR) +# ------------------------------------------------------- +# +# Many video files run at a fixed frame rate, like 30 fps. Each frame +# is shown at 1/30 sec intervals. This is called *constant frame +# rate*, or *CFR*, and that's what we do in the simple version of this +# demo. +# +# One problem with this is that, if the encoder can't keep up, the +# video will appear sped-up when played back. The comments at the +# beginning of the simple version of this demo go into more detail +# about that problem. +# +# In this advanced version, we use *variable frame rate*, or *VFR*. +# That's because we can't be sure that the encoder will be able to +# work fast enough: we haven't tuned its settings for your screen +# resolution and hardware. 
While the encoder might be fast enough, it +# might only be able to operate at 18 fps, or even less. +# +# Instead, we mark each frame with the correct time that it should be +# shown. Even if the encoder is falling behind, its frames are still +# marked with the right times, so the player will just keep the +# previous frame on the screen a little longer. +# +# Some video editing software historically has had problems with VFR +# video. It's much better now than it was a few years ago, but if you +# plan to edit the video, you may need to convert it to CFR. There +# are many resources online about how to do that. +# # Performance (why multiple threads?) # ---------------------------------- # @@ -128,7 +157,8 @@ # The "medium" preset is as good of a preset as any for a demo # like this. Different codecs have different presets; the # h264_nvenc actually prefers "p4", but accepts "medium" as a - # similar preset. + # similar preset. You might prefer "fast" if you're not getting + # enough FPS. "preset": "medium", # 6 Mbit/sec is vaguely the ballpark for a good-quality video at # 1080p and 30 fps, but there's a lot of variation. We're just From 44d795bcf876787d23558c193367eff4a3aaea3a Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 20 Jan 2026 13:25:40 -0800 Subject: [PATCH 11/16] Add a comment about the term "flushing" the video stream --- demos/video-capture-simple.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/demos/video-capture-simple.py b/demos/video-capture-simple.py index aa17a8d..0396ba6 100755 --- a/demos/video-capture-simple.py +++ b/demos/video-capture-simple.py @@ -213,8 +213,9 @@ def main() -> None: # some frames, while it decides how to encode them based # on future frames. Now that we're done sending it # frames, we need to get the packets for any frames it's - # still holding onto. We do this by sending None instead - # of a frame object. + # still holding onto. This is referred to as "flushing" + # the stream. We do this by sending None instead of a + # frame object. packets = video_stream.encode(None) avmux.mux(packets) From 06b88850157498ed071a7825f9226a74b8cd209b Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 20 Jan 2026 15:45:41 -0800 Subject: [PATCH 12/16] Add comments about installing third-party libs with pip --- demos/video-capture-simple.py | 1 + demos/video-capture.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/demos/video-capture-simple.py b/demos/video-capture-simple.py index 0396ba6..2292279 100755 --- a/demos/video-capture-simple.py +++ b/demos/video-capture-simple.py @@ -42,6 +42,7 @@ import logging import time +# Install the necessary libraries with "pip install av mss pillow". import av from PIL import Image diff --git a/demos/video-capture.py b/demos/video-capture.py index 5363bfd..813f920 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -10,6 +10,13 @@ # What tools are we using? # ------------------------ # +# You'll need a few libraries that don't come with Python: PyAV, +# NumPy, SI-Prefix, and (of course) MSS. You'll need to install those +# with "pip install av mss numpy si-prefix". Normally, you'll want to +# install these into a venv; if you don't know about those, there are +# lots of great tutorials online. The most critical one we use is +# PyAV. +# # Most people first meet video encoding through the `ffmpeg` command. # Under the hood, ffmpeg is built on the "libav*" C libraries. 
In # this demo we use PyAV (`import av`), which is a Pythonic wrapper @@ -134,6 +141,7 @@ from threading import Event from typing import Any +# Install the necessary libraries with "pip install av mss numpy si-prefix". import av import numpy as np from si_prefix import si_format From 974b822e1430b3ad1b4b8e6014ccf1cf5ff3c6d7 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 20 Jan 2026 22:44:40 -0800 Subject: [PATCH 13/16] Reformat to 120-wide project standard --- demos/video-capture-simple.py | 193 +++++--------- demos/video-capture.py | 468 +++++++++++++--------------------- 2 files changed, 244 insertions(+), 417 deletions(-) diff --git a/demos/video-capture-simple.py b/demos/video-capture-simple.py index 2292279..086dbaa 100755 --- a/demos/video-capture-simple.py +++ b/demos/video-capture-simple.py @@ -1,43 +1,31 @@ #! /usr/bin/env python3 -# A lot of people want to use MSS to record a video of the screen. -# Doing it really well can be difficult - there's a reason OBS is such -# a significant program - but the basics are surprisingly easy! +# A lot of people want to use MSS to record a video of the screen. Doing it really well can be difficult - there's a +# reason OBS is such a significant program - but the basics are surprisingly easy! # -# There's a more advanced example, video-capture-stream.py, that has -# more features, and better performance. But this simple demo is -# easier to understand, because it does everything in a -# straightforward way, without any complicated features. +# There's a more advanced example, video-capture.py, that has more features, and better performance. But this simple +# demo is easier to understand, because it does everything in a straightforward way, without any complicated features. # -# Here, we're going to record the screen for 10 seconds, and save the -# result in capture.mp4, as an H.264 video stream. +# Here, we're going to record the screen for 10 seconds, and save the result in capture.mp4, as an H.264 video stream. # -# Sometimes, in film, cameramen will "undercrank", filming the action -# at a slower frame rate than how it will eventually be projected. In -# that case, motion appears artificially sped up, either for comedy -# (like the Benny Hill TV show), or for fast and frenetic action (like -# Mad Max: Fury Road). +# Sometimes, in film, cameramen will "undercrank", filming the action at a slower frame rate than how it will +# eventually be projected. In that case, motion appears artificially sped up, either for comedy (like the Benny Hill +# TV show), or for fast and frenetic action (like Mad Max: Fury Road). # -# In this demo, we put in the file a marker saying that it's at 30 -# fps. But since this is a simple demo, your computer might not be -# able to keep up with writing video frames at that speed. In that -# case, you'll see the same effect: sped-up motion. +# In this demo, we put in the file a marker saying that it's at 30 fps. But since this is a simple demo, your +# computer might not be able to keep up with writing video frames at that speed. In that case, you'll see the same +# effect: sped-up motion. # -# The advanced demo has several techniques to mitigate that. First, -# it uses pipelined threads to let the video encoder use a full CPU -# core (often more, internally), rather than having to share a CPU -# core with all the other tasks. Second, it puts a timestamp marker -# on each frame saying exactly when it's supposed to be shown, rather -# than just saying to show all the frames at 30 fps. 
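To put a number on that sped-up effect: if the file claims 30 fps but the machine only managed to write frames at, say, 18 fps, the recording plays back faster than real time. A back-of-the-envelope sketch (18 fps is just an example figure):

    TAGGED_FPS = 30      # what the file header claims
    ACHIEVED_FPS = 18    # what the machine actually managed (example value)

    speedup = TAGGED_FPS / ACHIEVED_FPS
    print(f"playback runs at {speedup:.2f}x real time")            # 1.67x
    print(f"10 s of real time plays back in {10 / speedup:.1f} s")  # 6.0 s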
+# The full demo has several techniques to mitigate that. First, it uses pipelined threads to let the video encoder +# use a full CPU core (often more, internally), rather than having to share a CPU core with all the other tasks. +# Second, it puts a timestamp marker on each frame saying exactly when it's supposed to be shown, rather than just +# saying to show all the frames at 30 fps. # -# For this simple demo, though, we just record the frames and add them -# to the file one at a time. +# For this simple demo, though, we just record the frames and add them to the file one at a time. # -# We use three libraries that don't come with Python: Pillow, PyAV, -# and (of course) MSS. You'll need to install those with "pip install -# pillow av mss". Normally, you'll want to install these into a venv; -# if you don't know about those, there are lots of great tutorials -# online. +# We use three libraries that don't come with Python: Pillow, PyAV, and (of course) MSS. You'll need to install those +# with "pip install pillow av mss". Normally, you'll want to install these into a venv; if you don't know about +# those, there are lots of great tutorials online. import logging import time @@ -48,9 +36,8 @@ import mss -# These are the options you'd give to ffmpeg that would affect the way -# the video is encoded. There are comments in the advanced demo that -# go into more detail. +# These are the options you'd give to ffmpeg that would affect the way the video is encoded. There are comments in +# the full demo that go into more detail. CODEC_OPTIONS = { "profile": "high", "preset": "medium", @@ -58,23 +45,19 @@ "rc-lookahead": "40", } -# We'll try to capture at 30 fps, if the system can keep up with it -# (typically, that's possible at 1080p, but not at 4k). Regardless of -# what the system can keep up with, we'll mark the file as being at 30 -# fps. +# We'll try to capture at 30 fps, if the system can keep up with it (typically, that's possible at 1080p, but not at +# 4k). Regardless of what the system can keep up with, we'll mark the file as being at 30 fps. FPS = 30 # The program will exit after 10 seconds of recording. CAPTURE_SECONDS = 10 -# Within an MP4 file, the video can be stored in a lot of different -# formats. In this demo, we use H.264, since it's the most widely -# supported. +# Within an MP4 file, the video can be stored in a lot of different formats. In this demo, we use H.264, since it's +# the most widely supported. # -# In ffmpeg, and the av libraries that we use here, the best codec for -# H.264 that doesn't require any specific hardware is libx264. There -# are faster ones that are hardware-accelerated, such as h264_nvenc -# which uses specialized chips on Nvidia video cards. +# In ffmpeg, and the av libraries that we use here, the best codec for H.264 that doesn't require any specific +# hardware is libx264. There are faster ones that are hardware-accelerated, such as h264_nvenc which uses specialized +# chips on Nvidia video cards. CODEC = "libx264" FILENAME = "capture.mp4" @@ -82,48 +65,35 @@ def main() -> None: logging.basicConfig(level=logging.DEBUG) - # If we don't enable PyAV's own logging, a lot of important error - # messages from libav won't be shown. + # If we don't enable PyAV's own logging, a lot of important error messages from libav won't be shown. av.logging.set_level(av.logging.VERBOSE) with mss.mss() as sct: monitor = sct.monitors[1] with av.open(FILENAME, "w") as avmux: - # The "avmux" object we get back from "av.open" represents - # the MP4 file. 
That's a container that holds the video, - # as well as possibly audio and more. These are each - # called "streams". We only create one stream here, since - # we're just recording video. - video_stream = avmux.add_stream( - CODEC, rate=FPS, options=CODEC_OPTIONS - ) + # The "avmux" object we get back from "av.open" represents the MP4 file. That's a container that holds + # the video, as well as possibly audio and more. These are each called "streams". We only create one + # stream here, since we're just recording video. + video_stream = avmux.add_stream(CODEC, rate=FPS, options=CODEC_OPTIONS) video_stream.width = monitor["width"] video_stream.height = monitor["height"] - # There are more options you can set on the video stream; - # the advanced demo uses some of those. + # There are more options you can set on the video stream; the full demo uses some of those. - # Count how many frames we're capturing, so we can log - # the FPS later. + # Count how many frames we're capturing, so we can log the FPS later. frame_count = 0 # Mark the times when we start and end the recording. capture_start_time = time.monotonic() capture_end_time = capture_start_time + CAPTURE_SECONDS - # MSS can capture very fast, and libav can encode very - # fast, depending on your hardware and screen size. We - # don't want to capture faster than 30 fps (or whatever - # you set FPS to). To slow down to our desired rate, we - # keep a variable "next_frame_time" to track when it's - # time to track the next frame. + # MSS can capture very fast, and libav can encode very fast, depending on your hardware and screen size. + # We don't want to capture faster than 30 fps (or whatever you set FPS to). To slow down to our desired + # rate, we keep a variable "next_frame_time" to track when it's time to track the next frame. # - # Some programs will just sleep for 1/30 sec in each loop. - # But by tracking the time when we want to capture the - # next frame, instead of always sleeping for 1/30 sec, the - # time that is spent doing the capture and encode (which - # can be substantial) is counted as part of the total time - # we need to delay. + # Some programs will just sleep for 1/30 sec in each loop. But by tracking the time when we want to + # capture the next frame, instead of always sleeping for 1/30 sec, the time that is spent doing the + # capture and encode (which can be substantial) is counted as part of the total time we need to delay. next_frame_time = capture_start_time print("Capturing to", FILENAME, "for", CAPTURE_SECONDS, "seconds") @@ -132,57 +102,39 @@ def main() -> None: while (now := time.monotonic()) < next_frame_time: time.sleep(next_frame_time - now) - # Try to capture the next frame 1/30 sec after our - # target time for this frame. We update this based on - # the target time instead of the actual time so that, - # if we were a little slow capturing this frame, we'll - # be a little fast capturing the next one, and even - # things out. (There's a slightly better, but more - # complex, way to update next_frame_time in the - # advanced demo.) + # Try to capture the next frame 1/30 sec after our target time for this frame. We update this based + # on the target time instead of the actual time so that, if we were a little slow capturing this + # frame, we'll be a little fast capturing the next one, and even things out. (There's a slightly + # better, but more complex, way to update next_frame_time in the full demo.) 
next_frame_time = next_frame_time + 1 / FPS - # See if we've finished the requested capture - # duration. + # See if we've finished the requested capture duration. if now > capture_end_time: break - # Print dots for each frame, so you know it's not - # frozen. + # Print dots for each frame, so you know it's not frozen. print(".", end="", flush=True) # Grab a screenshot. screenshot = sct.grab(monitor) frame_count += 1 - # There are a few ways to get the screenshot into a - # VideoFrame. The highest-performance way isn't hard, - # and is shown in the advanced demo: search for - # from_numpy_buffer. But the most obvious way is to - # use PIL: you can create an Image from the - # screenshot, and create a VideoFrame from that. That - # said, if you want to boost the fps rate by about - # 50%, check out the advanced demo, and search for + # There are a few ways to get the screenshot into a VideoFrame. The highest-performance way isn't + # hard, and is shown in the full demo: search for from_numpy_buffer. But the most obvious way is to + # use PIL: you can create an Image from the screenshot, and create a VideoFrame from that. That said, + # if you want to boost the fps rate by about 50%, check out the full demo, and search for # from_numpy_buffer. - img = Image.frombytes( - "RGB", screenshot.size, screenshot.bgra, "raw", "BGRX" - ) + img = Image.frombytes("RGB", screenshot.size, screenshot.bgra, "raw", "BGRX") frame = av.VideoFrame.from_image(img) - # When we encode frames, we get back a list of - # packets. Often, we'll get no packets at first: the - # video encoder wants to wait and see the motion - # before it decides how it wants to encode the frames. - # Later, once it's decided about the earlier frames, - # we'll start getting those packets, while it's + # When we encode frames, we get back a list of packets. Often, we'll get no packets at first: the + # video encoder wants to wait and see the motion before it decides how it wants to encode the frames. + # Later, once it's decided about the earlier frames, we'll start getting those packets, while it's # holding on to later frames. # - # You can imagine that the encoder is a factory. - # You're providing it frames, one at a time, each as a - # box of raw materials. It cranks out packets as its - # finished product. But there's some delay while it's - # working. You can imagine these on a conveyor belt - # moving left to right as time progresses: + # You can imagine that the encoder is a factory. You're providing it frames, one at a time, each as a + # box of raw materials. It cranks out packets as its finished product. But there's some delay while + # it's working. You can imagine these on a conveyor belt moving left to right as time progresses: # # FRAMES ENCODER PACKETS # [1]________-> (Factory) ->____________ @@ -190,33 +142,24 @@ def main() -> None: # [6]_[5]_[4]-> (Factory) ->{1}_________ # [8]_[7]_[6]-> (Factory) ->{3}_{2}_{1}_ # - # Sometimes, when you send in a frame, you'll get no - # packets, sometimes you'll get one, and sometimes - # you'll get a batch of several. It depends on how - # the encoder works. + # Sometimes, when you send in a frame, you'll get no packets, sometimes you'll get one, and sometimes + # you'll get a batch of several. It depends on how the encoder works. # - # The point is, the packets you're getting back from - # this call are whatever the encoder is ready to give - # you, not necessarily the packets related to the - # frame you're handing it right now. 
+ # The point is, the packets you're getting back from this call are whatever the encoder is ready to + # give you, not necessarily the packets related to the frame you're handing it right now. packets = video_stream.encode(frame) - # As we said, the MP4 file is a bunch of packets from - # possibly many streams, all woven (or "muxed") - # together. So the ultimate destination of the data - # is to send it to the MP4 file, avmux. + # As we said, the MP4 file is a bunch of packets from possibly many streams, all woven (or "muxed") + # together. So the ultimate destination of the data is to send it to the MP4 file, avmux. avmux.mux(packets) # Print an empty line to end our line of dots. print() - # Earlier, we mentioned that the encoder might hold onto - # some frames, while it decides how to encode them based - # on future frames. Now that we're done sending it - # frames, we need to get the packets for any frames it's - # still holding onto. This is referred to as "flushing" - # the stream. We do this by sending None instead of a - # frame object. + # Earlier, we mentioned that the encoder might hold onto some frames, while it decides how to encode them + # based on future frames. Now that we're done sending it frames, we need to get the packets for any + # frames it's still holding onto. This is referred to as "flushing" the stream. We do this by sending + # None instead of a frame object. packets = video_stream.encode(None) avmux.mux(packets) diff --git a/demos/video-capture.py b/demos/video-capture.py index 813f920..098d9cb 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -1,26 +1,20 @@ #! /usr/bin/env python3 -# This demo shows one common use case for MSS: capture the screen and -# write a real video file (MP4) rather than saving individual images. +# This demo shows one common use case for MSS: capture the screen and write a real video file (MP4) rather than saving +# individual images. # -# It's intentionally not a full "video encoding" course. The goal is -# to explain the few concepts that show up throughout the program so -# you can read, tweak, and extend it. +# It's intentionally not a full "video encoding" course. The goal is to explain the few concepts that show up +# throughout the program so you can read, tweak, and extend it. # # What tools are we using? # ------------------------ # -# You'll need a few libraries that don't come with Python: PyAV, -# NumPy, SI-Prefix, and (of course) MSS. You'll need to install those -# with "pip install av mss numpy si-prefix". Normally, you'll want to -# install these into a venv; if you don't know about those, there are -# lots of great tutorials online. The most critical one we use is -# PyAV. +# You'll need a few libraries that don't come with Python: PyAV, NumPy, SI-Prefix, and (of course) MSS. You'll need +# to install those with "pip install av mss numpy si-prefix". Normally, you'll want to install these into a venv; if +# you don't know about those, there are lots of great tutorials online. The most critical one we use is PyAV. # -# Most people first meet video encoding through the `ffmpeg` command. -# Under the hood, ffmpeg is built on the "libav*" C libraries. In -# this demo we use PyAV (`import av`), which is a Pythonic wrapper -# around those libraries. +# Most people first meet video encoding through the `ffmpeg` command. Under the hood, ffmpeg is built on the "libav*" +# C libraries. In this demo we use PyAV (`import av`), which is a Pythonic wrapper around those libraries. 
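To see the "factory" behaviour described above in isolation, here is a small self-contained sketch (frame size, codec options, and the output name are arbitrary): it encodes a handful of synthetic frames and prints how many packets each encode() call returns. Typically the first few calls return nothing, and the flush at the end returns a burst.

    import av
    import numpy as np

    with av.open("delay-demo.mp4", "w") as container:
        stream = container.add_stream("libx264", rate=30, options={"bf": "2"})
        stream.width, stream.height = 320, 240
        for i in range(10):
            # Random noise stands in for a real screenshot.
            img = np.random.randint(0, 256, (240, 320, 3), dtype=np.uint8)
            frame = av.VideoFrame.from_ndarray(img, format="rgb24")
            packets = stream.encode(frame)
            print(f"frame {i}: {len(packets)} packet(s)")
            container.mux(packets)
        tail = stream.encode(None)  # flush whatever the encoder is still holding
        print(f"flush: {len(tail)} packet(s)")
        container.mux(tail)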
# # PyAV docs: # Note: the older docs at pyav.org are outdated; see @@ -30,19 +24,16 @@ # Containers, streams, and codecs # ------------------------------- # -# A file like `capture.mp4` is a *container*: it holds one or more -# *streams* (usually video and/or audio). This demo writes one video -# stream. +# A file like `capture.mp4` is a *container*: it holds one or more *streams* (usually video and/or audio). This demo +# writes one video stream. # -# The container interleaves ("muxes") stream data so players can read -# everything in timestamp order. libav calls those pieces "packets". -# (In MP4 they're not literally network-style packets; the term is a -# longstanding libav abstraction.) +# The container interleaves ("muxes") stream data so players can read everything in timestamp order. libav calls those +# pieces "packets". (In MP4 they're not literally network-style packets; the term is a longstanding libav +# abstraction.) # -# A *codec* is the algorithm that compresses/decompresses a stream. -# For MP4 video, common codecs include H.264 and H.265. This demo -# defaults to H.264 via `libx264`, because it's widely supported. You -# can switch to hardware encoders (e.g. `h264_nvenc`) if available. +# A *codec* is the algorithm that compresses/decompresses a stream. For MP4 video, common codecs include H.264 and +# H.265. This demo defaults to H.264 via `libx264`, because it's widely supported. You can switch to hardware +# encoders (e.g. `h264_nvenc`) if available. # # Frames and frame reordering (I/P/B) # ---------------------------------- @@ -52,31 +43,25 @@ # - P-frames: changes from previous frames. # - B-frames: changes predicted using both past *and future* frames. # -# B-frames are why "the order frames are encoded/decoded" can differ -# from "the order frames are shown". That leads directly to -# timestamps. +# B-frames are why "the order frames are encoded/decoded" can differ from "the order frames are shown". That leads +# directly to timestamps. # # Timestamps (PTS/DTS) # -------------------- # -# Every frame has a *presentation timestamp* (PTS): when the viewer -# should see it. +# Every frame has a *presentation timestamp* (PTS): when the viewer should see it. # -# Encoders may output packets in a different order due to B-frames. -# Those packets also have a *decode timestamp* (DTS): when the decoder -# must decode them so the PTS schedule can be met. +# Encoders may output packets in a different order due to B-frames. Those packets also have a *decode timestamp* +# (DTS): when the decoder must decode them so the PTS schedule can be met. # -# In this demo we set PTS on `VideoFrame`s and let libav/PyAV -# propagate timestamps into the encoded packets. +# In this demo we set PTS on `VideoFrame`s and let libav/PyAV propagate timestamps into the encoded packets. # # Time base # --------- # -# Timestamps are integers, and their unit is a fraction of a second -# called the *time base*. For example, with a time base of 1/90000, a -# timestamp of 90000 means "1 second". PyAV will convert between time -# bases when needed, but you must set them consistently where you -# generate timestamps. +# Timestamps are integers, and their unit is a fraction of a second called the *time base*. For example, with a time +# base of 1/90000, a timestamp of 90000 means "1 second". PyAV will convert between time bases when needed, but you +# must set them consistently where you generate timestamps. 
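The arithmetic is simple enough to show directly. A small sketch of converting between seconds and ticks with the 1/90000 time base used in this demo:

    from fractions import Fraction

    TIME_BASE = Fraction(1, 90000)  # one tick = 1/90000 s

    def seconds_to_pts(seconds: float) -> int:
        return int(seconds / TIME_BASE)   # 1.0 s -> 90000 ticks

    def pts_to_seconds(pts: int) -> float:
        return float(pts * TIME_BASE)     # 45000 ticks -> 0.5 s

    assert seconds_to_pts(1.0) == 90000
    assert pts_to_seconds(45000) == 0.5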
# # See # @@ -85,43 +70,32 @@ # Constant Frame Rate (CFR) and Variable Frame Rate (VFR) # ------------------------------------------------------- # -# Many video files run at a fixed frame rate, like 30 fps. Each frame -# is shown at 1/30 sec intervals. This is called *constant frame -# rate*, or *CFR*, and that's what we do in the simple version of this -# demo. +# Many video files run at a fixed frame rate, like 30 fps. Each frame is shown at 1/30 sec intervals. This is called +# *constant frame rate*, or *CFR*, and that's what we do in the simple version of this demo. # -# One problem with this is that, if the encoder can't keep up, the -# video will appear sped-up when played back. The comments at the -# beginning of the simple version of this demo go into more detail -# about that problem. +# One problem with this is that, if the encoder can't keep up, the video will appear sped-up when played back. The +# comments at the beginning of the simple version of this demo go into more detail about that problem. # -# In this advanced version, we use *variable frame rate*, or *VFR*. -# That's because we can't be sure that the encoder will be able to -# work fast enough: we haven't tuned its settings for your screen -# resolution and hardware. While the encoder might be fast enough, it -# might only be able to operate at 18 fps, or even less. +# In this advanced version, we use *variable frame rate*, or *VFR*. That's because we can't be sure that the encoder +# will be able to work fast enough: we haven't tuned its settings for your screen resolution and hardware. While the +# encoder might be fast enough, it might only be able to operate at 18 fps, or even less. # -# Instead, we mark each frame with the correct time that it should be -# shown. Even if the encoder is falling behind, its frames are still -# marked with the right times, so the player will just keep the -# previous frame on the screen a little longer. +# Instead, we mark each frame with the correct time that it should be shown. Even if the encoder is falling behind, +# its frames are still marked with the right times, so the player will just keep the previous frame on the screen a +# little longer. # -# Some video editing software historically has had problems with VFR -# video. It's much better now than it was a few years ago, but if you -# plan to edit the video, you may need to convert it to CFR. There -# are many resources online about how to do that. +# Some video editing software historically has had problems with VFR video. It's much better now than it was a few +# years ago, but if you plan to edit the video, you may need to convert it to CFR. There are many resources online +# about how to do that. # # Performance (why multiple threads?) # ---------------------------------- # -# Capturing frames, converting them to `VideoFrame`s, encoding, and -# muxing are separate stages. This demo pipelines those stages across -# threads so that (for example) encoding can run while the next screen -# grab is happening. The comments at the top of common/pipeline.py -# describe pipelining in detail. +# Capturing frames, converting them to `VideoFrame`s, encoding, and muxing are separate stages. This demo pipelines +# those stages across threads so that (for example) encoding can run while the next screen grab is happening. The +# comments at the top of common/pipeline.py describe pipelining in detail. # -# The slowest stage typically limits overall FPS. Usually, that's the -# encoder. +# The slowest stage typically limits overall FPS. 
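The general shape of that pipelining, independent of this project's common/pipeline.py helpers, can be sketched with just the standard library: a producer thread and a consumer joined by a small bounded queue, so both can work at the same time.

    import queue
    import threading

    def produce(q: queue.Queue) -> None:
        for i in range(10):
            q.put(i)      # stand-in for "grab a screenshot"
        q.put(None)       # sentinel: no more work

    def consume(q: queue.Queue) -> None:
        while (item := q.get()) is not None:
            print("encoding item", item)  # stand-in for "encode a frame"

    q = queue.Queue(maxsize=2)  # a small buffer keeps the stages in step
    producer = threading.Thread(target=produce, args=(q,))
    producer.start()
    consume(q)
    producer.join()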
Usually, that's the encoder. # # On an idle system (rough guide; will vary widely): # - libx264, 1920x1080: ~80 fps @@ -150,42 +124,33 @@ from common.pipeline import Mailbox, PipelineStage -# These are the options you'd give to ffmpeg that it sends to the -# video codec. The options you can use here can be listed with -# `ffmpeg -help encoder=libx264`, or whatever encoder you're using for -# this demo's `--codec` flag. The options for each encoder are described -# in more detail in `man ffmpeg-codecs`. +# These are the options you'd give to ffmpeg that it sends to the video codec. The options you can use here can be +# listed with `ffmpeg -help encoder=libx264`, or whatever encoder you're using for this demo's `--codec` flag. The +# options for each encoder are described in more detail in `man ffmpeg-codecs`. CODEC_OPTIONS = { - # The "high" profile means that the encoder can use some H.264 - # features that are widely supported, but not mandatory. If - # you're using a codec other than H.264, you'll need to comment - # out this line: the relevant features are already part of the - # main profile in later codecs like H.265, VP8, VP9, and AV1. + # The "high" profile means that the encoder can use some H.264 features that are widely supported, but not + # mandatory. If you're using a codec other than H.264, you'll need to comment out this line: the relevant + # features are already part of the main profile in later codecs like H.265, VP8, VP9, and AV1. "profile": "high", - # The "medium" preset is as good of a preset as any for a demo - # like this. Different codecs have different presets; the - # h264_nvenc actually prefers "p4", but accepts "medium" as a - # similar preset. You might prefer "fast" if you're not getting - # enough FPS. + # The "medium" preset is as good of a preset as any for a demo like this. Different codecs have different + # presets; the h264_nvenc actually prefers "p4", but accepts "medium" as a similar preset. You might prefer + # "fast" if you're not getting enough FPS. "preset": "medium", - # 6 Mbit/sec is vaguely the ballpark for a good-quality video at - # 1080p and 30 fps, but there's a lot of variation. We're just - # giving the target bitrate: the second-to-second bitrate will - # vary a lot, and slowly approach this bitrate. If you're trying - # this on a nearly-still screen, though, then the actual bitrate - # will be much lower, since there's not much motion to encode! + # 6 Mbit/sec is vaguely the ballpark for a good-quality video at 1080p and 30 fps, but there's a lot of variation. + # We're just giving the target bitrate: the second-to-second bitrate will vary a lot, and slowly approach this + # bitrate. If you're trying this on a nearly-still screen, though, then the actual bitrate will be much lower, + # since there's not much motion to encode! "b": "6M", - # Let the encoder hold some frames for analysis, and flush them - # later. This especially helps with the hardware-accelerated - # codecs. + # Let the encoder hold some frames for analysis, and flush them later. This especially helps with the + # hardware-accelerated codecs. "rc-lookahead": "40", } TIME_BASE = Fraction(1, 90000) -# Currently, MSS doesn't give us information about the display's -# colorspace. See where this is used below for more information. +# Currently, MSS doesn't give us information about the display's colorspace. See where this is used below for more +# information. 
DISPLAY_IS_SRGB = False LOGGER = logging.getLogger("video-capture") @@ -197,22 +162,17 @@ def video_capture( monitor: mss.models.Monitor, shutdown_requested: Event, ) -> Generator[tuple[mss.screenshot.ScreenShot, float], None, None]: - # Keep track of the time when we want to get the next frame. We - # limit the frame time this way instead of sleeping 1/fps sec each - # frame, since we want to also account for the time taken to get - # the screenshot and other overhead. + # Keep track of the time when we want to get the next frame. We limit the frame time this way instead of sleeping + # 1/fps sec each frame, since we want to also account for the time taken to get the screenshot and other overhead. # - # Repeatedly adding small floating-point numbers to a total does - # cause some numeric inaccuracies, but it's small enough for our - # purposes. The program would have to run for three months to - # accumulate one millisecond of inaccuracy. + # Repeatedly adding small floating-point numbers to a total does cause some numeric inaccuracies, but it's small + # enough for our purposes. The program would have to run for three months to accumulate one millisecond of + # inaccuracy. next_frame_at = time.monotonic() - # Keep running this loop until the main thread says we should - # stop. + # Keep running this loop until the main thread says we should stop. while not shutdown_requested.is_set(): - # Wait until we're ready. This should, ideally, happen every - # 1/fps second. + # Wait until we're ready. This should, ideally, happen every 1/fps second. while (now := time.monotonic()) < next_frame_at: time.sleep(next_frame_at - now) @@ -220,23 +180,16 @@ def video_capture( screenshot = sct.grab(monitor) yield screenshot, now - # We try to keep the capture rate at the desired fps on - # average. If we can't quite keep up for a moment (such as if - # the computer is a little overloaded), then we'll accumulate - # a bit of "timing debt" in next_frame_at: it'll be a little - # sooner than now + one frame. We'll hopefully be able to - # catch up soon. + # We try to keep the capture rate at the desired fps on average. If we can't quite keep up for a moment (such + # as if the computer is a little overloaded), then we'll accumulate a bit of "timing debt" in next_frame_at: + # it'll be a little sooner than now + one frame. We'll hopefully be able to catch up soon. next_frame_at = next_frame_at + (1 / fps) - # If we've accumulated over one frame's worth of timing debt, - # then that will say that next_frame_at is sooner than now. - # If we're accumulating too much debt, we want to wipe it out, - # rather than having a huge burst of closely-spaced captures - # as soon as we can get back to our desired capture rate. - # When we wipe that out, we still try to preserve the timing - # cycle's phase to keep the capture cadence smooth, rather - # than having a jittery burst of closely-spaced captures. In - # other words, we increment next_frame_at by a multiple of the + # If we've accumulated over one frame's worth of timing debt, then that will say that next_frame_at is sooner + # than now. If we're accumulating too much debt, we want to wipe it out, rather than having a huge burst of + # closely-spaced captures as soon as we can get back to our desired capture rate. When we wipe that out, we + # still try to preserve the timing cycle's phase to keep the capture cadence smooth, rather than having a + # jittery burst of closely-spaced captures. 
In other words, we increment next_frame_at by a multiple of the # desired capture period. if next_frame_at < now: missed_frames = floor((now - next_frame_at) * fps) @@ -244,51 +197,42 @@ def video_capture( def video_process( - screenshot_and_timestamp: Iterable[ - tuple[mss.screenshot.ScreenShot, float] - ], + screenshot_and_timestamp: Iterable[tuple[mss.screenshot.ScreenShot, float]], ) -> Generator[av.VideoFrame, None, None]: - # We track when the first frame happened so we can make PTS start - # at 0. Many video players and other tools expect that. + # We track when the first frame happened so we can make PTS start at 0. Many video players and other tools expect + # that. first_frame_at: float | None = None for screenshot, timestamp in screenshot_and_timestamp: # Avoiding extra pixel copies # --------------------------- # - # Copying a full frame of pixels is expensive. On typical - # hardware, a plain CPU memcpy of a 4K BGRA image can cost on - # the order of ~3ms by itself, which is a big chunk of a 30fps - # budget (33ms) and an even bigger chunk of a 60fps budget - # (16.7ms). + # Copying a full frame of pixels is expensive. On typical hardware, a plain CPU memcpy of a 4K BGRA image can + # cost on the order of ~3ms by itself, which is a big chunk of a 30fps budget (33ms) and an even bigger chunk + # of a 60fps budget (16.7ms). # - # So we want to be careful about the *conversion* step from an - # MSS `ScreenShot` to a PyAV `VideoFrame`. Ideally, that step - # should reuse the same underlying bytes rather than creating - # additional intermediate copies. + # So we want to be careful about the *conversion* step from an MSS `ScreenShot` to a PyAV `VideoFrame`. + # Ideally, that step should reuse the same underlying bytes rather than creating additional intermediate + # copies. # # Buffers in Python # ----------------- # - # Many Python objects expose their underlying memory via the - # "buffer protocol". A buffer is just a view of raw bytes - # that other libraries can interpret without copying. + # Many Python objects expose their underlying memory via the "buffer protocol". A buffer is just a view of + # raw bytes that other libraries can interpret without copying. # - # Common buffer objects include: `bytes`, `bytearray`, - # `memoryview`, and `array.array`. `screenshot.bgra` is also - # a buffer (currently it is a `bytes` object, though that - # detail may change in the future). + # Common buffer objects include: `bytes`, `bytearray`, `memoryview`, and `array.array`. `screenshot.bgra` is + # also a buffer (currently it is a `bytes` object, though that detail may change in the future). # # Minimum-copy path: ScreenShot -> NumPy -> VideoFrame # ---------------------------------------------------- # - # `np.frombuffer()` creates an ndarray *view* of an existing - # buffer (no copy). Reshaping also stays as a view. + # `np.frombuffer()` creates an ndarray *view* of an existing buffer (no copy). Reshaping also stays as a + # view. # - # PyAV's `VideoFrame.from_ndarray()` always copies the data - # into a new frame-owned buffer. For this demo we use the - # undocumented `VideoFrame.from_numpy_buffer()`, which creates - # a `VideoFrame` that shares memory with the ndarray. + # PyAV's `VideoFrame.from_ndarray()` always copies the data into a new frame-owned buffer. For this demo we + # use the undocumented `VideoFrame.from_numpy_buffer()`, which creates a `VideoFrame` that shares memory with + # the ndarray. 
ndarray = np.frombuffer(screenshot.bgra, dtype=np.uint8) ndarray = ndarray.reshape(screenshot.height, screenshot.width, 4) frame = av.VideoFrame.from_numpy_buffer(ndarray, format="bgra") @@ -299,9 +243,8 @@ def video_process( frame.pts = int((timestamp - first_frame_at) / TIME_BASE) frame.time_base = TIME_BASE - # If we know the colorspace of our frames, mark them - # accordingly. See the comment where we set these attributes - # on video_stream for details. + # If we know the colorspace of our frames, mark them accordingly. See the comment where we set these + # attributes on video_stream for details. if DISPLAY_IS_SRGB: frame.colorspace = av.video.reformatter.Colorspace.ITU709 frame.color_range = av.video.reformatter.ColorRange.JPEG @@ -314,8 +257,8 @@ def video_encode( ) -> Generator[Sequence[av.Packet], None, None]: for frame in frames: yield video_stream.encode(frame) - # Our input has run out. Flush the frames that the encoder still - # is holding internally (such as to compute B-frames). + # Our input has run out. Flush the frames that the encoder still is holding internally (such as to compute + # B-frames). yield video_stream.encode(None) @@ -329,9 +272,8 @@ def show_stats( FPS indicates how fast the entire pipeline can run as a whole, not any individual stage. """ - # The start time is only used for showing the clock. The actual - # timing stats use packet timestamps (ultimately derived from the - # frame PTS we compute during capture). + # The start time is only used for showing the clock. The actual timing stats use packet timestamps (ultimately + # derived from the frame PTS we compute during capture). start_time = time.monotonic() time_deque: deque[int] = deque(maxlen=100) bit_count_deque: deque[int] = deque(maxlen=100) @@ -339,26 +281,21 @@ def show_stats( last_status_len = 0 for frame_count, packet_batch in enumerate(packet_batches): - # Yield the packet data immediately, so the mux gets it as - # soon as possible, while we update our stats. + # Yield the packet data immediately, so the mux gets it as soon as possible, while we update our stats. yield packet_batch for packet in packet_batch: # FPS from timestamps: why DTS, not PTS? # - # Intuitively, you'd expect to compute FPS from PTS (the - # time the viewer should *see* each frame). But encoders - # can reorder frames internally (especially with - # B-frames), so packets may come out in a different order - # than PTS. + # Intuitively, you'd expect to compute FPS from PTS (the time the viewer should *see* each frame). But + # encoders can reorder frames internally (especially with B-frames), so packets may come out in a + # different order than PTS. # - # If we update a sliding window with out-of-order PTS - # values, the window start/end can "wiggle" even when the - # pipeline is steady, which makes the displayed FPS noisy. + # If we update a sliding window with out-of-order PTS values, the window start/end can "wiggle" even when + # the pipeline is steady, which makes the displayed FPS noisy. # - # DTS is the time order the decoder must process packets. - # Packets are emitted in DTS order, so using DTS gives a - # stable, monotonic timeline for the sliding window. + # DTS is the time order the decoder must process packets. Packets are emitted in DTS order, so using DTS + # gives a stable, monotonic timeline for the sliding window. 
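You can see that reordering for yourself by demuxing a file this demo has written and printing the first few video packets: with B-frames enabled, dts climbs steadily while pts jumps around. (A hedged sketch; "capture.mp4" is just the default output name.)

    import av

    with av.open("capture.mp4") as container:
        stream = container.streams.video[0]
        for i, packet in enumerate(container.demux(stream)):
            if packet.dts is None or packet.pts is None:
                continue  # the trailing flush packet has no timestamps
            print(f"packet {i:3d}: dts={packet.dts:8d} pts={packet.pts:8d}")
            if i >= 20:
                break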
time_deque.append(packet.dts) bit_count = packet.size * 8 bit_count_deque.append(bit_count) @@ -370,8 +307,8 @@ def show_stats( running_minutes = int(running_time / 60) running_seconds = int(running_time % 60) window_secs = (time_deque[-1] - time_deque[0]) * TIME_BASE - # We can't use the last frame in the window when we divide - # by window_secs; that would be a fencepost error. + # We can't use the last frame in the window when we divide by window_secs; that would be a fencepost + # error. window_frames = len(time_deque) - 1 window_bits = sum(bit_count_deque) - bit_count_deque[-1] fps = window_frames / window_secs @@ -385,12 +322,10 @@ def show_stats( full_line = f"\r{line}{' ' * (last_status_len - this_status_len)}" print(full_line, end="") last_status_len = this_status_len - # Near shutdown the encoder flush can emit packets in large - # bursts, and we also throttle status updates (to avoid spamming - # the terminal). That combination means the last displayed line - # may be stale or not representative of the final frames. Rather - # than leaving potentially misleading numbers on screen, erase the - # status display. + + # At shutdown, the encoder flush can emit packets in large bursts, and we also throttle status updates (to avoid + # spamming the terminal). That combination means the last displayed line may be stale or not representative of + # the final frames. Rather than leaving potentially misleading numbers on screen, erase the status display. print(f"\r{' ' * last_status_len}\r", end="") @@ -417,13 +352,10 @@ def parse_region(s: str) -> tuple[int, int, int, int]: def main() -> None: logging.basicConfig(level=logging.DEBUG) - # If we don't enable PyAV's own logging, a lot of important error - # messages from libav won't be shown. + # If we don't enable PyAV's own logging, a lot of important error messages from libav won't be shown. av.logging.set_level(av.logging.VERBOSE) - parser = argparse.ArgumentParser( - description="Capture screen video to MP4 file" - ) + parser = argparse.ArgumentParser(description="Capture screen video to MP4 file") parser.add_argument( "-f", "--fps", @@ -488,103 +420,71 @@ def main() -> None: else: monitor = sct.monitors[args.monitor] - # We don't pass the container format to av.open here, so it - # will choose it based on the extension: .mp4, .mkv, etc. + # We don't pass the container format to av.open here, so it will choose it based on the extension: .mp4, .mkv, + # etc. with av.open(filename, "w") as avmux: - # We could initialize video_stream in video_encode, but - # doing it here means that we can open it before starting - # the capture thread, which avoids a warmup frame (one - # that takes longer to encode because the encoder is just - # starting). + # We could initialize video_stream in video_encode, but doing it here means that we can open it before + # starting the capture thread, which avoids a warmup frame (one that takes longer to encode because the + # encoder is just starting). # - # The rate= parameter here is just the nominal frame rate: - # some tools (like file browsers) might display this as - # the frame rate. But we actually control timing via the - # pts and time_base values on the frames themselves. - video_stream = avmux.add_stream( - codec, rate=fps, options=CODEC_OPTIONS - ) - - # Ideally, we would set attributes such as colorspace, - # color_range, color_primaries, and color_trc here to - # describe the colorspace accurately. 
Otherwise, the - # player has to guess whether this was recorded on an sRGB - # Windows machine, a Display P3 Mac, or if it's using - # linear RGB. Currently, MSS doesn't give us colorspace - # information (DISPLAY_IS_SRGB is always False in this - # demo), so we don't try to specify a particular - # colorspace. However, if your application knows the - # colorspace you're recording from, then you can set those - # attributes on the stream and the frames accordingly. + # The rate= parameter here is just the nominal frame rate: some tools (like file browsers) might display + # this as the frame rate. But we actually control timing via the pts and time_base values on the frames + # themselves. + video_stream = avmux.add_stream(codec, rate=fps, options=CODEC_OPTIONS) + + # Ideally, we would set attributes such as colorspace, color_range, color_primaries, and color_trc here to + # describe the colorspace accurately. Otherwise, the player has to guess whether this was recorded on an + # sRGB Windows machine, a Display P3 Mac, or if it's using linear RGB. Currently, MSS doesn't give us + # colorspace information (DISPLAY_IS_SRGB is always False in this demo), so we don't try to specify a + # particular colorspace. However, if your application knows the colorspace you're recording from, then + # you can set those attributes on the stream and the frames accordingly. # - # These properties on the stream (actually, they're - # attached to its CodecContext) are used to tell the - # stream and container how to label the video stream's - # colorspace. There are similar attributes on the frame - # itself; those are used to identify its colorspace, so - # the codec can do the correct RGB to YUV conversion. + # These properties on the stream (actually, they're attached to its CodecContext) are used to tell the + # stream and container how to label the video stream's colorspace. There are similar attributes on the + # frame itself; those are used to identify its colorspace, so the codec can do the correct RGB to YUV + # conversion. if DISPLAY_IS_SRGB: - # color_primaries=1 is libavutil's AVCOL_PRI_BT709; - # PyAV doesn't define named constants for color + # color_primaries=1 is libavutil's AVCOL_PRI_BT709; PyAV doesn't define named constants for color # primaries. video_stream.color_primaries = 1 - # What PyAV refers to as ITU709 is more commonly known - # as BT.709. - video_stream.colorspace = ( - av.video.reformatter.Colorspace.ITU709 - ) - # The "JPEG" color range is saying that we're using a - # color range like a computer, not like broadcast TV. + # What PyAV refers to as ITU709 is more commonly known as BT.709. + video_stream.colorspace = av.video.reformatter.Colorspace.ITU709 + # The "JPEG" color range is saying that we're using a color range like a computer, not like broadcast + # TV. video_stream.color_range = av.video.reformatter.ColorRange.JPEG - # PyAV doesn't define named constants for TRCs, so we - # pass it a numeric value. Technically, sRGB's - # transformation characteristic is - # AVCOL_TRC_IEC61966_2_1 (13). It's nearly the same - # as BT.709's TRC, so some video encoders will tag it - # as AVCOL_TRC_BT709 (1) instead. + # PyAV doesn't define named constants for TRCs, so we pass it a numeric value. Technically, sRGB's + # transformation characteristic is AVCOL_TRC_IEC61966_2_1 (13). It's nearly the same as BT.709's TRC, + # so some video encoders will tag it as AVCOL_TRC_BT709 (1) instead. 
video_stream.color_trc = 13 video_stream.width = monitor["width"] video_stream.height = monitor["height"] - # There are multiple time bases in play (stream, codec - # context, per-frame). Depending on the container and - # codec, some of these might be ignored or overridden. We - # set the desired time base consistently everywhere, so - # that the saved timestamps are correct regardless of what - # format we're saving to. + # There are multiple time bases in play (stream, codec context, per-frame). Depending on the container + # and codec, some of these might be ignored or overridden. We set the desired time base consistently + # everywhere, so that the saved timestamps are correct regardless of what format we're saving to. video_stream.time_base = TIME_BASE video_stream.codec_context.time_base = TIME_BASE - # `pix_fmt` here describes the pixel format we will *feed* - # into the encoder (not necessarily what the encoder will - # store in the bitstream). H.264 encoders ultimately - # convert to a YUV format internally. + # `pix_fmt` here describes the pixel format we will *feed* into the encoder (not necessarily what the + # encoder will store in the bitstream). H.264 encoders ultimately convert to a YUV format internally. # - # If the encoder accepts BGRA input (e.g., h264_nvenc), we - # can hand it MSS's BGRA frames directly and avoid an - # extra pre-conversion step on our side. + # If the encoder accepts BGRA input (e.g., h264_nvenc), we can hand it MSS's BGRA frames directly and + # avoid an extra pre-conversion step on our side. # - # If the encoder doesn't accept BGRA input (e.g., - # libx264), PyAV will insert a conversion step - # automatically. In that case, we let the codec choose - # the pix_fmt it's going to expect. + # If the encoder doesn't accept BGRA input (e.g., libx264), PyAV will insert a conversion step + # automatically. In that case, we let the codec choose the pix_fmt it's going to expect. # - # Note: the alpha channel is ignored by H.264. We may - # effectively be sending BGRx/BGR0. But PyAV's VideoFrame - # only exposes "bgra" as the closest supported format. + # Note: the alpha channel is ignored by H.264. We may effectively be sending BGRx/BGR0. But PyAV's + # VideoFrame only exposes "bgra" as the closest supported format. if any(f.name == "bgra" for f in video_stream.codec.video_formats): video_stream.pix_fmt = "bgra" - # We open (initialize) the codec explicitly here. PyAV - # will automatically open it the first time we call - # video_stream.encode, but the time it takes to set the - # codec up means the first frame would be particularly - # slow. + # We open (initialize) the codec explicitly here. PyAV will automatically open it the first time we + # call video_stream.encode, but the time it takes to set the codec up means the first frame would be + # particularly slow. 
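(Editor's aside: a rough, machine-dependent way to observe the warmup cost that pre-opening avoids is to time the first few encode() calls with and without the explicit open(). This sketch is not part of the demo; it assumes libx264 is available in your FFmpeg build, and it leaves the frame contents uninitialized because only the timing matters here.)

    import time

    import av

    with av.open("warmup-test.mp4", "w") as container:
        stream = container.add_stream("libx264", rate=30)
        stream.width = 1280
        stream.height = 720
        stream.pix_fmt = "yuv420p"
        stream.open()                    # comment this out to push the setup cost onto frame 0

        for i in range(5):
            frame = av.VideoFrame(1280, 720, "yuv420p")   # contents are irrelevant for timing
            frame.pts = i
            start = time.perf_counter()
            packets = stream.encode(frame)
            print(f"frame {i}: {(time.perf_counter() - start) * 1000:.1f} ms, {len(packets)} packet(s)")
            for packet in packets:
                container.mux(packet)

        for packet in stream.encode(None):   # flush the frames the encoder is still holding
            container.mux(packet)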
video_stream.open() shutdown_requested = Event() - mailbox_screenshot: Mailbox[ - tuple[mss.screenshot.ScreenShot, float] - ] = Mailbox() + mailbox_screenshot: Mailbox[tuple[mss.screenshot.ScreenShot, float]] = Mailbox() mailbox_frame: Mailbox[av.VideoFrame] = Mailbox() mailbox_packet_to_stats: Mailbox[Sequence[av.Packet]] = Mailbox() mailbox_packet_to_mux: Mailbox[Sequence[av.Packet]] = Mailbox() @@ -639,15 +539,11 @@ def main() -> None: old_sigint_handler = signal.getsignal(signal.SIGINT) def sigint_handler(_signum: int, _frame: Any) -> None: - # Restore the default behavior, so if our shutdown - # doesn't work because of a bug in our code, the user - # can still press ^C again to terminate the program. - # (The default handler is also in - # signal.default_int_handler, but that's not - # documented.) + # Restore the default behavior, so if our shutdown doesn't work because of a bug in our code, the user + # can still press ^C again to terminate the program. (The default handler is also in + # signal.default_int_handler, but that's not documented.) signal.signal(signal.SIGINT, old_sigint_handler) - # The status line will typically be visible, so start - # a fresh line for this message. + # The status line will typically be visible, so start a fresh line for this message. print("\nShutting down") shutdown_requested.set() @@ -657,10 +553,8 @@ def sigint_handler(_signum: int, _frame: Any) -> None: if duration_secs is not None: stage_video_capture.join(timeout=duration_secs) - # Either the join timed out, or we processed a ^C and - # requested it exit. Either way, it's safe to set the - # shutdown event again, and return to our normal - # processing loop. + # Either the join timed out, or we processed a ^C and requested it exit. Either way, it's safe to set + # the shutdown event again, and return to our normal processing loop. shutdown_requested.set() stage_video_capture.join() @@ -669,34 +563,24 @@ def sigint_handler(_signum: int, _frame: Any) -> None: stage_show_stats.join() stage_mux.join() - # PyAV may insert an implicit conversion step between the - # frames we provide and what the encoder actually accepts - # (pixel format, colorspace, etc.). When that happens, - # `video_stream.reformatter` gets set. + # PyAV may insert an implicit conversion step between the frames we provide and what the encoder actually + # accepts (pixel format, colorspace, etc.). When that happens, `video_stream.reformatter` gets set. # - # This is useful to know for performance: those - # conversions are typically CPU-side work and can become a - # bottleneck. Hardware-accelerated encoders, such as - # `h264_nvenc`, often accept BGRx, and can perform the + # This is useful to know for performance: those conversions are typically CPU-side work and can become a + # bottleneck. Hardware-accelerated encoders, such as `h264_nvenc`, often accept BGRx, and can perform the # conversion using specialized hardware. # - # We already know that libx264 doesn't accept RGB input, - # so we don't warn about that. (There is a libx264rgb, - # but that writes to a different H.264 format.) We just - # want to warn about other codecs, since some of them - # might have ways to use BGRx input, and the programmer - # might want to investigate. + # We already know that libx264 doesn't accept RGB input, so we don't warn about that. (There is a + # libx264rgb, but that writes to a different H.264 format.) 
We just want to warn about other codecs, + # since some of them might have ways to use BGRx input, and the programmer might want to investigate. # - # Note: `reformatter` is created lazily, so it may only be - # set after frames have been sent through the encoder, - # which is why we check it at the end. + # Note: `reformatter` is created lazily, so it may only be set after frames have been sent through the + # encoder, which is why we check it at the end. if video_stream.reformatter is not None and codec != "libx264": LOGGER.warning( - "PyAV inserted a CPU-side pixel-format/colorspace " - "conversion step; this can reduce FPS. Check the " - "acceptable pix_fmts for this codec, and see if one " - "of them can accept some variation of BGRx input " - "directly." + "PyAV inserted a CPU-side pixel-format/colorspace conversion step; this can reduce FPS. " + "Check the acceptable pix_fmts for this codec, and see if one of them can accept some " + "variation of BGRx input directly." ) From ec9cd3742830825a4cf6eea4836be49230d1dcaf Mon Sep 17 00:00:00 2001 From: Joel Holveck Date: Tue, 20 Jan 2026 09:16:31 +0000 Subject: [PATCH 14/16] Comment improvements --- demos/video-capture.py | 53 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/demos/video-capture.py b/demos/video-capture.py index 098d9cb..10543ec 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -124,9 +124,10 @@ from common.pipeline import Mailbox, PipelineStage -# These are the options you'd give to ffmpeg that it sends to the video codec. The options you can use here can be -# listed with `ffmpeg -help encoder=libx264`, or whatever encoder you're using for this demo's `--codec` flag. The -# options for each encoder are described in more detail in `man ffmpeg-codecs`. +# These are the options you'd give to ffmpeg that it sends to the video codec. Because ffmpeg and PyAV both use the +# libav libraries, you can get the list of available flags with `ffmpeg -help encoder=libx264`, or whatever encoder +# you're using for this demo's `--codec` flag. The options for each encoder are described in more detail in `man +# ffmpeg-codecs`. CODEC_OPTIONS = { # The "high" profile means that the encoder can use some H.264 features that are widely supported, but not # mandatory. If you're using a codec other than H.264, you'll need to comment out this line: the relevant @@ -238,6 +239,19 @@ def video_process( frame = av.VideoFrame.from_numpy_buffer(ndarray, format="bgra") # Set the PTS and time base for the frame. + # + # We compute PTS based on the actual time we captured the screenshot, relative to when we got the first + # frame. This gives us variable frame rate (VFR) video that accurately reflects the times the frames were + # captured. + # + # However, if we were muxing in an audio stream as well, we'd want to use a common clock for both audio and + # video PTS, preferably based on the audio clock. That's because audio glitches are more noticeable than + # video glitches, so audio timing should be prioritized. In that case, the video PTS would be based on the + # audio clock, not the actual capture time. + # + # The easiest way to do that is to record the monotonic clock in both the video and audio capture stages + # (taking the audio latency into account), record the audio PTS based on how many audio samples have been + # captured, and then adjust the video PTS based on the skew between the audio and monotonic clocks. 
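(Editor's aside: the audio-clock scheme described in the comment above stays prose in this demo, which does not mux audio. A loose sketch of the idea, using hypothetical helper names on_audio_chunk() and on_video_frame() that do not appear in the demo, might look like this: audio PTS comes from the sample count, and video PTS is mapped onto that clock via the continually re-estimated epoch.)

    from fractions import Fraction

    TIME_BASE = Fraction(1, 90000)
    SAMPLE_RATE = 48000

    samples_captured = 0      # audio samples handed to the encoder so far
    audio_epoch = None        # monotonic time that corresponds to audio PTS 0


    def on_audio_chunk(num_samples: int, monotonic_now: float, latency: float) -> int:
        """Return the PTS for this chunk; audio PTS comes purely from the sample count."""
        global samples_captured, audio_epoch
        pts = int(Fraction(samples_captured, SAMPLE_RATE) / TIME_BASE)
        # The first sample of this chunk was captured about `latency` seconds ago and sits
        # samples_captured / SAMPLE_RATE seconds into the audio stream, so this re-estimates
        # where audio PTS 0 falls on the monotonic clock (tracking any skew between clocks).
        audio_epoch = (monotonic_now - latency) - samples_captured / SAMPLE_RATE
        samples_captured += num_samples
        return pts


    def on_video_frame(monotonic_now: float) -> int:
        """Return the PTS for a video frame, expressed on the audio-derived clock."""
        if audio_epoch is None:
            return 0          # no audio captured yet; a real pipeline would wait or fall back
        return int((monotonic_now - audio_epoch) / TIME_BASE)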
            if first_frame_at is None:
                first_frame_at = timestamp
            frame.pts = int((timestamp - first_frame_at) / TIME_BASE)
@@ -465,16 +479,20 @@ def main() -> None:
             video_stream.time_base = TIME_BASE
             video_stream.codec_context.time_base = TIME_BASE
             # `pix_fmt` here describes the pixel format we will *feed* into the encoder (not necessarily what the
-            # encoder will store in the bitstream). H.264 encoders ultimately convert to a YUV format internally.
+            # encoder will store in the bitstream). H.264 encoders ultimately convert to a YUV 4:2:0 format
+            # internally.
             #
-            # If the encoder accepts BGRA input (e.g., h264_nvenc), we can hand it MSS's BGRA frames directly and
-            # avoid an extra pre-conversion step on our side.
+            # If the encoder accepts BGRx input (e.g., h264_nvenc), we can hand it MSS's BGRx frames directly and
+            # avoid an extra pre-conversion step on our side. For a hardware encoder, that lets specialized hardware
+            # do the conversion to YUV efficiently.
             #
-            # If the encoder doesn't accept BGRA input (e.g., libx264), PyAV will insert a conversion step
-            # automatically. In that case, we let the codec choose the pix_fmt it's going to expect.
+            # If the encoder doesn't accept BGRx input (e.g., libx264), PyAV will insert a conversion step
+            # automatically. In that case, we let the codec choose the pix_fmt it wants.
             #
-            # Note: the alpha channel is ignored by H.264. We may effectively be sending BGRx/BGR0. But PyAV's
-            # VideoFrame only exposes "bgra" as the closest supported format.
+            # Note: the alpha channel is ignored by H.264. We are usually sending BGRx/BGR0. But PyAV's
+            # VideoFrame only exposes "bgra" as the closest supported format, so that's how we tag our frames, and
+            # what we tell the codec to expect, if possible. You might need to change this for codecs like VP9 that
+            # can handle alpha channels.
             if any(f.name == "bgra" for f in video_stream.codec.video_formats):
                 video_stream.pix_fmt = "bgra"
             # We open (initialize) the codec explicitly here. PyAV will automatically open it the first time we
@@ -536,6 +554,19 @@ def main() -> None:
             LOGGER.debug("  Encode:     %s", stage_video_encode.native_id)
             LOGGER.debug("  Mux:        %s", stage_mux.native_id)
 
+            # Handle Ctrl-C gracefully by requesting shutdown.
+            #
+            # Python always routes signals to the main thread, so we don't have to worry about another thread getting
+            # a SIGINT (the Ctrl-C signal). That's significant because if the video capture stage tried to set the
+            # shutdown_requested event (which requires the event lock) while it was already waiting for it (hence
+            # holding the lock), it could end up deadlocked. The main thread doesn't ever acquire that lock. As
+            # another point of safety, Python will only invoke our signal handler at a "safe" point, such as between
+            # bytecode instructions.
+
+            # We set old_sigint_handler twice: once here, and once when we change the handler. The first time is
+            # just in case a signal arrives in the tiny window between when we set the new handler (by calling
+            # signal.signal), and when we assign it to old_sigint_handler (with "="). Signal handling, like
+            # threading, is tricky to get right.
             old_sigint_handler = signal.getsignal(signal.SIGINT)
 
             def sigint_handler(_signum: int, _frame: Any) -> None:
@@ -552,6 +583,8 @@ def sigint_handler(_signum: int, _frame: Any) -> None:
             print("Starting video capture. Press Ctrl-C to stop.")
 
             if duration_secs is not None:
+                # Wait for up to the specified duration.
If the pipeline shuts down for other reasons (such as an + # exception), then we'll recognize it sooner with this join. stage_video_capture.join(timeout=duration_secs) # Either the join timed out, or we processed a ^C and requested it exit. Either way, it's safe to set # the shutdown event again, and return to our normal processing loop. From 6e126adf63086cfe83e694ec533a832439016499 Mon Sep 17 00:00:00 2001 From: Joel Holveck Date: Tue, 20 Jan 2026 09:26:21 +0000 Subject: [PATCH 15/16] Add docs references to demos --- CHANGELOG.md | 1 + docs/source/examples.rst | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5087f45..6841012 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ See Git checking messages for full history. - Windows: improve error checking and messages for Win32 API calls (#448) - Mac: fix memory leak (#450, #453) - improve multithreading: allow multiple threads to use the same MSS object, allow multiple MSS objects to concurrently take screenshots, and document multithreading guarantees (#446, #452) +- Add full demos for different ways to use MSS (#444, #456) - :heart: contributors: @jholveck ## 10.1.0 (2025-08-16) diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 437e0f3..7b636bb 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -209,3 +209,19 @@ Different possibilities to convert raw BGRA values to RGB:: ... .. versionadded:: 3.2.0 + + +Demos +===== + +In addition to these simple examples, there are full demos of more complex use cases in the ``demos/`` directory of the +source code. The demos are not installed with the package, but you can run them directly from the source tree after +cloning the repository. + +These are complete, working programs that use MSS for screen capture as a key part of their functionality. They show +not only how to invoke MSS, but also some of the techniques for using the captured frames efficiently, in real-world +scenarios. + +These include: +- MP4 video capture with encoding using PyAV (FFmpeg bindings) +- Live streaming to a TinyTV as MJPEG From 5c26cd158c295a18595f5a5da8abe65dbd6ffa53 Mon Sep 17 00:00:00 2001 From: Joel Ray Holveck Date: Tue, 27 Jan 2026 19:23:48 -0800 Subject: [PATCH 16/16] Clarify a few comments based on reviews --- demos/video-capture.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/demos/video-capture.py b/demos/video-capture.py index 10543ec..18ff9f0 100755 --- a/demos/video-capture.py +++ b/demos/video-capture.py @@ -49,7 +49,8 @@ # Timestamps (PTS/DTS) # -------------------- # -# Every frame has a *presentation timestamp* (PTS): when the viewer should see it. +# Every frame has a *presentation timestamp* (PTS): when the viewer should see it. (See the next section for how +# these are represented.) # # Encoders may output packets in a different order due to B-frames. Those packets also have a *decode timestamp* # (DTS): when the decoder must decode them so the PTS schedule can be met. @@ -73,8 +74,12 @@ # Many video files run at a fixed frame rate, like 30 fps. Each frame is shown at 1/30 sec intervals. This is called # *constant frame rate*, or *CFR*, and that's what we do in the simple version of this demo. # -# One problem with this is that, if the encoder can't keep up, the video will appear sped-up when played back. The -# comments at the beginning of the simple version of this demo go into more detail about that problem. 
+# Applications using CFR usually set the time base to the frame rate, such as 1/30 sec. This lets them just use the
+# frame number for the PTS.
+#
+# One problem with real-time recording to CFR is that, if the encoder can't keep up, the video will appear sped-up
+# when played back. The comments at the beginning of the simple version of this demo go into more detail about that
+# problem.
 #
 # In this advanced version, we use *variable frame rate*, or *VFR*. That's because we can't be sure that the encoder
 # will be able to work fast enough: we haven't tuned its settings for your screen resolution and hardware. While the