[scene_manager] Add ability to crop input

Breakthrough · Breakthrough · commit eef0adeb1952 · 2024-10-26T21:54:44.000-04:00
diff --git a/docs/cli.rst b/docs/cli.rst
@@ -56,6 +56,10 @@ Options
 
   Path to config file. See :ref:`config file reference <scenedetect_cli-config_file>` for details.
 
+.. option:: --crop X0 Y0 X1 Y1
+
+  Crop input video. Specified as two points representing top left and bottom right corner of crop region. 0 0 is top-left of the video frame. Bounds are inclusive (e.g. for a 100x100 video, the region covering the whole frame is 0 0 99 99).
+
 .. option:: -s CSV, --stats CSV
 
   Stats file (.csv) to write frame metrics. Existing files will be overwritten. Used for tuning detection parameters and data analysis.
diff --git a/scenedetect.cfg b/scenedetect.cfg
@@ -32,6 +32,10 @@
 # Video backend interface, must be one of: opencv, pyav.
 #backend = opencv
 
+# Crop input video to a rectangular region. Specified as two corner points in the
+# form X0 Y0 X1 Y1 or (X0 Y0), (X1 Y1). Bounds are inclusive; (0, 0) is the top-left corner.
+#crop = 100 100 200 250
+
 # Downscale frame using a ratio of N. Set to 1 for no downscaling. If unset,
 # applied automatically based on input video resolution. Must be an integer value.
 #downscale = 1
diff --git a/scenedetect/_cli/__init__.py b/scenedetect/_cli/__init__.py
@@ -228,6 +228,14 @@ def _print_command_help(ctx: click.Context, command: click.Command):
     help="Backend to use for video input. Backend options can be set using a config file (-c/--config). [available: %s]%s"
     % (", ".join(AVAILABLE_BACKENDS.keys()), USER_CONFIG.get_help_string("global", "backend")),
 )
+@click.option(
+    "--crop",
+    metavar="X0 Y0 X1 Y1",
+    type=(int, int, int, int),
+    default=None,
+    help="Crop input video. Specified as two points representing top left and bottom right corner of crop region. 0 0 is top-left of the video frame. Bounds are inclusive (e.g. for a 100x100 video, the region covering the whole frame is 0 0 99 99).%s"
+    % (USER_CONFIG.get_help_string("global", "crop", show_default=False)),
+)
 @click.option(
     "--downscale",
     "-d",
@@ -284,6 +292,7 @@ def scenedetect(
     drop_short_scenes: ty.Optional[bool],
     merge_last_scene: ty.Optional[bool],
     backend: ty.Optional[str],
+    crop: ty.Optional[ty.Tuple[int, int, int, int]],
     downscale: ty.Optional[int],
     frame_skip: ty.Optional[int],
     verbosity: ty.Optional[str],
@@ -324,12 +333,13 @@ def scenedetect(
         output=output,
         framerate=framerate,
         stats_file=stats,
-        downscale=downscale,
         frame_skip=frame_skip,
         min_scene_len=min_scene_len,
         drop_short_scenes=drop_short_scenes,
         merge_last_scene=merge_last_scene,
         backend=backend,
+        crop=crop,
+        downscale=downscale,
         quiet=quiet,
         logfile=logfile,
         config=config,
diff --git a/scenedetect/_cli/config.py b/scenedetect/_cli/config.py
@@ -313,6 +313,12 @@ def format(self, timecode: FrameTimecode) -> str:
     },
     "global": {
         "backend": "opencv",
+        #
+        #
+        # FIXME: This should be a tuple of 4 valid ints similar to ScoreWeightsValue.
+        #
+        #
+        "crop": None,
         "default-detector": "detect-adaptive",
         "downscale": 0,
         "downscale-method": Interpolation.LINEAR,
diff --git a/scenedetect/_cli/context.py b/scenedetect/_cli/context.py
@@ -157,12 +157,13 @@ def handle_options(
         output: ty.Optional[ty.AnyStr],
         framerate: float,
         stats_file: ty.Optional[ty.AnyStr],
-        downscale: ty.Optional[int],
         frame_skip: int,
         min_scene_len: str,
         drop_short_scenes: ty.Optional[bool],
         merge_last_scene: ty.Optional[bool],
         backend: ty.Optional[str],
+        crop: ty.Optional[ty.Tuple[int, int, int, int]],
+        downscale: ty.Optional[int],
         quiet: bool,
         logfile: ty.Optional[ty.AnyStr],
         config: ty.Optional[ty.AnyStr],
@@ -287,6 +288,7 @@ def handle_options(
                 logger.debug(str(ex))
                 raise click.BadParameter(str(ex), param_hint="downscale factor") from None
         scene_manager.interpolation = self.config.get_value("global", "downscale-method")
+        scene_manager.crop = self.config.get_value("global", "crop", crop)
 
         self.scene_manager = scene_manager
 
@@ -545,7 +547,12 @@ def _open_video_stream(
                     framerate=framerate,
                     backend=backend,
                 )
-            logger.debug("Video opened using backend %s", type(self.video_stream).__name__)
+            logger.debug(f"""Video information:
+  Backend:      {type(self.video_stream).__name__}
+  Resolution:   {self.video_stream.frame_size}
+  Framerate:    {self.video_stream.frame_rate}
+  Duration:     {self.video_stream.duration} ({self.video_stream.duration.frame_num} frames)""")
+
         except FrameRateUnavailable as ex:
             raise click.BadParameter(
                 "Failed to obtain framerate for input video. Manually specify framerate with the"
diff --git a/scenedetect/scene_manager.py b/scenedetect/scene_manager.py
@@ -112,6 +112,11 @@ def on_new_scene(frame_img: numpy.ndarray, frame_num: int):
 CutList = List[FrameTimecode]
 """Type hint for a list of cuts, where each timecode represents the first frame of a new shot."""
 
+CropRegion = Tuple[int, int, int, int]
+"""Type hint for rectangle of the form X0 Y0 X1 Y1 for cropping frames. Coordinates are relative
+to the source frame, before any downscaling is applied.
+"""
+
 # TODO: This value can and should be tuned for performance improvements as much as possible,
 # until accuracy falls, on a large enough dataset. This has yet to be done, but the current
 # value doesn't seem to have caused any issues at least.
@@ -143,7 +148,7 @@ class Interpolation(Enum):
     """Lanczos interpolation over 8x8 neighborhood."""
 
 
-def compute_downscale_factor(frame_width: int, effective_width: int = DEFAULT_MIN_WIDTH) -> int:
+def compute_downscale_factor(frame_width: int, effective_width: int = DEFAULT_MIN_WIDTH) -> float:
     """Get the optimal default downscale factor based on a video's resolution (currently only
     the width in pixels is considered).
 
@@ -157,10 +162,10 @@ def compute_downscale_factor(frame_width: int, effective_width: int = DEFAULT_MI
     Returns:
-        int: The default downscale factor to use to achieve at least the target effective_width.
+        float: The default downscale factor to use to achieve at least the target effective_width.
     """
-    assert not (frame_width < 1 or effective_width < 1)
+    assert frame_width > 0 and effective_width > 0
     if frame_width < effective_width:
         return 1
-    return frame_width // effective_width
+    return frame_width / float(effective_width)
 
 
 def get_scenes_from_cuts(
@@ -651,6 +656,7 @@ def __init__(
 
         self._frame_buffer = []
         self._frame_buffer_size = 0
+        self._crop = None
 
     @property
     def interpolation(self) -> Interpolation:
@@ -666,6 +672,35 @@ def stats_manager(self) -> Optional[StatsManager]:
         """Getter for the StatsManager associated with this SceneManager, if any."""
         return self._stats_manager
 
+    @property
+    def crop(self) -> Optional[CropRegion]:
+        """Portion of the frame to crop. Tuple of 4 ints in the form (X0, Y0, X1, Y1) where X0, Y0
+        and X1, Y1 are opposite corners of a rectangle inside the frame. Coordinates start from 0
+        and are inclusive. For example, with a 100x100 pixel video, (0, 0, 99, 99) covers the
+        entire frame. Returns None when no crop is set."""
+        if self._crop is None:
+            return None
+        (x0, y0, x1, y1) = self._crop
+        return (x0, y0, x1 - 1, y1 - 1)
+
+    @crop.setter
+    def crop(self, value: CropRegion):
+        """Raises:
+        TypeError: Value is not None or a sequence of exactly 4 ints.
+        ValueError: Any coordinate is negative."""
+        if value is None:
+            self._crop = None
+            return
+        if not (len(value) == 4 and all(isinstance(v, int) for v in value)):
+            raise TypeError("crop region must be tuple of 4 ints")
+        # Negative coordinates can never refer to a pixel inside the frame, so reject them.
+        if any(coordinate < 0 for coordinate in value):
+            raise ValueError("crop coordinates must be >= 0")
+        (x0, y0, x1, y1) = value
+        # Internally the region is stored in the half-open form used to slice the frame array, so
+        # the end coordinates are one past the last included pixel.
+        self._crop = (x0, y0, x1 + 1, y1 + 1)
+
     @property
     def downscale(self) -> int:
         """Factor to downscale each frame by. Will always be >= 1, where 1
@@ -892,6 +927,33 @@ def detect_scenes(
         if end_time is not None and isinstance(end_time, (int, float)) and end_time < 0:
             raise ValueError("end_time must be greater than or equal to 0!")
 
+        effective_frame_size = video.frame_size
+        if self._crop:
+            logger.debug(f"Crop set: {self.crop}")
+            x0, y0, x1, y1 = self._crop
+            min_x, min_y = (min(x0, x1), min(y0, y1))
+            max_x, max_y = (max(x0, x1), max(y0, y1))
+            frame_width, frame_height = video.frame_size
+            if min_x >= frame_width or min_y >= frame_height:
+                raise ValueError("crop starts outside video boundary")
+            if max_x >= frame_width or max_y >= frame_height:
+                logger.warning("Warning: crop ends outside of video boundary.")
+            effective_frame_size = (
+                1 + min(max_x, frame_width) - min_x,
+                1 + min(max_y, frame_height) - min_y,
+            )
+        # Calculate downscale factor and log effective resolution.
+        if self.auto_downscale:
+            downscale_factor = compute_downscale_factor(max(effective_frame_size))
+        else:
+            downscale_factor = self.downscale
+        logger.debug(
+            "Processing resolution: %d x %d, downscale: %1.1f",
+            int(effective_frame_size[0] / downscale_factor),
+            int(effective_frame_size[1] / downscale_factor),
+            downscale_factor,
+        )
+
         self._base_timecode = video.base_timecode
 
         # TODO: Figure out a better solution for communicating framerate to StatsManager.
@@ -911,19 +973,6 @@ def detect_scenes(
             else:
                 total_frames = video.duration.get_frames() - start_frame_num
 
-        # Calculate the desired downscale factor and log the effective resolution.
-        if self.auto_downscale:
-            downscale_factor = compute_downscale_factor(frame_width=video.frame_size[0])
-        else:
-            downscale_factor = self.downscale
-        if downscale_factor > 1:
-            logger.info(
-                "Downscale factor set to %d, effective resolution: %d x %d",
-                downscale_factor,
-                video.frame_size[0] // downscale_factor,
-                video.frame_size[1] // downscale_factor,
-            )
-
         progress_bar = None
         if show_progress:
             progress_bar = tqdm(
@@ -980,7 +1029,7 @@ def _decode_thread(
         self,
         video: VideoStream,
         frame_skip: int,
-        downscale_factor: int,
+        downscale_factor: float,
         end_time: FrameTimecode,
         out_queue: queue.Queue,
     ):
@@ -1021,12 +1070,16 @@ def _decode_thread(
                         # Skip processing frames that have an incorrect size.
                         continue
 
-                    if downscale_factor > 1:
+                    if self._crop:
+                        (x0, y0, x1, y1) = self._crop
+                        frame_im = frame_im[y0:y1, x0:x1]
+
+                    if downscale_factor > 1.0:
                         frame_im = cv2.resize(
                             frame_im,
                             (
-                                round(frame_im.shape[1] / downscale_factor),
-                                round(frame_im.shape[0] / downscale_factor),
+                                max(1, round(frame_im.shape[1] / downscale_factor)),
+                                max(1, round(frame_im.shape[0] / downscale_factor)),
                             ),
                             interpolation=self._interpolation.value,
                         )
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -115,6 +115,11 @@ def test_cli_default_detector():
     assert invoke_scenedetect("-i {VIDEO} time {TIME}", config_file=None) == 0
 
 
+def test_cli_crop():
+    """Test --crop functionality."""
+    assert invoke_scenedetect("-i {VIDEO} --crop 0 0 256 256 time {TIME}", config_file=None) == 0
+
+
 @pytest.mark.parametrize("info_command", ["help", "about", "version"])
 def test_cli_info_command(info_command):
     """Test `scenedetect` info commands (e.g. help, about)."""
diff --git a/tests/test_scene_manager.py b/tests/test_scene_manager.py
@@ -20,6 +20,8 @@
 import os.path
 from typing import List
 
+import pytest
+
 from scenedetect.backends.opencv import VideoStreamCv2
 from scenedetect.detectors import AdaptiveDetector, ContentDetector
 from scenedetect.frame_timecode import FrameTimecode
@@ -255,3 +257,36 @@ def test_detect_scenes_callback_adaptive(test_video_file):
     scene_list = sm.get_scene_list()
     assert [start for start, end in scene_list] == TEST_VIDEO_START_FRAMES_ACTUAL
     assert fake_callback.scene_list == TEST_VIDEO_START_FRAMES_ACTUAL[1:]
+
+
+def test_detect_scenes_crop(test_video_file):
+    video = VideoStreamCv2(test_video_file)
+    sm = SceneManager()
+    sm.crop = (10, 10, 1900, 1000)
+    sm.add_detector(ContentDetector())
+
+    video_fps = video.frame_rate
+    start_time = FrameTimecode("00:00:05", video_fps)
+    end_time = FrameTimecode("00:00:15", video_fps)
+    video.seek(start_time)
+    sm.auto_downscale = True
+
+    _ = sm.detect_scenes(video=video, end_time=end_time)
+    scene_list = sm.get_scene_list()
+    assert [start for start, _ in scene_list] == TEST_VIDEO_START_FRAMES_ACTUAL
+
+
+def test_crop_invalid():
+    sm = SceneManager()
+    sm.crop = None
+    sm.crop = (0, 0, 0, 0)
+    sm.crop = (1, 1, 0, 0)
+    sm.crop = (0, 0, 1, 1)
+    with pytest.raises(TypeError):
+        sm.crop = 1
+    with pytest.raises(TypeError):
+        sm.crop = (1, 1)
+    with pytest.raises(TypeError):
+        sm.crop = (1, 1, 1)
+    with pytest.raises(ValueError):
+        sm.crop = (1, 1, 1, -1)
diff --git a/website/pages/changelog.md b/website/pages/changelog.md
@@ -588,3 +588,6 @@ Development
  - [bugfix] Fix `ContentDetector` crash when using callbacks [#416](https://github.com/Breakthrough/PySceneDetect/issues/416) [#420](https://github.com/Breakthrough/PySceneDetect/issues/420)
  - [general] Timecodes of the form `MM:SS[.nnn]` are now processed correctly [#443](https://github.com/Breakthrough/PySceneDetect/issues/443)
  - [api] The `save_to_csv` function now works correctly with paths from the `pathlib` module
+ - [feature] Add ability to crop input video before processing [#302](https://github.com/Breakthrough/PySceneDetect/issues/302) [#449](https://github.com/Breakthrough/PySceneDetect/issues/449)
+     - [cli] Add `--crop` option to `scenedetect` command and config file to crop video frames before scene detection
+     - [api] Add `crop` property to `SceneManager` to crop video frames before scene detection