Skip to content

ImageLocator / Screen

Image-based element finding and full-screen operations. Requires the optional vision extra: `pip install dolphin-desktop[vision]`.


ImageLocator

ImageLocator

Locate a template image on screen using OpenCV template matching.

Parameters:

Name Type Description Default
template str | Path

Path to the template PNG/BMP/JPG image.

required
threshold float

Minimum match confidence (0-1, default 0.85).

0.85
scales list[float] | None

List of scale factors to try during multi-scale matching (e.g. [0.8, 1.0, 1.2]). When None only scale 1.0 is used.

None
region tuple[int, int, int, int] | None

Default bounding box (left, top, right, bottom) in screen coordinates to restrict the search area. Each method's own region parameter overrides this default.

None
Source code in src\dolphin_desktop\_image.py
class ImageLocator:
    """Locate a template image on screen using OpenCV template matching.

    Parameters
    ----------
    template:
        Path to the template PNG/BMP/JPG image.
    threshold:
        Minimum match confidence (0-1, default 0.85).
    scales:
        List of scale factors to try during multi-scale matching
        (e.g. ``[0.8, 1.0, 1.2]``).  When ``None`` or empty only scale 1.0
        is used.
    region:
        Default bounding box ``(left, top, right, bottom)`` in screen
        coordinates to restrict the search area.  Each method's own *region*
        parameter overrides this default.
    """

    # Seconds between two consecutive searches in ``wait_for``.
    _POLL_INTERVAL = 0.5

    def __init__(
        self,
        template: str | Path,
        threshold: float = 0.85,
        *,
        scales: list[float] | None = None,
        region: tuple[int, int, int, int] | None = None,
    ) -> None:
        self._template_path = Path(template)
        self._threshold = threshold
        # Copy the caller's list so later mutation cannot change this locator,
        # and fall back to scale 1.0 for an empty list: with no scales at all
        # find() could never match and find_all() would raise IndexError.
        chosen = list(scales) if scales is not None else []
        self._scales: list[float] = chosen or [1.0]
        self._region = region

    def _load_template(self):  # type: ignore[return]
        """Load the template image as a cv2 array.

        Raises
        ------
        FileNotFoundError
            If ``cv2.imread`` returns ``None`` for the template path.
        """
        cv2 = _require_cv2()
        tmpl = cv2.imread(str(self._template_path))
        if tmpl is None:
            raise FileNotFoundError(f"Template image not found: {self._template_path}")
        return tmpl

    def _match(
        self,
        screen,
        tmpl,
        scale: float,
    ) -> tuple[tuple[int, int] | None, int, int, float]:
        """Try matching *tmpl* at *scale* against *screen*.

        Returns ``(match_loc, tw, th, score)`` or ``(None, tw, th, score)``.
        *match_loc* is the top-left corner of the match in the (possibly cropped) screen;
        *tw*/*th* are the width/height of the scaled template.
        """
        cv2 = _require_cv2()
        if scale != 1.0:
            h, w = tmpl.shape[:2]
            # Never let a small scale collapse a dimension to zero pixels.
            new_w = max(1, int(w * scale))
            new_h = max(1, int(h * scale))
            tmpl_s = cv2.resize(tmpl, (new_w, new_h))
        else:
            tmpl_s = tmpl

        th, tw = tmpl_s.shape[:2]
        # The template cannot be matched inside an image smaller than itself.
        if th > screen.shape[0] or tw > screen.shape[1]:
            return None, tw, th, 0.0

        if _is_low_variance_image(screen) or _is_low_variance_image(tmpl_s):
            # Low-variance images use the squared-difference metric, where the
            # best match is the minimum; invert it so higher is always better.
            result = cv2.matchTemplate(screen, tmpl_s, cv2.TM_SQDIFF_NORMED)
            min_val, _, loc, _ = cv2.minMaxLoc(result)
            score = 1.0 - min_val
        else:
            result = cv2.matchTemplate(screen, tmpl_s, cv2.TM_CCOEFF_NORMED)
            _, score, _, loc = cv2.minMaxLoc(result)

        if score < self._threshold:
            return None, tw, th, score
        return loc, tw, th, score

    def _find_impl(
        self,
        effective_region: tuple[int, int, int, int] | None,
    ) -> tuple[int, int, int, int] | None:
        """Return ``(cx, cy, tw, th)`` of the best match or None.

        Every configured scale is tried and the highest-scoring match wins.
        The returned centre is in screen coordinates, i.e. the origin of
        *effective_region* has already been added back.
        """
        tmpl = self._load_template()
        screen = _pil_to_cv(_grab(effective_region))
        rx, ry = (effective_region[0], effective_region[1]) if effective_region else (0, 0)

        best_val = -1.0
        best: tuple[int, int, int, int] | None = None

        for scale in self._scales:
            loc, tw, th, val = self._match(screen, tmpl, scale)
            if loc is not None and val > best_val:
                best_val = val
                cx = loc[0] + tw // 2 + rx
                cy = loc[1] + th // 2 + ry
                best = (cx, cy, tw, th)

        return best

    def find(
        self,
        region: tuple[int, int, int, int] | None = None,
    ) -> tuple[int, int] | None:
        """Return ``(cx, cy)`` of the best template match, or None."""
        effective = region if region is not None else self._region
        result = self._find_impl(effective)
        return (result[0], result[1]) if result is not None else None

    def find_with_size(
        self,
        region: tuple[int, int, int, int] | None = None,
    ) -> tuple[int, int, int, int] | None:
        """Return ``(cx, cy, template_w, template_h)`` or None.

        Used internally by the failover chain so that ``_ImageElement``
        has the correct bounding box.
        """
        effective = region if region is not None else self._region
        return self._find_impl(effective)

    def find_all(
        self,
        region: tuple[int, int, int, int] | None = None,
    ) -> list[tuple[int, int]]:
        """Return the centres of all matches scoring at or above the threshold.

        Only the first configured scale is used.  Every pixel position whose
        score passes the threshold is reported, so one on-screen occurrence
        may yield several adjacent points.  Coordinates are plain Python ints.
        """
        cv2 = _require_cv2()
        import numpy as np  # type: ignore[import-untyped]

        effective = region if region is not None else self._region
        tmpl = self._load_template()
        screen = _pil_to_cv(_grab(effective))
        rx, ry = (effective[0], effective[1]) if effective else (0, 0)

        # For find_all use only scale 1.0 or the first scale (multi-scale deduplication
        # across scales is complex and rarely needed for enumeration).
        scale = self._scales[0]
        if scale != 1.0:
            h, w = tmpl.shape[:2]
            tmpl = cv2.resize(tmpl, (max(1, int(w * scale)), max(1, int(h * scale))))

        th, tw = tmpl.shape[:2]
        if th > screen.shape[0] or tw > screen.shape[1]:
            return []

        if _is_low_variance_image(screen) or _is_low_variance_image(tmpl):
            result = cv2.matchTemplate(screen, tmpl, cv2.TM_SQDIFF_NORMED)
            scores = 1.0 - result  # invert so that higher means better
        else:
            scores = cv2.matchTemplate(screen, tmpl, cv2.TM_CCOEFF_NORMED)

        # np.where yields (row indices, column indices), i.e. (ys, xs).
        ys, xs = np.where(scores >= self._threshold)
        half_w, half_h = tw // 2, th // 2
        # int() strips the numpy scalar type so callers get real Python ints.
        return [
            (int(x) + half_w + rx, int(y) + half_h + ry)
            for x, y in zip(xs, ys, strict=False)
        ]

    def _locate_or_raise(
        self,
        region: tuple[int, int, int, int] | None,
    ) -> tuple[int, int]:
        """Return the best match centre, raising RuntimeError when nothing matches."""
        pt = self.find(region)
        if pt is None:
            raise RuntimeError(
                f"Template {self._template_path} not found on screen (threshold={self._threshold})"
            )
        return pt

    def click(self, region: tuple[int, int, int, int] | None = None) -> None:
        """Click the template match centre.

        Raises
        ------
        RuntimeError
            If the template is not found.
        """
        import pywinauto.mouse as _mouse  # type: ignore[import-untyped]

        _mouse.click(coords=self._locate_or_raise(region))

    def double_click(self, region: tuple[int, int, int, int] | None = None) -> None:
        """Double-click the template match centre.

        Raises
        ------
        RuntimeError
            If the template is not found.
        """
        import pywinauto.mouse as _mouse  # type: ignore[import-untyped]

        _mouse.double_click(coords=self._locate_or_raise(region))

    def wait_for(
        self,
        timeout: float = 10.0,
        region: tuple[int, int, int, int] | None = None,
    ) -> tuple[int, int]:
        """Poll every 0.5 s until the template appears; raise on timeout.

        The screen is always searched at least once, even when *timeout* is
        zero or negative, and the wait never sleeps past the deadline.

        Raises
        ------
        RuntimeError
            If the template has not appeared once *timeout* seconds elapsed.
        """
        deadline = time.monotonic() + timeout
        while True:
            pt = self.find(region)
            if pt is not None:
                return pt
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Cap the sleep so the final search happens at the deadline
            # instead of up to one poll interval after it.
            time.sleep(min(self._POLL_INTERVAL, remaining))
        raise RuntimeError(f"Template {self._template_path} not found within {timeout}s")

    def exists(
        self,
        timeout: float = 0.0,
        region: tuple[int, int, int, int] | None = None,
    ) -> bool:
        """Return True if the template is found within *timeout* seconds.

        With a non-positive *timeout* the screen is searched exactly once.
        """
        if timeout <= 0:
            return self.find(region) is not None
        try:
            self.wait_for(timeout=timeout, region=region)
            return True
        except RuntimeError:
            return False

    def as_element(
        self,
        region: tuple[int, int, int, int] | None = None,
    ) -> _ImageElement | None:
        """Find the template and return an _ImageElement proxy, or None."""
        result = self.find_with_size(region)
        if result is None:
            return None
        cx, cy, tw, th = result
        return _ImageElement(cx, cy, tw, th)

find

find(
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int] | None

Return (cx, cy) of the best template match, or None.

Source code in src\dolphin_desktop\_image.py
def find(
    self,
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int] | None:
    """Return ``(cx, cy)`` of the best template match, or None."""
    effective = region if region is not None else self._region
    result = self._find_impl(effective)
    return (result[0], result[1]) if result is not None else None

find_with_size

find_with_size(
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int, int, int] | None

Return (cx, cy, template_w, template_h) or None.

Used internally by the failover chain so that _ImageElement has the correct bounding box.

Source code in src\dolphin_desktop\_image.py
def find_with_size(
    self,
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int, int, int] | None:
    """Return ``(cx, cy, template_w, template_h)`` or None.

    Used internally by the failover chain so that ``_ImageElement``
    has the correct bounding box.
    """
    effective = region if region is not None else self._region
    return self._find_impl(effective)

find_all

find_all(
    region: tuple[int, int, int, int] | None = None,
) -> list[tuple[int, int]]

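Return the centres of all matches scoring at or above the threshold. Only the first configured scale is used; the remaining scales are not searched.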

Source code in src\dolphin_desktop\_image.py
def find_all(
    self,
    region: tuple[int, int, int, int] | None = None,
) -> list[tuple[int, int]]:
    """Return the centres of all matches scoring at or above the threshold.

    Only the first configured scale is used.  Every pixel position whose
    score passes the threshold is reported, so one on-screen occurrence may
    yield several adjacent points.  Coordinates are plain Python ints in
    screen space (the region origin is added back).
    """
    cv2 = _require_cv2()
    import numpy as np  # type: ignore[import-untyped]

    effective = region if region is not None else self._region
    tmpl = self._load_template()
    screen = _pil_to_cv(_grab(effective))
    rx, ry = (effective[0], effective[1]) if effective else (0, 0)

    # For find_all use only scale 1.0 or the first scale (multi-scale deduplication
    # across scales is complex and rarely needed for enumeration).  An empty
    # scale list falls back to 1.0 instead of raising IndexError.
    scale = self._scales[0] if self._scales else 1.0
    if scale != 1.0:
        h, w = tmpl.shape[:2]
        tmpl = cv2.resize(tmpl, (max(1, int(w * scale)), max(1, int(h * scale))))

    th, tw = tmpl.shape[:2]
    # The template cannot be matched inside an image smaller than itself.
    if th > screen.shape[0] or tw > screen.shape[1]:
        return []

    if _is_low_variance_image(screen) or _is_low_variance_image(tmpl):
        result = cv2.matchTemplate(screen, tmpl, cv2.TM_SQDIFF_NORMED)
        scores = 1.0 - result  # invert so that higher means better
    else:
        scores = cv2.matchTemplate(screen, tmpl, cv2.TM_CCOEFF_NORMED)

    # np.where yields (row indices, column indices), i.e. (ys, xs).
    ys, xs = np.where(scores >= self._threshold)
    half_w, half_h = tw // 2, th // 2
    # int() strips the numpy scalar type so callers get real Python ints,
    # matching the annotated return type.
    return [
        (int(x) + half_w + rx, int(y) + half_h + ry)
        for x, y in zip(xs, ys, strict=False)
    ]

click

click(
    region: tuple[int, int, int, int] | None = None,
) -> None

Click the template match centre.

Source code in src\dolphin_desktop\_image.py
def click(self, region: tuple[int, int, int, int] | None = None) -> None:
    """Click the centre of the best template match.

    Raises RuntimeError when the template cannot be found.
    """
    import pywinauto.mouse as _mouse  # type: ignore[import-untyped]

    centre = self.find(region)
    if centre is not None:
        _mouse.click(coords=centre)
        return
    raise RuntimeError(
        f"Template {self._template_path} not found on screen (threshold={self._threshold})"
    )

double_click

double_click(
    region: tuple[int, int, int, int] | None = None,
) -> None

Double-click the template match centre.

Source code in src\dolphin_desktop\_image.py
def double_click(self, region: tuple[int, int, int, int] | None = None) -> None:
    """Double-click the centre of the best template match.

    Raises RuntimeError when the template cannot be found.
    """
    import pywinauto.mouse as _mouse  # type: ignore[import-untyped]

    centre = self.find(region)
    if centre is not None:
        _mouse.double_click(coords=centre)
        return
    raise RuntimeError(
        f"Template {self._template_path} not found on screen (threshold={self._threshold})"
    )

wait_for

wait_for(
    timeout: float = 10.0,
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int]

Poll every 0.5 s until the template appears; raise on timeout.

Source code in src\dolphin_desktop\_image.py
def wait_for(
    self,
    timeout: float = 10.0,
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int]:
    """Poll every 0.5 s until the template appears; raise on timeout."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        pt = self.find(region)
        if pt is not None:
            return pt
        time.sleep(0.5)
    raise RuntimeError(f"Template {self._template_path} not found within {timeout}s")

exists

exists(
    timeout: float = 0.0,
    region: tuple[int, int, int, int] | None = None,
) -> bool

Return True if the template is found within timeout seconds.

Source code in src\dolphin_desktop\_image.py
def exists(
    self,
    timeout: float = 0.0,
    region: tuple[int, int, int, int] | None = None,
) -> bool:
    """Return True if the template is found within *timeout* seconds."""
    if timeout <= 0:
        return self.find(region) is not None
    try:
        self.wait_for(timeout=timeout, region=region)
        return True
    except RuntimeError:
        return False

as_element

as_element(
    region: tuple[int, int, int, int] | None = None,
) -> _ImageElement | None

Find the template and return an _ImageElement proxy, or None.

Source code in src\dolphin_desktop\_image.py
def as_element(
    self,
    region: tuple[int, int, int, int] | None = None,
) -> _ImageElement | None:
    """Wrap the best template match in an ``_ImageElement`` proxy.

    Returns None when the template is not found.
    """
    located = self.find_with_size(region)
    if located is not None:
        centre_x, centre_y, width, height = located
        return _ImageElement(centre_x, centre_y, width, height)
    return None

Usage

from dolphin_desktop import ImageLocator

# Match the OK-button template with a stricter threshold than the 0.85 default.
btn = ImageLocator("templates/ok_button.png", threshold=0.9)

btn.exists()              # bool - found on screen right now
btn.wait_for(timeout=10)  # poll every 0.5 s until it appears; RuntimeError on timeout
btn.click()               # click the match centre; RuntimeError if not found
btn.double_click()        # double-click the match centre; RuntimeError if not found

# Find all occurrences - list of (cx, cy) centres, empty when nothing matches
matches = btn.find_all()
for x, y in matches:
    print(f"({x}, {y})")

# Find one - (cx, cy) centre of the best match, or None
match = btn.find()
if match is not None:
    x, y = match
    print(f"({x}, {y})")

Screen

Screen

Static helpers for full-screen capture, colour sampling, and OCR.

Source code in src\dolphin_desktop\_image.py
class Screen:
    """Static helpers for full-screen capture, colour sampling, and OCR."""

    @staticmethod
    def screenshot(region: tuple[int, int, int, int] | None = None):
        """Return a PIL Image of the screen or *region*."""
        return _grab(region)

    @staticmethod
    def pixel_color(x: int, y: int) -> tuple[int, int, int]:
        """Return the (R, G, B) colour at screen coordinate (*x*, *y*)."""
        img = _grab((x, y, x + 1, y + 1))
        return img.getpixel((0, 0))[:3]  # type: ignore[return-value]

    @staticmethod
    def find_image(
        template: str | Path,
        threshold: float = 0.85,
        region: tuple[int, int, int, int] | None = None,
    ) -> tuple[int, int] | None:
        """Convenience wrapper: create an ImageLocator and call find()."""
        return ImageLocator(template, threshold).find(region)

    @staticmethod
    def text(region: tuple[int, int, int, int] | None = None) -> str:
        """Return OCR text of the screen or *region* via pytesseract."""
        tess = _require_tesseract()
        img = _grab(region)
        return tess.image_to_string(img)

    @staticmethod
    def find_text(
        text: str,
        region: tuple[int, int, int, int] | None = None,
    ) -> tuple[int, int] | None:
        """Return the centre (x, y) of the first bounding box containing *text*, or None."""
        tess = _require_tesseract()
        img = _grab(region)
        data = tess.image_to_data(img, output_type=tess.Output.DICT)

        for i, word in enumerate(data["text"]):
            if text.lower() in str(word).lower():
                x = data["left"][i]
                y = data["top"][i]
                w = data["width"][i]
                h = data["height"][i]
                cx = x + w // 2
                cy = y + h // 2
                if region:
                    cx += region[0]
                    cy += region[1]
                return cx, cy
        return None

screenshot staticmethod

screenshot(region: tuple[int, int, int, int] | None = None)

Return a PIL Image of the screen or region.

Source code in src\dolphin_desktop\_image.py
@staticmethod
def screenshot(region: tuple[int, int, int, int] | None = None):
    """Capture the whole screen, or just *region*, as a PIL Image."""
    captured = _grab(region)
    return captured

pixel_color staticmethod

pixel_color(x: int, y: int) -> tuple[int, int, int]

Return the (R, G, B) colour at screen coordinate (x, y).

Source code in src\dolphin_desktop\_image.py
@staticmethod
def pixel_color(x: int, y: int) -> tuple[int, int, int]:
    """Sample the screen at (*x*, *y*) and return its (R, G, B) colour."""
    # A 1x1 capture box keeps the grab limited to the requested pixel.
    single_pixel = _grab((x, y, x + 1, y + 1))
    colour = single_pixel.getpixel((0, 0))
    # Keep only the first three channels (drops alpha when present).
    return colour[:3]  # type: ignore[return-value]

find_image staticmethod

find_image(
    template: str | Path,
    threshold: float = 0.85,
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int] | None

Convenience wrapper: create an ImageLocator and call find().

Source code in src\dolphin_desktop\_image.py
@staticmethod
def find_image(
    template: str | Path,
    threshold: float = 0.85,
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int] | None:
    """Shortcut for building an ImageLocator and calling its find().

    Returns the ``(cx, cy)`` centre of the best match, or None.
    """
    locator = ImageLocator(template, threshold)
    return locator.find(region)

text staticmethod

text(
    region: tuple[int, int, int, int] | None = None,
) -> str

Return OCR text of the screen or region via pytesseract.

Source code in src\dolphin_desktop\_image.py
@staticmethod
def text(region: tuple[int, int, int, int] | None = None) -> str:
    """Run pytesseract OCR over the screen (or *region*) and return the text."""
    ocr = _require_tesseract()
    snapshot = _grab(region)
    return ocr.image_to_string(snapshot)

find_text staticmethod

find_text(
    text: str,
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int] | None

Return the centre (x, y) of the first bounding box containing text, or None.

Source code in src\dolphin_desktop\_image.py
@staticmethod
def find_text(
    text: str,
    region: tuple[int, int, int, int] | None = None,
) -> tuple[int, int] | None:
    """Return the centre (x, y) of the first bounding box containing *text*, or None."""
    tess = _require_tesseract()
    img = _grab(region)
    data = tess.image_to_data(img, output_type=tess.Output.DICT)

    for i, word in enumerate(data["text"]):
        if text.lower() in str(word).lower():
            x = data["left"][i]
            y = data["top"][i]
            w = data["width"][i]
            h = data["height"][i]
            cx = x + w // 2
            cy = y + h // 2
            if region:
                cx += region[0]
                cy += region[1]
            return cx, cy
    return None

Usage

from dolphin_desktop import Screen

# Full screenshot - returns a PIL Image
img = Screen.screenshot()
img.save("screen.png")

# Region screenshot (left, top, right, bottom)
img = Screen.screenshot(region=(0, 0, 800, 600))

# Pixel colour - (R, G, B) at screen coordinate (100, 200)
r, g, b = Screen.pixel_color(100, 200)

# OCR - all visible text, or only the text inside a region
text = Screen.text()
text = Screen.text(region=(0, 50, 1920, 100))

# Find text - returns (cx, cy) of first matching word, or None.
# Matching is a case-insensitive substring test.
point = Screen.find_text("Submit")
if point is not None:
    x, y = point
    from dolphin_desktop import Mouse
    Mouse.click(x, y)

# Shortcut - same as ImageLocator("templates/btn.png").find()
match = Screen.find_image("templates/btn.png")