From ed6137c8ae088bd6ebdee5bea824df3d1a965bce Mon Sep 17 00:00:00 2001 From: Jens <1742418+1cu@users.noreply.github.com> Date: Sun, 1 Mar 2026 21:00:34 +0100 Subject: [PATCH] fix: use native page xpath api for xpath selectors (#853) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## ℹ️ Description *Provide a concise summary of the changes introduced in this pull request.* - Link to the related issue(s): n/a - Describe the motivation and context for this change. This replaces the stacked XPath work from #845 with a standalone fix from `main`. It makes `By.XPATH` use the native page XPath API instead of routing XPath selectors through text lookup. ## 📋 Changes Summary - Add private XPath helpers in `WebScrapingMixin` for first-match and all-match lookups. - Route `By.XPATH` in `_web_find_once()` and `_web_find_all_once()` through `page.xpath(...)`. - Add unit coverage for XPath helper behavior, empty results, and unsupported parent scoping. - No configuration changes or new dependencies. ### ⚙️ Type of Change Select the type(s) of change(s) included in this pull request: - [x] 🐞 Bug fix (non-breaking change which fixes an issue) - [ ] ✨ New feature (adds new functionality without breaking existing usage) - [ ] 💥 Breaking change (changes that might break existing user setups, scripts, or configurations) ## ✅ Checklist Before requesting a review, confirm the following: - [x] I have reviewed my changes to ensure they meet the project's standards. - [x] I have tested my changes and ensured that all tests pass (`pdm run test`). - [x] I have formatted the code (`pdm run format`). - [x] I have verified that linting passes (`pdm run lint`). - [x] I have updated documentation where necessary. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. 
## Summary by CodeRabbit * **Refactoring** * Improved web scraping element selection reliability through streamlined XPath operations and better internal helper methods. * **Tests** * Added comprehensive unit tests for XPath-based element lookup operations to ensure consistent behavior. --- .../utils/web_scraping_mixin.py | 15 ++++++- tests/unit/test_web_scraping_mixin.py | 45 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/src/kleinanzeigen_bot/utils/web_scraping_mixin.py b/src/kleinanzeigen_bot/utils/web_scraping_mixin.py index 685cf66..a910f43 100644 --- a/src/kleinanzeigen_bot/utils/web_scraping_mixin.py +++ b/src/kleinanzeigen_bot/utils/web_scraping_mixin.py @@ -1009,6 +1009,17 @@ class WebScrapingMixin: # Return primitive values as-is return data + async def _xpath_first(self, selector_value:str) -> Element | None: + matches = await self.page.xpath(selector_value, timeout = 0) + for match in matches: + if match is not None: + return cast(Element, match) + return None + + async def _xpath_all(self, selector_value:str) -> list[Element]: + matches = await self.page.xpath(selector_value, timeout = 0) + return [cast(Element, match) for match in matches if match is not None] + async def web_find(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> Element: """ Locates an HTML element by the given selector type and value. 
@@ -1084,7 +1095,7 @@ class WebScrapingMixin: case By.XPATH: ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}") return await self.web_await( - lambda: self.page.find_element_by_text(selector_value, best_match = True), + lambda: self._xpath_first(selector_value), timeout = timeout, timeout_error_message = f"No HTML element found using XPath '{selector_value}'{timeout_suffix}", apply_multiplier = False, @@ -1129,7 +1140,7 @@ class WebScrapingMixin: case By.XPATH: ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}") return await self.web_await( - lambda: self.page.find_elements_by_text(selector_value), + lambda: self._xpath_all(selector_value), timeout = timeout, timeout_error_message = f"No HTML elements found using XPath '{selector_value}'{timeout_suffix}", apply_multiplier = False, diff --git a/tests/unit/test_web_scraping_mixin.py b/tests/unit/test_web_scraping_mixin.py index b6fbf17..bb8995d 100644 --- a/tests/unit/test_web_scraping_mixin.py +++ b/tests/unit/test_web_scraping_mixin.py @@ -666,6 +666,51 @@ class TestSelectorTimeoutMessages: assert expected_message == call.kwargs["timeout_error_message"] assert call.kwargs["apply_multiplier"] is False + @pytest.mark.asyncio + async def test_web_find_once_xpath_uses_xpath_api(self, web_scraper:WebScrapingMixin, mock_page:TrulyAwaitableMockPage) -> None: + """By.XPATH should use the native page.xpath API and return first non-null match.""" + xpath_match = MagicMock(spec = Element) + mock_page.xpath = AsyncMock(return_value = [None, xpath_match]) + + result = await web_scraper._web_find_once(By.XPATH, "//div[@id='hero']", 0.2) + + assert result is xpath_match + mock_page.xpath.assert_awaited_once_with("//div[@id='hero']", timeout = 0) + + @pytest.mark.asyncio + async def test_web_find_once_xpath_returns_none_when_no_matches(self, web_scraper:WebScrapingMixin, mock_page:TrulyAwaitableMockPage) -> None: 
+ """_xpath_first should return None when xpath yields no usable match.""" + mock_page.xpath = AsyncMock(return_value = [None, None]) + + result = await web_scraper._xpath_first("//missing") + + assert result is None + mock_page.xpath.assert_awaited_once_with("//missing", timeout = 0) + + @pytest.mark.asyncio + async def test_web_find_all_once_xpath_uses_xpath_api(self, web_scraper:WebScrapingMixin, mock_page:TrulyAwaitableMockPage) -> None: + """By.XPATH should use page.xpath and return all non-null matches.""" + first_match = MagicMock(spec = Element) + second_match = MagicMock(spec = Element) + mock_page.xpath = AsyncMock(return_value = [first_match, None, second_match]) + + result = await web_scraper._web_find_all_once(By.XPATH, "//footer", 0.2) + + assert result == [first_match, second_match] + mock_page.xpath.assert_awaited_once_with("//footer", timeout = 0) + + @pytest.mark.asyncio + async def test_web_find_once_xpath_rejects_parent(self, web_scraper:WebScrapingMixin) -> None: + """XPath lookups currently do not support parent-scoped queries.""" + with pytest.raises(AssertionError, match = "Specifying a parent element currently not supported"): + await web_scraper._web_find_once(By.XPATH, "//div", 0.1, parent = AsyncMock(spec = Element)) + + @pytest.mark.asyncio + async def test_web_find_all_once_xpath_rejects_parent(self, web_scraper:WebScrapingMixin) -> None: + """XPath multi-lookups currently do not support parent-scoped queries.""" + with pytest.raises(AssertionError, match = "Specifying a parent element currently not supported"): + await web_scraper._web_find_all_once(By.XPATH, "//div", 0.1, parent = AsyncMock(spec = Element)) + @pytest.mark.asyncio @pytest.mark.parametrize( ("selector_type", "selector_value", "expected_message"),