From e76abc66e89eb20aad239988a0dfcb5c32bfcf63 Mon Sep 17 00:00:00 2001
From: Jens <1742418+1cu@users.noreply.github.com>
Date: Tue, 28 Oct 2025 15:10:01 +0100
Subject: [PATCH] fix: harden category extraction breadcrumb parsing (#668)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## ℹ️ Description
- Link to the related issue(s): Issue #667
- Harden breadcrumb category extraction so downloads no longer fail when
the breadcrumb structure changes.

## 📋 Changes Summary
- Parse breadcrumb anchors dynamically and fall back with debug logging
when legacy selectors are needed.
- Added unit coverage for multi-anchor, single-anchor, and fallback
scenarios to keep diff coverage above 80%.
- Documented required lint/format/test steps in PR checklist; no new
dependencies.

### ⚙️ Type of Change
- [x] 🐞 Bug fix (non-breaking change which fixes an issue)
- [ ] ✨ New feature (adds new functionality without breaking existing
usage)
- [ ] 💥 Breaking change (changes that might break existing user setups,
scripts, or configurations)

## ✅ Checklist
- [x] I have reviewed my changes to ensure they meet the project's
standards.
- [x] I have tested my changes and ensured that all tests pass (`pdm run
test`).
- [x] I have formatted the code (`pdm run format`).
- [x] I have verified that linting passes (`pdm run lint`).
- [x] I have updated documentation where necessary.

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **Bug Fixes**
* Improved category extraction accuracy with enhanced breadcrumb
parsing.
* Better handling for listings with a single breadcrumb (returns stable
category identifier).
* More resilient fallback when breadcrumb data is missing or malformed.
* Safer normalization of category identifiers to avoid incorrect parsing
across site variations.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---
 src/kleinanzeigen_bot/extract.py              | 39 +++++++++++++--
 .../resources/translations.de.yaml            |  4 ++
 tests/unit/test_extract.py                    | 50 +++++++++++++------
 3 files changed, 73 insertions(+), 20 deletions(-)

diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py
index 0afe96e..f59c105 100644
--- a/src/kleinanzeigen_bot/extract.py
+++ b/src/kleinanzeigen_bot/extract.py
@@ -1,7 +1,9 @@
 # SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
-import json, mimetypes, os, shutil  # isort: skip
+from gettext import gettext as _
+
+import json, mimetypes, os, re, shutil  # isort: skip
 import urllib.request as urllib_request
 from datetime import datetime
 from typing import Any, Final
@@ -19,6 +21,9 @@ __all__ = [
 
 LOG:Final[loggers.Logger] = loggers.get_logger(__name__)
 
+_BREADCRUMB_MIN_DEPTH:Final[int] = 2
+BREADCRUMB_RE = re.compile(r"/c(\d+)")
+
 
 class AdExtractor(WebScrapingMixin):
     """
@@ -402,13 +407,39 @@ class AdExtractor(WebScrapingMixin):
 
         :return: a category string of form abc/def, where a-f are digits
         """
-        category_line = await self.web_find(By.ID, "vap-brdcrmb")
+        try:
+            category_line = await self.web_find(By.ID, "vap-brdcrmb")
+        except TimeoutError as exc:
+            LOG.warning("Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s", exc)
+            raise
+        try:
+            breadcrumb_links = await self.web_find_all(By.CSS_SELECTOR, "a", parent = category_line)
+        except TimeoutError:
+            breadcrumb_links = []
+
+        category_ids:list[str] = []
+        for link in breadcrumb_links:
+            href = str(link.attrs.get("href", "") or "")
+            matches = BREADCRUMB_RE.findall(href)
+            if matches:
+                category_ids.extend(matches)
+
+        # Use the deepest two breadcrumb category codes when available.
+        if len(category_ids) >= _BREADCRUMB_MIN_DEPTH:
+            return f"{category_ids[-2]}/{category_ids[-1]}"
+        if len(category_ids) == 1:
+            return f"{category_ids[0]}/{category_ids[0]}"
+
+        # Fallback to legacy selectors in case the breadcrumb structure is unexpected.
+        LOG.debug(_("Falling back to legacy breadcrumb selectors; collected ids: %s"), category_ids)
         category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
         category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
         href_first:str = str(category_first_part.attrs["href"])
         href_second:str = str(category_second_part.attrs["href"])
-        cat_num_first = href_first.rsplit("/", maxsplit = 1)[-1][1:]
-        cat_num_second = href_second.rsplit("/", maxsplit = 1)[-1][1:]
+        cat_num_first_raw = href_first.rsplit("/", maxsplit = 1)[-1]
+        cat_num_second_raw = href_second.rsplit("/", maxsplit = 1)[-1]
+        cat_num_first = cat_num_first_raw[1:] if cat_num_first_raw.startswith("c") else cat_num_first_raw
+        cat_num_second = cat_num_second_raw[1:] if cat_num_second_raw.startswith("c") else cat_num_second_raw
         category:str = cat_num_first + "/" + cat_num_second
 
         return category
diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml
index ca76bde..f4fefb2 100644
--- a/src/kleinanzeigen_bot/resources/translations.de.yaml
+++ b/src/kleinanzeigen_bot/resources/translations.de.yaml
@@ -216,6 +216,10 @@ kleinanzeigen_bot/extract.py:
   _extract_contact_from_ad_page:
     "No street given in the contact.": "Keine Straße in den Kontaktdaten angegeben."
 
+  _extract_category_from_ad_page:
+    "Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s": "Breadcrumb-Container 'vap-brdcrmb' nicht gefunden; kann Anzeigenkategorie nicht extrahieren: %s"
+    "Falling back to legacy breadcrumb selectors; collected ids: %s": "Weiche auf ältere Breadcrumb-Selektoren aus; gesammelte IDs: %s"
+
 #################################################
 kleinanzeigen_bot/utils/i18n.py:
 #################################################
diff --git a/tests/unit/test_extract.py b/tests/unit/test_extract.py
index 9f10c95..8244348 100644
--- a/tests/unit/test_extract.py
+++ b/tests/unit/test_extract.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
 import json, os  # isort: skip
+from gettext import gettext as _
 from typing import Any, TypedDict
 from unittest.mock import AsyncMock, MagicMock, call, patch
 
@@ -435,7 +436,7 @@ class TestAdExtractorContent:
         page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
         test_extractor.page = page_mock
 
-        for config, raw_description, _ in description_test_cases:  # Changed to _ since we don't use expected_description
+        for config, raw_description, _expected_description in description_test_cases:
             test_extractor.config = test_bot_config.with_values(config)
 
             with patch.multiple(test_extractor,
@@ -584,32 +585,47 @@ class TestAdExtractorCategory:
         second_part = MagicMock()
         second_part.attrs = {"href": "/s-spielzeug/c23"}
 
-        with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find:
-            mock_web_find.side_effect = [
-                category_line,
-                first_part,
-                second_part
-            ]
+        with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [category_line]) as mock_web_find, \
+                patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_part, second_part]) as mock_web_find_all:
 
             result = await extractor._extract_category_from_ad_page()
             assert result == "17/23"
 
-            mock_web_find.assert_any_call(By.ID, "vap-brdcrmb")
-            mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
-            mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
+            mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb")
+            mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line)
 
     @pytest.mark.asyncio
     # pylint: disable=protected-access
-    async def test_extract_category_with_non_string_href(self, extractor:AdExtractor) -> None:
-        """Test category extraction with non-string href attributes to cover str() conversion."""
+    async def test_extract_category_single_identifier(self, extractor:AdExtractor) -> None:
+        """Test category extraction when only a single breadcrumb code exists."""
         category_line = MagicMock()
         first_part = MagicMock()
-        # Use non-string href to test str() conversion
-        first_part.attrs = {"href": 12345}  # This will need str() conversion
+        first_part.attrs = {"href": "/s-kleidung/c42"}
+
+        with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [category_line]) as mock_web_find, \
+                patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_part]) as mock_web_find_all:
+
+            result = await extractor._extract_category_from_ad_page()
+            assert result == "42/42"
+
+            mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb")
+            mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line)
+
+    @pytest.mark.asyncio
+    # pylint: disable=protected-access
+    async def test_extract_category_fallback_to_legacy_selectors(self, extractor:AdExtractor, caplog:pytest.LogCaptureFixture) -> None:
+        """Test category extraction when breadcrumb links are not available and legacy selectors are used."""
+        category_line = MagicMock()
+        first_part = MagicMock()
+        first_part.attrs = {"href": 12345}  # Ensure str() conversion happens
         second_part = MagicMock()
         second_part.attrs = {"href": 67890}  # This will need str() conversion
 
-        with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find:
+        caplog.set_level("DEBUG")
+        expected_message = _("Falling back to legacy breadcrumb selectors; collected ids: %s") % []
+        with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find, \
+                patch.object(extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError) as mock_web_find_all:
+
             mock_web_find.side_effect = [
                 category_line,
                 first_part,
@@ -617,11 +633,13 @@ class TestAdExtractorCategory:
             ]
 
             result = await extractor._extract_category_from_ad_page()
-            assert result == "2345/7890"  # After str() conversion and slicing
+            assert result == "12345/67890"
+            assert sum(1 for record in caplog.records if record.message == expected_message) == 1
 
             mock_web_find.assert_any_call(By.ID, "vap-brdcrmb")
             mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
             mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
+            mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line)
 
     @pytest.mark.asyncio
     # pylint: disable=protected-access