From e76abc66e89eb20aad239988a0dfcb5c32bfcf63 Mon Sep 17 00:00:00 2001 From: Jens <1742418+1cu@users.noreply.github.com> Date: Tue, 28 Oct 2025 15:10:01 +0100 Subject: [PATCH] fix: harden category extraction breadcrumb parsing (#668) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## ℹ️ Description - Link to the related issue(s): Issue #667 - Harden breadcrumb category extraction so downloads no longer fail when the breadcrumb structure changes. ## 📋 Changes Summary - Parse breadcrumb anchors dynamically and fall back with debug logging when legacy selectors are needed. - Added unit coverage for multi-anchor, single-anchor, and fallback scenarios to keep diff coverage above 80%. - Documented required lint/format/test steps in PR checklist; no new dependencies. ### ⚙️ Type of Change - [x] 🐞 Bug fix (non-breaking change which fixes an issue) - [ ] ✨ New feature (adds new functionality without breaking existing usage) - [ ] 💥 Breaking change (changes that might break existing user setups, scripts, or configurations) ## ✅ Checklist - [x] I have reviewed my changes to ensure they meet the project's standards. - [x] I have tested my changes and ensured that all tests pass (`pdm run test`). - [x] I have formatted the code (`pdm run format`). - [x] I have verified that linting passes (`pdm run lint`). - [x] I have updated documentation where necessary. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. ## Summary by CodeRabbit * **Bug Fixes** * Improved category extraction accuracy with enhanced breadcrumb parsing. * Better handling for listings with a single breadcrumb (returns stable category identifier). * More resilient fallback when breadcrumb data is missing or malformed. * Safer normalization of category identifiers to avoid incorrect parsing across site variations. --- src/kleinanzeigen_bot/extract.py | 39 +++++++++++++-- .../resources/translations.de.yaml | 4 ++ tests/unit/test_extract.py | 50 +++++++++++++------ 3 files changed, 73 insertions(+), 20 deletions(-) diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py index 0afe96e..f59c105 100644 --- a/src/kleinanzeigen_bot/extract.py +++ b/src/kleinanzeigen_bot/extract.py @@ -1,7 +1,9 @@ # SPDX-FileCopyrightText: © Sebastian Thomschke and contributors # SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/ -import json, mimetypes, os, shutil # isort: skip +from gettext import gettext as _ + +import json, mimetypes, os, re, shutil # isort: skip import urllib.request as urllib_request from datetime import datetime from typing import Any, Final @@ -19,6 +21,9 @@ __all__ = [ LOG:Final[loggers.Logger] = loggers.get_logger(__name__) +_BREADCRUMB_MIN_DEPTH:Final[int] = 2 +BREADCRUMB_RE = re.compile(r"/c(\d+)") + class AdExtractor(WebScrapingMixin): """ @@ -402,13 +407,39 @@ class AdExtractor(WebScrapingMixin): :return: a category string of form abc/def, where a-f are digits """ - category_line = await self.web_find(By.ID, "vap-brdcrmb") + try: + category_line = await self.web_find(By.ID, "vap-brdcrmb") + except TimeoutError as exc: + LOG.warning("Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s", exc) + raise + try: + breadcrumb_links = await self.web_find_all(By.CSS_SELECTOR, "a", parent = category_line) + except TimeoutError: + breadcrumb_links = [] + + category_ids:list[str] = [] + for link in breadcrumb_links: + href = str(link.attrs.get("href", "") or "") + matches = BREADCRUMB_RE.findall(href) + if matches: + category_ids.extend(matches) + + # Use the deepest two breadcrumb category codes when available. + if len(category_ids) >= _BREADCRUMB_MIN_DEPTH: + return f"{category_ids[-2]}/{category_ids[-1]}" + if len(category_ids) == 1: + return f"{category_ids[0]}/{category_ids[0]}" + + # Fallback to legacy selectors in case the breadcrumb structure is unexpected. + LOG.debug(_("Falling back to legacy breadcrumb selectors; collected ids: %s"), category_ids) category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line) category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line) href_first:str = str(category_first_part.attrs["href"]) href_second:str = str(category_second_part.attrs["href"]) - cat_num_first = href_first.rsplit("/", maxsplit = 1)[-1][1:] - cat_num_second = href_second.rsplit("/", maxsplit = 1)[-1][1:] + cat_num_first_raw = href_first.rsplit("/", maxsplit = 1)[-1] + cat_num_second_raw = href_second.rsplit("/", maxsplit = 1)[-1] + cat_num_first = cat_num_first_raw[1:] if cat_num_first_raw.startswith("c") else cat_num_first_raw + cat_num_second = cat_num_second_raw[1:] if cat_num_second_raw.startswith("c") else cat_num_second_raw category:str = cat_num_first + "/" + cat_num_second return category diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml index ca76bde..f4fefb2 100644 --- a/src/kleinanzeigen_bot/resources/translations.de.yaml +++ b/src/kleinanzeigen_bot/resources/translations.de.yaml @@ -216,6 +216,10 @@ kleinanzeigen_bot/extract.py: _extract_contact_from_ad_page: "No street given in the contact.": "Keine Straße in den Kontaktdaten angegeben." + _extract_category_from_ad_page: + "Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s": "Breadcrumb-Container 'vap-brdcrmb' nicht gefunden; kann Anzeigenkategorie nicht extrahieren: %s" + "Falling back to legacy breadcrumb selectors; collected ids: %s": "Weiche auf ältere Breadcrumb-Selektoren aus; gesammelte IDs: %s" + ################################################# kleinanzeigen_bot/utils/i18n.py: ################################################# diff --git a/tests/unit/test_extract.py b/tests/unit/test_extract.py index 9f10c95..8244348 100644 --- a/tests/unit/test_extract.py +++ b/tests/unit/test_extract.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/ import json, os # isort: skip +from gettext import gettext as _ from typing import Any, TypedDict from unittest.mock import AsyncMock, MagicMock, call, patch @@ -435,7 +436,7 @@ class TestAdExtractorContent: page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345" test_extractor.page = page_mock - for config, raw_description, _ in description_test_cases: # Changed to _ since we don't use expected_description + for config, raw_description, _expected_description in description_test_cases: test_extractor.config = test_bot_config.with_values(config) with patch.multiple(test_extractor, @@ -584,32 +585,47 @@ class TestAdExtractorCategory: second_part = MagicMock() second_part.attrs = {"href": "/s-spielzeug/c23"} - with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find: - mock_web_find.side_effect = [ - category_line, - first_part, - second_part - ] + with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [category_line]) as mock_web_find, \ + patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_part, second_part]) as mock_web_find_all: result = await extractor._extract_category_from_ad_page() assert result == "17/23" - mock_web_find.assert_any_call(By.ID, "vap-brdcrmb") - mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line) - mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line) + mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb") + mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line) @pytest.mark.asyncio # pylint: disable=protected-access - async def test_extract_category_with_non_string_href(self, extractor:AdExtractor) -> None: - """Test category extraction with non-string href attributes to cover str() conversion.""" + async def test_extract_category_single_identifier(self, extractor:AdExtractor) -> None: + """Test category extraction when only a single breadcrumb code exists.""" category_line = MagicMock() first_part = MagicMock() - # Use non-string href to test str() conversion - first_part.attrs = {"href": 12345} # This will need str() conversion + first_part.attrs = {"href": "/s-kleidung/c42"} + + with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [category_line]) as mock_web_find, \ + patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_part]) as mock_web_find_all: + + result = await extractor._extract_category_from_ad_page() + assert result == "42/42" + + mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb") + mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line) + + @pytest.mark.asyncio + # pylint: disable=protected-access + async def test_extract_category_fallback_to_legacy_selectors(self, extractor:AdExtractor, caplog:pytest.LogCaptureFixture) -> None: + """Test category extraction when breadcrumb links are not available and legacy selectors are used.""" + category_line = MagicMock() + first_part = MagicMock() + first_part.attrs = {"href": 12345} # Ensure str() conversion happens second_part = MagicMock() second_part.attrs = {"href": 67890} # This will need str() conversion - with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find: + caplog.set_level("DEBUG") + expected_message = _("Falling back to legacy breadcrumb selectors; collected ids: %s") % [] + with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find, \ + patch.object(extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError) as mock_web_find_all: + mock_web_find.side_effect = [ category_line, first_part, @@ -617,11 +633,13 @@ class TestAdExtractorCategory: ] result = await extractor._extract_category_from_ad_page() - assert result == "2345/7890" # After str() conversion and slicing + assert result == "12345/67890" + assert sum(1 for record in caplog.records if record.message == expected_message) == 1 mock_web_find.assert_any_call(By.ID, "vap-brdcrmb") mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line) mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line) + mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line) @pytest.mark.asyncio # pylint: disable=protected-access