fix: harden category extraction breadcrumb parsing (#668)

## ℹ️ Description
- Link to the related issue(s): Issue #667
- Harden breadcrumb category extraction so downloads no longer fail when
the breadcrumb structure changes.

## 📋 Changes Summary
- Parse breadcrumb anchors dynamically and fall back with debug logging
when legacy selectors are needed.
- Added unit coverage for multi-anchor, single-anchor, and fallback
scenarios to keep diff coverage above 80%.
- Documented required lint/format/test steps in PR checklist; no new
dependencies.

### ⚙️ Type of Change
- [x] 🐞 Bug fix (non-breaking change which fixes an issue)
- [ ] ✨ New feature (adds new functionality without breaking existing
usage)
- [ ] 💥 Breaking change (changes that might break existing user setups,
scripts, or configurations)

## ✅ Checklist
- [x] I have reviewed my changes to ensure they meet the project's
standards.
- [x] I have tested my changes and ensured that all tests pass (`pdm run
test`).
- [x] I have formatted the code (`pdm run format`).
- [x] I have verified that linting passes (`pdm run lint`).
- [x] I have updated documentation where necessary.

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution under the terms of your
choice.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **Bug Fixes**
  * Improved category extraction accuracy with enhanced breadcrumb parsing.
  * Better handling for listings with a single breadcrumb (returns a stable category identifier).
  * More resilient fallback when breadcrumb data is missing or malformed.
  * Safer normalization of category identifiers to avoid incorrect parsing across site variations.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Jens
2025-10-28 15:10:01 +01:00
committed by GitHub
parent 9c73696b29
commit e76abc66e8
3 changed files with 73 additions and 20 deletions

View File

@@ -1,7 +1,9 @@
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import json, mimetypes, os, shutil # isort: skip
from gettext import gettext as _
import json, mimetypes, os, re, shutil # isort: skip
import urllib.request as urllib_request
from datetime import datetime
from typing import Any, Final
@@ -19,6 +21,9 @@ __all__ = [
LOG:Final[loggers.Logger] = loggers.get_logger(__name__)
_BREADCRUMB_MIN_DEPTH:Final[int] = 2
BREADCRUMB_RE = re.compile(r"/c(\d+)")
class AdExtractor(WebScrapingMixin):
"""
@@ -402,13 +407,39 @@ class AdExtractor(WebScrapingMixin):
:return: a category string of form abc/def, where a-f are digits
"""
category_line = await self.web_find(By.ID, "vap-brdcrmb")
try:
category_line = await self.web_find(By.ID, "vap-brdcrmb")
except TimeoutError as exc:
LOG.warning("Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s", exc)
raise
try:
breadcrumb_links = await self.web_find_all(By.CSS_SELECTOR, "a", parent = category_line)
except TimeoutError:
breadcrumb_links = []
category_ids:list[str] = []
for link in breadcrumb_links:
href = str(link.attrs.get("href", "") or "")
matches = BREADCRUMB_RE.findall(href)
if matches:
category_ids.extend(matches)
# Use the deepest two breadcrumb category codes when available.
if len(category_ids) >= _BREADCRUMB_MIN_DEPTH:
return f"{category_ids[-2]}/{category_ids[-1]}"
if len(category_ids) == 1:
return f"{category_ids[0]}/{category_ids[0]}"
# Fallback to legacy selectors in case the breadcrumb structure is unexpected.
LOG.debug(_("Falling back to legacy breadcrumb selectors; collected ids: %s"), category_ids)
category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
href_first:str = str(category_first_part.attrs["href"])
href_second:str = str(category_second_part.attrs["href"])
cat_num_first = href_first.rsplit("/", maxsplit = 1)[-1][1:]
cat_num_second = href_second.rsplit("/", maxsplit = 1)[-1][1:]
cat_num_first_raw = href_first.rsplit("/", maxsplit = 1)[-1]
cat_num_second_raw = href_second.rsplit("/", maxsplit = 1)[-1]
cat_num_first = cat_num_first_raw[1:] if cat_num_first_raw.startswith("c") else cat_num_first_raw
cat_num_second = cat_num_second_raw[1:] if cat_num_second_raw.startswith("c") else cat_num_second_raw
category:str = cat_num_first + "/" + cat_num_second
return category

View File

@@ -216,6 +216,10 @@ kleinanzeigen_bot/extract.py:
_extract_contact_from_ad_page:
"No street given in the contact.": "Keine Straße in den Kontaktdaten angegeben."
_extract_category_from_ad_page:
"Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s": "Breadcrumb-Container 'vap-brdcrmb' nicht gefunden; kann Anzeigenkategorie nicht extrahieren: %s"
"Falling back to legacy breadcrumb selectors; collected ids: %s": "Weiche auf ältere Breadcrumb-Selektoren aus; gesammelte IDs: %s"
#################################################
kleinanzeigen_bot/utils/i18n.py:
#################################################