fix: JSON API Pagination for >25 Ads (#797)

## ℹ️ Description

*Provide a concise summary of the changes introduced in this pull request.*

- Link to the related issue(s): Closes #789 (completes the fix started in #793)
- **Motivation**: Fix JSON API pagination for accounts with >25 ads. Aligns pagination logic with weidi’s approach (starts at page 1), while hardening error handling and tests. Based on https://github.com/weidi/kleinanzeigen-bot/pull/1.

## 📋 Changes Summary

- Added pagination helper to fetch all published ads and used it in delete/extend/publish/update flows
- Added robust handling for malformed JSON payloads and unexpected `ads` types (with translated warnings)
- Improved sell_directly extraction with pagination, bounds checks, and shared coercion helper
- Added/updated tests for pagination and edge cases; updated assertions to pytest.fail style

### ⚙️ Type of Change

Select the type(s) of change(s) included in this pull request:

- [x] 🐞 Bug fix (non-breaking change which fixes an issue)
- [ ] ✨ New feature (adds new functionality without breaking existing usage)
- [ ] 💥 Breaking change (changes that might break existing user setups, scripts, or configurations)

## ✅ Checklist

Before requesting a review, confirm the following:

- [x] I have reviewed my changes to ensure they meet the project's standards.
- [x] I have tested my changes and ensured that all tests pass (`pdm run test:cov:unified`).
- [x] I have formatted the code (`pdm run format`).
- [x] I have verified that linting passes (`pdm run lint`).
- [x] I have updated documentation where necessary.

By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.

## Summary by CodeRabbit

* **New Features**
  * Reliable multi-page fetching for published ads and buy-now eligibility checks.
* **Bug Fixes**
  * Safer pagination with per-page JSON handling, limits and improved termination diagnostics; ensures pageNum is used when needed.
* **Tests**
  * New comprehensive pagination tests and updates to existing tests to reflect multi-page behavior.
* **Chores**
  * Added a utility to safely coerce page numbers; minor utility signature cleanup.

<sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub>
2026-03-12 18:41:50 +01:00 · 2026-01-31 22:17:37 +01:00
parent 51a8042cda
commit 96f465d5bc
7 changed files with 651 additions and 118 deletions
--- a/src/kleinanzeigen_bot/init.py
+++ b/src/kleinanzeigen_bot/init.py
@@ -1047,10 +1047,97 @@ class KleinanzeigenBot(WebScrapingMixin):  # noqa: PLR0904
        LOG.debug("No login detected - DOM elements not found and server probe returned %s", state.name)
        return False

+    async def _fetch_published_ads(self) -> list[dict[str, Any]]:
+        """Fetch all published ads, following the pagination of the management JSON API.
+
+        Requests start at ``pageNum=1`` and continue with the ``next`` value reported in each
+        response's ``paging`` dict until ``pageNum`` reaches ``last``.
+
+        Fetching is best-effort: a timeout, an empty/unparsable/unexpected payload or
+        inconsistent paging info ends the loop with a log message, and the ads collected
+        so far are returned instead of raising.
+
+        Returns:
+            List of all published ads collected across the fetched pages.
+        """
+        ads:list[dict[str, Any]] = []
+        page = 1
+        MAX_PAGE_LIMIT:Final[int] = 100
+        SNIPPET_LIMIT:Final[int] = 500
+
+        def _snippet(text:str) -> str:
+            # Bound the size of payload excerpts echoed into the log
+            return text[:SNIPPET_LIMIT] + ("..." if len(text) > SNIPPET_LIMIT else "")
+
+        while True:
+            # Safety check: don't paginate beyond reasonable limit
+            if page > MAX_PAGE_LIMIT:
+                LOG.warning("Stopping pagination after %s pages to avoid infinite loop", MAX_PAGE_LIMIT)
+                break
+
+            try:
+                response = await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT&pageNum={page}")
+            except TimeoutError as ex:
+                LOG.warning("Pagination request timed out on page %s: %s", page, ex)
+                break
+
+            # A missing or null "content" is handled like an empty body instead of crashing json.loads()
+            content = response.get("content") or ""
+            try:
+                json_data = json.loads(content)
+            except (TypeError, json.JSONDecodeError) as ex:
+                # TypeError: json.loads() was handed something that is not str/bytes/bytearray
+                if not content:
+                    LOG.warning("Empty JSON response content on page %s", page)
+                    break
+                LOG.warning("Failed to parse JSON response on page %s: %s (content: %s)", page, ex, _snippet(str(content)))
+                break
+
+            if not isinstance(json_data, dict):
+                LOG.warning("Unexpected JSON payload on page %s (content: %s)", page, _snippet(str(content)))
+                break
+
+            page_ads = json_data.get("ads", [])
+            if not isinstance(page_ads, list):
+                LOG.warning("Unexpected 'ads' type on page %s: %s value: %s", page, type(page_ads).__name__, _snippet(str(page_ads)))
+                break
+
+            ads.extend(page_ads)
+
+            paging = json_data.get("paging")
+            if not isinstance(paging, dict):
+                LOG.debug("No paging dict found on page %s, assuming single page", page)
+                break
+
+            # Use only real API fields (confirmed from production data)
+            current_page_num = misc.coerce_page_number(paging.get("pageNum"))
+            total_pages = misc.coerce_page_number(paging.get("last"))
+
+            if current_page_num is None:
+                LOG.warning("Invalid 'pageNum' in paging info: %s, stopping pagination", paging.get("pageNum"))
+                break
+
+            if total_pages is None:
+                LOG.debug("No pagination info found, assuming single page")
+                break
+
+            # Stop if reached last page
+            if current_page_num >= total_pages:
+                LOG.info("Reached last page %s of %s, stopping pagination", current_page_num, total_pages)
+                break
+
+            # Safety: stop if no ads returned
+            if not page_ads:
+                LOG.info("No ads found on page %s, stopping pagination", page)
+                break
+
+            LOG.debug("Page %s: fetched %s ads (numFound=%s)", page, len(page_ads), paging.get("numFound"))
+
+            # Use API's next field for navigation (more robust than our counter), but only if it
+            # moves forward: a 'next' that repeats or goes backwards would re-request pages forever
+            # (the page-number limit above only helps while the number grows) and duplicate ads.
+            next_page = misc.coerce_page_number(paging.get("next"))
+            if next_page is None or next_page <= page:
+                LOG.warning("Invalid 'next' page value in paging info: %s, stopping pagination", paging.get("next"))
+                break
+            page = next_page
+
+        return ads
+
    async def delete_ads(self, ad_cfgs:list[tuple[str, Ad, dict[str, Any]]]) -> None:
        count = 0

-        published_ads = json.loads((await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT"))["content"])["ads"]
+        published_ads = await self._fetch_published_ads()

        for ad_file, ad_cfg, _ad_cfg_orig in ad_cfgs:
            count += 1
@@ -1094,7 +1181,7 @@ class KleinanzeigenBot(WebScrapingMixin):  # noqa: PLR0904
    async def extend_ads(self, ad_cfgs:list[tuple[str, Ad, dict[str, Any]]]) -> None:
        """Extends ads that are close to expiry."""
        # Fetch currently published ads from API
-        published_ads = json.loads((await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT"))["content"])["ads"]
+        published_ads = await self._fetch_published_ads()

        # Filter ads that need extension
        ads_to_extend = []
@@ -1213,7 +1300,7 @@ class KleinanzeigenBot(WebScrapingMixin):  # noqa: PLR0904
        failed_count = 0
        max_retries = 3

-        published_ads = json.loads((await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT"))["content"])["ads"]
+        published_ads = await self._fetch_published_ads()

        for ad_file, ad_cfg, ad_cfg_orig in ad_cfgs:
            LOG.info("Processing %s/%s: '%s' from [%s]...", count + 1, len(ad_cfgs), ad_cfg.title, ad_file)
@@ -1561,12 +1648,13 @@ class KleinanzeigenBot(WebScrapingMixin):  # noqa: PLR0904
        """
        count = 0

-        published_ads = json.loads((await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT"))["content"])["ads"]
+        published_ads = await self._fetch_published_ads()

        for ad_file, ad_cfg, ad_cfg_orig in ad_cfgs:
            ad = next((ad for ad in published_ads if ad["id"] == ad_cfg.id), None)

            if not ad:
+                LOG.warning(" -> SKIPPED: ad '%s' (ID: %s) not found in published ads", ad_cfg.title, ad_cfg.id)
                continue

            LOG.info("Processing %s/%s: '%s' from [%s]...", count + 1, len(ad_cfgs), ad_cfg.title, ad_file)
--- a/src/kleinanzeigen_bot/extract.py
+++ b/src/kleinanzeigen_bot/extract.py
@@ -25,6 +25,7 @@ __all__ = [
 LOG:Final[loggers.Logger] = loggers.get_logger(__name__)

 _BREADCRUMB_MIN_DEPTH:Final[int] = 2
+_SELL_DIRECTLY_MAX_PAGE_LIMIT:Final[int] = 100
 BREADCRUMB_RE = re.compile(r"/c(\d+)")


@@ -525,19 +526,56 @@ class AdExtractor(WebScrapingMixin):
                LOG.warning("Could not extract ad ID from URL: %s", self.page.url)
                return None

-            # Fetch the management JSON data using web_request
-            response = await self.web_request("https://www.kleinanzeigen.de/m-meine-anzeigen-verwalten.json")
-            json_data = json.loads(response["content"])
+            # Fetch the management JSON data using web_request with pagination support
+            page = 1

-            # Find the current ad in the ads list
-            if isinstance(json_data, dict) and "ads" in json_data:
-                ads_list = json_data["ads"]
-                if isinstance(ads_list, list):
-                    # Filter ads to find the current ad by ID
-                    current_ad = next((ad for ad in ads_list if ad.get("id") == current_ad_id), None)
-                    if current_ad and "buyNowEligible" in current_ad:
-                        buy_now_eligible = current_ad["buyNowEligible"]
-                        return buy_now_eligible if isinstance(buy_now_eligible, bool) else None
+            while True:
+                # Safety check: don't paginate beyond reasonable limit
+                if page > _SELL_DIRECTLY_MAX_PAGE_LIMIT:
+                    LOG.warning("Stopping pagination after %s pages to avoid infinite loop", _SELL_DIRECTLY_MAX_PAGE_LIMIT)
+                    break
+
+                response = await self.web_request(f"https://www.kleinanzeigen.de/m-meine-anzeigen-verwalten.json?sort=DEFAULT&pageNum={page}")
+
+                try:
+                    json_data = json.loads(response["content"])
+                except json.JSONDecodeError as ex:
+                    LOG.debug("Failed to parse JSON response on page %s: %s", page, ex)
+                    break
+
+                # Find the current ad in the ads list
+                if isinstance(json_data, dict) and "ads" in json_data:
+                    ads_list = json_data["ads"]
+                    if isinstance(ads_list, list):
+                        # Filter ads to find the current ad by ID
+                        current_ad = next((ad for ad in ads_list if ad.get("id") == current_ad_id), None)
+                        if current_ad and "buyNowEligible" in current_ad:
+                            buy_now_eligible = current_ad["buyNowEligible"]
+                            return buy_now_eligible if isinstance(buy_now_eligible, bool) else None
+
+                # Check if we need to fetch more pages
+                paging = json_data.get("paging") if isinstance(json_data, dict) else None
+                if not isinstance(paging, dict):
+                    break
+
+                # Parse pagination info using real API fields
+                current_page_num = misc.coerce_page_number(paging.get("pageNum"))
+                total_pages = misc.coerce_page_number(paging.get("last"))
+
+                if current_page_num is None:
+                    LOG.warning("Invalid 'pageNum' in paging info: %s, stopping pagination", paging.get("pageNum"))
+                    break
+
+                # Stop if we've reached the last page
+                if total_pages is None or current_page_num >= total_pages:
+                    break
+
+                # Use API's next field for navigation (more robust than our counter)
+                next_page = misc.coerce_page_number(paging.get("next"))
+                if next_page is None:
+                    LOG.warning("Invalid 'next' page value in paging info: %s, stopping pagination", paging.get("next"))
+                    break
+                page = next_page

            # If the key doesn't exist or ad not found, return None (unknown)
            return None
--- a/src/kleinanzeigen_bot/resources/translations.de.yaml
+++ b/src/kleinanzeigen_bot/resources/translations.de.yaml
@@ -31,6 +31,18 @@ kleinanzeigen_bot/__init__.py:
    "App version: %s": "App Version: %s"
    "Python version: %s": "Python Version: %s"

+  _fetch_published_ads:
+    "Empty JSON response content on page %s": "Leerer JSON-Antwortinhalt auf Seite %s"
+    "Failed to parse JSON response on page %s: %s (content: %s)": "Fehler beim Parsen der JSON-Antwort auf Seite %s: %s (Inhalt: %s)"
+    "Stopping pagination after %s pages to avoid infinite loop": "Stoppe die Paginierung nach %s Seiten, um eine Endlosschleife zu vermeiden"
+    "Pagination request timed out on page %s: %s": "Zeitüberschreitung bei der Seitenabfrage auf Seite %s: %s"
+    "Unexpected JSON payload on page %s (content: %s)": "Unerwartete JSON-Antwort auf Seite %s (Inhalt: %s)"
+    "Unexpected 'ads' type on page %s: %s value: %s": "Unerwarteter 'ads'-Typ auf Seite %s: %s Wert: %s"
+    "Reached last page %s of %s, stopping pagination": "Letzte Seite %s von %s erreicht, beende Paginierung"
+    "No ads found on page %s, stopping pagination": "Keine Anzeigen auf Seite %s gefunden, beende Paginierung"
+    "Invalid 'next' page value in paging info: %s, stopping pagination": "Ungültiger 'next'-Seitenwert in Paginierungsinfo: %s, beende Paginierung"
+    "Invalid 'pageNum' in paging info: %s, stopping pagination": "Ungültiger 'pageNum'-Wert in Paginierungsinfo: %s, beende Paginierung"
+
  __check_ad_changed:
    "Hash comparison for [%s]:": "Hash-Vergleich für [%s]:"
    "    Stored hash: %s": "    Gespeicherter Hash: %s"
@@ -162,6 +174,7 @@ kleinanzeigen_bot/__init__.py:
  update_ads:
    "Processing %s/%s: '%s' from [%s]...": "Verarbeite %s/%s: '%s' von [%s]..."
    "Skipping because ad is reserved": "Überspringen, da Anzeige reserviert ist"
+    " -> SKIPPED: ad '%s' (ID: %s) not found in published ads": " -> ÜBERSPRUNGEN: Anzeige '%s' (ID: %s) nicht in veröffentlichten Anzeigen gefunden"
    "DONE: updated %s": "FERTIG: %s aktualisiert"
    "ad": "Anzeige"

@@ -299,6 +312,9 @@ kleinanzeigen_bot/extract.py:

  _extract_sell_directly_from_ad_page:
    "Could not extract ad ID from URL: %s": "Konnte Anzeigen-ID nicht aus der URL extrahieren: %s"
+    "Stopping pagination after %s pages to avoid infinite loop": "Stoppe die Paginierung nach %s Seiten, um eine Endlosschleife zu vermeiden"
+    "Invalid 'next' page value in paging info: %s, stopping pagination": "Ungültiger 'next'-Seitenwert in Paginierungsinfo: %s, beende Paginierung"
+    "Invalid 'pageNum' in paging info: %s, stopping pagination": "Ungültiger 'pageNum'-Wert in Paginierungsinfo: %s, beende Paginierung"

 #################################################
 kleinanzeigen_bot/utils/i18n.py:
--- a/src/kleinanzeigen_bot/utils/misc.py
+++ b/src/kleinanzeigen_bot/utils/misc.py
@@ -16,12 +16,55 @@ from . import i18n
 T = TypeVar("T")


+def coerce_page_number(value:Any) -> int | None:
+    """Convert a loosely typed page number into an ``int``, or ``None`` if that is not possible.
+
+    Booleans and ``None`` are never treated as numbers. Floats are only accepted
+    when they represent a whole number. Everything else is handed to ``int()``.
+
+    Args:
+        value: Candidate page number of arbitrary type (int, str, float, ...)
+
+    Returns:
+        The page number as int, or None when the value cannot be coerced safely
+
+    Examples:
+        >>> coerce_page_number(1)
+        1
+        >>> coerce_page_number("2")
+        2
+        >>> coerce_page_number(3.0)
+        3
+        >>> coerce_page_number(3.5) is None
+        True
+        >>> coerce_page_number(True) is None  # Not 1!
+        True
+        >>> coerce_page_number(None) is None
+        True
+        >>> coerce_page_number("invalid") is None
+        True
+        >>> coerce_page_number([1, 2, 3]) is None
+        True
+    """
+    # bool is a subclass of int, so it has to be ruled out before int() would accept it
+    if value is None or isinstance(value, bool):
+        return None
+
+    if isinstance(value, float):
+        # is_integer() is False for fractional values as well as for inf/nan
+        return int(value) if value.is_integer() else None
+
+    try:
+        number = int(value)
+    except (TypeError, ValueError):
+        return None
+    return number
+
+
 def ensure(
-        condition:Any | bool | Callable[[], bool],  # noqa: FBT001 Boolean-typed positional argument in function definition
-        error_message:str,
-        timeout:float = 5,
-        poll_frequency:float = 0.5
-    ) -> None:
+    condition:Any | bool | Callable[[], bool],  # noqa: FBT001 Boolean-typed positional argument in function definition
+    error_message:str,
+    timeout:float = 5,
+    poll_frequency:float = 0.5,
+) -> None:
    """
    Ensure a condition is true, retrying until timeout.

@@ -152,12 +195,7 @@ def parse_decimal(number:float | int | str) -> decimal.Decimal:
            raise decimal.DecimalException(f"Invalid number format: {number}") from ex


-def parse_datetime(
-    date:datetime | str | None,
-    *,
-    add_timezone_if_missing:bool = True,
-    use_local_timezone:bool = True
-) -> datetime | None:
+def parse_datetime(date:datetime | str | None, *, add_timezone_if_missing:bool = True, use_local_timezone:bool = True) -> datetime | None:
    """
    Parses a datetime object or ISO-formatted string.

@@ -184,10 +222,7 @@ def parse_datetime(
    dt = date if isinstance(date, datetime) else datetime.fromisoformat(date)

    if dt.tzinfo is None and add_timezone_if_missing:
-        dt = (
-            dt.astimezone() if use_local_timezone
-            else dt.replace(tzinfo = timezone.utc)
-        )
+        dt = dt.astimezone() if use_local_timezone else dt.replace(tzinfo = timezone.utc)

    return dt