fix: JSON API Pagination for >25 Ads (#797)

## ℹ️ Description
*Provide a concise summary of the changes introduced in this pull
request.*

- Link to the related issue(s): Closes #789 (completes the fix started
in #793)
- **Motivation**: Fix JSON API pagination for accounts with >25 ads.
Aligns pagination logic with weidi’s approach (starts at page 1), while
hardening error handling and tests. Based on
https://github.com/weidi/kleinanzeigen-bot/pull/1.

## 📋 Changes Summary

- Added pagination helper to fetch all published ads and use it in
delete/extend/publish/update flows
- Added robust handling for malformed JSON payloads and unexpected ads
types (with translated warnings)
- Improved sell_directly extraction with pagination, bounds checks, and
shared coercion helper
- Added/updated tests for pagination and edge cases; updated assertions
to pytest.fail style

### ⚙️ Type of Change
Select the type(s) of change(s) included in this pull request:
- [x] 🐞 Bug fix (non-breaking change which fixes an issue)
- [ ] ✨ New feature (adds new functionality without breaking existing
usage)
- [ ] 💥 Breaking change (changes that might break existing user setups,
scripts, or configurations)


## ✅ Checklist
Before requesting a review, confirm the following:
- [x] I have reviewed my changes to ensure they meet the project's
standards.
- [x] I have tested my changes and ensured that all tests pass (`pdm run
test:cov:unified`).
- [x] I have formatted the code (`pdm run format`).
- [x] I have verified that linting passes (`pdm run lint`).
- [x] I have updated documentation where necessary.

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
  * Reliable multi-page fetching for published ads and buy-now eligibility
    checks.

* **Bug Fixes**
  * Safer pagination with per-page JSON handling, limits and improved
    termination diagnostics; ensures pageNum is used when needed.

* **Tests**
  * New comprehensive pagination tests and updates to existing tests to
    reflect multi-page behavior.

* **Chores**
  * Added a utility to safely coerce page numbers; minor utility signature
    cleanup.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Jens
2026-01-31 22:17:37 +01:00
committed by GitHub
parent 51a8042cda
commit 96f465d5bc
7 changed files with 651 additions and 118 deletions

View File

@@ -1047,10 +1047,97 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
LOG.debug("No login detected - DOM elements not found and server probe returned %s", state.name)
return False
async def _fetch_published_ads(self) -> list[dict[str, Any]]:
    """Fetch all published ads, handling API pagination.

    Requests ``m-meine-anzeigen-verwalten.json`` page by page, starting at page 1
    and following the ``next`` field of each response's ``paging`` object.

    Pagination is best-effort: it stops, keeping the ads collected so far, when
    a request times out, the payload cannot be parsed or has an unexpected
    shape, a paging field is missing or invalid, a page contains no ads, the
    last page has been reached, the ``next`` value does not advance past the
    page just requested, or the page number exceeds ``MAX_PAGE_LIMIT``.

    Returns:
        List of all published ads across all pages.
    """
    ads:list[dict[str, Any]] = []
    page = 1
    MAX_PAGE_LIMIT:Final[int] = 100
    SNIPPET_LIMIT:Final[int] = 500

    def truncate(text:str) -> str:
        # Keep log lines bounded; mark truncated text with a trailing ellipsis.
        return text[:SNIPPET_LIMIT] + ("..." if len(text) > SNIPPET_LIMIT else "")

    while True:
        # Safety check: don't paginate beyond reasonable limit
        if page > MAX_PAGE_LIMIT:
            LOG.warning("Stopping pagination after %s pages to avoid infinite loop", MAX_PAGE_LIMIT)
            break

        try:
            response = await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT&pageNum={page}")
        except TimeoutError as ex:
            LOG.warning("Pagination request timed out on page %s: %s", page, ex)
            break

        content = response.get("content", "")
        try:
            json_data = json.loads(content)
        except json.JSONDecodeError as ex:
            if not content:
                LOG.warning("Empty JSON response content on page %s", page)
                break
            LOG.warning("Failed to parse JSON response on page %s: %s (content: %s)", page, ex, truncate(content))
            break

        if not isinstance(json_data, dict):
            LOG.warning("Unexpected JSON payload on page %s (content: %s)", page, truncate(content))
            break

        page_ads = json_data.get("ads", [])
        if not isinstance(page_ads, list):
            LOG.warning("Unexpected 'ads' type on page %s: %s value: %s", page, type(page_ads).__name__, truncate(str(page_ads)))
            break

        ads.extend(page_ads)

        paging = json_data.get("paging")
        if not isinstance(paging, dict):
            LOG.debug("No paging dict found on page %s, assuming single page", page)
            break

        # Use only real API fields (confirmed from production data)
        current_page_num = misc.coerce_page_number(paging.get("pageNum"))
        total_pages = misc.coerce_page_number(paging.get("last"))

        if current_page_num is None:
            LOG.warning("Invalid 'pageNum' in paging info: %s, stopping pagination", paging.get("pageNum"))
            break

        if total_pages is None:
            LOG.debug("No pagination info found, assuming single page")
            break

        # Stop if reached last page
        if current_page_num >= total_pages:
            LOG.info("Reached last page %s of %s, stopping pagination", current_page_num, total_pages)
            break

        # Safety: stop if no ads returned
        if not page_ads:
            LOG.info("No ads found on page %s, stopping pagination", page)
            break

        LOG.debug("Page %s: fetched %s ads (numFound=%s)", page, len(page_ads), paging.get("numFound"))

        # Use API's next field for navigation (more robust than our counter)
        next_page = misc.coerce_page_number(paging.get("next"))
        if next_page is None:
            LOG.warning("Invalid 'next' page value in paging info: %s, stopping pagination", paging.get("next"))
            break

        # The MAX_PAGE_LIMIT guard bounds the page *number*, not the request count.
        # A 'next' value that does not move past the page just requested would make
        # the loop fetch the same page(s) forever and duplicate their ads, so it is
        # treated as invalid.
        if next_page <= page:
            LOG.warning("Invalid 'next' page value in paging info: %s, stopping pagination", paging.get("next"))
            break

        page = next_page

    return ads
async def delete_ads(self, ad_cfgs:list[tuple[str, Ad, dict[str, Any]]]) -> None:
count = 0
published_ads = json.loads((await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT"))["content"])["ads"]
published_ads = await self._fetch_published_ads()
for ad_file, ad_cfg, _ad_cfg_orig in ad_cfgs:
count += 1
@@ -1094,7 +1181,7 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
async def extend_ads(self, ad_cfgs:list[tuple[str, Ad, dict[str, Any]]]) -> None:
"""Extends ads that are close to expiry."""
# Fetch currently published ads from API
published_ads = json.loads((await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT"))["content"])["ads"]
published_ads = await self._fetch_published_ads()
# Filter ads that need extension
ads_to_extend = []
@@ -1213,7 +1300,7 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
failed_count = 0
max_retries = 3
published_ads = json.loads((await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT"))["content"])["ads"]
published_ads = await self._fetch_published_ads()
for ad_file, ad_cfg, ad_cfg_orig in ad_cfgs:
LOG.info("Processing %s/%s: '%s' from [%s]...", count + 1, len(ad_cfgs), ad_cfg.title, ad_file)
@@ -1561,12 +1648,13 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
"""
count = 0
published_ads = json.loads((await self.web_request(f"{self.root_url}/m-meine-anzeigen-verwalten.json?sort=DEFAULT"))["content"])["ads"]
published_ads = await self._fetch_published_ads()
for ad_file, ad_cfg, ad_cfg_orig in ad_cfgs:
ad = next((ad for ad in published_ads if ad["id"] == ad_cfg.id), None)
if not ad:
LOG.warning(" -> SKIPPED: ad '%s' (ID: %s) not found in published ads", ad_cfg.title, ad_cfg.id)
continue
LOG.info("Processing %s/%s: '%s' from [%s]...", count + 1, len(ad_cfgs), ad_cfg.title, ad_file)

View File

@@ -25,6 +25,7 @@ __all__ = [
LOG:Final[loggers.Logger] = loggers.get_logger(__name__)
_BREADCRUMB_MIN_DEPTH:Final[int] = 2
_SELL_DIRECTLY_MAX_PAGE_LIMIT:Final[int] = 100
BREADCRUMB_RE = re.compile(r"/c(\d+)")
@@ -525,19 +526,56 @@ class AdExtractor(WebScrapingMixin):
LOG.warning("Could not extract ad ID from URL: %s", self.page.url)
return None
# Fetch the management JSON data using web_request
response = await self.web_request("https://www.kleinanzeigen.de/m-meine-anzeigen-verwalten.json")
json_data = json.loads(response["content"])
# Fetch the management JSON data using web_request with pagination support
page = 1
# Find the current ad in the ads list
if isinstance(json_data, dict) and "ads" in json_data:
ads_list = json_data["ads"]
if isinstance(ads_list, list):
# Filter ads to find the current ad by ID
current_ad = next((ad for ad in ads_list if ad.get("id") == current_ad_id), None)
if current_ad and "buyNowEligible" in current_ad:
buy_now_eligible = current_ad["buyNowEligible"]
return buy_now_eligible if isinstance(buy_now_eligible, bool) else None
while True:
# Safety check: don't paginate beyond reasonable limit
if page > _SELL_DIRECTLY_MAX_PAGE_LIMIT:
LOG.warning("Stopping pagination after %s pages to avoid infinite loop", _SELL_DIRECTLY_MAX_PAGE_LIMIT)
break
response = await self.web_request(f"https://www.kleinanzeigen.de/m-meine-anzeigen-verwalten.json?sort=DEFAULT&pageNum={page}")
try:
json_data = json.loads(response["content"])
except json.JSONDecodeError as ex:
LOG.debug("Failed to parse JSON response on page %s: %s", page, ex)
break
# Find the current ad in the ads list
if isinstance(json_data, dict) and "ads" in json_data:
ads_list = json_data["ads"]
if isinstance(ads_list, list):
# Filter ads to find the current ad by ID
current_ad = next((ad for ad in ads_list if ad.get("id") == current_ad_id), None)
if current_ad and "buyNowEligible" in current_ad:
buy_now_eligible = current_ad["buyNowEligible"]
return buy_now_eligible if isinstance(buy_now_eligible, bool) else None
# Check if we need to fetch more pages
paging = json_data.get("paging") if isinstance(json_data, dict) else None
if not isinstance(paging, dict):
break
# Parse pagination info using real API fields
current_page_num = misc.coerce_page_number(paging.get("pageNum"))
total_pages = misc.coerce_page_number(paging.get("last"))
if current_page_num is None:
LOG.warning("Invalid 'pageNum' in paging info: %s, stopping pagination", paging.get("pageNum"))
break
# Stop if we've reached the last page
if total_pages is None or current_page_num >= total_pages:
break
# Use API's next field for navigation (more robust than our counter)
next_page = misc.coerce_page_number(paging.get("next"))
if next_page is None:
LOG.warning("Invalid 'next' page value in paging info: %s, stopping pagination", paging.get("next"))
break
page = next_page
# If the key doesn't exist or ad not found, return None (unknown)
return None

View File

@@ -31,6 +31,18 @@ kleinanzeigen_bot/__init__.py:
"App version: %s": "App Version: %s"
"Python version: %s": "Python Version: %s"
_fetch_published_ads:
"Empty JSON response content on page %s": "Leerer JSON-Antwortinhalt auf Seite %s"
"Failed to parse JSON response on page %s: %s (content: %s)": "Fehler beim Parsen der JSON-Antwort auf Seite %s: %s (Inhalt: %s)"
"Stopping pagination after %s pages to avoid infinite loop": "Stoppe die Seitenaufschaltung nach %s Seiten, um eine Endlosschleife zu vermeiden"
"Pagination request timed out on page %s: %s": "Zeitueberschreitung bei der Seitenabfrage auf Seite %s: %s"
"Unexpected JSON payload on page %s (content: %s)": "Unerwartete JSON-Antwort auf Seite %s (Inhalt: %s)"
"Unexpected 'ads' type on page %s: %s value: %s": "Unerwarteter 'ads'-Typ auf Seite %s: %s Wert: %s"
"Reached last page %s of %s, stopping pagination": "Letzte Seite %s von %s erreicht, beende Paginierung"
"No ads found on page %s, stopping pagination": "Keine Anzeigen auf Seite %s gefunden, beende Paginierung"
"Invalid 'next' page value in paging info: %s, stopping pagination": "Ungültiger 'next'-Seitenwert in Paginierungsinfo: %s, beende Paginierung"
"Invalid 'pageNum' in paging info: %s, stopping pagination": "Ungültiger 'pageNum'-Wert in Paginierungsinfo: %s, beende Paginierung"
__check_ad_changed:
"Hash comparison for [%s]:": "Hash-Vergleich für [%s]:"
" Stored hash: %s": " Gespeicherter Hash: %s"
@@ -162,6 +174,7 @@ kleinanzeigen_bot/__init__.py:
update_ads:
"Processing %s/%s: '%s' from [%s]...": "Verarbeite %s/%s: '%s' von [%s]..."
"Skipping because ad is reserved": "Überspringen, da Anzeige reserviert ist"
" -> SKIPPED: ad '%s' (ID: %s) not found in published ads": " -> ÜBERSPRUNGEN: Anzeige '%s' (ID: %s) nicht in veröffentlichten Anzeigen gefunden"
"DONE: updated %s": "FERTIG: %s aktualisiert"
"ad": "Anzeige"
@@ -299,6 +312,9 @@ kleinanzeigen_bot/extract.py:
_extract_sell_directly_from_ad_page:
"Could not extract ad ID from URL: %s": "Konnte Anzeigen-ID nicht aus der URL extrahieren: %s"
"Stopping pagination after %s pages to avoid infinite loop": "Stoppe die Seitenaufschaltung nach %s Seiten, um eine Endlosschleife zu vermeiden"
"Invalid 'next' page value in paging info: %s, stopping pagination": "Ungültiger 'next'-Seitenwert in Paginierungsinfo: %s, beende Paginierung"
"Invalid 'pageNum' in paging info: %s, stopping pagination": "Ungültiger 'pageNum'-Wert in Paginierungsinfo: %s, beende Paginierung"
#################################################
kleinanzeigen_bot/utils/i18n.py:

View File

@@ -16,12 +16,55 @@ from . import i18n
T = TypeVar("T")
def coerce_page_number(value:Any) -> int | None:
"""Safely coerce a value to int or return None if conversion fails.
Whole-number floats are accepted; non-integer floats are rejected.
Args:
value: Value to coerce to int (can be int, str, float, or any type)
Returns:
int if value can be safely coerced, None otherwise
Examples:
>>> coerce_page_number(1)
1
>>> coerce_page_number("2")
2
>>> coerce_page_number(3.0)
3
>>> coerce_page_number(3.5) is None
True
>>> coerce_page_number(True) is None # Not 1!
True
>>> coerce_page_number(None) is None
True
>>> coerce_page_number("invalid") is None
True
>>> coerce_page_number([1, 2, 3]) is None
True
"""
if value is None:
return None
if isinstance(value, bool):
return None
if isinstance(value, float):
if value.is_integer():
return int(value)
return None
try:
return int(value)
except (TypeError, ValueError):
return None
def ensure(
condition:Any | bool | Callable[[], bool], # noqa: FBT001 Boolean-typed positional argument in function definition
error_message:str,
timeout:float = 5,
poll_frequency:float = 0.5
) -> None:
condition:Any | bool | Callable[[], bool], # noqa: FBT001 Boolean-typed positional argument in function definition
error_message:str,
timeout:float = 5,
poll_frequency:float = 0.5,
) -> None:
"""
Ensure a condition is true, retrying until timeout.
@@ -152,12 +195,7 @@ def parse_decimal(number:float | int | str) -> decimal.Decimal:
raise decimal.DecimalException(f"Invalid number format: {number}") from ex
def parse_datetime(
date:datetime | str | None,
*,
add_timezone_if_missing:bool = True,
use_local_timezone:bool = True
) -> datetime | None:
def parse_datetime(date:datetime | str | None, *, add_timezone_if_missing:bool = True, use_local_timezone:bool = True) -> datetime | None:
"""
Parses a datetime object or ISO-formatted string.
@@ -184,10 +222,7 @@ def parse_datetime(
dt = date if isinstance(date, datetime) else datetime.fromisoformat(date)
if dt.tzinfo is None and add_timezone_if_missing:
dt = (
dt.astimezone() if use_local_timezone
else dt.replace(tzinfo = timezone.utc)
)
dt = dt.astimezone() if use_local_timezone else dt.replace(tzinfo = timezone.utc)
return dt