fix: extend command fails with >25 ads due to pagination (#793)

This commit is contained in:
Jens
2026-01-28 06:08:03 +01:00
committed by GitHub
parent d954e849a2
commit 7098719d5b
7 changed files with 589 additions and 327 deletions

View File

@@ -999,15 +999,23 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
LOG.info("Extending ad '%s' (ID: %s)...", ad_cfg.title, ad_cfg.id)
try:
# Navigate to ad management page
await self.web_open(f"{self.root_url}/m-meine-anzeigen.html")
# Find and click "Verlängern" (extend) button for this ad
# Navigate to ad management page and find extend button across all pages
extend_button_xpath = f'//li[@data-adid="{ad_cfg.id}"]//button[contains(., "Verlängern")]'
try:
await self.web_click(By.XPATH, extend_button_xpath)
except TimeoutError:
async def find_and_click_extend_button(page_num:int) -> bool:
    """Click the extend button of the ad if the current overview page shows it.

    :param page_num: number of the overview page being inspected (only used for logging)
    :return: True once the button was clicked (pagination stops), False when a
        TimeoutError occurred so that the next page is tried
    """
    clicked = False
    try:
        button = await self.web_find(By.XPATH, extend_button_xpath, timeout = self._timeout("quick_dom"))
        LOG.info("Found extend button on page %s", page_num)
        # The click deliberately stays inside the try block: a timeout while
        # clicking is handled exactly like a button that was not found.
        await button.click()
        clicked = True
    except TimeoutError:
        LOG.debug("Extend button not found on page %s", page_num)
    return clicked
success = await self._navigate_paginated_ad_overview(find_and_click_extend_button, page_url = f"{self.root_url}/m-meine-anzeigen.html")
if not success:
LOG.error(" -> FAILED: Could not find extend button for ad ID %s", ad_cfg.id)
return False

View File

@@ -148,104 +148,34 @@ class AdExtractor(WebScrapingMixin):
:return: the links to your ad pages
"""
# navigate to "your ads" page
await self.web_open("https://www.kleinanzeigen.de/m-meine-anzeigen.html")
await self.web_sleep(2000, 3000) # Consider replacing with explicit waits later
# Try to find the main ad list container first
try:
_ = await self.web_find(By.ID, "my-manageitems-adlist")
except TimeoutError:
LOG.warning("Ad list container #my-manageitems-adlist not found. Maybe no ads present?")
return []
# --- Pagination handling ---
multi_page = False
pagination_timeout = self._timeout("pagination_initial")
try:
# Correct selector: Use uppercase '.Pagination'
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = pagination_timeout) # Increased timeout slightly
# Correct selector: Use 'aria-label'
# Also check if the button is actually present AND potentially enabled (though enabled check isn't strictly necessary here, only for clicking later)
next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
if next_buttons:
# Check if at least one 'Nächste' button is not disabled (optional but good practice)
enabled_next_buttons = [btn for btn in next_buttons if not btn.attrs.get("disabled")]
if enabled_next_buttons:
multi_page = True
LOG.info("Multiple ad pages detected.")
else:
LOG.info("Next button found but is disabled. Assuming single effective page.")
else:
LOG.info('No "Naechste" button found within pagination. Assuming single page.')
except TimeoutError:
# This will now correctly trigger only if the '.Pagination' div itself is not found
LOG.info("No pagination controls found. Assuming single page.")
except Exception as e:
LOG.exception("Error during pagination detection: %s", e)
LOG.info("Assuming single page due to error during pagination check.")
# --- End Pagination Handling ---
refs:list[str] = []
current_page = 1
while True: # Loop reference extraction
LOG.info("Extracting ads from page %s...", current_page)
# scroll down to load dynamically if necessary
await self.web_scroll_page_down()
await self.web_sleep(2000, 3000) # Consider replacing with explicit waits
# Re-find the ad list container on the current page/state
async def extract_page_refs(page_num:int) -> bool:
"""Extract ad reference URLs from the current page.
:param page_num: The current page number being processed
:return: True to stop pagination (e.g. ads container disappeared), False to continue to next page
"""
try:
ad_list_container = await self.web_find(By.ID, "my-manageitems-adlist")
list_items = await self.web_find_all(By.CLASS_NAME, "cardbox", parent = ad_list_container)
LOG.info("Found %s ad items on page %s.", len(list_items), current_page)
except TimeoutError:
LOG.warning("Could not find ad list container or items on page %s.", current_page)
break # Stop if ads disappear
LOG.info("Found %s ad items on page %s.", len(list_items), page_num)
# Extract references using the CORRECTED selector
try:
page_refs:list[str] = [str((await self.web_find(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = li)).attrs["href"]) for li in list_items]
refs.extend(page_refs)
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), current_page)
except Exception as e:
# Log the error if extraction fails for some items, but try to continue
LOG.exception("Error extracting refs on page %s: %s", current_page, e)
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), page_num)
return False # Continue to next page
if not multi_page: # only one iteration for single-page overview
break
# --- Navigate to next page ---
follow_up_timeout = self._timeout("pagination_follow_up")
try:
# Find the pagination section again (scope might have changed after scroll/wait)
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = follow_up_timeout)
# Find the "Next" button using the correct aria-label selector and ensure it's not disabled
next_button_element = None
possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
for btn in possible_next_buttons:
if not btn.attrs.get("disabled"): # Check if the button is enabled
next_button_element = btn
break # Found an enabled next button
if next_button_element:
LOG.info("Navigating to next page...")
await next_button_element.click()
current_page += 1
# Wait for page load - consider waiting for a specific element on the new page instead of fixed sleep
await self.web_sleep(3000, 4000)
else:
LOG.info('Last ad overview page explored (no enabled "Naechste" button found).')
break
except TimeoutError:
# This might happen if pagination disappears on the last page after loading
LOG.info("No pagination controls found after scrolling/waiting. Assuming last page.")
break
LOG.warning("Could not find ad list container or items on page %s.", page_num)
return True # Stop pagination (ads disappeared)
except Exception as e:
LOG.exception("Error during pagination navigation: %s", e)
break
# --- End Navigation ---
# Continue despite error for resilience against transient web scraping issues
# (e.g., DOM structure changes, network glitches). LOG.exception ensures visibility.
LOG.exception("Error extracting refs on page %s: %s", page_num, e)
return False # Continue to next page
await self._navigate_paginated_ad_overview(extract_page_refs)
if not refs:
LOG.warning("No ad URLs were extracted.")

View File

@@ -112,6 +112,9 @@ kleinanzeigen_bot/__init__.py:
" -> FAILED: Timeout while extending ad '%s': %s": " -> FEHLER: Zeitüberschreitung beim Verlängern der Anzeige '%s': %s"
" -> FAILED: Could not persist extension for ad '%s': %s": " -> FEHLER: Verlängerung der Anzeige '%s' konnte nicht gespeichert werden: %s"
find_and_click_extend_button:
"Found extend button on page %s": "'Verlängern'-Button auf Seite %s gefunden"
finalize_installation_mode:
"Config file: %s": "Konfigurationsdatei: %s"
"First run detected, prompting user for installation mode": "Erster Start erkannt, frage Benutzer nach Installationsmodus"
@@ -259,21 +262,11 @@ kleinanzeigen_bot/extract.py:
"Failed to extract ad ID from URL '%s': %s": "Fehler beim Extrahieren der Anzeigen-ID aus der URL '%s': %s"
extract_own_ads_urls:
"Ad list container #my-manageitems-adlist not found. Maybe no ads present?": "Anzeigenlistencontainer #my-manageitems-adlist nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?"
"Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt."
"Next button found but is disabled. Assuming single effective page.": "Weiter-Button gefunden, aber deaktiviert. Es wird von einer einzelnen effektiven Seite ausgegangen."
"No \"Naechste\" button found within pagination. Assuming single page.": "Kein \"Nächste\"-Button in der Paginierung gefunden. Es wird von einer einzelnen Seite ausgegangen."
"No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen."
"Assuming single page due to error during pagination check.": "Es wird von einer einzelnen Seite ausgegangen wegen eines Fehlers bei der Paginierungsprüfung."
"Navigating to next page...": "Navigiere zur nächsten Seite..."
"Last ad overview page explored (no enabled \"Naechste\" button found).": "Letzte Anzeigenübersichtsseite erkundet (kein aktivierter \"Nächste\"-Button gefunden)."
"No pagination controls found after scrolling/waiting. Assuming last page.": "Keine Paginierungssteuerung nach dem Scrollen/Warten gefunden. Es wird von der letzten Seite ausgegangen."
"No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
extract_page_refs:
"Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden."
"Error during pagination detection: %s": "Fehler bei der Paginierungserkennung: %s"
"Error during pagination navigation: %s": "Fehler bei der Paginierungsnavigation: %s"
"Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
"Extracting ads from page %s...": "Extrahiere Anzeigen von Seite %s..."
"Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
"Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
@@ -488,6 +481,18 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py:
"Combobox missing aria-controls attribute": "Combobox fehlt aria-controls Attribut"
"No matching option found in combobox: '%s'": "Keine passende Option in Combobox gefunden: '%s'"
_navigate_paginated_ad_overview:
"Failed to open ad overview page at %s: timeout": "Fehler beim Öffnen der Anzeigenübersichtsseite unter %s: Zeitüberschreitung"
"Scroll timeout on page %s (non-critical, continuing)": "Zeitüberschreitung beim Scrollen auf Seite %s (nicht kritisch, wird fortgesetzt)"
"Page action timed out on page %s": "Seitenaktion hat auf Seite %s eine Zeitüberschreitung erreicht"
"Ad list container not found. Maybe no ads present?": "Anzeigenlistencontainer nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?"
"Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt."
"No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen."
"Processing page %s...": "Verarbeite Seite %s..."
"Navigating to page %s...": "Navigiere zu Seite %s..."
"Last page reached (no enabled 'Naechste' button found).": "Letzte Seite erreicht (kein aktivierter 'Naechste'-Button gefunden)."
"No pagination controls found. Assuming last page.": "Keine Paginierungssteuerung gefunden. Es wird von der letzten Seite ausgegangen."
close_browser_session:
"Closing Browser session...": "Schließe Browser-Sitzung..."

View File

@@ -969,6 +969,111 @@ class WebScrapingMixin:
)
await self.page.sleep(duration / 1_000)
async def _navigate_paginated_ad_overview(
    self,
    page_action:Callable[[int], Awaitable[bool]],
    page_url:str = "https://www.kleinanzeigen.de/m-meine-anzeigen.html",
    *,
    max_pages:int = 10,
) -> bool:
    """
    Walk through the paginated ad overview and call ``page_action`` once per page.

    A TimeoutError raised while opening the page, scrolling, running ``page_action``
    or looking up the pagination controls is handled internally and logged; it is
    never propagated to the caller. Other exception types are not intercepted.

    Args:
        page_action: Async callable that receives the current page number (starting at 1).
            Returning True stops the pagination and makes this method return True;
            returning False moves on to the next page. A TimeoutError escaping from
            the callable ABORTS the pagination (this method then returns False), so a
            callable that merely wants to skip a page must catch TimeoutError itself.
        page_url: URL of the paginated overview page (default: kleinanzeigen ad management page)
        max_pages: Maximum number of pages to process (safety limit). Pages beyond this
            limit are skipped without any log message.

    Returns:
        True if page_action returned True on any page, False otherwise

    Example:
        async def find_ad_callback(page_num:int) -> bool:
            try:
                element = await self.web_find(By.XPATH, "//div[@id='my-ad']")
            except TimeoutError:
                return False  # not on this page -> continue with the next one
            await element.click()
            return True

        success = await self._navigate_paginated_ad_overview(find_ad_callback)
    """
    try:
        await self.web_open(page_url)
    except TimeoutError:
        LOG.warning("Failed to open ad overview page at %s: timeout", page_url)
        return False

    await self.web_sleep(2000, 3000)

    # Without the ad list container there is nothing to paginate over
    try:
        await self.web_find(By.ID, "my-manageitems-adlist")
    except TimeoutError:
        LOG.warning("Ad list container not found. Maybe no ads present?")
        return False

    # Decide once, up front, whether more than one page exists
    multi_page = False
    pagination_timeout = self._timeout("pagination_initial")
    try:
        if await self._find_enabled_next_button(pagination_timeout) is not None:
            multi_page = True
            LOG.info("Multiple ad pages detected.")
    except TimeoutError:
        LOG.info("No pagination controls found. Assuming single page.")

    current_page = 1
    while current_page <= max_pages:
        LOG.info("Processing page %s...", current_page)

        # Scrolling only triggers lazy loading; a timeout here is not fatal
        try:
            await self.web_scroll_page_down()
        except TimeoutError:
            LOG.debug("Scroll timeout on page %s (non-critical, continuing)", current_page)
        await self.web_sleep(2000, 3000)

        try:
            if await page_action(current_page):
                return True
        except TimeoutError:
            LOG.warning("Page action timed out on page %s", current_page)
            return False

        if not multi_page:
            break

        follow_up_timeout = self._timeout("pagination_follow_up")
        try:
            # Looked up again on every page because the DOM changes after navigation
            next_button = await self._find_enabled_next_button(follow_up_timeout)
            if next_button is None:
                LOG.info("Last page reached (no enabled 'Naechste' button found).")
                break
            LOG.info("Navigating to page %s...", current_page + 1)
            await next_button.click()
            await self.web_sleep(3000, 4000)
            current_page += 1
        except TimeoutError:
            LOG.info("No pagination controls found. Assuming last page.")
            break

    return False


async def _find_enabled_next_button(self, timeout:float) -> Any:
    """
    Return the first enabled "Nächste" (next page) button of the pagination bar.

    This helper does not log anything; all log messages stay with the caller.

    Args:
        timeout: Timeout passed to the lookup of the ``.Pagination`` container

    Returns:
        The first button without a ``disabled`` attribute, or None if every button is disabled

    Raises:
        TimeoutError: propagated unchanged from ``web_find`` / ``web_find_all``
    """
    pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = timeout)
    next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
    # `disabled` is an HTML boolean attribute: its presence alone disables the button,
    # even when its value is the empty string. A truthiness check would treat
    # disabled="" as enabled, so test for presence instead.
    return next((btn for btn in next_buttons if btn.attrs.get("disabled") is None), None)
async def web_request(self, url:str, method:str = "GET", valid_response_codes:int | Iterable[int] = 200, headers:dict[str, str] | None = None) -> Any:
method = method.upper()
LOG.debug(" -> HTTP %s [%s]...", method, url)