mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 18:41:50 +01:00
fix: extend command fails with >25 ads due to pagination (#793)
This commit is contained in:
@@ -999,15 +999,23 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
|
||||
LOG.info("Extending ad '%s' (ID: %s)...", ad_cfg.title, ad_cfg.id)
|
||||
|
||||
try:
|
||||
# Navigate to ad management page
|
||||
await self.web_open(f"{self.root_url}/m-meine-anzeigen.html")
|
||||
|
||||
# Find and click "Verlängern" (extend) button for this ad
|
||||
# Navigate to ad management page and find extend button across all pages
|
||||
extend_button_xpath = f'//li[@data-adid="{ad_cfg.id}"]//button[contains(., "Verlängern")]'
|
||||
|
||||
try:
|
||||
await self.web_click(By.XPATH, extend_button_xpath)
|
||||
except TimeoutError:
|
||||
async def find_and_click_extend_button(page_num:int) -> bool:
    """Click this ad's extend button if it is present on the overview page currently shown.

    :param page_num: number of the overview page being inspected (only used in log output)
    :return: True once the button has been clicked (pagination stops), False to move on to the next page
    """
    try:
        quick_timeout = self._timeout("quick_dom")
        button = await self.web_find(By.XPATH, extend_button_xpath, timeout = quick_timeout)
        LOG.info("Found extend button on page %s", page_num)
        await button.click()
    except TimeoutError:
        # A timeout is treated as "this ad is not listed on this page" - let pagination continue
        LOG.debug("Extend button not found on page %s", page_num)
        return False
    return True
|
||||
|
||||
success = await self._navigate_paginated_ad_overview(find_and_click_extend_button, page_url = f"{self.root_url}/m-meine-anzeigen.html")
|
||||
|
||||
if not success:
|
||||
LOG.error(" -> FAILED: Could not find extend button for ad ID %s", ad_cfg.id)
|
||||
return False
|
||||
|
||||
|
||||
@@ -148,104 +148,34 @@ class AdExtractor(WebScrapingMixin):
|
||||
|
||||
:return: the links to your ad pages
|
||||
"""
|
||||
# navigate to "your ads" page
|
||||
await self.web_open("https://www.kleinanzeigen.de/m-meine-anzeigen.html")
|
||||
await self.web_sleep(2000, 3000) # Consider replacing with explicit waits later
|
||||
|
||||
# Try to find the main ad list container first
|
||||
try:
|
||||
_ = await self.web_find(By.ID, "my-manageitems-adlist")
|
||||
except TimeoutError:
|
||||
LOG.warning("Ad list container #my-manageitems-adlist not found. Maybe no ads present?")
|
||||
return []
|
||||
|
||||
# --- Pagination handling ---
|
||||
multi_page = False
|
||||
pagination_timeout = self._timeout("pagination_initial")
|
||||
try:
|
||||
# Correct selector: Use uppercase '.Pagination'
|
||||
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = pagination_timeout) # Increased timeout slightly
|
||||
# Correct selector: Use 'aria-label'
|
||||
# Also check if the button is actually present AND potentially enabled (though enabled check isn't strictly necessary here, only for clicking later)
|
||||
next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
|
||||
if next_buttons:
|
||||
# Check if at least one 'Nächste' button is not disabled (optional but good practice)
|
||||
enabled_next_buttons = [btn for btn in next_buttons if not btn.attrs.get("disabled")]
|
||||
if enabled_next_buttons:
|
||||
multi_page = True
|
||||
LOG.info("Multiple ad pages detected.")
|
||||
else:
|
||||
LOG.info("Next button found but is disabled. Assuming single effective page.")
|
||||
|
||||
else:
|
||||
LOG.info('No "Naechste" button found within pagination. Assuming single page.')
|
||||
except TimeoutError:
|
||||
# This will now correctly trigger only if the '.Pagination' div itself is not found
|
||||
LOG.info("No pagination controls found. Assuming single page.")
|
||||
except Exception as e:
|
||||
LOG.exception("Error during pagination detection: %s", e)
|
||||
LOG.info("Assuming single page due to error during pagination check.")
|
||||
# --- End Pagination Handling ---
|
||||
|
||||
refs:list[str] = []
|
||||
current_page = 1
|
||||
while True: # Loop reference extraction
|
||||
LOG.info("Extracting ads from page %s...", current_page)
|
||||
# scroll down to load dynamically if necessary
|
||||
await self.web_scroll_page_down()
|
||||
await self.web_sleep(2000, 3000) # Consider replacing with explicit waits
|
||||
|
||||
# Re-find the ad list container on the current page/state
|
||||
async def extract_page_refs(page_num:int) -> bool:
|
||||
"""Extract ad reference URLs from the current page.
|
||||
|
||||
:param page_num: The current page number being processed
|
||||
:return: True to stop pagination (e.g. ads container disappeared), False to continue to next page
|
||||
"""
|
||||
try:
|
||||
ad_list_container = await self.web_find(By.ID, "my-manageitems-adlist")
|
||||
list_items = await self.web_find_all(By.CLASS_NAME, "cardbox", parent = ad_list_container)
|
||||
LOG.info("Found %s ad items on page %s.", len(list_items), current_page)
|
||||
except TimeoutError:
|
||||
LOG.warning("Could not find ad list container or items on page %s.", current_page)
|
||||
break # Stop if ads disappear
|
||||
LOG.info("Found %s ad items on page %s.", len(list_items), page_num)
|
||||
|
||||
# Extract references using the CORRECTED selector
|
||||
try:
|
||||
page_refs:list[str] = [str((await self.web_find(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = li)).attrs["href"]) for li in list_items]
|
||||
refs.extend(page_refs)
|
||||
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), current_page)
|
||||
except Exception as e:
|
||||
# Log the error if extraction fails for some items, but try to continue
|
||||
LOG.exception("Error extracting refs on page %s: %s", current_page, e)
|
||||
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), page_num)
|
||||
return False # Continue to next page
|
||||
|
||||
if not multi_page: # only one iteration for single-page overview
|
||||
break
|
||||
|
||||
# --- Navigate to next page ---
|
||||
follow_up_timeout = self._timeout("pagination_follow_up")
|
||||
try:
|
||||
# Find the pagination section again (scope might have changed after scroll/wait)
|
||||
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = follow_up_timeout)
|
||||
# Find the "Next" button using the correct aria-label selector and ensure it's not disabled
|
||||
next_button_element = None
|
||||
possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
|
||||
for btn in possible_next_buttons:
|
||||
if not btn.attrs.get("disabled"): # Check if the button is enabled
|
||||
next_button_element = btn
|
||||
break # Found an enabled next button
|
||||
|
||||
if next_button_element:
|
||||
LOG.info("Navigating to next page...")
|
||||
await next_button_element.click()
|
||||
current_page += 1
|
||||
# Wait for page load - consider waiting for a specific element on the new page instead of fixed sleep
|
||||
await self.web_sleep(3000, 4000)
|
||||
else:
|
||||
LOG.info('Last ad overview page explored (no enabled "Naechste" button found).')
|
||||
break
|
||||
except TimeoutError:
|
||||
# This might happen if pagination disappears on the last page after loading
|
||||
LOG.info("No pagination controls found after scrolling/waiting. Assuming last page.")
|
||||
break
|
||||
LOG.warning("Could not find ad list container or items on page %s.", page_num)
|
||||
return True # Stop pagination (ads disappeared)
|
||||
except Exception as e:
|
||||
LOG.exception("Error during pagination navigation: %s", e)
|
||||
break
|
||||
# --- End Navigation ---
|
||||
# Continue despite error for resilience against transient web scraping issues
|
||||
# (e.g., DOM structure changes, network glitches). LOG.exception ensures visibility.
|
||||
LOG.exception("Error extracting refs on page %s: %s", page_num, e)
|
||||
return False # Continue to next page
|
||||
|
||||
await self._navigate_paginated_ad_overview(extract_page_refs)
|
||||
|
||||
if not refs:
|
||||
LOG.warning("No ad URLs were extracted.")
|
||||
|
||||
@@ -112,6 +112,9 @@ kleinanzeigen_bot/__init__.py:
|
||||
" -> FAILED: Timeout while extending ad '%s': %s": " -> FEHLER: Zeitüberschreitung beim Verlängern der Anzeige '%s': %s"
|
||||
" -> FAILED: Could not persist extension for ad '%s': %s": " -> FEHLER: Verlängerung der Anzeige '%s' konnte nicht gespeichert werden: %s"
|
||||
|
||||
find_and_click_extend_button:
|
||||
"Found extend button on page %s": "'Verlängern'-Button auf Seite %s gefunden"
|
||||
|
||||
finalize_installation_mode:
|
||||
"Config file: %s": "Konfigurationsdatei: %s"
|
||||
"First run detected, prompting user for installation mode": "Erster Start erkannt, frage Benutzer nach Installationsmodus"
|
||||
@@ -259,21 +262,11 @@ kleinanzeigen_bot/extract.py:
|
||||
"Failed to extract ad ID from URL '%s': %s": "Fehler beim Extrahieren der Anzeigen-ID aus der URL '%s': %s"
|
||||
|
||||
extract_own_ads_urls:
|
||||
"Ad list container #my-manageitems-adlist not found. Maybe no ads present?": "Anzeigenlistencontainer #my-manageitems-adlist nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?"
|
||||
"Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt."
|
||||
"Next button found but is disabled. Assuming single effective page.": "Weiter-Button gefunden, aber deaktiviert. Es wird von einer einzelnen effektiven Seite ausgegangen."
|
||||
"No \"Naechste\" button found within pagination. Assuming single page.": "Kein \"Nächste\"-Button in der Paginierung gefunden. Es wird von einer einzelnen Seite ausgegangen."
|
||||
"No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen."
|
||||
"Assuming single page due to error during pagination check.": "Es wird von einer einzelnen Seite ausgegangen wegen eines Fehlers bei der Paginierungsprüfung."
|
||||
"Navigating to next page...": "Navigiere zur nächsten Seite..."
|
||||
"Last ad overview page explored (no enabled \"Naechste\" button found).": "Letzte Anzeigenübersichtsseite erkundet (kein aktivierter \"Nächste\"-Button gefunden)."
|
||||
"No pagination controls found after scrolling/waiting. Assuming last page.": "Keine Paginierungssteuerung nach dem Scrollen/Warten gefunden. Es wird von der letzten Seite ausgegangen."
|
||||
"No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
|
||||
|
||||
extract_page_refs:
|
||||
"Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden."
|
||||
"Error during pagination detection: %s": "Fehler bei der Paginierungserkennung: %s"
|
||||
"Error during pagination navigation: %s": "Fehler bei der Paginierungsnavigation: %s"
|
||||
"Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
|
||||
"Extracting ads from page %s...": "Extrahiere Anzeigen von Seite %s..."
|
||||
"Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
|
||||
"Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
|
||||
|
||||
@@ -488,6 +481,18 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py:
|
||||
"Combobox missing aria-controls attribute": "Combobox fehlt aria-controls Attribut"
|
||||
"No matching option found in combobox: '%s'": "Keine passende Option in Combobox gefunden: '%s'"
|
||||
|
||||
_navigate_paginated_ad_overview:
|
||||
"Failed to open ad overview page at %s: timeout": "Fehler beim Öffnen der Anzeigenübersichtsseite unter %s: Zeitüberschreitung"
|
||||
"Scroll timeout on page %s (non-critical, continuing)": "Zeitüberschreitung beim Scrollen auf Seite %s (nicht kritisch, wird fortgesetzt)"
|
||||
"Page action timed out on page %s": "Seitenaktion hat auf Seite %s eine Zeitüberschreitung erreicht"
|
||||
"Ad list container not found. Maybe no ads present?": "Anzeigenlistencontainer nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?"
|
||||
"Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt."
|
||||
"No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen."
|
||||
"Processing page %s...": "Verarbeite Seite %s..."
|
||||
"Navigating to page %s...": "Navigiere zu Seite %s..."
|
||||
"Last page reached (no enabled 'Naechste' button found).": "Letzte Seite erreicht (kein aktivierter 'Nächste'-Button gefunden)."
|
||||
"No pagination controls found. Assuming last page.": "Keine Paginierungssteuerung gefunden. Es wird von der letzten Seite ausgegangen."
|
||||
|
||||
close_browser_session:
|
||||
"Closing Browser session...": "Schließe Browser-Sitzung..."
|
||||
|
||||
|
||||
@@ -969,6 +969,111 @@ class WebScrapingMixin:
|
||||
)
|
||||
await self.page.sleep(duration / 1_000)
|
||||
|
||||
async def _navigate_paginated_ad_overview(
    self,
    page_action:Callable[[int], Awaitable[bool]],
    page_url:str = "https://www.kleinanzeigen.de/m-meine-anzeigen.html",
    *,
    max_pages:int = 10,
) -> bool:
    """
    Navigate through the paginated ad overview page, calling page_action on each page.

    The result is always a boolean: a TimeoutError raised while opening the page, locating the
    ad list or the pagination controls, scrolling, or running page_action is caught and logged
    here instead of being propagated.

    Args:
        page_action: Async callable that receives the current page number (starting at 1).
            Returning True stops the navigation and makes this helper return True; returning
            False moves on to the next page. A TimeoutError escaping from page_action aborts
            the whole navigation with False, so a callback that merely did not find its
            element on the current page must catch the timeout itself and return False.
        page_url: URL of the paginated overview page (default: kleinanzeigen ad management page)
        max_pages: Maximum number of pages to process (safety limit). When the limit is hit
            the loop ends and False is returned without a dedicated log message.

    Returns:
        True if page_action returned True on any page, False otherwise (page could not be
        opened, ad list missing, page_action timed out, or all pages were processed).

    Example:
        async def find_ad_callback(page_num:int) -> bool:
            try:
                element = await self.web_find(By.XPATH, "//div[@id='my-ad']")
            except TimeoutError:
                return False  # not on this page - continue with the next one
            await element.click()
            return True

        success = await self._navigate_paginated_ad_overview(find_ad_callback)
    """
    try:
        await self.web_open(page_url)
    except TimeoutError:
        LOG.warning("Failed to open ad overview page at %s: timeout", page_url)
        return False

    await self.web_sleep(2000, 3000)

    # Without the ad list container there is nothing to paginate over
    try:
        await self.web_find(By.ID, "my-manageitems-adlist")
    except TimeoutError:
        LOG.warning("Ad list container not found. Maybe no ads present?")
        return False

    # Decide once, on the first page, whether following pages have to be visited at all
    multi_page = False
    pagination_timeout = self._timeout("pagination_initial")
    try:
        if await self._find_enabled_next_button(pagination_timeout) is not None:
            multi_page = True
            LOG.info("Multiple ad pages detected.")
    except TimeoutError:
        LOG.info("No pagination controls found. Assuming single page.")

    current_page = 1
    while current_page <= max_pages:
        LOG.info("Processing page %s...", current_page)

        # Scrolling is only done before the page action runs; a timeout here is not fatal
        try:
            await self.web_scroll_page_down()
        except TimeoutError:
            LOG.debug("Scroll timeout on page %s (non-critical, continuing)", current_page)

        await self.web_sleep(2000, 3000)

        try:
            if await page_action(current_page):
                return True
        except TimeoutError:
            LOG.warning("Page action timed out on page %s", current_page)
            return False

        if not multi_page:
            break

        follow_up_timeout = self._timeout("pagination_follow_up")
        try:
            # Look the button up again on every page instead of reusing the previous element
            next_button_element = await self._find_enabled_next_button(follow_up_timeout)
            if next_button_element:
                LOG.info("Navigating to page %s...", current_page + 1)
                await next_button_element.click()
                await self.web_sleep(3000, 4000)
                current_page += 1
            else:
                LOG.info("Last page reached (no enabled 'Naechste' button found).")
                break
        except TimeoutError:
            LOG.info("No pagination controls found. Assuming last page.")
            break

    return False

async def _find_enabled_next_button(self, timeout:float) -> Any:
    """
    Return the first enabled "Nächste" (next page) button inside the ".Pagination" section.

    Args:
        timeout: Timeout passed to web_find when locating the ".Pagination" section

    Returns:
        The first button whose attrs hold no truthy "disabled" entry, or None when there is
        no such button.

    Raises:
        TimeoutError: Not handled here; a timeout raised by the element lookups is left to
            the caller, which decides how to log it.
    """
    pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = timeout)
    next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
    # NOTE(review): a bare `disabled` attribute reported as "" would count as enabled here -
    # confirm how the browser library exposes boolean attributes in `attrs`.
    return next((btn for btn in next_buttons or () if not btn.attrs.get("disabled")), None)
|
||||
|
||||
async def web_request(self, url:str, method:str = "GET", valid_response_codes:int | Iterable[int] = 200, headers:dict[str, str] | None = None) -> Any:
|
||||
method = method.upper()
|
||||
LOG.debug(" -> HTTP %s [%s]...", method, url)
|
||||
|
||||
Reference in New Issue
Block a user