fix: Correct pagination selectors and logic for issue #477 (#479)

2026-03-12 10:31:50 +01:00 · 2025-04-21 20:26:02 +02:00
parent c144801d2e
commit 79af6ba861
4 changed files with 161 additions and 71 deletions
--- a/src/kleinanzeigen_bot/init.py
+++ b/src/kleinanzeigen_bot/init.py
@@ -157,7 +157,8 @@ class KleinanzeigenBot(WebScrapingMixin):
                    * new: Veröffentlicht nur neue Anzeigen (d.h. Anzeigen ohne ID in der Konfigurationsdatei)
                    * changed: Veröffentlicht nur Anzeigen, die seit der letzten Veröffentlichung geändert wurden
                    * <id(s)>: Gibt eine oder mehrere Anzeigen-IDs an, die veröffentlicht werden sollen, z. B. "--ads=1,2,3", ignoriert republication_interval
-                    * Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch fällige Anzeigen zu veröffentlichen
+                    * Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch
                      fällige Anzeigen zu veröffentlichen
              --ads=all|new|<id(s)> (download) - Gibt an, welche Anzeigen heruntergeladen werden sollen (STANDARD: new)
                    Mögliche Werte:
                    * all: Lädt alle Anzeigen aus Ihrem Profil herunter
@@ -401,7 +402,8 @@ class KleinanzeigenBot(WebScrapingMixin):
                ensure(dicts.safe_get(ad_cfg, *path.split(".")) in allowed, f"-> property [{path}] must be one of: {allowed} @ [{ad_file}]")
            def assert_min_len(path:str, minlen:int) -> None:
-                ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen, f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]")
+                ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen,
                       f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]")
            def assert_has_value(path:str) -> None:
                ensure(dicts.safe_get(ad_cfg, *path.split(".")), f"-> property [{path}] not specified @ [{ad_file}]")
--- a/src/kleinanzeigen_bot/extract.py
+++ b/src/kleinanzeigen_bot/extract.py
@@ -135,54 +135,106 @@ class AdExtractor(WebScrapingMixin):
        """
        # navigate to "your ads" page
        await self.web_open('https://www.kleinanzeigen.de/m-meine-anzeigen.html')
-        await self.web_sleep(2000, 3000)
+        await self.web_sleep(2000, 3000)  # Consider replacing with explicit waits later
-        # collect ad references:
+        # Try to find the main ad list container first
        pagination_section = await self.web_find(By.CSS_SELECTOR, 'section:nth-of-type(4)',
                parent = await self.web_find(By.CSS_SELECTOR, '.l-splitpage'))
        # scroll down to load dynamically
        await self.web_scroll_page_down()
        await self.web_sleep(2000, 3000)
        # detect multi-page
        try:
-            pagination = await self.web_find(By.CSS_SELECTOR, 'div > div:nth-of-type(2) > div:nth-of-type(2) > div',
+            ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
-                    parent = pagination_section)
+        except TimeoutError:
-        except TimeoutError:  # 0 ads - no pagination area
+            LOG.warning('Ad list container #my-manageitems-adlist not found. Maybe no ads present?')
            LOG.warning('There are currently no ads on your profile!')
            return []
-        n_buttons = len(await self.web_find_all(By.CSS_SELECTOR, 'em',
+        # --- Pagination handling ---
-                parent = await self.web_find(By.CSS_SELECTOR, 'div:nth-of-type(1)', parent = pagination)))
+        multi_page = False
-        if n_buttons > 1:
+        try:
-            multi_page = True
+            # Correct selector: the class name is capitalized ('.Pagination')
-            LOG.info('It looks like you have many ads!')
+            pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=10)  # Increased timeout slightly
-        else:
+            # Correct selector: Use 'aria-label'
-            multi_page = False
+            # Also check that the button is present and enabled (the enabled check is not strictly necessary here; it only matters when clicking later)
-            LOG.info('It looks like all your ads fit on one overview page.')
+            next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
            if next_buttons:
                # Check if at least one 'Nächste' button is not disabled (optional but good practice)
                enabled_next_buttons = [btn for btn in next_buttons if not btn.attrs.get('disabled')]
                if enabled_next_buttons:
                    multi_page = True
                    LOG.info('Multiple ad pages detected.')
                else:
                    LOG.info('Next button found but is disabled. Assuming single effective page.')
            else:
                LOG.info('No "Naechste" button found within pagination. Assuming single page.')
        except TimeoutError:
            # This will now correctly trigger only if the '.Pagination' div itself is not found
            LOG.info('No pagination controls found. Assuming single page.')
        except Exception as e:
            LOG.error("Error during pagination detection: %s", e, exc_info=True)
            LOG.info('Assuming single page due to error during pagination check.')
        # --- End Pagination Handling ---
        refs:list[str] = []
-        while True:  # loop reference extraction until no more forward page
+        current_page = 1
-            # extract references
+        while True:  # Loop reference extraction
-            list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox',
+            LOG.info("Extracting ads from page %s...", current_page)
-                    parent = await self.web_find(By.ID, 'my-manageitems-adlist'))
+            # scroll down to load dynamically if necessary
-            refs += [
+            await self.web_scroll_page_down()
-                (await self.web_find(By.CSS_SELECTOR, 'article > section > section:nth-of-type(2) > h3 > div > a', parent = li)).attrs['href']
+            await self.web_sleep(2000, 3000)  # Consider replacing with explicit waits
-                for li in list_items
+
-            ]
+            # Re-find the ad list container on the current page/state
            try:
                ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
                list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox', parent=ad_list_container)
                LOG.info("Found %s ad items on page %s.", len(list_items), current_page)
            except TimeoutError:
                LOG.warning("Could not find ad list container or items on page %s.", current_page)
                break  # Stop if ads disappear
            # Extract references using the CORRECTED selector
            try:
                page_refs = [
                    (await self.web_find(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=li)).attrs['href']
                    for li in list_items
                ]
                refs.extend(page_refs)
                LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), current_page)
            except Exception as e:
                # If extraction fails for any item, none of this page's refs are added; log the error and carry on
                LOG.error("Error extracting refs on page %s: %s", current_page, e, exc_info=True)
            if not multi_page:  # only one iteration for single-page overview
                break
-            # check if last page
+
-            nav_button:Element = (await self.web_find_all(By.CSS_SELECTOR, 'button.jsx-1553636621'))[-1]
+            # --- Navigate to next page ---
-            if nav_button.attrs['title'] != 'Nächste':
+            try:
-                LOG.info('Last ad overview page explored.')
+                # Find the pagination section again (scope might have changed after scroll/wait)
                pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=5)
                # Find the "Next" button using the correct aria-label selector and ensure it's not disabled
                next_button_element = None
                possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
                for btn in possible_next_buttons:
                    if not btn.attrs.get('disabled'):  # Check if the button is enabled
                        next_button_element = btn
                        break  # Found an enabled next button
                if next_button_element:
                    LOG.info("Navigating to next page...")
                    await next_button_element.click()
                    current_page += 1
                    # Wait for page load - consider waiting for a specific element on the new page instead of fixed sleep
                    await self.web_sleep(3000, 4000)
                else:
                    LOG.info('Last ad overview page explored (no enabled "Naechste" button found).')
                    break
            except TimeoutError:
                # This might happen if pagination disappears on the last page after loading
                LOG.info("No pagination controls found after scrolling/waiting. Assuming last page.")
                break
-            # navigate to next overview page
+            except Exception as e:
-            await nav_button.click()
+                LOG.error("Error during pagination navigation: %s", e, exc_info=True)
-            await self.web_sleep(2000, 3000)
+                break
-            await self.web_scroll_page_down()
+            # --- End Navigation ---
        if not refs:
            LOG.warning('No ad URLs were extracted.')
        return refs
--- a/src/kleinanzeigen_bot/resources/translations.de.yaml
+++ b/src/kleinanzeigen_bot/resources/translations.de.yaml
@@ -152,10 +152,23 @@ kleinanzeigen_bot/extract.py:
    "The ad ID could not be extracted from the given URL %s": "Die Anzeigen-ID konnte nicht aus der angegebenen URL %s extrahiert werden"
  extract_own_ads_urls:
-    "There are currently no ads on your profile!": "Es gibt derzeit keine Anzeigen in deinem Profil!"
+    "Ad list container #my-manageitems-adlist not found. Maybe no ads present?": "Anzeigenlistencontainer #my-manageitems-adlist nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?"
-    "It looks like you have many ads!": "Es sieht so aus, als hättest du viele Anzeigen!"
+    "Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt."
-    "It looks like all your ads fit on one overview page.": "Es sieht so aus, als würden alle deine Anzeigen auf eine Übersichtsseite passen."
+    "Next button found but is disabled. Assuming single effective page.": "Weiter-Button gefunden, aber deaktiviert. Es wird von einer einzelnen effektiven Seite ausgegangen."
-    "Last ad overview page explored.": "Letzte Übersichtsseite erkundet."
+    "No \"Naechste\" button found within pagination. Assuming single page.": "Kein \"Nächste\"-Button in der Paginierung gefunden. Es wird von einer einzelnen Seite ausgegangen."
    "No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen."
    "Assuming single page due to error during pagination check.": "Es wird von einer einzelnen Seite ausgegangen wegen eines Fehlers bei der Paginierungsprüfung."
    "Navigating to next page...": "Navigiere zur nächsten Seite..."
    "Last ad overview page explored (no enabled \"Naechste\" button found).": "Letzte Anzeigenübersichtsseite erkundet (kein aktivierter \"Nächste\"-Button gefunden)."
    "No pagination controls found after scrolling/waiting. Assuming last page.": "Keine Paginierungssteuerung nach dem Scrollen/Warten gefunden. Es wird von der letzten Seite ausgegangen."
    "No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
    "Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden."
    "Error during pagination detection: %s": "Fehler bei der Paginierungserkennung: %s"
    "Error during pagination navigation: %s": "Fehler bei der Paginierungsnavigation: %s"
    "Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
    "Extracting ads from page %s...": "Extrahiere Anzeigen von Seite %s..."
    "Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
    "Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
  naviagte_to_ad_page:
    "There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID."
--- a/tests/unit/test_extract.py
+++ b/tests/unit/test_extract.py
@@ -261,42 +261,65 @@ class TestAdExtractorNavigation:
    @pytest.mark.asyncio
    async def test_extract_own_ads_urls(self, test_extractor: AdExtractor) -> None:
        """Test extraction of own ads URLs - basic test."""
-        with patch.object(test_extractor, 'web_open', new_callable = AsyncMock), \
+        with patch.object(test_extractor, 'web_open', new_callable=AsyncMock), \
-                patch.object(test_extractor, 'web_sleep', new_callable = AsyncMock), \
+                patch.object(test_extractor, 'web_sleep', new_callable=AsyncMock), \
-                patch.object(test_extractor, 'web_find', new_callable = AsyncMock) as mock_web_find, \
+                patch.object(test_extractor, 'web_find', new_callable=AsyncMock) as mock_web_find, \
-                patch.object(test_extractor, 'web_find_all', new_callable = AsyncMock) as mock_web_find_all, \
+                patch.object(test_extractor, 'web_find_all', new_callable=AsyncMock) as mock_web_find_all, \
-                patch.object(test_extractor, 'web_scroll_page_down', new_callable = AsyncMock), \
+                patch.object(test_extractor, 'web_scroll_page_down', new_callable=AsyncMock), \
-                patch.object(test_extractor, 'web_execute', new_callable = AsyncMock):
+                patch.object(test_extractor, 'web_execute', new_callable=AsyncMock):
-            # Setup mock objects for DOM elements
+            # --- Setup mock objects for DOM elements ---
-            splitpage = MagicMock()
+            # Mocks needed for the actual execution flow
-            pagination_section = MagicMock()
+            ad_list_container_mock = MagicMock()
-            pagination = MagicMock()
+            pagination_section_mock = MagicMock()
-            pagination_div = MagicMock()
+            cardbox_mock = MagicMock()    # Represents the <li> element
-            ad_list = MagicMock()
+            link_mock = MagicMock()      # Represents the <a> element
-            cardbox = MagicMock()
+            link_mock.attrs = {'href': '/s-anzeige/test/12345'}    # Configure the desired output
            link = MagicMock()
            link.attrs = {'href': '/s-anzeige/test/12345'}
-            # Setup mock responses for web_find
+            # Mocks for elements potentially checked but maybe not strictly needed for output
            # (depending on how robust the mocking is)
            # next_button_mock = MagicMock() # If needed for multi_page logic
            # --- Setup mock responses for web_find and web_find_all in CORRECT ORDER ---
            # 1. Initial find for ad list container (before loop)
            # 2. Find for pagination section (pagination check)
            # 3. Find for ad list container (inside loop)
            # 4. Find for the link (inside list comprehension)
            mock_web_find.side_effect = [
-                splitpage,  # .l-splitpage
+                ad_list_container_mock,          # Call 1: find #my-manageitems-adlist (before loop)
-                pagination_section,  # section:nth-of-type(4)
+                pagination_section_mock,         # Call 2: find .Pagination
-                pagination,  # div > div:nth-of-type(2) > div:nth-of-type(2) > div
+                ad_list_container_mock,          # Call 3: find #my-manageitems-adlist (inside loop)
-                pagination_div,  # div:nth-of-type(1)
+                link_mock                        # Call 4: find 'div.manageitems-item-ad h3 a.text-onSurface'
-                ad_list,  # my-manageitems-adlist
+                # Add more mocks here if the pagination navigation logic calls web_find again
                link  # article > section > section:nth-of-type(2) > h2 > div > a
            ]
-            # Setup mock responses for web_find_all
+            # 1. Find all 'Nächste' buttons (pagination check) - Return empty list for single page test case
            # 2. Find all '.cardbox' elements (inside loop)
            mock_web_find_all.side_effect = [
-                [MagicMock()],  # buttons in pagination
+                [],                              # Call 1: find 'button[aria-label="Nächste"]' -> No next button = single page
-                [cardbox]  # cardbox elements
+                [cardbox_mock]                   # Call 2: find .cardbox -> One ad item
                # Add more mocks here if pagination navigation calls web_find_all
            ]
-            # Execute test and verify results
+            # --- Execute test and verify results ---
            refs = await test_extractor.extract_own_ads_urls()
-            assert refs == ['/s-anzeige/test/12345']
+
            # --- Assertions ---
            assert refs == ['/s-anzeige/test/12345']  # Now it should match
            # Optional: Verify calls were made as expected
            mock_web_find.assert_has_calls([
                call(By.ID, 'my-manageitems-adlist'),
                call(By.CSS_SELECTOR, '.Pagination', timeout=10),
                call(By.ID, 'my-manageitems-adlist'),
                call(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=cardbox_mock),
            ], any_order=False)  # Check order if important
            mock_web_find_all.assert_has_calls([
                call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section_mock),
                call(By.CLASS_NAME, 'cardbox', parent=ad_list_container_mock),
            ], any_order=False)
 class TestAdExtractorContent: