diff --git a/src/kleinanzeigen_bot/__init__.py b/src/kleinanzeigen_bot/__init__.py index 78ee497..8499d6a 100644 --- a/src/kleinanzeigen_bot/__init__.py +++ b/src/kleinanzeigen_bot/__init__.py @@ -157,7 +157,8 @@ class KleinanzeigenBot(WebScrapingMixin): * new: Veröffentlicht nur neue Anzeigen (d.h. Anzeigen ohne ID in der Konfigurationsdatei) * changed: Veröffentlicht nur Anzeigen, die seit der letzten Veröffentlichung geändert wurden * : Gibt eine oder mehrere Anzeigen-IDs an, die veröffentlicht werden sollen, z. B. "--ads=1,2,3", ignoriert republication_interval - * Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch fällige Anzeigen zu veröffentlichen + * Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch + fällige Anzeigen zu veröffentlichen --ads=all|new| (download) - Gibt an, welche Anzeigen heruntergeladen werden sollen (STANDARD: new) Mögliche Werte: * all: Lädt alle Anzeigen aus Ihrem Profil herunter @@ -401,7 +402,8 @@ class KleinanzeigenBot(WebScrapingMixin): ensure(dicts.safe_get(ad_cfg, *path.split(".")) in allowed, f"-> property [{path}] must be one of: {allowed} @ [{ad_file}]") def assert_min_len(path:str, minlen:int) -> None: - ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen, f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]") + ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen, + f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]") def assert_has_value(path:str) -> None: ensure(dicts.safe_get(ad_cfg, *path.split(".")), f"-> property [{path}] not specified @ [{ad_file}]") diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py index 8374a4b..2b586bf 100644 --- a/src/kleinanzeigen_bot/extract.py +++ b/src/kleinanzeigen_bot/extract.py @@ -135,54 +135,106 @@ class AdExtractor(WebScrapingMixin): """ # 
navigate to "your ads" page await self.web_open('https://www.kleinanzeigen.de/m-meine-anzeigen.html') - await self.web_sleep(2000, 3000) + await self.web_sleep(2000, 3000) # Consider replacing with explicit waits later - # collect ad references: - pagination_section = await self.web_find(By.CSS_SELECTOR, 'section:nth-of-type(4)', - parent = await self.web_find(By.CSS_SELECTOR, '.l-splitpage')) - - # scroll down to load dynamically - await self.web_scroll_page_down() - await self.web_sleep(2000, 3000) - - # detect multi-page + # Try to find the main ad list container first try: - pagination = await self.web_find(By.CSS_SELECTOR, 'div > div:nth-of-type(2) > div:nth-of-type(2) > div', - parent = pagination_section) - except TimeoutError: # 0 ads - no pagination area - LOG.warning('There are currently no ads on your profile!') + ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist') + except TimeoutError: + LOG.warning('Ad list container #my-manageitems-adlist not found. Maybe no ads present?') return [] - n_buttons = len(await self.web_find_all(By.CSS_SELECTOR, 'em', - parent = await self.web_find(By.CSS_SELECTOR, 'div:nth-of-type(1)', parent = pagination))) - if n_buttons > 1: - multi_page = True - LOG.info('It looks like you have many ads!') - else: - multi_page = False - LOG.info('It looks like all your ads fit on one overview page.') + # --- Pagination handling --- + multi_page = False + try: + # Correct selector: Use uppercase '.Pagination' + pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=10) # Increased timeout slightly + # Correct selector: Use 'aria-label' + # Also check if the button is actually present AND potentially enabled (though enabled check isn't strictly necessary here, only for clicking later) + next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section) + if next_buttons: + # Check if at least one 'Nächste' button is not disabled (optional but 
good practice) + enabled_next_buttons = [btn for btn in next_buttons if not btn.attrs.get('disabled')] + if enabled_next_buttons: + multi_page = True + LOG.info('Multiple ad pages detected.') + else: + LOG.info('Next button found but is disabled. Assuming single effective page.') + + else: + LOG.info('No "Naechste" button found within pagination. Assuming single page.') + except TimeoutError: + # This will now correctly trigger only if the '.Pagination' div itself is not found + LOG.info('No pagination controls found. Assuming single page.') + except Exception as e: + LOG.error("Error during pagination detection: %s", e, exc_info=True) + LOG.info('Assuming single page due to error during pagination check.') + # --- End Pagination Handling --- refs:list[str] = [] - while True: # loop reference extraction until no more forward page - # extract references - list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox', - parent = await self.web_find(By.ID, 'my-manageitems-adlist')) - refs += [ - (await self.web_find(By.CSS_SELECTOR, 'article > section > section:nth-of-type(2) > h3 > div > a', parent = li)).attrs['href'] - for li in list_items - ] + current_page = 1 + while True: # Loop reference extraction + LOG.info("Extracting ads from page %s...", current_page) + # scroll down to load dynamically if necessary + await self.web_scroll_page_down() + await self.web_sleep(2000, 3000) # Consider replacing with explicit waits + + # Re-find the ad list container on the current page/state + try: + ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist') + list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox', parent=ad_list_container) + LOG.info("Found %s ad items on page %s.", len(list_items), current_page) + except TimeoutError: + LOG.warning("Could not find ad list container or items on page %s.", current_page) + break # Stop if ads disappear + + # Extract references using the CORRECTED selector + try: + page_refs = [ + (await 
self.web_find(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=li)).attrs['href'] + for li in list_items + ] + refs.extend(page_refs) + LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), current_page) + except Exception as e: + # Log the error if extraction fails for some items, but try to continue + LOG.error("Error extracting refs on page %s: %s", current_page, e, exc_info=True) if not multi_page: # only one iteration for single-page overview break - # check if last page - nav_button:Element = (await self.web_find_all(By.CSS_SELECTOR, 'button.jsx-1553636621'))[-1] - if nav_button.attrs['title'] != 'Nächste': - LOG.info('Last ad overview page explored.') + + # --- Navigate to next page --- + try: + # Find the pagination section again (scope might have changed after scroll/wait) + pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=5) + # Find the "Next" button using the correct aria-label selector and ensure it's not disabled + next_button_element = None + possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section) + for btn in possible_next_buttons: + if not btn.attrs.get('disabled'): # Check if the button is enabled + next_button_element = btn + break # Found an enabled next button + + if next_button_element: + LOG.info("Navigating to next page...") + await next_button_element.click() + current_page += 1 + # Wait for page load - consider waiting for a specific element on the new page instead of fixed sleep + await self.web_sleep(3000, 4000) + else: + LOG.info('Last ad overview page explored (no enabled "Naechste" button found).') + break + except TimeoutError: + # This might happen if pagination disappears on the last page after loading + LOG.info("No pagination controls found after scrolling/waiting. 
Assuming last page.") break - # navigate to next overview page - await nav_button.click() - await self.web_sleep(2000, 3000) - await self.web_scroll_page_down() + except Exception as e: + LOG.error("Error during pagination navigation: %s", e, exc_info=True) + break + # --- End Navigation --- + + if not refs: + LOG.warning('No ad URLs were extracted.') return refs diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml index a02c4e7..099abc5 100644 --- a/src/kleinanzeigen_bot/resources/translations.de.yaml +++ b/src/kleinanzeigen_bot/resources/translations.de.yaml @@ -152,10 +152,23 @@ kleinanzeigen_bot/extract.py: "The ad ID could not be extracted from the given URL %s": "Die Anzeigen-ID konnte nicht aus der angegebenen URL %s extrahiert werden" extract_own_ads_urls: - "There are currently no ads on your profile!": "Es gibt derzeit keine Anzeigen in deinem Profil!" - "It looks like you have many ads!": "Es sieht so aus, als hättest du viele Anzeigen!" - "It looks like all your ads fit on one overview page.": "Es sieht so aus, als würden alle deine Anzeigen auf eine Übersichtsseite passen." - "Last ad overview page explored.": "Letzte Übersichtsseite erkundet." + "Ad list container #my-manageitems-adlist not found. Maybe no ads present?": "Anzeigenlistencontainer #my-manageitems-adlist nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?" + "Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt." + "Next button found but is disabled. Assuming single effective page.": "Weiter-Button gefunden, aber deaktiviert. Es wird von einer einzelnen effektiven Seite ausgegangen." + "No \"Naechste\" button found within pagination. Assuming single page.": "Kein \"Nächste\"-Button in der Paginierung gefunden. Es wird von einer einzelnen Seite ausgegangen." + "No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen." 
+ "Assuming single page due to error during pagination check.": "Es wird von einer einzelnen Seite ausgegangen wegen eines Fehlers bei der Paginierungsprüfung." + "Navigating to next page...": "Navigiere zur nächsten Seite..." + "Last ad overview page explored (no enabled \"Naechste\" button found).": "Letzte Anzeigenübersichtsseite erkundet (kein aktivierter \"Nächste\"-Button gefunden)." + "No pagination controls found after scrolling/waiting. Assuming last page.": "Keine Paginierungssteuerung nach dem Scrollen/Warten gefunden. Es wird von der letzten Seite ausgegangen." + "No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert." + "Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden." + "Error during pagination detection: %s": "Fehler bei der Paginierungserkennung: %s" + "Error during pagination navigation: %s": "Fehler bei der Paginierungsnavigation: %s" + "Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s" + "Extracting ads from page %s...": "Extrahiere Anzeigen von Seite %s..." + "Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden." + "Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert." naviagte_to_ad_page: "There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID." 
diff --git a/tests/unit/test_extract.py b/tests/unit/test_extract.py index b2a7b8c..a9c3632 100644 --- a/tests/unit/test_extract.py +++ b/tests/unit/test_extract.py @@ -261,42 +261,65 @@ class TestAdExtractorNavigation: @pytest.mark.asyncio async def test_extract_own_ads_urls(self, test_extractor: AdExtractor) -> None: """Test extraction of own ads URLs - basic test.""" - with patch.object(test_extractor, 'web_open', new_callable = AsyncMock), \ - patch.object(test_extractor, 'web_sleep', new_callable = AsyncMock), \ - patch.object(test_extractor, 'web_find', new_callable = AsyncMock) as mock_web_find, \ - patch.object(test_extractor, 'web_find_all', new_callable = AsyncMock) as mock_web_find_all, \ - patch.object(test_extractor, 'web_scroll_page_down', new_callable = AsyncMock), \ - patch.object(test_extractor, 'web_execute', new_callable = AsyncMock): + with patch.object(test_extractor, 'web_open', new_callable=AsyncMock), \ + patch.object(test_extractor, 'web_sleep', new_callable=AsyncMock), \ + patch.object(test_extractor, 'web_find', new_callable=AsyncMock) as mock_web_find, \ + patch.object(test_extractor, 'web_find_all', new_callable=AsyncMock) as mock_web_find_all, \ + patch.object(test_extractor, 'web_scroll_page_down', new_callable=AsyncMock), \ + patch.object(test_extractor, 'web_execute', new_callable=AsyncMock): - # Setup mock objects for DOM elements - splitpage = MagicMock() - pagination_section = MagicMock() - pagination = MagicMock() - pagination_div = MagicMock() - ad_list = MagicMock() - cardbox = MagicMock() - link = MagicMock() - link.attrs = {'href': '/s-anzeige/test/12345'} + # --- Setup mock objects for DOM elements --- + # Mocks needed for the actual execution flow + ad_list_container_mock = MagicMock() + pagination_section_mock = MagicMock() + cardbox_mock = MagicMock() # Represents the <li> element + link_mock = MagicMock() # Represents the <a> element + link_mock.attrs = {'href': '/s-anzeige/test/12345'} # Configure the desired output - # Setup mock responses for web_find + # Mocks for elements potentially checked but maybe not strictly needed for output + # (depending on how robust the mocking is) + # next_button_mock = MagicMock() # If needed for multi_page logic + + # --- Setup mock responses for web_find and web_find_all in CORRECT ORDER --- + + # 1. Initial find for ad list container (before loop) + # 2. Find for pagination section (pagination check) + # 3. Find for ad list container (inside loop) + # 4. Find for the link (inside list comprehension) mock_web_find.side_effect = [ - splitpage, # .l-splitpage - pagination_section, # section:nth-of-type(4) - pagination, # div > div:nth-of-type(2) > div:nth-of-type(2) > div - pagination_div, # div:nth-of-type(1) - ad_list, # my-manageitems-adlist - link # article > section > section:nth-of-type(2) > h2 > div > a + ad_list_container_mock, # Call 1: find #my-manageitems-adlist (before loop) + pagination_section_mock, # Call 2: find .Pagination + ad_list_container_mock, # Call 3: find #my-manageitems-adlist (inside loop) + link_mock # Call 4: find 'div.manageitems-item-ad h3 a.text-onSurface' + # Add more mocks here if the pagination navigation logic calls web_find again ] - # Setup mock responses for web_find_all + # 1. Find all 'Nächste' buttons (pagination check) - Return empty list for single page test case + # 2.
Find all '.cardbox' elements (inside loop) mock_web_find_all.side_effect = [ - [MagicMock()], # buttons in pagination - [cardbox] # cardbox elements + [], # Call 1: find 'button[aria-label="Nächste"]' -> No next button = single page + [cardbox_mock] # Call 2: find .cardbox -> One ad item + # Add more mocks here if pagination navigation calls web_find_all ] - # Execute test and verify results + # --- Execute test and verify results --- refs = await test_extractor.extract_own_ads_urls() - assert refs == ['/s-anzeige/test/12345'] + + # --- Assertions --- + assert refs == ['/s-anzeige/test/12345'] # Now it should match + + # Optional: Verify calls were made as expected + mock_web_find.assert_has_calls([ + call(By.ID, 'my-manageitems-adlist'), + call(By.CSS_SELECTOR, '.Pagination', timeout=10), + call(By.ID, 'my-manageitems-adlist'), + call(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=cardbox_mock), + ], any_order=False) # Check order if important + + mock_web_find_all.assert_has_calls([ + call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section_mock), + call(By.CLASS_NAME, 'cardbox', parent=ad_list_container_mock), + ], any_order=False) class TestAdExtractorContent: