diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py
index 6fa82cd..2b74be4 100644
--- a/src/kleinanzeigen_bot/extract.py
+++ b/src/kleinanzeigen_bot/extract.py
@@ -164,13 +164,33 @@ class AdExtractor(WebScrapingMixin):
 
                 list_items = await self.web_find_all(By.CLASS_NAME, "cardbox", parent = ad_list_container)
                 LOG.info("Found %s ad items on page %s.", len(list_items), page_num)
-                page_refs:list[str] = [str((await self.web_find(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = li)).attrs["href"]) for li in list_items]
+                page_refs:list[str] = []
+                for index, li in enumerate(list_items, start = 1):
+                    try:
+                        link_elem = await self.web_find(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = li)
+                        href = link_elem.attrs.get("href")
+                        if href:
+                            page_refs.append(str(href))
+                        else:
+                            LOG.warning(
+                                "Skipping ad item %s/%s on page %s: ad reference link has no href attribute.",
+                                index,
+                                len(list_items),
+                                page_num,
+                            )
+                    except TimeoutError:
+                        LOG.warning(
+                            "Skipping ad item %s/%s on page %s: no ad reference link found (likely unpublished or draft item).",
+                            index,
+                            len(list_items),
+                            page_num,
+                        )
 
                 refs.extend(page_refs)
                 LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), page_num)
                 return False  # Continue to next page
             except TimeoutError:
-                LOG.warning("Could not find ad list container or items on page %s.", page_num)
+                LOG.warning("Could not find ad list container or ad items on page %s.", page_num)
                 return True  # Stop pagination (ads disappeared)
             except Exception as e:
                 # Continue despite error for resilience against transient web scraping issues
diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml
index 6c0618c..90d4045 100644
--- a/src/kleinanzeigen_bot/resources/translations.de.yaml
+++ b/src/kleinanzeigen_bot/resources/translations.de.yaml
@@ -299,9 +299,11 @@ kleinanzeigen_bot/extract.py:
 
    "No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
  extract_page_refs:
-    "Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden."
+    "Could not find ad list container or ad items on page %s.": "Anzeigenlistencontainer oder Anzeigenelemente auf Seite %s nicht gefunden."
     "Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
     "Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
+    "Skipping ad item %s/%s on page %s: ad reference link has no href attribute.": "Überspringe Anzeigenelement %s/%s auf Seite %s: Anzeigenlink hat kein href-Attribut."
+    "Skipping ad item %s/%s on page %s: no ad reference link found (likely unpublished or draft item).": "Überspringe Anzeigenelement %s/%s auf Seite %s: kein Anzeigenlink gefunden (wahrscheinlich unveröffentlicht oder Entwurf)."
     "Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
 
   navigate_to_ad_page:
diff --git a/tests/unit/test_extract.py b/tests/unit/test_extract.py
index 58db50c..b61ac8d 100644
--- a/tests/unit/test_extract.py
+++ b/tests/unit/test_extract.py
@@ -703,6 +703,58 @@ class TestAdExtractorNavigation:
         # Pagination should stop (TimeoutError in callback returns True)
         assert refs == []
 
+    @pytest.mark.asyncio
+    async def test_extract_own_ads_urls_skips_single_item_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
+        """Timeout on one ad item should skip that item but keep extracting others."""
+        ad_list_container_mock = MagicMock()
+        first_item = MagicMock()
+        second_item = MagicMock()
+        valid_link = MagicMock()
+        valid_link.attrs = {"href": "/s-anzeige/ok/999"}
+
+        with (
+            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
+            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
+            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
+            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_item, second_item]),
+            patch.object(
+                test_extractor,
+                "web_find",
+                new_callable = AsyncMock,
+                side_effect = [ad_list_container_mock, TimeoutError(), ad_list_container_mock, TimeoutError(), valid_link],
+            ),
+        ):
+            refs = await test_extractor.extract_own_ads_urls()
+
+        assert refs == ["/s-anzeige/ok/999"]
+
+    @pytest.mark.asyncio
+    async def test_extract_own_ads_urls_skips_single_item_without_href(self, test_extractor:extract_module.AdExtractor) -> None:
+        """Anchor without href should be skipped instead of adding a 'None' entry."""
+        ad_list_container_mock = MagicMock()
+        first_item = MagicMock()
+        second_item = MagicMock()
+        missing_href_link = MagicMock()
+        missing_href_link.attrs = {}
+        valid_link = MagicMock()
+        valid_link.attrs = {"href": "/s-anzeige/ok/999"}
+
+        with (
+            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
+            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
+            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
+            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_item, second_item]),
+            patch.object(
+                test_extractor,
+                "web_find",
+                new_callable = AsyncMock,
+                side_effect = [ad_list_container_mock, TimeoutError(), ad_list_container_mock, missing_href_link, valid_link],
+            ),
+        ):
+            refs = await test_extractor.extract_own_ads_urls()
+
+        assert refs == ["/s-anzeige/ok/999"]
+
     @pytest.mark.asyncio
     async def test_extract_own_ads_urls_generic_exception_in_callback(self, test_extractor:extract_module.AdExtractor) -> None:
         """Test that generic Exception in extract_page_refs callback continues pagination."""