mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 02:31:45 +01:00
fix: continue own-ad extraction when links are incomplete (#854)
This commit is contained in:
@@ -164,13 +164,33 @@ class AdExtractor(WebScrapingMixin):
|
|||||||
list_items = await self.web_find_all(By.CLASS_NAME, "cardbox", parent = ad_list_container)
|
list_items = await self.web_find_all(By.CLASS_NAME, "cardbox", parent = ad_list_container)
|
||||||
LOG.info("Found %s ad items on page %s.", len(list_items), page_num)
|
LOG.info("Found %s ad items on page %s.", len(list_items), page_num)
|
||||||
|
|
||||||
page_refs:list[str] = [str((await self.web_find(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = li)).attrs["href"]) for li in list_items]
|
page_refs:list[str] = []
|
||||||
|
for index, li in enumerate(list_items, start = 1):
|
||||||
|
try:
|
||||||
|
link_elem = await self.web_find(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = li)
|
||||||
|
href = link_elem.attrs.get("href")
|
||||||
|
if href:
|
||||||
|
page_refs.append(str(href))
|
||||||
|
else:
|
||||||
|
LOG.warning(
|
||||||
|
"Skipping ad item %s/%s on page %s: ad reference link has no href attribute.",
|
||||||
|
index,
|
||||||
|
len(list_items),
|
||||||
|
page_num,
|
||||||
|
)
|
||||||
|
except TimeoutError:
|
||||||
|
LOG.warning(
|
||||||
|
"Skipping ad item %s/%s on page %s: no ad reference link found (likely unpublished or draft item).",
|
||||||
|
index,
|
||||||
|
len(list_items),
|
||||||
|
page_num,
|
||||||
|
)
|
||||||
refs.extend(page_refs)
|
refs.extend(page_refs)
|
||||||
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), page_num)
|
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), page_num)
|
||||||
return False # Continue to next page
|
return False # Continue to next page
|
||||||
|
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
LOG.warning("Could not find ad list container or items on page %s.", page_num)
|
LOG.warning("Could not find ad list container or ad items on page %s.", page_num)
|
||||||
return True # Stop pagination (ads disappeared)
|
return True # Stop pagination (ads disappeared)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Continue despite error for resilience against transient web scraping issues
|
# Continue despite error for resilience against transient web scraping issues
|
||||||
|
|||||||
@@ -299,9 +299,11 @@ kleinanzeigen_bot/extract.py:
|
|||||||
"No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
|
"No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
|
||||||
|
|
||||||
extract_page_refs:
|
extract_page_refs:
|
||||||
"Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden."
|
"Could not find ad list container or ad items on page %s.": "Anzeigenlistencontainer oder Anzeigenelemente auf Seite %s nicht gefunden."
|
||||||
"Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
|
"Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
|
||||||
"Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
|
"Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
|
||||||
|
"Skipping ad item %s/%s on page %s: ad reference link has no href attribute.": "Überspringe Anzeigenelement %s/%s auf Seite %s: Anzeigenlink hat kein href-Attribut."
|
||||||
|
"Skipping ad item %s/%s on page %s: no ad reference link found (likely unpublished or draft item).": "Überspringe Anzeigenelement %s/%s auf Seite %s: kein Anzeigenlink gefunden (wahrscheinlich unveröffentlicht oder Entwurf)."
|
||||||
"Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
|
"Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
|
||||||
|
|
||||||
navigate_to_ad_page:
|
navigate_to_ad_page:
|
||||||
|
|||||||
@@ -703,6 +703,58 @@ class TestAdExtractorNavigation:
|
|||||||
# Pagination should stop (TimeoutError in callback returns True)
|
# Pagination should stop (TimeoutError in callback returns True)
|
||||||
assert refs == []
|
assert refs == []
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_extract_own_ads_urls_skips_single_item_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
    """A link lookup that times out for one ad item drops only that item; the others are still extracted."""
    container = MagicMock()
    ad_items = [MagicMock(), MagicMock()]
    link_with_href = MagicMock()
    link_with_href.attrs = {"href": "/s-anzeige/ok/999"}

    # web_find results are handed out strictly in call order. The last two entries are the
    # per-item link lookups: the first item times out, the second yields a usable link.
    # NOTE(review): the leading entries (container, timeout, container) mirror lookups made by
    # extract_own_ads_urls before the per-item loop - confirm against that method.
    find_results = [container, TimeoutError(), container, TimeoutError(), link_with_href]

    with (
        patch.object(test_extractor, "web_open", new_callable = AsyncMock),
        patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
        patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
        patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, return_value = ad_items),
        patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = find_results),
    ):
        refs = await test_extractor.extract_own_ads_urls()

    # Only the item whose link could be resolved contributes a reference.
    assert refs == ["/s-anzeige/ok/999"]
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_extract_own_ads_urls_skips_single_item_without_href(self, test_extractor:extract_module.AdExtractor) -> None:
    """An anchor lacking an href is skipped rather than contributing a 'None' entry."""
    container = MagicMock()
    ad_items = [MagicMock(), MagicMock()]

    link_without_href = MagicMock()
    link_without_href.attrs = {}
    link_with_href = MagicMock()
    link_with_href.attrs = {"href": "/s-anzeige/ok/999"}

    # web_find results are handed out strictly in call order. The last two entries are the
    # per-item link lookups: the first anchor has no href, the second carries a usable one.
    # NOTE(review): the leading entries (container, timeout, container) mirror lookups made by
    # extract_own_ads_urls before the per-item loop - confirm against that method.
    find_results = [container, TimeoutError(), container, link_without_href, link_with_href]

    with (
        patch.object(test_extractor, "web_open", new_callable = AsyncMock),
        patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
        patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
        patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, return_value = ad_items),
        patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = find_results),
    ):
        refs = await test_extractor.extract_own_ads_urls()

    # The href-less anchor must not show up in the result in any form.
    assert refs == ["/s-anzeige/ok/999"]
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_extract_own_ads_urls_generic_exception_in_callback(self, test_extractor:extract_module.AdExtractor) -> None:
|
async def test_extract_own_ads_urls_generic_exception_in_callback(self, test_extractor:extract_module.AdExtractor) -> None:
|
||||||
"""Test that generic Exception in extract_page_refs callback continues pagination."""
|
"""Test that generic Exception in extract_page_refs callback continues pagination."""
|
||||||
|
|||||||
Reference in New Issue
Block a user