fix: Correct pagination selectors and logic for issue #477 (#479)

This commit is contained in:
marvinkcode
2025-04-21 20:26:02 +02:00
committed by GitHub
parent c144801d2e
commit 79af6ba861
4 changed files with 161 additions and 71 deletions

View File

@@ -157,7 +157,8 @@ class KleinanzeigenBot(WebScrapingMixin):
* new: Veröffentlicht nur neue Anzeigen (d.h. Anzeigen ohne ID in der Konfigurationsdatei)
* changed: Veröffentlicht nur Anzeigen, die seit der letzten Veröffentlichung geändert wurden
* <id(s)>: Gibt eine oder mehrere Anzeigen-IDs an, die veröffentlicht werden sollen, z. B. "--ads=1,2,3", ignoriert republication_interval
* Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch fällige Anzeigen zu veröffentlichen
* Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch
fällige Anzeigen zu veröffentlichen
--ads=all|new|<id(s)> (download) - Gibt an, welche Anzeigen heruntergeladen werden sollen (STANDARD: new)
Mögliche Werte:
* all: Lädt alle Anzeigen aus Ihrem Profil herunter
@@ -401,7 +402,8 @@ class KleinanzeigenBot(WebScrapingMixin):
ensure(dicts.safe_get(ad_cfg, *path.split(".")) in allowed, f"-> property [{path}] must be one of: {allowed} @ [{ad_file}]")
def assert_min_len(path:str, minlen:int) -> None:
ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen, f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]")
ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen,
f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]")
def assert_has_value(path:str) -> None:
ensure(dicts.safe_get(ad_cfg, *path.split(".")), f"-> property [{path}] not specified @ [{ad_file}]")

View File

@@ -135,54 +135,106 @@ class AdExtractor(WebScrapingMixin):
"""
# navigate to "your ads" page
await self.web_open('https://www.kleinanzeigen.de/m-meine-anzeigen.html')
await self.web_sleep(2000, 3000)
await self.web_sleep(2000, 3000) # Consider replacing with explicit waits later
# collect ad references:
pagination_section = await self.web_find(By.CSS_SELECTOR, 'section:nth-of-type(4)',
parent = await self.web_find(By.CSS_SELECTOR, '.l-splitpage'))
# scroll down to load dynamically
await self.web_scroll_page_down()
await self.web_sleep(2000, 3000)
# detect multi-page
# Try to find the main ad list container first
try:
pagination = await self.web_find(By.CSS_SELECTOR, 'div > div:nth-of-type(2) > div:nth-of-type(2) > div',
parent = pagination_section)
except TimeoutError: # 0 ads - no pagination area
LOG.warning('There are currently no ads on your profile!')
ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
except TimeoutError:
LOG.warning('Ad list container #my-manageitems-adlist not found. Maybe no ads present?')
return []
n_buttons = len(await self.web_find_all(By.CSS_SELECTOR, 'em',
parent = await self.web_find(By.CSS_SELECTOR, 'div:nth-of-type(1)', parent = pagination)))
if n_buttons > 1:
multi_page = True
LOG.info('It looks like you have many ads!')
else:
multi_page = False
LOG.info('It looks like all your ads fit on one overview page.')
# --- Pagination handling ---
multi_page = False
try:
# The pagination container is located via its capitalised class name '.Pagination'
pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=10) # Increased timeout slightly
# The next-page button is identified by its aria-label ("Nächste")
# A disabled button still matches this selector, so the enabled check below decides whether a further page exists
next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
if next_buttons:
# Check if at least one 'Nächste' button is not disabled (optional but good practice)
enabled_next_buttons = [btn for btn in next_buttons if not btn.attrs.get('disabled')]
if enabled_next_buttons:
multi_page = True
LOG.info('Multiple ad pages detected.')
else:
LOG.info('Next button found but is disabled. Assuming single effective page.')
else:
LOG.info('No "Naechste" button found within pagination. Assuming single page.')
except TimeoutError:
# This will now correctly trigger only if the '.Pagination' div itself is not found
LOG.info('No pagination controls found. Assuming single page.')
except Exception as e:
LOG.error("Error during pagination detection: %s", e, exc_info=True)
LOG.info('Assuming single page due to error during pagination check.')
# --- End Pagination Handling ---
refs:list[str] = []
while True: # loop reference extraction until no more forward page
# extract references
list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox',
parent = await self.web_find(By.ID, 'my-manageitems-adlist'))
refs += [
(await self.web_find(By.CSS_SELECTOR, 'article > section > section:nth-of-type(2) > h3 > div > a', parent = li)).attrs['href']
for li in list_items
]
current_page = 1
while True: # Loop reference extraction
LOG.info("Extracting ads from page %s...", current_page)
# scroll down to load dynamically if necessary
await self.web_scroll_page_down()
await self.web_sleep(2000, 3000) # Consider replacing with explicit waits
# Re-find the ad list container on the current page/state
try:
ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox', parent=ad_list_container)
LOG.info("Found %s ad items on page %s.", len(list_items), current_page)
except TimeoutError:
LOG.warning("Could not find ad list container or items on page %s.", current_page)
break # Stop if ads disappear
# Extract the ad link (href) from each card on the current page
try:
page_refs = [
(await self.web_find(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=li)).attrs['href']
for li in list_items
]
refs.extend(page_refs)
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), current_page)
except Exception as e:
# Log the error if extraction fails for some items, but try to continue
LOG.error("Error extracting refs on page %s: %s", current_page, e, exc_info=True)
if not multi_page: # only one iteration for single-page overview
break
# check if last page
nav_button:Element = (await self.web_find_all(By.CSS_SELECTOR, 'button.jsx-1553636621'))[-1]
if nav_button.attrs['title'] != 'Nächste':
LOG.info('Last ad overview page explored.')
# --- Navigate to next page ---
try:
# Find the pagination section again (scope might have changed after scroll/wait)
pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=5)
# Find the "Next" button using the correct aria-label selector and ensure it's not disabled
next_button_element = None
possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
for btn in possible_next_buttons:
if not btn.attrs.get('disabled'): # Check if the button is enabled
next_button_element = btn
break # Found an enabled next button
if next_button_element:
LOG.info("Navigating to next page...")
await next_button_element.click()
current_page += 1
# Wait for page load - consider waiting for a specific element on the new page instead of fixed sleep
await self.web_sleep(3000, 4000)
else:
LOG.info('Last ad overview page explored (no enabled "Naechste" button found).')
break
except TimeoutError:
# This might happen if pagination disappears on the last page after loading
LOG.info("No pagination controls found after scrolling/waiting. Assuming last page.")
break
# navigate to next overview page
await nav_button.click()
await self.web_sleep(2000, 3000)
await self.web_scroll_page_down()
except Exception as e:
LOG.error("Error during pagination navigation: %s", e, exc_info=True)
break
# --- End Navigation ---
if not refs:
LOG.warning('No ad URLs were extracted.')
return refs

View File

@@ -152,10 +152,23 @@ kleinanzeigen_bot/extract.py:
"The ad ID could not be extracted from the given URL %s": "Die Anzeigen-ID konnte nicht aus der angegebenen URL %s extrahiert werden"
extract_own_ads_urls:
"There are currently no ads on your profile!": "Es gibt derzeit keine Anzeigen in deinem Profil!"
"It looks like you have many ads!": "Es sieht so aus, als hättest du viele Anzeigen!"
"It looks like all your ads fit on one overview page.": "Es sieht so aus, als würden alle deine Anzeigen auf eine Übersichtsseite passen."
"Last ad overview page explored.": "Letzte Übersichtsseite erkundet."
"Ad list container #my-manageitems-adlist not found. Maybe no ads present?": "Anzeigenlistencontainer #my-manageitems-adlist nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?"
"Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt."
"Next button found but is disabled. Assuming single effective page.": "Weiter-Button gefunden, aber deaktiviert. Es wird von einer einzelnen effektiven Seite ausgegangen."
"No \"Naechste\" button found within pagination. Assuming single page.": "Kein \"Nächste\"-Button in der Paginierung gefunden. Es wird von einer einzelnen Seite ausgegangen."
"No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen."
"Assuming single page due to error during pagination check.": "Es wird von einer einzelnen Seite ausgegangen wegen eines Fehlers bei der Paginierungsprüfung."
"Navigating to next page...": "Navigiere zur nächsten Seite..."
"Last ad overview page explored (no enabled \"Naechste\" button found).": "Letzte Anzeigenübersichtsseite erkundet (kein aktivierter \"Nächste\"-Button gefunden)."
"No pagination controls found after scrolling/waiting. Assuming last page.": "Keine Paginierungssteuerung nach dem Scrollen/Warten gefunden. Es wird von der letzten Seite ausgegangen."
"No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
"Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden."
"Error during pagination detection: %s": "Fehler bei der Paginierungserkennung: %s"
"Error during pagination navigation: %s": "Fehler bei der Paginierungsnavigation: %s"
"Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
"Extracting ads from page %s...": "Extrahiere Anzeigen von Seite %s..."
"Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
"Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
naviagte_to_ad_page:
"There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID."

View File

@@ -261,42 +261,65 @@ class TestAdExtractorNavigation:
@pytest.mark.asyncio
async def test_extract_own_ads_urls(self, test_extractor: AdExtractor) -> None:
"""Test extraction of own ads URLs - basic test."""
with patch.object(test_extractor, 'web_open', new_callable = AsyncMock), \
patch.object(test_extractor, 'web_sleep', new_callable = AsyncMock), \
patch.object(test_extractor, 'web_find', new_callable = AsyncMock) as mock_web_find, \
patch.object(test_extractor, 'web_find_all', new_callable = AsyncMock) as mock_web_find_all, \
patch.object(test_extractor, 'web_scroll_page_down', new_callable = AsyncMock), \
patch.object(test_extractor, 'web_execute', new_callable = AsyncMock):
with patch.object(test_extractor, 'web_open', new_callable=AsyncMock), \
patch.object(test_extractor, 'web_sleep', new_callable=AsyncMock), \
patch.object(test_extractor, 'web_find', new_callable=AsyncMock) as mock_web_find, \
patch.object(test_extractor, 'web_find_all', new_callable=AsyncMock) as mock_web_find_all, \
patch.object(test_extractor, 'web_scroll_page_down', new_callable=AsyncMock), \
patch.object(test_extractor, 'web_execute', new_callable=AsyncMock):
# Setup mock objects for DOM elements
splitpage = MagicMock()
pagination_section = MagicMock()
pagination = MagicMock()
pagination_div = MagicMock()
ad_list = MagicMock()
cardbox = MagicMock()
link = MagicMock()
link.attrs = {'href': '/s-anzeige/test/12345'}
# --- Setup mock objects for DOM elements ---
# Mocks needed for the actual execution flow
ad_list_container_mock = MagicMock()
pagination_section_mock = MagicMock()
cardbox_mock = MagicMock() # Represents the <li> element
link_mock = MagicMock() # Represents the <a> element
link_mock.attrs = {'href': '/s-anzeige/test/12345'} # Configure the desired output
# Setup mock responses for web_find
# Mocks for elements potentially checked but maybe not strictly needed for output
# (depending on how robust the mocking is)
# next_button_mock = MagicMock() # If needed for multi_page logic
# --- Setup mock responses for web_find and web_find_all in CORRECT ORDER ---
# 1. Initial find for ad list container (before loop)
# 2. Find for pagination section (pagination check)
# 3. Find for ad list container (inside loop)
# 4. Find for the link (inside list comprehension)
mock_web_find.side_effect = [
splitpage, # .l-splitpage
pagination_section, # section:nth-of-type(4)
pagination, # div > div:nth-of-type(2) > div:nth-of-type(2) > div
pagination_div, # div:nth-of-type(1)
ad_list, # my-manageitems-adlist
link # article > section > section:nth-of-type(2) > h2 > div > a
ad_list_container_mock, # Call 1: find #my-manageitems-adlist (before loop)
pagination_section_mock, # Call 2: find .Pagination
ad_list_container_mock, # Call 3: find #my-manageitems-adlist (inside loop)
link_mock # Call 4: find 'div.manageitems-item-ad h3 a.text-onSurface'
# Add more mocks here if the pagination navigation logic calls web_find again
]
# Setup mock responses for web_find_all
# 1. Find all 'Nächste' buttons (pagination check) - Return empty list for single page test case
# 2. Find all '.cardbox' elements (inside loop)
mock_web_find_all.side_effect = [
[MagicMock()], # buttons in pagination
[cardbox] # cardbox elements
[], # Call 1: find 'button[aria-label="Nächste"]' -> No next button = single page
[cardbox_mock] # Call 2: find .cardbox -> One ad item
# Add more mocks here if pagination navigation calls web_find_all
]
# Execute test and verify results
# --- Execute test and verify results ---
refs = await test_extractor.extract_own_ads_urls()
assert refs == ['/s-anzeige/test/12345']
# --- Assertions ---
assert refs == ['/s-anzeige/test/12345'] # Now it should match
# Optional: Verify calls were made as expected
mock_web_find.assert_has_calls([
call(By.ID, 'my-manageitems-adlist'),
call(By.CSS_SELECTOR, '.Pagination', timeout=10),
call(By.ID, 'my-manageitems-adlist'),
call(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=cardbox_mock),
], any_order=False) # Check order if important
mock_web_find_all.assert_has_calls([
call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section_mock),
call(By.CLASS_NAME, 'cardbox', parent=ad_list_container_mock),
], any_order=False)
class TestAdExtractorContent: