fix: Correct pagination selectors and logic for issue #477 (#479)

This commit is contained in:
marvinkcode
2025-04-21 20:26:02 +02:00
committed by GitHub
parent c144801d2e
commit 79af6ba861
4 changed files with 161 additions and 71 deletions

View File

@@ -157,7 +157,8 @@ class KleinanzeigenBot(WebScrapingMixin):
* new: Veröffentlicht nur neue Anzeigen (d.h. Anzeigen ohne ID in der Konfigurationsdatei) * new: Veröffentlicht nur neue Anzeigen (d.h. Anzeigen ohne ID in der Konfigurationsdatei)
* changed: Veröffentlicht nur Anzeigen, die seit der letzten Veröffentlichung geändert wurden * changed: Veröffentlicht nur Anzeigen, die seit der letzten Veröffentlichung geändert wurden
* <id(s)>: Gibt eine oder mehrere Anzeigen-IDs an, die veröffentlicht werden sollen, z. B. "--ads=1,2,3", ignoriert republication_interval * <id(s)>: Gibt eine oder mehrere Anzeigen-IDs an, die veröffentlicht werden sollen, z. B. "--ads=1,2,3", ignoriert republication_interval
* Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch fällige Anzeigen zu veröffentlichen * Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch
fällige Anzeigen zu veröffentlichen
--ads=all|new|<id(s)> (download) - Gibt an, welche Anzeigen heruntergeladen werden sollen (STANDARD: new) --ads=all|new|<id(s)> (download) - Gibt an, welche Anzeigen heruntergeladen werden sollen (STANDARD: new)
Mögliche Werte: Mögliche Werte:
* all: Lädt alle Anzeigen aus Ihrem Profil herunter * all: Lädt alle Anzeigen aus Ihrem Profil herunter
@@ -401,7 +402,8 @@ class KleinanzeigenBot(WebScrapingMixin):
ensure(dicts.safe_get(ad_cfg, *path.split(".")) in allowed, f"-> property [{path}] must be one of: {allowed} @ [{ad_file}]") ensure(dicts.safe_get(ad_cfg, *path.split(".")) in allowed, f"-> property [{path}] must be one of: {allowed} @ [{ad_file}]")
def assert_min_len(path:str, minlen:int) -> None: def assert_min_len(path:str, minlen:int) -> None:
ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen, f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]") ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen,
f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]")
def assert_has_value(path:str) -> None: def assert_has_value(path:str) -> None:
ensure(dicts.safe_get(ad_cfg, *path.split(".")), f"-> property [{path}] not specified @ [{ad_file}]") ensure(dicts.safe_get(ad_cfg, *path.split(".")), f"-> property [{path}] not specified @ [{ad_file}]")

View File

@@ -135,54 +135,106 @@ class AdExtractor(WebScrapingMixin):
""" """
# navigate to "your ads" page # navigate to "your ads" page
await self.web_open('https://www.kleinanzeigen.de/m-meine-anzeigen.html') await self.web_open('https://www.kleinanzeigen.de/m-meine-anzeigen.html')
await self.web_sleep(2000, 3000) await self.web_sleep(2000, 3000) # Consider replacing with explicit waits later
# collect ad references: # Try to find the main ad list container first
pagination_section = await self.web_find(By.CSS_SELECTOR, 'section:nth-of-type(4)',
parent = await self.web_find(By.CSS_SELECTOR, '.l-splitpage'))
# scroll down to load dynamically
await self.web_scroll_page_down()
await self.web_sleep(2000, 3000)
# detect multi-page
try: try:
pagination = await self.web_find(By.CSS_SELECTOR, 'div > div:nth-of-type(2) > div:nth-of-type(2) > div', ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
parent = pagination_section) except TimeoutError:
except TimeoutError: # 0 ads - no pagination area LOG.warning('Ad list container #my-manageitems-adlist not found. Maybe no ads present?')
LOG.warning('There are currently no ads on your profile!')
return [] return []
n_buttons = len(await self.web_find_all(By.CSS_SELECTOR, 'em', # --- Pagination handling ---
parent = await self.web_find(By.CSS_SELECTOR, 'div:nth-of-type(1)', parent = pagination)))
if n_buttons > 1:
multi_page = True
LOG.info('It looks like you have many ads!')
else:
multi_page = False multi_page = False
LOG.info('It looks like all your ads fit on one overview page.') try:
# Correct selector: Use uppercase '.Pagination'
pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=10) # Increased timeout slightly
# Correct selector: Use 'aria-label'
# Also check if the button is actually present AND potentially enabled (though enabled check isn't strictly necessary here, only for clicking later)
next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
if next_buttons:
# Check if at least one 'Nächste' button is not disabled (optional but good practice)
enabled_next_buttons = [btn for btn in next_buttons if not btn.attrs.get('disabled')]
if enabled_next_buttons:
multi_page = True
LOG.info('Multiple ad pages detected.')
else:
LOG.info('Next button found but is disabled. Assuming single effective page.')
else:
LOG.info('No "Naechste" button found within pagination. Assuming single page.')
except TimeoutError:
# This will now correctly trigger only if the '.Pagination' div itself is not found
LOG.info('No pagination controls found. Assuming single page.')
except Exception as e:
LOG.error("Error during pagination detection: %s", e, exc_info=True)
LOG.info('Assuming single page due to error during pagination check.')
# --- End Pagination Handling ---
refs:list[str] = [] refs:list[str] = []
while True: # loop reference extraction until no more forward page current_page = 1
# extract references while True: # Loop reference extraction
list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox', LOG.info("Extracting ads from page %s...", current_page)
parent = await self.web_find(By.ID, 'my-manageitems-adlist')) # scroll down to load dynamically if necessary
refs += [ await self.web_scroll_page_down()
(await self.web_find(By.CSS_SELECTOR, 'article > section > section:nth-of-type(2) > h3 > div > a', parent = li)).attrs['href'] await self.web_sleep(2000, 3000) # Consider replacing with explicit waits
# Re-find the ad list container on the current page/state
try:
ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox', parent=ad_list_container)
LOG.info("Found %s ad items on page %s.", len(list_items), current_page)
except TimeoutError:
LOG.warning("Could not find ad list container or items on page %s.", current_page)
break # Stop if ads disappear
# Extract references using the CORRECTED selector
try:
page_refs = [
(await self.web_find(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=li)).attrs['href']
for li in list_items for li in list_items
] ]
refs.extend(page_refs)
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), current_page)
except Exception as e:
# Log the error if extraction fails for some items, but try to continue
LOG.error("Error extracting refs on page %s: %s", current_page, e, exc_info=True)
if not multi_page: # only one iteration for single-page overview if not multi_page: # only one iteration for single-page overview
break break
# check if last page
nav_button:Element = (await self.web_find_all(By.CSS_SELECTOR, 'button.jsx-1553636621'))[-1] # --- Navigate to next page ---
if nav_button.attrs['title'] != 'Nächste': try:
LOG.info('Last ad overview page explored.') # Find the pagination section again (scope might have changed after scroll/wait)
pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=5)
# Find the "Next" button using the correct aria-label selector and ensure it's not disabled
next_button_element = None
possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
for btn in possible_next_buttons:
if not btn.attrs.get('disabled'): # Check if the button is enabled
next_button_element = btn
break # Found an enabled next button
if next_button_element:
LOG.info("Navigating to next page...")
await next_button_element.click()
current_page += 1
# Wait for page load - consider waiting for a specific element on the new page instead of fixed sleep
await self.web_sleep(3000, 4000)
else:
LOG.info('Last ad overview page explored (no enabled "Naechste" button found).')
break break
# navigate to next overview page except TimeoutError:
await nav_button.click() # This might happen if pagination disappears on the last page after loading
await self.web_sleep(2000, 3000) LOG.info("No pagination controls found after scrolling/waiting. Assuming last page.")
await self.web_scroll_page_down() break
except Exception as e:
LOG.error("Error during pagination navigation: %s", e, exc_info=True)
break
# --- End Navigation ---
if not refs:
LOG.warning('No ad URLs were extracted.')
return refs return refs

View File

@@ -152,10 +152,23 @@ kleinanzeigen_bot/extract.py:
"The ad ID could not be extracted from the given URL %s": "Die Anzeigen-ID konnte nicht aus der angegebenen URL %s extrahiert werden" "The ad ID could not be extracted from the given URL %s": "Die Anzeigen-ID konnte nicht aus der angegebenen URL %s extrahiert werden"
extract_own_ads_urls: extract_own_ads_urls:
"There are currently no ads on your profile!": "Es gibt derzeit keine Anzeigen in deinem Profil!" "Ad list container #my-manageitems-adlist not found. Maybe no ads present?": "Anzeigenlistencontainer #my-manageitems-adlist nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?"
"It looks like you have many ads!": "Es sieht so aus, als hättest du viele Anzeigen!" "Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt."
"It looks like all your ads fit on one overview page.": "Es sieht so aus, als würden alle deine Anzeigen auf eine Übersichtsseite passen." "Next button found but is disabled. Assuming single effective page.": "Weiter-Button gefunden, aber deaktiviert. Es wird von einer einzelnen effektiven Seite ausgegangen."
"Last ad overview page explored.": "Letzte Übersichtsseite erkundet." "No \"Naechste\" button found within pagination. Assuming single page.": "Kein \"Nächste\"-Button in der Paginierung gefunden. Es wird von einer einzelnen Seite ausgegangen."
"No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen."
"Assuming single page due to error during pagination check.": "Es wird von einer einzelnen Seite ausgegangen wegen eines Fehlers bei der Paginierungsprüfung."
"Navigating to next page...": "Navigiere zur nächsten Seite..."
"Last ad overview page explored (no enabled \"Naechste\" button found).": "Letzte Anzeigenübersichtsseite erkundet (kein aktivierter \"Nächste\"-Button gefunden)."
"No pagination controls found after scrolling/waiting. Assuming last page.": "Keine Paginierungssteuerung nach dem Scrollen/Warten gefunden. Es wird von der letzten Seite ausgegangen."
"No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
"Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden."
"Error during pagination detection: %s": "Fehler bei der Paginierungserkennung: %s"
"Error during pagination navigation: %s": "Fehler bei der Paginierungsnavigation: %s"
"Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
"Extracting ads from page %s...": "Extrahiere Anzeigen von Seite %s..."
"Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
"Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
naviagte_to_ad_page: naviagte_to_ad_page:
"There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID." "There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID."

View File

@@ -261,42 +261,65 @@ class TestAdExtractorNavigation:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_extract_own_ads_urls(self, test_extractor: AdExtractor) -> None: async def test_extract_own_ads_urls(self, test_extractor: AdExtractor) -> None:
"""Test extraction of own ads URLs - basic test.""" """Test extraction of own ads URLs - basic test."""
with patch.object(test_extractor, 'web_open', new_callable = AsyncMock), \ with patch.object(test_extractor, 'web_open', new_callable=AsyncMock), \
patch.object(test_extractor, 'web_sleep', new_callable = AsyncMock), \ patch.object(test_extractor, 'web_sleep', new_callable=AsyncMock), \
patch.object(test_extractor, 'web_find', new_callable = AsyncMock) as mock_web_find, \ patch.object(test_extractor, 'web_find', new_callable=AsyncMock) as mock_web_find, \
patch.object(test_extractor, 'web_find_all', new_callable = AsyncMock) as mock_web_find_all, \ patch.object(test_extractor, 'web_find_all', new_callable=AsyncMock) as mock_web_find_all, \
patch.object(test_extractor, 'web_scroll_page_down', new_callable = AsyncMock), \ patch.object(test_extractor, 'web_scroll_page_down', new_callable=AsyncMock), \
patch.object(test_extractor, 'web_execute', new_callable = AsyncMock): patch.object(test_extractor, 'web_execute', new_callable=AsyncMock):
# Setup mock objects for DOM elements # --- Setup mock objects for DOM elements ---
splitpage = MagicMock() # Mocks needed for the actual execution flow
pagination_section = MagicMock() ad_list_container_mock = MagicMock()
pagination = MagicMock() pagination_section_mock = MagicMock()
pagination_div = MagicMock() cardbox_mock = MagicMock() # Represents the <li> element
ad_list = MagicMock() link_mock = MagicMock() # Represents the <a> element
cardbox = MagicMock() link_mock.attrs = {'href': '/s-anzeige/test/12345'} # Configure the desired output
link = MagicMock()
link.attrs = {'href': '/s-anzeige/test/12345'}
# Setup mock responses for web_find # Mocks for elements potentially checked but maybe not strictly needed for output
# (depending on how robust the mocking is)
# next_button_mock = MagicMock() # If needed for multi_page logic
# --- Setup mock responses for web_find and web_find_all in CORRECT ORDER ---
# 1. Initial find for ad list container (before loop)
# 2. Find for pagination section (pagination check)
# 3. Find for ad list container (inside loop)
# 4. Find for the link (inside list comprehension)
mock_web_find.side_effect = [ mock_web_find.side_effect = [
splitpage, # .l-splitpage ad_list_container_mock, # Call 1: find #my-manageitems-adlist (before loop)
pagination_section, # section:nth-of-type(4) pagination_section_mock, # Call 2: find .Pagination
pagination, # div > div:nth-of-type(2) > div:nth-of-type(2) > div ad_list_container_mock, # Call 3: find #my-manageitems-adlist (inside loop)
pagination_div, # div:nth-of-type(1) link_mock # Call 4: find 'div.manageitems-item-ad h3 a.text-onSurface'
ad_list, # my-manageitems-adlist # Add more mocks here if the pagination navigation logic calls web_find again
link # article > section > section:nth-of-type(2) > h2 > div > a
] ]
# Setup mock responses for web_find_all # 1. Find all 'Nächste' buttons (pagination check) - Return empty list for single page test case
# 2. Find all '.cardbox' elements (inside loop)
mock_web_find_all.side_effect = [ mock_web_find_all.side_effect = [
[MagicMock()], # buttons in pagination [], # Call 1: find 'button[aria-label="Nächste"]' -> No next button = single page
[cardbox] # cardbox elements [cardbox_mock] # Call 2: find .cardbox -> One ad item
# Add more mocks here if pagination navigation calls web_find_all
] ]
# Execute test and verify results # --- Execute test and verify results ---
refs = await test_extractor.extract_own_ads_urls() refs = await test_extractor.extract_own_ads_urls()
assert refs == ['/s-anzeige/test/12345']
# --- Assertions ---
assert refs == ['/s-anzeige/test/12345'] # Now it should match
# Optional: Verify calls were made as expected
mock_web_find.assert_has_calls([
call(By.ID, 'my-manageitems-adlist'),
call(By.CSS_SELECTOR, '.Pagination', timeout=10),
call(By.ID, 'my-manageitems-adlist'),
call(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=cardbox_mock),
], any_order=False) # Check order if important
mock_web_find_all.assert_has_calls([
call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section_mock),
call(By.CLASS_NAME, 'cardbox', parent=ad_list_container_mock),
], any_order=False)
class TestAdExtractorContent: class TestAdExtractorContent: