fix: Correct pagination selectors and logic for issue #477 (#479)

This commit is contained in:
marvinkcode
2025-04-21 20:26:02 +02:00
committed by GitHub
parent c144801d2e
commit 79af6ba861
4 changed files with 161 additions and 71 deletions

View File

@@ -157,7 +157,8 @@ class KleinanzeigenBot(WebScrapingMixin):
* new: Veröffentlicht nur neue Anzeigen (d.h. Anzeigen ohne ID in der Konfigurationsdatei) * new: Veröffentlicht nur neue Anzeigen (d.h. Anzeigen ohne ID in der Konfigurationsdatei)
* changed: Veröffentlicht nur Anzeigen, die seit der letzten Veröffentlichung geändert wurden * changed: Veröffentlicht nur Anzeigen, die seit der letzten Veröffentlichung geändert wurden
* <id(s)>: Gibt eine oder mehrere Anzeigen-IDs an, die veröffentlicht werden sollen, z. B. "--ads=1,2,3", ignoriert republication_interval * <id(s)>: Gibt eine oder mehrere Anzeigen-IDs an, die veröffentlicht werden sollen, z. B. "--ads=1,2,3", ignoriert republication_interval
* Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch fällige Anzeigen zu veröffentlichen * Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch
fällige Anzeigen zu veröffentlichen
--ads=all|new|<id(s)> (download) - Gibt an, welche Anzeigen heruntergeladen werden sollen (STANDARD: new) --ads=all|new|<id(s)> (download) - Gibt an, welche Anzeigen heruntergeladen werden sollen (STANDARD: new)
Mögliche Werte: Mögliche Werte:
* all: Lädt alle Anzeigen aus Ihrem Profil herunter * all: Lädt alle Anzeigen aus Ihrem Profil herunter
@@ -401,7 +402,8 @@ class KleinanzeigenBot(WebScrapingMixin):
ensure(dicts.safe_get(ad_cfg, *path.split(".")) in allowed, f"-> property [{path}] must be one of: {allowed} @ [{ad_file}]") ensure(dicts.safe_get(ad_cfg, *path.split(".")) in allowed, f"-> property [{path}] must be one of: {allowed} @ [{ad_file}]")
def assert_min_len(path:str, minlen:int) -> None: def assert_min_len(path:str, minlen:int) -> None:
ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen, f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]") ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen,
f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]")
def assert_has_value(path:str) -> None: def assert_has_value(path:str) -> None:
ensure(dicts.safe_get(ad_cfg, *path.split(".")), f"-> property [{path}] not specified @ [{ad_file}]") ensure(dicts.safe_get(ad_cfg, *path.split(".")), f"-> property [{path}] not specified @ [{ad_file}]")

View File

@@ -135,54 +135,106 @@ class AdExtractor(WebScrapingMixin):
""" """
# navigate to "your ads" page # navigate to "your ads" page
await self.web_open('https://www.kleinanzeigen.de/m-meine-anzeigen.html') await self.web_open('https://www.kleinanzeigen.de/m-meine-anzeigen.html')
await self.web_sleep(2000, 3000) await self.web_sleep(2000, 3000) # Consider replacing with explicit waits later
# collect ad references: # Try to find the main ad list container first
pagination_section = await self.web_find(By.CSS_SELECTOR, 'section:nth-of-type(4)',
parent = await self.web_find(By.CSS_SELECTOR, '.l-splitpage'))
# scroll down to load dynamically
await self.web_scroll_page_down()
await self.web_sleep(2000, 3000)
# detect multi-page
try: try:
pagination = await self.web_find(By.CSS_SELECTOR, 'div > div:nth-of-type(2) > div:nth-of-type(2) > div', ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
parent = pagination_section) except TimeoutError:
except TimeoutError: # 0 ads - no pagination area LOG.warning('Ad list container #my-manageitems-adlist not found. Maybe no ads present?')
LOG.warning('There are currently no ads on your profile!')
return [] return []
n_buttons = len(await self.web_find_all(By.CSS_SELECTOR, 'em', # --- Pagination handling ---
parent = await self.web_find(By.CSS_SELECTOR, 'div:nth-of-type(1)', parent = pagination))) multi_page = False
if n_buttons > 1: try:
multi_page = True # Correct selector: Use uppercase '.Pagination'
LOG.info('It looks like you have many ads!') pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=10) # Increased timeout slightly
else: # Correct selector: Use 'aria-label'
multi_page = False # Also check if the button is actually present AND potentially enabled (though enabled check isn't strictly necessary here, only for clicking later)
LOG.info('It looks like all your ads fit on one overview page.') next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
if next_buttons:
# Check if at least one 'Nächste' button is not disabled (optional but good practice)
enabled_next_buttons = [btn for btn in next_buttons if not btn.attrs.get('disabled')]
if enabled_next_buttons:
multi_page = True
LOG.info('Multiple ad pages detected.')
else:
LOG.info('Next button found but is disabled. Assuming single effective page.')
else:
LOG.info('No "Naechste" button found within pagination. Assuming single page.')
except TimeoutError:
# This will now correctly trigger only if the '.Pagination' div itself is not found
LOG.info('No pagination controls found. Assuming single page.')
except Exception as e:
LOG.error("Error during pagination detection: %s", e, exc_info=True)
LOG.info('Assuming single page due to error during pagination check.')
# --- End Pagination Handling ---
refs:list[str] = [] refs:list[str] = []
while True: # loop reference extraction until no more forward page current_page = 1
# extract references while True: # Loop reference extraction
list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox', LOG.info("Extracting ads from page %s...", current_page)
parent = await self.web_find(By.ID, 'my-manageitems-adlist')) # scroll down to load dynamically if necessary
refs += [ await self.web_scroll_page_down()
(await self.web_find(By.CSS_SELECTOR, 'article > section > section:nth-of-type(2) > h3 > div > a', parent = li)).attrs['href'] await self.web_sleep(2000, 3000) # Consider replacing with explicit waits
for li in list_items
] # Re-find the ad list container on the current page/state
try:
ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox', parent=ad_list_container)
LOG.info("Found %s ad items on page %s.", len(list_items), current_page)
except TimeoutError:
LOG.warning("Could not find ad list container or items on page %s.", current_page)
break # Stop if ads disappear
# Extract references using the CORRECTED selector
try:
page_refs = [
(await self.web_find(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=li)).attrs['href']
for li in list_items
]
refs.extend(page_refs)
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), current_page)
except Exception as e:
# Log the error if extraction fails for some items, but try to continue
LOG.error("Error extracting refs on page %s: %s", current_page, e, exc_info=True)
if not multi_page: # only one iteration for single-page overview if not multi_page: # only one iteration for single-page overview
break break
# check if last page
nav_button:Element = (await self.web_find_all(By.CSS_SELECTOR, 'button.jsx-1553636621'))[-1] # --- Navigate to next page ---
if nav_button.attrs['title'] != 'Nächste': try:
LOG.info('Last ad overview page explored.') # Find the pagination section again (scope might have changed after scroll/wait)
pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=5)
# Find the "Next" button using the correct aria-label selector and ensure it's not disabled
next_button_element = None
possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
for btn in possible_next_buttons:
if not btn.attrs.get('disabled'): # Check if the button is enabled
next_button_element = btn
break # Found an enabled next button
if next_button_element:
LOG.info("Navigating to next page...")
await next_button_element.click()
current_page += 1
# Wait for page load - consider waiting for a specific element on the new page instead of fixed sleep
await self.web_sleep(3000, 4000)
else:
LOG.info('Last ad overview page explored (no enabled "Naechste" button found).')
break
except TimeoutError:
# This might happen if pagination disappears on the last page after loading
LOG.info("No pagination controls found after scrolling/waiting. Assuming last page.")
break break
# navigate to next overview page except Exception as e:
await nav_button.click() LOG.error("Error during pagination navigation: %s", e, exc_info=True)
await self.web_sleep(2000, 3000) break
await self.web_scroll_page_down() # --- End Navigation ---
if not refs:
LOG.warning('No ad URLs were extracted.')
return refs return refs

View File

@@ -152,10 +152,23 @@ kleinanzeigen_bot/extract.py:
"The ad ID could not be extracted from the given URL %s": "Die Anzeigen-ID konnte nicht aus der angegebenen URL %s extrahiert werden" "The ad ID could not be extracted from the given URL %s": "Die Anzeigen-ID konnte nicht aus der angegebenen URL %s extrahiert werden"
extract_own_ads_urls: extract_own_ads_urls:
"There are currently no ads on your profile!": "Es gibt derzeit keine Anzeigen in deinem Profil!" "Ad list container #my-manageitems-adlist not found. Maybe no ads present?": "Anzeigenlistencontainer #my-manageitems-adlist nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?"
"It looks like you have many ads!": "Es sieht so aus, als hättest du viele Anzeigen!" "Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt."
"It looks like all your ads fit on one overview page.": "Es sieht so aus, als würden alle deine Anzeigen auf eine Übersichtsseite passen." "Next button found but is disabled. Assuming single effective page.": "Weiter-Button gefunden, aber deaktiviert. Es wird von einer einzelnen effektiven Seite ausgegangen."
"Last ad overview page explored.": "Letzte Übersichtsseite erkundet." "No \"Naechste\" button found within pagination. Assuming single page.": "Kein \"Nächste\"-Button in der Paginierung gefunden. Es wird von einer einzelnen Seite ausgegangen."
"No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen."
"Assuming single page due to error during pagination check.": "Es wird von einer einzelnen Seite ausgegangen wegen eines Fehlers bei der Paginierungsprüfung."
"Navigating to next page...": "Navigiere zur nächsten Seite..."
"Last ad overview page explored (no enabled \"Naechste\" button found).": "Letzte Anzeigenübersichtsseite erkundet (kein aktivierter \"Nächste\"-Button gefunden)."
"No pagination controls found after scrolling/waiting. Assuming last page.": "Keine Paginierungssteuerung nach dem Scrollen/Warten gefunden. Es wird von der letzten Seite ausgegangen."
"No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
"Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden."
"Error during pagination detection: %s": "Fehler bei der Paginierungserkennung: %s"
"Error during pagination navigation: %s": "Fehler bei der Paginierungsnavigation: %s"
"Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
"Extracting ads from page %s...": "Extrahiere Anzeigen von Seite %s..."
"Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
"Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
naviagte_to_ad_page: naviagte_to_ad_page:
"There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID." "There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID."

View File

@@ -261,42 +261,65 @@ class TestAdExtractorNavigation:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_extract_own_ads_urls(self, test_extractor: AdExtractor) -> None: async def test_extract_own_ads_urls(self, test_extractor: AdExtractor) -> None:
"""Test extraction of own ads URLs - basic test.""" """Test extraction of own ads URLs - basic test."""
with patch.object(test_extractor, 'web_open', new_callable = AsyncMock), \ with patch.object(test_extractor, 'web_open', new_callable=AsyncMock), \
patch.object(test_extractor, 'web_sleep', new_callable = AsyncMock), \ patch.object(test_extractor, 'web_sleep', new_callable=AsyncMock), \
patch.object(test_extractor, 'web_find', new_callable = AsyncMock) as mock_web_find, \ patch.object(test_extractor, 'web_find', new_callable=AsyncMock) as mock_web_find, \
patch.object(test_extractor, 'web_find_all', new_callable = AsyncMock) as mock_web_find_all, \ patch.object(test_extractor, 'web_find_all', new_callable=AsyncMock) as mock_web_find_all, \
patch.object(test_extractor, 'web_scroll_page_down', new_callable = AsyncMock), \ patch.object(test_extractor, 'web_scroll_page_down', new_callable=AsyncMock), \
patch.object(test_extractor, 'web_execute', new_callable = AsyncMock): patch.object(test_extractor, 'web_execute', new_callable=AsyncMock):
# Setup mock objects for DOM elements # --- Setup mock objects for DOM elements ---
splitpage = MagicMock() # Mocks needed for the actual execution flow
pagination_section = MagicMock() ad_list_container_mock = MagicMock()
pagination = MagicMock() pagination_section_mock = MagicMock()
pagination_div = MagicMock() cardbox_mock = MagicMock() # Represents the <li> element
ad_list = MagicMock() link_mock = MagicMock() # Represents the <a> element
cardbox = MagicMock() link_mock.attrs = {'href': '/s-anzeige/test/12345'} # Configure the desired output
link = MagicMock()
link.attrs = {'href': '/s-anzeige/test/12345'}
# Setup mock responses for web_find # Mocks for elements potentially checked but maybe not strictly needed for output
# (depending on how robust the mocking is)
# next_button_mock = MagicMock() # If needed for multi_page logic
# --- Setup mock responses for web_find and web_find_all in CORRECT ORDER ---
# 1. Initial find for ad list container (before loop)
# 2. Find for pagination section (pagination check)
# 3. Find for ad list container (inside loop)
# 4. Find for the link (inside list comprehension)
mock_web_find.side_effect = [ mock_web_find.side_effect = [
splitpage, # .l-splitpage ad_list_container_mock, # Call 1: find #my-manageitems-adlist (before loop)
pagination_section, # section:nth-of-type(4) pagination_section_mock, # Call 2: find .Pagination
pagination, # div > div:nth-of-type(2) > div:nth-of-type(2) > div ad_list_container_mock, # Call 3: find #my-manageitems-adlist (inside loop)
pagination_div, # div:nth-of-type(1) link_mock # Call 4: find 'div.manageitems-item-ad h3 a.text-onSurface'
ad_list, # my-manageitems-adlist # Add more mocks here if the pagination navigation logic calls web_find again
link # article > section > section:nth-of-type(2) > h2 > div > a
] ]
# Setup mock responses for web_find_all # 1. Find all 'Nächste' buttons (pagination check) - Return empty list for single page test case
# 2. Find all '.cardbox' elements (inside loop)
mock_web_find_all.side_effect = [ mock_web_find_all.side_effect = [
[MagicMock()], # buttons in pagination [], # Call 1: find 'button[aria-label="Nächste"]' -> No next button = single page
[cardbox] # cardbox elements [cardbox_mock] # Call 2: find .cardbox -> One ad item
# Add more mocks here if pagination navigation calls web_find_all
] ]
# Execute test and verify results # --- Execute test and verify results ---
refs = await test_extractor.extract_own_ads_urls() refs = await test_extractor.extract_own_ads_urls()
assert refs == ['/s-anzeige/test/12345']
# --- Assertions ---
assert refs == ['/s-anzeige/test/12345'] # Now it should match
# Optional: Verify calls were made as expected
mock_web_find.assert_has_calls([
call(By.ID, 'my-manageitems-adlist'),
call(By.CSS_SELECTOR, '.Pagination', timeout=10),
call(By.ID, 'my-manageitems-adlist'),
call(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=cardbox_mock),
], any_order=False) # Check order if important
mock_web_find_all.assert_has_calls([
call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section_mock),
call(By.CLASS_NAME, 'cardbox', parent=ad_list_container_mock),
], any_order=False)
class TestAdExtractorContent: class TestAdExtractorContent: