mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 10:31:50 +01:00
@@ -157,7 +157,8 @@ class KleinanzeigenBot(WebScrapingMixin):
|
|||||||
* new: Veröffentlicht nur neue Anzeigen (d.h. Anzeigen ohne ID in der Konfigurationsdatei)
|
* new: Veröffentlicht nur neue Anzeigen (d.h. Anzeigen ohne ID in der Konfigurationsdatei)
|
||||||
* changed: Veröffentlicht nur Anzeigen, die seit der letzten Veröffentlichung geändert wurden
|
* changed: Veröffentlicht nur Anzeigen, die seit der letzten Veröffentlichung geändert wurden
|
||||||
* <id(s)>: Gibt eine oder mehrere Anzeigen-IDs an, die veröffentlicht werden sollen, z. B. "--ads=1,2,3", ignoriert republication_interval
|
* <id(s)>: Gibt eine oder mehrere Anzeigen-IDs an, die veröffentlicht werden sollen, z. B. "--ads=1,2,3", ignoriert republication_interval
|
||||||
* Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch fällige Anzeigen zu veröffentlichen
|
* Kombinationen: Sie können mehrere Selektoren mit Kommas kombinieren, z. B. "--ads=changed,due" um sowohl geänderte als auch
|
||||||
|
fällige Anzeigen zu veröffentlichen
|
||||||
--ads=all|new|<id(s)> (download) - Gibt an, welche Anzeigen heruntergeladen werden sollen (STANDARD: new)
|
--ads=all|new|<id(s)> (download) - Gibt an, welche Anzeigen heruntergeladen werden sollen (STANDARD: new)
|
||||||
Mögliche Werte:
|
Mögliche Werte:
|
||||||
* all: Lädt alle Anzeigen aus Ihrem Profil herunter
|
* all: Lädt alle Anzeigen aus Ihrem Profil herunter
|
||||||
@@ -401,7 +402,8 @@ class KleinanzeigenBot(WebScrapingMixin):
|
|||||||
ensure(dicts.safe_get(ad_cfg, *path.split(".")) in allowed, f"-> property [{path}] must be one of: {allowed} @ [{ad_file}]")
|
ensure(dicts.safe_get(ad_cfg, *path.split(".")) in allowed, f"-> property [{path}] must be one of: {allowed} @ [{ad_file}]")
|
||||||
|
|
||||||
def assert_min_len(path:str, minlen:int) -> None:
|
def assert_min_len(path:str, minlen:int) -> None:
|
||||||
ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen, f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]")
|
ensure(len(dicts.safe_get(ad_cfg, *path.split("."))) >= minlen,
|
||||||
|
f"-> property [{path}] must be at least {minlen} characters long @ [{ad_file}]")
|
||||||
|
|
||||||
def assert_has_value(path:str) -> None:
|
def assert_has_value(path:str) -> None:
|
||||||
ensure(dicts.safe_get(ad_cfg, *path.split(".")), f"-> property [{path}] not specified @ [{ad_file}]")
|
ensure(dicts.safe_get(ad_cfg, *path.split(".")), f"-> property [{path}] not specified @ [{ad_file}]")
|
||||||
|
|||||||
@@ -135,54 +135,106 @@ class AdExtractor(WebScrapingMixin):
|
|||||||
"""
|
"""
|
||||||
# navigate to "your ads" page
|
# navigate to "your ads" page
|
||||||
await self.web_open('https://www.kleinanzeigen.de/m-meine-anzeigen.html')
|
await self.web_open('https://www.kleinanzeigen.de/m-meine-anzeigen.html')
|
||||||
await self.web_sleep(2000, 3000)
|
await self.web_sleep(2000, 3000) # Consider replacing with explicit waits later
|
||||||
|
|
||||||
# collect ad references:
|
# Try to find the main ad list container first
|
||||||
pagination_section = await self.web_find(By.CSS_SELECTOR, 'section:nth-of-type(4)',
|
|
||||||
parent = await self.web_find(By.CSS_SELECTOR, '.l-splitpage'))
|
|
||||||
|
|
||||||
# scroll down to load dynamically
|
|
||||||
await self.web_scroll_page_down()
|
|
||||||
await self.web_sleep(2000, 3000)
|
|
||||||
|
|
||||||
# detect multi-page
|
|
||||||
try:
|
try:
|
||||||
pagination = await self.web_find(By.CSS_SELECTOR, 'div > div:nth-of-type(2) > div:nth-of-type(2) > div',
|
ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
|
||||||
parent = pagination_section)
|
except TimeoutError:
|
||||||
except TimeoutError: # 0 ads - no pagination area
|
LOG.warning('Ad list container #my-manageitems-adlist not found. Maybe no ads present?')
|
||||||
LOG.warning('There are currently no ads on your profile!')
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
n_buttons = len(await self.web_find_all(By.CSS_SELECTOR, 'em',
|
# --- Pagination handling ---
|
||||||
parent = await self.web_find(By.CSS_SELECTOR, 'div:nth-of-type(1)', parent = pagination)))
|
multi_page = False
|
||||||
if n_buttons > 1:
|
try:
|
||||||
multi_page = True
|
# The pagination container's class name is capitalized: '.Pagination'
|
||||||
LOG.info('It looks like you have many ads!')
|
pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=10) # Increased timeout slightly
|
||||||
else:
|
# Correct selector: Use 'aria-label'
|
||||||
multi_page = False
|
# Treat the overview as multi-page only when a "Nächste" button exists and is not disabled
|
||||||
LOG.info('It looks like all your ads fit on one overview page.')
|
next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
|
||||||
|
if next_buttons:
|
||||||
|
# A disabled 'Nächste' button means there is no further page, so only enabled buttons count
|
||||||
|
enabled_next_buttons = [btn for btn in next_buttons if not btn.attrs.get('disabled')]
|
||||||
|
if enabled_next_buttons:
|
||||||
|
multi_page = True
|
||||||
|
LOG.info('Multiple ad pages detected.')
|
||||||
|
else:
|
||||||
|
LOG.info('Next button found but is disabled. Assuming single effective page.')
|
||||||
|
|
||||||
|
else:
|
||||||
|
LOG.info('No "Naechste" button found within pagination. Assuming single page.')
|
||||||
|
except TimeoutError:
|
||||||
|
# This will now correctly trigger only if the '.Pagination' div itself is not found
|
||||||
|
LOG.info('No pagination controls found. Assuming single page.')
|
||||||
|
except Exception as e:
|
||||||
|
LOG.error("Error during pagination detection: %s", e, exc_info=True)
|
||||||
|
LOG.info('Assuming single page due to error during pagination check.')
|
||||||
|
# --- End Pagination Handling ---
|
||||||
|
|
||||||
refs:list[str] = []
|
refs:list[str] = []
|
||||||
while True: # loop reference extraction until no more forward page
|
current_page = 1
|
||||||
# extract references
|
while True: # Loop reference extraction
|
||||||
list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox',
|
LOG.info("Extracting ads from page %s...", current_page)
|
||||||
parent = await self.web_find(By.ID, 'my-manageitems-adlist'))
|
# scroll down to load dynamically if necessary
|
||||||
refs += [
|
await self.web_scroll_page_down()
|
||||||
(await self.web_find(By.CSS_SELECTOR, 'article > section > section:nth-of-type(2) > h3 > div > a', parent = li)).attrs['href']
|
await self.web_sleep(2000, 3000) # Consider replacing with explicit waits
|
||||||
for li in list_items
|
|
||||||
]
|
# Re-find the ad list container on the current page/state
|
||||||
|
try:
|
||||||
|
ad_list_container = await self.web_find(By.ID, 'my-manageitems-adlist')
|
||||||
|
list_items = await self.web_find_all(By.CLASS_NAME, 'cardbox', parent=ad_list_container)
|
||||||
|
LOG.info("Found %s ad items on page %s.", len(list_items), current_page)
|
||||||
|
except TimeoutError:
|
||||||
|
LOG.warning("Could not find ad list container or items on page %s.", current_page)
|
||||||
|
break # Stop if ads disappear
|
||||||
|
|
||||||
|
# Extract references using the CORRECTED selector
|
||||||
|
try:
|
||||||
|
page_refs = [
|
||||||
|
(await self.web_find(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=li)).attrs['href']
|
||||||
|
for li in list_items
|
||||||
|
]
|
||||||
|
refs.extend(page_refs)
|
||||||
|
LOG.info("Successfully extracted %s refs from page %s.", len(page_refs), current_page)
|
||||||
|
except Exception as e:
|
||||||
|
# A failure on any single item aborts the whole comprehension, so no refs from this page are added; log it and carry on
|
||||||
|
LOG.error("Error extracting refs on page %s: %s", current_page, e, exc_info=True)
|
||||||
|
|
||||||
if not multi_page: # only one iteration for single-page overview
|
if not multi_page: # only one iteration for single-page overview
|
||||||
break
|
break
|
||||||
# check if last page
|
|
||||||
nav_button:Element = (await self.web_find_all(By.CSS_SELECTOR, 'button.jsx-1553636621'))[-1]
|
# --- Navigate to next page ---
|
||||||
if nav_button.attrs['title'] != 'Nächste':
|
try:
|
||||||
LOG.info('Last ad overview page explored.')
|
# Find the pagination section again (scope might have changed after scroll/wait)
|
||||||
|
pagination_section = await self.web_find(By.CSS_SELECTOR, '.Pagination', timeout=5)
|
||||||
|
# Find the "Next" button using the correct aria-label selector and ensure it's not disabled
|
||||||
|
next_button_element = None
|
||||||
|
possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section)
|
||||||
|
for btn in possible_next_buttons:
|
||||||
|
if not btn.attrs.get('disabled'): # Check if the button is enabled
|
||||||
|
next_button_element = btn
|
||||||
|
break # Found an enabled next button
|
||||||
|
|
||||||
|
if next_button_element:
|
||||||
|
LOG.info("Navigating to next page...")
|
||||||
|
await next_button_element.click()
|
||||||
|
current_page += 1
|
||||||
|
# Wait for page load - consider waiting for a specific element on the new page instead of fixed sleep
|
||||||
|
await self.web_sleep(3000, 4000)
|
||||||
|
else:
|
||||||
|
LOG.info('Last ad overview page explored (no enabled "Naechste" button found).')
|
||||||
|
break
|
||||||
|
except TimeoutError:
|
||||||
|
# This might happen if pagination disappears on the last page after loading
|
||||||
|
LOG.info("No pagination controls found after scrolling/waiting. Assuming last page.")
|
||||||
break
|
break
|
||||||
# navigate to next overview page
|
except Exception as e:
|
||||||
await nav_button.click()
|
LOG.error("Error during pagination navigation: %s", e, exc_info=True)
|
||||||
await self.web_sleep(2000, 3000)
|
break
|
||||||
await self.web_scroll_page_down()
|
# --- End Navigation ---
|
||||||
|
|
||||||
|
if not refs:
|
||||||
|
LOG.warning('No ad URLs were extracted.')
|
||||||
|
|
||||||
return refs
|
return refs
|
||||||
|
|
||||||
|
|||||||
@@ -152,10 +152,23 @@ kleinanzeigen_bot/extract.py:
|
|||||||
"The ad ID could not be extracted from the given URL %s": "Die Anzeigen-ID konnte nicht aus der angegebenen URL %s extrahiert werden"
|
"The ad ID could not be extracted from the given URL %s": "Die Anzeigen-ID konnte nicht aus der angegebenen URL %s extrahiert werden"
|
||||||
|
|
||||||
extract_own_ads_urls:
|
extract_own_ads_urls:
|
||||||
"There are currently no ads on your profile!": "Es gibt derzeit keine Anzeigen in deinem Profil!"
|
"Ad list container #my-manageitems-adlist not found. Maybe no ads present?": "Anzeigenlistencontainer #my-manageitems-adlist nicht gefunden. Vielleicht sind keine Anzeigen vorhanden?"
|
||||||
"It looks like you have many ads!": "Es sieht so aus, als hättest du viele Anzeigen!"
|
"Multiple ad pages detected.": "Mehrere Anzeigenseiten erkannt."
|
||||||
"It looks like all your ads fit on one overview page.": "Es sieht so aus, als würden alle deine Anzeigen auf eine Übersichtsseite passen."
|
"Next button found but is disabled. Assuming single effective page.": "Weiter-Button gefunden, aber deaktiviert. Es wird von einer einzelnen effektiven Seite ausgegangen."
|
||||||
"Last ad overview page explored.": "Letzte Übersichtsseite erkundet."
|
"No \"Naechste\" button found within pagination. Assuming single page.": "Kein \"Nächste\"-Button in der Paginierung gefunden. Es wird von einer einzelnen Seite ausgegangen."
|
||||||
|
"No pagination controls found. Assuming single page.": "Keine Paginierungssteuerung gefunden. Es wird von einer einzelnen Seite ausgegangen."
|
||||||
|
"Assuming single page due to error during pagination check.": "Es wird von einer einzelnen Seite ausgegangen wegen eines Fehlers bei der Paginierungsprüfung."
|
||||||
|
"Navigating to next page...": "Navigiere zur nächsten Seite..."
|
||||||
|
"Last ad overview page explored (no enabled \"Naechste\" button found).": "Letzte Anzeigenübersichtsseite erkundet (kein aktivierter \"Nächste\"-Button gefunden)."
|
||||||
|
"No pagination controls found after scrolling/waiting. Assuming last page.": "Keine Paginierungssteuerung nach dem Scrollen/Warten gefunden. Es wird von der letzten Seite ausgegangen."
|
||||||
|
"No ad URLs were extracted.": "Es wurden keine Anzeigen-URLs extrahiert."
|
||||||
|
"Could not find ad list container or items on page %s.": "Anzeigenlistencontainer oder Elemente auf Seite %s nicht gefunden."
|
||||||
|
"Error during pagination detection: %s": "Fehler bei der Paginierungserkennung: %s"
|
||||||
|
"Error during pagination navigation: %s": "Fehler bei der Paginierungsnavigation: %s"
|
||||||
|
"Error extracting refs on page %s: %s": "Fehler beim Extrahieren der Referenzen auf Seite %s: %s"
|
||||||
|
"Extracting ads from page %s...": "Extrahiere Anzeigen von Seite %s..."
|
||||||
|
"Found %s ad items on page %s.": "%s Anzeigen-Elemente auf Seite %s gefunden."
|
||||||
|
"Successfully extracted %s refs from page %s.": "%s Referenzen von Seite %s erfolgreich extrahiert."
|
||||||
|
|
||||||
naviagte_to_ad_page:
|
naviagte_to_ad_page:
|
||||||
"There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID."
|
"There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID."
|
||||||
|
|||||||
@@ -261,42 +261,65 @@ class TestAdExtractorNavigation:
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_extract_own_ads_urls(self, test_extractor: AdExtractor) -> None:
|
async def test_extract_own_ads_urls(self, test_extractor: AdExtractor) -> None:
|
||||||
"""Test extraction of own ads URLs - basic test."""
|
"""Test extraction of own ads URLs - basic test."""
|
||||||
with patch.object(test_extractor, 'web_open', new_callable = AsyncMock), \
|
with patch.object(test_extractor, 'web_open', new_callable=AsyncMock), \
|
||||||
patch.object(test_extractor, 'web_sleep', new_callable = AsyncMock), \
|
patch.object(test_extractor, 'web_sleep', new_callable=AsyncMock), \
|
||||||
patch.object(test_extractor, 'web_find', new_callable = AsyncMock) as mock_web_find, \
|
patch.object(test_extractor, 'web_find', new_callable=AsyncMock) as mock_web_find, \
|
||||||
patch.object(test_extractor, 'web_find_all', new_callable = AsyncMock) as mock_web_find_all, \
|
patch.object(test_extractor, 'web_find_all', new_callable=AsyncMock) as mock_web_find_all, \
|
||||||
patch.object(test_extractor, 'web_scroll_page_down', new_callable = AsyncMock), \
|
patch.object(test_extractor, 'web_scroll_page_down', new_callable=AsyncMock), \
|
||||||
patch.object(test_extractor, 'web_execute', new_callable = AsyncMock):
|
patch.object(test_extractor, 'web_execute', new_callable=AsyncMock):
|
||||||
|
|
||||||
# Setup mock objects for DOM elements
|
# --- Setup mock objects for DOM elements ---
|
||||||
splitpage = MagicMock()
|
# Mocks needed for the actual execution flow
|
||||||
pagination_section = MagicMock()
|
ad_list_container_mock = MagicMock()
|
||||||
pagination = MagicMock()
|
pagination_section_mock = MagicMock()
|
||||||
pagination_div = MagicMock()
|
cardbox_mock = MagicMock() # Represents the <li> element
|
||||||
ad_list = MagicMock()
|
link_mock = MagicMock() # Represents the <a> element
|
||||||
cardbox = MagicMock()
|
link_mock.attrs = {'href': '/s-anzeige/test/12345'} # Configure the desired output
|
||||||
link = MagicMock()
|
|
||||||
link.attrs = {'href': '/s-anzeige/test/12345'}
|
|
||||||
|
|
||||||
# Setup mock responses for web_find
|
# Mocks for elements potentially checked but maybe not strictly needed for output
|
||||||
|
# (depending on how robust the mocking is)
|
||||||
|
# next_button_mock = MagicMock() # If needed for multi_page logic
|
||||||
|
|
||||||
|
# --- Setup mock responses for web_find and web_find_all in CORRECT ORDER ---
|
||||||
|
|
||||||
|
# 1. Initial find for ad list container (before loop)
|
||||||
|
# 2. Find for pagination section (pagination check)
|
||||||
|
# 3. Find for ad list container (inside loop)
|
||||||
|
# 4. Find for the link (inside list comprehension)
|
||||||
mock_web_find.side_effect = [
|
mock_web_find.side_effect = [
|
||||||
splitpage, # .l-splitpage
|
ad_list_container_mock, # Call 1: find #my-manageitems-adlist (before loop)
|
||||||
pagination_section, # section:nth-of-type(4)
|
pagination_section_mock, # Call 2: find .Pagination
|
||||||
pagination, # div > div:nth-of-type(2) > div:nth-of-type(2) > div
|
ad_list_container_mock, # Call 3: find #my-manageitems-adlist (inside loop)
|
||||||
pagination_div, # div:nth-of-type(1)
|
link_mock # Call 4: find 'div.manageitems-item-ad h3 a.text-onSurface'
|
||||||
ad_list, # my-manageitems-adlist
|
# Add more mocks here if the pagination navigation logic calls web_find again
|
||||||
link # article > section > section:nth-of-type(2) > h2 > div > a
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Setup mock responses for web_find_all
|
# 1. Find all 'Nächste' buttons (pagination check) - Return empty list for single page test case
|
||||||
|
# 2. Find all '.cardbox' elements (inside loop)
|
||||||
mock_web_find_all.side_effect = [
|
mock_web_find_all.side_effect = [
|
||||||
[MagicMock()], # buttons in pagination
|
[], # Call 1: find 'button[aria-label="Nächste"]' -> No next button = single page
|
||||||
[cardbox] # cardbox elements
|
[cardbox_mock] # Call 2: find .cardbox -> One ad item
|
||||||
|
# Add more mocks here if pagination navigation calls web_find_all
|
||||||
]
|
]
|
||||||
|
|
||||||
# Execute test and verify results
|
# --- Execute test and verify results ---
|
||||||
refs = await test_extractor.extract_own_ads_urls()
|
refs = await test_extractor.extract_own_ads_urls()
|
||||||
assert refs == ['/s-anzeige/test/12345']
|
|
||||||
|
# --- Assertions ---
|
||||||
|
assert refs == ['/s-anzeige/test/12345'] # Now it should match
|
||||||
|
|
||||||
|
# Optional: Verify calls were made as expected
|
||||||
|
mock_web_find.assert_has_calls([
|
||||||
|
call(By.ID, 'my-manageitems-adlist'),
|
||||||
|
call(By.CSS_SELECTOR, '.Pagination', timeout=10),
|
||||||
|
call(By.ID, 'my-manageitems-adlist'),
|
||||||
|
call(By.CSS_SELECTOR, 'div.manageitems-item-ad h3 a.text-onSurface', parent=cardbox_mock),
|
||||||
|
], any_order=False) # Check order if important
|
||||||
|
|
||||||
|
mock_web_find_all.assert_has_calls([
|
||||||
|
call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent=pagination_section_mock),
|
||||||
|
call(By.CLASS_NAME, 'cardbox', parent=ad_list_container_mock),
|
||||||
|
], any_order=False)
|
||||||
|
|
||||||
|
|
||||||
class TestAdExtractorContent:
|
class TestAdExtractorContent:
|
||||||
|
|||||||
Reference in New Issue
Block a user