Files
kleinanzeigen-bot/tests/unit/test_extract.py
Jens a3ac27c441 feat: add configurable timeouts (#673)
## ℹ️ Description
- Related issues: #671, #658
- Introduces configurable timeout controls plus retry/backoff handling
for flaky DOM operations.

We often see timeouts which are not reproducible in certain
configurations. I suspect the timeout issues are caused by a combination of
internet speed, browser, OS, age of the computer and the weather.

This PR introduces a comprehensive config model to tweak timeouts.

## 📋 Changes Summary
- add TimeoutConfig to the main config/schema and expose timeouts in
README/docs
- wire WebScrapingMixin, extractor, update checker, and browser
diagnostics to honor the configurable timeouts and retries
- update translations/tests to cover the new behaviour and ensure
lint/mypy/pyright pipelines remain green

### ⚙️ Type of Change
- [ ] 🐞 Bug fix (non-breaking change which fixes an issue)
- [x] ✨ New feature (adds new functionality without breaking existing
usage)
- [ ] 💥 Breaking change (changes that might break existing user setups,
scripts, or configurations)

## ✅ Checklist
- [x] I have reviewed my changes to ensure they meet the project's
standards.
- [x] I have tested my changes and ensured that all tests pass (`pdm run
test`).
- [x] I have formatted the code (`pdm run format`).
- [x] I have verified that linting passes (`pdm run lint`).
- [x] I have updated documentation where necessary.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Centralized, configurable timeout system for web interactions,
detection flows, publishing, and pagination.
* Optional retry with exponential backoff for operations that time out.

* **Improvements**
* Replaced fixed wait times with dynamic timeouts throughout workflows.
  * More informative timeout-related messages and diagnostics.

* **Tests**
* New and expanded test coverage for timeout behavior, pagination,
diagnostics, and retry logic.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-11-13 15:08:52 +01:00

1090 lines
51 KiB
Python

# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import json, os # isort: skip
from gettext import gettext as _
from typing import Any, TypedDict
from unittest.mock import AsyncMock, MagicMock, call, patch
import pytest
from kleinanzeigen_bot.extract import AdExtractor
from kleinanzeigen_bot.model.ad_model import AdPartial, ContactPartial
from kleinanzeigen_bot.model.config_model import Config, DownloadConfig
from kleinanzeigen_bot.utils.web_scraping_mixin import Browser, By, Element
class _DimensionsDict(TypedDict):
    """Typed shape of an analytics ``dimensions`` mapping; only ``dimension108`` is declared."""
    # The tests in this file fill this with a pipe-separated list of ``key:value`` pairs (or an empty string).
    dimension108:str
class _UniversalAnalyticsOptsDict(TypedDict):
    """Typed shape of a ``universalAnalyticsOpts`` entry: a single ``dimensions`` mapping."""
    dimensions:_DimensionsDict
class _BelenConfDict(TypedDict):
    """Typed shape of a config dict whose ``universalAnalyticsOpts`` key carries the analytics options."""
    universalAnalyticsOpts:_UniversalAnalyticsOptsDict
class _SpecialAttributesDict(TypedDict, total = False):
    """Special-attribute values keyed by attribute name; ``total = False`` makes every key optional."""
    art_s:str
    condition_s:str
class _TestCaseDict(TypedDict): # noqa: PYI049 Private TypedDict `...` is never used
    """Pairs an input ``belen_conf`` payload with the ``expected`` special attributes."""
    belen_conf:_BelenConfDict
    expected:_SpecialAttributesDict
@pytest.fixture
def test_extractor(browser_mock:MagicMock, test_bot_config:Config) -> AdExtractor:
    """Build a new AdExtractor for every test that requests this fixture.

    Dependencies:
        - browser_mock: stands in for the browser handed to the extractor
        - test_bot_config: configuration object the extractor is constructed with
    """
    extractor = AdExtractor(browser_mock, test_bot_config)
    return extractor
class TestAdExtractorBasics:
    """Synchronous sanity checks for AdExtractor."""

    def test_constructor(self, browser_mock:MagicMock, test_bot_config:Config) -> None:
        """The constructor keeps the browser and the config it was given."""
        instance = AdExtractor(browser_mock, test_bot_config)

        assert instance.browser == browser_mock
        assert instance.config == test_bot_config

    @pytest.mark.parametrize(("url", "expected_id"), [
        ("https://www.kleinanzeigen.de/s-anzeige/test-title/12345678", 12345678),
        ("https://www.kleinanzeigen.de/s-anzeige/another-test/98765432", 98765432),
        ("https://www.kleinanzeigen.de/s-anzeige/invalid-id/abc", -1),
        ("https://www.kleinanzeigen.de/invalid-url", -1),
    ])
    def test_extract_ad_id_from_ad_url(self, test_extractor:AdExtractor, url:str, expected_id:int) -> None:
        """The ad ID is parsed from the URL; the cases without a numeric ID expect -1."""
        extracted_id = test_extractor.extract_ad_id_from_ad_url(url)

        assert extracted_id == expected_id
class TestAdExtractorPricing:
    """Checks how the extractor interprets the price text of an ad page."""

    @pytest.mark.parametrize(("price_text", "expected_price", "expected_type"), [
        ("50 €", 50, "FIXED"),
        ("1.234 €", 1234, "FIXED"),
        ("50 € VB", 50, "NEGOTIABLE"),
        ("VB", None, "NEGOTIABLE"),
        ("Zu verschenken", None, "GIVE_AWAY"),
    ])
    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_pricing_info(
        self, test_extractor:AdExtractor, price_text:str, expected_price:int | None, expected_type:str
    ) -> None:
        """Each displayed price text yields the expected amount and price type."""
        web_text_patch = patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = price_text)
        with web_text_patch:
            actual_price, actual_type = await test_extractor._extract_pricing_info_from_ad_page()

        assert actual_price == expected_price
        assert actual_type == expected_type

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_pricing_info_timeout(self, test_extractor:AdExtractor) -> None:
        """When reading the price text times out, no price and type NOT_APPLICABLE are expected."""
        web_text_patch = patch.object(test_extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError)
        with web_text_patch:
            actual_price, actual_type = await test_extractor._extract_pricing_info_from_ad_page()

        assert actual_price is None
        assert actual_type == "NOT_APPLICABLE"
class TestAdExtractorShipping:
    """Tests for shipping related functionality."""

    @staticmethod
    def _shipping_response(*options:tuple[str, int]) -> dict[str, str]:
        """Build the fake ``web_request`` result carrying the shipping options.

        Args:
            options: ``(option id, price in euro cent)`` pairs; every option is given package size SMALL.

        Returns:
            A dict whose ``content`` entry is the JSON-encoded shipping options payload.
        """
        payload:dict[str, Any] = {
            "data": {
                "shippingOptionsResponse": {
                    "options": [
                        {"id": option_id, "priceInEuroCent": price_in_cent, "packageSize": "SMALL"}
                        for option_id, price_in_cent in options
                    ]
                }
            }
        }
        return {"content": json.dumps(payload)}

    @pytest.mark.parametrize(("shipping_text", "expected_type", "expected_cost"), [
        ("+ Versand ab 2,99 €", "SHIPPING", 2.99),
        ("Nur Abholung", "PICKUP", None),
        ("Versand möglich", "SHIPPING", None),
    ])
    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info(
        self, test_extractor:AdExtractor, shipping_text:str, expected_type:str, expected_cost:float | None
    ) -> None:
        """Test shipping info extraction with different text formats."""
        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = shipping_text),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock) as mock_web_request,
        ):
            # Only the case with a concrete price gets a shipping options response configured.
            if expected_cost:
                mock_web_request.return_value = self._shipping_response(("DHL_001", int(expected_cost * 100)))

            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

        assert shipping_type == expected_type
        assert costs == expected_cost
        if expected_cost:
            assert options == ["DHL_2"]
        else:
            assert options is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_options(self, test_extractor:AdExtractor) -> None:
        """Test shipping info extraction with shipping options."""
        shipping_response = self._shipping_response(("DHL_001", 549))

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 5,49 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

        assert shipping_type == "SHIPPING"
        assert costs == 5.49
        assert options == ["DHL_2"]

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_all_matching_options(self, test_extractor:AdExtractor) -> None:
        """Test shipping info extraction with all matching options enabled."""
        shipping_response = self._shipping_response(("HERMES_001", 489), ("HERMES_002", 549), ("DHL_001", 619))

        # Enable all matching options in config
        test_extractor.config.download = DownloadConfig.model_validate({"include_all_matching_shipping_options": True})

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

        assert shipping_type == "SHIPPING"
        assert costs == 4.89
        # Fail explicitly when no options come back instead of silently skipping the comparison.
        assert options is not None
        assert sorted(options) == ["DHL_2", "Hermes_Päckchen", "Hermes_S"]

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_excluded_options(self, test_extractor:AdExtractor) -> None:
        """Test shipping info extraction with excluded options."""
        shipping_response = self._shipping_response(("HERMES_001", 489), ("HERMES_002", 549), ("DHL_001", 619))

        # Enable all matching options and exclude DHL in config
        test_extractor.config.download = DownloadConfig.model_validate({
            "include_all_matching_shipping_options": True,
            "excluded_shipping_options": ["DHL_2"]
        })

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

        assert shipping_type == "SHIPPING"
        assert costs == 4.89
        # Fail explicitly when no options come back instead of silently skipping the comparison.
        assert options is not None
        assert sorted(options) == ["Hermes_Päckchen", "Hermes_S"]

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_excluded_matching_option(self, test_extractor:AdExtractor) -> None:
        """Test shipping info extraction when the matching option is excluded."""
        shipping_response = self._shipping_response(("HERMES_001", 489), ("HERMES_002", 549))

        # Exclude the matching option
        test_extractor.config.download = DownloadConfig.model_validate({
            "excluded_shipping_options": ["Hermes_Päckchen"]
        })

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

        assert shipping_type == "NOT_APPLICABLE"
        assert costs == 4.89
        assert options is None
class TestAdExtractorNavigation:
    """Tests for navigation related functionality."""

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_url(self, test_extractor:AdExtractor) -> None:
        """Test navigation to ad page using a URL."""
        ad_url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
        page_mock = AsyncMock()
        page_mock.url = ad_url

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock) as mock_web_open,
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError),
        ):
            result = await test_extractor.navigate_to_ad_page(ad_url)

        assert result is True
        mock_web_open.assert_called_with(ad_url)

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_id(self, test_extractor:AdExtractor) -> None:
        """Test navigation to ad page using an ID."""
        ad_id = 12345
        page_mock = AsyncMock()
        page_mock.url = f"https://www.kleinanzeigen.de/s-anzeige/test/{ad_id}"

        popup_close_mock = AsyncMock()
        popup_close_mock.click = AsyncMock()
        popup_close_mock.apply = AsyncMock(return_value = True)

        def find_mock(selector_type:By, selector_value:str, **_:Any) -> Element | None:
            # Only the lookup of the "mfp-close" class yields an element; every other lookup yields None.
            if selector_type == By.CLASS_NAME and selector_value == "mfp-close":
                return popup_close_mock
            return None

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock) as mock_web_open,
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = find_mock),
        ):
            result = await test_extractor.navigate_to_ad_page(ad_id)

        assert result is True
        mock_web_open.assert_called_with(f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={ad_id}")
        popup_close_mock.click.assert_awaited_once()

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_popup(self, test_extractor:AdExtractor) -> None:
        """Test navigation to ad page with popup handling."""
        page_mock = AsyncMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"

        input_mock = AsyncMock()
        input_mock.clear_input = AsyncMock()
        input_mock.send_keys = AsyncMock()
        input_mock.apply = AsyncMock(return_value = True)

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = input_mock),
            patch.object(test_extractor, "web_click", new_callable = AsyncMock) as mock_web_click,
            patch.object(test_extractor, "web_check", new_callable = AsyncMock, return_value = True),
        ):
            result = await test_extractor.navigate_to_ad_page(12345)

        assert result is True
        mock_web_click.assert_called_with(By.CLASS_NAME, "mfp-close")

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_invalid_id(self, test_extractor:AdExtractor) -> None:
        """Test navigation to ad page with invalid ID."""
        page_mock = AsyncMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-suchen.html?k0"

        input_mock = AsyncMock()
        input_mock.clear_input = AsyncMock()
        input_mock.send_keys = AsyncMock()
        input_mock.apply = AsyncMock(return_value = True)
        input_mock.attrs = {}

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = input_mock),
        ):
            result = await test_extractor.navigate_to_ad_page(99999)

        assert result is False

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls(self, test_extractor:AdExtractor) -> None:
        """Test extraction of own ads URLs - basic single-page case."""
        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock) as mock_web_find,
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock) as mock_web_find_all,
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_execute", new_callable = AsyncMock),
        ):
            # --- Setup mock objects for DOM elements ---
            ad_list_container_mock = MagicMock()
            pagination_section_mock = MagicMock()
            cardbox_mock = MagicMock()  # Represents the <li> element
            link_mock = MagicMock()  # Represents the <a> element
            link_mock.attrs = {"href": "/s-anzeige/test/12345"}  # Configure the desired output

            # --- Mock responses for web_find, consumed strictly in this order ---
            mock_web_find.side_effect = [
                ad_list_container_mock,  # Call 1: find #my-manageitems-adlist (before loop)
                pagination_section_mock,  # Call 2: find .Pagination
                ad_list_container_mock,  # Call 3: find #my-manageitems-adlist (inside loop)
                link_mock  # Call 4: find 'div h3 a.text-onSurface' inside the card
                # Add more mocks here if the pagination navigation logic calls web_find again
            ]

            # --- Mock responses for web_find_all, consumed strictly in this order ---
            mock_web_find_all.side_effect = [
                [],  # Call 1: find 'button[aria-label="Nächste"]' -> No next button = single page
                [cardbox_mock]  # Call 2: find .cardbox -> One ad item
                # Add more mocks here if pagination navigation calls web_find_all
            ]

            # --- Execute test and verify results ---
            refs = await test_extractor.extract_own_ads_urls()

            assert refs == ["/s-anzeige/test/12345"]

            # Verify the lookups happened with the expected arguments and in the expected order
            mock_web_find.assert_has_calls([
                call(By.ID, "my-manageitems-adlist"),
                call(By.CSS_SELECTOR, ".Pagination", timeout = 10),
                call(By.ID, "my-manageitems-adlist"),
                call(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = cardbox_mock),
            ], any_order = False)
            mock_web_find_all.assert_has_calls([
                call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section_mock),
                call(By.CLASS_NAME, "cardbox", parent = ad_list_container_mock),
            ], any_order = False)

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_paginates_with_enabled_next_button(self, test_extractor:AdExtractor) -> None:
        """Ensure the paginator clicks the first enabled next button and advances."""
        ad_list_container_mock = MagicMock()
        pagination_section_mock = MagicMock()
        cardbox_page_one = MagicMock()
        cardbox_page_two = MagicMock()
        link_page_one = MagicMock(attrs = {"href": "/s-anzeige/page-one/111"})
        link_page_two = MagicMock(attrs = {"href": "/s-anzeige/page-two/222"})

        next_button_enabled = AsyncMock()
        next_button_enabled.attrs = {}
        disabled_button = MagicMock()
        disabled_button.attrs = {"disabled": True}

        # Links are handed out in page order, one per link lookup.
        link_queue = [link_page_one, link_page_two]
        next_button_lookups = 0
        cardbox_lookups = 0

        async def fake_web_find(selector_type:By, selector_value:str, *, parent:Element | None = None,
                timeout:int | float | None = None) -> Element:
            if selector_type == By.ID and selector_value == "my-manageitems-adlist":
                return ad_list_container_mock
            if selector_type == By.CSS_SELECTOR and selector_value == ".Pagination":
                return pagination_section_mock
            if selector_type == By.CSS_SELECTOR and selector_value == "div h3 a.text-onSurface":
                return link_queue.pop(0)
            raise AssertionError(f"Unexpected selector {selector_type} {selector_value}")

        async def fake_web_find_all(selector_type:By, selector_value:str, *, parent:Element | None = None,
                timeout:int | float | None = None) -> list[Element]:
            nonlocal next_button_lookups, cardbox_lookups
            if selector_type == By.CSS_SELECTOR and selector_value == 'button[aria-label="Nächste"]':
                next_button_lookups += 1
                if next_button_lookups == 1:
                    return [next_button_enabled]  # initial detection -> multi page
                if next_button_lookups == 2:
                    return [disabled_button, next_button_enabled]  # navigation on page 1
                return []  # after navigating, stop
            if selector_type == By.CLASS_NAME and selector_value == "cardbox":
                cardbox_lookups += 1
                return [cardbox_page_one] if cardbox_lookups == 1 else [cardbox_page_two]
            raise AssertionError(f"Unexpected find_all selector {selector_type} {selector_value}")

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = fake_web_find),
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, side_effect = fake_web_find_all),
        ):
            refs = await test_extractor.extract_own_ads_urls()

        assert refs == ["/s-anzeige/page-one/111", "/s-anzeige/page-two/222"]
        next_button_enabled.click.assert_awaited()  # the enabled next button was clicked to navigate
class TestAdExtractorContent:
    """Tests for content extraction functionality."""
    # pylint: disable=protected-access

    @pytest.fixture
    def extractor_with_config(self) -> AdExtractor:
        """Create extractor with specific config for testing prefix/suffix handling."""
        # Empty config, will be overridden in tests
        return AdExtractor(MagicMock(spec = Browser), Config())

    @staticmethod
    def _patch_ad_page(extractor:AdExtractor, web_text_results:list[Any], contact:Any) -> Any:
        """Return a ``patch.multiple`` context replacing the extractor's page helpers with AsyncMocks.

        ``web_text_results`` becomes the ``side_effect`` sequence of ``web_text`` and ``contact``
        the return value of ``_extract_contact_from_ad_page``; all other stubs return fixed values.
        """
        return patch.multiple(
            extractor,
            web_text = AsyncMock(side_effect = web_text_results),
            web_execute = AsyncMock(return_value = {
                "universalAnalyticsOpts": {
                    "dimensions": {
                        "dimension92": "",
                        "dimension108": ""
                    }
                }
            }),
            _extract_category_from_ad_page = AsyncMock(return_value = "160"),
            _extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
            _extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
            _extract_shipping_info_from_ad_page = AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
            _extract_sell_directly_from_ad_page = AsyncMock(return_value = False),
            _download_images_from_ad_page = AsyncMock(return_value = []),
            _extract_contact_from_ad_page = AsyncMock(return_value = contact),
        )

    @pytest.mark.asyncio
    async def test_extract_description_with_affixes(
        self,
        test_extractor:AdExtractor,
        description_test_cases:list[tuple[dict[str, Any], str, str]],
        test_bot_config:Config
    ) -> None:
        """Test extraction of description with various prefix/suffix configurations."""
        # Mock the page
        test_extractor.page = MagicMock(url = "https://www.kleinanzeigen.de/s-anzeige/test/12345")

        for config, raw_description, _expected_description in description_test_cases:
            test_extractor.config = test_bot_config.with_values(config)

            web_text_results = [
                "Test Title",  # Title
                raw_description,  # Raw description (without affixes)
                "03.02.2025",  # Creation date
            ]
            with self._patch_ad_page(test_extractor, web_text_results, contact = {}):
                info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
                assert info.description == raw_description

    @pytest.mark.asyncio
    async def test_extract_description_with_affixes_timeout(
        self,
        test_extractor:AdExtractor
    ) -> None:
        """Test handling of timeout when extracting description."""
        # Mock the page
        test_extractor.page = MagicMock(url = "https://www.kleinanzeigen.de/s-anzeige/test/12345")

        web_text_results = [
            "Test Title",  # Title succeeds
            TimeoutError("Timeout"),  # Description times out
            "03.02.2025",  # Date succeeds
        ]
        with self._patch_ad_page(test_extractor, web_text_results, contact = ContactPartial()):
            try:
                info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
                assert not info.description
            except TimeoutError:
                # This is also acceptable - depends on how we want to handle timeouts
                pass

    @pytest.mark.asyncio
    async def test_extract_description_with_affixes_no_affixes(
        self,
        test_extractor:AdExtractor
    ) -> None:
        """Test extraction of description without any affixes in config."""
        # Mock the page
        test_extractor.page = MagicMock(url = "https://www.kleinanzeigen.de/s-anzeige/test/12345")

        raw_description = "Original Description"
        web_text_results = [
            "Test Title",  # Title
            raw_description,  # Description without affixes
            "03.02.2025",  # Creation date
        ]
        with self._patch_ad_page(test_extractor, web_text_results, contact = ContactPartial()):
            info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
            assert info.description == raw_description

    @pytest.mark.asyncio
    async def test_extract_sell_directly(self, test_extractor:AdExtractor) -> None:
        """Test extraction of sell directly option."""
        text_to_expected = (
            ("Direkt kaufen", True),
            ("Other text", False),
        )

        for page_text, expected in text_to_expected:
            with patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = page_text):
                result = await test_extractor._extract_sell_directly_from_ad_page()
            assert result is expected

        # A timeout while reading the text is expected to yield None
        with patch.object(test_extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError):
            result = await test_extractor._extract_sell_directly_from_ad_page()
        assert result is None
class TestAdExtractorCategory:
    """Tests for category extraction functionality."""

    @pytest.fixture
    def extractor(self, test_bot_config:Config) -> AdExtractor:
        """Provide an AdExtractor whose config carries a description prefix and suffix."""
        description_affixes = {
            "prefix": "Test Prefix",
            "suffix": "Test Suffix"
        }
        config = test_bot_config.with_values({"ad_defaults": {"description": description_affixes}})
        return AdExtractor(MagicMock(spec = Browser), config)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_category(self, extractor:AdExtractor) -> None:
        """Test category extraction from breadcrumb."""
        breadcrumb = MagicMock()
        first_link = MagicMock(attrs = {"href": "/s-familie-kind-baby/c17"})
        second_link = MagicMock(attrs = {"href": "/s-spielzeug/c23"})

        with (
            patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [breadcrumb]) as mock_web_find,
            patch.object(extractor, "web_find_all", new_callable = AsyncMock,
                return_value = [first_link, second_link]) as mock_web_find_all,
        ):
            result = await extractor._extract_category_from_ad_page()

        assert result == "17/23"
        mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb")
        mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = breadcrumb)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_category_single_identifier(self, extractor:AdExtractor) -> None:
        """Test category extraction when only a single breadcrumb code exists."""
        breadcrumb = MagicMock()
        only_link = MagicMock(attrs = {"href": "/s-kleidung/c42"})

        with (
            patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [breadcrumb]) as mock_web_find,
            patch.object(extractor, "web_find_all", new_callable = AsyncMock,
                return_value = [only_link]) as mock_web_find_all,
        ):
            result = await extractor._extract_category_from_ad_page()

        # The single code is expected twice in the result
        assert result == "42/42"
        mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb")
        mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = breadcrumb)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_category_fallback_to_legacy_selectors(self, extractor:AdExtractor, caplog:pytest.LogCaptureFixture) -> None:
        """Test category extraction when breadcrumb links are not available and legacy selectors are used."""
        breadcrumb = MagicMock()
        first_link = MagicMock(attrs = {"href": 12345})  # Ensure str() conversion happens
        second_link = MagicMock(attrs = {"href": 67890})  # This will need str() conversion
        caplog.set_level("DEBUG")
        expected_message = _("Falling back to legacy breadcrumb selectors; collected ids: %s") % []

        with (
            patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find,
            patch.object(extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError) as mock_web_find_all,
        ):
            # Breadcrumb container first, then the two legacy link lookups
            mock_web_find.side_effect = [breadcrumb, first_link, second_link]

            result = await extractor._extract_category_from_ad_page()

        assert result == "12345/67890"
        matching_records = [record for record in caplog.records if record.message == expected_message]
        assert len(matching_records) == 1
        mock_web_find.assert_any_call(By.ID, "vap-brdcrmb")
        mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = breadcrumb)
        mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = breadcrumb)
        mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = breadcrumb)

    @pytest.mark.asyncio
    async def test_extract_category_legacy_selectors_timeout(self, extractor:AdExtractor, caplog:pytest.LogCaptureFixture) -> None:
        """Ensure fallback timeout logs the error and re-raises with translated message."""
        breadcrumb = MagicMock()

        async def fake_web_find(selector_type:By, selector_value:str, *, parent:Element | None = None,
                timeout:int | float | None = None) -> Element:
            # Only the breadcrumb container itself can be found; every other lookup times out.
            if selector_type == By.ID and selector_value == "vap-brdcrmb":
                return breadcrumb
            raise TimeoutError("legacy selectors missing")

        with (
            patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = fake_web_find),
            patch.object(extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError),
            caplog.at_level("ERROR"),
            pytest.raises(TimeoutError, match = "Unable to locate breadcrumb fallback selectors"),
        ):
            await extractor._extract_category_from_ad_page()

        logged_messages = [record.message for record in caplog.records]
        assert any("Legacy breadcrumb selectors not found" in message for message in logged_messages)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_special_attributes_empty(self, extractor:AdExtractor) -> None:
        """Test extraction of special attributes when empty."""
        with patch.object(extractor, "web_execute", new_callable = AsyncMock) as mock_web_execute:
            mock_web_execute.return_value = {
                "universalAnalyticsOpts": {
                    "dimensions": {
                        "dimension108": ""
                    }
                }
            }
            result = await extractor._extract_special_attributes_from_ad_page(mock_web_execute.return_value)

        assert result == {}

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_special_attributes_not_empty(self, extractor:AdExtractor) -> None:
        """Test extraction of special attributes when not empty."""
        belen_conf = {
            "universalAnalyticsOpts": {
                "dimensions": {
                    "dimension108": "versand_s:t|color_s:creme|groesse_s:68|condition_s:alright|type_s:accessoires|art_s:maedchen"
                }
            }
        }
        expected_attributes = {
            "color_s": "creme",
            "groesse_s": "68",
            "condition_s": "alright",
            "type_s": "accessoires",
            "art_s": "maedchen",
        }

        result = await extractor._extract_special_attributes_from_ad_page(belen_conf)

        assert len(result) == 5
        assert "versand_s" not in result
        for attribute_name, attribute_value in expected_attributes.items():
            assert attribute_name in result
            assert result[attribute_name] == attribute_value
class TestAdExtractorContact:
"""Tests for contact information extraction."""
@pytest.fixture
def extractor(self, test_bot_config:Config) -> AdExtractor:
    """Provide an AdExtractor whose config carries a description prefix and suffix."""
    description_affixes = {
        "prefix": "Test Prefix",
        "suffix": "Test Suffix"
    }
    config = test_bot_config.with_values({"ad_defaults": {"description": description_affixes}})
    return AdExtractor(MagicMock(spec = Browser), config)
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_extract_contact_info(self, extractor:AdExtractor) -> None:
    """Test extraction of contact information."""
    web_text_results = [
        "12345 Berlin - Mitte",
        "Example Street 123,",
        "Test User",
    ]
    web_find_results = [
        MagicMock(),  # contact person element
        MagicMock(),  # name element
        TimeoutError(),  # phone element (simulating no phone)
    ]

    with (
        patch.object(extractor, "page", MagicMock()),
        patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = web_text_results),
        patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = web_find_results),
    ):
        contact_info = await extractor._extract_contact_from_ad_page()

    assert contact_info.street == "Example Street 123"
    assert contact_info.zipcode == "12345"
    assert contact_info.location == "Berlin - Mitte"
    assert contact_info.name == "Test User"
    assert contact_info.phone is None
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_extract_contact_info_timeout(self, extractor:AdExtractor) -> None:
"""Test contact info extraction when elements are not found."""
with patch.object(extractor, "page", MagicMock()), \
patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError()), \
patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError()), \
pytest.raises(TimeoutError):
await extractor._extract_contact_from_ad_page()
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_extract_contact_info_with_phone(self, extractor:AdExtractor) -> None:
"""Test extraction of contact information including phone number."""
with patch.object(extractor, "page", MagicMock()), \
patch.object(extractor, "web_text", new_callable = AsyncMock) as mock_web_text, \
patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find:
mock_web_text.side_effect = [
"12345 Berlin - Mitte",
"Example Street 123,",
"Test User",
"+49(0)1234 567890"
]
phone_element = MagicMock()
mock_web_find.side_effect = [
MagicMock(), # contact person element
MagicMock(), # name element
phone_element, # phone element
]
contact_info = await extractor._extract_contact_from_ad_page()
assert contact_info.phone == "01234567890" # Normalized phone number
class TestAdExtractorDownload:
"""Tests for download functionality."""
@pytest.fixture
def extractor(self, test_bot_config:Config) -> AdExtractor:
    """Provide an ``AdExtractor`` backed by a mocked browser and a config with a description prefix/suffix."""
    description_affixes = {"prefix": "Test Prefix", "suffix": "Test Suffix"}
    overrides = {"ad_defaults": {"description": description_affixes}}
    return AdExtractor(MagicMock(spec = Browser), test_bot_config.with_values(overrides))
@pytest.mark.asyncio
async def test_download_ad_existing_directory(self, extractor:AdExtractor) -> None:
    """Download an ad whose titled target directory already exists and check what gets saved."""
    downloads_root = "downloaded-ads"
    ad_dir = os.path.join(downloads_root, "ad_12345_Test Advertisement Title")
    expected_yaml = os.path.join(ad_dir, "ad_12345.yaml")
    known_paths = {downloads_root, ad_dir}  # Final directory with title exists

    with patch("os.path.exists") as mock_exists, \
            patch("os.path.isdir") as mock_isdir, \
            patch("os.makedirs") as mock_makedirs, \
            patch("os.mkdir") as mock_mkdir, \
            patch("os.rename") as mock_rename, \
            patch("shutil.rmtree") as mock_rmtree, \
            patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
            patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir:

        # Filesystem probes answer from the set of paths declared as existing
        mock_exists.side_effect = lambda path: path in known_paths
        mock_isdir.side_effect = lambda path: path == downloads_root

        # The mocked extraction step returns the ad data together with its directory
        extracted_ad = AdPartial.model_validate({
            "title": "Test Advertisement Title",
            "description": "Test Description",
            "category": "Dienstleistungen",
            "price": 100,
            "images": [],
            "contact": {
                "name": "Test User",
                "street": "Test Street 123",
                "zipcode": "12345",
                "location": "Test City"
            }
        })
        mock_extract_with_dir.return_value = (extracted_ad, ad_dir)

        await extractor.download_ad(12345)

        mock_extract_with_dir.assert_called_once()

        # Directory handling is done inside _extract_ad_page_info_with_directory_handling,
        # so download_ad itself must not remove, create or rename anything here.
        for untouched_mock in (mock_rmtree, mock_mkdir, mock_makedirs, mock_rename):
            untouched_mock.assert_not_called()

        # Inspect what was handed to save_dict
        save_call = mock_save_dict.call_args
        assert save_call is not None
        saved_args = save_call.args
        assert saved_args[0].replace("/", os.path.sep) == expected_yaml
        assert saved_args[1] == mock_extract_with_dir.return_value[0].model_dump()
@pytest.mark.asyncio
async def test_download_ad(self, extractor:AdExtractor) -> None:
    """Download an ad when no directory exists yet and check directory creation and the saved YAML."""
    downloads_root = "downloaded-ads"
    ad_dir = os.path.join(downloads_root, "ad_12345_Test Advertisement Title")
    expected_yaml = os.path.join(ad_dir, "ad_12345.yaml")

    # Nothing exists on disk: both filesystem probes always answer False
    with patch("os.path.exists", return_value = False), \
            patch("os.path.isdir", return_value = False), \
            patch("os.makedirs") as mock_makedirs, \
            patch("os.mkdir") as mock_mkdir, \
            patch("os.rename") as mock_rename, \
            patch("shutil.rmtree") as mock_rmtree, \
            patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
            patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir:

        # The mocked extraction step returns the ad data together with its directory
        extracted_ad = AdPartial.model_validate({
            "title": "Test Advertisement Title",
            "description": "Test Description",
            "category": "Dienstleistungen",
            "price": 100,
            "images": [],
            "contact": {
                "name": "Test User",
                "street": "Test Street 123",
                "zipcode": "12345",
                "location": "Test City"
            }
        })
        mock_extract_with_dir.return_value = (extracted_ad, ad_dir)

        await extractor.download_ad(12345)

        mock_extract_with_dir.assert_called_once()

        # Directory handling is done inside _extract_ad_page_info_with_directory_handling
        mock_rmtree.assert_not_called()
        mock_mkdir.assert_has_calls([call(downloads_root)])  # Only base directory creation
        mock_makedirs.assert_not_called()  # Using mkdir instead
        mock_rename.assert_not_called()  # No renaming needed

        # Inspect what was handed to save_dict
        save_call = mock_save_dict.call_args
        assert save_call is not None
        saved_args = save_call.args
        assert saved_args[0].replace("/", os.path.sep) == expected_yaml
        assert saved_args[1] == mock_extract_with_dir.return_value[0].model_dump()
@pytest.mark.asyncio
async def test_download_ad_use_existing_folder(self, extractor:AdExtractor) -> None:
    """Download an ad into an already existing folder without title (default behavior, no renaming)."""
    downloads_root = "downloaded-ads"
    untitled_dir = os.path.join(downloads_root, "ad_12345")
    expected_yaml = os.path.join(untitled_dir, "ad_12345.yaml")
    known_paths = {downloads_root, untitled_dir}  # Base directory and untitled ad directory exist

    with patch("os.path.exists") as mock_exists, \
            patch("os.path.isdir") as mock_isdir, \
            patch("os.makedirs") as mock_makedirs, \
            patch("os.mkdir") as mock_mkdir, \
            patch("os.rename") as mock_rename, \
            patch("shutil.rmtree") as mock_rmtree, \
            patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
            patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir:

        # Filesystem probes answer from the set of paths declared as existing
        mock_exists.side_effect = lambda path: path in known_paths
        mock_isdir.side_effect = lambda path: path == downloads_root

        # The mocked extraction step reports the existing untitled directory as target
        extracted_ad = AdPartial.model_validate({
            "title": "Test Advertisement Title",
            "description": "Test Description",
            "category": "Dienstleistungen",
            "price": 100,
            "images": [],
            "contact": {
                "name": "Test User",
                "street": "Test Street 123",
                "zipcode": "12345",
                "location": "Test City"
            }
        })
        mock_extract_with_dir.return_value = (extracted_ad, untitled_dir)

        await extractor.download_ad(12345)

        mock_extract_with_dir.assert_called_once()

        # Nothing is removed, created or renamed (default behavior)
        for untouched_mock in (mock_rmtree, mock_mkdir, mock_makedirs, mock_rename):
            untouched_mock.assert_not_called()

        # Inspect what was handed to save_dict
        save_call = mock_save_dict.call_args
        assert save_call is not None
        saved_args = save_call.args
        assert saved_args[0].replace("/", os.path.sep) == expected_yaml
        assert saved_args[1] == mock_extract_with_dir.return_value[0].model_dump()
@pytest.mark.asyncio
async def test_download_ad_rename_existing_folder_when_enabled(self, extractor:AdExtractor) -> None:
    """Download an ad with ``rename_existing_folders`` enabled while only the untitled folder exists."""
    # Enable renaming in config
    extractor.config.download.rename_existing_folders = True

    downloads_root = "downloaded-ads"
    untitled_dir = os.path.join(downloads_root, "ad_12345")
    titled_dir = os.path.join(downloads_root, "ad_12345_Test Advertisement Title")
    expected_yaml = os.path.join(titled_dir, "ad_12345.yaml")
    # Base and untitled directories exist, the titled directory does not
    known_paths = {downloads_root, untitled_dir}

    with patch("os.path.exists") as mock_exists, \
            patch("os.path.isdir") as mock_isdir, \
            patch("os.makedirs") as mock_makedirs, \
            patch("os.mkdir") as mock_mkdir, \
            patch("os.rename") as mock_rename, \
            patch("shutil.rmtree") as mock_rmtree, \
            patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
            patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir:

        # Filesystem probes answer from the set of paths declared as existing
        mock_exists.side_effect = lambda path: path in known_paths
        mock_isdir.side_effect = lambda path: path == downloads_root

        # The mocked extraction step reports the titled directory as target
        extracted_ad = AdPartial.model_validate({
            "title": "Test Advertisement Title",
            "description": "Test Description",
            "category": "Dienstleistungen",
            "price": 100,
            "images": [],
            "contact": {
                "name": "Test User",
                "street": "Test Street 123",
                "zipcode": "12345",
                "location": "Test City"
            }
        })
        mock_extract_with_dir.return_value = (extracted_ad, titled_dir)

        await extractor.download_ad(12345)

        mock_extract_with_dir.assert_called_once()  # Extract to final directory

        # Directory handling (including renaming) is done inside
        # _extract_ad_page_info_with_directory_handling, so download_ad touches nothing itself.
        for untouched_mock in (mock_rmtree, mock_mkdir, mock_makedirs, mock_rename):
            untouched_mock.assert_not_called()

        # Inspect what was handed to save_dict
        save_call = mock_save_dict.call_args
        assert save_call is not None
        saved_args = save_call.args
        assert saved_args[0].replace("/", os.path.sep) == expected_yaml
        assert saved_args[1] == mock_extract_with_dir.return_value[0].model_dump()
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_download_images_no_images(self, extractor:AdExtractor) -> None:
    """No image paths are returned when every ``web_find`` lookup times out."""
    failing_lookup = patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError)
    with failing_lookup:
        downloaded = await extractor._download_images_from_ad_page("/some/dir", 12345)
        assert len(downloaded) == 0