# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import json # isort: skip
import asyncio
from gettext import gettext as _
from pathlib import Path
from typing import Any, Final, TypedDict
from unittest.mock import AsyncMock, MagicMock, call, patch
from urllib.error import URLError
import pytest
from jsonschema import Draft202012Validator
from ruamel.yaml import YAML
import kleinanzeigen_bot.extract as extract_module
from kleinanzeigen_bot.model.ad_model import AdPartial, ContactPartial
from kleinanzeigen_bot.model.config_model import Config, DownloadConfig
from kleinanzeigen_bot.utils.web_scraping_mixin import Browser, By, Element
# Path to ``schemas/ad.schema.json``, located relative to the third ancestor directory of this (resolved) file.
SCHEMA_PATH:Final[Path] = Path(__file__).resolve().parents[2] / "schemas" / "ad.schema.json"
def _read_text_file(path:Path) -> str:
    """Return the complete content of ``path`` decoded as UTF-8."""
    with path.open(encoding = "utf-8") as handle:
        return handle.read()
class _DimensionsDict(TypedDict):
    """Typed mapping with a single required string entry, ``ad_attributes``."""

    ad_attributes:str
class _UniversalAnalyticsOptsDict(TypedDict):
    """Typed mapping whose ``dimensions`` entry is a ``_DimensionsDict``."""

    dimensions:_DimensionsDict
class _BelenConfDict(TypedDict):
    """Typed mapping whose ``universalAnalyticsOpts`` entry is a ``_UniversalAnalyticsOptsDict``."""

    universalAnalyticsOpts:_UniversalAnalyticsOptsDict
class _SpecialAttributesDict(TypedDict, total = False):
    """Typed mapping of special attributes; every key is optional (``total = False``)."""

    art_s:str
    condition_s:str
class _TestCaseDict(TypedDict):  # noqa: PYI049 Private TypedDict `...` is never used
    """Typed mapping pairing a ``belen_conf`` input with the ``expected`` special attributes."""

    belen_conf:_BelenConfDict
    expected:_SpecialAttributesDict
@pytest.fixture
def test_extractor(browser_mock:MagicMock, test_bot_config:Config) -> extract_module.AdExtractor:
    """Build a new extract_module.AdExtractor instance for a test.

    Dependencies:
        - browser_mock: passed to the extractor as its browser
        - test_bot_config: passed to the extractor as its configuration

    The download directory handed to the extractor is ``downloaded-ads``.
    """
    download_dir = Path("downloaded-ads")
    return extract_module.AdExtractor(browser_mock, test_bot_config, download_dir)
class TestAdExtractorBasics:
    """Basic tests for extract_module.AdExtractor and the ``kleinanzeigen_bot.utils.files`` helpers."""

    @staticmethod
    async def _check_files_exists(tmp_path:Path) -> None:
        """Assert that ``files.exists`` accepts ``Path`` and ``str`` for present and absent files.

        Shared by the two ``exists`` tests below, whose bodies were identical.
        """
        from kleinanzeigen_bot.utils import files  # noqa: PLC0415

        existing_file = tmp_path / "test.txt"
        existing_file.write_text("test")
        for present in (existing_file, str(existing_file)):
            assert await files.exists(present) is True

        non_existing = tmp_path / "nonexistent.txt"
        for absent in (non_existing, str(non_existing)):
            assert await files.exists(absent) is False

    @staticmethod
    async def _check_files_is_dir(tmp_path:Path) -> None:
        """Assert that ``files.is_dir`` is true only for directories, for ``Path`` and ``str`` arguments.

        Shared by the two ``is_dir`` tests below, whose bodies were identical.
        """
        from kleinanzeigen_bot.utils import files  # noqa: PLC0415

        test_dir = tmp_path / "testdir"
        test_dir.mkdir()
        for directory in (test_dir, str(test_dir)):
            assert await files.is_dir(directory) is True

        # A regular file and a missing path must both be reported as "not a directory".
        test_file = tmp_path / "test.txt"
        test_file.write_text("test")
        non_existing = tmp_path / "nonexistent"
        for not_a_dir in (test_file, str(test_file), non_existing, str(non_existing)):
            assert await files.is_dir(not_a_dir) is False

    def test_constructor(self, browser_mock:MagicMock, test_bot_config:Config) -> None:
        """Test that the constructor stores browser, config and download directory."""
        extractor = extract_module.AdExtractor(browser_mock, test_bot_config, Path("downloaded-ads"))
        assert extractor.browser == browser_mock
        assert extractor.config == test_bot_config
        assert extractor.download_dir == Path("downloaded-ads")

    @pytest.mark.parametrize(
        ("url", "expected_id"),
        [
            ("https://www.kleinanzeigen.de/s-anzeige/test-title/12345678", 12345678),
            ("https://www.kleinanzeigen.de/s-anzeige/another-test/98765432", 98765432),
            ("https://www.kleinanzeigen.de/s-anzeige/invalid-id/abc", -1),
            ("https://www.kleinanzeigen.de/invalid-url", -1),
        ],
    )
    def test_extract_ad_id_from_ad_url(self, test_extractor:extract_module.AdExtractor, url:str, expected_id:int) -> None:
        """Test extraction of ad ID from different URL formats (``-1`` is expected for unparsable URLs)."""
        assert test_extractor.extract_ad_id_from_ad_url(url) == expected_id

    @pytest.mark.asyncio
    async def test_path_exists_helper(self, tmp_path:Path) -> None:
        """Test files.exists helper function."""
        await self._check_files_exists(tmp_path)

    @pytest.mark.asyncio
    async def test_path_is_dir_helper(self, tmp_path:Path) -> None:
        """Test files.is_dir helper function."""
        await self._check_files_is_dir(tmp_path)

    @pytest.mark.asyncio
    async def test_exists_async_helper(self, tmp_path:Path) -> None:
        """Test files.exists async helper function."""
        await self._check_files_exists(tmp_path)

    @pytest.mark.asyncio
    async def test_isdir_async_helper(self, tmp_path:Path) -> None:
        """Test files.is_dir async helper function."""
        await self._check_files_is_dir(tmp_path)

    def test_download_and_save_image_sync_success(self, tmp_path:Path) -> None:
        """Test _download_and_save_image_sync with successful download."""
        from unittest.mock import mock_open  # noqa: PLC0415

        test_dir = tmp_path / "images"
        test_dir.mkdir()

        # Mocked urllib response that reports a JPEG content type and works as a context manager.
        mock_response = MagicMock()
        mock_response.info().get_content_type.return_value = "image/jpeg"
        mock_response.__enter__ = MagicMock(return_value = mock_response)
        mock_response.__exit__ = MagicMock(return_value = False)

        with (
            patch("kleinanzeigen_bot.extract.urllib_request.urlopen", return_value = mock_response),
            patch("kleinanzeigen_bot.extract.open", mock_open()),
            patch("kleinanzeigen_bot.extract.shutil.copyfileobj"),
        ):
            result = extract_module.AdExtractor._download_and_save_image_sync("http://example.com/image.jpg", str(test_dir), "test_", 1)

            assert result is not None
            assert result.endswith((".jpe", ".jpeg", ".jpg"))
            assert "test_1" in result

    def test_download_and_save_image_sync_failure(self, tmp_path:Path) -> None:
        """Test _download_and_save_image_sync with download failure (``None`` is expected)."""
        with patch("kleinanzeigen_bot.extract.urllib_request.urlopen", side_effect = URLError("Network error")):
            result = extract_module.AdExtractor._download_and_save_image_sync("http://example.com/image.jpg", str(tmp_path), "test_", 1)

            assert result is None
class TestAdExtractorPricing:
    """Tests covering how price and price type are read from an ad page."""

    @pytest.mark.parametrize(
        ("price_text", "expected_price", "expected_type"),
        [
            ("50 €", 50, "FIXED"),
            ("1.234 €", 1234, "FIXED"),
            ("50 € VB", 50, "NEGOTIABLE"),
            ("VB", None, "NEGOTIABLE"),
            ("Zu verschenken", None, "GIVE_AWAY"),
        ],
    )
    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_pricing_info(
        self, test_extractor:extract_module.AdExtractor, price_text:str, expected_price:int | None, expected_type:str
    ) -> None:
        """Check the extracted (price, price type) pair for several price text formats."""
        web_text_patch = patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = price_text)
        with web_text_patch:
            price, price_type = await test_extractor._extract_pricing_info_from_ad_page()

        assert (price, price_type) == (expected_price, expected_type)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_pricing_info_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
        """Check the fallback result when reading the price text raises TimeoutError."""
        web_text_patch = patch.object(test_extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError)
        with web_text_patch:
            price, price_type = await test_extractor._extract_pricing_info_from_ad_page()

        assert price is None
        assert price_type == "NOT_APPLICABLE"
class TestAdExtractorShipping:
    """Tests for shipping related functionality."""

    @staticmethod
    def _shipping_option(option_id:str, price_in_euro_cent:int) -> dict[str, Any]:
        """Build one shipping option entry (package size ``SMALL``) for the mocked API response."""
        return {"id": option_id, "priceInEuroCent": price_in_euro_cent, "packageSize": "SMALL"}

    @staticmethod
    def _shipping_api_response(options:list[dict[str, Any]]) -> dict[str, str]:
        """Wrap ``options`` in the JSON envelope that the mocked ``web_request`` returns."""
        return {"content": json.dumps({"data": {"shippingOptionsResponse": {"options": options}}})}

    @staticmethod
    async def _extract_shipping(
        extractor:extract_module.AdExtractor, shipping_text:str, api_response:dict[str, str] | None = None
    ) -> tuple[Any, Any, Any]:
        """Run ``_extract_shipping_info_from_ad_page`` with ``page``, ``web_text`` and ``web_request`` mocked.

        Args:
            extractor: the extractor under test.
            shipping_text: the text the mocked ``web_text`` returns.
            api_response: the value the mocked ``web_request`` returns; when ``None`` the mock keeps its default return value.

        Returns:
            The ``(shipping_type, costs, options)`` triple produced by the extractor.
        """
        # Only configure a return value when one was supplied so the plain AsyncMock default stays untouched otherwise.
        request_kwargs:dict[str, Any] = {} if api_response is None else {"return_value": api_response}
        with (
            patch.object(extractor, "page", MagicMock()),
            patch.object(extractor, "web_text", new_callable = AsyncMock, return_value = shipping_text),
            patch.object(extractor, "web_request", new_callable = AsyncMock, **request_kwargs),
        ):
            shipping_type, costs, options = await extractor._extract_shipping_info_from_ad_page()  # pylint: disable=protected-access
        return shipping_type, costs, options

    @pytest.mark.parametrize(
        ("shipping_text", "expected_type", "expected_cost"),
        [
            ("+ Versand ab 2,99 €", "SHIPPING", 2.99),
            ("Nur Abholung", "PICKUP", None),
            ("Versand möglich", "SHIPPING", None),
        ],
    )
    @pytest.mark.asyncio
    async def test_extract_shipping_info(
        self, test_extractor:extract_module.AdExtractor, shipping_text:str, expected_type:str, expected_cost:float | None
    ) -> None:
        """Test shipping info extraction with different text formats."""
        api_response = None
        if expected_cost is not None:
            # round() instead of int(): float products such as 4.35 * 100 can land just below the integer.
            api_response = self._shipping_api_response([self._shipping_option("DHL_001", round(expected_cost * 100))])

        shipping_type, costs, options = await self._extract_shipping(test_extractor, shipping_text, api_response)

        assert shipping_type == expected_type
        assert costs == expected_cost
        if expected_cost is not None:
            assert options == ["DHL_2"]
        else:
            assert options is None

    @pytest.mark.asyncio
    async def test_extract_shipping_info_with_options(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction with shipping options."""
        api_response = self._shipping_api_response([self._shipping_option("DHL_001", 549)])

        shipping_type, costs, options = await self._extract_shipping(test_extractor, "+ Versand ab 5,49 €", api_response)

        assert shipping_type == "SHIPPING"
        assert costs == 5.49
        assert options == ["DHL_2"]

    @pytest.mark.asyncio
    async def test_extract_shipping_info_with_all_matching_options(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction with all matching options enabled."""
        api_response = self._shipping_api_response(
            [
                self._shipping_option("HERMES_001", 489),
                self._shipping_option("HERMES_002", 549),
                self._shipping_option("DHL_001", 619),
            ]
        )
        # Enable all matching options in config
        test_extractor.config.download = DownloadConfig.model_validate({"include_all_matching_shipping_options": True})

        shipping_type, costs, options = await self._extract_shipping(test_extractor, "+ Versand ab 4,89 €", api_response)

        assert shipping_type == "SHIPPING"
        assert costs == 4.89
        # NOTE(review): a ``None`` result is tolerated here, exactly as before; consider tightening this
        # to require the list once it is confirmed that the extractor always returns it in this scenario.
        assert options is None or sorted(options) == ["DHL_2", "Hermes_Päckchen", "Hermes_S"]

    @pytest.mark.asyncio
    async def test_extract_shipping_info_with_all_matching_options_no_match(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping extraction when include-all is enabled but no option matches the price."""
        api_response = self._shipping_api_response(
            [
                self._shipping_option("DHL_001", 500),
                self._shipping_option("HERMES_001", 600),
            ]
        )
        test_extractor.config.download = DownloadConfig.model_validate({"include_all_matching_shipping_options": True})

        shipping_type, costs, options = await self._extract_shipping(test_extractor, "+ Versand ab 4,89 €", api_response)

        assert shipping_type == "SHIPPING"
        assert costs == 4.89
        assert options is None

    @pytest.mark.asyncio
    async def test_extract_shipping_info_with_excluded_options(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction with excluded options."""
        api_response = self._shipping_api_response(
            [
                self._shipping_option("HERMES_001", 489),
                self._shipping_option("HERMES_002", 549),
                self._shipping_option("DHL_001", 619),
            ]
        )
        # Enable all matching options and exclude DHL in config
        test_extractor.config.download = DownloadConfig.model_validate({"include_all_matching_shipping_options": True, "excluded_shipping_options": ["DHL_2"]})

        shipping_type, costs, options = await self._extract_shipping(test_extractor, "+ Versand ab 4,89 €", api_response)

        assert shipping_type == "SHIPPING"
        assert costs == 4.89
        # NOTE(review): a ``None`` result is tolerated here, exactly as before; consider tightening this
        # to require the list once it is confirmed that the extractor always returns it in this scenario.
        assert options is None or sorted(options) == ["Hermes_Päckchen", "Hermes_S"]

    @pytest.mark.asyncio
    async def test_extract_shipping_info_with_excluded_matching_option(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction when the matching option is excluded."""
        api_response = self._shipping_api_response(
            [
                self._shipping_option("HERMES_001", 489),
                self._shipping_option("HERMES_002", 549),
            ]
        )
        # Exclude the matching option
        test_extractor.config.download = DownloadConfig.model_validate({"excluded_shipping_options": ["Hermes_Päckchen"]})

        shipping_type, costs, options = await self._extract_shipping(test_extractor, "+ Versand ab 4,89 €", api_response)

        assert shipping_type == "SHIPPING"
        assert costs == 4.89
        assert options is None

    @pytest.mark.asyncio
    async def test_extract_shipping_info_with_no_matching_option(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction when price exists but NO matching option in API response."""
        api_response = self._shipping_api_response(
            [
                self._shipping_option("DHL_001", 500),
                self._shipping_option("HERMES_001", 600),
            ]
        )

        shipping_type, costs, options = await self._extract_shipping(test_extractor, "+ Versand ab 7,00 €", api_response)

        assert shipping_type == "SHIPPING"
        assert costs == 7.0
        assert options is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction when shipping element is missing (TimeoutError)."""
        # web_request is deliberately left unpatched here, as only the text lookup is expected to run.
        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

        assert shipping_type == "NOT_APPLICABLE"
        assert costs is None
        assert options is None
class TestAdExtractorNavigation:
    """Tests for navigating to ad pages and for collecting the URLs of the user's own ads."""

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_url(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test navigation to ad page using a URL."""
        ad_url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
        page_mock = AsyncMock()
        page_mock.url = ad_url

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock) as mock_web_open,
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError),
        ):
            result = await test_extractor.navigate_to_ad_page(ad_url)

            assert result is True
            mock_web_open.assert_called_with(ad_url)

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_id(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test navigation to ad page using an ID."""
        ad_id = 12345
        page_mock = AsyncMock()
        page_mock.url = f"https://www.kleinanzeigen.de/s-anzeige/test/{ad_id}"

        popup_close_mock = AsyncMock()
        popup_close_mock.click = AsyncMock()
        popup_close_mock.apply = AsyncMock(return_value = True)

        def find_mock(selector_type:By, selector_value:str, **_:Any) -> Element | None:
            # Only the popup close button is "found"; every other lookup yields None.
            if selector_type == By.CLASS_NAME and selector_value == "mfp-close":
                return popup_close_mock
            return None

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock) as mock_web_open,
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = find_mock),
        ):
            result = await test_extractor.navigate_to_ad_page(ad_id)

            assert result is True
            mock_web_open.assert_called_with(f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={ad_id}")
            popup_close_mock.click.assert_awaited_once()

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_popup(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test navigation to ad page with popup handling."""
        page_mock = AsyncMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"

        input_mock = AsyncMock()
        input_mock.clear_input = AsyncMock()
        input_mock.send_keys = AsyncMock()
        input_mock.apply = AsyncMock(return_value = True)

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = input_mock),
            patch.object(test_extractor, "web_click", new_callable = AsyncMock) as mock_web_click,
            patch.object(test_extractor, "web_check", new_callable = AsyncMock, return_value = True),
        ):
            result = await test_extractor.navigate_to_ad_page(12345)

            assert result is True
            mock_web_click.assert_called_with(By.CLASS_NAME, "mfp-close")

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_invalid_id(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test navigation to ad page with invalid ID."""
        page_mock = AsyncMock()
        # The page URL stays on a search result page instead of an ad page.
        page_mock.url = "https://www.kleinanzeigen.de/s-suchen.html?k0"

        input_mock = AsyncMock()
        input_mock.clear_input = AsyncMock()
        input_mock.send_keys = AsyncMock()
        input_mock.apply = AsyncMock(return_value = True)
        input_mock.attrs = {}

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = input_mock),
        ):
            result = await test_extractor.navigate_to_ad_page(99999)

            assert result is False

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test extraction of own ads URLs - basic single page test."""
        # --- Mock objects for DOM elements ---
        ad_list_container_mock = MagicMock()
        pagination_section_mock = MagicMock()
        cardbox_mock = MagicMock()  # Represents one ad card element
        link_mock = MagicMock()  # Represents the link (anchor) element
        link_mock.attrs = {"href": "/s-anzeige/test/12345"}  # The href the test expects to get back

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock) as mock_web_find,
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock) as mock_web_find_all,
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_execute", new_callable = AsyncMock),
        ):
            # --- Responses for web_find, in the order the calls are asserted below ---
            mock_web_find.side_effect = [
                ad_list_container_mock,  # Call 1: find #my-manageitems-adlist (before loop)
                pagination_section_mock,  # Call 2: find .Pagination
                ad_list_container_mock,  # Call 3: find #my-manageitems-adlist (inside loop)
                link_mock,  # Call 4: find 'div h3 a.text-onSurface' within the card
            ]

            # --- Responses for web_find_all ---
            mock_web_find_all.side_effect = [
                [],  # Call 1: find 'button[aria-label="Nächste"]' -> no next button = single page
                [cardbox_mock],  # Call 2: find .cardbox -> one ad item
            ]

            refs = await test_extractor.extract_own_ads_urls()

            assert refs == ["/s-anzeige/test/12345"]

            # Verify the lookups happened with the expected selectors and in the expected order.
            mock_web_find.assert_has_calls(
                [
                    call(By.ID, "my-manageitems-adlist"),
                    call(By.CSS_SELECTOR, ".Pagination", timeout = 10),
                    call(By.ID, "my-manageitems-adlist"),
                    call(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = cardbox_mock),
                ],
                any_order = False,
            )
            mock_web_find_all.assert_has_calls(
                [
                    call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section_mock),
                    call(By.CLASS_NAME, "cardbox", parent = ad_list_container_mock),
                ],
                any_order = False,
            )

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_paginates_with_enabled_next_button(self, test_extractor:extract_module.AdExtractor) -> None:
        """Ensure the paginator clicks the first enabled next button and advances."""
        ad_list_container_mock = MagicMock()
        pagination_section_mock = MagicMock()
        cardbox_page_one = MagicMock()
        cardbox_page_two = MagicMock()
        link_page_one = MagicMock(attrs = {"href": "/s-anzeige/page-one/111"})
        link_page_two = MagicMock(attrs = {"href": "/s-anzeige/page-two/222"})

        next_button_enabled = AsyncMock()
        next_button_enabled.attrs = {}
        disabled_button = MagicMock()
        disabled_button.attrs = {"disabled": True}

        link_queue = [link_page_one, link_page_two]
        next_button_calls = 0
        cardbox_calls = 0

        async def fake_web_find(selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> Element:
            if selector_type == By.ID and selector_value == "my-manageitems-adlist":
                return ad_list_container_mock
            if selector_type == By.CSS_SELECTOR and selector_value == ".Pagination":
                return pagination_section_mock
            if selector_type == By.CSS_SELECTOR and selector_value == "div h3 a.text-onSurface":
                return link_queue.pop(0)
            raise AssertionError(f"Unexpected selector {selector_type} {selector_value}")

        async def fake_web_find_all(
            selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None
        ) -> list[Element]:
            nonlocal next_button_calls, cardbox_calls
            if selector_type == By.CSS_SELECTOR and selector_value == 'button[aria-label="Nächste"]':
                next_button_calls += 1
                if next_button_calls == 1:
                    return [next_button_enabled]  # initial detection -> multi page
                if next_button_calls == 2:
                    return [disabled_button, next_button_enabled]  # navigation on page 1
                return []  # after navigating, stop
            if selector_type == By.CLASS_NAME and selector_value == "cardbox":
                cardbox_calls += 1
                return [cardbox_page_one] if cardbox_calls == 1 else [cardbox_page_two]
            raise AssertionError(f"Unexpected find_all selector {selector_type} {selector_value}")

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = fake_web_find),
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, side_effect = fake_web_find_all),
        ):
            refs = await test_extractor.extract_own_ads_urls()

        assert refs == ["/s-anzeige/page-one/111", "/s-anzeige/page-two/222"]
        next_button_enabled.click.assert_awaited()  # the enabled button was clicked during navigation

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_timeout_in_callback(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test that a TimeoutError while looking up ad items yields an empty result."""
        # Every web_find call yields the ad list container; every web_find_all call times out
        # (simulating missing ad items).
        ad_list_container_mock = MagicMock()

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = ad_list_container_mock),
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError("Ad items not found")),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_execute", new_callable = AsyncMock),
        ):
            refs = await test_extractor.extract_own_ads_urls()

            assert refs == []

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_skips_single_item_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
        """Timeout on one ad item should skip that item but keep extracting others."""
        ad_list_container_mock = MagicMock()
        first_item = MagicMock()
        second_item = MagicMock()
        valid_link = MagicMock()
        valid_link.attrs = {"href": "/s-anzeige/ok/999"}

        # web_find answers in call order; presumably: container, pagination lookup (times out),
        # container again, link of the first item (times out), link of the second item.
        web_find_results = [ad_list_container_mock, TimeoutError(), ad_list_container_mock, TimeoutError(), valid_link]

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_item, second_item]),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = web_find_results),
        ):
            refs = await test_extractor.extract_own_ads_urls()

        assert refs == ["/s-anzeige/ok/999"]

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_skips_single_item_without_href(self, test_extractor:extract_module.AdExtractor) -> None:
        """Anchor without href should be skipped instead of adding a 'None' entry."""
        ad_list_container_mock = MagicMock()
        first_item = MagicMock()
        second_item = MagicMock()
        missing_href_link = MagicMock()
        missing_href_link.attrs = {}
        valid_link = MagicMock()
        valid_link.attrs = {"href": "/s-anzeige/ok/999"}

        # web_find answers in call order; presumably: container, pagination lookup (times out),
        # container again, link without href for the first item, valid link for the second item.
        web_find_results = [ad_list_container_mock, TimeoutError(), ad_list_container_mock, missing_href_link, valid_link]

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_item, second_item]),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = web_find_results),
        ):
            refs = await test_extractor.extract_own_ads_urls()

        assert refs == ["/s-anzeige/ok/999"]

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_generic_exception_in_callback(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test that a generic exception while looking up ad items yields an empty result."""
        ad_list_container_mock = MagicMock()
        find_calls = 0

        def mock_find_side_effect(*args:Any, **kwargs:Any) -> Element:
            nonlocal find_calls
            find_calls += 1
            # Second call: pagination check - raise TimeoutError to indicate no pagination
            if find_calls == 2:
                raise TimeoutError("No pagination")
            # First call (before the pagination loop) and all later calls: ad list container
            return ad_list_container_mock

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = mock_find_side_effect),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            # Every web_find_all call fails with a non-timeout error.
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, side_effect = AttributeError("Unexpected error")),
        ):
            refs = await test_extractor.extract_own_ads_urls()

            # Single page (no pagination) and no ad item could be read, so refs should be empty
            assert refs == []
class TestAdExtractorContent:
"""Tests for content extraction functionality."""
# pylint: disable=protected-access
@pytest.mark.asyncio
async def test_extract_description_with_affixes(
    self, test_extractor:extract_module.AdExtractor, description_test_cases:list[tuple[dict[str, Any], str, str]], test_bot_config:Config
) -> None:
    """Check that the extracted description equals the raw page text for each affix configuration."""
    # Give the extractor a page whose URL looks like an ad page.
    ad_page = MagicMock()
    ad_page.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
    test_extractor.page = ad_page

    for config, raw_description, _expected_description in description_test_cases:
        test_extractor.config = test_bot_config.with_values(config)

        # Fresh stubs per case: web_text answers title, raw description (without affixes) and creation date in that order.
        stubs = {
            "web_text": AsyncMock(side_effect = ["Test Title", raw_description, "03.02.2025"]),
            "web_execute": AsyncMock(return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}),
            "_extract_category_from_ad_page": AsyncMock(return_value = "160"),
            "_extract_special_attributes_from_ad_page": AsyncMock(return_value = {}),
            "_extract_pricing_info_from_ad_page": AsyncMock(return_value = (None, "NOT_APPLICABLE")),
            "_extract_shipping_info_from_ad_page": AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
            "_extract_sell_directly_from_ad_page": AsyncMock(return_value = False),
            "_download_images_from_ad_page": AsyncMock(return_value = []),
            "_extract_contact_from_ad_page": AsyncMock(return_value = {}),
        }
        with patch.multiple(test_extractor, **stubs):
            info = await test_extractor._extract_ad_page_info("/some/dir", 12345)

        assert info.description == raw_description
@pytest.mark.asyncio
async def test_extract_description_with_affixes_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test handling of timeout when extracting description."""
# Mock the page
page_mock = MagicMock()
page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
test_extractor.page = page_mock
with patch.multiple(
test_extractor,
web_text = AsyncMock(
side_effect = [
"Test Title", # Title succeeds
TimeoutError("Timeout"), # Description times out
"03.02.2025", # Date succeeds
]
),
web_execute = AsyncMock(return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}),
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
_extract_shipping_info_from_ad_page = AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
_extract_sell_directly_from_ad_page = AsyncMock(return_value = False),
_download_images_from_ad_page = AsyncMock(return_value = []),
_extract_contact_from_ad_page = AsyncMock(return_value = ContactPartial()),
):
try:
info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
assert not info.description
except TimeoutError:
# This is also acceptable - depends on how we want to handle timeouts
pass
@pytest.mark.asyncio
async def test_extract_description_with_affixes_no_affixes(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test extraction of description without any affixes in config."""
# Mock the page
page_mock = MagicMock()
page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
test_extractor.page = page_mock
raw_description = "Original Description"
with patch.multiple(
test_extractor,
web_text = AsyncMock(
side_effect = [
"Test Title", # Title
raw_description, # Description without affixes
"03.02.2025", # Creation date
]
),
web_execute = AsyncMock(return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}),
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
_extract_shipping_info_from_ad_page = AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
_extract_sell_directly_from_ad_page = AsyncMock(return_value = False),
_download_images_from_ad_page = AsyncMock(return_value = []),
_extract_contact_from_ad_page = AsyncMock(return_value = ContactPartial()),
):
info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
assert info.description == raw_description
@pytest.mark.asyncio
async def test_extract_sell_directly_data_hit_true(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test sell_directly extraction with data hit - buyNowEligible=True."""
# Setup extractor with published ads data
test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": True}}
# Setup page URL
test_extractor.page = MagicMock()
test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"
result = await test_extractor._extract_sell_directly_from_ad_page()
assert result is True
@pytest.mark.asyncio
async def test_extract_sell_directly_data_hit_false(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test sell_directly extraction with data hit - buyNowEligible=False."""
test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": False}}
test_extractor.page = MagicMock()
test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"
result = await test_extractor._extract_sell_directly_from_ad_page()
assert result is False
@pytest.mark.asyncio
async def test_extract_sell_directly_data_miss(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test sell_directly extraction with data miss - ad ID not in cache returns None."""
# Cache has a different ad ID than the one in the URL - true data miss
test_extractor.published_ads_by_id = {987654321: {"id": 987654321, "buyNowEligible": True}}
test_extractor.page = MagicMock()
test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"
result = await test_extractor._extract_sell_directly_from_ad_page()
assert result is None
@pytest.mark.asyncio
async def test_extract_sell_directly_empty_published_ads(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test sell_directly extraction with empty published_ads_by_id - returns None."""
test_extractor.published_ads_by_id = {}
test_extractor.page = MagicMock()
test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"
result = await test_extractor._extract_sell_directly_from_ad_page()
assert result is None
@pytest.mark.asyncio
async def test_extract_sell_directly_invalid_url(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test sell_directly extraction with invalid URL - returns None."""
test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": True}}
test_extractor.page = MagicMock()
test_extractor.page.url = "https://www.kleinanzeigen.de/invalid-url"
result = await test_extractor._extract_sell_directly_from_ad_page()
assert result is None
@pytest.mark.asyncio
async def test_extract_sell_directly_non_boolean_value(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test sell_directly extraction when buyNowEligible is not a boolean."""
test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": "true"}} # String, not bool
test_extractor.page = MagicMock()
test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"
result = await test_extractor._extract_sell_directly_from_ad_page()
assert result is None
@pytest.mark.asyncio
async def test_extract_sell_directly_missing_buy_now_field(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test sell_directly extraction when buyNowEligible field is missing."""
test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "state": "active"}} # No buyNowEligible
test_extractor.page = MagicMock()
test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"
result = await test_extractor._extract_sell_directly_from_ad_page()
assert result is None
@pytest.mark.asyncio
async def test_extract_sell_directly_integer_value(self, test_extractor:extract_module.AdExtractor) -> None:
"""Test sell_directly extraction when buyNowEligible is an integer (not bool)."""
test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": 1}} # Integer, not bool
test_extractor.page = MagicMock()
test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"
result = await test_extractor._extract_sell_directly_from_ad_page()
assert result is None
class TestAdExtractorCategory:
    """Category and special-attribute extraction tests."""

    @pytest.fixture
    def extractor(self, test_bot_config:Config) -> extract_module.AdExtractor:
        """Provide an extractor built on a ``Browser``-spec'd mock with description affixes configured."""
        mocked_browser = MagicMock(spec = Browser)
        affix_overrides = {"ad_defaults": {"description": {"prefix": "Test Prefix", "suffix": "Test Suffix"}}}
        return extract_module.AdExtractor(mocked_browser, test_bot_config.with_values(affix_overrides), Path("downloaded-ads"))

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_category(self, extractor:extract_module.AdExtractor) -> None:
        """Two breadcrumb links produce both category codes joined by a slash."""
        breadcrumb = MagicMock()
        parent_link = MagicMock(attrs = {"href": "/s-familie-kind-baby/c17"})
        child_link = MagicMock(attrs = {"href": "/s-spielzeug/c23"})

        find_patch = patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [breadcrumb])
        find_all_patch = patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [parent_link, child_link])
        with find_patch as find_mock, find_all_patch as find_all_mock:
            category = await extractor._extract_category_from_ad_page()

        assert category == "17/23"
        find_mock.assert_awaited_once_with(By.ID, "vap-brdcrmb")
        find_all_mock.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = breadcrumb)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_category_single_identifier(self, extractor:extract_module.AdExtractor) -> None:
        """A lone breadcrumb code is used for both halves of the result."""
        breadcrumb = MagicMock()
        only_link = MagicMock(attrs = {"href": "/s-kleidung/c42"})

        find_patch = patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [breadcrumb])
        find_all_patch = patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [only_link])
        with find_patch as find_mock, find_all_patch as find_all_mock:
            category = await extractor._extract_category_from_ad_page()

        assert category == "42/42"
        find_mock.assert_awaited_once_with(By.ID, "vap-brdcrmb")
        find_all_mock.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = breadcrumb)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_category_fallback_to_legacy_selectors(self, extractor:extract_module.AdExtractor, caplog:pytest.LogCaptureFixture) -> None:
        """When ``web_find_all`` times out, the nth-of-type selectors supply the category and one debug line is logged."""
        caplog.set_level("DEBUG")
        fallback_log_line = _("Falling back to legacy breadcrumb selectors; collected ids: %s") % []

        breadcrumb = MagicMock()
        # Integer hrefs: ensure str() conversion happens
        second_anchor = MagicMock(attrs = {"href": 12345})
        # This will need str() conversion
        third_anchor = MagicMock(attrs = {"href": 67890})

        with (
            patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [breadcrumb, second_anchor, third_anchor]) as find_mock,
            patch.object(extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError) as find_all_mock,
        ):
            category = await extractor._extract_category_from_ad_page()

        assert category == "12345/67890"
        assert [record.message for record in caplog.records].count(fallback_log_line) == 1
        find_mock.assert_any_call(By.ID, "vap-brdcrmb")
        find_mock.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = breadcrumb)
        find_mock.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = breadcrumb)
        find_all_mock.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = breadcrumb)

    @pytest.mark.asyncio
    async def test_extract_category_legacy_selectors_timeout(self, extractor:extract_module.AdExtractor, caplog:pytest.LogCaptureFixture) -> None:
        """A timeout in the legacy fallback is logged at ERROR level and surfaces as ``TimeoutError``."""
        breadcrumb = MagicMock()

        async def breadcrumb_only(selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> Element:
            # Only the breadcrumb container can be located; every other lookup times out
            if (selector_type, selector_value) != (By.ID, "vap-brdcrmb"):
                raise TimeoutError("legacy selectors missing")
            return breadcrumb

        with (
            patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = breadcrumb_only),
            patch.object(extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError),
            caplog.at_level("ERROR"),
            pytest.raises(TimeoutError, match = "Unable to locate breadcrumb fallback selectors"),
        ):
            await extractor._extract_category_from_ad_page()

        # Checked after the ``with`` block: nothing inside it runs once the call has raised
        logged_messages = [record.message for record in caplog.records]
        assert any("Legacy breadcrumb selectors not found" in message for message in logged_messages)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_special_attributes_empty(self, extractor:extract_module.AdExtractor) -> None:
        """An empty ``ad_attributes`` string yields an empty mapping."""
        belen_conf = {"universalAnalyticsOpts": {"dimensions": {"ad_attributes": ""}}}
        with patch.object(extractor, "web_execute", new_callable = AsyncMock) as execute_mock:
            execute_mock.return_value = belen_conf
            attributes = await extractor._extract_special_attributes_from_ad_page(execute_mock.return_value)
        assert attributes == {}

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_special_attributes_not_empty(self, extractor:extract_module.AdExtractor) -> None:
        """A populated ``ad_attributes`` string is split into key/value pairs, without ``versand_s``."""
        belen_conf = {
            "universalAnalyticsOpts": {
                "dimensions": {"ad_attributes": "versand_s:t|color_s:creme|groesse_s:68|condition_s:alright|type_s:accessoires|art_s:maedchen"}
            }
        }
        expected_pairs = {
            "color_s": "creme",
            "groesse_s": "68",
            "condition_s": "alright",
            "type_s": "accessoires",
            "art_s": "maedchen",
        }

        attributes = await extractor._extract_special_attributes_from_ad_page(belen_conf)

        assert len(attributes) == 5
        assert "versand_s" not in attributes
        for key, value in expected_pairs.items():
            assert key in attributes
            assert attributes[key] == value

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_special_attributes_missing_ad_attributes(self, extractor:extract_module.AdExtractor) -> None:
        """A ``dimensions`` mapping without an ``ad_attributes`` key yields an empty mapping."""
        # The ad_attributes key is deliberately absent from dimensions
        conf_without_attributes:dict[str, Any] = {"universalAnalyticsOpts": {"dimensions": {}}}
        attributes = await extractor._extract_special_attributes_from_ad_page(conf_without_attributes)
        assert attributes == {}
class TestAdExtractorContact:
    """Contact information extraction tests."""

    @pytest.fixture
    def extractor(self, test_bot_config:Config) -> extract_module.AdExtractor:
        """Provide an extractor built on a ``Browser``-spec'd mock with description affixes configured."""
        mocked_browser = MagicMock(spec = Browser)
        affix_overrides = {"ad_defaults": {"description": {"prefix": "Test Prefix", "suffix": "Test Suffix"}}}
        return extract_module.AdExtractor(mocked_browser, test_bot_config.with_values(affix_overrides), Path("downloaded-ads"))

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_contact_info(self, extractor:extract_module.AdExtractor) -> None:
        """Address, street and name are parsed; a missing phone element leaves ``phone`` as None."""
        page_texts = [
            "12345 Berlin - Mitte",
            "Example Street 123,",
            "Test User",
        ]
        found_elements = [
            MagicMock(),  # contact person element
            MagicMock(),  # name element
            TimeoutError(),  # phone element (simulating no phone)
        ]

        with (
            patch.object(extractor, "page", MagicMock()),
            patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = page_texts),
            patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = found_elements),
        ):
            contact = await extractor._extract_contact_from_ad_page()

        assert contact.zipcode == "12345"
        assert contact.location == "Berlin - Mitte"
        assert contact.street == "Example Street 123"
        assert contact.name == "Test User"
        assert contact.phone is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_contact_info_timeout(self, extractor:extract_module.AdExtractor) -> None:
        """When every element lookup times out, the ``TimeoutError`` reaches the caller."""
        text_patch = patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError())
        find_patch = patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError())
        with patch.object(extractor, "page", MagicMock()), text_patch, find_patch, pytest.raises(TimeoutError):
            await extractor._extract_contact_from_ad_page()

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_contact_info_with_phone(self, extractor:extract_module.AdExtractor) -> None:
        """A phone element whose text is ``+49(0)1234 567890`` is reported as ``01234567890``."""
        page_texts = ["12345 Berlin - Mitte", "Example Street 123,", "Test User", "+49(0)1234 567890"]
        phone_element = MagicMock()
        found_elements = [
            MagicMock(),  # contact person element
            MagicMock(),  # name element
            phone_element,  # phone element
        ]

        with (
            patch.object(extractor, "page", MagicMock()),
            patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = page_texts),
            patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = found_elements),
        ):
            contact = await extractor._extract_contact_from_ad_page()

        # Normalized phone number
        assert contact.phone == "01234567890"
class TestAdExtractorDownload:
"""Tests for download functionality."""
@pytest.fixture
def extractor(self, test_bot_config:Config) -> extract_module.AdExtractor:
    """Provide an extractor built on a ``Browser``-spec'd mock with description affixes configured."""
    mocked_browser = MagicMock(spec = Browser)
    affix_overrides = {"ad_defaults": {"description": {"prefix": "Test Prefix", "suffix": "Test Suffix"}}}
    return extract_module.AdExtractor(mocked_browser, test_bot_config.with_values(affix_overrides), Path("downloaded-ads"))
@pytest.mark.asyncio
async def test_download_ad(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None:
    """Test downloading an ad - directory creation and saving ad data.

    ``dicts.save_dict`` and ``_extract_ad_page_info_with_directory_handling`` are mocked, so the test
    only checks that ``download_ad`` forwards the extracted ad to ``save_dict`` with the YAML path
    inside the directory returned by the extraction step.
    """
    # Use tmp_path for OS-agnostic path handling
    download_base = tmp_path / "downloaded-ads"
    final_dir = download_base / "ad_12345_Test Advertisement Title"
    yaml_path = final_dir / "ad_12345.yaml"
    extractor.download_dir = download_base

    extracted_ad = AdPartial.model_validate(
        {
            "title": "Test Advertisement Title",
            "description": "Test Description",
            "category": "Dienstleistungen",
            "price": 100,
            "images": [],
            "contact": {"name": "Test User", "street": "Test Street 123", "zipcode": "12345", "location": "Test City"},
        }
    )

    with (
        patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict,
        patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir,
    ):
        # Hand back a Path, as the directory-handling tests in this class expect from the real method
        mock_extract_with_dir.return_value = (extracted_ad, final_dir)

        await extractor.download_ad(12345)

    # Verify observable behavior: extraction was awaited and save was called
    mock_extract_with_dir.assert_awaited_once()
    mock_save_dict.assert_called_once()

    # Verify saved to correct location with correct data
    saved_path, saved_data = mock_save_dict.call_args[0][:2]
    assert Path(saved_path) == yaml_path
    assert saved_data == extracted_ad.model_dump(mode = "json")
@pytest.mark.asyncio
async def test_download_ad_writes_schema_compliant_yaml(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None:
    """The YAML written by ``download_ad`` validates against ad.schema.json and keeps timestamps as strings."""
    download_root = tmp_path / "downloaded-ads"
    ad_dir = download_root / "ad_12345_Test Advertisement Title"
    written_yaml = ad_dir / "ad_12345.yaml"
    extractor.download_dir = download_root

    extraction_patch = patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock)
    with extraction_patch as extraction_mock:
        ad_model = AdPartial.model_validate(
            {
                "title": "Test Advertisement Title",
                "description": "Test Description",
                "category": "Dienstleistungen",
                "created_on": "2026-03-08T00:00:00+01:00",
                "updated_on": "2026-03-09T01:02:03+01:00",
            }
        )
        extraction_mock.return_value = (ad_model, ad_dir)

        await extractor.download_ad(12345)

    # File reads are pushed to a worker thread so the event loop is not blocked
    yaml_text = await asyncio.to_thread(_read_text_file, written_yaml)
    loaded_ad = YAML(typ = "safe").load(yaml_text)
    schema_text = await asyncio.to_thread(_read_text_file, SCHEMA_PATH)
    schema = json.loads(schema_text)

    Draft202012Validator(schema).validate(loaded_ad)
    for timestamp_key in ("created_on", "updated_on"):
        assert isinstance(loaded_ad[timestamp_key], str)
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_download_images_no_images(self, extractor:extract_module.AdExtractor) -> None:
    """If locating the image area times out, no image paths are returned."""
    find_patch = patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError)
    with find_patch:
        downloaded = await extractor._download_images_from_ad_page("/some/dir", 12345)

    assert len(downloaded) == 0
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_download_images_with_none_url(self, extractor:extract_module.AdExtractor) -> None:
    """An image element whose ``src`` is None is skipped; the one with a URL is downloaded."""
    image_box = MagicMock()

    # One element carries a usable src, the other carries None
    usable_image = MagicMock(attrs = {"src": "http://example.com/valid_image.jpg"})
    srcless_image = MagicMock(attrs = {"src": None})

    with (
        patch.object(extractor, "web_find", new_callable = AsyncMock, return_value = image_box),
        patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [usable_image, srcless_image]),
        patch.object(extract_module.AdExtractor, "_download_and_save_image_sync", return_value = "/some/dir/ad_12345__img1.jpg"),
    ):
        downloaded = await extractor._download_images_from_ad_page("/some/dir", 12345)

    # Should only download the one valid image (skip the None)
    assert len(downloaded) == 1
    assert downloaded[0] == "ad_12345__img1.jpg"
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_extract_ad_page_info_with_directory_handling_final_dir_exists(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None:
    """A pre-existing titled ad directory is replaced, so its old contents are gone afterwards."""
    download_root = tmp_path / "downloaded-ads"
    download_root.mkdir()

    # Pre-populate the titled directory so its removal can be observed
    titled_dir = download_root / "ad_12345_Test Title"
    titled_dir.mkdir()
    stale_file = titled_dir / "old_file.txt"
    stale_file.write_text("old content")

    # Mock the page
    extractor.page = MagicMock(url = "https://www.kleinanzeigen.de/s-anzeige/test/12345")

    page_stubs = patch.multiple(
        extractor,
        web_text = AsyncMock(
            side_effect = [
                "Test Title",  # Title extraction
                "Test Title",  # Second title call for full extraction
                "Description text",  # Description
                "03.02.2025",  # Creation date
            ]
        ),
        web_execute = AsyncMock(return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}),
        _extract_category_from_ad_page = AsyncMock(return_value = "160"),
        _extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
        _extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
        _extract_shipping_info_from_ad_page = AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
        _extract_sell_directly_from_ad_page = AsyncMock(return_value = False),
        _download_images_from_ad_page = AsyncMock(return_value = []),
        _extract_contact_from_ad_page = AsyncMock(return_value = ContactPartial(name = "Test", zipcode = "12345", location = "Berlin")),
    )
    with page_stubs:
        ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(download_root, 12345)

    # Verify the old directory was deleted and recreated
    assert result_dir == titled_dir
    assert result_dir.exists()
    assert not stale_file.exists()  # Old file should be gone
    assert ad_cfg.title == "Test Title"
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_extract_ad_page_info_with_directory_handling_rename_enabled(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None:
    """With ``rename_existing_folders`` on, an untitled ``ad_<id>`` directory is renamed to the titled one and keeps its files."""
    download_root = tmp_path / "downloaded-ads"
    download_root.mkdir()

    # Untitled directory holding one file from an earlier download
    untitled_dir = download_root / "ad_12345"
    untitled_dir.mkdir()
    kept_file = untitled_dir / "existing_image.jpg"
    kept_file.write_text("existing image data")

    # Enable rename_existing_folders in config
    extractor.config.download.rename_existing_folders = True

    # Mock the page
    extractor.page = MagicMock(url = "https://www.kleinanzeigen.de/s-anzeige/test/12345")

    page_stubs = patch.multiple(
        extractor,
        web_text = AsyncMock(
            side_effect = [
                "Test Title",  # Title extraction
                "Test Title",  # Second title call for full extraction
                "Description text",  # Description
                "03.02.2025",  # Creation date
            ]
        ),
        web_execute = AsyncMock(return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}),
        _extract_category_from_ad_page = AsyncMock(return_value = "160"),
        _extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
        _extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
        _extract_shipping_info_from_ad_page = AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
        _extract_sell_directly_from_ad_page = AsyncMock(return_value = False),
        _download_images_from_ad_page = AsyncMock(return_value = []),
        _extract_contact_from_ad_page = AsyncMock(return_value = ContactPartial(name = "Test", zipcode = "12345", location = "Berlin")),
    )
    with page_stubs:
        ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(download_root, 12345)

    # Verify the directory was renamed from the untitled to the titled name
    titled_dir = download_root / "ad_12345_Test Title"
    assert result_dir == titled_dir
    assert result_dir.exists()
    assert not untitled_dir.exists()  # Old temp dir should be gone
    assert (result_dir / "existing_image.jpg").exists()  # File should be preserved
    assert ad_cfg.title == "Test Title"
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_extract_ad_page_info_with_directory_handling_use_existing(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None:
    """With ``rename_existing_folders`` off, an existing untitled ``ad_<id>`` directory is reused as-is."""
    download_root = tmp_path / "downloaded-ads"
    download_root.mkdir()

    # Untitled directory holding one file from an earlier download
    untitled_dir = download_root / "ad_12345"
    untitled_dir.mkdir()
    kept_file = untitled_dir / "existing_image.jpg"
    kept_file.write_text("existing image data")

    # Ensure rename_existing_folders is False (default)
    extractor.config.download.rename_existing_folders = False

    # Mock the page
    extractor.page = MagicMock(url = "https://www.kleinanzeigen.de/s-anzeige/test/12345")

    page_stubs = patch.multiple(
        extractor,
        web_text = AsyncMock(
            side_effect = [
                "Test Title",  # Title extraction
                "Test Title",  # Second title call for full extraction
                "Description text",  # Description
                "03.02.2025",  # Creation date
            ]
        ),
        web_execute = AsyncMock(return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}),
        _extract_category_from_ad_page = AsyncMock(return_value = "160"),
        _extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
        _extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
        _extract_shipping_info_from_ad_page = AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
        _extract_sell_directly_from_ad_page = AsyncMock(return_value = False),
        _download_images_from_ad_page = AsyncMock(return_value = []),
        _extract_contact_from_ad_page = AsyncMock(return_value = ContactPartial(name = "Test", zipcode = "12345", location = "Berlin")),
    )
    with page_stubs:
        ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(download_root, 12345)

    # Verify the existing untitled directory was used (not renamed)
    assert result_dir == untitled_dir
    assert result_dir.exists()
    assert (result_dir / "existing_image.jpg").exists()  # File should be preserved
    assert ad_cfg.title == "Test Title"
@pytest.mark.asyncio
async def test_download_ad_with_umlauts_in_title(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None:
    """Test cross-platform Unicode handling for ad titles with umlauts (issue #728).

    Verifies that:
    1. Directories are created with NFC-normalized names (via sanitize_folder_name)
    2. Files can be saved to those directories (via save_dict's NFC normalization)
    3. No FileNotFoundError occurs due to NFC/NFD mismatch on Linux/Windows
    """
    import unicodedata  # noqa: PLC0415

    from kleinanzeigen_bot.utils import dicts  # noqa: PLC0415

    # Title with German umlauts (ä) - common in real ads
    title_with_umlauts = "KitchenAid Zuhälter - nie benutzt"

    # Mock the page
    page_mock = MagicMock()
    page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
    extractor.page = page_mock

    base_dir = tmp_path / "downloaded-ads"
    base_dir.mkdir()

    with (
        patch.object(
            extractor,
            "web_text",
            new_callable = AsyncMock,
            side_effect = [
                title_with_umlauts,  # Title extraction
                title_with_umlauts,  # Second title call for full extraction
                "Description text",  # Description
                "03.02.2025",  # Creation date
            ],
        ),
        patch.object(
            extractor,
            "web_execute",
            new_callable = AsyncMock,
            return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}},
        ),
        patch.object(extractor, "_extract_category_from_ad_page", new_callable = AsyncMock, return_value = "160"),
        patch.object(extractor, "_extract_special_attributes_from_ad_page", new_callable = AsyncMock, return_value = {}),
        patch.object(extractor, "_extract_pricing_info_from_ad_page", new_callable = AsyncMock, return_value = (None, "NOT_APPLICABLE")),
        patch.object(extractor, "_extract_shipping_info_from_ad_page", new_callable = AsyncMock, return_value = ("NOT_APPLICABLE", None, None)),
        patch.object(extractor, "_extract_sell_directly_from_ad_page", new_callable = AsyncMock, return_value = False),
        patch.object(extractor, "_download_images_from_ad_page", new_callable = AsyncMock, return_value = []),
        patch.object(
            extractor,
            "_extract_contact_from_ad_page",
            new_callable = AsyncMock,
            return_value = ContactPartial(
                name = "Test",
                zipcode = "12345",
                location = "Berlin",
            ),
        ),
    ):
        ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(base_dir, 12345)

    # Verify directory was created with NFC-normalized name
    assert result_dir.exists()
    # Point 1 of the docstring: check the normalization form itself, not just that the directory exists
    assert unicodedata.is_normalized("NFC", result_dir.name)
    assert ad_cfg.title == title_with_umlauts

    # Test saving YAML file to the Unicode directory path
    # Before fix: Failed on Linux/Windows due to NFC/NFD mismatch
    # After fix: Both directory and file use NFC normalization
    ad_file_path = result_dir / "ad_12345.yaml"

    header_string = (
        "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json"
    )

    # save_dict normalizes path to NFC, matching the NFC directory name
    dicts.save_dict(str(ad_file_path), ad_cfg.model_dump(), header = header_string)

    # Verify file was created successfully (no FileNotFoundError)
    assert ad_file_path.exists()
    assert ad_file_path.is_file()